2022-11-12接到任务,把PDF中得表格转成Excel文件

PDF格式如下:





实施步骤:

1.先把PDF文件转成TXT文件再导入到程序

import re import pandas as pd import os #TXT文件导入到程序,并把回车符(n)替换成字符“(line)” with open('2005proposal.pdf.txt', 'r') as f: t = f.readlines() txt = '(line)'.join([i.strip() for i in t])

导入后:


2.使用正则表达式匹配“________________”把TXT分页

def find_table_page(txt): # 分页提取内容 pattern = re.compile('(?<=_)[^_]*(?:_)') table_page = pattern.findall(txt) # 只留带表格的页面 return [i for i in table_page if i != '_']

3.使用正则表达式匹配每页“as Percent of Employment“至”This“中的内容

def find_main_content(page): #匹配“as Percent of Employment“至”This“中的内容 pattern = re.compile('(?<=as Percent of Employment)(.*)(?=This)') content = pattern.findall(page) if content: return content[0] else: return []

4.使用”Total“对之前匹配的文本进行分割区域

def find_division(content): # 再”Total"按行分割 return [i for i in content.split('Total') if len(i)>=10] def clean_div(sub_div): # 再按行分割 return [i for i in sub_div.split('(line)') if i and not i.startswith(' ')] def find_region(sub_div): # Find the region from the div return sub_div[0] def find_data(sub_div): # Find the data sub_div = base_checking(sub_div) return [i+'%' for i in ' '.join(sub_div[1:]).split('%') if i != '']

5.使用正则表达式匹配每页数据并拆分成数组

def base_checking(clean_d): # 举例 ''' clean_d = ['Niven U.S. Army Reserve Center,', 'Close (34) 0 0 5 (34) 5 0 (29) (13) (42) 26,102 -0.2% Albermarle'] ''' for i in range(len(clean_d)): correct_base_name = re.compile('(?<=%).*').findall(clean_d[i]) if correct_base_name: correct_base_name = correct_base_name[0] # 删除空格 correct_base_name = correct_base_name[1:] right_row = [correct_base_name] + [i+'%' for i in re.compile('.*(?=%)').findall(clean_d[i]) if i!=''] clean_d[i] = " ".join(right_row) return clean_d def find_element(data): #查找Close|Realign|Gain开头的数据 action_patt = re.compile('(Close|Realign|Gain)+(/)?') action = action_patt.findall(data) action = "".join([i for j in action for i in j ]) #查找Close|Realign|Gain结尾的数据 base_patt = re.compile('(.*) (Close|Gain|Realign)') base = base_patt.findall(data) base = base[0][0] # 查找Close|Realign|Gain后面的数据 data_patt = re.compile('(Close|Gain|Realign) (.*)') num = data_patt.findall(data) num = num[0][1] num = num.replace(',', '') return base, action, num

6.遍历所有页数据并处理

clean_data = {} pages = find_table_page(txt) # 遍历所有页数据存入字典 junk_str = 'Out In Net Gain/(Loss) Net Mission(line)Total(line)Indirect(line)Total(line)Economic Contractor(line)Direct(line)Changes(line)Job(line)Area Changes(line)Employment(line)' pages[1] = pages[1].replace(junk_str,'') # Store data as a dictionary for p in range(1,len(pages)): content = find_main_content(pages[p]) if content: divs = find_division(content) for d in range(len(divs)): clean_d = clean_div(divs[d]) if clean_d: region = find_region(clean_d) data = find_data(clean_d) all_row = [] for row in range(len(data)): element = find_element(data[row]) if element: all_row.append(element) clean_data[region] = all_row

7.字典转LIST

region = [] base = [] action = [] data = [] for (key,values) in clean_data.items(): for v in range(len(values)): region.append(key) base.append(values[v][0]) action.append(values[v][1]) data.append(values[v][2])

8.生成DataFrame并清数据

# 新建DataFrame output = pd.DataFrame(data,columns=['Data']) output = output.Data.str.split(' ',expand=True) output.columns = ['mil_out','civ_out','mil_in','civ_in','mil_net','civ_net','net_contractor','direct','indirect','total','ea_emp','ch_as_percent_of_emp'] output['action'] = action output['base'] = base output['msa'] = region #更改数据帧的顺序 output = output.iloc[:,[-1,-2,-3]+list(range(12))] # 更改负数的格式 output.iloc[:,3:-1] = output.iloc[:,3:-1].apply(lambda x: x.str.replace('((.*))', '-\1'),axis = 1) # 删除某些单元格前面的空间 for z in range(2): output.base = output.base.apply(lambda x: x[1:] if x.startswith(' ') else x) # 导出表格 output.to_excel("output.xlsx")