2022-11-12接到任务,把PDF中得表格转成Excel文件
PDF格式如下:
实施步骤:
1.先把PDF文件转成TXT文件再导入到程序
import re
import pandas as pd
import os
#TXT文件导入到程序,并把回车符(n)替换成字符“(line)”
with open('2005proposal.pdf.txt', 'r') as f:
t = f.readlines()
txt = '(line)'.join([i.strip() for i in t])
导入后:
2.使用正则表达式匹配“________________”把TXT分页
def find_table_page(txt):
# 分页提取内容
pattern = re.compile('(?<=_)[^_]*(?:_)')
table_page = pattern.findall(txt)
# 只留带表格的页面
return [i for i in table_page if i != '_']
3.使用正则表达式匹配每页“as Percent of Employment“至”This“中的内容
def find_main_content(page):
#匹配“as Percent of Employment“至”This“中的内容
pattern = re.compile('(?<=as Percent of Employment)(.*)(?=This)')
content = pattern.findall(page)
if content:
return content[0]
else:
return []
4.使用”Total“对之前匹配的文本进行分割区域
def find_division(content):
# 再”Total"按行分割
return [i for i in content.split('Total') if len(i)>=10]
def clean_div(sub_div):
# 再按行分割
return [i for i in sub_div.split('(line)') if i and not i.startswith(' ')]
def find_region(sub_div):
# Find the region from the div
return sub_div[0]
def find_data(sub_div):
# Find the data
sub_div = base_checking(sub_div)
return [i+'%' for i in ' '.join(sub_div[1:]).split('%') if i != '']
5.使用正则表达式匹配每页数据并拆分成数组
def base_checking(clean_d):
# 举例
'''
clean_d = ['Niven U.S. Army Reserve Center,',
'Close (34) 0 0 5 (34) 5 0 (29) (13) (42) 26,102 -0.2% Albermarle']
'''
for i in range(len(clean_d)):
correct_base_name = re.compile('(?<=%).*').findall(clean_d[i])
if correct_base_name:
correct_base_name = correct_base_name[0]
# 删除空格
correct_base_name = correct_base_name[1:]
right_row = [correct_base_name] + [i+'%' for i in re.compile('.*(?=%)').findall(clean_d[i]) if i!='']
clean_d[i] = " ".join(right_row)
return clean_d
def find_element(data):
#查找Close|Realign|Gain开头的数据
action_patt = re.compile('(Close|Realign|Gain)+(/)?')
action = action_patt.findall(data)
action = "".join([i for j in action for i in j ])
#查找Close|Realign|Gain结尾的数据
base_patt = re.compile('(.*) (Close|Gain|Realign)')
base = base_patt.findall(data)
base = base[0][0]
# 查找Close|Realign|Gain后面的数据
data_patt = re.compile('(Close|Gain|Realign) (.*)')
num = data_patt.findall(data)
num = num[0][1]
num = num.replace(',', '')
return base, action, num
6.遍历所有页数据并处理
clean_data = {}
pages = find_table_page(txt)
# 遍历所有页数据存入字典
junk_str = 'Out In Net Gain/(Loss) Net Mission(line)Total(line)Indirect(line)Total(line)Economic Contractor(line)Direct(line)Changes(line)Job(line)Area Changes(line)Employment(line)'
pages[1] = pages[1].replace(junk_str,'')
# Store data as a dictionary
for p in range(1,len(pages)):
content = find_main_content(pages[p])
if content:
divs = find_division(content)
for d in range(len(divs)):
clean_d = clean_div(divs[d])
if clean_d:
region = find_region(clean_d)
data = find_data(clean_d)
all_row = []
for row in range(len(data)):
element = find_element(data[row])
if element:
all_row.append(element)
clean_data[region] = all_row
7.字典转LIST
region = []
base = []
action = []
data = []
for (key,values) in clean_data.items():
for v in range(len(values)):
region.append(key)
base.append(values[v][0])
action.append(values[v][1])
data.append(values[v][2])
8.生成DataFrame并清数据
# 新建DataFrame
output = pd.DataFrame(data,columns=['Data'])
output = output.Data.str.split(' ',expand=True)
output.columns = ['mil_out','civ_out','mil_in','civ_in','mil_net','civ_net','net_contractor','direct','indirect','total','ea_emp','ch_as_percent_of_emp']
output['action'] = action
output['base'] = base
output['msa'] = region
#更改数据帧的顺序
output = output.iloc[:,[-1,-2,-3]+list(range(12))]
# 更改负数的格式
output.iloc[:,3:-1] = output.iloc[:,3:-1].apply(lambda x: x.str.replace('((.*))', '-\1'),axis = 1)
# 删除某些单元格前面的空间
for z in range(2):
output.base = output.base.apply(lambda x: x[1:] if x.startswith(' ') else x)
# 导出表格
output.to_excel("output.xlsx")