关于PDF文件表格提取实现

PDF 文件的内容格式比较特殊。根据实际观察，PDF 只是把源文件的内容一行一行地写入文件，再通过坐标定位的方式实现与原版面一致的展示，所以表格在 PDF 中的表现形式比较特殊。

实现思路：

先通过识别 PDF 内容，找到表格所在的页（只是为了提高一些速度，减少其他内容的干扰），再将表格所在的页面截取到新的 PDF 文件中，然后将新生成的 PDF 转换为 HTML 文件，最后通过算法重新组装表格。此方法可识别空白列，以及一个表格中存在多行数据的情况。

用的技术框架：

jsoup，itextpdf，pdfbox，pdf2dom（代码中的 PDFDomTree）

/** * 读取pdf文件转为list集合 * @param pdfPath * @return */ public static List> getDataFromPdf(String pdfPath){ List> datas=new ArrayList<>(); String newPdfPath=pdfPath.replace(".pdf","_01.pdf"); String htmlPath=pdfPath.replace(".pdf","_01.html"); //确认附件表格所在的页面，返回页码 int[] pageNums=readPdf(pdfPath); //读取存在表格附件的页面 partitionPdfFile(pdfPath,newPdfPath,pageNums[0],pageNums[1]); byte[] bytes = getBytes(newPdfPath); try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(htmlPath)),"UTF-8"));){ //加载PDF文档 PDDocument document = PDDocument.load(bytes); PDFDomTree pdfDomTree = new PDFDomTree(); pdfDomTree.writeText(document,out); datas=ParseHtml(htmlPath); } catch (Exception e) { e.printStackTrace(); }finally { //删除缓存文件 File pdf_01=new File(newPdfPath); if(pdf_01.exists()){ pdf_01.delete(); } File html_01=new File(htmlPath); if(html_01.exists()){ html_01.delete(); } } return datas; } /*** * 读取pdf 确定内容所在页 * @param pdfPath */ private static int[] readPdf(String pdfPath){ int[] pageNums=new int[2]; try { PdfReader reader = new PdfReader(pdfPath); int pageNum = reader.getNumberOfPages(); boolean isGo=false; for(int i=1;i<=pageNum;i++){ String pageContent = PdfTextExtractor.getTextFromPage(reader, i);//读取第i页的文档内容 if((pageContent.trim().length()>0&&pageContent.startsWith("附件"))){ pageNums[0]=i; isGo=true; } if(isGo&&pageContent.trim().length()<50){ pageNums[1]=i-1; //break; } } } catch (Exception e) { e.printStackTrace(); }finally{ } return pageNums; }

/** * pdf 转换为html * @param html * @return * @throws IOException */ private static List> ParseHtml(String html) throws IOException { org.jsoup.nodes.Document document = Jsoup.parse(new File(html), "utf-8"); Elements postItems = document.select("div.page"); //循环处理每页 List> datas=new ArrayList<>(); for (int i=0;i

下面是html的解析方式：通过边框定位，找到每一行、每一列所处的位置，以及位于该位置的元素。

/**
 * Builds the cell values of one table row.
 *
 * <p>The row spans the vertical band between the {@code top} of
 * {@code postItems.get(index)} and the {@code top} of {@code postItems.get(index + 1)}.
 * Every element of {@code postItem} whose style contains "top:" and whose {@code top}
 * lies strictly inside that band is assigned to the column whose left/right bounds
 * strictly enclose its {@code left}. Texts falling into the same cell are concatenated
 * in document order; an empty cell is reported as "-".
 *
 * @param postItem  page element whose positioned children are searched
 * @param postItems row separator elements; entries {@code index} and {@code index + 1}
 *                  delimit the row
 * @param table_col column separator elements; consecutive entries delimit a column
 * @param index     index of the row's upper separator in {@code postItems}
 * @return one string per column ({@code table_col.size() - 1} entries)
 */
private static List<String> getRow(Element postItem, Elements postItems, Elements table_col, int index) {
    double rowTop = Double.parseDouble(process(postItems.get(index).attr("style"), "top"));
    double rowBottom = Double.parseDouble(process(postItems.get(index + 1).attr("style"), "top"));

    // Collect the elements inside the row band once, instead of re-parsing every
    // element's style for every column.
    List<Element> rowElements = new ArrayList<>();
    List<Double> rowLefts = new ArrayList<>();
    for (Element candidate : postItem.select("[style*=top:]")) {
        String style = candidate.attr("style");
        double top = toCoordinate(process(style, "top"));
        double left = toCoordinate(process(style, "left"));
        // NaN (no usable position) fails both comparisons, so such elements are skipped
        // rather than aborting the whole row.
        if (top > rowTop && top < rowBottom && !Double.isNaN(left)) {
            rowElements.add(candidate);
            rowLefts.add(left);
        }
    }

    int columnCount = table_col.size() - 1;
    List<String> data = new ArrayList<>(Math.max(columnCount, 0));
    for (int col = 0; col < columnCount; col++) {
        double[] bounds = getRowCol(table_col, col);
        StringBuilder cell = new StringBuilder();
        for (int k = 0; k < rowElements.size(); k++) {
            double left = rowLefts.get(k);
            if (left > bounds[0] && left < bounds[1]) {
                cell.append(rowElements.get(k).text());
            }
        }
        data.add(cell.length() == 0 ? "-" : cell.toString());
    }
    return data;
}

/**
 * Parses a coordinate returned by {@code process}.
 *
 * @param value numeric text, or {@code null}
 * @return the parsed value, or {@code NaN} when {@code value} is {@code null} or not a number
 */
private static double toCoordinate(String value) {
    if (value == null) {
        return Double.NaN;
    }
    try {
        return Double.parseDouble(value);
    } catch (NumberFormatException notNumeric) {
        // Deliberately mapped to NaN: the caller treats it as "no position".
        return Double.NaN;
    }
}

/**
 * Returns the horizontal bounds of a column.
 *
 * @param table_col column separator elements
 * @param index     index of the column's left separator
 * @return {@code {left, right}}, taken from the {@code left} style value of
 *         separators {@code index} and {@code index + 1}
 */
private static double[] getRowCol(Elements table_col, int index) {
    String left = process(table_col.get(index).attr("style"), "left");
    String right = process(table_col.get(index + 1).attr("style"), "left");
    return new double[]{Double.parseDouble(left), Double.parseDouble(right)};
}

/**
 * Reads the numeric value of one property from an inline CSS style string.
 *
 * <p>The property name must match exactly, so asking for "top" does not pick up
 * "border-top" or "margin-top". A trailing unit such as "pt", "px" or "%" is removed;
 * a unit-less value and a last declaration without a closing ';' are both accepted.
 *
 * @param style   inline style text, e.g. {@code "top:12.5pt;left:30pt;"}
 * @param extract name of the property to read, e.g. {@code "top"}
 * @return the value without its unit (e.g. {@code "12.5"}), or {@code null} when the
 *         property is absent or has no value left after the unit is removed
 */
private static String process(String style, String extract) {
    if (style == null || extract == null) {
        return null;
    }
    for (String declaration : style.split(";")) {
        int colon = declaration.indexOf(':');
        if (colon < 0 || !declaration.substring(0, colon).trim().equals(extract)) {
            continue;
        }
        String value = declaration.substring(colon + 1).trim();
        // Drop the unit suffix, whatever its length.
        int end = value.length();
        while (end > 0 && (Character.isLetter(value.charAt(end - 1)) || value.charAt(end - 1) == '%')) {
            end--;
        }
        return end == 0 ? null : value.substring(0, end);
    }
    return null;
}

pom配置

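<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itextpdf</artifactId>
    <version>5.5.13</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.12.1</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.5</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>fontbox</artifactId>
    <version>2.0.0</version>
</dependency>
<dependency>
    <groupId>com.itextpdf.tool</groupId>
    <artifactId>xmlworker</artifactId>
    <version>5.5.11</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>ooxml-schemas</artifactId>
    <version>1.1</version>
</dependency>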