Java 文件处理系列之：word转pdf

日常开发中，word转pdf是较为常见的需求，尤其是前端上传word文档后需要在页面预览的场景。前端直接预览word需要特殊处理，而如果由后端先把word转为pdf再预览，就会简单很多。

效果预览：

原始word文件.docx

转换之后的pdf文件.pdf

接下来就分享实测过的实现方式。

环境：JDK11、Springboot 2.3.7.RELEASE、windows10、Maven

第一步，Maven 依赖配置，主要导入一些工具包

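<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <optional>true</optional>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.4</version>
    </dependency>
    <dependency>
        <groupId>com.deepoove</groupId>
        <artifactId>poi-tl</artifactId>
        <version>1.10.2</version>
    </dependency>
    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>fr.opensagres.poi.xwpf.converter.pdf</artifactId>
        <version>2.0.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-scratchpad</artifactId>
        <version>4.1.2</version>
    </dependency>
    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>fr.opensagres.poi.xwpf.converter.core</artifactId>
        <version>2.0.2</version>
    </dependency>
    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
        <version>2.0.2</version>
    </dependency>
    <dependency>
        <groupId>com.itextpdf</groupId>
        <artifactId>itextpdf</artifactId>
        <version>5.5.13.2</version>
    </dependency>
    <dependency>
        <groupId>com.itextpdf.tool</groupId>
        <artifactId>xmlworker</artifactId>
        <version>5.5.13.2</version>
    </dependency>
    <dependency>
        <groupId>com.itextpdf</groupId>
        <artifactId>itext-asian</artifactId>
        <version>5.2.0</version>
    </dependency>
    <dependency>
        <groupId>com.itextpdf</groupId>
        <artifactId>html2pdf</artifactId>
        <version>4.0.1</version>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.14.3</version>
    </dependency>
</dependencies>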

第二步，service 业务层构造

package com.yalin.cn.fileutil.word.service;

import java.io.InputStream;
import java.io.OutputStream;

/**
 * Contract for converting a Word (.docx) document into a PDF.
 *
 * <p>Three overloads are offered so callers can supply the source and the
 * target either as file-system paths or as already-open streams.
 *
 * @author lyl
 * @since 2021-05-08
 */
public interface IWordConvertPdfService {

    /**
     * Converts a .docx file on disk into a PDF file on disk.
     *
     * @param sourcePath path of the Word document to read
     * @param targetPath path of the PDF to write
     * @param imageDir   temporary directory for the images contained in the Word document
     * @return {@code true} when the conversion succeeded, {@code false} otherwise
     */
    boolean convert(String sourcePath, String targetPath, String imageDir);

    /**
     * Converts a .docx stream into a PDF file on disk.
     *
     * @param in         stream delivering the Word document
     * @param targetPath path of the PDF to write
     * @param imageDir   temporary directory for the images contained in the Word document
     * @return {@code true} when the conversion succeeded, {@code false} otherwise
     */
    boolean convert(InputStream in, String targetPath, String imageDir);

    /**
     * Converts a .docx stream into a PDF stream.
     *
     * @param in       stream delivering the Word document
     * @param out      stream receiving the PDF
     * @param imageDir temporary directory for the images contained in the Word document
     * @return {@code true} when the conversion succeeded, {@code false} otherwise
     * @throws Exception when the conversion cannot be attempted
     */
    boolean convert(InputStream in, OutputStream out, String imageDir) throws Exception;
}

第三步，service impl 业务实现层构造

package com.yalin.cn.fileutil.word.service.impl;

import com.yalin.cn.fileutil.util.OfficeUtil;
import com.yalin.cn.fileutil.word.service.IWordConvertPdfService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;

import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Objects;

/**
 * Default {@link IWordConvertPdfService} implementation; the actual conversion
 * is delegated to {@link OfficeUtil#docxConvertPdf(InputStream, OutputStream, String)}.
 *
 * @author lyl
 * @since 2021-05-08
 */
@Service
@Slf4j
public class WordConvertPdfServiceImpl implements IWordConvertPdfService {

    /**
     * Converts a .docx file on disk into a PDF file on disk.
     *
     * <p>Both files are opened here and closed again when the method returns.
     *
     * @param sourcePath path of the Word document to read
     * @param targetPath path of the PDF to write
     * @param imageDir   temporary directory for the images contained in the Word document
     * @return {@code true} when the conversion succeeded; {@code false} when it failed
     *         or when either file could not be opened
     */
    @Override
    public boolean convert(String sourcePath, String targetPath, String imageDir) {
        try (InputStream inputStream = Files.newInputStream(Paths.get(sourcePath));
             OutputStream outputStream = Files.newOutputStream(Paths.get(targetPath))) {
            return convert(inputStream, outputStream, imageDir);
        } catch (Exception e) {
            // The throwable is passed as the last argument so the logger prints its stack trace;
            // a "{}" placeholder would not be substituted for a lone Throwable.
            log.error("convert(String, String, String)异常", e);
        }
        return false;
    }

    /**
     * Converts a .docx stream into a PDF file on disk.
     *
     * <p>The target file is opened here and closed again when the method returns.
     *
     * @param in         stream delivering the Word document
     * @param targetPath path of the PDF to write
     * @param imageDir   temporary directory for the images contained in the Word document
     * @return {@code true} when the conversion succeeded; {@code false} when it failed
     *         or when the target file could not be opened
     */
    @Override
    public boolean convert(InputStream in, String targetPath, String imageDir) {
        try (OutputStream outputStream = Files.newOutputStream(Paths.get(targetPath))) {
            return convert(in, outputStream, imageDir);
        } catch (Exception e) {
            log.error("convert(InputStream, String, String)异常", e);
        }
        return false;
    }

    /**
     * Converts a .docx stream into a PDF stream.
     *
     * @param in       stream delivering the Word document; must not be {@code null}
     * @param out      stream receiving the PDF; must not be {@code null}
     * @param imageDir temporary directory for the images contained in the Word document
     * @return the outcome reported by {@link OfficeUtil#docxConvertPdf}, or {@code false}
     *         when the conversion threw
     * @throws Exception when {@code in} or {@code out} is {@code null}
     */
    @Override
    public boolean convert(InputStream in, OutputStream out, String imageDir) throws Exception {
        if (Objects.isNull(in)) {
            throw new Exception("模板文件流为null！");
        }
        if (Objects.isNull(out)) {
            throw new Exception("目标文件流为null！");
        }
        try {
            // OfficeUtil reports failure through its return value rather than by throwing,
            // so that value has to be propagated instead of assuming success.
            return OfficeUtil.docxConvertPdf(in, out, imageDir);
        } catch (Exception e) {
            log.error("convert(InputStream, OutputStream, String)异常", e);
        }
        return false;
    }
}

第四步，真正实现转换的工具类

package com.yalin.cn.fileutil.util;

import com.itextpdf.text.*;
import com.itextpdf.text.pdf.BaseFont;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.yalin.cn.fileutil.font.AutoFontFactory;
import fr.opensagres.poi.xwpf.converter.core.BasicURIResolver;
import fr.opensagres.poi.xwpf.converter.core.FileImageExtractor;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.jsoup.select.Elements;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Objects;

/**
 * Word-to-PDF conversion helpers.
 *
 * <p>The conversion runs in three stages: .docx to XHTML, normalisation of
 * that XHTML with jsoup, and rendering of the XHTML into a PDF with iText.
 *
 * @author lyl
 * @since 2021-04-23
 */
@Slf4j
public class OfficeUtil {

    /**
     * Font provider handed to the XHTML parser. It ignores the requested font
     * name and always answers with the font supplied by {@link AutoFontFactory},
     * sized, styled and coloured as requested. It holds no state, so a single
     * instance is shared by all conversions.
     */
    private static final FontProvider FONT_PROVIDER = new FontProvider() {
        @Override
        public boolean isRegistered(String fontName) {
            return false;
        }

        @Override
        public Font getFont(String fontName, String encoding, boolean embedded,
                            float size, int style, BaseColor color) {
            try {
                BaseFont baseFont = AutoFontFactory.getBaseFont();
                return new Font(baseFont, size, style, color);
            } catch (IOException | DocumentException e) {
                // Fail with the real cause instead of handing a null font to the renderer.
                throw new IllegalStateException("Unable to load the base font for PDF rendering", e);
            }
        }
    };

    /**
     * Converts a .docx document into (X)HTML.
     *
     * <p>The input stream is always closed before this method returns,
     * whether or not the conversion succeeded.
     *
     * @param in       stream delivering the .docx document
     * @param imageDir directory in which images embedded in the document are stored;
     *                 when {@code null} no image extraction is configured and the
     *                 converter's default options are used
     * @return the generated HTML, decoded as UTF-8
     * @throws Exception when the document cannot be read or converted
     */
    public static String docx2Html(InputStream in, String imageDir) throws Exception {
        // try-with-resources closes the buffer, the document and the caller's stream
        // (in that order) on every exit path.
        try (InputStream source = in;
             XWPFDocument document = new XWPFDocument(source);
             ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
            XHTMLOptions options = XHTMLOptions.create();
            if (Objects.nonNull(imageDir)) {
                // Extract embedded images into imageDir and resolve image URIs against it.
                options.setExtractor(new FileImageExtractor(new File(imageDir)));
                options.URIResolver(new BasicURIResolver(imageDir));
                options.setIgnoreStylesIfUnused(false);
                options.setFragment(true);
            }
            XHTMLConverter.getInstance().convert(document, buffer, options);
            return new String(buffer.toByteArray(), StandardCharsets.UTF_8);
        }
    }

    /**
     * Normalises HTML with jsoup so that it can be parsed as XHTML.
     *
     * <p>The {@code style} attribute of every {@code div} whose style mentions
     * {@code width} is blanked, and the document is serialised with XML syntax
     * (closed tags) and XHTML entity escaping.
     *
     * @param html HTML content
     * @return the normalised HTML
     */
    private static String formatHtml(String html) {
        org.jsoup.nodes.Document doc = Jsoup.parse(html);
        // Drop explicit widths, which can be wider than the PDF page.
        Elements divs = doc.select("div");
        for (Element div : divs) {
            String divStyle = div.attr("style");
            if (StringUtils.isNotEmpty(divStyle) && divStyle.contains("width")) {
                div.attr("style", "");
            }
        }
        // Have jsoup emit well-formed, closed tags.
        doc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
        doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        return doc.html();
    }

    /**
     * Renders XHTML into an A4 PDF.
     *
     * @param html XHTML content; must not be {@code null}
     * @param out  stream receiving the PDF
     * @throws Exception when the PDF cannot be produced
     */
    public static void htmlToPdf(String html, OutputStream out) throws Exception {
        Objects.requireNonNull(html, "html");
        Document document = new Document(PageSize.A4);
        boolean rendered = false;
        try {
            PdfWriter writer = PdfWriter.getInstance(document, out);
            document.open();
            try (InputStream xhtml = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8))) {
                XMLWorkerHelper.getInstance()
                        .parseXHtml(writer, document, xhtml, StandardCharsets.UTF_8, FONT_PROVIDER);
            }
            rendered = true;
        } finally {
            closeDocument(document, rendered);
        }
    }

    /**
     * Closes the PDF document without letting a close-time error hide the
     * failure that interrupted rendering.
     *
     * @param document the document to close
     * @param rendered whether rendering completed; only then is a close error rethrown
     */
    private static void closeDocument(Document document, boolean rendered) {
        try {
            document.close();
        } catch (RuntimeException e) {
            if (rendered) {
                throw e;
            }
            // Rendering already failed; that exception is the one the caller needs to see.
            log.warn("Closing the PDF document failed after an earlier rendering error", e);
        }
    }

    /**
     * Converts a .docx document into a PDF.
     *
     * <p>Failures are logged and reported through the return value; this
     * method does not throw for conversion errors.
     *
     * @param in       stream delivering the .docx document; closed by the conversion
     * @param out      stream receiving the PDF
     * @param imageDir directory in which images embedded in the document are stored
     * @return {@code true} when the PDF was written, {@code false} when any stage failed
     */
    public static boolean docxConvertPdf(InputStream in, OutputStream out, String imageDir) {
        try {
            String docxHtml = formatHtml(docx2Html(in, imageDir));
            htmlToPdf(docxHtml, out);
            return true;
        } catch (Exception e) {
            log.error("docx to pdf conversion failed", e);
        }
        return false;
    }
}

备注：OfficeUtil中用到的AutoFontFactory是自定义的字体工厂类。因为linux环境下缺少某些中文字体，会导致生成的pdf中文乱码。解决方案之一，就是从windows字体库中复制一个字体文件，放到resources目录下，在代码中引用即可。

package com.yalin.cn.fileutil.font;

import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.BaseFont;

import java.io.IOException;

/**
 * Font factory supplying the base font used when rendering PDF text.
 *
 * @author lyl
 * @since 2022-01-17
 */
public class AutoFontFactory {

    /**
     * Creates the base font from the {@code /font/simsun.ttc} resource,
     * using the {@code IDENTITY_H} encoding and embedding the font in the PDF.
     *
     * <p>Alternatives to the bundled resource font:
     * <ul>
     *   <li>a font installed on the local machine, e.g.
     *       {@code BaseFont.createFont("C:/Windows/Fonts/seguisym.ttf", BaseFont.IDENTITY_H, BaseFont.EMBEDDED)}
     *       (the font must exist locally);</li>
     *   <li>the iTextAsian jar, which needs no font file:
     *       {@code BaseFont.createFont("STSong-Light", "UniGB-UCS2-H", BaseFont.EMBEDDED)}.</li>
     * </ul>
     *
     * @return the base font
     * @throws IOException       when the font cannot be read
     * @throws DocumentException when the font cannot be created
     */
    public static BaseFont getBaseFont() throws IOException, DocumentException {
        // The font file has to be available as a resource under /font.
        return BaseFont.createFont("/font/simsun.ttc,0", BaseFont.IDENTITY_H, BaseFont.EMBEDDED);
    }
}

测试类

@Test void wordConvertPdf() { String basePath = "C:\Users\lyl\Desktop\"; String sourcePath = basePath + "原始word文件.docx"; String targetPath = basePath + "转换之后的pdf文件.pdf"; String imagePath = basePath + "img" + File.separator; WordConvertPdfServiceImpl tt = new WordConvertPdfServiceImpl(); boolean flag = tt.convert(sourcePath, targetPath, imagePath); System.out.println(flag); }