日常开发中,Word 转 PDF 是较为常见的操作,尤其是前端上传 Word 文档后需要在页面上预览的场景。前端直接预览 Word 需要特殊处理,但如果由后端先把 Word 转为 PDF 再预览,就会比较简单。
效果预览:
原始word文件.docx
转换之后的pdf文件.pdf
接下来就分享实测过的实现方式。
环境:JDK 11、Spring Boot 2.3.7.RELEASE、Windows 10、Maven
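- 第一步,Maven 依赖配置,主要导入一些工具包(从下文代码的 import 可以看出,需要 iText 5 的 itextpdf 和 xmlworker、fr.opensagres 的 poi.xwpf.converter.xhtml、Apache POI、jsoup、commons-lang3 以及 Lombok)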
- 第二步,service 业务层构造
package com.yalin.cn.fileutil.word.service;
import java.io.InputStream;
import java.io.OutputStream;
/**
 * @description: Service contract for converting Word (docx) documents to PDF.
 * @author: lyl
 * @create: 2021-05-08 16:31:47
 **/
public interface IWordConvertPdfService {

    /**
     * Converts a docx file to a PDF file, both addressed by path.
     *
     * @param sourcePath path of the Word document to read
     * @param targetPath path of the PDF to write
     * @param imageDir   directory used as temporary storage for images found in the Word document
     * @return whether the conversion succeeded
     */
    boolean convert(String sourcePath, String targetPath, String imageDir);

    /**
     * Converts a docx stream to a PDF file addressed by path.
     *
     * @param in         stream supplying the Word document
     * @param targetPath path of the PDF to write
     * @param imageDir   directory used as temporary storage for images found in the Word document
     * @return whether the conversion succeeded
     */
    boolean convert(InputStream in, String targetPath, String imageDir);

    /**
     * Converts a docx stream to a PDF stream.
     *
     * @param in       stream supplying the Word document
     * @param out      stream receiving the PDF
     * @param imageDir directory used as temporary storage for images found in the Word document
     * @return whether the conversion succeeded
     * @throws Exception if the conversion cannot be attempted
     */
    boolean convert(InputStream in, OutputStream out, String imageDir) throws Exception;
}
- 第三步,service impl 业务实现层构造
package com.yalin.cn.fileutil.word.service.impl;
import com.yalin.cn.fileutil.util.OfficeUtil;
import com.yalin.cn.fileutil.word.service.IWordConvertPdfService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Objects;
/**
* @description: word生成pdf
* @author: lyl
* @create: 2021-05-08 16:31:47
**/
@Service
@Slf4j
public class WordConvertPdfServiceImpl implements IWordConvertPdfService {
/**
* docx 转pdf
*
* @param sourcePath word路径
* @param targetPath pdf路径
* @param imageDir word中的图片临时存放路径
* @return boolean
*/
@Override
public boolean convert(String sourcePath, String targetPath, String imageDir) {
try (InputStream inputStream = Files.newInputStream(Paths.get(sourcePath));
OutputStream outputStream = Files.newOutputStream(Paths.get(targetPath))) {
return convert(inputStream, outputStream, imageDir);
} catch (Exception e) {
log.error("convert(String, String, String)异常:{}", e);
}
return false;
}
/**
* docx 转pdf
*
* @param in word文件流
* @param targetPath pdf路径
* @param imageDir word中的图片临时存放路径
* @return boolean
*/
@Override
public boolean convert(InputStream in, String targetPath, String imageDir) {
try (OutputStream outputStream = Files.newOutputStream(Paths.get(targetPath))) {
return convert(in, outputStream, imageDir);
} catch (Exception e) {
log.error("convert(String, String, String)异常:{}", e);
}
return false;
}
/**
* docx 转pdf
*
* @param in word文件流
* @param out pdf文件流
* @param imageDir word中的图片临时存放路径
* @return boolean
*/
@Override
public boolean convert(InputStream in, OutputStream out, String imageDir) throws Exception {
if (Objects.isNull(in)) {
throw new Exception("模板文件流为null!");
}
if (Objects.isNull(out)) {
throw new Exception("目标文件流为null!");
}
try {
// word转pdf
OfficeUtil.docxConvertPdf(in, out, imageDir);
return true;
} catch (Exception e) {
log.error("fill(InputStream, OutputStream, String)异常:{}", e);
}
return false;
}
}
- 第四步,真正实现转换的工具类
package com.yalin.cn.fileutil.util;
import com.itextpdf.text.*;
import com.itextpdf.text.pdf.BaseFont;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.yalin.cn.fileutil.font.AutoFontFactory;
import fr.opensagres.poi.xwpf.converter.core.BasicURIResolver;
import fr.opensagres.poi.xwpf.converter.core.FileImageExtractor;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.jsoup.select.Elements;
import java.io.*;
import java.nio.charset.Charset;
import java.util.Objects;
/**
 * @description: Word (docx) to PDF helpers. The pipeline is docx -> XHTML -> normalized XHTML -> PDF.
 *               No logger is imported in this file, so swallowed exceptions are still reported
 *               with printStackTrace as before.
 * @author: lyl
 * @create: 2021-04-23 11:09:51
 **/
public class OfficeUtil {

    /** Charset of the intermediate XHTML, used for both encoding and decoding. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /**
     * Converts a docx document to XHTML.
     *
     * <p>The input stream and the parsed document are always closed before returning.
     *
     * @param in       docx input stream
     * @param imageDir directory where images contained in the docx are stored; when {@code null}
     *                 no image extractor / URI resolver is configured and the converter options
     *                 are left at their defaults
     * @return the XHTML produced by the converter, decoded as UTF-8
     * @throws Exception wrapping any failure raised while loading or converting the document
     */
    public static String docx2Html(InputStream in, String imageDir) throws Exception {
        XWPFDocument document = null;
        try {
            // 1> Load the docx into the POI document model.
            document = new XWPFDocument(in);

            // 2> Converter options; the extractor and resolver decide where images are written
            //    and how they are referenced from the generated markup.
            XHTMLOptions options = XHTMLOptions.create();
            if (Objects.nonNull(imageDir)) {
                options.setExtractor(new FileImageExtractor(new File(imageDir)));
                options.URIResolver(new BasicURIResolver(imageDir));
                options.setIgnoreStylesIfUnused(false);
                options.setFragment(true);
            }

            // 3> Render the document as XHTML into memory.
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            XHTMLConverter.getInstance().convert(document, baos, options);
            return new String(baos.toByteArray(), UTF_8);
        } catch (Exception e) {
            // Rethrown with the cause attached; the caller decides how to report it, so the
            // stack trace is not printed here as well.
            throw new Exception("Failed to convert docx to XHTML", e);
        } finally {
            closeQuietly(document);
            closeQuietly(in);
        }
    }

    /**
     * Normalizes HTML with jsoup so that it can be fed to the XML-based PDF renderer.
     *
     * <p>The {@code style} attribute of the document root and of every {@code div} is emptied
     * when it mentions {@code width}; output is serialized with XML syntax and XHTML escaping.
     *
     * @param html HTML content
     * @return normalized HTML
     */
    private static String formatHtml(String html) {
        org.jsoup.nodes.Document doc = Jsoup.parse(html);

        // Drop styles that set a width (the converter can emit widths that are too large).
        clearStyleIfItMentionsWidth(doc);
        Elements divs = doc.select("div");
        for (Element div : divs) {
            clearStyleIfItMentionsWidth(div);
        }

        // Make jsoup emit closed tags.
        doc.outputSettings()
                .syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml)
                .escapeMode(Entities.EscapeMode.xhtml);
        return doc.html();
    }

    /** Empties the element's {@code style} attribute when it is non-empty and contains "width". */
    private static void clearStyleIfItMentionsWidth(Element element) {
        String style = element.attr("style");
        if (StringUtils.isNotEmpty(style) && style.contains("width")) {
            element.attr("style", "");
        }
    }

    /**
     * Renders XHTML into a PDF on an A4 page.
     *
     * <p>NOTE(review): closing the iText document presumably also closes {@code out} (the
     * writer's default) - confirm if callers need to keep using the stream.
     *
     * @param html XHTML content
     * @param out  stream receiving the PDF
     * @throws Exception wrapping any failure raised while rendering; a failure raised while
     *                   closing the document after such an error is attached as suppressed
     */
    public static void htmlToPdf(String html, OutputStream out) throws Exception {
        // The "paper".
        Document document = new Document(PageSize.A4);
        Exception failure = null;
        try {
            // The "pen".
            PdfWriter writer = PdfWriter.getInstance(document, out);
            document.open();
            ByteArrayInputStream bais = new ByteArrayInputStream(html.getBytes(UTF_8));
            XMLWorkerHelper.getInstance().parseXHtml(writer, document, bais, UTF_8, new EmbeddedFontProvider());
        } catch (Exception e) {
            failure = new Exception("Failed to render XHTML to PDF", e);
            throw failure;
        } finally {
            try {
                document.close();
            } catch (RuntimeException closeError) {
                // An exception thrown from finally would replace the one already in flight;
                // keep the original failure as the primary error.
                if (failure == null) {
                    throw closeError;
                }
                failure.addSuppressed(closeError);
            }
        }
    }

    /**
     * Converts docx to PDF.
     *
     * @param in       docx input stream
     * @param out      PDF output stream
     * @param imageDir directory where images contained in the docx are stored
     * @return {@code true} on success; {@code false} if any step threw (the exception is
     *         printed to standard error)
     */
    public static boolean docxConvertPdf(InputStream in, OutputStream out, String imageDir) {
        try {
            String docxHtml = formatHtml(docx2Html(in, imageDir));
            htmlToPdf(docxHtml, out);
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
    }

    /** Closes the resource if present; a failure while closing is reported but not propagated. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Font provider that answers every request with the font from {@link AutoFontFactory}. */
    private static final class EmbeddedFontProvider implements FontProvider {

        @Override
        public boolean isRegistered(String s) {
            return false;
        }

        @Override
        public Font getFont(String s, String s1, boolean embedded, float size, int style, BaseColor baseColor) {
            try {
                BaseFont bf = AutoFontFactory.getBaseFont();
                return new Font(bf, size, style, baseColor);
            } catch (Exception e) {
                // NOTE(review): returns null when the font cannot be loaded, as before;
                // the renderer's handling of a null font is not visible here - confirm.
                e.printStackTrace();
                return null;
            }
        }
    }
}
备注:OfficeUtil 中用到的 AutoFontFactory 是自定义的字体工厂类。Linux 环境下通常缺少所需的中文字体,会导致生成的 PDF 出现中文乱码。解决方案之一,是从 Windows 字体库中复制一个字体文件(本例为 simsun.ttc),放到 resources 目录下(本例路径为 /font/simsun.ttc),再在代码中引用即可。
package com.yalin.cn.fileutil.font;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.BaseFont;
import java.io.IOException;
/**
 * @description: Font factory that supplies the base font used when rendering PDFs.
 * @author: lyl
 * @create: 2022-01-17 15:38:29
 **/
public class AutoFontFactory {

    /**
     * Location of the bundled font.
     * NOTE(review): the ",0" suffix presumably selects the first face of the .ttc collection - confirm.
     */
    private static final String RESOURCE_FONT = "/font/simsun.ttc,0";

    /**
     * Creates the base font.
     *
     * <p>Option 1 (used here): a font shipped as a resource (the font file must be present).
     *
     * <p>Option 2: a font installed on the local machine (it must exist locally), e.g.
     * {@code BaseFont.createFont("C:/Windows/Fonts/seguisym.ttf", BaseFont.IDENTITY_H, BaseFont.EMBEDDED)}.
     *
     * <p>Option 3: the iTextAsian jar, which then is the only extra artifact needed, e.g.
     * {@code BaseFont.createFont("STSong-Light", "UniGB-UCS2-H", BaseFont.EMBEDDED)}.
     *
     * @return BaseFont
     * @throws IOException       declared by {@code BaseFont.createFont}
     * @throws DocumentException declared by {@code BaseFont.createFont}
     */
    public static BaseFont getBaseFont() throws IOException, DocumentException {
        return BaseFont.createFont(RESOURCE_FONT, BaseFont.IDENTITY_H, BaseFont.EMBEDDED);
    }
}
测试类
@Test
void wordConvertPdf() {
String basePath = "C:\Users\lyl\Desktop\";
String sourcePath = basePath + "原始word文件.docx";
String targetPath = basePath + "转换之后的pdf文件.pdf";
String imagePath = basePath + "img" + File.separator;
WordConvertPdfServiceImpl tt = new WordConvertPdfServiceImpl();
boolean flag = tt.convert(sourcePath, targetPath, imagePath);
System.out.println(flag);
}