Java word转html 支持(doc,docx) 完美保留格式与图片,处理图片路径。

您所在的位置:网站首页 如何把word图片转换成图片格式 Java word转html 支持(doc,docx) 完美保留格式与图片,处理图片路径。

Java word转html 支持(doc,docx) 完美保留格式与图片,处理图片路径。

2024-07-07 20:28| 来源: 网络整理| 查看: 265

将Word2003、Word2007文档转换成Html。

maven:

UTF-8 1.0.6 fr.opensagres.xdocreport fr.opensagres.xdocreport.document ${xdocreport-version} fr.opensagres.xdocreport org.apache.poi.xwpf.converter.xhtml ${xdocreport-version} fr.opensagres.xdocreport org.apache.poi.xwpf.converter.pdf ${xdocreport-version} org.apache.poi poi 3.13 org.apache.poi poi-scratchpad 3.13

package com.hessianhealth.system.utils.wkhtmltopdfUtils;

import java.io.*;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

/**
 * Converts Word documents (.doc and .docx) to HTML.
 *
 * <p>Both converters are demo-style entry points: the input/output locations are hard-coded
 * inside each method.
 *
 * @author lgn
 */
public class WordToHtml {

    /**
     * Converts a Word 2007+ (.docx) document to HTML.
     *
     * <p>Reads {@code XX.docx} from the hard-coded desktop folder and writes {@code XX.html}
     * next to it. The same folder is handed to the URI resolver and the image extractor.
     * If the file is missing or is not a .docx file, a message is printed and nothing is
     * converted.
     *
     * <p>This method returns normally instead of terminating the JVM, so it can be called
     * from other code.
     *
     * @throws Exception if the document cannot be read or the conversion fails
     */
    public static void word2007ToHtml() throws Exception {
        String filePath = "C:\\Users\\XX\\Desktop\\";
        String fileName = "XX.docx";
        String htmlName = "XX.html";
        final String file = filePath + fileName;

        File f = new File(file);
        if (!f.exists()) {
            System.out.println("Sorry File does not Exists!");
            return;
        }

        // Accept only .docx files. regionMatches(ignoreCase = true, ...) compares the last
        // five characters case-insensitively and simply returns false for shorter names.
        String name = f.getName();
        if (!name.regionMatches(true, name.length() - 5, ".docx", 0, 5)) {
            System.out.println("Enter only as MS Office 2007+ files");
            return;
        }

        // try-with-resources guarantees both streams are closed even when conversion fails.
        try (FileInputStream in = new FileInputStream(f)) {
            // 1) Load the Word file into an XWPFDocument.
            XWPFDocument document = new XWPFDocument(in);

            // 2) Prepare the XHTML options (the IURIResolver set here determines the
            //    directory where images are stored).
            File imageFolderFile = new File(filePath);
            XHTMLOptions options =
                    XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));
            options.setExtractor(new FileImageExtractor(imageFolderFile));
            options.setIgnoreStylesIfUnused(false);
            options.setFragment(true);

            // 3) Convert the XWPFDocument to XHTML. The output file is opened only after the
            //    document loaded successfully, so a failed load leaves no empty HTML file.
            try (FileOutputStream out = new FileOutputStream(new File(filePath + htmlName))) {
                XHTMLConverter.getInstance().convert(document, out, options);
            }
        }
    }

    /**
     * Converts a Word 2003 (.doc) document to HTML.
     *
     * <p>Reads {@code CC.doc} from the hard-coded desktop folder and writes {@code CC.html}
     * next to it. Embedded pictures are written to an {@code image} sub-folder, and the path
     * of each written file is returned to the converter as that picture's location.
     *
     * @throws Exception if the document cannot be read, converted or written
     */
    public static void word2003ToHtml() throws Exception {
        String filePath = "C:\\Users\\XX\\Desktop\\";
        String fileName = "CC.doc";
        String htmlName = "CC.html";
        final String imagePath = filePath + "/image/";
        final String file = filePath + fileName;

        // The input stream stays open for the whole conversion and is always closed afterwards.
        try (InputStream input = new FileInputStream(new File(file))) {
            HWPFDocument wordDocument = new HWPFDocument(input);
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());

            // Decide where extracted pictures are stored.
            wordToHtmlConverter.setPicturesManager(new PicturesManager() {
                @Override
                public String savePicture(byte[] content, PictureType pictureType,
                        String suggestedName, float widthInches, float heightInches) {
                    File imgPath = new File(imagePath);
                    if (!imgPath.exists()) {
                        // Create the image directory on first use.
                        imgPath.mkdirs();
                    }
                    File picture = new File(imagePath + suggestedName);
                    try (FileOutputStream os = new FileOutputStream(picture)) {
                        os.write(content);
                    } catch (IOException e) {
                        // Best effort: a picture that cannot be written is reported but does
                        // not abort the conversion. FileNotFoundException is an IOException,
                        // so this single handler covers both original cases.
                        e.printStackTrace();
                    }
                    return imagePath + suggestedName;
                }
            });

            // Parse the Word document into the converter's DOM.
            wordToHtmlConverter.processDocument(wordDocument);
            Document htmlDocument = wordToHtmlConverter.getDocument();

            // Serialize the DOM as indented UTF-8 HTML.
            File htmlFile = new File(filePath + htmlName);
            try (FileOutputStream outStream = new FileOutputStream(htmlFile)) {
                DOMSource domSource = new DOMSource(htmlDocument);
                StreamResult streamResult = new StreamResult(outStream);

                TransformerFactory factory = TransformerFactory.newInstance();
                Transformer serializer = factory.newTransformer();
                serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
                serializer.setOutputProperty(OutputKeys.INDENT, "yes");
                serializer.setOutputProperty(OutputKeys.METHOD, "html");
                serializer.transform(domSource, streamResult);
            }
        }
    }

    /**
     * Runs the .doc conversion and prints the stack trace of any failure.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            word2003ToHtml();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

docx(Word 2007)转换成html时,如果文档里有图片,生成html时的图片路径处理会非常麻烦:转换会额外生成一个存放图片的文件夹。如果需要在线预览服务器上的html,可以把图片直接转成base64格式嵌入到html中:

package com.hessianhealth.system.utils.wkhtmltopdfUtils; import com.itextpdf.html2pdf.jsoup.Jsoup; import com.itextpdf.html2pdf.jsoup.nodes.Document; import com.itextpdf.html2pdf.jsoup.nodes.Element; import com.itextpdf.html2pdf.jsoup.select.Elements; import org.apache.commons.net.util.Base64; import org.apache.poi.xwpf.converter.core.BasicURIResolver; import org.apache.poi.xwpf.converter.core.FileImageExtractor; import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter; import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFPictureData; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.TransformerException; import java.io.*; import java.util.ArrayList; import java.util.List; /** * 2007转换成html * 对于docx,可以用下面这种方式: * @throws Exception */ public static void docxToHtml() throws Exception { String filePath = "C:\\Users\\XX\\Desktop\\"; String fileName = "可视化问题1.5.docx"; String htmlName = "可视化问题1.5.html"; String htmlPath=filePath + htmlName; String docPath=filePath + fileName; InputStream input = new FileInputStream(new File(docPath)); XWPFDocument document = new XWPFDocument(input); List list = document.getAllPictures(); ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); XHTMLConverter.getInstance().convert(document, outputStream, null); String s = new String(outputStream.toByteArray()); s = setImg(s, list); System.out.println(s); PrintStream printStream = new PrintStream(new FileOutputStream(htmlPath)); //将HTML文件内容写入文件中 printStream.println(s.toString()); document.close(); } /** * 处理文档中的图片,转化成字节流支持在线预览 * @param html * @param list * @return */ private static String setImg(String html, List list){ Document doc = Jsoup.parse(html); Elements elements = doc.getElementsByTag("img"); if (elements != null && elements.size() > 0 && list != null){ for(Element element : elements){ String src = element.attr("src"); for (XWPFPictureData data: 
list){ if (src.contains(data.getFileName())){ String type = src.substring(src.lastIndexOf(".") + 1); String base64 = "data:image/" + type + ";base64," + new String(Base64.encodeBase64(data.getData())); element.attr("src", base64); break; } } } } return doc.toString(); } public static void main(String[] args) { try { docxToHtml(); } catch (Exception e) { e.printStackTrace(); } }


【本文地址】


今日新闻


推荐新闻


CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3