POI实现解析Word文档表格(通过行列定位),实现文本,图片读取(doc,docx版本)

奔跑的菊花茶 2018-04-08 17:39:13 4292 收藏 2
展开
package com.zsl.word.over;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.UUID;

import org.apache.poi.POIXMLDocument;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFPicture;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;

/**
* POI解析WORD文档
*
* @author 4L
*
*/
public class PoiParseWord {

/**
* Word2003版本后缀名
*/
private final static String DOC = “doc”;

/**
* Word2007以上版本后缀名
*/
private final static String DOCX = “docx”;

/**
* 系统临时文件路径
*/
private final static String SYSTEM_TEMPORARY_PATH = System.getProperty(“java.io.tmpdir”);

/**
*
*/
private OPCPackage oPCPackage;
private XWPFDocument xwpfDocument;

private FileInputStream fileInputStream;
private POIFSFileSystem poifsFileSystem;
private HWPFDocument hwpfDocument;

public PoiParseWord(File file) {
String suffixName = file.getPath().split(“\\.”)[1];
try {
if (DOCX.equals(suffixName)) {
oPCPackage = POIXMLDocument.openPackage(file.getPath());
xwpfDocument = new XWPFDocument(oPCPackage);
}
if (DOC.equals(suffixName)) {
fileInputStream = new FileInputStream(file);
poifsFileSystem = new POIFSFileSystem(fileInputStream);
hwpfDocument = new HWPFDocument(poifsFileSystem);
}
} catch (IOException e) {
e.printStackTrace();
}
}

/**
* 取出word文档表格第cellRowIdx行,第cellColIdx列的值(DOCX)
*
* @param file
* 解析文件
* @param cellRowIdx
* 行
* @param cellColIdx
* 列
*/
@SuppressWarnings(“unused”)
public String getSpecifyDataForDocx(int cellRowIdx, int cellColIdx) {
try {
// 获取页面中的表格
Iterator<XWPFTable> it = xwpfDocument.getTablesIterator();
while (it.hasNext()) {
// 循环页面中的表格
XWPFTable table = (XWPFTable) it.next();
StringBuffer str = new StringBuffer();
// 获取表格中的行
XWPFTableRow row = table.getRow(cellRowIdx);
// 获取行中共有多少列
List<XWPFTableCell> cells = row.getTableCells();
// 获取列
XWPFTableCell cell = cells.get(cellColIdx);
// 获取列中的段落
StringBuffer allFilePath = new StringBuffer();
for (int i = 0; i < cell.getParagraphs().size(); i++) {
List<XWPFRun> runs = cell.getParagraphs().get(i).getRuns();
for (int j = 0; j < runs.size(); j++) {
// 获取单个对象
XWPFRun r = runs.get(j);
String text = r.getText(r.getTextPosition());
// 如果字符为空,可能是附件一类的文件,比如图片之类的,需要另外解析,此处处理为图片
if (text == null) {
List<XWPFPicture> piclist = r.getEmbeddedPictures();
for (int k = 0; k < piclist.size(); k++) {
String filePath = SYSTEM_TEMPORARY_PATH + UUID.randomUUID() + “.jpg”;
XWPFPicture pic = piclist.get(k);
byte[] picbyte = pic.getPictureData().getData();
// 将图片写入本地文件
@SuppressWarnings(“resource”)
FileOutputStream fos = new FileOutputStream(filePath);
fos.write(picbyte);
allFilePath.append(filePath);
}
} else {
allFilePath.append(text);
}
}
}
return allFilePath.toString();
}
} catch (IOException e) {
e.printStackTrace();
}
return null;
}

/**
* 关闭
*/
public void closeForDocx() {
try {
oPCPackage.close();
} catch (IOException e) {
e.printStackTrace();
}
}

/**
* 取出word文档表格第cellRowIdx行,第cellColIdx列的值(DOC)
*
* @param file
* 解析文件
* @param cellRowIdx
* 行
* @param cellColIdx
* 列
*/
@SuppressWarnings({ “resource”, “unused” })
public String getSpecifyDataForDoc(int cellRowIdx, int cellColIdx) {
try {
Range range = hwpfDocument.getRange();
TableIterator it = new TableIterator(range);
PicturesTable picturesTable = hwpfDocument.getPicturesTable();
while (it.hasNext()) {
Table tb = (Table) it.next();
TableRow tr = tb.getRow(cellRowIdx);
TableCell td = tr.getCell(cellColIdx);
CharacterRun cr = td.getCharacterRun(0);
if (picturesTable.hasPicture(cr)) {
Picture pic = picturesTable.extractPicture(cr, true);
byte[] picbyte = pic.getContent();
String filePath = SYSTEM_TEMPORARY_PATH + UUID.randomUUID() + “.jpg”;
// 将图片写入本地文件
FileOutputStream fos = new FileOutputStream(filePath);
fos.write(picbyte);
return filePath;
} else {
for (int k = 0; k < td.numParagraphs(); k++) {
Paragraph para = td.getParagraph(k);
String s = para.text();
s = s.substring(0, s.length() – 1);
return s;
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
}
// Usage example. Row and column indexes are zero-based, so (0, 0) addresses the first row
// and first column of a table. The DOC variants only have a document to read when the
// parser was constructed from a .doc file, and the DOCX variants when it was a .docx file.
File file = new File("D:/123456.docx");
PoiParseWord poiParseWord = new PoiParseWord(file);

// Text cell (DOC)
poiParseWord.getSpecifyDataForDoc(1, 1);

// Text cell (DOCX)
poiParseWord.getSpecifyDataForDocx(1, 1);

// Picture cell: the image is written to the system temporary directory (DOC)
poiParseWord.getSpecifyDataForDoc(2, 2);

// Picture cell: the image is written to the system temporary directory (DOCX)
poiParseWord.getSpecifyDataForDocx(2, 2);

// Release the package that was opened for the .docx file.
poiParseWord.closeForDocx();

所需jar包:

poi-3.17.jar

poi-ooxml-3.17.jar

poi-scratchpad-3.17.jar

poi-ooxml-schemas-3.17.jar

参考链接:

https://blog.csdn.net/shaolon/article/details/2202111

https://blog.csdn.net/q496749021/article/details/50266893

https://blog.csdn.net/tuzongxun/article/details/51992704
————————————————
版权声明:本文为CSDN博主「奔跑的菊花茶」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/ayumu5566/article/details/79855314

版权声明:本文为aademeng原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/aademeng/articles/13049875.html