关键词匹配实体类(TextConfig.java):
/**
 * Keyword-matching settings used to locate a range of text inside a document.
 *
 * <p>Accessors, {@code equals}/{@code hashCode} and {@code toString} are generated by Lombok.
 */
@EqualsAndHashCode(callSuper = false)
@Data
public class TextConfig implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Start keyword(s); multiple values are comma-separated. */
    private String textStart;

    /** End keyword(s); multiple values are comma-separated. */
    private String textEnd;

    /** Keyword(s) that must be present; multiple values are comma-separated. */
    private String textInclude;

    /** Keyword(s) that must be absent; multiple values are comma-separated. */
    private String textExclude;
}
import com.aspose.words.*;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.*;
@EqualsAndHashCode(callSuper = false)
@Slf4j
@Data
public class WordResolve extends Document {
/**
* 查找文本类型另存为word
* @param filePathName 文件保存路径
* @param copyFirst 是否复制关键词开始节点
* @param copyLast 是否复制关键词结束节点
*/
@SneakyThrows
public File findBetweenFile(TextConfig textConfig, String filePathName, boolean copyFirst, boolean copyLast) {
List<Paragraph> paragraphs = getAllParagraph();
Integer[] sec = findBetweenIndex(textConfig,getAllText());
if(sec == null){
return null;
}
Document doc = new Document();
Body body = doc.getFirstSection().getBody();
body.removeAllChildren();
NodeImporter importer = new NodeImporter(this, doc, ImportFormatMode.KEEP_SOURCE_FORMATTING);
Paragraph first = paragraphs.get(sec[0]);
Paragraph last = paragraphs.get(sec[1]);
List<CompositeNode<?>> parentNodes = new ArrayList<>(20);
boolean startCopying = false;
//读取文档的所有节点
NodeCollection<?> allNodeList = this.getChildNodes(NodeType.ANY, true);
for (int i = 0, j = allNodeList.getCount(); i < j; i++) {
Node node = allNodeList.get(i);
try{
if (node == first) {
startCopying = true;
if(!copyFirst){
continue;
}
}
if (node == last && !copyLast) {
// 到达结束节点后停止复制
break;
}
if (startCopying) {
boolean append = true;
for (CompositeNode<?> parentNode : parentNodes) {
NodeCollection<?> childNodes = parentNode.getChildNodes(node.getNodeType(), true);
if(childNodes.contains(node)){
append = false;
break;
}
}
if(append){
try{
body.appendChild(importer.importNode(node, true));
}catch (Exception e){
log.error("插入节点出错:{}",e.getMessage());
//ignore
}
}
}
if (node == last) {
// 到达结束节点后停止复制
break;
}
}finally {
if(startCopying && node.isComposite()){
CompositeNode<?> compositeNode = (CompositeNode<?>) node;
if(!parentNodes.contains(compositeNode)){
parentNodes.add(compositeNode);
}
}
}
}
File file = FileUtils.getFile(filePathName);
doc.save(filePathName);
return file;
}
/**
* 查找文本类型解析规则的开始结束段落索引
* @param strings word全部段落,每个段落的文本
*/
public Integer[] findBetweenIndex(TextConfig textConfig, List<String> strings){
String textStart = textConfig.getTextStart();
String textEnd = textConfig.getTextEnd();
//规定开始关键词必须包含哪些文字,多个逗号分隔(作为附加判断,可为空)
textInclude = StringUtil.defaultString(textConfig.getTextInclude(), "").replace(",", "");
//规定结束关键词必须不包含哪些文字,多个逗号分隔(作为附加判断,可为空)
textExclude = StringUtil.defaultString(textConfig.getTextExclude(), "").replace(",", "");
String[] in = StringUtil.isBlank(textInclude) ? null : textInclude.split(",");
String[] out = StringUtil.isBlank(textExclude) ? null : textExclude.split(",");
//满足开始位置和结束位置的全部关键词索引
List<Integer> startArr = new ArrayList<>();
List<Integer> endArr = new ArrayList<>();
for (int i = 0; i < strings.size(); i++) {
String text = strings.get(i);
if (text.contains(textStart)) {
startArr.add(i);
}
if (text.contains(textEnd)) {
endArr.add(i);
}
}
//进行包含和非包含的判断过滤
if(!startArr.isEmpty() && !endArr.isEmpty()){
for (Integer start : startArr) {
for (Integer end : endArr) {
//中间至少隔了一个段落
if(start + 1 < end){
StringJoiner jo = new StringJoiner("\n");
for (int i = start + 1; i < end; i++) {
jo.add(strings.get(i));
}
String word = jo.toString();
boolean match = true;
if(in != null){
for (String s : in) {
if(!word.contains(s)){
match = false;
break;
}
}
}
if(match && out != null){
for (String s : out) {
if(word.contains(s)){
match = false;
break;
}
}
}
if(match){
return new Integer[]{start,end};
}
}
}
}
}
return null;
}
/**
* 拿到文档全部段落文本
*/
public List<String> getAllText(){
List<String> strings = new ArrayList<>();
getAllParagraph().forEach(a-> strings.add(StringTool.safeToString(a.getText(), "")));
return strings;
}
/**
* 拿到文档全部段落
*/
public List<Paragraph> getAllParagraph(){
if(allParagraph == null){
allParagraph = findNodeByType(NodeType.PARAGRAPH, Paragraph.class);
}
return allParagraph;
}
}
使用方式:
word文档内容如下:
要截取标题三和标题四之间的内容,生成新的word,代码如下:
/**
 * Demo: extracts the content between two keywords of a Word file into a new document.
 */
public class Test {

    public static void main(String[] args) throws Exception {
        // License validation (disabled)
        //new AsposeLicense().validate();

        // Keywords delimiting the range to extract
        TextConfig config = new TextConfig();
        config.setTextStart("poi导出大数据量问题、写入速度慢");
        config.setTextEnd("国密验签失败");

        // Load the source Word file
        WordResolve source = new WordResolve(new File("C:\\Users\\zhou\\Desktop\\测试.docx"));

        // Export the range, leaving out both the start and the end paragraph, and print the result
        File extracted = source.findBetweenFile(
                config,
                "C:\\Users\\zhou\\Desktop\\betweenFile.docx",
                false,
                false);
        System.out.println(extracted);
    }
}
截取保存的文件如下: