首页 > 基础资料 博客日记
比对word文档并提取差异片段(java版)
2024-08-16 05:00:04基础资料围观273次
文章比对word文档并提取差异片段(java版)分享给大家,欢迎收藏Java资料网,专注分享技术知识
整体比较
有时候,我们想比对两个word文档,并标记出两个文档之间的差异,这样一眼就能看出修改了哪些地方。如下图所示,左边文档中的“扩招2000人”被删除了,“辞呈”改成了“说明”,还新增了“并且加重处罚”等文字,是不是一目了然?
代码实现:
package cn.hollycloud;
import lombok.AllArgsConstructor;
import lombok.Data;
import org.apache.commons.text.diff.CommandVisitor;
import org.apache.commons.text.diff.EditScript;
import org.apache.commons.text.diff.StringsComparator;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.jupiter.api.Test;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;
public class DocTest {
/**
 * Compares the text of two Word documents character by character and prints
 * both texts with the differences marked up.
 *
 * <p>The first printed line is the left (original) text with deleted characters
 * wrapped in the visitor's delete tags; the second is the right (modified) text
 * with inserted characters wrapped in the insert tags.
 *
 * <p>Failures are no longer swallowed: an unreadable document now fails the test
 * instead of only printing a stack trace and passing.
 */
@Test
public void testCompare() {
    // Read the plain text of both Word documents; each document is closed as soon as it is read.
    String content1 = readDocumentText("D:\\doc\\1.docx");
    String content2 = readDocumentText("D:\\doc\\2.docx");
    // The commons-text API changed considerably between releases; this was written against 1.11.0.
    StringsComparator comparator = new StringsComparator(content1, content2);
    EditScript<Character> script = comparator.getScript();
    SectionDiffCommandVisitor commandVisitor = new SectionDiffCommandVisitor();
    script.visit(commandVisitor);
    // Close any tag still open after the last edit command.
    commandVisitor.finish();
    System.out.println(commandVisitor.getLeftTemp());
    System.out.println(commandVisitor.getRightTemp());
}

/**
 * Extracts the text of the .docx file at {@code path}.
 *
 * @param path file system path of the Word document
 * @return the text returned by {@link XWPFWordExtractor#getText()}
 * @throws java.io.UncheckedIOException if the file cannot be opened, parsed or closed
 */
private static String readDocumentText(String path) {
    // try-with-resources closes the document and then the stream, even when parsing fails.
    try (FileInputStream in = new FileInputStream(path);
         XWPFDocument doc = new XWPFDocument(in)) {
        // The extractor only wraps doc; as in the original code, the document itself is what gets closed.
        XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
        return extractor.getText();
    } catch (java.io.IOException e) {
        throw new java.io.UncheckedIOException("Failed to read Word document: " + path, e);
    }
}
/**
 * Edit-script visitor that rebuilds both compared texts with the changes marked inline.
 *
 * <p>Kept characters are appended to both buffers. Deleted characters are appended
 * only to the left buffer, wrapped in {@code <del>…</del>}; inserted characters are
 * appended only to the right buffer, wrapped in {@code <em>…</em>}. Consecutive
 * commands of the same kind share one pair of tags.
 *
 * <p>Call {@link #finish()} after the script has been visited so that a tag left open
 * by a trailing insert/delete run is closed.
 */
static class SectionDiffCommandVisitor implements CommandVisitor<Character> {
    /** Kind of the most recently visited command; decides when tags open and close. */
    private enum Tag { KEEP, INSERT, DELETE }

    private static final String INSERT_START = "<em>";
    private static final String INSERT_END = "</em>";
    private static final String DELETE_START = "<del>";
    private static final String DELETE_END = "</del>";

    /** Left (original) text with deletions marked. */
    private final StringBuilder leftTemp = new StringBuilder();
    /** Right (modified) text with insertions marked. */
    private final StringBuilder rightTemp = new StringBuilder();
    private Tag lastTag = Tag.KEEP;

    /** Appends a deleted character to the left text, opening a delete tag if none is open. */
    @Override
    public void visitDeleteCommand(Character object) {
        if (lastTag == Tag.INSERT) {
            // Switching from an insert run to a delete run: the insert tag must be closed first.
            rightTemp.append(INSERT_END);
        }
        if (lastTag != Tag.DELETE) {
            leftTemp.append(DELETE_START);
        }
        leftTemp.append(object);
        lastTag = Tag.DELETE;
    }

    /** Appends an inserted character to the right text, opening an insert tag if none is open. */
    @Override
    public void visitInsertCommand(Character object) {
        if (lastTag == Tag.DELETE) {
            // Switching from a delete run to an insert run: the delete tag must be closed first.
            leftTemp.append(DELETE_END);
        }
        if (lastTag != Tag.INSERT) {
            rightTemp.append(INSERT_START);
        }
        rightTemp.append(object);
        lastTag = Tag.INSERT;
    }

    /** Closes any open tag and appends the unchanged character to both texts. */
    @Override
    public void visitKeepCommand(Character object) {
        finish(object);
    }

    /** Closes the tag left open by the last command, if any. Call once after visiting the script. */
    public void finish() {
        finish(null);
    }

    /**
     * Closes the currently open tag and, when {@code object} is non-null, appends it to both texts.
     *
     * @param object the kept character, or {@code null} when only the open tag should be closed
     */
    private void finish(Character object) {
        if (lastTag == Tag.INSERT) {
            rightTemp.append(INSERT_END);
        } else if (lastTag == Tag.DELETE) {
            leftTemp.append(DELETE_END);
        }
        if (object != null) {
            leftTemp.append(object);
            rightTemp.append(object);
        }
        lastTag = Tag.KEEP;
    }

    /** @return the left (original) text with deleted runs wrapped in delete tags */
    public StringBuilder getLeftTemp() {
        return leftTemp;
    }

    /** @return the right (modified) text with inserted runs wrapped in insert tags */
    public StringBuilder getRightTemp() {
        return rightTemp;
    }
}
}
列出变化的片段
如果一个文档很长,像上面这样整体比对,查看效率就比较低了,其实我们只需要看有差异的片段。下面我用《赡养人类》这部小说做尝试:修改了其中部分内容,然后只列出被修改的片段,效果还不错。
代码:
package cn.hollycloud;
import lombok.AllArgsConstructor;
import lombok.Data;
import org.apache.commons.text.diff.CommandVisitor;
import org.apache.commons.text.diff.EditScript;
import org.apache.commons.text.diff.StringsComparator;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.jupiter.api.Test;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
public class DocTest {
/**
 * Compares the text of two Word documents character by character and prints only
 * the sections that contain a difference.
 *
 * <p>For each changed section the left (original) content is printed first, then the
 * right (modified) content, both with the visitor's inline change markers.
 *
 * <p>Failures are no longer swallowed: an unreadable document now fails the test
 * instead of only printing a stack trace and passing.
 */
@Test
public void testCompare() {
    // Read the plain text of both Word documents; each document is closed as soon as it is read.
    String content1 = readDocumentText("D:\\doc\\1.docx");
    String content2 = readDocumentText("D:\\doc\\2.docx");
    // The commons-text API changed considerably between releases; this was written against 1.11.0.
    StringsComparator comparator = new StringsComparator(content1, content2);
    EditScript<Character> script = comparator.getScript();
    SectionDiffCommandVisitor commandVisitor = new SectionDiffCommandVisitor();
    script.visit(commandVisitor);
    // Flush the trailing section, which has no sentence delimiter after it.
    commandVisitor.finish();
    List<DiffSection> diffSections = commandVisitor.getDiffSections();
    int index = 1;
    for (DiffSection section : diffSections) {
        System.out.println("差分内容#" + index + ": ");
        System.out.println(section.getLeftContent());
        System.out.println(section.getRightContent());
        index++;
    }
}

/**
 * Extracts the text of the .docx file at {@code path}.
 *
 * @param path file system path of the Word document
 * @return the text returned by {@link XWPFWordExtractor#getText()}
 * @throws java.io.UncheckedIOException if the file cannot be opened, parsed or closed
 */
private static String readDocumentText(String path) {
    // try-with-resources closes the document and then the stream, even when parsing fails.
    try (FileInputStream in = new FileInputStream(path);
         XWPFDocument doc = new XWPFDocument(in)) {
        // The extractor only wraps doc; as in the original code, the document itself is what gets closed.
        XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
        return extractor.getText();
    } catch (java.io.IOException e) {
        throw new java.io.UncheckedIOException("Failed to read Word document: " + path, e);
    }
}
/**
 * One changed section of the comparison: the same stretch of text as it appears
 * on each side, with the visitor's inline change markers already embedded.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public static class DiffSection {
// Section text from the left (first) document.
private String leftContent;
// Section text from the right (second) document.
private String rightContent;
}
/**
 * Edit-script visitor that splits the compared texts into sections and collects only
 * the sections that contain a change.
 *
 * <p>Within a section, kept characters go to both buffers, deleted characters go to
 * the left buffer wrapped in {@code <del>…</del>}, and inserted characters go to the
 * right buffer wrapped in {@code <em>…</em>}. A section ends when a <em>kept</em>
 * character matches the section-delimiter pattern, or when {@link #finish()} is called.
 * A finished section is recorded only if it saw at least one insert/delete and at least
 * one side still has content after tags, punctuation and whitespace are removed.
 *
 * <p>Call {@link #finish()} after the script has been visited to flush the last section.
 */
static class SectionDiffCommandVisitor implements CommandVisitor<Character> {
    /** Kind of the most recently visited command; decides when tags open and close. */
    private enum Tag { KEEP, INSERT, DELETE }

    private static final String INSERT_START = "<em>";
    private static final String INSERT_END = "</em>";
    private static final String DELETE_START = "<del>";
    private static final String DELETE_END = "</del>";

    /**
     * Single-character section delimiters. NOTE(review): inside a character class the
     * {@code ?} is a literal, so this matches '。', CR, LF, '.' and also '?'. The
     * pattern is kept unchanged; confirm whether splitting on '?' is intended.
     */
    private static final Pattern SECTION_END = Pattern.compile("[。\\r?\\n\\.]");
    /** Runs of punctuation and whitespace, ignored when deciding whether a section has content. */
    private static final Pattern PUNCT_OR_SPACE = Pattern.compile("[\\p{P}\\s]+");

    private final List<DiffSection> diffSections = new ArrayList<>();
    /** Left (original) text of the section currently being built. */
    private final StringBuilder leftTemp = new StringBuilder();
    /** Right (modified) text of the section currently being built. */
    private final StringBuilder rightTemp = new StringBuilder();
    private Tag lastTag = Tag.KEEP;
    /** Whether the current section has seen an insert or delete command. */
    private boolean hasChange = false;

    /** Appends a deleted character to the left text, opening a delete tag if none is open. */
    @Override
    public void visitDeleteCommand(Character object) {
        if (lastTag == Tag.INSERT) {
            // Switching from an insert run to a delete run: the insert tag must be closed first.
            rightTemp.append(INSERT_END);
        }
        if (lastTag != Tag.DELETE) {
            leftTemp.append(DELETE_START);
        }
        leftTemp.append(object);
        lastTag = Tag.DELETE;
        hasChange = true;
    }

    /** Appends an inserted character to the right text, opening an insert tag if none is open. */
    @Override
    public void visitInsertCommand(Character object) {
        if (lastTag == Tag.DELETE) {
            // Switching from a delete run to an insert run: the delete tag must be closed first.
            leftTemp.append(DELETE_END);
        }
        if (lastTag != Tag.INSERT) {
            rightTemp.append(INSERT_START);
        }
        rightTemp.append(object);
        lastTag = Tag.INSERT;
        hasChange = true;
    }

    /** Appends an unchanged character to both texts; ends the section if it is a delimiter. */
    @Override
    public void visitKeepCommand(Character object) {
        finish(object);
    }

    /** Closes any open tag and flushes the last section. Call once after visiting the script. */
    public void finish() {
        finish(null);
    }

    /**
     * Closes the open tag, appends the kept character (if any) and ends the section when
     * the character is a delimiter or when there is no character at all.
     *
     * @param object the kept character, or {@code null} to force the end of the section
     */
    private void finish(Character object) {
        if (lastTag == Tag.INSERT) {
            rightTemp.append(INSERT_END);
        } else if (lastTag == Tag.DELETE) {
            leftTemp.append(DELETE_END);
        }
        if (object != null) {
            leftTemp.append(object);
            rightTemp.append(object);
        }
        // End of a section reached.
        if (object == null || SECTION_END.matcher(object.toString()).find()) {
            flushSection();
        }
        lastTag = Tag.KEEP;
    }

    /** Records the current section if it contains a meaningful change, then resets the buffers. */
    private void flushSection() {
        if (hasChange) {
            String left = leftTemp.toString();
            String right = rightTemp.toString();
            if (isLegalStr(left) || isLegalStr(right)) {
                DiffSection diffSection = new DiffSection();
                diffSection.setLeftContent(left);
                diffSection.setRightContent(right);
                diffSections.add(diffSection);
            }
        }
        leftTemp.setLength(0);
        rightTemp.setLength(0);
        hasChange = false;
    }

    /**
     * Tells whether {@code str} still has content once markup, punctuation and whitespace
     * are removed.
     *
     * <p>Uses plain null/empty checks: the listing called {@code StringUtils.isEmpty}
     * without importing any {@code StringUtils}. Tags are removed before punctuation so
     * that the result does not rely on {@code </em>} minus '/' happening to equal
     * {@code <em>}.
     *
     * @param str section text, possibly containing change tags
     * @return {@code true} if anything other than tags, punctuation and whitespace remains
     */
    private boolean isLegalStr(String str) {
        if (str == null || str.isEmpty()) {
            return false;
        }
        String withoutTags = str
                .replace(INSERT_START, "").replace(INSERT_END, "")
                .replace(DELETE_START, "").replace(DELETE_END, "");
        String cleaned = PUNCT_OR_SPACE.matcher(withoutTags).replaceAll("");
        return !cleaned.isEmpty();
    }

    /** @return the changed sections collected so far, in document order */
    public List<DiffSection> getDiffSections() {
        return diffSections;
    }
}
}
文章来源:https://blog.csdn.net/ting4937/article/details/137078577
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若内容造成侵权/违法违规/事实不符,请联系邮箱:jacktools123@163.com进行投诉反馈,一经查实,立即删除!
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若内容造成侵权/违法违规/事实不符,请联系邮箱:jacktools123@163.com进行投诉反馈,一经查实,立即删除!
标签: