首页 > 基础资料 博客日记

比对word文档并提取差异片段(java版)

2024-08-16 05:00:04基础资料围观159

本文《比对Word文档并提取差异片段(java版)》分享给大家,欢迎收藏Java资料网,专注分享技术知识。

整体比较

有时候,我们想比对两个Word文档,标记出两个文档之间的差异,这样一眼就能看出修改了哪些地方。如下图,左边文档中的“扩招2000人”被删除了,“辞呈”改成了“说明”,并新增了“并且加重处罚”等文字,是不是一目了然?

代码实现:

package cn.hollycloud;

import lombok.AllArgsConstructor;
import lombok.Data;
import org.apache.commons.text.diff.CommandVisitor;
import org.apache.commons.text.diff.EditScript;
import org.apache.commons.text.diff.StringsComparator;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.jupiter.api.Test;

import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;

/**
 * Demo test that diffs the plain text of two Word documents character by character and prints
 * both texts with the differences marked inline.
 */
public class DocTest {
    /**
     * Extracts the text of both sample documents, computes the character-level edit script and
     * prints the annotated left (first document) and right (second document) texts.
     *
     * @throws Exception if either document cannot be opened or parsed; the exception is
     *                   propagated so that the test fails instead of passing silently
     */
    @Test
    public void testCompare() throws Exception {
        // Read the text content of both Word documents.
        String content1 = readText("D:\\doc\\1.docx");
        String content2 = readText("D:\\doc\\2.docx");
        // The commons-text API changed considerably between releases; this uses 1.11.0.
        StringsComparator comparator = new StringsComparator(content1, content2);
        EditScript<Character> script = comparator.getScript();
        SectionDiffCommandVisitor commandVisitor = new SectionDiffCommandVisitor();
        script.visit(commandVisitor);
        // Close a change run that is still open when the edit script ends.
        commandVisitor.finish();
        System.out.println(commandVisitor.getLeftTemp());
        System.out.println(commandVisitor.getRightTemp());
    }

    /**
     * Returns the text of the .docx file at {@code path} as produced by
     * {@link XWPFWordExtractor#getText()}.
     *
     * <p>Both the file stream and the document are closed on every path, including failures.
     *
     * @param path file system path of the document
     * @return the extracted text
     * @throws Exception if the file cannot be opened or parsed
     */
    private static String readText(String path) throws Exception {
        try (FileInputStream in = new FileInputStream(path);
             XWPFDocument doc = new XWPFDocument(in)) {
            return new XWPFWordExtractor(doc).getText();
        }
    }

    /**
     * Edit-script visitor that rebuilds both texts and wraps changed runs in markup: deleted
     * characters appear in the left text inside {@code <del>…</del>}, inserted characters appear
     * in the right text inside {@code <em>…</em>}.
     */
    static class SectionDiffCommandVisitor implements CommandVisitor<Character> {
        // Kind of the most recently visited command.
        private static final int KEEP = 0;
        private static final int INSERT = 1;
        private static final int DELETE = 2;

        private static final String INSERT_START = "<em>";
        private static final String INSERT_END = "</em>";
        private static final String DELETE_START = "<del>";
        private static final String DELETE_END = "</del>";

        private final StringBuilder leftTemp = new StringBuilder();
        private final StringBuilder rightTemp = new StringBuilder();
        private int lastTag = KEEP;

        /**
         * Appends a deleted character to the left text, opening a {@code <del>} run when the
         * previous command was not a delete and closing a pending {@code <em>} run on the right.
         */
        @Override
        public void visitDeleteCommand(Character object) {
            if (lastTag == INSERT) {
                rightTemp.append(INSERT_END);
            }
            if (lastTag != DELETE) {
                leftTemp.append(DELETE_START);
            }
            leftTemp.append(object);
            lastTag = DELETE;
        }

        /**
         * Appends an inserted character to the right text, opening an {@code <em>} run when the
         * previous command was not an insert and closing a pending {@code <del>} run on the left.
         */
        @Override
        public void visitInsertCommand(Character object) {
            if (lastTag == DELETE) {
                leftTemp.append(DELETE_END);
            }
            if (lastTag != INSERT) {
                rightTemp.append(INSERT_START);
            }
            rightTemp.append(object);
            lastTag = INSERT;
        }

        /** Closes any open change run and appends the unchanged character to both texts. */
        @Override
        public void visitKeepCommand(Character object) {
            closeOpenRun();
            if (object != null) {
                leftTemp.append(object);
                rightTemp.append(object);
            }
        }

        /**
         * Closes a change run that is still open. Must be called once after the edit script has
         * been visited, otherwise a trailing insert/delete run is left without its end tag.
         */
        public void finish() {
            closeOpenRun();
        }

        /** Writes the end tag of the currently open run, if any, and resets the state to keep. */
        private void closeOpenRun() {
            if (lastTag == INSERT) {
                rightTemp.append(INSERT_END);
            } else if (lastTag == DELETE) {
                leftTemp.append(DELETE_END);
            }
            lastTag = KEEP;
        }

        /** Returns the annotated text of the first (left) document. */
        public StringBuilder getLeftTemp() {
            return leftTemp;
        }

        /** Returns the annotated text of the second (right) document. */
        public StringBuilder getRightTemp() {
            return rightTemp;
        }
    }
}

列出变化的片段

如果一个文档很长,整体比对后再逐处查找差异,效率就比较低了,我们只需要看有差异的片段。下面我用《赡养人类》这部小说做尝试:修改了其中部分内容,然后列出被修改的片段,效果还不错。

代码:

package cn.hollycloud;

import lombok.AllArgsConstructor;
import lombok.Data;
import org.apache.commons.text.diff.CommandVisitor;
import org.apache.commons.text.diff.EditScript;
import org.apache.commons.text.diff.StringsComparator;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.jupiter.api.Test;

import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Demo test that diffs the plain text of two Word documents character by character and prints
 * only the sentences that contain a change.
 */
public class DocTest {
    /**
     * Extracts the text of both sample documents, computes the character-level edit script and
     * prints each changed section as a pair of annotated left/right texts.
     *
     * @throws Exception if either document cannot be opened or parsed; the exception is
     *                   propagated so that the test fails instead of passing silently
     */
    @Test
    public void testCompare() throws Exception {
        // Read the text content of both Word documents.
        String content1 = readText("D:\\doc\\1.docx");
        String content2 = readText("D:\\doc\\2.docx");
        // The commons-text API changed considerably between releases; this uses 1.11.0.
        StringsComparator comparator = new StringsComparator(content1, content2);
        EditScript<Character> script = comparator.getScript();
        SectionDiffCommandVisitor commandVisitor = new SectionDiffCommandVisitor();
        script.visit(commandVisitor);
        // Flush the last section, which is not necessarily terminated by a sentence delimiter.
        commandVisitor.finish();
        List<DiffSection> diffSections = commandVisitor.getDiffSections();
        for (int i = 0; i < diffSections.size(); i++) {
            DiffSection section = diffSections.get(i);
            System.out.println("差分内容#" + (i + 1) + ": ");
            System.out.println(section.getLeftContent());
            System.out.println(section.getRightContent());
        }
    }

    /**
     * Returns the text of the .docx file at {@code path} as produced by
     * {@link XWPFWordExtractor#getText()}.
     *
     * <p>Both the file stream and the document are closed on every path, including failures.
     *
     * @param path file system path of the document
     * @return the extracted text
     * @throws Exception if the file cannot be opened or parsed
     */
    private static String readText(String path) throws Exception {
        try (FileInputStream in = new FileInputStream(path);
             XWPFDocument doc = new XWPFDocument(in)) {
            return new XWPFWordExtractor(doc).getText();
        }
    }

    /** One changed section: the annotated text of the same sentence in both documents. */
    @Data
    public static class DiffSection {
        private String leftContent;
        private String rightContent;
    }

    /**
     * Edit-script visitor that rebuilds both texts sentence by sentence and records only the
     * sentences that contain an insert or delete. Deleted characters are wrapped in
     * {@code <del>…</del>} in the left text, inserted characters in {@code <em>…</em>} in the
     * right text.
     */
    static class SectionDiffCommandVisitor implements CommandVisitor<Character> {
        // Kind of the most recently visited command.
        private static final int KEEP = 0;
        private static final int INSERT = 1;
        private static final int DELETE = 2;

        private static final String INSERT_START = "<em>";
        private static final String INSERT_END = "</em>";
        private static final String DELETE_START = "<del>";
        private static final String DELETE_END = "</del>";

        /**
         * Matches a character that ends a section. Inside a character class {@code ?} is a
         * literal, so besides the full stops and line breaks an ASCII question mark also ends
         * a section.
         */
        private static final Pattern SENTENCE_END = Pattern.compile("[。\\r?\\n\\.]");

        private final List<DiffSection> diffSections = new ArrayList<>();
        private final StringBuilder leftTemp = new StringBuilder();
        private final StringBuilder rightTemp = new StringBuilder();
        private int lastTag = KEEP;
        private boolean hasChange = false;

        /**
         * Appends a deleted character to the left text, opening a {@code <del>} run when the
         * previous command was not a delete and closing a pending {@code <em>} run on the right.
         */
        @Override
        public void visitDeleteCommand(Character object) {
            if (lastTag == INSERT) {
                rightTemp.append(INSERT_END);
            }
            if (lastTag != DELETE) {
                leftTemp.append(DELETE_START);
            }
            leftTemp.append(object);
            lastTag = DELETE;
            hasChange = true;
        }

        /**
         * Appends an inserted character to the right text, opening an {@code <em>} run when the
         * previous command was not an insert and closing a pending {@code <del>} run on the left.
         */
        @Override
        public void visitInsertCommand(Character object) {
            if (lastTag == DELETE) {
                leftTemp.append(DELETE_END);
            }
            if (lastTag != INSERT) {
                rightTemp.append(INSERT_START);
            }
            rightTemp.append(object);
            lastTag = INSERT;
            hasChange = true;
        }

        /**
         * Closes any open change run, appends the unchanged character to both texts and, when
         * the character ends a section, flushes the current section.
         */
        @Override
        public void visitKeepCommand(Character object) {
            closeOpenRun();
            if (object == null) {
                // Same handling as the end of input: nothing to append, just flush.
                flushSection();
                return;
            }
            leftTemp.append(object);
            rightTemp.append(object);
            if (SENTENCE_END.matcher(object.toString()).find()) {
                flushSection();
            }
        }

        /**
         * Closes a change run that is still open and flushes the last section. Must be called
         * once after the edit script has been visited.
         */
        public void finish() {
            closeOpenRun();
            flushSection();
        }

        /** Writes the end tag of the currently open run, if any, and resets the state to keep. */
        private void closeOpenRun() {
            if (lastTag == INSERT) {
                rightTemp.append(INSERT_END);
            } else if (lastTag == DELETE) {
                leftTemp.append(DELETE_END);
            }
            lastTag = KEEP;
        }

        /**
         * Records the buffered section if it contains a change and at least one side has
         * meaningful content, then clears the buffers for the next section.
         */
        private void flushSection() {
            if (hasChange) {
                String left = leftTemp.toString();
                String right = rightTemp.toString();
                if (isLegalStr(left) || isLegalStr(right)) {
                    DiffSection diffSection = new DiffSection();
                    diffSection.setLeftContent(left);
                    diffSection.setRightContent(right);
                    diffSections.add(diffSection);
                }
            }
            leftTemp.setLength(0);
            rightTemp.setLength(0);
            hasChange = false;
        }

        /**
         * Tells whether {@code str} still contains something after the diff tags, punctuation
         * and whitespace are removed.
         *
         * @param str annotated section text, may be {@code null}
         * @return {@code true} if any other character remains
         */
        private boolean isLegalStr(String str) {
            if (str == null || str.isEmpty()) {
                return false;
            }
            // Strip the tags first: '/' is punctuation, so removing punctuation first would
            // mangle the end tags before they can be matched.
            String cleaned = str
                    .replace(INSERT_START, "").replace(INSERT_END, "")
                    .replace(DELETE_START, "").replace(DELETE_END, "")
                    .replaceAll("[\\p{P}\\s]+", "");
            return !cleaned.isEmpty();
        }

        /** Returns the changed sections collected so far, in document order. */
        public List<DiffSection> getDiffSections() {
            return diffSections;
        }
    }
}


文章来源:https://blog.csdn.net/ting4937/article/details/137078577
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若内容造成侵权/违法违规/事实不符,请联系邮箱:jacktools123@163.com进行投诉反馈,一经查实,立即删除!

标签:

相关文章

本站推荐

标签云