提取段落文本时忽略脚注内容；“忽略招标文件内容”改为按 Aspose 段落粒度比较

2026-06-10 14:56:59 +08:00
parent 24cbff1625
commit 51184df30d
3 changed files with 121 additions and 50 deletions
--- a/ai_check/src/main/java/org/dromara/aiCheck/docExamine/DocExamine.java
+++ b/ai_check/src/main/java/org/dromara/aiCheck/docExamine/DocExamine.java
@@ -18,8 +18,8 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.security.CodeSource;
-import java.util.*;
 import java.util.List;
+import java.util.*;
 import java.util.function.Consumer;


@@ -1338,13 +1338,30 @@ public class DocExamine implements AutoCloseable {
        return false;
    }

+    /**
+     * Reports whether the paragraph itself or any node above it in the tree is a footnote node.
+     * @param paragraph paragraph to test; {@code null} is reported as not inside a footnote
+     * @return {@code true} when a node of type {@code NodeType.FOOTNOTE} is found on the parent chain
+     */
+    private boolean isInsideFootnote(Paragraph paragraph) {
+        for (Node node = paragraph; node != null; node = node.getParentNode()) {
+            if (node.getNodeType() == NodeType.FOOTNOTE) {
+                return true;
+            }
+        }
+        return false;
+    }
+
    private String extractParagraphPlainText(Paragraph paragraph) {
        if (paragraph == null) {
            return "";
        }
+        if (isInsideFootnote(paragraph)) {
+            return "";
+        }
        try {
            Paragraph sanitizedParagraph = (Paragraph) paragraph.deepClone(true);
-            removeShapeNodes(sanitizedParagraph);
+            removeIgnoredTextNodes(sanitizedParagraph);
            return sanitizedParagraph.toString(SaveFormat.TEXT).trim();
        } catch (Exception ignored) {
            try {
@@ -1355,19 +1372,19 @@ public class DocExamine implements AutoCloseable {
        }
    }

-    private void removeShapeNodes(Paragraph paragraph) {
+    private void removeIgnoredTextNodes(Paragraph paragraph) { // removes shape, group-shape and footnote children in place
        if (paragraph == null) {
            return;
        }
-        List<Node> shapeNodes = new ArrayList<>();
+        List<Node> ignoredNodes = new ArrayList<>(); // matches are gathered here first and removed after the walk
        for (Node child = paragraph.getFirstChild(); child != null; child = child.getNextSibling()) {
            int nodeType = child.getNodeType();
-            if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE) {
-                shapeNodes.add(child);
+            if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE || nodeType == NodeType.FOOTNOTE) { // only direct children are inspected
+                ignoredNodes.add(child);
            }
        }
-        for (Node shapeNode : shapeNodes) {
-            shapeNode.remove();
+        for (Node ignoredNode : ignoredNodes) { // second pass: detach every collected node from the paragraph
+            ignoredNode.remove();
        }
    }

--- a/ai_check/src/main/java/org/dromara/aiCheck/service/BiddingContent.java
+++ b/ai_check/src/main/java/org/dromara/aiCheck/service/BiddingContent.java
@@ -284,13 +284,30 @@ public class BiddingContent {
        return false;
    }

+    /**
+     * Reports whether the paragraph itself or any node above it in the tree is a footnote node.
+     * @param paragraph paragraph to test; {@code null} is reported as not inside a footnote
+     * @return {@code true} when a node of type {@code NodeType.FOOTNOTE} is found on the parent chain
+     */
+    private boolean isInsideFootnote(Paragraph paragraph) {
+        for (Node node = paragraph; node != null; node = node.getParentNode()) {
+            if (node.getNodeType() == NodeType.FOOTNOTE) {
+                return true;
+            }
+        }
+        return false;
+    }
+
    private String extractParagraphPlainText(Paragraph paragraph) {
        if (paragraph == null) {
            return "";
        }
+        if (isInsideFootnote(paragraph)) {
+            return "";
+        }
        try {
            Paragraph sanitizedParagraph = (Paragraph) paragraph.deepClone(true);
-            removeShapeNodes(sanitizedParagraph);
+            removeIgnoredTextNodes(sanitizedParagraph);
            return sanitizedParagraph.toString(SaveFormat.TEXT).trim();
        } catch (Exception ignored) {
            try {
@@ -301,19 +318,19 @@ public class BiddingContent {
        }
    }

-    private void removeShapeNodes(Paragraph paragraph) {
+    private void removeIgnoredTextNodes(Paragraph paragraph) { // removes shape, group-shape and footnote children in place
        if (paragraph == null) {
            return;
        }
-        List<Node> shapeNodes = new ArrayList<>();
+        List<Node> ignoredNodes = new ArrayList<>(); // matches are gathered here first and removed after the walk
        for (Node child = paragraph.getFirstChild(); child != null; child = child.getNextSibling()) {
            int nodeType = child.getNodeType();
-            if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE) {
-                shapeNodes.add(child);
+            if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE || nodeType == NodeType.FOOTNOTE) { // only direct children are inspected
+                ignoredNodes.add(child);
            }
        }
-        for (Node shapeNode : shapeNodes) {
-            shapeNode.remove();
+        for (Node ignoredNode : ignoredNodes) { // second pass: detach every collected node from the paragraph
+            ignoredNode.remove();
        }
    }

--- a/ai_check/src/main/java/org/dromara/aiCheck/service/SmartFilterProcessor.java
+++ b/ai_check/src/main/java/org/dromara/aiCheck/service/SmartFilterProcessor.java
@@ -6,7 +6,6 @@ import org.dromara.aiCheck.Utils.MapTypeConverter;
 import org.dromara.chat.service.impl.LLMService;
 import org.dromara.common.core.utils.StringUtils;
 import org.dromara.common.core.utils.file.AsposeTempFileUtils;
-import org.dromara.common.core.utils.file.FileParseUtil;
 import org.dromara.review.domain.bo.SmartFilterConfig;
 import org.dromara.review.service.IAiPromptService;
 import org.dromara.system.domain.vo.SysOssVo;
@@ -60,21 +59,21 @@ public class SmartFilterProcessor {
            return filteredParagraphsWithOssId;
        }

-        // 预加载并解析招标文件内容，只执行一次
-        String biddingFileContent = null;
+        // 预加载并解析招标文件段落，只执行一次，段落粒度需与投标文件 Aspose 解析保持一致。
+        Set<String> biddingFileParagraphs = Collections.emptySet();
        Set<String> tempLocalFiles = new HashSet<>();

        // 缓存技术标准所在段落ID，key为ossId，value为doc_g段落ID集合
        Map<Long, Set<String>> fileTechnicalStandardParagraphIdCache = new HashMap<>();

        try {
-            // 检查是否需要忽略与投标文件相同的内容
+            // 检查是否需要忽略与招标文件相同的内容
            if (Boolean.TRUE.equals(config.getIgnoreBiddingFileContent()) && (config.getBiddingDocuId()!=null && config.getBiddingDocuId()>0)) {
-                // 获取招标文件的内容，只下载一次
+                // 获取招标文件的 Aspose 段落，只下载一次
                String biddingFileLocalPath = downloadToUniqueLocalPath(config.getBiddingDocuId(), "smartfilter_bid_");
                tempLocalFiles.add(biddingFileLocalPath);
                File file = new File(biddingFileLocalPath);
-                biddingFileContent = FileParseUtil.parseFile(file);
+                biddingFileParagraphs = extractComparableParagraphTextSet(file);
            }
            if (Boolean.TRUE.equals(config.getIgnoreTechnicalStandards())) {
                for (Map<String, Object> stringObjectMap : paragraphsWithOssId) {
@@ -106,9 +105,10 @@ public class SmartFilterProcessor {
                }

                String filteredText = paragraph;
-                // 忽略与投标文件相同的内容，使用预加载的投标文件内容
-                if (StringUtils.isNotBlank(biddingFileContent)) {
-                    filteredText = removeBiddingFileContent(paragraph, biddingFileContent);
+                // 忽略与招标文件相同的内容，使用与投标文件一致的 Aspose 段落粒度判断。
+                if (CollectionUtils.isNotEmpty(biddingFileParagraphs)) {
+                    String originalParagraph = (String) paraWithOssId.getOrDefault("originalText", paragraph);
+                    filteredText = removeBiddingFileContent(paragraph, originalParagraph, biddingFileParagraphs);
                }

                //  忽略标点符号和短文本
@@ -178,33 +178,17 @@ public class SmartFilterProcessor {
    }

    /**
-     * 移除与招标文件相同的内容
+     * Removes content that is identical to the tender (bidding) document.
+     *
+     * The bid-file paragraph IDs used for duplicate checking come from the original Aspose paragraph index, while text may already have been split into sentences or long-paragraph fragments.
+     * originalText is therefore compared against the tender document's Aspose paragraphs first, to avoid wrong filtering or inconsistent ID positioning caused by naive line-break splitting.
     */
-    private String removeBiddingFileContent(String content, String biddingContent) {
-        if (StringUtils.isBlank(biddingContent)) {
+    private String removeBiddingFileContent(String content, String originalContent, Set<String> biddingParagraphs) {
+        if (CollectionUtils.isEmpty(biddingParagraphs)) { // nothing to compare against: keep the text unchanged
            return content;
        }
-
-        List<String> biddingParagraphs = splitToParagraphs(biddingContent);
-
-        // 遍历内容段落，过滤掉与招标文件重复的段落
-        boolean isDuplicate = false;
-
-        // 检查当前段落是否与招标文件中的任何段落重复
-        for (String biddingParagraph : biddingParagraphs) {
-            String cleanBiddingPara = biddingParagraph.trim();
-
-            if (content.equals(cleanBiddingPara)) {
-                isDuplicate = true;
-                break;
-            }
-        }
-
-        // 如果不是重复段落，则保留
-        if (!isDuplicate) {
-            return content;
-        }
-        return "";
+        String compareText = StringUtils.isNotBlank(originalContent) ? originalContent : content; // prefer the original paragraph; fall back to content when it is blank
+        return biddingParagraphs.contains(normalizeDuplicateParagraphText(compareText)) ? "" : content; // empty string when the normalized text is in the tender set
    }
    // 使用正则表达式更精确地匹配页码格式
    private boolean containsPageNumber(String text) {
@@ -291,6 +275,38 @@ public class SmartFilterProcessor {
            .collect(Collectors.toList());
    }

+    /** Loads {@code file} with Aspose and returns the normalized, non-blank texts of the paragraphs from collectComparableParagraphs. */
+    private Set<String> extractComparableParagraphTextSet(File file) throws Exception {
+        try {
+            LoadOptions asposeOptions = new LoadOptions();
+            asposeOptions.setTempFolder(ASPOSE_TEMP_DIR.toString());
+            Set<String> comparableTexts = new HashSet<>();
+            for (Paragraph candidate : collectComparableParagraphs(new Document(file.getPath(), asposeOptions))) {
+                String comparable = normalizeDuplicateParagraphText(extractParagraphPlainText(candidate));
+                if (StringUtils.isNotBlank(comparable)) {
+                    comparableTexts.add(comparable);
+                }
+            }
+            return comparableTexts;
+        } finally {
+            AsposeTempFileUtils.cleanupStaleAsposeTempFiles(SmartFilterProcessor.class, ASPOSE_TEMP_DIR);
+        }
+    }
+
+    /** Drops all whitespace and space characters, including U+00A0/U+2007/U+202F which isWhitespace() skips, then trims the ends. */
+    private String normalizeDuplicateParagraphText(String text) {
+        if (StringUtils.isBlank(text)) {
+            return "";
+        }
+        StringBuilder builder = new StringBuilder(text.length());
+        for (char current : text.toCharArray()) {
+            if (!Character.isWhitespace(current) && !Character.isSpaceChar(current)) {
+                builder.append(current);
+            }
+        }
+        return builder.toString().trim();
+    }
+

    /**
     * 从文件中提取技术标准标题及其内容的段落ID
@@ -388,10 +404,27 @@ public class SmartFilterProcessor {
        return false;
    }

+    /**
+     * Reports whether the paragraph itself or any node above it in the tree is a footnote node.
+     * @param paragraph paragraph to test; {@code null} is reported as not inside a footnote
+     * @return {@code true} when a node of type {@code NodeType.FOOTNOTE} is found on the parent chain
+     */
+    private static boolean isInsideFootnote(Paragraph paragraph) {
+        for (Node node = paragraph; node != null; node = node.getParentNode()) {
+            if (node.getNodeType() == NodeType.FOOTNOTE) {
+                return true;
+            }
+        }
+        return false;
+    }
+
    private static String extractParagraphPlainText(Paragraph paragraph) {
+        if (isInsideFootnote(paragraph)) {
+            return "";
+        }
        try {
            Paragraph sanitizedParagraph = (Paragraph) paragraph.deepClone(true);
-            removeShapeNodes(sanitizedParagraph);
+            removeIgnoredTextNodes(sanitizedParagraph);
            return sanitizedParagraph.toString(SaveFormat.TEXT).trim();
        } catch (Exception ignored) {
            try {
@@ -402,7 +435,7 @@ public class SmartFilterProcessor {
        }
    }

-    private static void removeShapeNodes(Paragraph paragraph) throws Exception {
+    private static void removeIgnoredTextNodes(Paragraph paragraph) throws Exception {
        NodeCollection shapes = paragraph.getChildNodes(NodeType.SHAPE, true);
        for (int i = shapes.getCount() - 1; i >= 0; i--) {
            shapes.get(i).remove();
@@ -411,6 +444,10 @@ public class SmartFilterProcessor {
        for (int i = groupShapes.getCount() - 1; i >= 0; i--) {
            groupShapes.get(i).remove();
        }
+        NodeCollection footnotes = paragraph.getChildNodes(NodeType.FOOTNOTE, true);
+        for (int i = footnotes.getCount() - 1; i >= 0; i--) {
+            footnotes.get(i).remove();
+        }
    }

    /**