From bc8f2acbc30fa3641e9f0fcf9c638a0d798710d7 Mon Sep 17 00:00:00 2001
From: cjh <949661474@qq.com>
Date: Tue, 2 Jun 2026 17:55:00 +0800
Subject: [PATCH] =?UTF-8?q?1.=E6=96=87=E6=9C=AC=E7=9B=B8=E4=BC=BC=E5=BA=A6?=
 =?UTF-8?q?=E5=8A=A0=E5=BF=AB=E5=AE=A1=E6=9F=A5=E9=80=9F=E5=BA=A62.?=
 =?UTF-8?q?=E5=BF=BD=E7=95=A5=E6=8A=80=E6=9C=AF=E6=A0=87=E5=87=86=E4=BF=AE?=
 =?UTF-8?q?=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../aiCheck/service/BiddingContent.java       |  41 ++--
 .../aiCheck/service/SmartFilterProcessor.java | 173 ++++++++------
 .../BiddingAnalysisResultServiceImpl.java     | 216 ++++++++++++------
 3 files changed, 271 insertions(+), 159 deletions(-)
diff --git a/ai_check/src/main/java/org/dromara/aiCheck/service/BiddingContent.java b/ai_check/src/main/java/org/dromara/aiCheck/service/BiddingContent.java
index 1462f75..01f4913 100644
--- a/ai_check/src/main/java/org/dromara/aiCheck/service/BiddingContent.java
+++ b/ai_check/src/main/java/org/dromara/aiCheck/service/BiddingContent.java
@@ -5,15 +5,14 @@ import ai.z.openapi.service.ocr.HandwritingOcrResponse;
 import ai.z.openapi.service.ocr.HandwritingOcrUploadReq;
 import com.alibaba.fastjson.JSONArray;
 import com.alibaba.fastjson.JSONObject;
-import com.aspose.words.*;
 import com.aspose.words.Document;
 import com.aspose.words.Font;
 import com.aspose.words.ParagraphAlignment;
+import com.aspose.words.*;
 import com.spire.doc.fields.ShapeGroup;
 import com.spire.doc.fields.ShapeObject;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
-import okhttp3.*;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.rendering.PDFRenderer;
 import org.apache.poi.hwpf.HWPFDocument;
@@ -24,9 +23,17 @@ import org.dromara.common.core.utils.StringUtils;
 import org.dromara.common.core.utils.file.AsposeTempFileUtils;
 import org.dromara.common.core.utils.file.FileParseUtil;
 import org.dromara.review.domain.bo.SmartFilterConfig;
+import org.opencv.core.Core;
+import org.opencv.core.Mat;
+import org.opencv.imgcodecs.Imgcodecs;
+import org.opencv.imgproc.Imgproc;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;
 import org.springframework.stereotype.Service;
 
+import javax.imageio.ImageIO;
+import javax.imageio.ImageReadParam;
+import javax.imageio.ImageReader;
+import javax.imageio.stream.ImageInputStream;
 import java.awt.*;
 import java.awt.image.BufferedImage;
 import java.io.File;
@@ -36,24 +43,14 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
-import java.util.*;
+import java.security.MessageDigest;
 import java.util.List;
+import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
-import org.opencv.core.Core;
-import org.opencv.core.Mat;
-import org.opencv.imgcodecs.Imgcodecs;
-import org.opencv.imgproc.Imgproc;
-
-import javax.imageio.ImageIO;
-import javax.imageio.ImageReadParam;
-import javax.imageio.ImageReader;
-import javax.imageio.stream.ImageInputStream;
-
 import static cn.dev33.satoken.util.SaHexUtil.bytesToHex;
-import java.security.MessageDigest;
 
 @Slf4j
 @RequiredArgsConstructor
@@ -70,6 +67,7 @@ public class BiddingContent {
     private static final double OCR_SOLID_IMAGE_THRESHOLD = 0.995D;
     private static final int OCR_SAMPLE_GRID = 32;
     private static final int OCR_CACHE_MAX_SIZE = 2000;
+    private static final int TEXT_SIMILARITY_MAX_PARAGRAPH_LENGTH = 800;
     private static final Map<String, List<String>> OCR_RESULT_CACHE =
         Collections.synchronizedMap(new LinkedHashMap<>(256, 0.75F, true) {
             @Override
@@ -135,6 +133,7 @@ public class BiddingContent {
                         } else {
                             processedParagraphs = splitTextToSentences(originalParagraph);
                         }
+                        processedParagraphs = splitOversizedParagraphs(processedParagraphs);
 
                         for (int subIndex = 0; subIndex < processedParagraphs.size(); subIndex++) {
                             String processedText = processedParagraphs.get(subIndex);
@@ -543,6 +542,18 @@ public class BiddingContent {
         return sentences;
     }
 
+    /** Re-splits paragraphs longer than the similarity cap; null and shorter entries are kept as they are. */
+    private List<String> splitOversizedParagraphs(List<String> paragraphs) {
+        List<String> pieces = new ArrayList<>();
+        for (String candidate : paragraphs) {
+            boolean oversized = candidate != null && candidate.length() > TEXT_SIMILARITY_MAX_PARAGRAPH_LENGTH;
+            pieces.addAll(oversized
+                ? splitLongParagraph(candidate, TEXT_SIMILARITY_MAX_PARAGRAPH_LENGTH)
+                : Collections.singletonList(candidate));
+        }
+        return pieces;
+    }
+
     /**
      * 判断是否应该使用长段落分割策略
      *
@@ -563,7 +574,7 @@ public class BiddingContent {
 
         // 解析配置并检查是否启用忽略关键信息类型
         try {
-            SmartFilterConfig config = com.alibaba.fastjson.JSONObject.parseObject(smartFilterConfig, SmartFilterConfig.class);
+            SmartFilterConfig config = JSONObject.parseObject(smartFilterConfig, SmartFilterConfig.class);
             return config != null && config.getIgnoreKeyInfoTypesEnable();
         } catch (Exception e) {
             log.warn("解析智能过滤配置失败,使用默认段落分割策略: {}", e.getMessage());
diff --git a/ai_check/src/main/java/org/dromara/aiCheck/service/SmartFilterProcessor.java b/ai_check/src/main/java/org/dromara/aiCheck/service/SmartFilterProcessor.java
index 66c83c0..25b3b11 100644
--- a/ai_check/src/main/java/org/dromara/aiCheck/service/SmartFilterProcessor.java
+++ b/ai_check/src/main/java/org/dromara/aiCheck/service/SmartFilterProcessor.java
@@ -2,24 +2,11 @@ package org.dromara.aiCheck.service;
 
 import com.aspose.words.*;
 import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
-import com.fasterxml.jackson.core.JsonParser;
-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.core.type.TypeReference;
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.ObjectMapper;
 import org.dromara.aiCheck.Utils.MapTypeConverter;
-import org.dromara.aiCheck.config.AiCheckConcurrencySupport;
-import org.dromara.chat.domain.vo.LLMRequest;
-import org.dromara.chat.domain.vo.LLMResponse;
-import org.dromara.chat.domain.vo.Message;
 import org.dromara.chat.service.impl.LLMService;
-import org.dromara.common.core.domain.R;
-import org.dromara.common.core.utils.SpringUtils;
 import org.dromara.common.core.utils.StringUtils;
 import org.dromara.common.core.utils.file.AsposeTempFileUtils;
 import org.dromara.common.core.utils.file.FileParseUtil;
-import org.dromara.common.core.utils.file.FileUtils;
-import org.dromara.review.domain.bo.AiPromptBo;
 import org.dromara.review.domain.bo.SmartFilterConfig;
 import org.dromara.review.service.IAiPromptService;
 import org.dromara.system.domain.vo.SysOssVo;
@@ -28,17 +15,14 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.stereotype.Component;
-import reactor.core.publisher.Mono;
 
 import java.io.File;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
-import java.util.*;
 import java.util.List;
-import java.util.concurrent.*;
-import java.util.regex.Pattern;
+import java.util.*;
 import java.util.stream.Collectors;
 
 /**
@@ -80,8 +64,8 @@ public class SmartFilterProcessor {
         String biddingFileContent = null;
         Set<String> tempLocalFiles = new HashSet<>();
 
-        // 缓存已读取的文件内容，key为ossId，value为标题内容映射
-        Map<Long,List<String>> fileTitleContentCache = new HashMap<>();
+        // 缓存技术标准所在段落ID，key为ossId，value为doc_g段落ID集合
+        Map<Long, Set<String>> fileTechnicalStandardParagraphIdCache = new HashMap<>();
 
         try {
             // 检查是否需要忽略与投标文件相同的内容
@@ -92,18 +76,20 @@ public class SmartFilterProcessor {
                 File file = new File(biddingFileLocalPath);
                 biddingFileContent = FileParseUtil.parseFile(file);
             }
-            for (Map<String, Object> stringObjectMap : paragraphsWithOssId) {
-                Long ossId = (Long) stringObjectMap.get("ossId");
-                List<String> titleContentList = fileTitleContentCache.get(ossId);
-                if (titleContentList == null) {
-                    // 下载文件到本地
-                    String localFilePath = downloadToUniqueLocalPath(ossId, "smartfilter_src_");
-                    tempLocalFiles.add(localFilePath);
-                    File file = new File(localFilePath);
-                    // 使用Aspose.Words API解析文件并提取标题内容
-                    titleContentList = extractTitleWithContentFromFile(file);
-                    // 缓存结果
-                    fileTitleContentCache.put(ossId, titleContentList);
+            if (Boolean.TRUE.equals(config.getIgnoreTechnicalStandards())) {
+                for (Map<String, Object> stringObjectMap : paragraphsWithOssId) {
+                    Long ossId = MapTypeConverter.getMapLongValue(stringObjectMap, "ossId");
+                    Set<String> technicalStandardParagraphIds = fileTechnicalStandardParagraphIdCache.get(ossId);
+                    if (technicalStandardParagraphIds == null) {
+                        // 下载文件到本地
+                        String localFilePath = downloadToUniqueLocalPath(ossId, "smartfilter_src_");
+                        tempLocalFiles.add(localFilePath);
+                        File file = new File(localFilePath);
+                        // 使用Aspose.Words API解析文件并提取技术标准所在段落ID
+                        technicalStandardParagraphIds = extractTechnicalStandardParagraphIdsFromFile(file);
+                        // 缓存结果
+                        fileTechnicalStandardParagraphIdCache.put(ossId, technicalStandardParagraphIds);
+                    }
                 }
             }
 
@@ -137,8 +123,10 @@ public class SmartFilterProcessor {
 
                 // 忽略技术标准
                 if (Boolean.TRUE.equals(config.getIgnoreTechnicalStandards())) {
-                    List<String> technicalStandards = fileTitleContentCache.get(ossId);
-                    filteredText = removeTechnicalStandards(filteredText,technicalStandards);
+                    Set<String> technicalStandardParagraphIds = fileTechnicalStandardParagraphIdCache.get(ossId);
+                    if (isTechnicalStandardParagraph(paragraphId, technicalStandardParagraphIds)) {
+                        filteredText = "";
+                    }
                 }
 
                  //6. 忽略重点信息相似词 --只限于重点信息
@@ -272,22 +260,12 @@ public class SmartFilterProcessor {
     }
 
     /**
-     * 移除技术标准内容（包括编制说明及其子目录）
+     * 判断当前段落是否属于技术标准内容（包括编制说明及其子目录）
      */
-    private String removeTechnicalStandards(String content,List<String> technicalStandards) {
-        boolean flag = false;
-        for (String para : technicalStandards) {
-            String para2 = para.trim();
-
-            if (content.equals(para2)) {
-                flag = true;
-                break;
-            }
-        }
-        if(!flag) {
-            return content;
-        }
-        return "";
+    private boolean isTechnicalStandardParagraph(String paragraphId, Set<String> technicalStandardParagraphIds) {
+        // A blank ID or an empty ID set can never match, so the lookup is only attempted when both are usable.
+        boolean lookupPossible = StringUtils.isNotBlank(paragraphId) && CollectionUtils.isNotEmpty(technicalStandardParagraphIds);
+        return lookupPossible && technicalStandardParagraphIds.contains(paragraphId);
     }
 
 
@@ -315,22 +293,18 @@ public class SmartFilterProcessor {
 
 
     /**
-     * 从文件中提取所有标题及其内容
+     * 从文件中提取技术标准标题及其内容的段落ID
      */
-    private List<String> extractTitleWithContentFromFile(File file) throws Exception {
-        List<String> result = new ArrayList<>();
-        result.add("编制说明");
-        result.add("编制依据");
-        result.add("编制原则");
-        result.add("编制范围");
+    private Set<String> extractTechnicalStandardParagraphIdsFromFile(File file) throws Exception {
+        Set<String> result = new HashSet<>();
         try {
             LoadOptions loadOptions = new LoadOptions();
             loadOptions.setTempFolder(ASPOSE_TEMP_DIR.toString());
             Document doc = new Document(file.getPath(), loadOptions);
-            result.addAll(extractSpecificTitleContent(doc, "编制说明"));
-            result.addAll(extractSpecificTitleContent(doc, "编制依据"));
-            result.addAll(extractSpecificTitleContent(doc, "编制原则"));
-            result.addAll(extractSpecificTitleContent(doc, "编制范围"));
+            result.addAll(extractSpecificTitleParagraphIds(doc, "编制说明"));
+            result.addAll(extractSpecificTitleParagraphIds(doc, "编制依据"));
+            result.addAll(extractSpecificTitleParagraphIds(doc, "编制原则"));
+            result.addAll(extractSpecificTitleParagraphIds(doc, "编制范围"));
             return result;
         } finally {
             AsposeTempFileUtils.cleanupStaleAsposeTempFiles(SmartFilterProcessor.class, ASPOSE_TEMP_DIR);
@@ -342,18 +316,18 @@ public class SmartFilterProcessor {
      *
      * @param doc  Aspose.Words文档对象
      * @param targetTitle 要提取的标题文本
-     * @return 该标题下的内容段落列表
+     * @return 该标题下的内容段落ID集合
      */
-    public static List<String> extractSpecificTitleContent(Document doc, String targetTitle) {
-        List<String> contentList = new ArrayList<>();
+    public static Set<String> extractSpecificTitleParagraphIds(Document doc, String targetTitle) throws Exception {
+        Set<String> paragraphIds = new HashSet<>();
         boolean isTargetTitleFound = false;
-        boolean isNextTitleFound = false;
 
-        // 获取文档中的所有段落
-        NodeCollection<com.aspose.words.Paragraph> paragraphs = doc.getChildNodes(NodeType.PARAGRAPH, true);
+        List<Paragraph> paragraphs = collectComparableParagraphs(doc);
 
-        for (com.aspose.words.Paragraph paragraph : paragraphs) {
-            String paragraphText = paragraph.getText().trim();
+        for (int i = 0; i < paragraphs.size(); i++) {
+            Paragraph paragraph = paragraphs.get(i);
+            String paragraphId = "doc_g" + (i + 1);
+            String paragraphText = extractParagraphPlainText(paragraph);
             if (paragraphText.isEmpty()) {
                 continue; // 跳过空段落
             }
@@ -362,17 +336,22 @@ public class SmartFilterProcessor {
             String styleName = paragraph.getParagraphFormat().getStyle().getName();
 
             // 检查是否是标题样式（以"标题"或"Heading"开头）
-            boolean isHeading = styleName.startsWith("Heading 2") || styleName.startsWith("Heading 1");
+            boolean isHeading = styleName.startsWith("Heading 2")
+                || styleName.startsWith("Heading 1")
+                || styleName.startsWith("标题 2")
+                || styleName.startsWith("标题 1");
+
+            if (paragraphText.equals(targetTitle)) {
+                paragraphIds.add(paragraphId);
+            }
 
             if (isTargetTitleFound) {
                 // 如果已经找到了目标标题，现在检查是否遇到了下一个标题
                 if (isHeading) {
                     // 遇到了下一个标题，结束当前标题的内容提取
-                    isNextTitleFound = true;
                     break;
                 } else {
-                    // 添加内容到列表
-                    contentList.add(paragraphText);
+                    paragraphIds.add(paragraphId);
                 }
             } else {
                 // 寻找目标标题
@@ -381,7 +360,57 @@ public class SmartFilterProcessor {
                 }
             }
         }
-        return contentList;
+        return paragraphIds;
+    }
+
+    /** Gathers the body paragraphs of every section in document order, leaving out those nested in shapes. */
+    private static List<Paragraph> collectComparableParagraphs(Document doc) {
+        List<Paragraph> comparable = new ArrayList<>();
+        for (Section section : doc.getSections()) {
+            for (Object node : section.getBody().getChildNodes(NodeType.PARAGRAPH, true)) {
+                if (!isInsideShape((Paragraph) node)) {
+                    comparable.add((Paragraph) node);
+                }
+            }
+        }
+        return comparable;
+    }
+
+    /** Returns true when the paragraph, or any node above it, is a SHAPE or GROUP_SHAPE node. */
+    private static boolean isInsideShape(Paragraph paragraph) {
+        // The walk starts at the paragraph itself and climbs until the parent chain ends.
+        for (Node node = paragraph; node != null; node = node.getParentNode()) {
+            int kind = node.getNodeType();
+            if (kind == NodeType.SHAPE || kind == NodeType.GROUP_SHAPE) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    private static String extractParagraphPlainText(Paragraph paragraph) { // Trimmed plain text of the paragraph; never null.
+        try {
+            Paragraph sanitizedParagraph = (Paragraph) paragraph.deepClone(true); // shapes are removed from a copy, not from the document
+            removeShapeNodes(sanitizedParagraph);
+            return sanitizedParagraph.toString(SaveFormat.TEXT).trim();
+        } catch (Exception ignored) { // best effort: on any failure above, retry with the paragraph as-is
+            try {
+                return paragraph.toString(SaveFormat.TEXT).trim();
+            } catch (Exception ex) { // text export failed a second time: treat the paragraph as empty
+                return "";
+            }
+        }
+    }
+
+    /** Detaches every SHAPE descendant, then every GROUP_SHAPE descendant, from the given paragraph. */
+    private static void removeShapeNodes(Paragraph paragraph) throws Exception {
+        for (int shapeType : new int[] {NodeType.SHAPE, NodeType.GROUP_SHAPE}) {
+            NodeCollection doomed = paragraph.getChildNodes(shapeType, true);
+            // Walk backwards so a removal never shifts the indexes that are still to be visited.
+            for (int idx = doomed.getCount() - 1; idx >= 0; idx--) {
+                doomed.get(idx).remove();
+            }
+        }
     }
 
     /**
diff --git a/ruoyi-modules/ai-intelligent-review/src/main/java/org/dromara/review/service/impl/BiddingAnalysisResultServiceImpl.java b/ruoyi-modules/ai-intelligent-review/src/main/java/org/dromara/review/service/impl/BiddingAnalysisResultServiceImpl.java
index 456611a..57e4bf8 100644
--- a/ruoyi-modules/ai-intelligent-review/src/main/java/org/dromara/review/service/impl/BiddingAnalysisResultServiceImpl.java
+++ b/ruoyi-modules/ai-intelligent-review/src/main/java/org/dromara/review/service/impl/BiddingAnalysisResultServiceImpl.java
@@ -247,8 +247,13 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS
                                                               SmartFilterConfig smartFilterConfig) {
         try {
 
-            int totalItems1 = paragraphs1.size();
-            int totalItems2 = paragraphs2.size();
+            // 确定是否在相似度计算时忽略标点符号
+            boolean ignorePunctuation = smartFilterConfig != null
+                && Boolean.TRUE.equals(smartFilterConfig.getIgnorePunctuationAndShortText());
+            List<SimilarityParagraph> preparedParagraphs1 = prepareSimilarityParagraphs(paragraphs1, ignorePunctuation);
+            List<SimilarityParagraph> preparedParagraphs2 = prepareSimilarityParagraphs(paragraphs2, ignorePunctuation);
+            int totalItems1 = preparedParagraphs1.size();
+            int totalItems2 = preparedParagraphs2.size();
             int similarItemCount = 0;
             List<Map<String, Object>> duplicateContents = new ArrayList<>();
 
@@ -261,42 +266,24 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS
                 return new SimilarityResult(0L, duplicateContents);
             }
 
-            // 确定是否在相似度计算时忽略标点符号
-            boolean ignorePunctuation = smartFilterConfig != null
-                && Boolean.TRUE.equals(smartFilterConfig.getIgnorePunctuationAndShortText());
-
             // 比较每对段落的相似度
-            // 缓存已查询过的pdfOssId结果
-            Map<Long, Long> pdfOssIdCache = new HashMap<>();
             Map<Long, Long> dtlIdCache = new HashMap<>();
             for (int i = 0; i < totalItems1; i++) {
-                String para1 = paragraphs1.get(i).get("text").toString();
-                String page1 = paragraphs1.get(i).get("page").toString();
-                String paragraphId = paragraphs1.get(i).get("paragraphId").toString();
-                // 直接从ossIds1列表中获取当前段落对应的ossId
-                Long ossId1 = (Long) paragraphs1.get(i).get("ossId");
-
-                if (para1 == null || para1.trim().isEmpty()) {
-                    continue;
-                }
+                SimilarityParagraph left = preparedParagraphs1.get(i);
 
                 for (int j = 0; j < totalItems2; j++) {
                     // 如果段落2的第j个元素已经被匹配过，则跳过
                     if (matchedIndices2.contains(j)) {
                         continue;
                     }
-                    String page2 = paragraphs2.get(j).get("page").toString();
-
-                    String para2 = paragraphs2.get(j).get("text").toString();
-                    String paragraphId2 = paragraphs2.get(j).get("paragraphId").toString();
-                    // 直接从ossIds1列表中获取当前段落对应的ossId
-                    Long ossId2 = (Long) paragraphs2.get(j).get("ossId");
-
-                    if (para2 == null || para2.trim().isEmpty()) {
+                    SimilarityParagraph right = preparedParagraphs2.get(j);
+                    if (!canReachSimilarityThreshold(left, right, threshold)) {
                         continue;
                     }
 
-                    double similarity = calculateParagraphSimilarity(para1, para2, ignorePunctuation);
+                    double similarity = left.normalizedText.equals(right.normalizedText)
+                        ? 1.0
+                        : calculateParagraphSimilarity(left.normalizedText, right.normalizedText);
                     // 如果相似度超过阈值，则认为是重复内容
                     if (similarity * 100 >= threshold) {
                         similarItemCount++;
@@ -307,22 +294,22 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS
 
                         // 为每个重复段落创建包含内容和具体文件ID的Map
                         com.alibaba.fastjson.JSONObject duplicateItem = new com.alibaba.fastjson.JSONObject();
-                        duplicateItem.put("smallContent", para1);
-                        if (para1.length() > 20) {
-                            content = para1.substring(0, 20);
+                        duplicateItem.put("smallContent", left.originalText);
+                        if (left.originalText.length() > 20) {
+                            content = left.originalText.substring(0, 20);
                             duplicateItem.put("smallContent", content);
                         }
-                        duplicateItem.put("smallContentB", para2);
-                        if (para2.length() > 20) {
-                            content = para2.substring(0, 20);
+                        duplicateItem.put("smallContentB", right.originalText);
+                        if (right.originalText.length() > 20) {
+                            content = right.originalText.substring(0, 20);
                             duplicateItem.put("smallContentB", content);
                         }
-                        duplicateItem.put("content", para1);
-                        duplicateItem.put("contentB", para2);
-                        duplicateItem.put("page2",page2);
-                        duplicateItem.put("page1", page1);
-                        duplicateItem.put("paragraphId2", paragraphId2);
-                        duplicateItem.put("paragraphId1", paragraphId);
+                        duplicateItem.put("content", left.originalText);
+                        duplicateItem.put("contentB", right.originalText);
+                        duplicateItem.put("page2", right.page);
+                        duplicateItem.put("page1", left.page);
+                        duplicateItem.put("paragraphId2", right.paragraphId);
+                        duplicateItem.put("paragraphId1", left.paragraphId);
 
                         // 使用缓存获取pdf1
 //                        Long pdf1 = pdfOssIdCache.get(ossId1);
@@ -337,19 +324,19 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS
 //                            pdf2 = biddingProposalDtlMapper.findPdfOssId(ossId2);
 //                            pdfOssIdCache.put(ossId2, pdf2);
 //                        }
-                        Long dtlId1 = dtlIdCache.get(ossId1);
+                        Long dtlId1 = dtlIdCache.get(left.ossId);
                         if (dtlId1 == null) {
-                            dtlId1 = biddingProposalDtlMapper.findDtlIdByOssId(ossId1);
-                            dtlIdCache.put(ossId1, dtlId1);
+                            dtlId1 = biddingProposalDtlMapper.findDtlIdByOssId(left.ossId);
+                            dtlIdCache.put(left.ossId, dtlId1);
                         }
 
-                        Long dtlId2 = dtlIdCache.get(ossId2);
+                        Long dtlId2 = dtlIdCache.get(right.ossId);
                         if (dtlId2 == null) {
-                            dtlId2 = biddingProposalDtlMapper.findDtlIdByOssId(ossId2);
-                            dtlIdCache.put(ossId2, dtlId2);
+                            dtlId2 = biddingProposalDtlMapper.findDtlIdByOssId(right.ossId);
+                            dtlIdCache.put(right.ossId, dtlId2);
                         }
                         //文本相似度结果，每个重复段落中已经包含了对应的具体文件ID
-                        BiddingAnalysisResultDtl textResult = addResult(duplicateItem.toString(), 3, ossId1, ossId2, dtlId1, dtlId2);
+                        BiddingAnalysisResultDtl textResult = addResult(duplicateItem.toString(), 3, left.ossId, right.ossId, dtlId1, dtlId2);
                         biddingResultList.add(textResult);
                         break; // 避免一个段落被多次匹配
                     }
@@ -509,15 +496,93 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS
     }
 
     /**
-     * 计算两个段落的相似度(适用于中文文本)
-     * 使用字符级别的相似度算法,结合LCS和字符重叠度
-     *
-     * @param para1 段落1
-     * @param para2 段落2
-     * @param ignorePunctuation 是否在比较时忽略标点符号
-     * @return 相似度(0.0-1.0之间的值,1.0表示完全相同)
+     * 预处理段落，避免在两两比较时重复清洗文本和构建字符集合。
      */
-    private double calculateParagraphSimilarity(String para1, String para2, boolean ignorePunctuation) {
+    private List<SimilarityParagraph> prepareSimilarityParagraphs(List<Map<String, Object>> paragraphs, boolean ignorePunctuation) {
+        List<SimilarityParagraph> result = new ArrayList<>();
+        if (paragraphs == null || paragraphs.isEmpty()) {
+            return result;
+        }
+        for (Map<String, Object> paragraph : paragraphs) {
+            // A null entry or a missing "text" value cannot be compared, so it is skipped.
+            Object textObj = paragraph == null ? null : paragraph.get("text");
+            if (textObj == null) {
+                continue;
+            }
+            String originalText = textObj.toString().trim();
+            // Blank text, or text made only of punctuation while punctuation is ignored, normalizes to "".
+            String normalizedText = (ignorePunctuation ? removePunctuation(originalText) : originalText).trim();
+            if (normalizedText.isEmpty()) {
+                continue;
+            }
+            // getOrDefault only covers an absent key; a key mapped to null would otherwise be rendered
+            // as the literal string "null", so null values are mapped to "" explicitly.
+            result.add(new SimilarityParagraph(
+                originalText,
+                normalizedText,
+                java.util.Objects.toString(paragraph.get("page"), ""),
+                java.util.Objects.toString(paragraph.get("paragraphId"), ""),
+                toLong(paragraph.get("ossId")),
+                buildCharSet(normalizedText)
+            ));
+        }
+        return result;
+    }
+
+    /** Cheap pre-check: false means the pair is skipped by the caller without running the full similarity. */
+    private boolean canReachSimilarityThreshold(SimilarityParagraph left, SimilarityParagraph right, int threshold) {
+        boolean identical = left.normalizedText.equals(right.normalizedText);
+        int longer = Math.max(left.length, right.length);
+        if (identical || longer == 0) {
+            return identical;
+        }
+        // NOTE(review): the 0.7 / 0.3 weights presumably mirror calculateParagraphSimilarity - confirm.
+        double lengthRatio = (double) Math.min(left.length, right.length) / longer;
+        double overlap = calculateCharOverlapSimilarity(left.uniqueChars, right.uniqueChars);
+        double bound = lengthRatio * 0.7 + overlap * 0.3;
+        return bound * 100 >= threshold;
+    }
+
+    /** Returns the text with every Unicode punctuation code point (the regex {@code \p{P}} categories) removed. */
+    private String removePunctuation(String text) {
+        StringBuilder builder = new StringBuilder(text.length());
+        // Classify whole code points rather than UTF-16 units: Character.getType(char) reports the halves
+        // of a supplementary character as SURROGATE, so punctuation outside the BMP would be kept.
+        text.codePoints().forEach(codePoint -> {
+            int type = Character.getType(codePoint);
+            boolean punctuation = type == Character.CONNECTOR_PUNCTUATION || type == Character.DASH_PUNCTUATION
+                || type == Character.START_PUNCTUATION || type == Character.END_PUNCTUATION
+                || type == Character.INITIAL_QUOTE_PUNCTUATION || type == Character.FINAL_QUOTE_PUNCTUATION
+                || type == Character.OTHER_PUNCTUATION;
+            if (!punctuation) {
+                builder.appendCodePoint(codePoint);
+            }
+        });
+        return builder.toString();
+    }
+
+    /** Collects the distinct non-whitespace chars (UTF-16 units) that occur in the text. */
+    private Set<Character> buildCharSet(String text) {
+        Set<Character> distinct = new HashSet<>();
+        for (char ch : text.toCharArray()) {
+            if (!Character.isWhitespace(ch)) {
+                distinct.add(ch);
+            }
+        }
+        return distinct;
+    }
+
+    /** Converts a Long, any other Number, or anything else via its string form to Long; null stays null. */
+    private Long toLong(Object value) {
+        if (value == null || value instanceof Long) {
+            return (Long) value; // casting null is legal and yields null
+        }
+        return value instanceof Number
+            ? Long.valueOf(((Number) value).longValue())
+            : Long.valueOf(value.toString());
+    }
+
+    private double calculateParagraphSimilarity(String para1, String para2) {
         if (para1 == null || para2 == null) {
             return 0.0;
         }
@@ -526,12 +591,6 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS
         String text1 = para1.trim();
         String text2 = para2.trim();
 
-        // 如果配置了忽略标点符号，则在比较时临时移除标点符号（但不影响原始文本）
-        if (ignorePunctuation) {
-            text1 = text1.replaceAll("\\p{P}", "");
-            text2 = text2.replaceAll("\\p{P}", "");
-        }
-
         // 如果任一文本为空，返回0
         if (text1.isEmpty() || text2.isEmpty()) {
             return 0.0;
@@ -561,6 +620,27 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS
         return finalSimilarity;
     }
 
+    private static class SimilarityParagraph { // Immutable per-paragraph data prepared once before the pairwise comparison.
+        private final String originalText; // text as reported in duplicate results
+        private final String normalizedText; // text that is actually compared
+        private final String page; // page value copied from the source paragraph map
+        private final String paragraphId; // paragraph ID copied from the source paragraph map
+        private final Long ossId; // ID of the file the paragraph belongs to; may be null
+        private final int length; // cached normalizedText.length()
+        private final Set<Character> uniqueChars; // character set supplied by the caller, stored without copying
+
+        private SimilarityParagraph(String originalText, String normalizedText, String page,
+                                    String paragraphId, Long ossId, Set<Character> uniqueChars) {
+            this.originalText = originalText;
+            this.normalizedText = normalizedText;
+            this.page = page;
+            this.paragraphId = paragraphId;
+            this.ossId = ossId;
+            this.length = normalizedText.length(); // derived here so callers cannot pass an inconsistent length
+            this.uniqueChars = uniqueChars;
+        }
+    }
+
     /**
      * 基于最长公共子序列(LCS)计算相似度
      */
@@ -603,21 +683,13 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS
      */
     private double calculateCharOverlapSimilarity(String text1, String text2) {
         // 将文本转换为字符集合(去除空白字符和标点符号)
-        Set<Character> chars1 = new HashSet<>();
-        Set<Character> chars2 = new HashSet<>();
+        Set<Character> chars1 = buildCharSet(text1);
+        Set<Character> chars2 = buildCharSet(text2);
 
-        for (char c : text1.toCharArray()) {
-            if (!Character.isWhitespace(c)) {
-                chars1.add(c);
-            }
-        }
-
-        for (char c : text2.toCharArray()) {
-            if (!Character.isWhitespace(c)) {
-                chars2.add(c);
-            }
-        }
+        return calculateCharOverlapSimilarity(chars1, chars2);
+    }
 
+    private double calculateCharOverlapSimilarity(Set<Character> chars1, Set<Character> chars2) {
         if (chars1.isEmpty() || chars2.isEmpty()) {
             return 0.0;
         }