From bc8f2acbc30fa3641e9f0fcf9c638a0d798710d7 Mon Sep 17 00:00:00 2001 From: cjh <949661474@qq.com> Date: Tue, 2 Jun 2026 17:55:00 +0800 Subject: [PATCH] =?UTF-8?q?1.=E6=96=87=E6=9C=AC=E7=9B=B8=E4=BC=BC=E5=BA=A6?= =?UTF-8?q?=E5=8A=A0=E5=BF=AB=E5=AE=A1=E6=9F=A5=E9=80=9F=E5=BA=A62.?= =?UTF-8?q?=E5=BF=BD=E7=95=A5=E6=8A=80=E6=9C=AF=E6=A0=87=E5=87=86=E4=BF=AE?= =?UTF-8?q?=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../aiCheck/service/BiddingContent.java | 41 ++-- .../aiCheck/service/SmartFilterProcessor.java | 173 ++++++++------ .../BiddingAnalysisResultServiceImpl.java | 216 ++++++++++++------ 3 files changed, 271 insertions(+), 159 deletions(-) diff --git a/ai_check/src/main/java/org/dromara/aiCheck/service/BiddingContent.java b/ai_check/src/main/java/org/dromara/aiCheck/service/BiddingContent.java index 1462f75..01f4913 100644 --- a/ai_check/src/main/java/org/dromara/aiCheck/service/BiddingContent.java +++ b/ai_check/src/main/java/org/dromara/aiCheck/service/BiddingContent.java @@ -5,15 +5,14 @@ import ai.z.openapi.service.ocr.HandwritingOcrResponse; import ai.z.openapi.service.ocr.HandwritingOcrUploadReq; import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONObject; -import com.aspose.words.*; import com.aspose.words.Document; import com.aspose.words.Font; import com.aspose.words.ParagraphAlignment; +import com.aspose.words.*; import com.spire.doc.fields.ShapeGroup; import com.spire.doc.fields.ShapeObject; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import okhttp3.*; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.poi.hwpf.HWPFDocument; @@ -24,9 +23,17 @@ import org.dromara.common.core.utils.StringUtils; import org.dromara.common.core.utils.file.AsposeTempFileUtils; import org.dromara.common.core.utils.file.FileParseUtil; import org.dromara.review.domain.bo.SmartFilterConfig; +import org.opencv.core.Core; +import org.opencv.core.Mat; +import org.opencv.imgcodecs.Imgcodecs; +import org.opencv.imgproc.Imgproc; import org.openxmlformats.schemas.wordprocessingml.x2006.main.*; import org.springframework.stereotype.Service; +import javax.imageio.ImageIO; +import javax.imageio.ImageReadParam; +import javax.imageio.ImageReader; +import javax.imageio.stream.ImageInputStream; import java.awt.*; import java.awt.image.BufferedImage; import java.io.File; @@ -36,24 +43,14 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.*; +import java.security.MessageDigest; import java.util.List; +import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; -import org.opencv.core.Core; -import org.opencv.core.Mat; -import org.opencv.imgcodecs.Imgcodecs; -import org.opencv.imgproc.Imgproc; - -import javax.imageio.ImageIO; -import javax.imageio.ImageReadParam; -import javax.imageio.ImageReader; -import javax.imageio.stream.ImageInputStream; - import static cn.dev33.satoken.util.SaHexUtil.bytesToHex; -import java.security.MessageDigest; @Slf4j @RequiredArgsConstructor @@ -70,6 +67,7 @@ public class BiddingContent { private static final double OCR_SOLID_IMAGE_THRESHOLD = 0.995D; private static final int OCR_SAMPLE_GRID = 32; private static final int OCR_CACHE_MAX_SIZE = 2000; + private static final int TEXT_SIMILARITY_MAX_PARAGRAPH_LENGTH = 800; private static final Map> OCR_RESULT_CACHE = Collections.synchronizedMap(new LinkedHashMap<>(256, 0.75F, true) { @Override @@ -135,6 +133,7 @@ public class BiddingContent { } else { processedParagraphs = splitTextToSentences(originalParagraph); } + processedParagraphs = splitOversizedParagraphs(processedParagraphs); for (int subIndex = 0; subIndex < processedParagraphs.size(); subIndex++) { String processedText = processedParagraphs.get(subIndex); @@ -543,6 +542,18 @@ public class BiddingContent { return sentences; } + private List splitOversizedParagraphs(List paragraphs) { + List result = new ArrayList<>(); + for (String paragraph : paragraphs) { + if (paragraph == null || paragraph.length() <= TEXT_SIMILARITY_MAX_PARAGRAPH_LENGTH) { + result.add(paragraph); + } else { + result.addAll(splitLongParagraph(paragraph, TEXT_SIMILARITY_MAX_PARAGRAPH_LENGTH)); + } + } + return result; + } + /** * 判断是否应该使用长段落分割策略 * @@ -563,7 +574,7 @@ public class BiddingContent { // 解析配置并检查是否启用忽略关键信息类型 try { - SmartFilterConfig config = com.alibaba.fastjson.JSONObject.parseObject(smartFilterConfig, SmartFilterConfig.class); + SmartFilterConfig config = JSONObject.parseObject(smartFilterConfig, SmartFilterConfig.class); return config != null && config.getIgnoreKeyInfoTypesEnable(); } catch (Exception e) { log.warn("解析智能过滤配置失败,使用默认段落分割策略: {}", e.getMessage()); diff --git a/ai_check/src/main/java/org/dromara/aiCheck/service/SmartFilterProcessor.java b/ai_check/src/main/java/org/dromara/aiCheck/service/SmartFilterProcessor.java index 66c83c0..25b3b11 100644 --- a/ai_check/src/main/java/org/dromara/aiCheck/service/SmartFilterProcessor.java +++ b/ai_check/src/main/java/org/dromara/aiCheck/service/SmartFilterProcessor.java @@ -2,24 +2,11 @@ package org.dromara.aiCheck.service; import com.aspose.words.*; import com.baomidou.mybatisplus.core.toolkit.CollectionUtils; -import com.fasterxml.jackson.core.JsonParser; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.core.type.TypeReference; -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; import org.dromara.aiCheck.Utils.MapTypeConverter; -import org.dromara.aiCheck.config.AiCheckConcurrencySupport; -import org.dromara.chat.domain.vo.LLMRequest; -import org.dromara.chat.domain.vo.LLMResponse; -import org.dromara.chat.domain.vo.Message; import org.dromara.chat.service.impl.LLMService; -import org.dromara.common.core.domain.R; -import org.dromara.common.core.utils.SpringUtils; import org.dromara.common.core.utils.StringUtils; import org.dromara.common.core.utils.file.AsposeTempFileUtils; import org.dromara.common.core.utils.file.FileParseUtil; -import org.dromara.common.core.utils.file.FileUtils; -import org.dromara.review.domain.bo.AiPromptBo; import org.dromara.review.domain.bo.SmartFilterConfig; import org.dromara.review.service.IAiPromptService; import org.dromara.system.domain.vo.SysOssVo; @@ -28,17 +15,14 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; -import reactor.core.publisher.Mono; import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.*; import java.util.List; -import java.util.concurrent.*; -import java.util.regex.Pattern; +import java.util.*; import java.util.stream.Collectors; /** @@ -80,8 +64,8 @@ public class SmartFilterProcessor { String biddingFileContent = null; Set tempLocalFiles = new HashSet<>(); - // 缓存已读取的文件内容,key为ossId,value为标题内容映射 - Map> fileTitleContentCache = new HashMap<>(); + // 缓存技术标准所在段落ID,key为ossId,value为doc_g段落ID集合 + Map> fileTechnicalStandardParagraphIdCache = new HashMap<>(); try { // 检查是否需要忽略与投标文件相同的内容 @@ -92,18 +76,20 @@ public class SmartFilterProcessor { File file = new File(biddingFileLocalPath); biddingFileContent = FileParseUtil.parseFile(file); } - for (Map stringObjectMap : paragraphsWithOssId) { - Long ossId = (Long) stringObjectMap.get("ossId"); - List titleContentList = fileTitleContentCache.get(ossId); - if (titleContentList == null) { - // 下载文件到本地 - String localFilePath = downloadToUniqueLocalPath(ossId, "smartfilter_src_"); - tempLocalFiles.add(localFilePath); - File file = new File(localFilePath); - // 使用Aspose.Words API解析文件并提取标题内容 - titleContentList = extractTitleWithContentFromFile(file); - // 缓存结果 - fileTitleContentCache.put(ossId, titleContentList); + if (Boolean.TRUE.equals(config.getIgnoreTechnicalStandards())) { + for (Map stringObjectMap : paragraphsWithOssId) { + Long ossId = MapTypeConverter.getMapLongValue(stringObjectMap, "ossId"); + Set technicalStandardParagraphIds = fileTechnicalStandardParagraphIdCache.get(ossId); + if (technicalStandardParagraphIds == null) { + // 下载文件到本地 + String localFilePath = downloadToUniqueLocalPath(ossId, "smartfilter_src_"); + tempLocalFiles.add(localFilePath); + File file = new File(localFilePath); + // 使用Aspose.Words API解析文件并提取技术标准所在段落ID + technicalStandardParagraphIds = extractTechnicalStandardParagraphIdsFromFile(file); + // 缓存结果 + fileTechnicalStandardParagraphIdCache.put(ossId, technicalStandardParagraphIds); + } } } @@ -137,8 +123,10 @@ public class SmartFilterProcessor { // 忽略技术标准 if (Boolean.TRUE.equals(config.getIgnoreTechnicalStandards())) { - List technicalStandards = fileTitleContentCache.get(ossId); - filteredText = removeTechnicalStandards(filteredText,technicalStandards); + Set technicalStandardParagraphIds = fileTechnicalStandardParagraphIdCache.get(ossId); + if (isTechnicalStandardParagraph(paragraphId, technicalStandardParagraphIds)) { + filteredText = ""; + } } //6. 忽略重点信息相似词 --只限于重点信息 @@ -272,22 +260,12 @@ public class SmartFilterProcessor { } /** - * 移除技术标准内容(包括编制说明及其子目录) + * 判断当前段落是否属于技术标准内容(包括编制说明及其子目录) */ - private String removeTechnicalStandards(String content,List technicalStandards) { - boolean flag = false; - for (String para : technicalStandards) { - String para2 = para.trim(); - - if (content.equals(para2)) { - flag = true; - break; - } - } - if(!flag) { - return content; - } - return ""; + private boolean isTechnicalStandardParagraph(String paragraphId, Set technicalStandardParagraphIds) { + return StringUtils.isNotBlank(paragraphId) + && CollectionUtils.isNotEmpty(technicalStandardParagraphIds) + && technicalStandardParagraphIds.contains(paragraphId); } @@ -315,22 +293,18 @@ public class SmartFilterProcessor { /** - * 从文件中提取所有标题及其内容 + * 从文件中提取技术标准标题及其内容的段落ID */ - private List extractTitleWithContentFromFile(File file) throws Exception { - List result = new ArrayList<>(); - result.add("编制说明"); - result.add("编制依据"); - result.add("编制原则"); - result.add("编制范围"); + private Set extractTechnicalStandardParagraphIdsFromFile(File file) throws Exception { + Set result = new HashSet<>(); try { LoadOptions loadOptions = new LoadOptions(); loadOptions.setTempFolder(ASPOSE_TEMP_DIR.toString()); Document doc = new Document(file.getPath(), loadOptions); - result.addAll(extractSpecificTitleContent(doc, "编制说明")); - result.addAll(extractSpecificTitleContent(doc, "编制依据")); - result.addAll(extractSpecificTitleContent(doc, "编制原则")); - result.addAll(extractSpecificTitleContent(doc, "编制范围")); + result.addAll(extractSpecificTitleParagraphIds(doc, "编制说明")); + result.addAll(extractSpecificTitleParagraphIds(doc, "编制依据")); + result.addAll(extractSpecificTitleParagraphIds(doc, "编制原则")); + result.addAll(extractSpecificTitleParagraphIds(doc, "编制范围")); return result; } finally { AsposeTempFileUtils.cleanupStaleAsposeTempFiles(SmartFilterProcessor.class, ASPOSE_TEMP_DIR); @@ -342,18 +316,18 @@ public class SmartFilterProcessor { * * @param doc Aspose.Words文档对象 * @param targetTitle 要提取的标题文本 - * @return 该标题下的内容段落列表 + * @return 该标题下的内容段落ID集合 */ - public static List extractSpecificTitleContent(Document doc, String targetTitle) { - List contentList = new ArrayList<>(); + public static Set extractSpecificTitleParagraphIds(Document doc, String targetTitle) throws Exception { + Set paragraphIds = new HashSet<>(); boolean isTargetTitleFound = false; - boolean isNextTitleFound = false; - // 获取文档中的所有段落 - NodeCollection paragraphs = doc.getChildNodes(NodeType.PARAGRAPH, true); + List paragraphs = collectComparableParagraphs(doc); - for (com.aspose.words.Paragraph paragraph : paragraphs) { - String paragraphText = paragraph.getText().trim(); + for (int i = 0; i < paragraphs.size(); i++) { + Paragraph paragraph = paragraphs.get(i); + String paragraphId = "doc_g" + (i + 1); + String paragraphText = extractParagraphPlainText(paragraph); if (paragraphText.isEmpty()) { continue; // 跳过空段落 } @@ -362,17 +336,22 @@ public class SmartFilterProcessor { String styleName = paragraph.getParagraphFormat().getStyle().getName(); // 检查是否是标题样式(以"标题"或"Heading"开头) - boolean isHeading = styleName.startsWith("Heading 2") || styleName.startsWith("Heading 1"); + boolean isHeading = styleName.startsWith("Heading 2") + || styleName.startsWith("Heading 1") + || styleName.startsWith("标题 2") + || styleName.startsWith("标题 1"); + + if (paragraphText.equals(targetTitle)) { + paragraphIds.add(paragraphId); + } if (isTargetTitleFound) { // 如果已经找到了目标标题,现在检查是否遇到了下一个标题 if (isHeading) { // 遇到了下一个标题,结束当前标题的内容提取 - isNextTitleFound = true; break; } else { - // 添加内容到列表 - contentList.add(paragraphText); + paragraphIds.add(paragraphId); } } else { // 寻找目标标题 @@ -381,7 +360,57 @@ public class SmartFilterProcessor { } } } - return contentList; + return paragraphIds; + } + + private static List collectComparableParagraphs(Document doc) { + List paragraphs = new ArrayList<>(); + for (Section section : doc.getSections()) { + for (Object paragraphObj : section.getBody().getChildNodes(NodeType.PARAGRAPH, true)) { + Paragraph paragraph = (Paragraph) paragraphObj; + if (!isInsideShape(paragraph)) { + paragraphs.add(paragraph); + } + } + } + return paragraphs; + } + + private static boolean isInsideShape(Paragraph paragraph) { + Node current = paragraph; + while (current != null) { + int nodeType = current.getNodeType(); + if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE) { + return true; + } + current = current.getParentNode(); + } + return false; + } + + private static String extractParagraphPlainText(Paragraph paragraph) { + try { + Paragraph sanitizedParagraph = (Paragraph) paragraph.deepClone(true); + removeShapeNodes(sanitizedParagraph); + return sanitizedParagraph.toString(SaveFormat.TEXT).trim(); + } catch (Exception ignored) { + try { + return paragraph.toString(SaveFormat.TEXT).trim(); + } catch (Exception ex) { + return ""; + } + } + } + + private static void removeShapeNodes(Paragraph paragraph) throws Exception { + NodeCollection shapes = paragraph.getChildNodes(NodeType.SHAPE, true); + for (int i = shapes.getCount() - 1; i >= 0; i--) { + shapes.get(i).remove(); + } + NodeCollection groupShapes = paragraph.getChildNodes(NodeType.GROUP_SHAPE, true); + for (int i = groupShapes.getCount() - 1; i >= 0; i--) { + groupShapes.get(i).remove(); + } } /** diff --git a/ruoyi-modules/ai-intelligent-review/src/main/java/org/dromara/review/service/impl/BiddingAnalysisResultServiceImpl.java b/ruoyi-modules/ai-intelligent-review/src/main/java/org/dromara/review/service/impl/BiddingAnalysisResultServiceImpl.java index 456611a..57e4bf8 100644 --- a/ruoyi-modules/ai-intelligent-review/src/main/java/org/dromara/review/service/impl/BiddingAnalysisResultServiceImpl.java +++ b/ruoyi-modules/ai-intelligent-review/src/main/java/org/dromara/review/service/impl/BiddingAnalysisResultServiceImpl.java @@ -247,8 +247,13 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS SmartFilterConfig smartFilterConfig) { try { - int totalItems1 = paragraphs1.size(); - int totalItems2 = paragraphs2.size(); + // 确定是否在相似度计算时忽略标点符号 + boolean ignorePunctuation = smartFilterConfig != null + && Boolean.TRUE.equals(smartFilterConfig.getIgnorePunctuationAndShortText()); + List preparedParagraphs1 = prepareSimilarityParagraphs(paragraphs1, ignorePunctuation); + List preparedParagraphs2 = prepareSimilarityParagraphs(paragraphs2, ignorePunctuation); + int totalItems1 = preparedParagraphs1.size(); + int totalItems2 = preparedParagraphs2.size(); int similarItemCount = 0; List> duplicateContents = new ArrayList<>(); @@ -261,42 +266,24 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS return new SimilarityResult(0L, duplicateContents); } - // 确定是否在相似度计算时忽略标点符号 - boolean ignorePunctuation = smartFilterConfig != null - && Boolean.TRUE.equals(smartFilterConfig.getIgnorePunctuationAndShortText()); - // 比较每对段落的相似度 - // 缓存已查询过的pdfOssId结果 - Map pdfOssIdCache = new HashMap<>(); Map dtlIdCache = new HashMap<>(); for (int i = 0; i < totalItems1; i++) { - String para1 = paragraphs1.get(i).get("text").toString(); - String page1 = paragraphs1.get(i).get("page").toString(); - String paragraphId = paragraphs1.get(i).get("paragraphId").toString(); - // 直接从ossIds1列表中获取当前段落对应的ossId - Long ossId1 = (Long) paragraphs1.get(i).get("ossId"); - - if (para1 == null || para1.trim().isEmpty()) { - continue; - } + SimilarityParagraph left = preparedParagraphs1.get(i); for (int j = 0; j < totalItems2; j++) { // 如果段落2的第j个元素已经被匹配过,则跳过 if (matchedIndices2.contains(j)) { continue; } - String page2 = paragraphs2.get(j).get("page").toString(); - - String para2 = paragraphs2.get(j).get("text").toString(); - String paragraphId2 = paragraphs2.get(j).get("paragraphId").toString(); - // 直接从ossIds1列表中获取当前段落对应的ossId - Long ossId2 = (Long) paragraphs2.get(j).get("ossId"); - - if (para2 == null || para2.trim().isEmpty()) { + SimilarityParagraph right = preparedParagraphs2.get(j); + if (!canReachSimilarityThreshold(left, right, threshold)) { continue; } - double similarity = calculateParagraphSimilarity(para1, para2, ignorePunctuation); + double similarity = left.normalizedText.equals(right.normalizedText) + ? 1.0 + : calculateParagraphSimilarity(left.normalizedText, right.normalizedText); // 如果相似度超过阈值,则认为是重复内容 if (similarity * 100 >= threshold) { similarItemCount++; @@ -307,22 +294,22 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS // 为每个重复段落创建包含内容和具体文件ID的Map com.alibaba.fastjson.JSONObject duplicateItem = new com.alibaba.fastjson.JSONObject(); - duplicateItem.put("smallContent", para1); - if (para1.length() > 20) { - content = para1.substring(0, 20); + duplicateItem.put("smallContent", left.originalText); + if (left.originalText.length() > 20) { + content = left.originalText.substring(0, 20); duplicateItem.put("smallContent", content); } - duplicateItem.put("smallContentB", para2); - if (para2.length() > 20) { - content = para2.substring(0, 20); + duplicateItem.put("smallContentB", right.originalText); + if (right.originalText.length() > 20) { + content = right.originalText.substring(0, 20); duplicateItem.put("smallContentB", content); } - duplicateItem.put("content", para1); - duplicateItem.put("contentB", para2); - duplicateItem.put("page2",page2); - duplicateItem.put("page1", page1); - duplicateItem.put("paragraphId2", paragraphId2); - duplicateItem.put("paragraphId1", paragraphId); + duplicateItem.put("content", left.originalText); + duplicateItem.put("contentB", right.originalText); + duplicateItem.put("page2", right.page); + duplicateItem.put("page1", left.page); + duplicateItem.put("paragraphId2", right.paragraphId); + duplicateItem.put("paragraphId1", left.paragraphId); // 使用缓存获取pdf1 // Long pdf1 = pdfOssIdCache.get(ossId1); @@ -337,19 +324,19 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS // pdf2 = biddingProposalDtlMapper.findPdfOssId(ossId2); // pdfOssIdCache.put(ossId2, pdf2); // } - Long dtlId1 = dtlIdCache.get(ossId1); + Long dtlId1 = dtlIdCache.get(left.ossId); if (dtlId1 == null) { - dtlId1 = biddingProposalDtlMapper.findDtlIdByOssId(ossId1); - dtlIdCache.put(ossId1, dtlId1); + dtlId1 = biddingProposalDtlMapper.findDtlIdByOssId(left.ossId); + dtlIdCache.put(left.ossId, dtlId1); } - Long dtlId2 = dtlIdCache.get(ossId2); + Long dtlId2 = dtlIdCache.get(right.ossId); if (dtlId2 == null) { - dtlId2 = biddingProposalDtlMapper.findDtlIdByOssId(ossId2); - dtlIdCache.put(ossId2, dtlId2); + dtlId2 = biddingProposalDtlMapper.findDtlIdByOssId(right.ossId); + dtlIdCache.put(right.ossId, dtlId2); } //文本相似度结果,每个重复段落中已经包含了对应的具体文件ID - BiddingAnalysisResultDtl textResult = addResult(duplicateItem.toString(), 3, ossId1, ossId2, dtlId1, dtlId2); + BiddingAnalysisResultDtl textResult = addResult(duplicateItem.toString(), 3, left.ossId, right.ossId, dtlId1, dtlId2); biddingResultList.add(textResult); break; // 避免一个段落被多次匹配 } @@ -509,15 +496,93 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS } /** - * 计算两个段落的相似度(适用于中文文本) - * 使用字符级别的相似度算法,结合LCS和字符重叠度 - * - * @param para1 段落1 - * @param para2 段落2 - * @param ignorePunctuation 是否在比较时忽略标点符号 - * @return 相似度(0.0-1.0之间的值,1.0表示完全相同) + * 预处理段落,避免在两两比较时重复清洗文本和构建字符集合。 */ - private double calculateParagraphSimilarity(String para1, String para2, boolean ignorePunctuation) { + private List prepareSimilarityParagraphs(List> paragraphs, boolean ignorePunctuation) { + List result = new ArrayList<>(); + if (paragraphs == null || paragraphs.isEmpty()) { + return result; + } + for (Map paragraph : paragraphs) { + Object textObj = paragraph.get("text"); + if (textObj == null) { + continue; + } + String originalText = textObj.toString().trim(); + if (originalText.isEmpty()) { + continue; + } + String normalizedText = ignorePunctuation ? removePunctuation(originalText) : originalText; + normalizedText = normalizedText.trim(); + if (normalizedText.isEmpty()) { + continue; + } + result.add(new SimilarityParagraph( + originalText, + normalizedText, + String.valueOf(paragraph.getOrDefault("page", "")), + String.valueOf(paragraph.getOrDefault("paragraphId", "")), + toLong(paragraph.get("ossId")), + buildCharSet(normalizedText) + )); + } + return result; + } + + private boolean canReachSimilarityThreshold(SimilarityParagraph left, SimilarityParagraph right, int threshold) { + if (left.normalizedText.equals(right.normalizedText)) { + return true; + } + int maxLength = Math.max(left.length, right.length); + if (maxLength == 0) { + return false; + } + double lengthUpperBound = (double) Math.min(left.length, right.length) / maxLength; + double charOverlap = calculateCharOverlapSimilarity(left.uniqueChars, right.uniqueChars); + double similarityUpperBound = lengthUpperBound * 0.7 + charOverlap * 0.3; + return similarityUpperBound * 100 >= threshold; + } + + private String removePunctuation(String text) { + StringBuilder builder = new StringBuilder(text.length()); + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + int type = Character.getType(c); + if (type != Character.CONNECTOR_PUNCTUATION + && type != Character.DASH_PUNCTUATION + && type != Character.START_PUNCTUATION + && type != Character.END_PUNCTUATION + && type != Character.INITIAL_QUOTE_PUNCTUATION + && type != Character.FINAL_QUOTE_PUNCTUATION + && type != Character.OTHER_PUNCTUATION) { + builder.append(c); + } + } + return builder.toString(); + } + + private Set buildCharSet(String text) { + Set chars = new HashSet<>(); + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + if (!Character.isWhitespace(c)) { + chars.add(c); + } + } + return chars; + } + + private Long toLong(Object value) { + if (value instanceof Long) { + return (Long) value; + } + if (value instanceof Number) { + return ((Number) value).longValue(); + } + return value == null ? null : Long.valueOf(value.toString()); + } + + private double calculateParagraphSimilarity(String para1, String para2) { if (para1 == null || para2 == null) { return 0.0; } @@ -526,12 +591,6 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS String text1 = para1.trim(); String text2 = para2.trim(); - // 如果配置了忽略标点符号,则在比较时临时移除标点符号(但不影响原始文本) - if (ignorePunctuation) { - text1 = text1.replaceAll("\\p{P}", ""); - text2 = text2.replaceAll("\\p{P}", ""); - } - // 如果任一文本为空,返回0 if (text1.isEmpty() || text2.isEmpty()) { return 0.0; @@ -561,6 +620,27 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS return finalSimilarity; } + private static class SimilarityParagraph { + private final String originalText; + private final String normalizedText; + private final String page; + private final String paragraphId; + private final Long ossId; + private final int length; + private final Set uniqueChars; + + private SimilarityParagraph(String originalText, String normalizedText, String page, + String paragraphId, Long ossId, Set uniqueChars) { + this.originalText = originalText; + this.normalizedText = normalizedText; + this.page = page; + this.paragraphId = paragraphId; + this.ossId = ossId; + this.length = normalizedText.length(); + this.uniqueChars = uniqueChars; + } + } + /** * 基于最长公共子序列(LCS)计算相似度 */ @@ -603,21 +683,13 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS */ private double calculateCharOverlapSimilarity(String text1, String text2) { // 将文本转换为字符集合(去除空白字符和标点符号) - Set chars1 = new HashSet<>(); - Set chars2 = new HashSet<>(); + Set chars1 = buildCharSet(text1); + Set chars2 = buildCharSet(text2); - for (char c : text1.toCharArray()) { - if (!Character.isWhitespace(c)) { - chars1.add(c); - } - } - - for (char c : text2.toCharArray()) { - if (!Character.isWhitespace(c)) { - chars2.add(c); - } - } + return calculateCharOverlapSimilarity(chars1, chars2); + } + private double calculateCharOverlapSimilarity(Set chars1, Set chars2) { if (chars1.isEmpty() || chars2.isEmpty()) { return 0.0; }