From 51184df30dc57f299c2faf5f46274c5eb61af8ac Mon Sep 17 00:00:00 2001 From: cjh <949661474@qq.com> Date: Wed, 10 Jun 2026 14:56:59 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=87=E6=A1=A3=E8=84=9A=E6=B3=A8=E9=97=AE?= =?UTF-8?q?=E9=A2=98=E5=92=8C=E5=BF=BD=E7=95=A5=E6=8B=9B=E6=A0=87=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../aiCheck/docExamine/DocExamine.java | 33 ++++-- .../aiCheck/service/BiddingContent.java | 31 +++-- .../aiCheck/service/SmartFilterProcessor.java | 107 ++++++++++++------ 3 files changed, 121 insertions(+), 50 deletions(-) diff --git a/ai_check/src/main/java/org/dromara/aiCheck/docExamine/DocExamine.java b/ai_check/src/main/java/org/dromara/aiCheck/docExamine/DocExamine.java index f23436d..7e26008 100644 --- a/ai_check/src/main/java/org/dromara/aiCheck/docExamine/DocExamine.java +++ b/ai_check/src/main/java/org/dromara/aiCheck/docExamine/DocExamine.java @@ -18,8 +18,8 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.security.CodeSource; -import java.util.*; import java.util.List; +import java.util.*; import java.util.function.Consumer; @@ -1338,13 +1338,30 @@ public class DocExamine implements AutoCloseable { return false; } + private boolean isInsideFootnote(Paragraph paragraph) { + if (paragraph == null) { + return false; + } + Node current = paragraph; + while (current != null) { + if (current.getNodeType() == NodeType.FOOTNOTE) { + return true; + } + current = current.getParentNode(); + } + return false; + } + private String extractParagraphPlainText(Paragraph paragraph) { if (paragraph == null) { return ""; } + if (isInsideFootnote(paragraph)) { + return ""; + } try { Paragraph sanitizedParagraph = (Paragraph) paragraph.deepClone(true); - removeShapeNodes(sanitizedParagraph); + removeIgnoredTextNodes(sanitizedParagraph); return sanitizedParagraph.toString(SaveFormat.TEXT).trim(); } catch (Exception ignored) { try { @@ -1355,19 +1372,19 @@ public class DocExamine implements AutoCloseable { } } - private void removeShapeNodes(Paragraph paragraph) { + private void removeIgnoredTextNodes(Paragraph paragraph) { if (paragraph == null) { return; } - List shapeNodes = new ArrayList<>(); + List ignoredNodes = new ArrayList<>(); for (Node child = paragraph.getFirstChild(); child != null; child = child.getNextSibling()) { int nodeType = child.getNodeType(); - if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE) { - shapeNodes.add(child); + if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE || nodeType == NodeType.FOOTNOTE) { + ignoredNodes.add(child); } } - for (Node shapeNode : shapeNodes) { - shapeNode.remove(); + for (Node ignoredNode : ignoredNodes) { + ignoredNode.remove(); } } diff --git a/ai_check/src/main/java/org/dromara/aiCheck/service/BiddingContent.java b/ai_check/src/main/java/org/dromara/aiCheck/service/BiddingContent.java index 01f4913..bba919c 100644 --- a/ai_check/src/main/java/org/dromara/aiCheck/service/BiddingContent.java +++ b/ai_check/src/main/java/org/dromara/aiCheck/service/BiddingContent.java @@ -284,13 +284,30 @@ public class BiddingContent { return false; } + private boolean isInsideFootnote(Paragraph paragraph) { + if (paragraph == null) { + return false; + } + Node current = paragraph; + while (current != null) { + if (current.getNodeType() == NodeType.FOOTNOTE) { + return true; + } + current = current.getParentNode(); + } + return false; + } + private String extractParagraphPlainText(Paragraph paragraph) { if (paragraph == null) { return ""; } + if (isInsideFootnote(paragraph)) { + return ""; + } try { Paragraph sanitizedParagraph = (Paragraph) paragraph.deepClone(true); - removeShapeNodes(sanitizedParagraph); + removeIgnoredTextNodes(sanitizedParagraph); return sanitizedParagraph.toString(SaveFormat.TEXT).trim(); } catch (Exception ignored) { try { @@ -301,19 +318,19 @@ public class BiddingContent { } } - private void removeShapeNodes(Paragraph paragraph) { + private void removeIgnoredTextNodes(Paragraph paragraph) { if (paragraph == null) { return; } - List shapeNodes = new ArrayList<>(); + List ignoredNodes = new ArrayList<>(); for (Node child = paragraph.getFirstChild(); child != null; child = child.getNextSibling()) { int nodeType = child.getNodeType(); - if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE) { - shapeNodes.add(child); + if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE || nodeType == NodeType.FOOTNOTE) { + ignoredNodes.add(child); } } - for (Node shapeNode : shapeNodes) { - shapeNode.remove(); + for (Node ignoredNode : ignoredNodes) { + ignoredNode.remove(); } } diff --git a/ai_check/src/main/java/org/dromara/aiCheck/service/SmartFilterProcessor.java b/ai_check/src/main/java/org/dromara/aiCheck/service/SmartFilterProcessor.java index 25b3b11..889481d 100644 --- a/ai_check/src/main/java/org/dromara/aiCheck/service/SmartFilterProcessor.java +++ b/ai_check/src/main/java/org/dromara/aiCheck/service/SmartFilterProcessor.java @@ -6,7 +6,6 @@ import org.dromara.aiCheck.Utils.MapTypeConverter; import org.dromara.chat.service.impl.LLMService; import org.dromara.common.core.utils.StringUtils; import org.dromara.common.core.utils.file.AsposeTempFileUtils; -import org.dromara.common.core.utils.file.FileParseUtil; import org.dromara.review.domain.bo.SmartFilterConfig; import org.dromara.review.service.IAiPromptService; import org.dromara.system.domain.vo.SysOssVo; @@ -60,21 +59,21 @@ public class SmartFilterProcessor { return filteredParagraphsWithOssId; } - // 预加载并解析招标文件内容,只执行一次 - String biddingFileContent = null; + // 预加载并解析招标文件段落,只执行一次,段落粒度需与投标文件 Aspose 解析保持一致。 + Set biddingFileParagraphs = Collections.emptySet(); Set tempLocalFiles = new HashSet<>(); // 缓存技术标准所在段落ID,key为ossId,value为doc_g段落ID集合 Map> fileTechnicalStandardParagraphIdCache = new HashMap<>(); try { - // 检查是否需要忽略与投标文件相同的内容 + // 检查是否需要忽略与招标文件相同的内容 if (Boolean.TRUE.equals(config.getIgnoreBiddingFileContent()) && (config.getBiddingDocuId()!=null && config.getBiddingDocuId()>0)) { - // 获取招标文件的内容,只下载一次 + // 获取招标文件的 Aspose 段落,只下载一次 String biddingFileLocalPath = downloadToUniqueLocalPath(config.getBiddingDocuId(), "smartfilter_bid_"); tempLocalFiles.add(biddingFileLocalPath); File file = new File(biddingFileLocalPath); - biddingFileContent = FileParseUtil.parseFile(file); + biddingFileParagraphs = extractComparableParagraphTextSet(file); } if (Boolean.TRUE.equals(config.getIgnoreTechnicalStandards())) { for (Map stringObjectMap : paragraphsWithOssId) { @@ -106,9 +105,10 @@ public class SmartFilterProcessor { } String filteredText = paragraph; - // 忽略与投标文件相同的内容,使用预加载的投标文件内容 - if (StringUtils.isNotBlank(biddingFileContent)) { - filteredText = removeBiddingFileContent(paragraph, biddingFileContent); + // 忽略与招标文件相同的内容,使用与投标文件一致的 Aspose 段落粒度判断。 + if (CollectionUtils.isNotEmpty(biddingFileParagraphs)) { + String originalParagraph = (String) paraWithOssId.getOrDefault("originalText", paragraph); + filteredText = removeBiddingFileContent(paragraph, originalParagraph, biddingFileParagraphs); } // 忽略标点符号和短文本 @@ -178,33 +178,17 @@ public class SmartFilterProcessor { } /** - * 移除与招标文件相同的内容 + * 移除与招标文件相同的内容。 + * + * 文本查重的投标文件段落 ID 来自 Aspose 原始段落序号,text 可能已经被切成句子或长段片段。 + * 因此这里优先用 originalText 与招标文件 Aspose 段落比较,避免简单换行切分导致误过滤或 ID 定位不一致。 */ - private String removeBiddingFileContent(String content, String biddingContent) { - if (StringUtils.isBlank(biddingContent)) { + private String removeBiddingFileContent(String content, String originalContent, Set biddingParagraphs) { + if (CollectionUtils.isEmpty(biddingParagraphs)) { return content; } - - List biddingParagraphs = splitToParagraphs(biddingContent); - - // 遍历内容段落,过滤掉与招标文件重复的段落 - boolean isDuplicate = false; - - // 检查当前段落是否与招标文件中的任何段落重复 - for (String biddingParagraph : biddingParagraphs) { - String cleanBiddingPara = biddingParagraph.trim(); - - if (content.equals(cleanBiddingPara)) { - isDuplicate = true; - break; - } - } - - // 如果不是重复段落,则保留 - if (!isDuplicate) { - return content; - } - return ""; + String compareText = StringUtils.isNotBlank(originalContent) ? originalContent : content; + return biddingParagraphs.contains(normalizeDuplicateParagraphText(compareText)) ? "" : content; } // 使用正则表达式更精确地匹配页码格式 private boolean containsPageNumber(String text) { @@ -291,6 +275,38 @@ public class SmartFilterProcessor { .collect(Collectors.toList()); } + private Set extractComparableParagraphTextSet(File file) throws Exception { + Set paragraphTextSet = new HashSet<>(); + try { + LoadOptions loadOptions = new LoadOptions(); + loadOptions.setTempFolder(ASPOSE_TEMP_DIR.toString()); + Document doc = new Document(file.getPath(), loadOptions); + for (Paragraph paragraph : collectComparableParagraphs(doc)) { + String normalizedText = normalizeDuplicateParagraphText(extractParagraphPlainText(paragraph)); + if (StringUtils.isNotBlank(normalizedText)) { + paragraphTextSet.add(normalizedText); + } + } + return paragraphTextSet; + } finally { + AsposeTempFileUtils.cleanupStaleAsposeTempFiles(SmartFilterProcessor.class, ASPOSE_TEMP_DIR); + } + } + + private String normalizeDuplicateParagraphText(String text) { + if (StringUtils.isBlank(text)) { + return ""; + } + StringBuilder builder = new StringBuilder(text.length()); + for (int i = 0; i < text.length(); i++) { + char current = text.charAt(i); + if (!Character.isWhitespace(current)) { + builder.append(current); + } + } + return builder.toString().trim(); + } + /** * 从文件中提取技术标准标题及其内容的段落ID @@ -388,10 +404,27 @@ public class SmartFilterProcessor { return false; } + private static boolean isInsideFootnote(Paragraph paragraph) { + if (paragraph == null) { + return false; + } + Node current = paragraph; + while (current != null) { + if (current.getNodeType() == NodeType.FOOTNOTE) { + return true; + } + current = current.getParentNode(); + } + return false; + } + private static String extractParagraphPlainText(Paragraph paragraph) { + if (isInsideFootnote(paragraph)) { + return ""; + } try { Paragraph sanitizedParagraph = (Paragraph) paragraph.deepClone(true); - removeShapeNodes(sanitizedParagraph); + removeIgnoredTextNodes(sanitizedParagraph); return sanitizedParagraph.toString(SaveFormat.TEXT).trim(); } catch (Exception ignored) { try { @@ -402,7 +435,7 @@ public class SmartFilterProcessor { } } - private static void removeShapeNodes(Paragraph paragraph) throws Exception { + private static void removeIgnoredTextNodes(Paragraph paragraph) throws Exception { NodeCollection shapes = paragraph.getChildNodes(NodeType.SHAPE, true); for (int i = shapes.getCount() - 1; i >= 0; i--) { shapes.get(i).remove(); @@ -411,6 +444,10 @@ public class SmartFilterProcessor { for (int i = groupShapes.getCount() - 1; i >= 0; i--) { groupShapes.get(i).remove(); } + NodeCollection footnotes = paragraph.getChildNodes(NodeType.FOOTNOTE, true); + for (int i = footnotes.getCount() - 1; i >= 0; i--) { + footnotes.get(i).remove(); + } } /**