文档脚注问题和忽略招标文件问题

This commit is contained in:
cjh
2026-06-10 14:56:59 +08:00
parent 24cbff1625
commit 51184df30d
3 changed files with 121 additions and 50 deletions

View File

@@ -18,8 +18,8 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.CodeSource;
import java.util.*;
import java.util.List;
import java.util.*;
import java.util.function.Consumer;
@@ -1338,13 +1338,30 @@ public class DocExamine implements AutoCloseable {
return false;
}
/**
 * Reports whether the given paragraph is located inside a footnote.
 *
 * <p>Starting at the paragraph itself, the parent chain is followed via
 * {@code getParentNode()} until it ends, looking for a node whose type is
 * {@code NodeType.FOOTNOTE}.
 *
 * @param paragraph paragraph to inspect; may be {@code null}
 * @return {@code true} if the paragraph or one of its ancestors is a footnote node;
 *         {@code false} otherwise, including when {@code paragraph} is {@code null}
 */
private boolean isInsideFootnote(Paragraph paragraph) {
    // A null paragraph fails the loop condition immediately and falls through to false.
    for (Node ancestor = paragraph; ancestor != null; ancestor = ancestor.getParentNode()) {
        if (ancestor.getNodeType() == NodeType.FOOTNOTE) {
            return true;
        }
    }
    return false;
}
private String extractParagraphPlainText(Paragraph paragraph) {
if (paragraph == null) {
return "";
}
if (isInsideFootnote(paragraph)) {
return "";
}
try {
Paragraph sanitizedParagraph = (Paragraph) paragraph.deepClone(true);
removeShapeNodes(sanitizedParagraph);
removeIgnoredTextNodes(sanitizedParagraph);
return sanitizedParagraph.toString(SaveFormat.TEXT).trim();
} catch (Exception ignored) {
try {
@@ -1355,19 +1372,19 @@ public class DocExamine implements AutoCloseable {
}
}
private void removeShapeNodes(Paragraph paragraph) {
private void removeIgnoredTextNodes(Paragraph paragraph) {
if (paragraph == null) {
return;
}
List<Node> shapeNodes = new ArrayList<>();
List<Node> ignoredNodes = new ArrayList<>();
for (Node child = paragraph.getFirstChild(); child != null; child = child.getNextSibling()) {
int nodeType = child.getNodeType();
if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE) {
shapeNodes.add(child);
if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE || nodeType == NodeType.FOOTNOTE) {
ignoredNodes.add(child);
}
}
for (Node shapeNode : shapeNodes) {
shapeNode.remove();
for (Node ignoredNode : ignoredNodes) {
ignoredNode.remove();
}
}

View File

@@ -284,13 +284,30 @@ public class BiddingContent {
return false;
}
/**
 * Checks whether a paragraph belongs to a footnote.
 *
 * <p>The search begins at the paragraph and climbs through {@code getParentNode()};
 * it stops at the first node of type {@code NodeType.FOOTNOTE} or when there is no
 * further parent.
 *
 * @param paragraph paragraph to test; {@code null} is allowed
 * @return {@code true} when a footnote node was found on the way up,
 *         {@code false} when the chain ended without one or the paragraph is {@code null}
 */
private boolean isInsideFootnote(Paragraph paragraph) {
    Node cursor = paragraph;
    // Climb until we either run out of parents or land on a footnote node.
    while (cursor != null && cursor.getNodeType() != NodeType.FOOTNOTE) {
        cursor = cursor.getParentNode();
    }
    // A non-null cursor means the climb stopped on a footnote node.
    return cursor != null;
}
private String extractParagraphPlainText(Paragraph paragraph) {
if (paragraph == null) {
return "";
}
if (isInsideFootnote(paragraph)) {
return "";
}
try {
Paragraph sanitizedParagraph = (Paragraph) paragraph.deepClone(true);
removeShapeNodes(sanitizedParagraph);
removeIgnoredTextNodes(sanitizedParagraph);
return sanitizedParagraph.toString(SaveFormat.TEXT).trim();
} catch (Exception ignored) {
try {
@@ -301,19 +318,19 @@ public class BiddingContent {
}
}
private void removeShapeNodes(Paragraph paragraph) {
private void removeIgnoredTextNodes(Paragraph paragraph) {
if (paragraph == null) {
return;
}
List<Node> shapeNodes = new ArrayList<>();
List<Node> ignoredNodes = new ArrayList<>();
for (Node child = paragraph.getFirstChild(); child != null; child = child.getNextSibling()) {
int nodeType = child.getNodeType();
if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE) {
shapeNodes.add(child);
if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE || nodeType == NodeType.FOOTNOTE) {
ignoredNodes.add(child);
}
}
for (Node shapeNode : shapeNodes) {
shapeNode.remove();
for (Node ignoredNode : ignoredNodes) {
ignoredNode.remove();
}
}

View File

@@ -6,7 +6,6 @@ import org.dromara.aiCheck.Utils.MapTypeConverter;
import org.dromara.chat.service.impl.LLMService;
import org.dromara.common.core.utils.StringUtils;
import org.dromara.common.core.utils.file.AsposeTempFileUtils;
import org.dromara.common.core.utils.file.FileParseUtil;
import org.dromara.review.domain.bo.SmartFilterConfig;
import org.dromara.review.service.IAiPromptService;
import org.dromara.system.domain.vo.SysOssVo;
@@ -60,21 +59,21 @@ public class SmartFilterProcessor {
return filteredParagraphsWithOssId;
}
// 预加载并解析招标文件内容,只执行一次
String biddingFileContent = null;
// 预加载并解析招标文件段落,只执行一次,段落粒度需与投标文件 Aspose 解析保持一致。
Set<String> biddingFileParagraphs = Collections.emptySet();
Set<String> tempLocalFiles = new HashSet<>();
// 缓存技术标准所在段落ID:key为ossId,value为doc_g段落ID集合
Map<Long, Set<String>> fileTechnicalStandardParagraphIdCache = new HashMap<>();
try {
// 检查是否需要忽略与标文件相同的内容
// 检查是否需要忽略与标文件相同的内容
if (Boolean.TRUE.equals(config.getIgnoreBiddingFileContent()) && (config.getBiddingDocuId()!=null && config.getBiddingDocuId()>0)) {
// 获取招标文件的内容,只下载一次
// 获取招标文件的 Aspose 段落,只下载一次
String biddingFileLocalPath = downloadToUniqueLocalPath(config.getBiddingDocuId(), "smartfilter_bid_");
tempLocalFiles.add(biddingFileLocalPath);
File file = new File(biddingFileLocalPath);
biddingFileContent = FileParseUtil.parseFile(file);
biddingFileParagraphs = extractComparableParagraphTextSet(file);
}
if (Boolean.TRUE.equals(config.getIgnoreTechnicalStandards())) {
for (Map<String, Object> stringObjectMap : paragraphsWithOssId) {
@@ -106,9 +105,10 @@ public class SmartFilterProcessor {
}
String filteredText = paragraph;
// 忽略与标文件相同的内容,使用预加载的投标文件内容
if (StringUtils.isNotBlank(biddingFileContent)) {
filteredText = removeBiddingFileContent(paragraph, biddingFileContent);
// 忽略与标文件相同的内容,使用与投标文件一致的 Aspose 段落粒度判断。
if (CollectionUtils.isNotEmpty(biddingFileParagraphs)) {
String originalParagraph = (String) paraWithOssId.getOrDefault("originalText", paragraph);
filteredText = removeBiddingFileContent(paragraph, originalParagraph, biddingFileParagraphs);
}
// 忽略标点符号和短文本
@@ -178,33 +178,17 @@ public class SmartFilterProcessor {
}
/**
* 移除与招标文件相同的内容
* 移除与招标文件相同的内容
*
* 文本查重的投标文件段落 ID 来自 Aspose 原始段落序号,text 可能已经被切成句子或长段片段。
* 因此这里优先用 originalText 与招标文件 Aspose 段落比较,避免简单换行切分导致误过滤或 ID 定位不一致。
*/
private String removeBiddingFileContent(String content, String biddingContent) {
if (StringUtils.isBlank(biddingContent)) {
private String removeBiddingFileContent(String content, String originalContent, Set<String> biddingParagraphs) {
if (CollectionUtils.isEmpty(biddingParagraphs)) {
return content;
}
List<String> biddingParagraphs = splitToParagraphs(biddingContent);
// 遍历内容段落,过滤掉与招标文件重复的段落
boolean isDuplicate = false;
// 检查当前段落是否与招标文件中的任何段落重复
for (String biddingParagraph : biddingParagraphs) {
String cleanBiddingPara = biddingParagraph.trim();
if (content.equals(cleanBiddingPara)) {
isDuplicate = true;
break;
}
}
// 如果不是重复段落,则保留
if (!isDuplicate) {
return content;
}
return "";
String compareText = StringUtils.isNotBlank(originalContent) ? originalContent : content;
return biddingParagraphs.contains(normalizeDuplicateParagraphText(compareText)) ? "" : content;
}
// 使用正则表达式更精确地匹配页码格式
private boolean containsPageNumber(String text) {
@@ -291,6 +275,38 @@ public class SmartFilterProcessor {
.collect(Collectors.toList());
}
/**
 * Collects the normalized, non-blank paragraph texts of a document into a set.
 *
 * <p>The file is opened as an Aspose {@code Document} with its temp folder set to
 * {@code ASPOSE_TEMP_DIR}. Every paragraph returned by
 * {@code collectComparableParagraphs} is converted to plain text with
 * {@code extractParagraphPlainText} and then normalized with
 * {@code normalizeDuplicateParagraphText}; blank results are dropped.
 *
 * @param file document file to read
 * @return set of normalized paragraph texts; empty when no paragraph yields text
 * @throws Exception if opening or processing the document fails
 */
private Set<String> extractComparableParagraphTextSet(File file) throws Exception {
    try {
        LoadOptions options = new LoadOptions();
        options.setTempFolder(ASPOSE_TEMP_DIR.toString());
        Document document = new Document(file.getPath(), options);

        Set<String> comparableTexts = new HashSet<>();
        for (Paragraph candidate : collectComparableParagraphs(document)) {
            String plainText = extractParagraphPlainText(candidate);
            String comparable = normalizeDuplicateParagraphText(plainText);
            if (StringUtils.isBlank(comparable)) {
                // Nothing left to compare for this paragraph.
                continue;
            }
            comparableTexts.add(comparable);
        }
        return comparableTexts;
    } finally {
        // Runs on success and on failure alike.
        AsposeTempFileUtils.cleanupStaleAsposeTempFiles(SmartFilterProcessor.class, ASPOSE_TEMP_DIR);
    }
}
/**
 * Normalizes paragraph text for duplicate comparison by removing every space-like
 * character, so that two paragraphs differing only in spacing compare equal.
 *
 * <p>A character is removed when either {@link Character#isWhitespace(char)} or
 * {@link Character#isSpaceChar(char)} accepts it. The second check is needed because
 * {@code isWhitespace} deliberately excludes the non-breaking spaces U+00A0, U+2007
 * and U+202F; without it, text containing those characters would not match the same
 * text written with ordinary spaces.
 *
 * @param text raw paragraph text; may be {@code null}
 * @return the text with all space-like characters removed and leading/trailing
 *         characters at or below U+0020 trimmed; {@code ""} for {@code null}, empty
 *         or space-only input
 */
private String normalizeDuplicateParagraphText(String text) {
    // Space-only input needs no special case: the loop below reduces it to "".
    if (text == null || text.isEmpty()) {
        return "";
    }
    StringBuilder compact = new StringBuilder(text.length());
    for (int i = 0; i < text.length(); i++) {
        char current = text.charAt(i);
        // isSpaceChar covers the non-breaking spaces that isWhitespace skips.
        if (Character.isWhitespace(current) || Character.isSpaceChar(current)) {
            continue;
        }
        compact.append(current);
    }
    // trim() still drops leading/trailing control characters that are not whitespace.
    return compact.toString().trim();
}
/**
* 从文件中提取技术标准标题及其内容的段落ID
@@ -388,10 +404,27 @@ public class SmartFilterProcessor {
return false;
}
/**
 * Determines whether a paragraph is nested inside a footnote.
 *
 * <p>The paragraph and then each successive {@code getParentNode()} result are checked
 * for node type {@code NodeType.FOOTNOTE} until the parent chain is exhausted.
 *
 * @param paragraph paragraph to check; may be {@code null}
 * @return {@code true} if a footnote node is found in the chain; {@code false} if none
 *         is found or {@code paragraph} is {@code null}
 */
private static boolean isInsideFootnote(Paragraph paragraph) {
    // The loop body never runs for a null paragraph, so the result is false.
    for (Node node = paragraph; node != null; node = node.getParentNode()) {
        boolean isFootnote = node.getNodeType() == NodeType.FOOTNOTE;
        if (isFootnote) {
            return true;
        }
    }
    return false;
}
private static String extractParagraphPlainText(Paragraph paragraph) {
if (isInsideFootnote(paragraph)) {
return "";
}
try {
Paragraph sanitizedParagraph = (Paragraph) paragraph.deepClone(true);
removeShapeNodes(sanitizedParagraph);
removeIgnoredTextNodes(sanitizedParagraph);
return sanitizedParagraph.toString(SaveFormat.TEXT).trim();
} catch (Exception ignored) {
try {
@@ -402,7 +435,7 @@ public class SmartFilterProcessor {
}
}
private static void removeShapeNodes(Paragraph paragraph) throws Exception {
private static void removeIgnoredTextNodes(Paragraph paragraph) throws Exception {
NodeCollection shapes = paragraph.getChildNodes(NodeType.SHAPE, true);
for (int i = shapes.getCount() - 1; i >= 0; i--) {
shapes.get(i).remove();
@@ -411,6 +444,10 @@ public class SmartFilterProcessor {
for (int i = groupShapes.getCount() - 1; i >= 0; i--) {
groupShapes.get(i).remove();
}
NodeCollection footnotes = paragraph.getChildNodes(NodeType.FOOTNOTE, true);
for (int i = footnotes.getCount() - 1; i >= 0; i--) {
footnotes.get(i).remove();
}
}
/**