文档脚注问题和忽略招标文件问题
This commit is contained in:
@@ -18,8 +18,8 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.security.CodeSource;
|
||||
import java.util.*;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
|
||||
@@ -1338,13 +1338,30 @@ public class DocExamine implements AutoCloseable {
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean isInsideFootnote(Paragraph paragraph) {
|
||||
if (paragraph == null) {
|
||||
return false;
|
||||
}
|
||||
Node current = paragraph;
|
||||
while (current != null) {
|
||||
if (current.getNodeType() == NodeType.FOOTNOTE) {
|
||||
return true;
|
||||
}
|
||||
current = current.getParentNode();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private String extractParagraphPlainText(Paragraph paragraph) {
|
||||
if (paragraph == null) {
|
||||
return "";
|
||||
}
|
||||
if (isInsideFootnote(paragraph)) {
|
||||
return "";
|
||||
}
|
||||
try {
|
||||
Paragraph sanitizedParagraph = (Paragraph) paragraph.deepClone(true);
|
||||
removeShapeNodes(sanitizedParagraph);
|
||||
removeIgnoredTextNodes(sanitizedParagraph);
|
||||
return sanitizedParagraph.toString(SaveFormat.TEXT).trim();
|
||||
} catch (Exception ignored) {
|
||||
try {
|
||||
@@ -1355,19 +1372,19 @@ public class DocExamine implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
private void removeShapeNodes(Paragraph paragraph) {
|
||||
private void removeIgnoredTextNodes(Paragraph paragraph) {
|
||||
if (paragraph == null) {
|
||||
return;
|
||||
}
|
||||
List<Node> shapeNodes = new ArrayList<>();
|
||||
List<Node> ignoredNodes = new ArrayList<>();
|
||||
for (Node child = paragraph.getFirstChild(); child != null; child = child.getNextSibling()) {
|
||||
int nodeType = child.getNodeType();
|
||||
if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE) {
|
||||
shapeNodes.add(child);
|
||||
if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE || nodeType == NodeType.FOOTNOTE) {
|
||||
ignoredNodes.add(child);
|
||||
}
|
||||
}
|
||||
for (Node shapeNode : shapeNodes) {
|
||||
shapeNode.remove();
|
||||
for (Node ignoredNode : ignoredNodes) {
|
||||
ignoredNode.remove();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -284,13 +284,30 @@ public class BiddingContent {
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean isInsideFootnote(Paragraph paragraph) {
|
||||
if (paragraph == null) {
|
||||
return false;
|
||||
}
|
||||
Node current = paragraph;
|
||||
while (current != null) {
|
||||
if (current.getNodeType() == NodeType.FOOTNOTE) {
|
||||
return true;
|
||||
}
|
||||
current = current.getParentNode();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private String extractParagraphPlainText(Paragraph paragraph) {
|
||||
if (paragraph == null) {
|
||||
return "";
|
||||
}
|
||||
if (isInsideFootnote(paragraph)) {
|
||||
return "";
|
||||
}
|
||||
try {
|
||||
Paragraph sanitizedParagraph = (Paragraph) paragraph.deepClone(true);
|
||||
removeShapeNodes(sanitizedParagraph);
|
||||
removeIgnoredTextNodes(sanitizedParagraph);
|
||||
return sanitizedParagraph.toString(SaveFormat.TEXT).trim();
|
||||
} catch (Exception ignored) {
|
||||
try {
|
||||
@@ -301,19 +318,19 @@ public class BiddingContent {
|
||||
}
|
||||
}
|
||||
|
||||
private void removeShapeNodes(Paragraph paragraph) {
|
||||
private void removeIgnoredTextNodes(Paragraph paragraph) {
|
||||
if (paragraph == null) {
|
||||
return;
|
||||
}
|
||||
List<Node> shapeNodes = new ArrayList<>();
|
||||
List<Node> ignoredNodes = new ArrayList<>();
|
||||
for (Node child = paragraph.getFirstChild(); child != null; child = child.getNextSibling()) {
|
||||
int nodeType = child.getNodeType();
|
||||
if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE) {
|
||||
shapeNodes.add(child);
|
||||
if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE || nodeType == NodeType.FOOTNOTE) {
|
||||
ignoredNodes.add(child);
|
||||
}
|
||||
}
|
||||
for (Node shapeNode : shapeNodes) {
|
||||
shapeNode.remove();
|
||||
for (Node ignoredNode : ignoredNodes) {
|
||||
ignoredNode.remove();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,6 @@ import org.dromara.aiCheck.Utils.MapTypeConverter;
|
||||
import org.dromara.chat.service.impl.LLMService;
|
||||
import org.dromara.common.core.utils.StringUtils;
|
||||
import org.dromara.common.core.utils.file.AsposeTempFileUtils;
|
||||
import org.dromara.common.core.utils.file.FileParseUtil;
|
||||
import org.dromara.review.domain.bo.SmartFilterConfig;
|
||||
import org.dromara.review.service.IAiPromptService;
|
||||
import org.dromara.system.domain.vo.SysOssVo;
|
||||
@@ -60,21 +59,21 @@ public class SmartFilterProcessor {
|
||||
return filteredParagraphsWithOssId;
|
||||
}
|
||||
|
||||
// 预加载并解析招标文件内容,只执行一次
|
||||
String biddingFileContent = null;
|
||||
// 预加载并解析招标文件段落,只执行一次,段落粒度需与投标文件 Aspose 解析保持一致。
|
||||
Set<String> biddingFileParagraphs = Collections.emptySet();
|
||||
Set<String> tempLocalFiles = new HashSet<>();
|
||||
|
||||
// 缓存技术标准所在段落ID,key为ossId,value为doc_g段落ID集合
|
||||
Map<Long, Set<String>> fileTechnicalStandardParagraphIdCache = new HashMap<>();
|
||||
|
||||
try {
|
||||
// 检查是否需要忽略与投标文件相同的内容
|
||||
// 检查是否需要忽略与招标文件相同的内容
|
||||
if (Boolean.TRUE.equals(config.getIgnoreBiddingFileContent()) && (config.getBiddingDocuId()!=null && config.getBiddingDocuId()>0)) {
|
||||
// 获取招标文件的内容,只下载一次
|
||||
// 获取招标文件的 Aspose 段落,只下载一次
|
||||
String biddingFileLocalPath = downloadToUniqueLocalPath(config.getBiddingDocuId(), "smartfilter_bid_");
|
||||
tempLocalFiles.add(biddingFileLocalPath);
|
||||
File file = new File(biddingFileLocalPath);
|
||||
biddingFileContent = FileParseUtil.parseFile(file);
|
||||
biddingFileParagraphs = extractComparableParagraphTextSet(file);
|
||||
}
|
||||
if (Boolean.TRUE.equals(config.getIgnoreTechnicalStandards())) {
|
||||
for (Map<String, Object> stringObjectMap : paragraphsWithOssId) {
|
||||
@@ -106,9 +105,10 @@ public class SmartFilterProcessor {
|
||||
}
|
||||
|
||||
String filteredText = paragraph;
|
||||
// 忽略与投标文件相同的内容,使用预加载的投标文件内容
|
||||
if (StringUtils.isNotBlank(biddingFileContent)) {
|
||||
filteredText = removeBiddingFileContent(paragraph, biddingFileContent);
|
||||
// 忽略与招标文件相同的内容,使用与投标文件一致的 Aspose 段落粒度判断。
|
||||
if (CollectionUtils.isNotEmpty(biddingFileParagraphs)) {
|
||||
String originalParagraph = (String) paraWithOssId.getOrDefault("originalText", paragraph);
|
||||
filteredText = removeBiddingFileContent(paragraph, originalParagraph, biddingFileParagraphs);
|
||||
}
|
||||
|
||||
// 忽略标点符号和短文本
|
||||
@@ -178,33 +178,17 @@ public class SmartFilterProcessor {
|
||||
}
|
||||
|
||||
/**
|
||||
* 移除与招标文件相同的内容
|
||||
* 移除与招标文件相同的内容。
|
||||
*
|
||||
* 文本查重的投标文件段落 ID 来自 Aspose 原始段落序号,text 可能已经被切成句子或长段片段。
|
||||
* 因此这里优先用 originalText 与招标文件 Aspose 段落比较,避免简单换行切分导致误过滤或 ID 定位不一致。
|
||||
*/
|
||||
private String removeBiddingFileContent(String content, String biddingContent) {
|
||||
if (StringUtils.isBlank(biddingContent)) {
|
||||
private String removeBiddingFileContent(String content, String originalContent, Set<String> biddingParagraphs) {
|
||||
if (CollectionUtils.isEmpty(biddingParagraphs)) {
|
||||
return content;
|
||||
}
|
||||
|
||||
List<String> biddingParagraphs = splitToParagraphs(biddingContent);
|
||||
|
||||
// 遍历内容段落,过滤掉与招标文件重复的段落
|
||||
boolean isDuplicate = false;
|
||||
|
||||
// 检查当前段落是否与招标文件中的任何段落重复
|
||||
for (String biddingParagraph : biddingParagraphs) {
|
||||
String cleanBiddingPara = biddingParagraph.trim();
|
||||
|
||||
if (content.equals(cleanBiddingPara)) {
|
||||
isDuplicate = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// 如果不是重复段落,则保留
|
||||
if (!isDuplicate) {
|
||||
return content;
|
||||
}
|
||||
return "";
|
||||
String compareText = StringUtils.isNotBlank(originalContent) ? originalContent : content;
|
||||
return biddingParagraphs.contains(normalizeDuplicateParagraphText(compareText)) ? "" : content;
|
||||
}
|
||||
// 使用正则表达式更精确地匹配页码格式
|
||||
private boolean containsPageNumber(String text) {
|
||||
@@ -291,6 +275,38 @@ public class SmartFilterProcessor {
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private Set<String> extractComparableParagraphTextSet(File file) throws Exception {
|
||||
Set<String> paragraphTextSet = new HashSet<>();
|
||||
try {
|
||||
LoadOptions loadOptions = new LoadOptions();
|
||||
loadOptions.setTempFolder(ASPOSE_TEMP_DIR.toString());
|
||||
Document doc = new Document(file.getPath(), loadOptions);
|
||||
for (Paragraph paragraph : collectComparableParagraphs(doc)) {
|
||||
String normalizedText = normalizeDuplicateParagraphText(extractParagraphPlainText(paragraph));
|
||||
if (StringUtils.isNotBlank(normalizedText)) {
|
||||
paragraphTextSet.add(normalizedText);
|
||||
}
|
||||
}
|
||||
return paragraphTextSet;
|
||||
} finally {
|
||||
AsposeTempFileUtils.cleanupStaleAsposeTempFiles(SmartFilterProcessor.class, ASPOSE_TEMP_DIR);
|
||||
}
|
||||
}
|
||||
|
||||
private String normalizeDuplicateParagraphText(String text) {
|
||||
if (StringUtils.isBlank(text)) {
|
||||
return "";
|
||||
}
|
||||
StringBuilder builder = new StringBuilder(text.length());
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
char current = text.charAt(i);
|
||||
if (!Character.isWhitespace(current)) {
|
||||
builder.append(current);
|
||||
}
|
||||
}
|
||||
return builder.toString().trim();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 从文件中提取技术标准标题及其内容的段落ID
|
||||
@@ -388,10 +404,27 @@ public class SmartFilterProcessor {
|
||||
return false;
|
||||
}
|
||||
|
||||
private static boolean isInsideFootnote(Paragraph paragraph) {
|
||||
if (paragraph == null) {
|
||||
return false;
|
||||
}
|
||||
Node current = paragraph;
|
||||
while (current != null) {
|
||||
if (current.getNodeType() == NodeType.FOOTNOTE) {
|
||||
return true;
|
||||
}
|
||||
current = current.getParentNode();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private static String extractParagraphPlainText(Paragraph paragraph) {
|
||||
if (isInsideFootnote(paragraph)) {
|
||||
return "";
|
||||
}
|
||||
try {
|
||||
Paragraph sanitizedParagraph = (Paragraph) paragraph.deepClone(true);
|
||||
removeShapeNodes(sanitizedParagraph);
|
||||
removeIgnoredTextNodes(sanitizedParagraph);
|
||||
return sanitizedParagraph.toString(SaveFormat.TEXT).trim();
|
||||
} catch (Exception ignored) {
|
||||
try {
|
||||
@@ -402,7 +435,7 @@ public class SmartFilterProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
private static void removeShapeNodes(Paragraph paragraph) throws Exception {
|
||||
private static void removeIgnoredTextNodes(Paragraph paragraph) throws Exception {
|
||||
NodeCollection shapes = paragraph.getChildNodes(NodeType.SHAPE, true);
|
||||
for (int i = shapes.getCount() - 1; i >= 0; i--) {
|
||||
shapes.get(i).remove();
|
||||
@@ -411,6 +444,10 @@ public class SmartFilterProcessor {
|
||||
for (int i = groupShapes.getCount() - 1; i >= 0; i--) {
|
||||
groupShapes.get(i).remove();
|
||||
}
|
||||
NodeCollection footnotes = paragraph.getChildNodes(NodeType.FOOTNOTE, true);
|
||||
for (int i = footnotes.getCount() - 1; i >= 0; i--) {
|
||||
footnotes.get(i).remove();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user