1.文本相似度加快审查速度2.忽略技术标准修改

This commit is contained in:
cjh
2026-06-02 17:55:00 +08:00
parent c7fa70707f
commit bc8f2acbc3
3 changed files with 271 additions and 159 deletions

View File

@@ -5,15 +5,14 @@ import ai.z.openapi.service.ocr.HandwritingOcrResponse;
import ai.z.openapi.service.ocr.HandwritingOcrUploadReq;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.aspose.words.*;
import com.aspose.words.Document;
import com.aspose.words.Font;
import com.aspose.words.ParagraphAlignment;
import com.aspose.words.*;
import com.spire.doc.fields.ShapeGroup;
import com.spire.doc.fields.ShapeObject;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import okhttp3.*;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.poi.hwpf.HWPFDocument;
@@ -24,9 +23,17 @@ import org.dromara.common.core.utils.StringUtils;
import org.dromara.common.core.utils.file.AsposeTempFileUtils;
import org.dromara.common.core.utils.file.FileParseUtil;
import org.dromara.review.domain.bo.SmartFilterConfig;
import org.opencv.core.Core;
import org.opencv.core.Mat;
import org.opencv.imgcodecs.Imgcodecs;
import org.opencv.imgproc.Imgproc;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;
import org.springframework.stereotype.Service;
import javax.imageio.ImageIO;
import javax.imageio.ImageReadParam;
import javax.imageio.ImageReader;
import javax.imageio.stream.ImageInputStream;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.File;
@@ -36,24 +43,14 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.security.MessageDigest;
import java.util.List;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.opencv.core.Core;
import org.opencv.core.Mat;
import org.opencv.imgcodecs.Imgcodecs;
import org.opencv.imgproc.Imgproc;
import javax.imageio.ImageIO;
import javax.imageio.ImageReadParam;
import javax.imageio.ImageReader;
import javax.imageio.stream.ImageInputStream;
import static cn.dev33.satoken.util.SaHexUtil.bytesToHex;
import java.security.MessageDigest;
@Slf4j
@RequiredArgsConstructor
@@ -70,6 +67,7 @@ public class BiddingContent {
private static final double OCR_SOLID_IMAGE_THRESHOLD = 0.995D;
private static final int OCR_SAMPLE_GRID = 32;
private static final int OCR_CACHE_MAX_SIZE = 2000;
private static final int TEXT_SIMILARITY_MAX_PARAGRAPH_LENGTH = 800;
private static final Map<String, List<String>> OCR_RESULT_CACHE =
Collections.synchronizedMap(new LinkedHashMap<>(256, 0.75F, true) {
@Override
@@ -135,6 +133,7 @@ public class BiddingContent {
} else {
processedParagraphs = splitTextToSentences(originalParagraph);
}
processedParagraphs = splitOversizedParagraphs(processedParagraphs);
for (int subIndex = 0; subIndex < processedParagraphs.size(); subIndex++) {
String processedText = processedParagraphs.get(subIndex);
@@ -543,6 +542,18 @@ public class BiddingContent {
return sentences;
}
/**
 * Rebuilds the paragraph list so that no non-null entry is longer than
 * {@code TEXT_SIMILARITY_MAX_PARAGRAPH_LENGTH} characters.
 *
 * <p>Entries that are {@code null} or already within the limit are copied
 * through unchanged (nulls included). Longer entries are replaced, in place
 * order, by whatever {@code splitLongParagraph} returns for them.
 *
 * @param paragraphs paragraphs to normalize; iterated in order
 * @return a new list; the input list is not modified
 */
private List<String> splitOversizedParagraphs(List<String> paragraphs) {
    List<String> normalized = new ArrayList<>();
    for (String text : paragraphs) {
        boolean oversized = text != null && text.length() > TEXT_SIMILARITY_MAX_PARAGRAPH_LENGTH;
        if (oversized) {
            // Only over-limit text is handed to the long-paragraph splitter.
            normalized.addAll(splitLongParagraph(text, TEXT_SIMILARITY_MAX_PARAGRAPH_LENGTH));
            continue;
        }
        normalized.add(text);
    }
    return normalized;
}
/**
* 判断是否应该使用长段落分割策略
*
@@ -563,7 +574,7 @@ public class BiddingContent {
// 解析配置并检查是否启用忽略关键信息类型
try {
SmartFilterConfig config = com.alibaba.fastjson.JSONObject.parseObject(smartFilterConfig, SmartFilterConfig.class);
SmartFilterConfig config = JSONObject.parseObject(smartFilterConfig, SmartFilterConfig.class);
return config != null && config.getIgnoreKeyInfoTypesEnable();
} catch (Exception e) {
log.warn("解析智能过滤配置失败,使用默认段落分割策略: {}", e.getMessage());

View File

@@ -2,24 +2,11 @@ package org.dromara.aiCheck.service;
import com.aspose.words.*;
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.dromara.aiCheck.Utils.MapTypeConverter;
import org.dromara.aiCheck.config.AiCheckConcurrencySupport;
import org.dromara.chat.domain.vo.LLMRequest;
import org.dromara.chat.domain.vo.LLMResponse;
import org.dromara.chat.domain.vo.Message;
import org.dromara.chat.service.impl.LLMService;
import org.dromara.common.core.domain.R;
import org.dromara.common.core.utils.SpringUtils;
import org.dromara.common.core.utils.StringUtils;
import org.dromara.common.core.utils.file.AsposeTempFileUtils;
import org.dromara.common.core.utils.file.FileParseUtil;
import org.dromara.common.core.utils.file.FileUtils;
import org.dromara.review.domain.bo.AiPromptBo;
import org.dromara.review.domain.bo.SmartFilterConfig;
import org.dromara.review.service.IAiPromptService;
import org.dromara.system.domain.vo.SysOssVo;
@@ -28,17 +15,14 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import reactor.core.publisher.Mono;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.List;
import java.util.concurrent.*;
import java.util.regex.Pattern;
import java.util.*;
import java.util.stream.Collectors;
/**
@@ -80,8 +64,8 @@ public class SmartFilterProcessor {
String biddingFileContent = null;
Set<String> tempLocalFiles = new HashSet<>();
// 缓存已读取的文件内容，key为ossId，value为标题内容映射
Map<Long,List<String>> fileTitleContentCache = new HashMap<>();
// 缓存技术标准所在段落ID，key为ossId，value为doc_g段落ID集合
Map<Long, Set<String>> fileTechnicalStandardParagraphIdCache = new HashMap<>();
try {
// 检查是否需要忽略与投标文件相同的内容
@@ -92,18 +76,20 @@ public class SmartFilterProcessor {
File file = new File(biddingFileLocalPath);
biddingFileContent = FileParseUtil.parseFile(file);
}
for (Map<String, Object> stringObjectMap : paragraphsWithOssId) {
Long ossId = (Long) stringObjectMap.get("ossId");
List<String> titleContentList = fileTitleContentCache.get(ossId);
if (titleContentList == null) {
// 下载文件到本地
String localFilePath = downloadToUniqueLocalPath(ossId, "smartfilter_src_");
tempLocalFiles.add(localFilePath);
File file = new File(localFilePath);
// 使用Aspose.Words API解析文件并提取标题内容
titleContentList = extractTitleWithContentFromFile(file);
// 缓存结果
fileTitleContentCache.put(ossId, titleContentList);
if (Boolean.TRUE.equals(config.getIgnoreTechnicalStandards())) {
for (Map<String, Object> stringObjectMap : paragraphsWithOssId) {
Long ossId = MapTypeConverter.getMapLongValue(stringObjectMap, "ossId");
Set<String> technicalStandardParagraphIds = fileTechnicalStandardParagraphIdCache.get(ossId);
if (technicalStandardParagraphIds == null) {
// 下载文件到本地
String localFilePath = downloadToUniqueLocalPath(ossId, "smartfilter_src_");
tempLocalFiles.add(localFilePath);
File file = new File(localFilePath);
// 使用Aspose.Words API解析文件并提取技术标准所在段落ID
technicalStandardParagraphIds = extractTechnicalStandardParagraphIdsFromFile(file);
// 缓存结果
fileTechnicalStandardParagraphIdCache.put(ossId, technicalStandardParagraphIds);
}
}
}
@@ -137,8 +123,10 @@ public class SmartFilterProcessor {
// 忽略技术标准
if (Boolean.TRUE.equals(config.getIgnoreTechnicalStandards())) {
List<String> technicalStandards = fileTitleContentCache.get(ossId);
filteredText = removeTechnicalStandards(filteredText,technicalStandards);
Set<String> technicalStandardParagraphIds = fileTechnicalStandardParagraphIdCache.get(ossId);
if (isTechnicalStandardParagraph(paragraphId, technicalStandardParagraphIds)) {
filteredText = "";
}
}
//6. 忽略重点信息相似词 --只限于重点信息
@@ -272,22 +260,12 @@ public class SmartFilterProcessor {
}
/**
* 移除技术标准内容(包括编制说明及其子目录)
* 判断当前段落是否属于技术标准内容(包括编制说明及其子目录)
*/
private String removeTechnicalStandards(String content,List<String> technicalStandards) {
boolean flag = false;
for (String para : technicalStandards) {
String para2 = para.trim();
if (content.equals(para2)) {
flag = true;
break;
}
}
if(!flag) {
return content;
}
return "";
/**
 * Tells whether a paragraph ID is a member of the given ID set.
 *
 * @param paragraphId                   ID of the paragraph being checked
 * @param technicalStandardParagraphIds IDs to test membership against
 * @return {@code true} only when the ID is not blank, the set is not empty,
 *         and the set contains the ID; {@code false} otherwise
 */
private boolean isTechnicalStandardParagraph(String paragraphId, Set<String> technicalStandardParagraphIds) {
    // A blank ID can never match.
    if (!StringUtils.isNotBlank(paragraphId)) {
        return false;
    }
    // Nothing to match against.
    if (!CollectionUtils.isNotEmpty(technicalStandardParagraphIds)) {
        return false;
    }
    return technicalStandardParagraphIds.contains(paragraphId);
}
@@ -315,22 +293,18 @@ public class SmartFilterProcessor {
/**
* 从文件中提取所有标题及其内容
* 从文件中提取技术标准标题及其内容的段落ID
*/
private List<String> extractTitleWithContentFromFile(File file) throws Exception {
List<String> result = new ArrayList<>();
result.add("编制说明");
result.add("编制依据");
result.add("编制原则");
result.add("编制范围");
private Set<String> extractTechnicalStandardParagraphIdsFromFile(File file) throws Exception {
Set<String> result = new HashSet<>();
try {
LoadOptions loadOptions = new LoadOptions();
loadOptions.setTempFolder(ASPOSE_TEMP_DIR.toString());
Document doc = new Document(file.getPath(), loadOptions);
result.addAll(extractSpecificTitleContent(doc, "编制说明"));
result.addAll(extractSpecificTitleContent(doc, "编制依据"));
result.addAll(extractSpecificTitleContent(doc, "编制原则"));
result.addAll(extractSpecificTitleContent(doc, "编制范围"));
result.addAll(extractSpecificTitleParagraphIds(doc, "编制说明"));
result.addAll(extractSpecificTitleParagraphIds(doc, "编制依据"));
result.addAll(extractSpecificTitleParagraphIds(doc, "编制原则"));
result.addAll(extractSpecificTitleParagraphIds(doc, "编制范围"));
return result;
} finally {
AsposeTempFileUtils.cleanupStaleAsposeTempFiles(SmartFilterProcessor.class, ASPOSE_TEMP_DIR);
@@ -342,18 +316,18 @@ public class SmartFilterProcessor {
*
* @param doc Aspose.Words文档对象
* @param targetTitle 要提取的标题文本
* @return 该标题下的内容段落列表
* @return 该标题下的内容段落ID集合
*/
public static List<String> extractSpecificTitleContent(Document doc, String targetTitle) {
List<String> contentList = new ArrayList<>();
public static Set<String> extractSpecificTitleParagraphIds(Document doc, String targetTitle) throws Exception {
Set<String> paragraphIds = new HashSet<>();
boolean isTargetTitleFound = false;
boolean isNextTitleFound = false;
// 获取文档中的所有段落
NodeCollection<com.aspose.words.Paragraph> paragraphs = doc.getChildNodes(NodeType.PARAGRAPH, true);
List<Paragraph> paragraphs = collectComparableParagraphs(doc);
for (com.aspose.words.Paragraph paragraph : paragraphs) {
String paragraphText = paragraph.getText().trim();
for (int i = 0; i < paragraphs.size(); i++) {
Paragraph paragraph = paragraphs.get(i);
String paragraphId = "doc_g" + (i + 1);
String paragraphText = extractParagraphPlainText(paragraph);
if (paragraphText.isEmpty()) {
continue; // 跳过空段落
}
@@ -362,17 +336,22 @@ public class SmartFilterProcessor {
String styleName = paragraph.getParagraphFormat().getStyle().getName();
// 检查是否是标题样式(以"标题"或"Heading"开头)
boolean isHeading = styleName.startsWith("Heading 2") || styleName.startsWith("Heading 1");
boolean isHeading = styleName.startsWith("Heading 2")
|| styleName.startsWith("Heading 1")
|| styleName.startsWith("标题 2")
|| styleName.startsWith("标题 1");
if (paragraphText.equals(targetTitle)) {
paragraphIds.add(paragraphId);
}
if (isTargetTitleFound) {
// 如果已经找到了目标标题,现在检查是否遇到了下一个标题
if (isHeading) {
// 遇到了下一个标题,结束当前标题的内容提取
isNextTitleFound = true;
break;
} else {
// 添加内容到列表
contentList.add(paragraphText);
paragraphIds.add(paragraphId);
}
} else {
// 寻找目标标题
@@ -381,7 +360,57 @@ public class SmartFilterProcessor {
}
}
}
return contentList;
return paragraphIds;
}
/**
 * Gathers, section by section, every paragraph found under a section body
 * (searched recursively) that is not nested inside a shape or group shape.
 *
 * <p>The resulting order is the traversal order of the sections and of the
 * paragraphs within each body. The caller visible in this change derives
 * paragraph IDs from the list position, so the order is significant.
 *
 * @param doc Aspose.Words document to scan
 * @return a new list of the retained paragraphs
 */
private static List<Paragraph> collectComparableParagraphs(Document doc) {
    List<Paragraph> collected = new ArrayList<>();
    for (Section section : doc.getSections()) {
        for (Object node : section.getBody().getChildNodes(NodeType.PARAGRAPH, true)) {
            Paragraph candidate = (Paragraph) node;
            if (isInsideShape(candidate)) {
                // Paragraphs that live inside shapes are left out.
                continue;
            }
            collected.add(candidate);
        }
    }
    return collected;
}
/**
 * Walks from the paragraph up through its parent chain and reports whether
 * any node on that path is a shape or a group shape.
 *
 * @param paragraph paragraph to inspect; {@code null} yields {@code false}
 * @return {@code true} if a SHAPE or GROUP_SHAPE node is found on the path
 */
private static boolean isInsideShape(Paragraph paragraph) {
    for (Node cursor = paragraph; cursor != null; cursor = cursor.getParentNode()) {
        int kind = cursor.getNodeType();
        if (kind == NodeType.SHAPE || kind == NodeType.GROUP_SHAPE) {
            return true;
        }
    }
    return false;
}
/**
 * Produces the trimmed plain-text rendering of a paragraph.
 *
 * <p>First attempt: deep-clone the paragraph, strip shape nodes from the
 * clone, and export the clone as text, so the original node is not modified.
 * If any step of that attempt throws, the original paragraph is exported
 * directly. If that also throws, an empty string is returned.
 *
 * @param paragraph paragraph to convert
 * @return trimmed text, or {@code ""} when both attempts fail
 */
private static String extractParagraphPlainText(Paragraph paragraph) {
    try {
        Paragraph shapeFreeCopy = (Paragraph) paragraph.deepClone(true);
        removeShapeNodes(shapeFreeCopy);
        return shapeFreeCopy.toString(SaveFormat.TEXT).trim();
    } catch (Exception ignored) {
        // Best effort: fall through and export the untouched paragraph instead.
    }
    try {
        return paragraph.toString(SaveFormat.TEXT).trim();
    } catch (Exception ex) {
        return "";
    }
}
/**
 * Detaches every shape node, and then every group-shape node, found anywhere
 * beneath the given paragraph.
 *
 * <p>Each collection is walked from its last index to its first, so removing
 * a node does not shift the indices still to be visited.
 *
 * @param paragraph paragraph to strip; it is modified in place
 * @throws Exception declared by the signature; propagates anything thrown
 *                   while querying or removing nodes
 */
private static void removeShapeNodes(Paragraph paragraph) throws Exception {
    // Same order as before: plain shapes first, group shapes second.
    int[] drawingNodeTypes = {NodeType.SHAPE, NodeType.GROUP_SHAPE};
    for (int drawingNodeType : drawingNodeTypes) {
        NodeCollection matches = paragraph.getChildNodes(drawingNodeType, true);
        int index = matches.getCount();
        while (--index >= 0) {
            matches.get(index).remove();
        }
    }
}
/**