1.文本相似度加快审查速度2.忽略技术标准修改
This commit is contained in:
@@ -5,15 +5,14 @@ import ai.z.openapi.service.ocr.HandwritingOcrResponse;
|
||||
import ai.z.openapi.service.ocr.HandwritingOcrUploadReq;
|
||||
import com.alibaba.fastjson.JSONArray;
|
||||
import com.alibaba.fastjson.JSONObject;
|
||||
import com.aspose.words.*;
|
||||
import com.aspose.words.Document;
|
||||
import com.aspose.words.Font;
|
||||
import com.aspose.words.ParagraphAlignment;
|
||||
import com.aspose.words.*;
|
||||
import com.spire.doc.fields.ShapeGroup;
|
||||
import com.spire.doc.fields.ShapeObject;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import okhttp3.*;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
@@ -24,9 +23,17 @@ import org.dromara.common.core.utils.StringUtils;
|
||||
import org.dromara.common.core.utils.file.AsposeTempFileUtils;
|
||||
import org.dromara.common.core.utils.file.FileParseUtil;
|
||||
import org.dromara.review.domain.bo.SmartFilterConfig;
|
||||
import org.opencv.core.Core;
|
||||
import org.opencv.core.Mat;
|
||||
import org.opencv.imgcodecs.Imgcodecs;
|
||||
import org.opencv.imgproc.Imgproc;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import javax.imageio.ImageReadParam;
|
||||
import javax.imageio.ImageReader;
|
||||
import javax.imageio.stream.ImageInputStream;
|
||||
import java.awt.*;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.File;
|
||||
@@ -36,24 +43,14 @@ import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.*;
|
||||
import java.security.MessageDigest;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.opencv.core.Core;
|
||||
import org.opencv.core.Mat;
|
||||
import org.opencv.imgcodecs.Imgcodecs;
|
||||
import org.opencv.imgproc.Imgproc;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import javax.imageio.ImageReadParam;
|
||||
import javax.imageio.ImageReader;
|
||||
import javax.imageio.stream.ImageInputStream;
|
||||
|
||||
import static cn.dev33.satoken.util.SaHexUtil.bytesToHex;
|
||||
import java.security.MessageDigest;
|
||||
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@@ -70,6 +67,7 @@ public class BiddingContent {
|
||||
private static final double OCR_SOLID_IMAGE_THRESHOLD = 0.995D;
|
||||
private static final int OCR_SAMPLE_GRID = 32;
|
||||
private static final int OCR_CACHE_MAX_SIZE = 2000;
|
||||
private static final int TEXT_SIMILARITY_MAX_PARAGRAPH_LENGTH = 800;
|
||||
private static final Map<String, List<String>> OCR_RESULT_CACHE =
|
||||
Collections.synchronizedMap(new LinkedHashMap<>(256, 0.75F, true) {
|
||||
@Override
|
||||
@@ -135,6 +133,7 @@ public class BiddingContent {
|
||||
} else {
|
||||
processedParagraphs = splitTextToSentences(originalParagraph);
|
||||
}
|
||||
processedParagraphs = splitOversizedParagraphs(processedParagraphs);
|
||||
|
||||
for (int subIndex = 0; subIndex < processedParagraphs.size(); subIndex++) {
|
||||
String processedText = processedParagraphs.get(subIndex);
|
||||
@@ -543,6 +542,18 @@ public class BiddingContent {
|
||||
return sentences;
|
||||
}
|
||||
|
||||
private List<String> splitOversizedParagraphs(List<String> paragraphs) {
|
||||
List<String> result = new ArrayList<>();
|
||||
for (String paragraph : paragraphs) {
|
||||
if (paragraph == null || paragraph.length() <= TEXT_SIMILARITY_MAX_PARAGRAPH_LENGTH) {
|
||||
result.add(paragraph);
|
||||
} else {
|
||||
result.addAll(splitLongParagraph(paragraph, TEXT_SIMILARITY_MAX_PARAGRAPH_LENGTH));
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* 判断是否应该使用长段落分割策略
|
||||
*
|
||||
@@ -563,7 +574,7 @@ public class BiddingContent {
|
||||
|
||||
// 解析配置并检查是否启用忽略关键信息类型
|
||||
try {
|
||||
SmartFilterConfig config = com.alibaba.fastjson.JSONObject.parseObject(smartFilterConfig, SmartFilterConfig.class);
|
||||
SmartFilterConfig config = JSONObject.parseObject(smartFilterConfig, SmartFilterConfig.class);
|
||||
return config != null && config.getIgnoreKeyInfoTypesEnable();
|
||||
} catch (Exception e) {
|
||||
log.warn("解析智能过滤配置失败,使用默认段落分割策略: {}", e.getMessage());
|
||||
|
||||
@@ -2,24 +2,11 @@ package org.dromara.aiCheck.service;
|
||||
|
||||
import com.aspose.words.*;
|
||||
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
|
||||
import com.fasterxml.jackson.core.JsonParser;
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.core.type.TypeReference;
|
||||
import com.fasterxml.jackson.databind.JsonNode;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import org.dromara.aiCheck.Utils.MapTypeConverter;
|
||||
import org.dromara.aiCheck.config.AiCheckConcurrencySupport;
|
||||
import org.dromara.chat.domain.vo.LLMRequest;
|
||||
import org.dromara.chat.domain.vo.LLMResponse;
|
||||
import org.dromara.chat.domain.vo.Message;
|
||||
import org.dromara.chat.service.impl.LLMService;
|
||||
import org.dromara.common.core.domain.R;
|
||||
import org.dromara.common.core.utils.SpringUtils;
|
||||
import org.dromara.common.core.utils.StringUtils;
|
||||
import org.dromara.common.core.utils.file.AsposeTempFileUtils;
|
||||
import org.dromara.common.core.utils.file.FileParseUtil;
|
||||
import org.dromara.common.core.utils.file.FileUtils;
|
||||
import org.dromara.review.domain.bo.AiPromptBo;
|
||||
import org.dromara.review.domain.bo.SmartFilterConfig;
|
||||
import org.dromara.review.service.IAiPromptService;
|
||||
import org.dromara.system.domain.vo.SysOssVo;
|
||||
@@ -28,17 +15,14 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
import reactor.core.publisher.Mono;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.*;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
@@ -80,8 +64,8 @@ public class SmartFilterProcessor {
|
||||
String biddingFileContent = null;
|
||||
Set<String> tempLocalFiles = new HashSet<>();
|
||||
|
||||
// 缓存已读取的文件内容,key为ossId,value为标题内容映射
|
||||
Map<Long,List<String>> fileTitleContentCache = new HashMap<>();
|
||||
// 缓存技术标准所在段落ID,key为ossId,value为doc_g段落ID集合
|
||||
Map<Long, Set<String>> fileTechnicalStandardParagraphIdCache = new HashMap<>();
|
||||
|
||||
try {
|
||||
// 检查是否需要忽略与投标文件相同的内容
|
||||
@@ -92,18 +76,20 @@ public class SmartFilterProcessor {
|
||||
File file = new File(biddingFileLocalPath);
|
||||
biddingFileContent = FileParseUtil.parseFile(file);
|
||||
}
|
||||
for (Map<String, Object> stringObjectMap : paragraphsWithOssId) {
|
||||
Long ossId = (Long) stringObjectMap.get("ossId");
|
||||
List<String> titleContentList = fileTitleContentCache.get(ossId);
|
||||
if (titleContentList == null) {
|
||||
// 下载文件到本地
|
||||
String localFilePath = downloadToUniqueLocalPath(ossId, "smartfilter_src_");
|
||||
tempLocalFiles.add(localFilePath);
|
||||
File file = new File(localFilePath);
|
||||
// 使用Aspose.Words API解析文件并提取标题内容
|
||||
titleContentList = extractTitleWithContentFromFile(file);
|
||||
// 缓存结果
|
||||
fileTitleContentCache.put(ossId, titleContentList);
|
||||
if (Boolean.TRUE.equals(config.getIgnoreTechnicalStandards())) {
|
||||
for (Map<String, Object> stringObjectMap : paragraphsWithOssId) {
|
||||
Long ossId = MapTypeConverter.getMapLongValue(stringObjectMap, "ossId");
|
||||
Set<String> technicalStandardParagraphIds = fileTechnicalStandardParagraphIdCache.get(ossId);
|
||||
if (technicalStandardParagraphIds == null) {
|
||||
// 下载文件到本地
|
||||
String localFilePath = downloadToUniqueLocalPath(ossId, "smartfilter_src_");
|
||||
tempLocalFiles.add(localFilePath);
|
||||
File file = new File(localFilePath);
|
||||
// 使用Aspose.Words API解析文件并提取技术标准所在段落ID
|
||||
technicalStandardParagraphIds = extractTechnicalStandardParagraphIdsFromFile(file);
|
||||
// 缓存结果
|
||||
fileTechnicalStandardParagraphIdCache.put(ossId, technicalStandardParagraphIds);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -137,8 +123,10 @@ public class SmartFilterProcessor {
|
||||
|
||||
// 忽略技术标准
|
||||
if (Boolean.TRUE.equals(config.getIgnoreTechnicalStandards())) {
|
||||
List<String> technicalStandards = fileTitleContentCache.get(ossId);
|
||||
filteredText = removeTechnicalStandards(filteredText,technicalStandards);
|
||||
Set<String> technicalStandardParagraphIds = fileTechnicalStandardParagraphIdCache.get(ossId);
|
||||
if (isTechnicalStandardParagraph(paragraphId, technicalStandardParagraphIds)) {
|
||||
filteredText = "";
|
||||
}
|
||||
}
|
||||
|
||||
//6. 忽略重点信息相似词 --只限于重点信息
|
||||
@@ -272,22 +260,12 @@ public class SmartFilterProcessor {
|
||||
}
|
||||
|
||||
/**
|
||||
* 移除技术标准内容(包括编制说明及其子目录)
|
||||
* 判断当前段落是否属于技术标准内容(包括编制说明及其子目录)
|
||||
*/
|
||||
private String removeTechnicalStandards(String content,List<String> technicalStandards) {
|
||||
boolean flag = false;
|
||||
for (String para : technicalStandards) {
|
||||
String para2 = para.trim();
|
||||
|
||||
if (content.equals(para2)) {
|
||||
flag = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(!flag) {
|
||||
return content;
|
||||
}
|
||||
return "";
|
||||
private boolean isTechnicalStandardParagraph(String paragraphId, Set<String> technicalStandardParagraphIds) {
|
||||
return StringUtils.isNotBlank(paragraphId)
|
||||
&& CollectionUtils.isNotEmpty(technicalStandardParagraphIds)
|
||||
&& technicalStandardParagraphIds.contains(paragraphId);
|
||||
}
|
||||
|
||||
|
||||
@@ -315,22 +293,18 @@ public class SmartFilterProcessor {
|
||||
|
||||
|
||||
/**
|
||||
* 从文件中提取所有标题及其内容
|
||||
* 从文件中提取技术标准标题及其内容的段落ID
|
||||
*/
|
||||
private List<String> extractTitleWithContentFromFile(File file) throws Exception {
|
||||
List<String> result = new ArrayList<>();
|
||||
result.add("编制说明");
|
||||
result.add("编制依据");
|
||||
result.add("编制原则");
|
||||
result.add("编制范围");
|
||||
private Set<String> extractTechnicalStandardParagraphIdsFromFile(File file) throws Exception {
|
||||
Set<String> result = new HashSet<>();
|
||||
try {
|
||||
LoadOptions loadOptions = new LoadOptions();
|
||||
loadOptions.setTempFolder(ASPOSE_TEMP_DIR.toString());
|
||||
Document doc = new Document(file.getPath(), loadOptions);
|
||||
result.addAll(extractSpecificTitleContent(doc, "编制说明"));
|
||||
result.addAll(extractSpecificTitleContent(doc, "编制依据"));
|
||||
result.addAll(extractSpecificTitleContent(doc, "编制原则"));
|
||||
result.addAll(extractSpecificTitleContent(doc, "编制范围"));
|
||||
result.addAll(extractSpecificTitleParagraphIds(doc, "编制说明"));
|
||||
result.addAll(extractSpecificTitleParagraphIds(doc, "编制依据"));
|
||||
result.addAll(extractSpecificTitleParagraphIds(doc, "编制原则"));
|
||||
result.addAll(extractSpecificTitleParagraphIds(doc, "编制范围"));
|
||||
return result;
|
||||
} finally {
|
||||
AsposeTempFileUtils.cleanupStaleAsposeTempFiles(SmartFilterProcessor.class, ASPOSE_TEMP_DIR);
|
||||
@@ -342,18 +316,18 @@ public class SmartFilterProcessor {
|
||||
*
|
||||
* @param doc Aspose.Words文档对象
|
||||
* @param targetTitle 要提取的标题文本
|
||||
* @return 该标题下的内容段落列表
|
||||
* @return 该标题下的内容段落ID集合
|
||||
*/
|
||||
public static List<String> extractSpecificTitleContent(Document doc, String targetTitle) {
|
||||
List<String> contentList = new ArrayList<>();
|
||||
public static Set<String> extractSpecificTitleParagraphIds(Document doc, String targetTitle) throws Exception {
|
||||
Set<String> paragraphIds = new HashSet<>();
|
||||
boolean isTargetTitleFound = false;
|
||||
boolean isNextTitleFound = false;
|
||||
|
||||
// 获取文档中的所有段落
|
||||
NodeCollection<com.aspose.words.Paragraph> paragraphs = doc.getChildNodes(NodeType.PARAGRAPH, true);
|
||||
List<Paragraph> paragraphs = collectComparableParagraphs(doc);
|
||||
|
||||
for (com.aspose.words.Paragraph paragraph : paragraphs) {
|
||||
String paragraphText = paragraph.getText().trim();
|
||||
for (int i = 0; i < paragraphs.size(); i++) {
|
||||
Paragraph paragraph = paragraphs.get(i);
|
||||
String paragraphId = "doc_g" + (i + 1);
|
||||
String paragraphText = extractParagraphPlainText(paragraph);
|
||||
if (paragraphText.isEmpty()) {
|
||||
continue; // 跳过空段落
|
||||
}
|
||||
@@ -362,17 +336,22 @@ public class SmartFilterProcessor {
|
||||
String styleName = paragraph.getParagraphFormat().getStyle().getName();
|
||||
|
||||
// 检查是否是标题样式(以"标题"或"Heading"开头)
|
||||
boolean isHeading = styleName.startsWith("Heading 2") || styleName.startsWith("Heading 1");
|
||||
boolean isHeading = styleName.startsWith("Heading 2")
|
||||
|| styleName.startsWith("Heading 1")
|
||||
|| styleName.startsWith("标题 2")
|
||||
|| styleName.startsWith("标题 1");
|
||||
|
||||
if (paragraphText.equals(targetTitle)) {
|
||||
paragraphIds.add(paragraphId);
|
||||
}
|
||||
|
||||
if (isTargetTitleFound) {
|
||||
// 如果已经找到了目标标题,现在检查是否遇到了下一个标题
|
||||
if (isHeading) {
|
||||
// 遇到了下一个标题,结束当前标题的内容提取
|
||||
isNextTitleFound = true;
|
||||
break;
|
||||
} else {
|
||||
// 添加内容到列表
|
||||
contentList.add(paragraphText);
|
||||
paragraphIds.add(paragraphId);
|
||||
}
|
||||
} else {
|
||||
// 寻找目标标题
|
||||
@@ -381,7 +360,57 @@ public class SmartFilterProcessor {
|
||||
}
|
||||
}
|
||||
}
|
||||
return contentList;
|
||||
return paragraphIds;
|
||||
}
|
||||
|
||||
private static List<Paragraph> collectComparableParagraphs(Document doc) {
|
||||
List<Paragraph> paragraphs = new ArrayList<>();
|
||||
for (Section section : doc.getSections()) {
|
||||
for (Object paragraphObj : section.getBody().getChildNodes(NodeType.PARAGRAPH, true)) {
|
||||
Paragraph paragraph = (Paragraph) paragraphObj;
|
||||
if (!isInsideShape(paragraph)) {
|
||||
paragraphs.add(paragraph);
|
||||
}
|
||||
}
|
||||
}
|
||||
return paragraphs;
|
||||
}
|
||||
|
||||
private static boolean isInsideShape(Paragraph paragraph) {
|
||||
Node current = paragraph;
|
||||
while (current != null) {
|
||||
int nodeType = current.getNodeType();
|
||||
if (nodeType == NodeType.SHAPE || nodeType == NodeType.GROUP_SHAPE) {
|
||||
return true;
|
||||
}
|
||||
current = current.getParentNode();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private static String extractParagraphPlainText(Paragraph paragraph) {
|
||||
try {
|
||||
Paragraph sanitizedParagraph = (Paragraph) paragraph.deepClone(true);
|
||||
removeShapeNodes(sanitizedParagraph);
|
||||
return sanitizedParagraph.toString(SaveFormat.TEXT).trim();
|
||||
} catch (Exception ignored) {
|
||||
try {
|
||||
return paragraph.toString(SaveFormat.TEXT).trim();
|
||||
} catch (Exception ex) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void removeShapeNodes(Paragraph paragraph) throws Exception {
|
||||
NodeCollection shapes = paragraph.getChildNodes(NodeType.SHAPE, true);
|
||||
for (int i = shapes.getCount() - 1; i >= 0; i--) {
|
||||
shapes.get(i).remove();
|
||||
}
|
||||
NodeCollection groupShapes = paragraph.getChildNodes(NodeType.GROUP_SHAPE, true);
|
||||
for (int i = groupShapes.getCount() - 1; i >= 0; i--) {
|
||||
groupShapes.get(i).remove();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -247,8 +247,13 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS
|
||||
SmartFilterConfig smartFilterConfig) {
|
||||
try {
|
||||
|
||||
int totalItems1 = paragraphs1.size();
|
||||
int totalItems2 = paragraphs2.size();
|
||||
// 确定是否在相似度计算时忽略标点符号
|
||||
boolean ignorePunctuation = smartFilterConfig != null
|
||||
&& Boolean.TRUE.equals(smartFilterConfig.getIgnorePunctuationAndShortText());
|
||||
List<SimilarityParagraph> preparedParagraphs1 = prepareSimilarityParagraphs(paragraphs1, ignorePunctuation);
|
||||
List<SimilarityParagraph> preparedParagraphs2 = prepareSimilarityParagraphs(paragraphs2, ignorePunctuation);
|
||||
int totalItems1 = preparedParagraphs1.size();
|
||||
int totalItems2 = preparedParagraphs2.size();
|
||||
int similarItemCount = 0;
|
||||
List<Map<String, Object>> duplicateContents = new ArrayList<>();
|
||||
|
||||
@@ -261,42 +266,24 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS
|
||||
return new SimilarityResult(0L, duplicateContents);
|
||||
}
|
||||
|
||||
// 确定是否在相似度计算时忽略标点符号
|
||||
boolean ignorePunctuation = smartFilterConfig != null
|
||||
&& Boolean.TRUE.equals(smartFilterConfig.getIgnorePunctuationAndShortText());
|
||||
|
||||
// 比较每对段落的相似度
|
||||
// 缓存已查询过的pdfOssId结果
|
||||
Map<Long, Long> pdfOssIdCache = new HashMap<>();
|
||||
Map<Long, Long> dtlIdCache = new HashMap<>();
|
||||
for (int i = 0; i < totalItems1; i++) {
|
||||
String para1 = paragraphs1.get(i).get("text").toString();
|
||||
String page1 = paragraphs1.get(i).get("page").toString();
|
||||
String paragraphId = paragraphs1.get(i).get("paragraphId").toString();
|
||||
// 直接从ossIds1列表中获取当前段落对应的ossId
|
||||
Long ossId1 = (Long) paragraphs1.get(i).get("ossId");
|
||||
|
||||
if (para1 == null || para1.trim().isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
SimilarityParagraph left = preparedParagraphs1.get(i);
|
||||
|
||||
for (int j = 0; j < totalItems2; j++) {
|
||||
// 如果段落2的第j个元素已经被匹配过,则跳过
|
||||
if (matchedIndices2.contains(j)) {
|
||||
continue;
|
||||
}
|
||||
String page2 = paragraphs2.get(j).get("page").toString();
|
||||
|
||||
String para2 = paragraphs2.get(j).get("text").toString();
|
||||
String paragraphId2 = paragraphs2.get(j).get("paragraphId").toString();
|
||||
// 直接从ossIds1列表中获取当前段落对应的ossId
|
||||
Long ossId2 = (Long) paragraphs2.get(j).get("ossId");
|
||||
|
||||
if (para2 == null || para2.trim().isEmpty()) {
|
||||
SimilarityParagraph right = preparedParagraphs2.get(j);
|
||||
if (!canReachSimilarityThreshold(left, right, threshold)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
double similarity = calculateParagraphSimilarity(para1, para2, ignorePunctuation);
|
||||
double similarity = left.normalizedText.equals(right.normalizedText)
|
||||
? 1.0
|
||||
: calculateParagraphSimilarity(left.normalizedText, right.normalizedText);
|
||||
// 如果相似度超过阈值,则认为是重复内容
|
||||
if (similarity * 100 >= threshold) {
|
||||
similarItemCount++;
|
||||
@@ -307,22 +294,22 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS
|
||||
|
||||
// 为每个重复段落创建包含内容和具体文件ID的Map
|
||||
com.alibaba.fastjson.JSONObject duplicateItem = new com.alibaba.fastjson.JSONObject();
|
||||
duplicateItem.put("smallContent", para1);
|
||||
if (para1.length() > 20) {
|
||||
content = para1.substring(0, 20);
|
||||
duplicateItem.put("smallContent", left.originalText);
|
||||
if (left.originalText.length() > 20) {
|
||||
content = left.originalText.substring(0, 20);
|
||||
duplicateItem.put("smallContent", content);
|
||||
}
|
||||
duplicateItem.put("smallContentB", para2);
|
||||
if (para2.length() > 20) {
|
||||
content = para2.substring(0, 20);
|
||||
duplicateItem.put("smallContentB", right.originalText);
|
||||
if (right.originalText.length() > 20) {
|
||||
content = right.originalText.substring(0, 20);
|
||||
duplicateItem.put("smallContentB", content);
|
||||
}
|
||||
duplicateItem.put("content", para1);
|
||||
duplicateItem.put("contentB", para2);
|
||||
duplicateItem.put("page2",page2);
|
||||
duplicateItem.put("page1", page1);
|
||||
duplicateItem.put("paragraphId2", paragraphId2);
|
||||
duplicateItem.put("paragraphId1", paragraphId);
|
||||
duplicateItem.put("content", left.originalText);
|
||||
duplicateItem.put("contentB", right.originalText);
|
||||
duplicateItem.put("page2", right.page);
|
||||
duplicateItem.put("page1", left.page);
|
||||
duplicateItem.put("paragraphId2", right.paragraphId);
|
||||
duplicateItem.put("paragraphId1", left.paragraphId);
|
||||
|
||||
// 使用缓存获取pdf1
|
||||
// Long pdf1 = pdfOssIdCache.get(ossId1);
|
||||
@@ -337,19 +324,19 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS
|
||||
// pdf2 = biddingProposalDtlMapper.findPdfOssId(ossId2);
|
||||
// pdfOssIdCache.put(ossId2, pdf2);
|
||||
// }
|
||||
Long dtlId1 = dtlIdCache.get(ossId1);
|
||||
Long dtlId1 = dtlIdCache.get(left.ossId);
|
||||
if (dtlId1 == null) {
|
||||
dtlId1 = biddingProposalDtlMapper.findDtlIdByOssId(ossId1);
|
||||
dtlIdCache.put(ossId1, dtlId1);
|
||||
dtlId1 = biddingProposalDtlMapper.findDtlIdByOssId(left.ossId);
|
||||
dtlIdCache.put(left.ossId, dtlId1);
|
||||
}
|
||||
|
||||
Long dtlId2 = dtlIdCache.get(ossId2);
|
||||
Long dtlId2 = dtlIdCache.get(right.ossId);
|
||||
if (dtlId2 == null) {
|
||||
dtlId2 = biddingProposalDtlMapper.findDtlIdByOssId(ossId2);
|
||||
dtlIdCache.put(ossId2, dtlId2);
|
||||
dtlId2 = biddingProposalDtlMapper.findDtlIdByOssId(right.ossId);
|
||||
dtlIdCache.put(right.ossId, dtlId2);
|
||||
}
|
||||
//文本相似度结果,每个重复段落中已经包含了对应的具体文件ID
|
||||
BiddingAnalysisResultDtl textResult = addResult(duplicateItem.toString(), 3, ossId1, ossId2, dtlId1, dtlId2);
|
||||
BiddingAnalysisResultDtl textResult = addResult(duplicateItem.toString(), 3, left.ossId, right.ossId, dtlId1, dtlId2);
|
||||
biddingResultList.add(textResult);
|
||||
break; // 避免一个段落被多次匹配
|
||||
}
|
||||
@@ -509,15 +496,93 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS
|
||||
}
|
||||
|
||||
/**
|
||||
* 计算两个段落的相似度(适用于中文文本)
|
||||
* 使用字符级别的相似度算法,结合LCS和字符重叠度
|
||||
*
|
||||
* @param para1 段落1
|
||||
* @param para2 段落2
|
||||
* @param ignorePunctuation 是否在比较时忽略标点符号
|
||||
* @return 相似度(0.0-1.0之间的值,1.0表示完全相同)
|
||||
* 预处理段落,避免在两两比较时重复清洗文本和构建字符集合。
|
||||
*/
|
||||
private double calculateParagraphSimilarity(String para1, String para2, boolean ignorePunctuation) {
|
||||
private List<SimilarityParagraph> prepareSimilarityParagraphs(List<Map<String, Object>> paragraphs, boolean ignorePunctuation) {
|
||||
List<SimilarityParagraph> result = new ArrayList<>();
|
||||
if (paragraphs == null || paragraphs.isEmpty()) {
|
||||
return result;
|
||||
}
|
||||
for (Map<String, Object> paragraph : paragraphs) {
|
||||
Object textObj = paragraph.get("text");
|
||||
if (textObj == null) {
|
||||
continue;
|
||||
}
|
||||
String originalText = textObj.toString().trim();
|
||||
if (originalText.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
String normalizedText = ignorePunctuation ? removePunctuation(originalText) : originalText;
|
||||
normalizedText = normalizedText.trim();
|
||||
if (normalizedText.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
result.add(new SimilarityParagraph(
|
||||
originalText,
|
||||
normalizedText,
|
||||
String.valueOf(paragraph.getOrDefault("page", "")),
|
||||
String.valueOf(paragraph.getOrDefault("paragraphId", "")),
|
||||
toLong(paragraph.get("ossId")),
|
||||
buildCharSet(normalizedText)
|
||||
));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private boolean canReachSimilarityThreshold(SimilarityParagraph left, SimilarityParagraph right, int threshold) {
|
||||
if (left.normalizedText.equals(right.normalizedText)) {
|
||||
return true;
|
||||
}
|
||||
int maxLength = Math.max(left.length, right.length);
|
||||
if (maxLength == 0) {
|
||||
return false;
|
||||
}
|
||||
double lengthUpperBound = (double) Math.min(left.length, right.length) / maxLength;
|
||||
double charOverlap = calculateCharOverlapSimilarity(left.uniqueChars, right.uniqueChars);
|
||||
double similarityUpperBound = lengthUpperBound * 0.7 + charOverlap * 0.3;
|
||||
return similarityUpperBound * 100 >= threshold;
|
||||
}
|
||||
|
||||
private String removePunctuation(String text) {
|
||||
StringBuilder builder = new StringBuilder(text.length());
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
char c = text.charAt(i);
|
||||
int type = Character.getType(c);
|
||||
if (type != Character.CONNECTOR_PUNCTUATION
|
||||
&& type != Character.DASH_PUNCTUATION
|
||||
&& type != Character.START_PUNCTUATION
|
||||
&& type != Character.END_PUNCTUATION
|
||||
&& type != Character.INITIAL_QUOTE_PUNCTUATION
|
||||
&& type != Character.FINAL_QUOTE_PUNCTUATION
|
||||
&& type != Character.OTHER_PUNCTUATION) {
|
||||
builder.append(c);
|
||||
}
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
private Set<Character> buildCharSet(String text) {
|
||||
Set<Character> chars = new HashSet<>();
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
char c = text.charAt(i);
|
||||
if (!Character.isWhitespace(c)) {
|
||||
chars.add(c);
|
||||
}
|
||||
}
|
||||
return chars;
|
||||
}
|
||||
|
||||
private Long toLong(Object value) {
|
||||
if (value instanceof Long) {
|
||||
return (Long) value;
|
||||
}
|
||||
if (value instanceof Number) {
|
||||
return ((Number) value).longValue();
|
||||
}
|
||||
return value == null ? null : Long.valueOf(value.toString());
|
||||
}
|
||||
|
||||
private double calculateParagraphSimilarity(String para1, String para2) {
|
||||
if (para1 == null || para2 == null) {
|
||||
return 0.0;
|
||||
}
|
||||
@@ -526,12 +591,6 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS
|
||||
String text1 = para1.trim();
|
||||
String text2 = para2.trim();
|
||||
|
||||
// 如果配置了忽略标点符号,则在比较时临时移除标点符号(但不影响原始文本)
|
||||
if (ignorePunctuation) {
|
||||
text1 = text1.replaceAll("\\p{P}", "");
|
||||
text2 = text2.replaceAll("\\p{P}", "");
|
||||
}
|
||||
|
||||
// 如果任一文本为空,返回0
|
||||
if (text1.isEmpty() || text2.isEmpty()) {
|
||||
return 0.0;
|
||||
@@ -561,6 +620,27 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS
|
||||
return finalSimilarity;
|
||||
}
|
||||
|
||||
private static class SimilarityParagraph {
|
||||
private final String originalText;
|
||||
private final String normalizedText;
|
||||
private final String page;
|
||||
private final String paragraphId;
|
||||
private final Long ossId;
|
||||
private final int length;
|
||||
private final Set<Character> uniqueChars;
|
||||
|
||||
private SimilarityParagraph(String originalText, String normalizedText, String page,
|
||||
String paragraphId, Long ossId, Set<Character> uniqueChars) {
|
||||
this.originalText = originalText;
|
||||
this.normalizedText = normalizedText;
|
||||
this.page = page;
|
||||
this.paragraphId = paragraphId;
|
||||
this.ossId = ossId;
|
||||
this.length = normalizedText.length();
|
||||
this.uniqueChars = uniqueChars;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 基于最长公共子序列(LCS)计算相似度
|
||||
*/
|
||||
@@ -603,21 +683,13 @@ public class BiddingAnalysisResultServiceImpl implements IBiddingAnalysisResultS
|
||||
*/
|
||||
private double calculateCharOverlapSimilarity(String text1, String text2) {
|
||||
// 将文本转换为字符集合(去除空白字符和标点符号)
|
||||
Set<Character> chars1 = new HashSet<>();
|
||||
Set<Character> chars2 = new HashSet<>();
|
||||
Set<Character> chars1 = buildCharSet(text1);
|
||||
Set<Character> chars2 = buildCharSet(text2);
|
||||
|
||||
for (char c : text1.toCharArray()) {
|
||||
if (!Character.isWhitespace(c)) {
|
||||
chars1.add(c);
|
||||
}
|
||||
}
|
||||
|
||||
for (char c : text2.toCharArray()) {
|
||||
if (!Character.isWhitespace(c)) {
|
||||
chars2.add(c);
|
||||
}
|
||||
}
|
||||
return calculateCharOverlapSimilarity(chars1, chars2);
|
||||
}
|
||||
|
||||
private double calculateCharOverlapSimilarity(Set<Character> chars1, Set<Character> chars2) {
|
||||
if (chars1.isEmpty() || chars2.isEmpty()) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user