【功能新增】AI:知识库,新增切片接口

This commit is contained in:
YunaiV 2025-03-01 07:48:16 +08:00
parent e5cc9d2ad8
commit d1e207899a
7 changed files with 85 additions and 13 deletions

View File

@ -0,0 +1,5 @@
### 切片内容
GET {{baseUrl}}/ai/knowledge/segment/split?url=https://static.iocoder.cn/README_yudao.md&segmentMaxTokens=800
Content-Type: application/json
Authorization: Bearer {{token}}
tenant-id: {{adminTenantId}}

View File

@ -10,12 +10,16 @@ import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.segment.AiKnowle
import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeSegmentDO;
import cn.iocoder.yudao.module.ai.service.knowledge.AiKnowledgeSegmentService;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.Parameters;
import io.swagger.v3.oas.annotations.tags.Tag;
import jakarta.annotation.Resource;
import jakarta.validation.Valid;
import org.springframework.validation.annotation.Validated;
import org.springframework.web.bind.annotation.*;
import java.util.List;
import static cn.iocoder.yudao.framework.common.pojo.CommonResult.success;
// TODO @芋艿增加权限标识
@ -30,7 +34,8 @@ public class AiKnowledgeSegmentController {
/**
 * Returns one page of knowledge segments.
 *
 * <p>The query is delegated to {@code segmentService.getKnowledgeSegmentPage}; the page of
 * {@code AiKnowledgeSegmentDO} it returns is converted to {@code AiKnowledgeSegmentRespVO}
 * with {@code BeanUtils.toBean} before being wrapped in a success result.
 *
 * @param pageReqVO page request, validated through {@code @Valid}
 * @return success result carrying the converted page
 */
@GetMapping("/page")
@Operation(summary = "获取段落分页")
public CommonResult<PageResult<AiKnowledgeSegmentRespVO>> getKnowledgeSegmentPage(@Valid AiKnowledgeSegmentPageReqVO pageReqVO) {
public CommonResult<PageResult<AiKnowledgeSegmentRespVO>> getKnowledgeSegmentPage(
@Valid AiKnowledgeSegmentPageReqVO pageReqVO) {
// Query the data objects first, then map them to the response VO type
PageResult<AiKnowledgeSegmentDO> pageResult = segmentService.getKnowledgeSegmentPage(pageReqVO);
return success(BeanUtils.toBean(pageResult, AiKnowledgeSegmentRespVO.class));
}
@ -44,9 +49,23 @@ public class AiKnowledgeSegmentController {
/**
 * Enables or disables a knowledge segment.
 *
 * <p>The request body is handed unchanged to
 * {@code segmentService.updateKnowledgeSegmentStatus}; when that call returns normally the
 * endpoint answers with {@code success(true)}.
 *
 * @param reqVO status update request, read from the request body and validated through {@code @Valid}
 * @return success result carrying {@code true}
 */
@PutMapping("/update-status")
@Operation(summary = "启禁用段落内容")
public CommonResult<Boolean> updateKnowledgeSegmentStatus(@Valid @RequestBody AiKnowledgeSegmentUpdateStatusReqVO reqVO) {
public CommonResult<Boolean> updateKnowledgeSegmentStatus(
@Valid @RequestBody AiKnowledgeSegmentUpdateStatusReqVO reqVO) {
// The service performs the update; this method only reports completion
segmentService.updateKnowledgeSegmentStatus(reqVO);
return success(true);
}
/**
 * Splits the content behind a URL into segments and returns them.
 *
 * <p>The split itself is done by {@code segmentService.splitContent}; this endpoint only
 * converts the resulting {@code AiKnowledgeSegmentDO} list to {@code AiKnowledgeSegmentRespVO}.
 *
 * @param url              URL of the document to split
 * @param segmentMaxTokens maximum number of tokens per segment
 * @return success result carrying the converted segment list
 */
@GetMapping("/split")
@Operation(summary = "切片内容")
@Parameters({
        @Parameter(name = "url", description = "文档 URL", required = true),
        @Parameter(name = "segmentMaxTokens", description = "分段的最大 Token 数", required = true)
})
public CommonResult<List<AiKnowledgeSegmentRespVO>> splitContent(
        @RequestParam("url") String url,
        @RequestParam("segmentMaxTokens") Integer segmentMaxTokens) {
    // Split through the service, then map the data objects to response VOs
    List<AiKnowledgeSegmentRespVO> respVOs = BeanUtils.toBean(
            segmentService.splitContent(url, segmentMaxTokens), AiKnowledgeSegmentRespVO.class);
    return success(respVOs);
}
}

View File

@ -3,7 +3,7 @@ package cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.segment;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
@Schema(description = "管理后台 - AI 知识库-文档 Response VO")
@Schema(description = "管理后台 - AI 知识库文档分片 Response VO")
@Data
public class AiKnowledgeSegmentRespVO {
@ -22,13 +22,19 @@ public class AiKnowledgeSegmentRespVO {
@Schema(description = "切片内容", requiredMode = Schema.RequiredMode.REQUIRED, example = "Java 开发手册")
private String content;
@Schema(description = "切片内容长度", requiredMode = Schema.RequiredMode.REQUIRED, example = "1024")
private Integer contentLength;
@Schema(description = "token 数量", requiredMode = Schema.RequiredMode.REQUIRED, example = "1024")
private Integer tokens;
@Schema(description = "字符", requiredMode = Schema.RequiredMode.REQUIRED, example = "1008")
private Integer wordCount;
@Schema(description = "召回次", requiredMode = Schema.RequiredMode.REQUIRED, example = "10")
private Integer retrievalCount;
@Schema(description = "文档状态", requiredMode = Schema.RequiredMode.REQUIRED, example = "1")
private Integer status;
@Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED)
private Long createTime;
}

View File

@ -63,4 +63,12 @@ public interface AiKnowledgeDocumentService {
*/
AiKnowledgeDocumentDO validateKnowledgeDocumentExists(Long id);
/**
 * Reads the content found at the given URL.
 *
 * @param url URL to read
 * @return the content read from the URL
 */
String readUrl(String url);
}

View File

@ -127,9 +127,10 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic
return knowledgeDocument;
}
private static String readUrl(String url) {
@Override
public String readUrl(String url) {
// 下载文件
ByteArrayResource resource = null;
ByteArrayResource resource;
try {
byte[] bytes = HttpUtil.downloadBytes(url);
if (bytes.length == 0) {

View File

@ -66,4 +66,13 @@ public interface AiKnowledgeSegmentService {
*/
List<AiKnowledgeSegmentDO> similaritySearch(AiKnowledgeSegmentSearchReqVO reqVO);
/**
 * Splits the content behind a URL into multiple segments.
 *
 * @param url              address of the content to split
 * @param segmentMaxTokens maximum number of tokens per segment
 * @return the segments produced by the split
 */
List<AiKnowledgeSegmentDO> splitContent(String url, Integer segmentMaxTokens);
}

View File

@ -75,9 +75,7 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
VectorStore vectorStore = getVectorStoreById(knowledgeDO);
// 2. 文档切片
Document document = new Document(content);
TextSplitter textSplitter = buildTokenTextSplitter(documentDO.getSegmentMaxTokens());
List<Document> documentSegments = textSplitter.apply(Collections.singletonList(document));
List<Document> documentSegments = splitContentByToken(content, documentDO.getSegmentMaxTokens());
// 3.1 存储切片
List<AiKnowledgeSegmentDO> segmentDOs = convertList(documentSegments, segment -> {
@ -86,7 +84,8 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
}
return new AiKnowledgeSegmentDO().setKnowledgeId(documentDO.getKnowledgeId()).setDocumentId(documentId)
.setContent(segment.getText()).setContentLength(segment.getText().length())
.setVectorId(AiKnowledgeSegmentDO.VECTOR_ID_EMPTY).setTokens(tokenCountEstimator.estimate(segment.getText()))
.setVectorId(AiKnowledgeSegmentDO.VECTOR_ID_EMPTY)
.setTokens(tokenCountEstimator.estimate(segment.getText()))
.setStatus(CommonStatusEnum.ENABLE.getStatus());
});
segmentMapper.insertBatch(segmentDOs);
@ -180,6 +179,26 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
return segmentMapper.selectListByVectorIds(convertList(documents, Document::getId));
}
/**
 * Reads the content behind {@code url}, splits it by token count and maps each chunk to a
 * segment object carrying only its content, content length and estimated token count.
 *
 * @param url              address of the content to split
 * @param segmentMaxTokens maximum number of tokens per chunk
 * @return the segment objects built from the chunks
 */
@Override
public List<AiKnowledgeSegmentDO> splitContent(String url, Integer segmentMaxTokens) {
    // 1. Fetch the raw content and cut it into token-bounded chunks
    List<Document> chunks = splitContentByToken(knowledgeDocumentService.readUrl(url), segmentMaxTokens);
    // 2. Map each chunk to a segment object; a chunk without text maps to null
    //    (presumably convertList drops null results - confirm against its implementation)
    return convertList(chunks, chunk -> {
        String text = chunk.getText();
        return StrUtil.isEmpty(text)
                ? null
                : new AiKnowledgeSegmentDO()
                        .setContent(text)
                        .setContentLength(text.length())
                        .setTokens(tokenCountEstimator.estimate(text));
    });
}
/**
* 校验段落是否存在
*
@ -202,6 +221,11 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
return getVectorStoreById(knowledgeService.validateKnowledgeExists(knowledgeId));
}
/**
 * Splits a piece of text into documents using the token-based splitter.
 *
 * @param content          text to split
 * @param segmentMaxTokens chunk size handed to the token text splitter
 * @return the documents produced by the splitter
 */
private static List<Document> splitContentByToken(String content, Integer segmentMaxTokens) {
    // Build the splitter before wrapping the text, matching the original evaluation order
    TextSplitter splitter = buildTokenTextSplitter(segmentMaxTokens);
    Document wholeDocument = new Document(content);
    List<Document> singleInput = Collections.singletonList(wholeDocument);
    return splitter.apply(singleInput);
}
private static TextSplitter buildTokenTextSplitter(Integer segmentMaxTokens) {
return TokenTextSplitter.builder()
.withChunkSize(segmentMaxTokens)