【功能新增】AI:知识库,新增切片接口
This commit is contained in:
parent
e5cc9d2ad8
commit
d1e207899a
|
@ -0,0 +1,5 @@
|
|||
### 切片内容
|
||||
GET {{baseUrl}}/ai/knowledge/segment/split?url=https://static.iocoder.cn/README_yudao.md&segmentMaxTokens=800
|
||||
Content-Type: application/json
|
||||
Authorization: Bearer {{token}}
|
||||
tenant-id: {{adminTenantId}}
|
|
@ -10,12 +10,16 @@ import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.segment.AiKnowle
|
|||
import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeSegmentDO;
|
||||
import cn.iocoder.yudao.module.ai.service.knowledge.AiKnowledgeSegmentService;
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.Parameter;
|
||||
import io.swagger.v3.oas.annotations.Parameters;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
import jakarta.annotation.Resource;
|
||||
import jakarta.validation.Valid;
|
||||
import org.springframework.validation.annotation.Validated;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import static cn.iocoder.yudao.framework.common.pojo.CommonResult.success;
|
||||
|
||||
// TODO @芋艿:增加权限标识
|
||||
|
@ -30,7 +34,8 @@ public class AiKnowledgeSegmentController {
|
|||
|
||||
/**
 * GET /page: returns one page of knowledge segments.
 *
 * Delegates the query to segmentService.getKnowledgeSegmentPage and converts the
 * resulting AiKnowledgeSegmentDO page into AiKnowledgeSegmentRespVO via BeanUtils.toBean.
 *
 * @param pageReqVO paging / filter request, validated through @Valid
 * @return the converted page wrapped in a success CommonResult
 */
@GetMapping("/page")
|
||||
@Operation(summary = "获取段落分页")
|
||||
public CommonResult<PageResult<AiKnowledgeSegmentRespVO>> getKnowledgeSegmentPage(@Valid AiKnowledgeSegmentPageReqVO pageReqVO) {
|
||||
public CommonResult<PageResult<AiKnowledgeSegmentRespVO>> getKnowledgeSegmentPage(
|
||||
@Valid AiKnowledgeSegmentPageReqVO pageReqVO) {
|
||||
PageResult<AiKnowledgeSegmentDO> pageResult = segmentService.getKnowledgeSegmentPage(pageReqVO);
|
||||
return success(BeanUtils.toBean(pageResult, AiKnowledgeSegmentRespVO.class));
|
||||
}
|
||||
|
@ -44,9 +49,23 @@ public class AiKnowledgeSegmentController {
|
|||
|
||||
/**
 * PUT /update-status: changes the status of a knowledge segment.
 *
 * The work is done entirely by segmentService.updateKnowledgeSegmentStatus; this
 * method only validates the request body and reports success.
 *
 * @param reqVO status-update request taken from the JSON body, validated through @Valid
 * @return a success CommonResult carrying {@code true}
 */
@PutMapping("/update-status")
|
||||
@Operation(summary = "启禁用段落内容")
|
||||
public CommonResult<Boolean> updateKnowledgeSegmentStatus(@Valid @RequestBody AiKnowledgeSegmentUpdateStatusReqVO reqVO) {
|
||||
public CommonResult<Boolean> updateKnowledgeSegmentStatus(
|
||||
@Valid @RequestBody AiKnowledgeSegmentUpdateStatusReqVO reqVO) {
|
||||
segmentService.updateKnowledgeSegmentStatus(reqVO);
|
||||
|
||||
return success(true);
|
||||
}
|
||||
|
||||
/**
 * GET /split: splits the content found at a URL into segments and returns them.
 *
 * Delegates to segmentService.splitContent(url, segmentMaxTokens) and converts the
 * returned AiKnowledgeSegmentDO list into AiKnowledgeSegmentRespVO via BeanUtils.toBean.
 *
 * @param url              document URL; required query parameter
 * @param segmentMaxTokens maximum number of tokens per segment; required query parameter
 * @return the converted segment list wrapped in a success CommonResult
 */
// NOTE(review): url is forwarded to the service unchanged and segmentMaxTokens is not
// range-checked here; confirm whether validation happens further down the call chain.
@GetMapping("/split")
|
||||
@Operation(summary = "切片内容")
|
||||
@Parameters({
|
||||
@Parameter(name = "url", description = "文档 URL", required = true),
|
||||
@Parameter(name = "segmentMaxTokens", description = "分段的最大 Token 数", required = true)
|
||||
})
|
||||
public CommonResult<List<AiKnowledgeSegmentRespVO>> splitContent(
|
||||
@RequestParam("url") String url,
|
||||
@RequestParam(value = "segmentMaxTokens") Integer segmentMaxTokens) {
|
||||
List<AiKnowledgeSegmentDO> segments = segmentService.splitContent(url, segmentMaxTokens);
|
||||
return success(BeanUtils.toBean(segments, AiKnowledgeSegmentRespVO.class));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -3,7 +3,7 @@ package cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.segment;
|
|||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.Data;
|
||||
|
||||
@Schema(description = "管理后台 - AI 知识库-文档 Response VO")
|
||||
@Schema(description = "管理后台 - AI 知识库文档分片 Response VO")
|
||||
@Data
|
||||
public class AiKnowledgeSegmentRespVO {
|
||||
|
||||
|
@ -22,13 +22,19 @@ public class AiKnowledgeSegmentRespVO {
|
|||
@Schema(description = "切片内容", requiredMode = Schema.RequiredMode.REQUIRED, example = "Java 开发手册")
|
||||
private String content;
|
||||
|
||||
@Schema(description = "切片内容长度", requiredMode = Schema.RequiredMode.REQUIRED, example = "1024")
|
||||
private Integer contentLength;
|
||||
|
||||
@Schema(description = "token 数量", requiredMode = Schema.RequiredMode.REQUIRED, example = "1024")
|
||||
private Integer tokens;
|
||||
|
||||
@Schema(description = "字符数", requiredMode = Schema.RequiredMode.REQUIRED, example = "1008")
|
||||
private Integer wordCount;
|
||||
@Schema(description = "召回次数", requiredMode = Schema.RequiredMode.REQUIRED, example = "10")
|
||||
private Integer retrievalCount;
|
||||
|
||||
@Schema(description = "文档状态", requiredMode = Schema.RequiredMode.REQUIRED, example = "1")
|
||||
private Integer status;
|
||||
|
||||
@Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
private Long createTime;
|
||||
|
||||
}
|
||||
|
|
|
@ -63,4 +63,12 @@ public interface AiKnowledgeDocumentService {
|
|||
*/
|
||||
AiKnowledgeDocumentDO validateKnowledgeDocumentExists(Long id);
|
||||
|
||||
/**
|
||||
* Reads the content of a URL.
|
||||
*
|
||||
* @param url the URL to read
|
||||
* @return the content that was read
|
||||
*/
|
||||
String readUrl(String url);
|
||||
|
||||
}
|
||||
|
|
|
@ -127,9 +127,10 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic
|
|||
return knowledgeDocument;
|
||||
}
|
||||
|
||||
private static String readUrl(String url) {
|
||||
@Override
|
||||
public String readUrl(String url) {
|
||||
// 下载文件
|
||||
ByteArrayResource resource = null;
|
||||
ByteArrayResource resource;
|
||||
try {
|
||||
byte[] bytes = HttpUtil.downloadBytes(url);
|
||||
if (bytes.length == 0) {
|
||||
|
|
|
@ -66,4 +66,13 @@ public interface AiKnowledgeSegmentService {
|
|||
*/
|
||||
List<AiKnowledgeSegmentDO> similaritySearch(AiKnowledgeSegmentSearchReqVO reqVO);
|
||||
|
||||
/**
|
||||
* Splits the content of a URL into multiple segments.
|
||||
*
|
||||
* @param url the URL address whose content is split
|
||||
* @param segmentMaxTokens maximum number of tokens per segment
|
||||
* @return the list of segments produced by the split
|
||||
*/
|
||||
List<AiKnowledgeSegmentDO> splitContent(String url, Integer segmentMaxTokens);
|
||||
|
||||
}
|
||||
|
|
|
@ -75,9 +75,7 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
|
|||
VectorStore vectorStore = getVectorStoreById(knowledgeDO);
|
||||
|
||||
// 2. 文档切片
|
||||
Document document = new Document(content);
|
||||
TextSplitter textSplitter = buildTokenTextSplitter(documentDO.getSegmentMaxTokens());
|
||||
List<Document> documentSegments = textSplitter.apply(Collections.singletonList(document));
|
||||
List<Document> documentSegments = splitContentByToken(content, documentDO.getSegmentMaxTokens());
|
||||
|
||||
// 3.1 存储切片
|
||||
List<AiKnowledgeSegmentDO> segmentDOs = convertList(documentSegments, segment -> {
|
||||
|
@ -86,7 +84,8 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
|
|||
}
|
||||
return new AiKnowledgeSegmentDO().setKnowledgeId(documentDO.getKnowledgeId()).setDocumentId(documentId)
|
||||
.setContent(segment.getText()).setContentLength(segment.getText().length())
|
||||
.setVectorId(AiKnowledgeSegmentDO.VECTOR_ID_EMPTY).setTokens(tokenCountEstimator.estimate(segment.getText()))
|
||||
.setVectorId(AiKnowledgeSegmentDO.VECTOR_ID_EMPTY)
|
||||
.setTokens(tokenCountEstimator.estimate(segment.getText()))
|
||||
.setStatus(CommonStatusEnum.ENABLE.getStatus());
|
||||
});
|
||||
segmentMapper.insertBatch(segmentDOs);
|
||||
|
@ -180,6 +179,26 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
|
|||
return segmentMapper.selectListByVectorIds(convertList(documents, Document::getId));
|
||||
}
|
||||
|
||||
/**
 * Reads the content behind {@code url}, splits it by token count and maps every
 * split piece to an AiKnowledgeSegmentDO.
 *
 * Only content, contentLength and tokens are set on the returned objects; this
 * method does not call segmentMapper itself.
 *
 * @param url              URL whose content is read through knowledgeDocumentService.readUrl
 * @param segmentMaxTokens maximum number of tokens per segment, passed to splitContentByToken
 * @return the segment objects built from the split pieces
 */
@Override
|
||||
public List<AiKnowledgeSegmentDO> splitContent(String url, Integer segmentMaxTokens) {
|
||||
// 1. Read the content behind the URL
|
||||
String content = knowledgeDocumentService.readUrl(url);
|
||||
|
||||
// 2. Split the content into segments
|
||||
List<Document> documentSegments = splitContentByToken(content, segmentMaxTokens);
|
||||
|
||||
// 3. Convert to segment objects
|
||||
return convertList(documentSegments, segment -> {
|
||||
// Pieces with empty text are mapped to null.
|
||||
// NOTE(review): presumably convertList drops null results; confirm against its implementation.
|
||||
if (StrUtil.isEmpty(segment.getText())) {
|
||||
return null;
|
||||
}
|
||||
return new AiKnowledgeSegmentDO()
|
||||
.setContent(segment.getText())
|
||||
.setContentLength(segment.getText().length())
|
||||
.setTokens(tokenCountEstimator.estimate(segment.getText()));
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* 校验段落是否存在
|
||||
*
|
||||
|
@ -202,6 +221,11 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
|
|||
return getVectorStoreById(knowledgeService.validateKnowledgeExists(knowledgeId));
|
||||
}
|
||||
|
||||
/**
 * Splits raw text into Document pieces using a token-based splitter.
 *
 * Wraps {@code content} in a single Document and applies the TextSplitter obtained
 * from buildTokenTextSplitter(segmentMaxTokens) to it.
 *
 * @param content          text to split
 * @param segmentMaxTokens value handed to buildTokenTextSplitter to configure the splitter
 * @return the Document pieces returned by the splitter
 */
private static List<Document> splitContentByToken(String content, Integer segmentMaxTokens) {
|
||||
TextSplitter textSplitter = buildTokenTextSplitter(segmentMaxTokens);
|
||||
return textSplitter.apply(Collections.singletonList(new Document(content)));
|
||||
}
|
||||
|
||||
private static TextSplitter buildTokenTextSplitter(Integer segmentMaxTokens) {
|
||||
return TokenTextSplitter.builder()
|
||||
.withChunkSize(segmentMaxTokens)
|
||||
|
|
Loading…
Reference in New Issue