From d1e207899a1b7e4497104cd3321f4d2fcf5c3a4c Mon Sep 17 00:00:00 2001 From: YunaiV Date: Sat, 1 Mar 2025 07:48:16 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90=E5=8A=9F=E8=83=BD=E6=96=B0=E5=A2=9E?= =?UTF-8?q?=E3=80=91AI=EF=BC=9A=E7=9F=A5=E8=AF=86=E5=BA=93=EF=BC=8C?= =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=88=87=E7=89=87=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../AiKnowledgeSegmentController.http | 5 +++ .../AiKnowledgeSegmentController.java | 23 +++++++++++-- .../vo/segment/AiKnowledgeSegmentRespVO.java | 12 +++++-- .../knowledge/AiKnowledgeDocumentService.java | 8 +++++ .../AiKnowledgeDocumentServiceImpl.java | 5 +-- .../knowledge/AiKnowledgeSegmentService.java | 13 ++++++-- .../AiKnowledgeSegmentServiceImpl.java | 32 ++++++++++++++++--- 7 files changed, 85 insertions(+), 13 deletions(-) create mode 100644 yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/AiKnowledgeSegmentController.http diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/AiKnowledgeSegmentController.http b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/AiKnowledgeSegmentController.http new file mode 100644 index 0000000000..25d622d4c1 --- /dev/null +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/AiKnowledgeSegmentController.http @@ -0,0 +1,5 @@ +### 切片内容 +GET {{baseUrl}}/ai/knowledge/segment/split?url=https://static.iocoder.cn/README_yudao.md&segmentMaxTokens=800 +Content-Type: application/json +Authorization: Bearer {{token}} +tenant-id: {{adminTenantId}} \ No newline at end of file diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/AiKnowledgeSegmentController.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/AiKnowledgeSegmentController.java index 267f0021d2..6177f5411a 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/AiKnowledgeSegmentController.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/AiKnowledgeSegmentController.java @@ -10,12 +10,16 @@ import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.segment.AiKnowle import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeSegmentDO; import cn.iocoder.yudao.module.ai.service.knowledge.AiKnowledgeSegmentService; import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.Parameter; +import io.swagger.v3.oas.annotations.Parameters; import io.swagger.v3.oas.annotations.tags.Tag; import jakarta.annotation.Resource; import jakarta.validation.Valid; import org.springframework.validation.annotation.Validated; import org.springframework.web.bind.annotation.*; +import java.util.List; + import static cn.iocoder.yudao.framework.common.pojo.CommonResult.success; // TODO @芋艿:增加权限标识 @@ -30,7 +34,8 @@ public class AiKnowledgeSegmentController { @GetMapping("/page") @Operation(summary = "获取段落分页") - public CommonResult> getKnowledgeSegmentPage(@Valid AiKnowledgeSegmentPageReqVO pageReqVO) { + public CommonResult> getKnowledgeSegmentPage( + @Valid AiKnowledgeSegmentPageReqVO pageReqVO) { PageResult pageResult = segmentService.getKnowledgeSegmentPage(pageReqVO); return success(BeanUtils.toBean(pageResult, AiKnowledgeSegmentRespVO.class)); } @@ -44,9 +49,23 @@ public class AiKnowledgeSegmentController { @PutMapping("/update-status") @Operation(summary = "启禁用段落内容") - public CommonResult updateKnowledgeSegmentStatus(@Valid @RequestBody AiKnowledgeSegmentUpdateStatusReqVO reqVO) { + public CommonResult updateKnowledgeSegmentStatus( + @Valid @RequestBody AiKnowledgeSegmentUpdateStatusReqVO reqVO) { segmentService.updateKnowledgeSegmentStatus(reqVO); return success(true); } + @GetMapping("/split") + @Operation(summary = "切片内容") + @Parameters({ + @Parameter(name = "url", description = "文档 URL", required = true), + @Parameter(name = "segmentMaxTokens", description = "分段的最大 Token 数", required = true) + }) + public CommonResult> splitContent( + @RequestParam("url") String url, + @RequestParam(value = "segmentMaxTokens") Integer segmentMaxTokens) { + List segments = segmentService.splitContent(url, segmentMaxTokens); + return success(BeanUtils.toBean(segments, AiKnowledgeSegmentRespVO.class)); + } + } diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/segment/AiKnowledgeSegmentRespVO.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/segment/AiKnowledgeSegmentRespVO.java index 5e3f2d8cbb..24c452621a 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/segment/AiKnowledgeSegmentRespVO.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/segment/AiKnowledgeSegmentRespVO.java @@ -3,7 +3,7 @@ package cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.segment; import io.swagger.v3.oas.annotations.media.Schema; import lombok.Data; -@Schema(description = "管理后台 - AI 知识库-文档 Response VO") +@Schema(description = "管理后台 - AI 知识库文档分片 Response VO") @Data public class AiKnowledgeSegmentRespVO { @@ -22,13 +22,19 @@ public class AiKnowledgeSegmentRespVO { @Schema(description = "切片内容", requiredMode = Schema.RequiredMode.REQUIRED, example = "Java 开发手册") private String content; + @Schema(description = "切片内容长度", requiredMode = Schema.RequiredMode.REQUIRED, example = "1024") + private Integer contentLength; + @Schema(description = "token 数量", requiredMode = Schema.RequiredMode.REQUIRED, example = "1024") private Integer tokens; - @Schema(description = "字符数", requiredMode = Schema.RequiredMode.REQUIRED, example = "1008") - private Integer wordCount; + @Schema(description = "召回次数", requiredMode = Schema.RequiredMode.REQUIRED, example = "10") + private Integer retrievalCount; @Schema(description = "文档状态", requiredMode = Schema.RequiredMode.REQUIRED, example = "1") private Integer status; + @Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED) + private Long createTime; + } diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentService.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentService.java index 5d5c811bb2..8bbee98179 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentService.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentService.java @@ -63,4 +63,12 @@ public interface AiKnowledgeDocumentService { */ AiKnowledgeDocumentDO validateKnowledgeDocumentExists(Long id); + /** + * 读取 URL 内容 + * + * @param url URL + * @return 内容 + */ + String readUrl(String url); + } diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentServiceImpl.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentServiceImpl.java index c90839f556..6ed7fdd57d 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentServiceImpl.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentServiceImpl.java @@ -127,9 +127,10 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic return knowledgeDocument; } - private static String readUrl(String url) { + @Override + public String readUrl(String url) { // 下载文件 - ByteArrayResource resource = null; + ByteArrayResource resource; try { byte[] bytes = HttpUtil.downloadBytes(url); if (bytes.length == 0) { diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentService.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentService.java index 064d373b73..b105dcb8eb 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentService.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentService.java @@ -29,7 +29,7 @@ public interface AiKnowledgeSegmentService { * 基于 content 内容,切片创建多个段落 * * @param documentId 知识库文档编号 - * @param content 文档内容 + * @param content 文档内容 */ void createKnowledgeSegmentBySplitContent(Long documentId, String content); @@ -37,7 +37,7 @@ public interface AiKnowledgeSegmentService { * 【异步】基于 content 内容,切片创建多个段落 * * @param documentId 知识库文档编号 - * @param content 文档内容 + * @param content 文档内容 */ @Async default void createKnowledgeSegmentBySplitContentAsync(Long documentId, String content) { @@ -66,4 +66,13 @@ public interface AiKnowledgeSegmentService { */ List similaritySearch(AiKnowledgeSegmentSearchReqVO reqVO); + /** + * 根据 URL 内容,切片创建多个段落 + * + * @param url URL 地址 + * @param segmentMaxTokens 段落最大 Token 数 + * @return 切片后的段落列表 + */ + List splitContent(String url, Integer segmentMaxTokens); + } diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentServiceImpl.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentServiceImpl.java index 615299b2db..37599b01fc 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentServiceImpl.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentServiceImpl.java @@ -75,9 +75,7 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService VectorStore vectorStore = getVectorStoreById(knowledgeDO); // 2. 文档切片 - Document document = new Document(content); - TextSplitter textSplitter = buildTokenTextSplitter(documentDO.getSegmentMaxTokens()); - List documentSegments = textSplitter.apply(Collections.singletonList(document)); + List documentSegments = splitContentByToken(content, documentDO.getSegmentMaxTokens()); // 3.1 存储切片 List segmentDOs = convertList(documentSegments, segment -> { @@ -86,7 +84,8 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService } return new AiKnowledgeSegmentDO().setKnowledgeId(documentDO.getKnowledgeId()).setDocumentId(documentId) .setContent(segment.getText()).setContentLength(segment.getText().length()) - .setVectorId(AiKnowledgeSegmentDO.VECTOR_ID_EMPTY).setTokens(tokenCountEstimator.estimate(segment.getText())) + .setVectorId(AiKnowledgeSegmentDO.VECTOR_ID_EMPTY) + .setTokens(tokenCountEstimator.estimate(segment.getText())) .setStatus(CommonStatusEnum.ENABLE.getStatus()); }); segmentMapper.insertBatch(segmentDOs); @@ -180,6 +179,26 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService return segmentMapper.selectListByVectorIds(convertList(documents, Document::getId)); } + @Override + public List splitContent(String url, Integer segmentMaxTokens) { + // 1. 读取 URL 内容 + String content = knowledgeDocumentService.readUrl(url); + + // 2. 文档切片 + List documentSegments = splitContentByToken(content, segmentMaxTokens); + + // 3. 转换为段落对象 + return convertList(documentSegments, segment -> { + if (StrUtil.isEmpty(segment.getText())) { + return null; + } + return new AiKnowledgeSegmentDO() + .setContent(segment.getText()) + .setContentLength(segment.getText().length()) + .setTokens(tokenCountEstimator.estimate(segment.getText())); + }); + } + /** * 校验段落是否存在 * @@ -202,6 +221,11 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService return getVectorStoreById(knowledgeService.validateKnowledgeExists(knowledgeId)); } + private static List splitContentByToken(String content, Integer segmentMaxTokens) { + TextSplitter textSplitter = buildTokenTextSplitter(segmentMaxTokens); + return textSplitter.apply(Collections.singletonList(new Document(content))); + } + private static TextSplitter buildTokenTextSplitter(Integer segmentMaxTokens) { return TokenTextSplitter.builder() .withChunkSize(segmentMaxTokens)