From f638f90afc5b038ef70913f2b125e141dad9a789 Mon Sep 17 00:00:00 2001
From: cherishsince
Date: Sun, 2 Mar 2025 12:19:12 +0800
Subject: [PATCH] =?UTF-8?q?=E3=80=90=E6=96=B0=E5=A2=9E=E3=80=91=E8=81=94?=
 =?UTF-8?q?=E7=BD=91=E6=90=9C=E7=B4=A2=EF=BC=8C=E7=88=AC=E8=99=AB=E6=8A=93?=
 =?UTF-8?q?=E5=8F=96=E7=BD=91=E9=A1=B5=E5=86=85=E5=AE=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../service/websearch/WebSearchService.java   |  9 +++
 .../websearch/WebSearchServiceImpl.java       | 76 +++++++++++++++++++
 .../ai/service/WebSearchServiceTests.java     | 25 ++++++
 3 files changed, 110 insertions(+)
 create mode 100644 yudao-module-ai/yudao-module-ai-biz/src/test/java/cn/iocoder/yudao/module/ai/service/WebSearchServiceTests.java

diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/websearch/WebSearchService.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/websearch/WebSearchService.java
index 264ca74624..0024e3475c 100644
--- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/websearch/WebSearchService.java
+++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/websearch/WebSearchService.java
@@ -3,6 +3,7 @@ package cn.iocoder.yudao.module.ai.service.websearch;
 import cn.iocoder.yudao.module.ai.service.websearch.vo.WebSearchRespVO;
 
 import java.util.List;
+import java.util.Map;
 
 /**
  * Web 搜索 Service 接口
@@ -26,4 +27,12 @@ public interface WebSearchService {
      * @return 搜索结果列表
      */
     List<WebSearchRespVO> googleSearch(String query, Integer count);
+
+    /**
+     * Web crawler
+     *
+     * @param urls URLs to crawl
+     * @return key: URL, value: crawled page text
+     */
+    Map<String, String> webCrawler(List<String> urls);
 }
diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/websearch/WebSearchServiceImpl.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/websearch/WebSearchServiceImpl.java
index 8c0036c94a..071ece980c 100644
--- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/websearch/WebSearchServiceImpl.java
+++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/websearch/WebSearchServiceImpl.java
@@ -10,9 +10,12 @@ import cn.iocoder.yudao.module.ai.service.websearch.vo.WebSearchRespVO;
 import lombok.extern.slf4j.Slf4j;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Service;
+import org.jsoup.Jsoup;
 
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 /**
  * Bing Web 搜索实现类
@@ -136,4 +139,77 @@ public class WebSearchServiceImpl implements WebSearchService {
             return CollUtil.newArrayList();
         }
     }
+
+    /**
+     * Web crawler: fetches each URL and extracts the text of the page body.
+     *
+     * @param urls URLs to crawl
+     * @return key: URL, value: extracted text (empty string when the fetch fails)
+     */
+    @Override
+    public Map<String, String> webCrawler(List<String> urls) {
+        if (CollUtil.isEmpty(urls)) {
+            return Map.of();
+        }
+
+        Map<String, String> result = new HashMap<>();
+        for (String url : urls) {
+            try {
+                // Derive the Origin / Referer header value from the URL itself
+                String origin = extractOrigin(url);
+
+                // Build the HTTP request with browser-like headers
+                HttpRequest request = HttpRequest.get(url)
+                        .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+                        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
+                        .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
+                        .header("Cache-Control", "max-age=0")
+                        .timeout(10000); // 10-second timeout
+                // extractOrigin returns "" when the URL cannot be parsed; do not send blank headers
+                if (!origin.isEmpty()) {
+                    request.header("Origin", origin).header("Referer", origin);
+                }
+
+                // HttpResponse is Closeable: close it on every path
+                try (HttpResponse response = request.execute()) {
+                    if (response.isOk()) {
+                        // Parse the HTML with Jsoup
+                        org.jsoup.nodes.Document doc = Jsoup.parse(response.body());
+
+                        // Drop script, style, meta and link elements before extracting text
+                        doc.select("script, style, meta, link").remove();
+
+                        // Take the body text and collapse runs of whitespace
+                        String text = doc.body().text().replaceAll("\\s+", " ").trim();
+                        result.put(url, text);
+                    } else {
+                        log.warn("[webCrawler][URL({}) 请求失败,状态码: {}]", url, response.getStatus());
+                        result.put(url, "");
+                    }
+                }
+            } catch (Exception e) {
+                log.error("[webCrawler][URL({}) 爬取异常]", url, e);
+                result.put(url, "");
+            }
+        }
+
+        return result;
+    }
+
+    /**
+     * Extracts the origin from a URL.
+     *
+     * @param url full URL
+     * @return origin (scheme://host[:port]), or an empty string when the URL cannot be parsed
+     */
+    private String extractOrigin(String url) {
+        try {
+            java.net.URL parsedUrl = new java.net.URL(url);
+            return parsedUrl.getProtocol() + "://" + parsedUrl.getHost()
+                    + (parsedUrl.getPort() == -1 ? "" : ":" + parsedUrl.getPort());
+        } catch (Exception e) {
+            log.warn("[extractOrigin][URL({}) 解析异常]", url, e);
+            return "";
+        }
+    }
 }
diff --git a/yudao-module-ai/yudao-module-ai-biz/src/test/java/cn/iocoder/yudao/module/ai/service/WebSearchServiceTests.java b/yudao-module-ai/yudao-module-ai-biz/src/test/java/cn/iocoder/yudao/module/ai/service/WebSearchServiceTests.java
new file mode 100644
index 0000000000..b937e46029
--- /dev/null
+++ b/yudao-module-ai/yudao-module-ai-biz/src/test/java/cn/iocoder/yudao/module/ai/service/WebSearchServiceTests.java
@@ -0,0 +1,25 @@
+package cn.iocoder.yudao.module.ai.service;
+
+import cn.iocoder.yudao.module.ai.service.websearch.WebSearchServiceImpl;
+import com.google.common.collect.Lists;
+import org.junit.jupiter.api.Test;
+
+import java.util.Map;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class WebSearchServiceTests {
+
+    @Test
+    public void webCrawlerTest() {
+        WebSearchServiceImpl webSearchService = new WebSearchServiceImpl();
+        Map<String, String> webCrawlerRes = webSearchService.webCrawler(
+                Lists.newArrayList("https://tianqi.eastday.com/changsha/40/"));
+
+        // webCrawler stores one entry per URL, whether or not the fetch succeeds
+        assertEquals(1, webCrawlerRes.size());
+        for (Map.Entry<String, String> entry : webCrawlerRes.entrySet()) {
+            System.err.println(entry.getValue());
+        }
+    }
+}