[Add] Web search: crawler to fetch web page content

cherishsince 2025-03-02 12:19:12 +08:00
parent 86801517d1
commit f638f90afc
3 changed files with 108 additions and 0 deletions

WebSearchService.java (View File)

@@ -3,6 +3,7 @@ package cn.iocoder.yudao.module.ai.service.websearch;
import cn.iocoder.yudao.module.ai.service.websearch.vo.WebSearchRespVO;
import java.util.List;
import java.util.Map;
/**
* Web search Service interface
@@ -26,4 +27,12 @@ public interface WebSearchService {
* @return list of search results
*/
List<WebSearchRespVO> googleSearch(String query, Integer count);
/**
* Web crawler
*
* @param urls the URLs to crawl
* @return map keyed by URL, with the crawled page text as the value
*/
Map<String, String> webCrawler(List<String> urls);
}
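For orientation, here is a minimal caller sketch combining the two interface methods. It is illustrative only and not part of this commit: the wrapper class, the searchAndCrawl method, and the WebSearchRespVO::getUrl accessor are assumptions (the VO's field names are not shown in this diff).

// Hypothetical usage sketch: search first, then crawl the top hits.
import cn.iocoder.yudao.module.ai.service.websearch.WebSearchService;
import cn.iocoder.yudao.module.ai.service.websearch.vo.WebSearchRespVO;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class WebSearchUsageSketch {

    private final WebSearchService webSearchService;

    public WebSearchUsageSketch(WebSearchService webSearchService) {
        this.webSearchService = webSearchService;
    }

    public Map<String, String> searchAndCrawl(String query) {
        // Fetch the top 5 search results, then crawl each result URL.
        List<WebSearchRespVO> hits = webSearchService.googleSearch(query, 5);
        // Assumption: WebSearchRespVO exposes getUrl(); adjust to the real accessor.
        List<String> urls = hits.stream().map(WebSearchRespVO::getUrl).collect(Collectors.toList());
        return webSearchService.webCrawler(urls); // key: URL, value: page text
    }
}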

WebSearchServiceImpl.java (View File)

@@ -10,9 +10,12 @@ import cn.iocoder.yudao.module.ai.service.websearch.vo.WebSearchRespVO;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.jsoup.Jsoup;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Bing Web search implementation
@@ -136,4 +139,79 @@ public class WebSearchServiceImpl implements WebSearchService {
return CollUtil.newArrayList();
}
}
/**
* Web crawler
*
* @param urls the URLs to crawl
* @return map keyed by URL, with the crawled page text as the value
*/
@Override
public Map<String, String> webCrawler(List<String> urls) {
if (CollUtil.isEmpty(urls)) {
return Map.of();
}
Map<String, String> result = new HashMap<>();
for (String url : urls) {
try {
// Parse the URL to obtain the origin for the Origin/Referer headers
String origin = extractOrigin(url);
// Send an HTTP request to fetch the page content
HttpResponse response = HttpRequest.get(url)
.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
.header("Origin", origin)
.header("Referer", origin)
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
.header("Cache-Control", "max-age=0")
.timeout(10000) // 10-second timeout
.execute();
if (response.isOk()) {
String html = response.body();
// Parse the HTML with Jsoup and extract the text content
org.jsoup.nodes.Document doc = Jsoup.parse(html);
// Remove script/style/meta/link elements; their content is not the text we want
doc.select("script, style, meta, link").remove();
// Take the text content of the body
String text = doc.body().text();
// Collapse redundant whitespace
text = text.replaceAll("\\s+", " ").trim();
result.put(url, text);
} else {
log.warn("[webCrawler][URL({}) 请求失败,状态码: {}]", url, response.getStatus());
result.put(url, "");
}
} catch (Exception e) {
log.error("[webCrawler][URL({}) 爬取异常]", url, e);
result.put(url, "");
}
}
return result;
}
/**
* Extract the origin from a URL
*
* @param url the full URL
* @return the origin (scheme://host[:port])
*/
private String extractOrigin(String url) {
try {
java.net.URL parsedUrl = new java.net.URL(url);
return parsedUrl.getProtocol() + "://" + parsedUrl.getHost() +
(parsedUrl.getPort() == -1 ? "" : ":" + parsedUrl.getPort());
} catch (Exception e) {
log.warn("[extractOrigin][URL({}) 解析异常]", url, e);
return "";
}
}
}
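One possible refinement, not part of this commit: the HttpResponse above is never closed, so the underlying connection is held until garbage collection. A minimal sketch of the per-URL fetch using try-with-resources follows, under the assumption that the project's Hutool version has cn.hutool.http.HttpResponse implement Closeable (true for recent 5.x releases); the fetchPageText helper name is hypothetical.

// Sketch: per-URL fetch that releases the HTTP connection promptly.
// Assumes cn.hutool.http.HttpResponse implements Closeable (Hutool 5.x).
private String fetchPageText(String url) {
    try (HttpResponse response = HttpRequest.get(url)
            .header("User-Agent", "Mozilla/5.0")
            .timeout(10000) // 10-second timeout, matching webCrawler
            .execute()) {
        if (!response.isOk()) {
            return "";
        }
        // Same extraction as webCrawler: strip non-content tags, keep body text.
        org.jsoup.nodes.Document doc = Jsoup.parse(response.body());
        doc.select("script, style, meta, link").remove();
        return doc.body().text().replaceAll("\\s+", " ").trim();
    } catch (Exception e) {
        log.warn("[fetchPageText][URL({}) fetch failed]", url, e);
        return "";
    }
}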

WebSearchServiceTests.java (View File)

@@ -0,0 +1,21 @@
package cn.iocoder.yudao.module.ai.service;
import cn.iocoder.yudao.module.ai.service.websearch.WebSearchServiceImpl;
import com.google.common.collect.Lists;
import org.junit.jupiter.api.Test;
import java.util.Map;
public class WebSearchServiceTests {
@Test
public void webCrawlerTest() {
WebSearchServiceImpl webSearchService = new WebSearchServiceImpl();
Map<String, String> webCrawlerRes = webSearchService.webCrawler(
Lists.newArrayList("https://tianqi.eastday.com/changsha/40/"));
for (Map.Entry<String, String> entry : webCrawlerRes.entrySet()) {
System.err.println(entry.getValue());
}
}
}
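The test above hits a live site and only prints the result. A slightly stricter variant could assert on the returned map instead; a sketch of an additional test method (an assumption, not part of the commit; asserting on the text itself would be network-dependent and flaky in CI):

@Test
public void webCrawlerReturnsOneEntryPerUrl() {
    WebSearchServiceImpl webSearchService = new WebSearchServiceImpl();
    Map<String, String> res = webSearchService.webCrawler(
            Lists.newArrayList("https://tianqi.eastday.com/changsha/40/"));
    // webCrawler puts one entry per requested URL, with "" marking a failed fetch.
    org.junit.jupiter.api.Assertions.assertEquals(1, res.size());
}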