[Added] Web search, and a crawler to fetch web page content
parent 86801517d1
commit f638f90afc
@@ -3,6 +3,7 @@ package cn.iocoder.yudao.module.ai.service.websearch;

 import cn.iocoder.yudao.module.ai.service.websearch.vo.WebSearchRespVO;

 import java.util.List;
+import java.util.Map;

 /**
  * Web search Service interface
@@ -26,4 +27,12 @@ public interface WebSearchService {

      * @return list of search results
      */
     List<WebSearchRespVO> googleSearch(String query, Integer count);
+
+    /**
+     * Web crawler
+     *
+     * @param urls URLs to crawl
+     * @return key: URL; value: crawled page content
+     */
+    Map<String, String> webCrawler(List<String> urls);
 }
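For orientation, here is a minimal sketch of how a caller might chain the two interface methods: search first, then crawl the returned URLs. It assumes WebSearchRespVO exposes a getUrl() accessor and that an implementation is available; neither is shown in this diff.

    import java.util.List;
    import java.util.Map;
    import java.util.stream.Collectors;

    // Sketch only: WebSearchRespVO#getUrl() is an assumption, not confirmed by this diff.
    class SearchAndCrawlExample {

        Map<String, String> searchAndCrawl(WebSearchService service, String query) {
            // 1. Run the search and take the top 5 hits.
            List<WebSearchRespVO> hits = service.googleSearch(query, 5);
            // 2. Collect the result URLs (assumed accessor).
            List<String> urls = hits.stream()
                    .map(WebSearchRespVO::getUrl)
                    .collect(Collectors.toList());
            // 3. Crawl each URL; the result map is keyed by URL with the page text as value.
            return service.webCrawler(urls);
        }
    }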
@@ -10,9 +10,12 @@ import cn.iocoder.yudao.module.ai.service.websearch.vo.WebSearchRespVO;

 import lombok.extern.slf4j.Slf4j;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Service;
+import org.jsoup.Jsoup;

 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;

 /**
  * Bing Web search implementation class
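The HttpRequest/HttpResponse used by the new method below match Hutool's fluent HTTP client (cn.hutool.http); those imports are presumably already present above this hunk. A minimal standalone sketch of the same fetch-then-parse pipeline, under that assumption:

    import cn.hutool.http.HttpRequest;
    import cn.hutool.http.HttpResponse;
    import org.jsoup.Jsoup;

    class FetchTextExample {

        // Fetch a page with a 10s timeout and reduce its HTML to visible text.
        static String fetchText(String url) {
            try (HttpResponse resp = HttpRequest.get(url).timeout(10_000).execute()) {
                if (!resp.isOk()) {
                    return "";
                }
                org.jsoup.nodes.Document doc = Jsoup.parse(resp.body());
                doc.select("script, style, meta, link").remove();
                return doc.body().text().replaceAll("\\s+", " ").trim();
            }
        }
    }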
@@ -136,4 +139,79 @@ public class WebSearchServiceImpl implements WebSearchService {

             return CollUtil.newArrayList();
         }
     }
+
+    /**
+     * Web crawler
+     *
+     * @param urls URLs to crawl
+     * @return key: URL; value: crawled page content
+     */
+    @Override
+    public Map<String, String> webCrawler(List<String> urls) {
+        if (CollUtil.isEmpty(urls)) {
+            return Map.of();
+        }
+
+        Map<String, String> result = new HashMap<>();
+        for (String url : urls) {
+            try {
+                // Parse the URL to obtain its origin, used for the Origin/Referer headers
+                String origin = extractOrigin(url);
+
+                // Send an HTTP request to fetch the page content
+                HttpResponse response = HttpRequest.get(url)
+                        .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+                        .header("Origin", origin)
+                        .header("Referer", origin)
+                        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
+                        .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
+                        .header("Cache-Control", "max-age=0")
+                        .timeout(10000) // 10-second timeout
+                        .execute();
+
+                if (response.isOk()) {
+                    String html = response.body();
+
+                    // Parse the HTML with Jsoup to extract the text content
+                    org.jsoup.nodes.Document doc = Jsoup.parse(html);
+
+                    // Remove script/style/meta/link elements; they carry no visible text
+                    doc.select("script, style, meta, link").remove();
+
+                    // Get the text content of the body
+                    String text = doc.body().text();
+
+                    // Clean the text (collapse redundant whitespace)
+                    text = text.replaceAll("\\s+", " ").trim();
+
+                    result.put(url, text);
+                } else {
+                    log.warn("[webCrawler][URL({}) request failed, status code: {}]", url, response.getStatus());
+                    result.put(url, "");
+                }
+            } catch (Exception e) {
+                log.error("[webCrawler][URL({}) crawl failed]", url, e);
+                result.put(url, "");
+            }
+        }
+
+        return result;
+    }
+
+    /**
+     * Extract the origin from a URL
+     *
+     * @param url full URL
+     * @return origin (scheme://host[:port])
+     */
+    private String extractOrigin(String url) {
+        try {
+            java.net.URL parsedUrl = new java.net.URL(url);
+            return parsedUrl.getProtocol() + "://" + parsedUrl.getHost() +
+                    (parsedUrl.getPort() == -1 ? "" : ":" + parsedUrl.getPort());
+        } catch (Exception e) {
+            log.warn("[extractOrigin][URL({}) parse failed]", url, e);
+            return "";
+        }
+    }
 }
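Side note on extractOrigin: java.net.URL works here, but java.net.URI parses without triggering the URL class's legacy resolution machinery and yields the same result for this use. A small equivalent sketch, assuming the crawler only ever sees absolute http/https URLs:

    import java.net.URI;

    class OriginExample {

        // Same contract as extractOrigin above: scheme://host[:port], or "" on bad input.
        static String origin(String url) {
            try {
                URI u = new URI(url);
                return u.getScheme() + "://" + u.getHost()
                        + (u.getPort() == -1 ? "" : ":" + u.getPort());
            } catch (Exception e) {
                return "";
            }
        }

        public static void main(String[] args) {
            System.out.println(origin("https://example.com:8443/a/b"));           // https://example.com:8443
            System.out.println(origin("https://tianqi.eastday.com/changsha/40/")); // https://tianqi.eastday.com
        }
    }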
@@ -0,0 +1,21 @@

+package cn.iocoder.yudao.module.ai.service;
+
+import cn.iocoder.yudao.module.ai.service.websearch.WebSearchServiceImpl;
+import com.google.common.collect.Lists;
+import org.junit.jupiter.api.Test;
+
+import java.util.Map;
+
+public class WebSearchServiceTests {
+
+    @Test
+    public void webCrawlerTest() {
+        WebSearchServiceImpl webSearchService = new WebSearchServiceImpl();
+        Map<String, String> webCrawlerRes = webSearchService.webCrawler(
+                Lists.newArrayList("https://tianqi.eastday.com/changsha/40/"));
+
+        for (Map.Entry<String, String> entry : webCrawlerRes.entrySet()) {
+            System.err.println(entry.getValue());
+        }
+    }
+}
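The test above only prints the crawl output and asserts nothing, and it depends on a live external site. A hedged variant that at least fails when the crawl comes back empty (still network-dependent, so it may be flaky in CI):

    @Test
    public void webCrawlerReturnsNonEmptyText() {
        WebSearchServiceImpl webSearchService = new WebSearchServiceImpl();
        Map<String, String> res = webSearchService.webCrawler(
                Lists.newArrayList("https://tianqi.eastday.com/changsha/40/"));

        org.junit.jupiter.api.Assertions.assertEquals(1, res.size());
        // webCrawler stores "" as its failure marker, so an empty value means the fetch failed.
        org.junit.jupiter.api.Assertions.assertFalse(res.values().iterator().next().isEmpty());
    }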