Date: 2023-03-20 06:22:01 | Source: E-commerce
Preface: for everyday searching, Google and Baidu are already more than enough; the search implemented here is just a small exercise to make some of my own follow-up work easier.

The crawler itself is a single class, HttpClientCrawl:

```java
import com.chaojilaji.auto.autocode.generatecode.GenerateFile;
import com.chaojilaji.auto.autocode.standartReq.SendReq;
import com.chaojilaji.auto.autocode.utils.Json;
import com.chaojilaji.moneyframework.model.OnePage;
import com.chaojilaji.moneyframework.model.Word;
import com.chaojilaji.moneyframework.service.Nlp;
import com.chaojilaji.moneyframework.utils.DomainUtils;
import com.chaojilaji.moneyframework.utils.HtmlUtil;
import com.chaojilaji.moneyframework.utils.MDUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;

import java.io.*;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListSet;

public class HttpClientCrawl {

    private static Log logger = LogFactory.getLog(HttpClientCrawl.class);

    public Set<String> oldDomains = new ConcurrentSkipListSet<>();             // domains already crawled or loaded from disk
    public Map<String, OnePage> onePageMap = new ConcurrentHashMap<>(400000);
    public Set<String> ignoreSet = new ConcurrentSkipListSet<>();              // domains that should never be crawled
    public Map<String, Set<String>> siteMaps = new ConcurrentHashMap<>(50000); // keyword -> set of domains

    public String domain;

    public HttpClientCrawl(String domain) {
        this.domain = DomainUtils.getDomainWithCompleteDomain(domain);
        String[] ignores = {"gov.cn", "apac.cn", "org.cn", "twitter.com",
                "baidu.com", "google.com", "sina.com", "weibo.com",
                "github.com", "sina.com.cn", "sina.cn", "edu.cn", "wordpress.org", "sephora.com"};
        ignoreSet.addAll(Arrays.asList(ignores));
        loadIgnore();
        loadWord();
    }

    private Map<String, String> defaultHeaders() {
        Map<String, String> ans = new HashMap<>();
        ans.put("Accept", "application/json, text/plain, */*");
        ans.put("Content-Type", "application/json");
        ans.put("Connection", "keep-alive");
        ans.put("Accept-Language", "zh-CN,zh;q=0.9");
        ans.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36");
        return ans;
    }

    public SendReq.ResBody doRequest(String url, String method, Map<String, Object> params) {
        return SendReq.sendReq(url, method, params, defaultHeaders());
    }

    public void loadIgnore() {
        File directory = new File(".");
        try {
            String file = directory.getCanonicalPath() + "/moneyframework/generate/ignore/demo.txt";
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(file))));
            String line;
            while ((line = reader.readLine()) != null) {
                String x = line.replace("[", "").replace("]", "").replace(" ", "");
                ignoreSet.addAll(Arrays.asList(x.split(",")));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public void loadDomains(String file) {
        File directory = new File(".");
        try {
            File file1 = new File(directory.getCanonicalPath() + "/" + file);
            logger.info(directory.getCanonicalPath() + "/" + file);
            if (!file1.exists()) {
                file1.createNewFile();
            }
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file1)));
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                OnePage onePage = new OnePage(line);
                if (!oldDomains.contains(onePage.getDomain())) {
                    onePageMap.put(onePage.getDomain(), onePage);
                    oldDomains.add(onePage.getDomain());
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public void handleWord(List<String> s, String domain, String title) {
        // s holds entries of the form "word count"; keep words that occur at least 10 times
        for (String a : s) {
            String x = a.split(" ")[0];
            String y = a.split(" ")[1];
            Set<String> z = siteMaps.getOrDefault(x, new ConcurrentSkipListSet<>());
            if (Integer.parseInt(y) >= 10) {
                if (z.contains(domain)) continue;
                z.add(domain);
                siteMaps.put(x, z);
                GenerateFile.appendFileWithRelativePath("moneyframework/domain/markdown", x + ".md",
                        MDUtils.getMdContent(domain, title, s.toString()));
            }
        }
        // words from the title are always indexed, regardless of frequency
        Set<Word> titleWords = Nlp.separateWordAndReturnUnit(title);
        for (Word word : titleWords) {
            String x = word.getWord();
            Set<String> z = siteMaps.getOrDefault(x, new ConcurrentSkipListSet<>());
            if (z.contains(domain)) continue;
            z.add(domain);
            siteMaps.put(x, z);
            GenerateFile.appendFileWithRelativePath("moneyframework/domain/markdown", x + ".md",
                    MDUtils.getMdContent(domain, title, s.toString()));
        }
    }

    public void loadWord() {
        File directory = new File(".");
        try {
            File file1 = new File(directory.getCanonicalPath() + "/moneyframework/domain/markdown");
            if (file1.isDirectory()) {
                int fileCnt = 0;
                File[] files = file1.listFiles();
                for (File file : files) {
                    fileCnt++;
                    try {
                        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
                        String line;
                        siteMaps.put(file.getName().replace(".md", ""), new ConcurrentSkipListSet<>());
                        while ((line = reader.readLine()) != null) {
                            line = line.trim();
                            if (line.startsWith("####")) {
                                siteMaps.get(file.getName().replace(".md", "")).add(line.replace("#### ", "").trim());
                            }
                        }
                    } catch (Exception e) {
                    }
                    if ((fileCnt % 1000) == 0) {
                        logger.info((fileCnt * 100.0) / files.length + "%");
                    }
                }
            }
            for (Map.Entry<String, Set<String>> entry : siteMaps.entrySet()) {
                oldDomains.addAll(entry.getValue());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public void doTask() {
        String root = "http://" + this.domain + "/";
        Queue<String> urls = new LinkedList<>();
        urls.add(root);
        Set<String> tmpDomains = new HashSet<>();
        tmpDomains.addAll(oldDomains);
        tmpDomains.add(DomainUtils.getDomainWithCompleteDomain(root));
        int cnt = 0;
        while (!urls.isEmpty()) {
            String url = urls.poll();
            SendReq.ResBody html = doRequest(url, "GET", new HashMap<>());
            cnt++;
            if (html.getCode().equals(0)) {
                // the request failed; put the domain on the ignore list and persist it
                ignoreSet.add(DomainUtils.getDomainWithCompleteDomain(url));
                try {
                    GenerateFile.createFile2("moneyframework/generate/ignore", "demo.txt", ignoreSet.toString());
                } catch (IOException e) {
                    e.printStackTrace();
                }
                continue;
            }
            OnePage onePage = new OnePage();
            onePage.setUrl(url);
            onePage.setDomain(DomainUtils.getDomainWithCompleteDomain(url));
            onePage.setCode(html.getCode());
            String title = HtmlUtil.getTitle(html.getResponce()).trim();
            if (!StringUtils.hasText(title) || title.length() > 100 || title.contains("�")) {
                title = "没有"; // placeholder title for pages without a usable one
            }
            onePage.setTitle(title);
            String content = HtmlUtil.getContent(html.getResponce());
            Set<Word> words = Nlp.separateWordAndReturnUnit(content);
            List<String> wordStr = Nlp.print2List(new ArrayList<>(words), 10);
            handleWord(wordStr, DomainUtils.getDomainWithCompleteDomain(url), title);
            onePage.setContent(wordStr.toString());
            if (html.getCode().equals(200)) {
                List<String> domains = HtmlUtil.getUrls(html.getResponce());
                for (String domain : domains) {
                    int flag = 0;
                    // skip domains with four or more labels; the dot must be escaped because split takes a regex
                    String[] parts = domain.split("\\.");
                    if (parts.length >= 4) {
                        continue;
                    }
                    for (String i : ignoreSet) {
                        if (domain.endsWith(i)) {
                            flag = 1;
                            break;
                        }
                    }
                    if (flag == 1) continue;
                    if (StringUtils.hasText(domain.trim())) {
                        if (!tmpDomains.contains(domain)) {
                            tmpDomains.add(domain);
                            urls.add("http://" + domain + "/");
                        }
                    }
                }
                logger.info(this.domain + " 队列的大小为 " + urls.size());
                if (cnt >= 2000) {
                    break;
                }
            } else {
                // non-200 over http: retry the same URL over https
                if (url.startsWith("http:")) {
                    urls.add(url.replace("http:", "https:"));
                }
            }
        }
    }
}
```
Here, SendReq.sendReq is a method I implemented myself for downloading a page; it wraps HttpClient calls (a minimal sketch of such a wrapper is given after the HtmlUtil class below). If you want to crawl Web 2.0 pages that are rendered by JavaScript, consider wrapping Playwright inside it instead.

Next comes HtmlUtil, the utility class that formats the HTML: it strips tags and cleans up the garbled text caused by special characters.

```java
import org.apache.commons.lang3.StringEscapeUtils;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class HtmlUtil {

    public static String getContent(String html) {
        String ans = "";
        try {
            html = StringEscapeUtils.unescapeHtml4(html);
            html = delHTMLTag(html);
            html = htmlTextFormat(html);
            return html;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return ans;
    }

    public static String delHTMLTag(String htmlStr) {
        String regEx_script = "<script[^>]*?>[\\s\\S]*?</script>"; // regex for <script> blocks
        String regEx_style = "<style[^>]*?>[\\s\\S]*?</style>";    // regex for <style> blocks
        String regEx_html = "<[^>]+>";                             // regex for any remaining HTML tag

        Pattern p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
        Matcher m_script = p_script.matcher(htmlStr);
        htmlStr = m_script.replaceAll(""); // strip <script> blocks

        Pattern p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
        Matcher m_style = p_style.matcher(htmlStr);
        htmlStr = m_style.replaceAll("");  // strip <style> blocks

        Pattern p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
        Matcher m_html = p_html.matcher(htmlStr);
        htmlStr = m_html.replaceAll("");   // strip the remaining tags

        return htmlStr.trim();
    }

    public static String htmlTextFormat(String htmlText) {
        return htmlText
                .replaceAll(" +", " ")
                .replaceAll("\n", " ")
                .replaceAll("\r", " ")
                .replaceAll("\t", " ")
                // the literals below are assorted special space and decorative characters scrubbed from page text
                .replaceAll(" ", " ").replaceAll(" ", " ").replaceAll(" ", " ").replaceAll(" ", " ")
                .replaceAll(" ", " ").replaceAll(" ", " ").replaceAll(" ", " ").replaceAll(" ", " ")
                .replaceAll(" ", " ").replaceAll(" ", " ").replaceAll(" ", " ").replaceAll(" ", " ")
                .replaceAll(" • ", " ")
                .replaceAll("⎛⎝", " ")
                .replaceAll("⎠⎞", " ")
                .replaceAll(" ", " ")
                .replaceAll("!!", " ")
                .replaceAll("✔ ", " ");
    }

    public static List<String> getUrls(String htmlText) {
        Pattern pattern = Pattern.compile("(http|https):\\/\\/[A-Za-z0-9_\\-\\+.:?&@=\\/%#,;]*");
        Matcher matcher = pattern.matcher(htmlText);
        Set<String> ans = new HashSet<>();
        while (matcher.find()) {
            // note: what is collected is the domain of each matched URL, not the full URL
            ans.add(DomainUtils.getDomainWithCompleteDomain(matcher.group()));
        }
        return new ArrayList<>(ans);
    }

    public static String getTitle(String htmlText) {
        Pattern pattern = Pattern.compile("(?<=title\\>).*(?=</title)");
        Matcher matcher = pattern.matcher(htmlText);
        while (matcher.find()) {
            return matcher.group();
        }
        return "";
    }
}
```
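SendReq itself (including its ResBody) is not shown in this post. Purely as an illustration of what an HttpClient-based wrapper with the same shape could look like, here is a minimal sketch; the ResBody field names are inferred from the getCode()/getResponce() calls in the crawler, and the "failure is reported as code 0" convention is taken from doTask(), so treat this as an assumption rather than the actual implementation:

```java
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.nio.charset.StandardCharsets;
import java.util.Map;

public class SendReq {

    // mirrors only what the crawler reads: getCode() and getResponce()
    public static class ResBody {
        private Integer code = 0;      // 0 means the request failed
        private String responce = "";  // spelling kept to match the crawler code

        public Integer getCode() { return code; }
        public String getResponce() { return responce; }
    }

    // only the GET path is sketched; params are ignored here
    public static ResBody sendReq(String url, String method, Map<String, Object> params, Map<String, String> headers) {
        ResBody res = new ResBody();
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            HttpGet get = new HttpGet(url);
            headers.forEach(get::setHeader);
            try (CloseableHttpResponse response = client.execute(get)) {
                res.code = response.getStatusLine().getStatusCode();
                res.responce = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
            }
        } catch (Exception e) {
            res.code = 0; // any failure becomes code 0, which doTask() treats as "ignore this domain"
        }
        return res;
    }
}
```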
Besides removing tags and special characters as described above, HtmlUtil also provides methods for extracting all URLs and the page title (several Java libraries, jsoup for example, offer the same functionality out of the box).

```java
public void handleWord(List<String> s, String domain, String title) {
    // s holds entries of the form "word count"; keep words that occur at least 10 times
    for (String a : s) {
        String x = a.split(" ")[0];
        String y = a.split(" ")[1];
        Set<String> z = siteMaps.getOrDefault(x, new ConcurrentSkipListSet<>());
        if (Integer.parseInt(y) >= 10) {
            if (z.contains(domain)) continue;
            z.add(domain);
            siteMaps.put(x, z);
            GenerateFile.appendFileWithRelativePath("moneyframework/domain/markdown", x + ".md",
                    MDUtils.getMdContent(domain, title, s.toString()));
        }
    }
    // words from the title are always indexed, regardless of frequency
    Set<Word> titleWords = Nlp.separateWordAndReturnUnit(title);
    for (Word word : titleWords) {
        String x = word.getWord();
        Set<String> z = siteMaps.getOrDefault(x, new ConcurrentSkipListSet<>());
        if (z.contains(domain)) continue;
        z.add(domain);
        siteMaps.put(x, z);
        GenerateFile.appendFileWithRelativePath("moneyframework/domain/markdown", x + ".md",
                MDUtils.getMdContent(domain, title, s.toString()));
    }
}
```
The storage logic is this handleWord method: s is the word-segmentation result for one page (word offsets are not stored, so strictly speaking this is not a true inverted index), domain is the domain name itself, and title is the page title. It relies on GenerateFile, a custom utility class for creating files; part of it is shown below:

```java
public static void createFileRecursion(String fileName, Integer height) throws IOException {
    Path path = Paths.get(fileName);
    if (Files.exists(path)) {
        // the file already exists, nothing to do
        return;
    }
    if (Files.exists(path.getParent())) {
        // the parent directory exists, so create the file (or directory) directly
        if (height == 0) {
            Files.createFile(path);
        } else {
            Files.createDirectory(path);
        }
    } else {
        createFileRecursion(path.getParent().toString(), height + 1);
        // at this point the parent is guaranteed to exist, so create the current path as well
        createFileRecursion(fileName, height);
    }
}

public static void appendFileWithRelativePath(String folder, String fileName, String value) {
    File directory = new File(".");
    try {
        fileName = directory.getCanonicalPath() + "/" + folder + "/" + fileName;
        createFileRecursion(fileName, 0);
    } catch (IOException e) {
        e.printStackTrace();
    }
    try {
        BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(new FileOutputStream(fileName, true));
        bufferedOutputStream.write(value.getBytes());
        bufferedOutputStream.flush();
        bufferedOutputStream.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
```
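MDUtils.getMdContent is not shown in the post either. Since loadWord() later re-reads these .md files and treats every line starting with "#### " as a domain entry, a sketch consistent with that format might look like the following; everything beyond the "#### " prefix is an assumption:

```java
public class MDUtils {
    // Sketch only: writes one markdown fragment per indexed page.
    // loadWord() only cares about lines beginning with "#### ", which carry the domain.
    public static String getMdContent(String domain, String title, String keywords) {
        StringBuilder sb = new StringBuilder();
        sb.append("#### ").append(domain).append("\n"); // the line loadWord() parses back
        sb.append(title).append("\n");
        sb.append(keywords).append("\n\n");
        return sb.toString();
    }
}
```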
The word segmentation itself is handled by the Nlp class, built on HanLP:

```java
import com.chaojilaji.moneyframework.model.Word;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Nlp {

    // digits, punctuation and full-width symbols that should not be counted as words
    private static Pattern ignoreWords = Pattern.compile("[,.0-9_\\- ,、:。;;\\]\\[\\/!()【】*?“”()+:|\"%~<>——]+");

    public static Set<Word> separateWordAndReturnUnit(String text) {
        Segment segment = HanLP.newSegment().enableOffset(true);
        Set<Word> detectorUnits = new HashSet<>();
        Map<Integer, Word> detectorUnitMap = new HashMap<>();
        List<Term> terms = segment.seg(text);
        for (Term term : terms) {
            Matcher matcher = ignoreWords.matcher(term.word);
            if (!matcher.find() && term.word.length() > 1 && !term.word.contains("�")) {
                Integer hashCode = term.word.hashCode();
                Word detectorUnit = detectorUnitMap.get(hashCode);
                if (Objects.nonNull(detectorUnit)) {
                    detectorUnit.setCount(detectorUnit.getCount() + 1);
                } else {
                    detectorUnit = new Word();
                    detectorUnit.setWord(term.word.trim());
                    detectorUnit.setCount(1);
                    detectorUnitMap.put(hashCode, detectorUnit);
                    detectorUnits.add(detectorUnit);
                }
            }
        }
        return detectorUnits;
    }

    public static List<String> print2List(List<Word> tmp, int cnt) {
        PriorityQueue<Word> words = new PriorityQueue<>();
        List<String> ans = new ArrayList<>();
        for (Word word : tmp) {
            words.add(word);
        }
        int count = 0;
        while (!words.isEmpty()) {
            Word word = words.poll();
            if (word.getCount() < 50) {
                // each entry is "word count", which handleWord later splits on the space
                ans.add(word.getWord() + " " + word.getCount());
                count++;
                if (count >= cnt) {
                    break;
                }
            }
        }
        return ans;
    }
}
```
Here, separateWordAndReturnUnit segments the text with HanLP and counts word frequencies; the Word structure it returns looks like this:

```java
public class Word implements Comparable {
    private String word;
    private Integer count = 0;
    ... ...
    @Override
    public int compareTo(Object o) {
        // sort by count in descending order (never returns 0, even for equal counts)
        if (this.count >= ((Word) o).count) {
            return -1;
        } else {
            return 1;
        }
    }
}
```
The print2List method sorts the list and emits the top entries as "word count" strings. The built-in sort would work just as well; I used a priority queue on the hunch that a max-heap might be cheaper than a full quicksort for picking only the top few entries, but with this little data it is really over-engineering.

On the query side, the index is exposed through a few Spring MVC endpoints. The first one returns the keywords recorded for a given domain:

```java
@GetMapping("/api/v1/keywords")
@ResponseBody
public String getKeyWords(String domain) {
    try {
        Site site = demoService.stringSiteMap.get(DomainUtils.getDomainWithCompleteDomain(domain));
        if (Objects.nonNull(site)) {
            String keyWords = site.getKeywords();
            keyWords = keyWords.replace("[", "").replace("]", "");
            String[] keyWordss = keyWords.split(", ");
            StringBuffer ans = new StringBuffer();
            for (int i = 0; i < keyWordss.length; i++) {
                ans.append(keyWordss[i]).append("\n");
            }
            return ans.toString();
        }
    } catch (Exception e) {
    }
    return "该网站没有入库"; // "this site has not been indexed"
}
```
@GetMapping("/api/v1/relations")@ResponseBodypublic String getRelationDomain(String domain) { try { Site site = demoService.stringSiteMap.get(DomainUtils.getDomainWithCompleteDomain(domain)); String keyWords = site.getKeywords(); keyWords = keyWords.replace("[", "").replace("]", ""); String[] keyWordss = keyWords.split(", "); Set<String> tmp = new HashSet<>(); int cnt = 0; for (int i = 0; i < keyWordss.length; i++) { String keyword = keyWordss[i]; String key = keyword.split(" ")[0]; if (IgnoreUtils.checkIgnore(key)) continue; cnt++; Set<String> x = demoService.siteMaps.get(key); if (Objects.nonNull(x)) { for (String y : x) { String yy = demoService.stringSiteMap.get(y).getKeywords(); int l = yy.indexOf(key); if (l != -1) { String yyy = ""; int flag = 0; for (int j = l; j < yy.length(); j++) { if (yy.charAt(j) == ',' || yy.charAt(j) == ']') { break; } if (flag == 1) { yyy = yyy + yy.charAt(j); } if (yy.charAt(j) == ' ') { flag = 1; } } if (Integer.parseInt(yyy) >= 20) { tmp.add(y + "----" + key + "----" + yyy); } } else { // Boolean titleContains = demoService.stringSiteMap.get(y).getTitle().contains(key); // if (titleContains) { // tmp.add(y + "----" + key + "----标题含有"); // } } } } if (cnt >= 4) { break; } } StringBuffer ans = new StringBuffer(); for (String s : tmp) { ans.append("<a href=/"http://" + s.split("----")[0] + "/">" + s + "</a><br>"); } return ans.toString(); } catch (Exception e) { // e.printStackTrace(); } return "该网站暂无相似网站";}
@GetMapping("/api/v1/keyresult")@ResponseBodypublic String getKeyResult(String key, String key2, String key3,Integer page, Integer size) { Set<String> x = new HashSet<>(demoService.siteMaps.get(key)); if (StringUtils.hasText(key2)) { key2 = key2.trim(); if (StringUtils.hasText(key2)){ Set<String> x2 = demoService.siteMaps.get(key2); x.retainAll(x2); } } if (StringUtils.hasText(key3)) { key3 = key3.trim(); if (StringUtils.hasText(key3)){ Set<String> x3 = demoService.siteMaps.get(key3); x.retainAll(x3); } } if (Objects.nonNull(x) && x.size() > 0) { Set<String> tmp = new HashSet<>(); for (String y : x) { String yy = demoService.stringSiteMap.get(y).getKeywords(); int l = yy.indexOf(key); if (l != -1) { String yyy = ""; int flag = 0; for (int j = l; j < yy.length(); j++) { if (yy.charAt(j) == ',') { break; } if (flag == 1) { yyy = yyy + yy.charAt(j); } if (yy.charAt(j) == ' ') { flag = 1; } } tmp.add(y + "----" + demoService.stringSiteMap.get(y).getTitle() + "----" + key + "----" + yyy); } else { Boolean titleContains = demoService.stringSiteMap.get(y).getTitle().contains(key); if (titleContains) { tmp.add(y + "----" + demoService.stringSiteMap.get(y).getTitle() + "----" + key + "----标题含有"); } } } StringBuffer ans = new StringBuffer(); List<String> temp = new ArrayList<>(tmp); for (int i = (page - 1) * size; i < temp.size() && i < page * size; i++) { String s = temp.get(i); ans.append("<a href=/"http://" + s.split("----")[0] + "/" style=/"font-size: 20px/">" + s.split("----")[1] + "</a> <p style=/"font-size: 15px/">" + s.split("----")[0] + " " + s.split("----")[3] + "</p><hr color=/"silver/" size=1/>"); } return ans.toString(); } return "暂未收录";}
@GetMapping("/api/v1/demo")@ResponseBodypublic void demo(String key) { new Thread(new Runnable() { @Override public void run() { HttpClientCrawl clientCrawl = new HttpClientCrawl(key); try { clientCrawl.doTask(); } catch (Exception e) { e.printStackTrace(); } finally { clientCrawl.oldDomains.clear(); clientCrawl.siteMaps.clear(); clientCrawl.onePageMap.clear(); clientCrawl.ignoreSet.clear(); } } }).start();}
This is an informal side project, so the code is fairly rough and casual; please bear with it. Keywords: index, domain, vertical