Date: 2023-03-13 22:54:01 | Source: E-commerce
Today I stumbled on a link to Renren and remembered that just a few years ago it was a wildly popular social network for university students. After logging into my old account I found the site now pushes livestreaming and is no longer the simple social network it used to be. Its data is still interesting, though: you can crawl the social profiles of people you don't know, such as gender and birthday, the school they attend, and their friend list. That data could be used to draw a social relationship graph, or even to test the six-degrees-of-separation theory, which is exciting to think about. So I started looking into how to crawl Renren's data.

package http;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import util.RegexUtil;

import java.util.*;

/**
 * Created by on 2017/12/23.
 */
public class HttpUtil {

    // Issue a GET request with the given headers and return the response body as a string.
    public String doGet(String url, Map<String, String> headers) {
        HttpClient httpClient;
        HttpGet httpGet;
        String result = null;
        try {
            httpClient = HttpClients.createDefault();
            httpGet = new HttpGet(url);
            // Attach every header; the Cookie header carries the login session.
            for (Map.Entry<String, String> entry : headers.entrySet()) {
                httpGet.addHeader(entry.getKey(), entry.getValue());
            }
            HttpResponse response = httpClient.execute(httpGet);
            if (response != null) {
                HttpEntity resEntity = response.getEntity();
                if (resEntity != null) {
                    result = EntityUtils.toString(resEntity, "utf8");
                }
            }
        } catch (Exception ex) {
            System.out.println("http get failed");
        }
        return result;
    }

    // Issue a POST request with url-encoded form parameters and return the response body.
    public String doPost(String url, Map<String, String> params, Map<String, String> headers) {
        HttpClient httpClient;
        HttpPost httpPost;
        String result = null;
        try {
            httpClient = HttpClients.createDefault();
            httpPost = new HttpPost(url);
            for (Map.Entry<String, String> entry : headers.entrySet()) {
                httpPost.addHeader(entry.getKey(), entry.getValue());
            }
            // Encode the form parameters as the request body.
            List<NameValuePair> list = new ArrayList<NameValuePair>();
            for (Map.Entry<String, String> elem : params.entrySet()) {
                list.add(new BasicNameValuePair(elem.getKey(), elem.getValue()));
            }
            if (list.size() > 0) {
                UrlEncodedFormEntity entity = new UrlEncodedFormEntity(list, "utf8");
                httpPost.setEntity(entity);
            }
            HttpResponse response = httpClient.execute(httpPost);
            if (response != null) {
                HttpEntity resEntity = response.getEntity();
                if (resEntity != null) {
                    result = EntityUtils.toString(resEntity, "utf8");
                }
            }
        } catch (Exception ex) {
            System.out.println("http post failed");
        }
        return result;
    }

    public static void main(String[] args) {
        HttpUtil util = new HttpUtil();
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("Cookie", "your cookie string");
        String htmlStr = util.doGet("http://www.renren.com/494871890/profile", headers);
        Map<String, String> userInfo = RegexUtil.groupUserInfo(htmlStr);
        System.out.println(userInfo);
    }
}
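One caveat about HttpUtil as written: HttpClients.createDefault() actually returns a CloseableHttpClient, and neither the client nor the response is ever closed, so a long crawl will leak connections. Below is a minimal sketch of the same GET logic with explicit resource management, assuming HttpClient 4.3+ and Java 7+; the class name ClosingHttpUtil is just for illustration.

package http;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.util.Map;

public class ClosingHttpUtil {
    // Same GET logic as HttpUtil.doGet, but try-with-resources guarantees the
    // client and the response are closed even if parsing throws.
    public String doGet(String url, Map<String, String> headers) throws Exception {
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            HttpGet get = new HttpGet(url);
            for (Map.Entry<String, String> entry : headers.entrySet()) {
                get.addHeader(entry.getKey(), entry.getValue());
            }
            try (CloseableHttpResponse response = client.execute(get)) {
                return EntityUtils.toString(response.getEntity(), "utf8");
            }
        }
    }
}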
The RegexUtil helper that the main method above relies on looks like this:

package util;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by on 2017/12/24.
 */
public class RegexUtil {

    // Apply a regex to a string and return its capture groups, or null if the
    // pattern does not match or captures a different number of groups.
    public static List<String> getValByReg(String str, String reg, int valueNum) {
        List<String> rets = null;
        Pattern p = Pattern.compile(reg);
        Matcher m = p.matcher(str);
        if (m.find()) {
            if (m.groupCount() == valueNum) {
                rets = new ArrayList<String>();
                for (int i = 1; i <= valueNum; i++)
                    rets.add(m.group(i));
            }
        }
        return rets;
    }

    /*
     * Extract the user info shown in the title bar of the profile page.
     **/
    public static Map<String, String> groupUserInfo(String htmlStr) {
        Map<String, String> retMap = new HashMap<String, String>();
        try {
            // Grab the div that holds the user info.
            String infoReg = "(<div class=\"tl-information\"[\\s\\S]+</div>)";
            List<String> rets = RegexUtil.getValByReg(htmlStr, infoReg, 1);
            if (rets == null || rets.size() < 1) {
                return retMap;
            }
            // Match the individual fields inside the div.
            String div = rets.get(0).replace("\n", "");
            Pattern p = Pattern.compile("<li class=\"(.*?)\">(.*?)</li>");
            Matcher m = p.matcher(div);
            while (m.find()) {
                String attr = m.group(1);
                String value = m.group(2).replaceAll("(<span>|</span>| )", "");
                retMap.put(attr, value);
            }
        } catch (Exception e) {
            return retMap;
        }
        return retMap;
    }
}
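To see what groupUserInfo does, here is a quick self-contained check against a hand-written HTML fragment. The fragment only mimics what the profile sidebar appears to look like; the real markup may differ.

// Hypothetical fragment mimicking the profile sidebar markup.
String sample = "<div class=\"tl-information\"><ul>"
        + "<li class=\"address\"><span>现居重庆</span></li>"
        + "<li class=\"school\"><span>就读于重庆邮电大学</span></li>"
        + "</ul></div>";
System.out.println(RegexUtil.groupUserInfo(sample));
// prints the two entries (map order may vary):
// {address=现居重庆, school=就读于重庆邮电大学}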
Running the main method of HttpUtil above prints:

{birthday=女生,二月十二日, hometown=来自河南商丘市, address=现居重庆, school=就读于重庆邮电大学}
That means we already have a tool that, given an id, fetches the title-bar information from a user's profile page. Next, a bean to hold the user data:

package renren;

import java.util.Set;

/**
 * Created by on 2017/12/24.
 */
public class User {
    private String renrenId;          // Renren id
    private String renrenName;        // Renren nickname
    private String renrenInfo;        // profile title-bar info
    private int visitTime;            // number of visits to this user's profile page
    private Set<String> allFriendsId; // Renren ids of all of this user's friends

    public User() {
    }

    public User(String renrenId, String renrenName, String renrenInfo, int visitTime, Set<String> allFriendsId) {
        this.renrenId = renrenId;
        this.renrenName = renrenName;
        this.renrenInfo = renrenInfo;
        this.visitTime = visitTime;
        this.allFriendsId = allFriendsId;
    }

    public String getRenrenId() {
        return renrenId;
    }

    public void setRenrenId(String renrenId) {
        this.renrenId = renrenId;
    }

    public String getRenrenInfo() {
        return renrenInfo;
    }

    public void setRenrenInfo(String renrenInfo) {
        this.renrenInfo = renrenInfo;
    }

    public int getVisitTime() {
        return visitTime;
    }

    public void setVisitTime(int visitTime) {
        this.visitTime = visitTime;
    }

    public String getRenrenName() {
        return renrenName;
    }

    public void setRenrenName(String renrenName) {
        this.renrenName = renrenName;
    }

    public Set<String> getAllFriendsId() {
        return allFriendsId;
    }

    public void setAllFriendsId(Set<String> allFriendsId) {
        this.allFriendsId = allFriendsId;
    }

    @Override
    public String toString() {
        return String.format("%s---%s---%s---%d", renrenId, renrenName, renrenInfo, visitTime);
    }
}
Note that the bean also defines fields for the user's nickname, the profile page visit count, and the set of friend ids. How the friend-id set is obtained will be covered later on; it is a bit more involved. For now, two more extraction helpers are added to RegexUtil, one for the nickname and one for the visit count:

package util;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by on 2017/12/24.
 */
public class RegexUtil {

    /*
     * Extract the user's nickname from the page title.
     **/
    public static String groupUsername(String htmlStr) {
        String nameReg = "<title>人人网 - (.*)</title>";
        List<String> rets = RegexUtil.getValByReg(htmlStr, nameReg, 1);
        if (rets != null && rets.size() > 0)
            return rets.get(0);
        return null;
    }

    // Extract the number of visits to the user's profile page.
    public static int groupVisitTime(String htmlStr) {
        // String is immutable, so the stripped result has to be reassigned.
        htmlStr = htmlStr.replace("\n", "");
        String visitTimeReg = "(<div id=\"footprint-box\"[\\s\\S]*</h5>)";
        List<String> rets = RegexUtil.getValByReg(htmlStr, visitTimeReg, 1);
        if (rets != null && rets.size() > 0) {
            String div = rets.get(0).replace("\n", "");
            String visitTime = div.replaceAll(".*最近来访.*?(\\d+).*", "$1");
            if (visitTime.matches("\\d+"))
                return Integer.parseInt(visitTime);
        }
        return 0;
    }
}
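Again, a quick check of groupVisitTime against a made-up fragment of the visitor counter; the real page structure may differ.

// Hypothetical fragment mimicking the "recent visitors" box.
String sample = "<div id=\"footprint-box\"><h5>最近来访62</h5>";
System.out.println(RegexUtil.groupVisitTime(sample));
// prints: 62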
We can now obtain the user id, nickname, title-bar info (address, university, and so on), and the profile visit count. Let's write a main class to try it all out:

package http;

import net.sf.json.JSONObject;
import renren.User;
import util.RegexUtil;

import java.util.HashMap;
import java.util.Map;

/**
 * Created by on 2017/12/23.
 */
public class Test {
    public static void main(String[] args) {
        HttpUtil util = new HttpUtil();
        Map<String, String> headers = new HashMap<String, String>();
        // Cookie of a logged-in session
        headers.put("Cookie", "your cookie");
        String renrenId = "494871890";
        String htmlStr = util.doGet("http://www.renren.com/" + renrenId + "/profile", headers);
        // Extract the title-bar info and serialize it as JSON.
        Map<String, String> userInfo = RegexUtil.groupUserInfo(htmlStr);
        JSONObject jsonObj = JSONObject.fromObject(userInfo);
        String renrenInfo = jsonObj.toString();
        // Extract the user's nickname.
        String userName = RegexUtil.groupUsername(htmlStr);
        // Extract the profile page visit count.
        int visitTime = RegexUtil.groupVisitTime(htmlStr);
        User user = new User(renrenId, userName, renrenInfo, visitTime, null);
        System.out.println(user);
    }
}
The output is:

494871890---李瑶玉---{"birthday":"女生,二月十二日","hometown":"来自河南商丘市","address":"现居重庆","school":"就读于重庆邮电大学"}---62
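If you later extend this main method to walk over many ids, it is worth pausing between requests so you don't hammer the site. A minimal sketch; the second id is a placeholder, not a real account, and main must declare throws InterruptedException for Thread.sleep.

String[] ids = {"494871890", "placeholder-id"};  // second id is made up
for (String id : ids) {
    String html = util.doGet("http://www.renren.com/" + id + "/profile", headers);
    // ...parse html with RegexUtil exactly as above...
    Thread.sleep(1000);  // be polite: roughly one request per second
}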
So we can now take an id, fetch the user's information, and fill in the bean successfully. The remaining pieces, starting with the friend list, will be covered in a follow-up.
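As a preview of where this is heading: once each User's allFriendsId set has been crawled, checking the six-degrees-of-separation idea boils down to a breadth-first search over the id graph. Below is a sketch under that assumption; the adjacency map is presumed to be already filled, and collecting it is the subject of the follow-up.

import java.util.*;

public class DegreeUtil {
    // Breadth-first search: returns the number of friendship hops between two
    // ids, or -1 if they are not connected in the crawled graph.
    public static int degreesOfSeparation(Map<String, Set<String>> graph, String from, String to) {
        if (from.equals(to)) return 0;
        Set<String> visited = new HashSet<String>(Collections.singleton(from));
        Queue<String> queue = new LinkedList<String>(Collections.singleton(from));
        int depth = 0;
        while (!queue.isEmpty()) {
            depth++;
            // Expand one whole level per pass so 'depth' counts hops.
            for (int i = queue.size(); i > 0; i--) {
                Set<String> friends = graph.get(queue.poll());
                if (friends == null) continue;
                for (String next : friends) {
                    if (next.equals(to)) return depth;
                    if (visited.add(next)) queue.add(next);
                }
            }
        }
        return -1;
    }
}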