Date: 2023-05-20 08:42:02 | Source: Website Operations
Implementing a Sina Weibo Crawler (with Core Python Code): How to Scrape Sina Weibo Data?

The first approach is to simulate the login process directly: POST the username and password to the login endpoint, and let a requests session keep whatever cookies the server sets.

# Imports used by all the snippets in this post
import re
import json
import requests
import pymysql
from datetime import datetime
from bs4 import BeautifulSoup

# Login credentials (masked in the original post)
payload = {
    'username': '156****1997',
    'password': '**********'
}

# Request headers
header_init = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'close',
    'Referer': 'https://weibo.com/askcliff?is_all=1'
}

# Weibo login page URL
url_login = 'https://passport.weibo.cn/signin/login'

# Create a session object so cookies persist across requests
s = requests.Session()

# Submit the username and password as a POST request
s.post(url=url_login, data=payload, headers=header_init)
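As a quick sanity check (my own addition, not from the original post), you can capture the login response and inspect the session's cookie jar right after the POST:

# Sketch: capture the login response and see which cookies the session kept.
res = s.post(url=url_login, data=payload, headers=header_init)
print(res.status_code)        # 200 alone does not guarantee success; check the body too
print(s.cookies.get_dict())   # non-empty if the server set session cookies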
The other approach is to simulate a logged-in browser by reusing the browser's own cookies. These are easy to find in the browser's developer tools, in the Request Headers of an XHR/JS request; note that they must be stored in a variable as a dictionary.

# URL template for the content pages to scrape
# (url_base is the API base URL built earlier in the original post)
url_init = url_base + '&page={}'

# Cookie contents copied from the browser
cookie = {
    'MLOGIN': '1',
    'M_WEIBOCN_PARAMS': 'luicode%3D10000011%26lfid%3D100103type%253D3%2526q%253D%25E5%25AF%2585%25E5%25AD%2590%2526t%253D0%26featurecode%3D20000320%26oid%3D3900004206849232%26fid%3D1005053628359543%26uicode%3D10000011&page={}',
    'SCF': 'AkhONeuuFcGAlAz0kgavz1wRbp1fz7ZGn0Xn_zPHzoa0B_VbPTNxInVDSaycKttiCUPGwlxaxxqJG',
    'SUB': '_2A252C9fsDeRhGeNI41QZ-CrEyzqIHXVV9_mkrDV6PUJbkdAKLW_CkW1NSDUIJok_9iLiEAocyWlucWgHT-UKNQiO',
    'SUHB': '0A0JidXol5dQPP',
    'WEIBOCN_FROM': '1110006030',
    '_T_WM': '28789df2dacda9b86d0a2ffa60adbfe8',
}
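With the cookie dictionary in hand, fetching a page is a single requests.get call. A minimal sketch (my own, assuming url_init and header_init are defined as above):

# Sketch: fetch page 1 with the browser cookies and confirm we get JSON back.
res = requests.get(url=url_init.format(1), cookies=cookie, headers=header_init)
print(res.status_code)
print(res.text[:200])   # should start with JSON, not a login page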
In general, the cookie-based browser simulation takes less time than the simulated login.
The complete scraping routine is the PageData function below. Its trickiest part is extracting the posting date: the many conditionals are needed because Sina Weibo presents timestamps in non-standard forms such as "1 hour ago", "today", or "5 days ago" (the standard form being YYYY-MM-DD).

def PageData(url, cookie, header):
    # Request the page to scrape
    res = requests.get(url=url, cookies=cookie, headers=header)
    # Parse the JSON body
    jd = json.loads(res.text)
    count = -1
    result = {}
    result['id'] = []
    result['微博用户'] = []
    result['发博日期'] = []
    result['微博文本'] = []
    result['附带图片链接'] = []
    result['微博链接'] = []
    result['点赞数'] = []
    result['评论数'] = []
    result['转发数'] = []
    for i in jd['data']['cards']:
        count = count + 1
        if jd['data']['cards'][count]['card_type'] == 9:
            # Status id
            result['id'].append(jd['data']['cards'][count]['mblog']['id'])
            # Poster's screen name
            screen_name = jd['data']['cards'][count]['mblog']['user']['screen_name']
            result['微博用户'].append(screen_name)
            # Posting time: count the dashes to tell the three formats apart
            a = jd['data']['cards'][count]['mblog']['created_at'].count('-')
            if a == 2:
                # Full date, e.g. 2017-12-31
                date = datetime.strptime(jd['data']['cards'][count]['mblog']['created_at'], '%Y-%m-%d')
                datestr = date.strftime('%Y-%m-%d')
            if a == 1:
                # Month and day only, e.g. 05-20; the year (2018) is assumed
                date = datetime.strptime(jd['data']['cards'][count]['mblog']['created_at'], '%m-%d')
                datestr = date.strftime('2018' + '-%m-%d')
            if a == 0:
                # Relative forms like "1小时前" contain no dashes
                datestr = '今天或昨天'
            result['发博日期'].append(datestr)
            # Status text: strip emoji, then strip HTML tags
            text = jd['data']['cards'][count]['mblog']['text']
            text = filter_emoji(text, restr='')
            soup = BeautifulSoup(text, 'html.parser')
            text = soup.get_text()
            result['微博文本'].append(text)
            # Link to the attached image, if any
            if 'original_pic' in jd['data']['cards'][count]['mblog'].keys():
                original_pic = jd['data']['cards'][count]['mblog']['original_pic']
            else:
                original_pic = '无图片链接'
            result['附带图片链接'].append(original_pic)
            # Link to the status page
            html = jd['data']['cards'][count]['scheme']
            result['微博链接'].append(html)
            # Like count
            attitudes_count = jd['data']['cards'][count]['mblog']['attitudes_count']
            result['点赞数'].append(attitudes_count)
            # Comment count
            comments_count = jd['data']['cards'][count]['mblog']['comments_count']
            result['评论数'].append(comments_count)
            # Repost count
            reposts_count = jd['data']['cards'][count]['mblog']['reposts_count']
            result['转发数'].append(reposts_count)
    return result
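A minimal usage sketch (my own, assuming url_init, cookie, and header_init from above): loop over a few pages, call PageData on each, and merge the per-page dictionaries into one:

# Sketch: scrape the first three pages and merge the results.
weibo_total = {}
for page in range(1, 4):
    page_result = PageData(url_init.format(page), cookie, header_init)
    for k, v in page_result.items():
        weibo_total.setdefault(k, []).extend(v)
print(len(weibo_total['微博用户']), 'statuses collected')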
def to_sql(weibo_total, dbname):
    # Connect to the database (credentials masked in the original post)
    conn = pymysql.connect(user='spider', password='****', host='***.***.**.**', port=3306,
                           database='spider', use_unicode=True, charset="utf8")
    cs = conn.cursor()
    # Marshal the dictionary data row by row
    for i in range(len(weibo_total['微博用户'])):
        data = ''
        for k in weibo_total.keys():
            # Wrap each value in single quotes, separated by commas
            data = (data + '\'' + '{}' + '\'' + ',').format(weibo_total[k][i])
        #data = '"' + data[:-1] + '"'
        # Execute the INSERT statement
        sql = ("""INSERT INTO %s VALUES (%s)""") % (dbname, data[:-1])
        cs.execute(sql)
    cs.execute("SELECT * FROM %s" % dbname)
    conn.commit()
    print(cs.fetchall())
    conn.close()
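The post never shows the table schema, so here is a hypothetical definition matching the nine fields PageData returns (table and column names are my own guesses, not from the original), followed by the call that writes the scraped data:

# Hypothetical schema -- run once before calling to_sql.
# to_sql quotes every value as a string, so string columns are the safest fit.
create_table = """
CREATE TABLE IF NOT EXISTS weibo_data (
    id          VARCHAR(32),
    user        VARCHAR(64),
    post_date   VARCHAR(16),
    text        TEXT,
    pic_link    VARCHAR(255),
    weibo_link  VARCHAR(255),
    likes       VARCHAR(16),
    comments    VARCHAR(16),
    reposts     VARCHAR(16)
)"""

to_sql(weibo_total, 'weibo_data')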
As the code above shows, we first connect to the database, then marshal the data held in the dictionary, and finally execute the INSERT statements to write it out. This code applies to scraping any page of the mobile web version of Weibo. The filter_emoji helper used in PageData strips emoji with a regular expression:

# Filter out emoji characters
def filter_emoji(desstr, restr=''):
    try:
        # Wide Python builds: match astral-plane code points directly
        co = re.compile(u'[\U00010000-\U0010ffff]')
    except re.error:
        # Narrow builds: match surrogate pairs instead
        co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
    return co.sub(restr, desstr)
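A quick check of the filter (the sample string is my own):

# Sketch: emoji are removed, ordinary text (including Chinese) is kept.
print(filter_emoji(u'今天天气真好😀🎉'))   # -> 今天天气真好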
If anything here is wrong or poorly explained, please point it out. The code is fairly old, so some of the details are hazy now. Coming next: face recognition with convolutional networks (with core Python code).