Time: 2023-05-20 10:36:02 | Source: Website Operations
Python crawler: scraping and analyzing Weibo data. The spider:

import json
import scrapy
from weibo.items import WeiboItem
from bs4 import BeautifulSoup

class weibo_spider(scrapy.Spider):
    name = "weibo"
    start_urls = ["https://m.weibo.cn/api/container/getIndex?uid=1927305954&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E6%88%90%E6%9E%9C&type=uid&value=1927305954&containerid=1076031927305954"]
    url = "https://m.weibo.cn/api/container/getIndex?uid=1927305954&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E6%88%90%E6%9E%9C&type=uid&value=1927305954&containerid=1076031927305954&since_id="
    # start_urls = ["https://m.weibo.cn/"]
    allowed_domains = ["weibo.com", "weibo.cn"]

    since_id = ""          # cursor id used to fetch the next page
    created_at = ""        # post date
    text = ""              # post content
    source = ""            # device the post was sent from
    scheme = ""            # link to the original post
    reposts_count = 0      # number of reposts
    textLength = 0         # length of the post text
    comments_count = 0     # number of comments
    attitudes_count = 0    # number of likes

    def parse(self, response):
        text_json = json.loads(response.text)
        self.since_id = text_json.get('data').get('cardlistInfo').get('since_id')
        cards = text_json.get('data').get('cards')
        for it in cards:
            it_son = it.get('mblog')
            if it_son:
                self.created_at = it_son['created_at']
                self.text = it_son['text']
                self.source = it_son['source']
                self.scheme = it['scheme']
                self.reposts_count = it_son['reposts_count']
                self.comments_count = it_son['comments_count']
                self.attitudes_count = it_son['attitudes_count']
                # The scraped text still contains HTML tags; strip them with BeautifulSoup
                soup = BeautifulSoup(str(self.text), "html.parser")
                self.text = soup.get_text()
                # Posts from the current year omit the year, so prepend it
                if len(self.created_at) < 6:
                    self.created_at = "%s%s" % ("2020-", self.created_at)
                self.textLength = len(self.text)
                # Pack the fields into a WeiboItem defined in items.py
                items = WeiboItem(created_at=self.created_at, text=self.text, source=self.source,
                                  scheme=self.scheme, reposts_count=self.reposts_count,
                                  comments_count=self.comments_count, attitudes_count=self.attitudes_count,
                                  textLength=self.textLength)
                yield items
        if not self.since_id:
            return
        # Build the URL of the next JSON page and follow it
        urls = "%s%s" % (self.url, str(self.since_id))
        yield scrapy.Request(urls, callback=self.parse)
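For reference, the JSON that parse() walks through looks roughly like the sketch below. Only the fields the spider actually reads are shown, and all values are invented; the real getIndex response contains many more fields.

# Illustrative shape of the getIndex response that parse() expects (values are made up).
sample_response = {
    "data": {
        "cardlistInfo": {
            "since_id": 4507982742397441          # cursor for the next page
        },
        "cards": [
            {
                "scheme": "https://m.weibo.cn/status/...",   # link to the original post
                "mblog": {
                    "created_at": "05-20",        # current-year posts omit the year
                    "text": "<span>post body with HTML tags</span>",
                    "source": "iPhone 11",
                    "reposts_count": 10,
                    "comments_count": 5,
                    "attitudes_count": 100
                }
            }
        ]
    }
}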
Scrapy's items.py file:

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

class WeiboItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    since_id = scrapy.Field()         # cursor id used to fetch the next page
    created_at = scrapy.Field()       # post date
    text = scrapy.Field()             # post content
    source = scrapy.Field()           # device the post was sent from
    scheme = scrapy.Field()           # link to the original post
    reposts_count = scrapy.Field()    # number of reposts
    textLength = scrapy.Field()       # length of the post text
    comments_count = scrapy.Field()   # number of comments
    attitudes_count = scrapy.Field()  # number of likes
Next comes writing the data into the database (pipelines.py):

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
import json

class WeiboPipeline(object):
    account = {
        'user': 'root',
        'password': '*******',
        'host': 'localhost',
        'database': 'python'
    }

    def mysqlConnect(self):
        connect = pymysql.connect(**self.account)
        return connect

    def __init__(self):
        self.connect = self.mysqlConnect()  # connect to the database
        self.cursor = self.connect.cursor(cursor=pymysql.cursors.DictCursor)
        #### To write to JSON instead:
        # self.fp = open("xiaofuren.json", 'w', encoding='utf-8')

    def insertMsg(self, scheme, text, source, reposts_count, comments_count,
                  attitudes_count, textLength, created_at):
        try:
            # Use a parameterized query so pymysql handles quoting and escaping
            self.cursor.execute(
                "INSERT INTO weibo VALUES(%s, %s, %s, %s, %s, %s, %s, %s)",
                (scheme, text, source, reposts_count, comments_count,
                 attitudes_count, textLength, created_at)
            )
            self.connect.commit()
        except Exception as e:
            print("insert_sql error: " + str(e))

    def open_spider(self, spider):
        print("Spider started ******************")

    def process_item(self, item, spider):
        self.insertMsg(
            item['scheme'], item['text'], item['source'], item['reposts_count'],
            item['comments_count'], item['attitudes_count'], item['textLength'],
            item['created_at'])
        return item
        #### To write to JSON instead:
        # item_json = json.dumps(dict(item), ensure_ascii=False)
        # self.fp.write(item_json + '\n')
        # return item

    def close_spider(self, spider):
        print("Spider finished ***************")
        print("Data written successfully")
        self.cursor.close()
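The pipeline assumes a weibo table with eight columns already exists and that WeiboPipeline is registered in settings.py. Here is a minimal setup sketch; the column types are my own assumption, since the original post does not show the table definition.

# One-off table setup, run once before crawling.
# The column order must match the INSERT in WeiboPipeline.insertMsg().
import pymysql

connect = pymysql.connect(user='root', password='*******', host='localhost', database='python')
with connect.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS weibo (
            scheme          VARCHAR(255),
            text            TEXT,
            source          VARCHAR(64),
            reposts_count   INT,
            comments_count  INT,
            attitudes_count INT,
            textLength      INT,
            created_at      VARCHAR(32)
        )
    """)
connect.commit()
connect.close()

# settings.py: enable the pipeline so Scrapy actually calls it.
# ITEM_PIPELINES = {'weibo.pipelines.WeiboPipeline': 300}

After that, the crawl can be started with "scrapy crawl weibo" from the project directory.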
The analysis script then reads the stored posts back out of MySQL:

import datetime
import pymysql

account = {
    'user': 'root',
    'password': 'zhaobo123..',
    'host': 'localhost',
    'database': 'python'
}

def mysqlConnect(account):
    connect = pymysql.connect(**account)
    return connect

def getMessage(cursor, month, day, year, phone, dianzan, zhuanfa, pinlun, textLength, dates):
    sql = 'select * from weibo ORDER BY created_at'
    cursor.execute(sql)
    row = cursor.fetchall()
    Day = {}    # dictionaries make it easy to count the posts sent per day/month/year
    Year = {}
    Month = {}
    for i in range(1, 32):
        Day[i] = 0
    for i in range(1, 13):
        Month[i] = 0
    for i in range(2013, 2021):
        Year[i] = 0
    for it in row:
        date = datetime.datetime.strptime(it['created_at'].strip(), "%Y-%m-%d")
        Year[date.year] += 1
        Day[date.day] += 1
        Month[date.month] += 1
        phone.append(it['source'])
        dianzan.append(it['attitudes_count'])
        zhuanfa.append(it['reposts_count'])
        pinlun.append(it['comments_count'])
        textLength.append(it['textLength'])
        dates.append(it['created_at'])
    for i in range(1, 32):
        day.append(Day[i])
    for i in range(1, 13):
        month.append(Month[i])
    for i in range(2013, 2021):
        year.append(Year[i])

if __name__ == '__main__':
    month = []       # posts per month
    year = []        # posts per year
    day = []         # posts per day of the month
    phone = []       # device types
    dianzan = []     # like counts
    zhuanfa = []     # repost counts
    pinlun = []      # comment counts
    textLength = []  # post lengths
    dates = []       # post dates
    connect = mysqlConnect(account)
    cursor = connect.cursor(cursor=pymysql.cursors.DictCursor)
    getMessage(cursor, month, day, year, phone, dianzan, zhuanfa, pinlun, textLength, dates)
The code is commented, so I won't explain it line by line. The charts are drawn with pyecharts inside the same __main__ block:

from pyecharts import options as opts
from pyecharts.charts import Bar

    # Posts per day of the month
    xday = []
    for i in range(1, 32):
        xday.append(i)
    bar = (
        Bar()
        .add_xaxis(xday)
        .add_yaxis("每天发送的微博", day)
        .set_global_opts(title_opts=opts.TitleOpts(title="狗哥发微博统计"))
    )
    bar.render(path='day.html')

    # Posts per month
    xmonth = []
    for i in range(1, 13):
        xmonth.append(i)
    bar = (
        Bar()
        .add_xaxis(xmonth)
        .add_yaxis("每月发送的微博", month)
        .set_global_opts(title_opts=opts.TitleOpts(title="狗哥发微博统计"))
    )
    bar.render(path='month.html')

    # Posts per year
    xyear = []
    for i in range(2013, 2021):
        xyear.append(i)
    bar = (
        Bar()
        .add_xaxis(xyear)
        .add_yaxis("每年发送的微博", year)
        .set_global_opts(title_opts=opts.TitleOpts(title="狗哥发微博统计"))
    )
    bar.render(path='year.html')
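The analysis script also collects the device list (phone), which can be charted the same way. A small sketch, assuming it runs in the same __main__ block after getMessage(); the Counter aggregation and the Pie chart are my addition, not part of the original post.

    from collections import Counter
    from pyecharts.charts import Pie

    # Aggregate the source/device strings collected by getMessage()
    # and render their distribution as a pie chart.
    phone_pairs = [list(kv) for kv in Counter(phone).items()]
    pie = (
        Pie()
        .add("发微博的设备", phone_pairs)
        .set_global_opts(title_opts=opts.TitleOpts(title="狗哥发微博统计"))
    )
    pie.render(path='phone.html')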
Keywords: analysis, data, crawler