时间:2023-05-20 06:32:01 | 来源:网站运营
时间:2023-05-20 06:32:01 | 来源:网站运营
# 爬虫 | 爬取微博动态 — crawler: scrape a Weibo user's feed and build a word cloud.
import requests
from pyquery import PyQuery as pq
from wordcloud import WordCloud
from PIL import Image
import jieba
import numpy as np

weibo_id = ''  # Weibo user id (微博用户id)
base_url = 'https://m.weibo.cn/api/container/getIndex?containerid=230413' + weibo_id + '&page='
headers = {
    'Accept': 'application/json, text/plain, */*',
    'MWeibo-Pwa': '1',
    'Referer': 'https://m.weibo.cn/u/6132300208',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' +
                  'Chrome/75.0.3770.100 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
FILE_NAME = 'weibo.txt'  # file that stores the scraped post text (保存微博文本内容)
IMG_PATH = 'love2.jpg'   # mask image that shapes the word cloud (词云的形状)
# 获得json格式的数据 — fetch one feed page as JSON.
def get_content(page_num):
    """Fetch one page of the user's Weibo feed.

    Args:
        page_num: 1-based page index appended to ``base_url``.

    Returns:
        The decoded JSON dict on HTTP 200, otherwise an empty dict.
        (The failure value is falsy, matching how callers test it;
        the original returned ``[]``, an inconsistent type.)
    """
    url = base_url + str(page_num)
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Treat a network failure like a blocked request: skip this page.
        return {}
    if response.status_code == 200:  # 200 means not rate-limited / blocked
        return response.json()
    return {}
# 解析json,提取需要的文本内容 — extract the plain post text from one page.
def parse_json(content_json):
    """Pull the visible text out of one page of feed JSON.

    Args:
        content_json: dict returned by ``get_content`` (may be falsy).

    Returns:
        Concatenated post text. For each card: the author's own text
        (truncated at the first '/' to drop forwarder chains), plus the
        text of the retweeted post, if any, followed by a newline.
    """
    text = ''
    if not content_json:
        return text
    # Guard the nested lookups: a malformed page yields no cards.
    cards = content_json.get('data', {}).get('cards', [])
    for card in cards:
        mblog = card.get('mblog')
        if not mblog:
            continue  # NOTE(review): some cards may carry no post — skip them
        # Author's own text; pyquery strips the HTML markup.
        own_text = pq(mblog.get('text')).text()
        pos = own_text.find('/')  # drop text added by earlier forwarders
        if pos != -1:
            own_text = own_text[:pos]
        text = text + own_text
        # retweeted_status is absent when the post is the user's own.
        retweeted = mblog.get('retweeted_status')
        if retweeted:
            # BUGFIX: original wrote '/n' (literal slash-n) instead of '\n'.
            text = text + pq(retweeted.get('text')).text() + '\n'
    return text
# 制作词云所需的函数 — helpers for building the word cloud.
def transform(text):
    """Segment Chinese text with jieba and re-join words with spaces.

    WordCloud tokenizes on whitespace, so the segments must be
    space-separated before ``WordCloud.generate`` is called.
    """
    return ' '.join(jieba.cut(text))


def get_word_cloud():
    """Render a word cloud of FILE_NAME's text shaped by IMG_PATH.

    Writes the image to ``<weibo_id>wc.png`` in the working directory.
    """
    with open(FILE_NAME, 'r', encoding='utf-8') as f:
        raw_text = f.read()
    # BUGFIX: close the mask image promptly instead of leaking its file
    # handle (Image.open keeps the file open until the object is GC'd).
    with Image.open(IMG_PATH) as img_matrix:
        mask = np.array(img_matrix)
    wc = WordCloud(mask=mask,
                   scale=4,
                   font_path=r'C:/Windows/Fonts/simhei.ttf',  # NOTE: Windows-only path
                   background_color='white',
                   max_font_size=40,
                   min_font_size=2)
    wc.generate(transform(raw_text))
    wc.to_file(weibo_id + 'wc.png')
# 词云在之前的文章介绍过,这里不再重复说了 — the word cloud itself was
# covered in an earlier article; this entry point just drives the crawl.
if __name__ == '__main__':
    # Collect the text of the first 10 feed pages.
    # Accumulate in a list and join once — avoids quadratic str += in a loop.
    pieces = []
    for page in range(1, 11):
        content_json = get_content(page)
        pieces.append(parse_json(content_json))
    all_text = ''.join(pieces)
    # Persist the raw text so get_word_cloud can read it back.
    with open(FILE_NAME, 'w', encoding='utf-8') as f:
        f.write(all_text)
    # Render the word cloud from the saved file.
    get_word_cloud()
关键词:动态,爬虫