时间:2023-05-19 20:48:02 | 来源:网站运营
时间:2023-05-19 20:48:02 来源:网站运营
"""Import a Sina Weibo backup into Day One (Python implementation).

Run this script from the folder that holds the saved Weibo ``.html`` pages
and the ``AllEntries.json`` file of a Day One export. Every post found in
the pages is appended to ``AllEntries.json`` as one Day One entry whose text
is Markdown: links become ``[text](url)``, pictures become ``![..](url)``
and a reposted message is appended as a ``> `` quote.
"""
import json
import os
import sys
import time  # parses the English-formatted Weibo timestamps
import uuid  # every Day One entry needs a unique id
from pathlib import Path

# Journal file that is read and rewritten, relative to the working directory.
ENTRIES_FILE = r'AllEntries.json'

# CSS selectors used on the saved Weibo pages.
CARD_SELECTOR = "div[class='card m-panel card9 weibo-member']"
TEXT_SELECTOR = "div[class='weibo-text']"
OG_SELECTOR = "div[class='weibo-og']"
PIC_SELECTOR = "div[class='m-img-box m-imghold-square']"
TIME_SELECTOR = "span[class='time']"
FROM_SELECTOR = "span[class='from']"


def DayOneImport(ss_time3, ss_content):
    """Append one entry to the Day One journal file ``AllEntries.json``.

    The location and weather blocks are fixed placeholders; edit the
    ``location`` value below to use your own place.

    Args:
        ss_time3: Creation date string stored as ``creationDate``.
        ss_content: Markdown text of the entry.

    Raises:
        FileNotFoundError: If ``AllEntries.json`` is not in the working
            directory.
    """
    # Day One entry layout plus a freshly generated UUID (dashes removed).
    ss_uuid = str(uuid.uuid1()).replace('-', '')
    ss_dict = {'audios': [], 'creationDate': ss_time3,
               'location': {'address': '芙蓉社区, 芙蓉区, 杭州, 湖南', 'administrativeArea': '湖南', 'country': 'China',
                            'latitude': 28.191735133055555, 'localityName': '杭州',
                            'longitude': 113.0060535813889, 'placeName': '芙蓉社区',
                            'region': {'center': {'latitude': 28.191735133055555,
                                                  'longitude': 113.0060535813889}}},
               'photos': [], 'starred': False, 'tags': ['微博'], 'text': ss_content,
               'timeZone': 'Asia/Shanghai', 'uuid': ss_uuid,
               'weather': {'conditionsDescription': 'Possible Light Rain and Humid', 'pressureMB': 1011.2,
                           'relativeHumidity': 1.0, 'temperatureCelsius': 27.0, 'visibilityKM': 14.897,
                           'weatherCode': 'sunny', 'weatherServiceName': 'Forecast.io', 'windBearing': 254.0,
                           'windChillCelsius': 0.0, 'windSpeedKPH': 1.16}}
    # Read the journal, append, and write it back. The ``with`` blocks close
    # the files, so no explicit close() is needed.
    with open(ENTRIES_FILE, 'r', encoding='UTF-8-sig') as f2:
        diary_dict = json.load(f2)
    # setdefault keeps working when the journal has no 'entries' list yet.
    diary_dict.setdefault('entries', []).append(ss_dict)
    with open(ENTRIES_FILE, 'w', encoding='UTF-8-sig') as f3:
        json.dump(diary_dict, f3, ensure_ascii=False)


def getAllFilePath(fPath):
    """Return the paths of all ``.html`` files found below ``fPath``.

    Args:
        fPath: Directory that is walked recursively.

    Returns:
        A list of paths, each one ``os.path.join(root, file_name)``.
    """
    BlogList = []
    for root, dirs, files in os.walk(fPath):
        for file in files:
            # Match on the extension so that e.g. 'page.html.bak' is skipped.
            if file.endswith('.html'):
                BlogList.append(os.path.join(root, file))
    return BlogList


def _to_dayone_time(raw_time):
    """Reformat a Weibo time such as 'Fri May 19 20:48:02 +0800 2023'.

    NOTE(review): the '+0800' offset is dropped and the local time is then
    labelled with 'Z' without being shifted -- confirm how Day One
    interprets ``creationDate`` before changing this.
    """
    parsed = time.strptime(raw_time.replace(' +0800', ''), "%a %b %d %H:%M:%S %Y")
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", parsed)


def _anchor_links(anchors):
    """Map each anchor's text to its first link; anchors without a link are skipped."""
    links = {}
    for anchor in anchors:
        urls = list(anchor.links)
        if urls:
            links[anchor.text] = urls[0]
    return links


def _apply_links(text, links):
    """Turn every link text found in ``text`` into a Markdown link.

    A link whose text is empty or does not occur in ``text`` is appended to
    the end instead.
    """
    for name, url in links.items():
        markdown = '[' + name + ']' + '(' + url + ')'
        # An empty name must not reach str.replace: replacing '' would
        # insert the link between every single character.
        if name and name in text:
            text = text.replace(name, markdown)
        else:
            text = text + markdown
    return text


def _picture_markdown(boxes):
    """Return Markdown image tags for the first link of every picture box."""
    parts = []
    for box in boxes:
        anchors = box.find('a')
        urls = list(anchors[0].links) if anchors else []
        if urls:
            parts.append('![' + '微博大图' + ']' + '(' + urls[0] + ')')
    return ''.join(parts)


def _compose_entry(card):
    """Build ``(creation_date, markdown_text)`` for one Weibo card element."""
    created = _to_dayone_time(card.find(TIME_SELECTOR)[0].text)
    source = card.find(FROM_SELECTOR)[0].text

    text_blocks = card.find(TEXT_SELECTOR)
    own = text_blocks[0]
    content = _apply_links(own.text, _anchor_links(own.find('a')))
    og_blocks = card.find(OG_SELECTOR)
    if og_blocks:
        content = content + _picture_markdown(og_blocks[0].find(PIC_SELECTOR))

    # A second text block is the reposted message; quote it below the post.
    if len(text_blocks) > 1:
        repost = text_blocks[1]
        repost_text = _apply_links(repost.text, _anchor_links(repost.find('a')))
        # NOTE(review): this searches the whole card, not only the reposted
        # part, so pictures already added above may be repeated -- confirm
        # against the page markup.
        repost_text = repost_text + _picture_markdown(card.find(PIC_SELECTOR))
        content = content + '\n\n' + '> ' + repost_text

    # Footer with the date and the client the post was sent from.
    content = content + '\n\n' + ' *** ' + '\n\n' + '**' + created + ',' + source + '**\n\n'
    return created, content


def main():
    """Convert every saved Weibo page below the working directory."""
    # Imported here so the helpers above can be imported (and tested)
    # without the third-party scraping packages installed.
    from requests_file import FileAdapter  # lets the session read file:// URLs
    from requests_html import HTMLSession

    BlogList = getAllFilePath(r".")
    print(BlogList)

    session = HTMLSession()
    session.mount('file://', FileAdapter())

    for wb_page, blog_path in enumerate(BlogList, start=1):
        # Relative paths cannot be fetched; build an absolute file:// URL
        # that is valid on both Windows and POSIX.
        r = session.get(Path(os.path.abspath(blog_path)).as_uri())
        wb_list = r.html.find(CARD_SELECTOR)
        print(len(wb_list))

        for wb_num, card in enumerate(wb_list, start=1):
            created, weiboContent = _compose_entry(card)
            DayOneImport(created, weiboContent)

            # Show progress; useful when there are many posts.
            speed = round(wb_num / len(wb_list) * 100)
            os.system('cls' if os.name == 'nt' else 'clear')
            print('正在保存第' + str(wb_page) + '页', '当前进度为:', speed, '%\n\n---------------\n')
            sys.stdout.flush()

            # To also keep a Markdown copy, append weiboContent to a file,
            # e.g. open('temp.md', 'a', encoding='utf-8-sig').


if __name__ == "__main__":
    main()
运行脚本,微博较多时需要等待一段时间。出现“Process finished with exit code 0”后,将 AllEntries.json 文件重新放回之前解压出的文件夹中,并重新打包成 zip 格式。位置信息可在 DayOneImport 函数的以下字段中自行修改: 'location': {'address': '芙蓉社区, 芙蓉区, 杭州, 湖南', 'administrativeArea': '湖南', 'country': 'China', 'latitude': 28.191735133055555, 'localityName': '杭州', 'longitude': 113.0060535813889, 'placeName': '芙蓉社区', 'region': {'center': {'latitude': 28.191735133055555, 'longitude': 113.0060535813889}}},
其中关键词:实现