Date: 2023-06-07 18:54:02 | Source: Website Operations
Scraping the URLs of every province and city on Lianjia (lianjia.com):

import json

import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup


def fetch(url):
    """Download the page and return its HTML text."""
    try:
        # proxies = {'http': 'http://172.17.0.3:7890'}
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) '
                          'Gecko/20100101 Firefox/98.0',
        }
        result = requests.get(url, headers=headers, timeout=10)
        return result.text
    except RequestException as e:
        return f"Error {e}!"


# Parse the page. bs4 really is slow -- it had been a while since I last
# used it; parsel is far nicer. This parser ends up three loops deep.
def parse(result):
    bs = BeautifulSoup(result, 'lxml')
    ul = bs.find('ul', attrs={'class': 'city_list_ul'})
    city_blocks = ul.find_all('div', attrs={'class': 'city_list'})
    for block in city_blocks:
        title = block.find('div', attrs={'class': 'city_list_tit c_b'})
        table = title.text
        datas = {table: [], 'url_link': []}
        for city_ul in block.find_all('ul'):
            for a in city_ul.find_all('a'):
                datas[table].append(a.text)
                datas['url_link'].append(a.attrs['href'])
        print(datas)
        yield datas


# Write the data to a JSON file (any other suitable format would do).
def back_datas(data):
    def datas():
        for d in data:
            yield d
    return datas()


def end_save_datas(da):
    datas = {'result': list(back_datas(da))}
    # 'w' rather than 'a': appending would produce invalid JSON on reruns;
    # ensure_ascii=False keeps the Chinese city names readable in the file.
    with open('city.json', 'w', encoding='utf-8') as fp:
        json.dump(datas, fp, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    res = fetch("https://www.lianjia.com/city/")
    data = parse(res)
    end_save_datas(data)
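Since the comment above recommends parsel over BeautifulSoup, here is a minimal sketch of the same extraction done with parsel's CSS selectors. The function name parse_with_parsel and the exact selectors are assumptions derived from the class names targeted above; they are not part of the original script.

from parsel import Selector


# Hypothetical parsel equivalent of parse() above; assumes the same
# ul.city_list_ul / div.city_list / div.city_list_tit page structure.
def parse_with_parsel(html):
    sel = Selector(text=html)
    for block in sel.css('ul.city_list_ul div.city_list'):
        region = block.css('div.city_list_tit::text').get(default='').strip()
        yield {
            region: block.css('a::text').getall(),
            'url_link': block.css('a::attr(href)').getall(),
        }

Each yielded dict has the same shape as before: the region heading maps to its list of city names, with a parallel 'url_link' list of hrefs. parsel compiles CSS selectors to XPath and evaluates them on lxml directly, which is where most of its speed advantage over BeautifulSoup comes from.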
Keywords: city, address, province