18143453325 或

所在位置：首页 > 营销资讯 > 网站运营 > python selenium 爬虫模拟浏览网站内容

python selenium 爬虫模拟浏览网站内容

时间：2023-04-24 12:51:01 | 来源：网站运营

时间：2023-04-24 12:51:01 来源：网站运营

python selenium 爬虫模拟浏览网站内容：本文介绍使用 Python Selenium 编写的爬虫代码，用于模拟用户浏览某个网站的内容。废话少说，进入正文。
1、爬虫界面如下：

界面使用说明：
第一步：填写要访问的网站地址
第二步：填写每天访问该网址的次数
第三步：点击“开始刷量”按钮开始访问网站内容
2、爬虫源代码介绍：
1）点击“开始刷量”按钮调用runjob方法，runjob具体代码如下：
# Run one site-visit pass
def runjob():
    """Perform one simulated direct visit to the configured site.

    Reads the target URL from ``myframe.siteurlinput`` and the current
    counter from ``myframe.refreshnum``, opens a browser through
    ``createWebDriver()``, and browses the site with ``viewSite()``.

    If ``myframe.stop_refresh_page_thread`` is already set, no visit is
    made: ``addrefreshnum()`` and ``myframe.refresh_run_stop_button()`` are
    called and the function returns.

    Any exception raised while creating the browser or browsing is logged
    with its traceback, a screenshot is attempted (only when a browser
    exists), and ``myframe.stop_refresh_page_thread`` is set to ``True``.
    The browser, if one was created, is always quit.
    """
    # a) Manual stop requested: end right away and refresh the UI.
    if myframe.stop_refresh_page_thread:
        addrefreshnum()
        myframe.refresh_run_stop_button()
        return

    # b) Normal run.
    refreshnum = myframe.refreshnum
    siteurl = myframe.siteurlinput.GetValue().strip()
    my_logger_info(logger, "==开始网站%s第%d次刷量<直接访问>==" % (siteurl, refreshnum + 1))

    # Initialised up front so the except/finally clauses never touch an
    # unbound name when createWebDriver() itself fails.
    driver = None
    try:
        driver = createWebDriver()
        driver.maximize_window()
        viewSite(driver, siteurl)
    except Exception:
        my_logger_info(logger, traceback.format_exc())
        # Set the stop flag first so a failing screenshot cannot skip it.
        myframe.stop_refresh_page_thread = True
        if driver is not None:
            try:
                driver.save_screenshot(".//refreshpage_directvisit_error.png")
            except Exception:
                # The screenshot is best-effort diagnostics only.
                my_logger_info(logger, traceback.format_exc())
    finally:
        # Close the browser, if one was actually created.
        if driver is not None:
            driver.quit()

    addrefreshnum()
    # Refresh the UI after the run.
    myframe.refresh_run_stop_button()
    my_logger_info(logger, "==完成网站%s第%d次刷量<直接访问>==" % (siteurl, refreshnum + 1))

2）runjob=>createWebDriver()代码如下
# Create the browser driver
def createWebDriver():
    """Build and return a headless Chrome WebDriver.

    The browser is started with a zh_CN.UTF-8 language flag, image loading
    disabled, the sandbox disabled, and headless mode on. A script is then
    registered through the Chrome DevTools Protocol so that every new
    document sees ``navigator.webdriver`` as ``undefined``.
    """
    chrome_options = webdriver.ChromeOptions()
    startup_flags = (
        'lang=zh_CN.UTF-8',                    # page language / encoding
        'blink-settings=imagesEnabled=false',  # do not load images
        '--no-sandbox',                        # disable the sandbox
        'headless',                            # no visible window
    )
    for flag in startup_flags:
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(options=chrome_options, keep_alive=True)

    # Keep Selenium visits from being recognised as automated (and so not
    # counted as traffic) by hiding navigator.webdriver.
    hide_webdriver_js = """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
    browser.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": hide_webdriver_js},
    )
    return browser

3）runjob=>viewSite(driver,siteurl)代码如下
# Simulate browsing the site
def viewSite(driver, url):
    """Open ``url`` and walk through a fixed set of site columns.

    After loading the page and idling for 3-5 (as passed to
    ``runIdleSomeTime``), the product columns are visited with
    ``viewProductOfLanmu`` and then the article columns with
    ``viewArticleOfLanmu``. Each loop stops early once
    ``myframe.stop_refresh_page_thread`` is set.
    """
    my_logger_info(logger, "<<开始访问网站:%s" % url)
    driver.get(url)
    # The page source is fetched here but not used further in this function.
    landing_html = driver.page_source
    runIdleSomeTime(random.randint(3, 5))

    article_columns = ["桥架国标", "桥架价格", "桥架安装"]
    product_columns = ["桥架配件", "桥架规格"]

    for column in product_columns:
        # Honour a manual stop request.
        if myframe.stop_refresh_page_thread:
            break
        viewProductOfLanmu(driver, column)

    for column in article_columns:
        # Honour a manual stop request.
        if myframe.stop_refresh_page_thread:
            break
        viewArticleOfLanmu(driver, column)

    if myframe.stop_refresh_page_thread:
        my_logger_info(logger, "已经停止刷量")
    my_logger_info(logger, ">>完成访问网站:%s" % url)

4）runjob=>viewSite(driver,siteurl)=>viewProductOfLanmu(driver,lanmu)代码如下
# Browse the products of one column
def viewProductOfLanmu(driver, lanmu):
    """Open the column whose link text is ``lanmu`` and visit its products.

    For every ``div.mask`` inside ``div.list`` on the current list page the
    first anchor's ``href`` is opened in a new window, kept open for a
    while, then closed. The function then follows the "下一页" link inside
    ``div.pageBox`` until there is none. It returns early whenever
    ``myframe.stop_refresh_page_thread`` is set.

    Args:
        driver: Selenium WebDriver positioned on a page containing the
            column link.
        lanmu: Visible link text of the column to open.
    """
    # find_element("link text", ...) works on Selenium 3 and 4; the
    # find_element_by_* helpers were removed in Selenium 4.3.
    link_d = driver.find_element("link text", lanmu)
    # Click through JavaScript instead of link.click() so an overlapping
    # consultation popup cannot intercept the click.
    driver.execute_script("arguments[0].click();", link_d)
    # Wait for the first list page to load.
    runIdleSomeTime(random.randint(3, 5))
    if myframe.stop_refresh_page_thread:
        return

    soup = BeautifulSoup(driver.page_source, "html.parser")
    while True:
        # A page without the list container is treated as having no items
        # instead of raising AttributeError on None.
        list_div = soup.find("div", class_="list")
        newsdiv_s = list_div.find_all("div", class_="mask") if list_div is not None else []
        for newsdiv in newsdiv_s:
            link = newsdiv.find("a")
            if link is None or not link.get("href"):
                continue
            href = link["href"]
            my_logger_info(logger, "访问页面：%s" % href)
            # Pass the URL as a script argument rather than concatenating
            # it into the JavaScript source, so quotes in the href cannot
            # break the script.
            try:
                driver.execute_script("window.open(arguments[0], '_blank');", href)
            except Exception:
                my_logger_info(logger, traceback.format_exc())
                continue
            # Stay on the opened page for a while.
            runIdleSomeTime(random.randint(5, 7))
            if myframe.stop_refresh_page_thread:
                break
            windows = driver.window_handles
            # Only close the newest window when one was actually opened;
            # otherwise windows[-1] is the list page itself.
            if len(windows) > 1:
                driver.switch_to.window(windows[-1])
                driver.close()
            # Back to the parent window.
            driver.switch_to.window(windows[0])
            # Linger on the list page before the next item.
            runIdleSomeTime(random.randint(1, 3))
            if myframe.stop_refresh_page_thread:
                break
        if myframe.stop_refresh_page_thread:
            break

        # Is there a next page? A missing pager means there is none.
        pagediv_s = soup.find("div", class_="pageBox")
        if pagediv_s is None or not pagediv_s.find("a", text="下一页"):
            break
        nextpagelink_d = driver.find_element("link text", "下一页")
        driver.execute_script("arguments[0].click();", nextpagelink_d)
        # Wait for the next list page to load, then re-parse it.
        runIdleSomeTime(random.randint(3, 5))
        soup = BeautifulSoup(driver.page_source, "html.parser")

5）runjob=>viewSite(driver,siteurl)=>viewArticleOfLanmu(driver,lanmu)代码如下
# Browse the articles of one column
def viewArticleOfLanmu(driver, lanmu):
    """Open the column whose link text is ``lanmu`` and visit its articles.

    For every ``a.look`` anchor inside ``div.newsList`` on the current list
    page the ``href`` is opened in a new window, kept open for a while,
    then closed. The function then follows the "下一页" link inside
    ``div.pageBox`` until there is none. It returns early whenever
    ``myframe.stop_refresh_page_thread`` is set.

    Args:
        driver: Selenium WebDriver positioned on a page containing the
            column link.
        lanmu: Visible link text of the column to open.
    """
    # find_element("link text", ...) works on Selenium 3 and 4; the
    # find_element_by_* helpers were removed in Selenium 4.3.
    link_d = driver.find_element("link text", lanmu)
    # Click through JavaScript instead of link.click() so an overlapping
    # consultation popup cannot intercept the click.
    driver.execute_script("arguments[0].click();", link_d)
    # Wait for the first list page to load.
    runIdleSomeTime(random.randint(3, 5))
    if myframe.stop_refresh_page_thread:
        return

    soup = BeautifulSoup(driver.page_source, "html.parser")
    while True:
        # A page without the list container is treated as having no
        # articles instead of raising AttributeError on None.
        newsdiv_s = soup.find("div", class_="newsList")
        article_links = newsdiv_s.find_all("a", class_="look") if newsdiv_s is not None else []
        for link in article_links:
            href = link.get("href")
            if not href:
                continue
            my_logger_info(logger, "访问页面：%s" % href)
            # Pass the URL as a script argument rather than concatenating
            # it into the JavaScript source, so quotes in the href cannot
            # break the script. A failure is logged and the article is
            # skipped, matching viewProductOfLanmu.
            try:
                driver.execute_script("window.open(arguments[0], '_blank');", href)
            except Exception:
                my_logger_info(logger, traceback.format_exc())
                continue
            # Stay on the opened article for a while.
            runIdleSomeTime(random.randint(5, 7))
            if myframe.stop_refresh_page_thread:
                break
            windows = driver.window_handles
            # Only close the newest window when one was actually opened;
            # otherwise windows[-1] is the list page itself.
            if len(windows) > 1:
                driver.switch_to.window(windows[-1])
                driver.close()
            # Back to the parent window.
            driver.switch_to.window(windows[0])
            # Linger on the list page before the next article.
            runIdleSomeTime(random.randint(5, 7))
            if myframe.stop_refresh_page_thread:
                break
        if myframe.stop_refresh_page_thread:
            break

        # Is there a next page? A missing pager means there is none.
        pagediv_s = soup.find("div", class_="pageBox")
        if pagediv_s is None or not pagediv_s.find("a", text="下一页"):
            break
        nextpagelink_d = driver.find_element("link text", "下一页")
        driver.execute_script("arguments[0].click();", nextpagelink_d)
        # Wait for the next list page to load, then re-parse it.
        runIdleSomeTime(random.randint(3, 5))
        soup = BeautifulSoup(driver.page_source, "html.parser")

3、爬虫访问示例网站：

首页链接：

http://www.jywy.bj.cn/

栏目链接：

http://www.jywy.bj.cn/index.php?s=/List/index/cid/23.html

文章链接：

http://www.jywy.bj.cn/index.php?s=/Show/index/cid/23/id/167.html

http://www.jywy.bj.cn/index.php?s=/Show/index/cid/23/id/168.html

欢迎有兴趣的同学留言交流。

关键词：内容,浏览,模拟,爬虫

网站
营销
设计
运营
优化
效率
专注
电商
方案
推广

解决方案&服务

客户&案例

营销资讯

关于我们

解决方案&服务

客户&案例

营销资讯

关于我们

微信公众号

为了最佳展示效果，本站不支持IE9及以下版本的浏览器，建议您使用谷歌Chrome浏览器。点击下载Chrome浏览器

关闭

在线咨询

快捷入口

python selenium 爬虫模拟浏览网站内容

什么样的域名利于网站SEO优化？

怎么创建一个表白网站！网站！网站？

最好的 Python 网站开发方面的学习教程有哪些？

广告公司一般在哪些网站找设计素材？

中国著名的高端私人旅游定制机构有哪些呢？

完美的网站SEO优化计划方案【简单seo】

有什么运营网站推荐？

网站建设制作-建企业网站公司

杭州APP开发公司外包价格

为什么要搭建电商网站？有哪些好处

在线咨询

快捷入口

python selenium 爬虫 模拟浏览网站内容

推荐文章