# Scrape Douyu live-room information with Selenium.
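"""
@author:86135
@file:douyu.py
@time:2020/10/05
@desc: Selenium crawler that collects room category and streamer name
       from the Douyu directory pages.
Format code: Ctrl+Alt+L
Run code: Ctrl+Shift+F10
"""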
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
class douyuSpider():
    """Headless-Chrome crawler for the Douyu room directory.

    The spider opens ``startUrl``, reads the category and streamer name of
    every room card on the page, then clicks the pagination "next" control
    and repeats until that control can no longer be found.
    """

    # XPath expressions, kept exactly as the site markup requires them.
    # NOTE: the leading space inside ' dy-Pagination-next' is intentional
    # here -- it is part of the expression being matched.
    ROOM_XPATH = "//*[@id='listAll']/section[2]/div[2]/ul/li"
    CATE_XPATH = ".//span[@class = 'DyListCover-zone']"
    NAME_XPATH = ".//div[@class='DyListCover-userName']"
    NEXT_XPATH = "//li[@class=' dy-Pagination-next']/span"

    def __init__(self):
        """Store the start URL and launch a headless Chrome session."""
        self.startUrl = 'https://www.douyu.com/directory/all'
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        # ``options=`` is the supported keyword; the older ``chrome_options=``
        # spelling was deprecated and later removed from Selenium.
        self.driver = webdriver.Chrome(options=options)

    def parse(self, wait=5):
        """Extract the rooms on the current page and locate the next button.

        Args:
            wait: Seconds to sleep before touching the page (default 5).

        Returns:
            A ``(content_list, next_button)`` tuple. ``content_list`` is a
            list of dicts with the keys ``"cate"`` and ``"name"``;
            ``next_button`` is the first element matching ``NEXT_XPATH``, or
            ``None`` when nothing matches.
        """
        time.sleep(wait)
        # Scroll through the page first; presumably this makes the site
        # render cards that are loaded on demand.
        self.roll()

        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which
        # Selenium 4 removed; this form also works on Selenium 3.
        rooms = self.driver.find_elements(By.XPATH, self.ROOM_XPATH)
        content_list = []
        for room in rooms:
            item = {
                "cate": room.find_element(By.XPATH, self.CATE_XPATH).text,
                "name": room.find_element(By.XPATH, self.NAME_XPATH).text,
            }
            print(item)
            content_list.append(item)

        next_buttons = self.driver.find_elements(By.XPATH, self.NEXT_XPATH)
        print(len(next_buttons))
        next_button = next_buttons[0] if next_buttons else None
        return content_list, next_button

    def save_content(self, content_list):
        """Persist one page of results (placeholder: currently does nothing).

        Args:
            content_list: The list of room dicts returned by ``parse``.
        """
        pass

    def roll(self, pause=0.5):
        """Scroll the document down in five steps.

        The scroll position is set to 0.1, 0.3, 0.5, 0.7 and 0.9 times the
        document's scroll height, sleeping before each step.

        Args:
            pause: Seconds to sleep before each scroll step (default 0.5).
        """
        for tenth in range(1, 11, 2):
            time.sleep(pause)
            script = (
                "document.documentElement.scrollTop = "
                "document.documentElement.scrollHeight * {}"
            ).format(tenth / 10)
            self.driver.execute_script(script)

    def run(self):
        """Crawl every directory page, then shut the browser down.

        The browser is closed in a ``finally`` clause so that a failure while
        loading, parsing or paginating does not leave a Chrome process
        running; the exception itself still propagates to the caller.
        """
        try:
            self.driver.get(self.startUrl)
            content_list, next_button = self.parse()
            self.save_content(content_list)
            while next_button is not None:
                next_button.click()
                content_list, next_button = self.parse()
                self.save_content(content_list)
        finally:
            self.driver.quit()
# Script entry point: build the spider and crawl the whole directory.
if __name__ == '__main__':
    douyuSpider().run()
# When reposting, please credit the original source: https://blackberry.8miu.com/read-29399.html