首先这种网站一定要设置爬取的速率,目标网站用这种方式写入网页估计是被爬虫搞怕了,大概率有更简单的反爬方法,因此爬取速率要注意。博主要爬的网站是一个电影网站:艺恩,点击下一页可以看到其实执行了一个js拿数据,但是URL却没有任何变化,我们需要一路下一页下一页点下去,然后把展示出的电影详情也搞到。
爬取思路:
启动selenium，控制Chrome开两个标签页，第一个标签页显示主页，第二个标签页显示不同的电影详情页。如果是需要翻页的请求，就先跳转到第一个tab，然后点击“下一页”，把渲染后的网页信息传回去；如果要显示电影详情页，就跳转到第二个tab加载详情页，然后把网页信息传回去。
代码:
首先是爬虫文件spider的编写
from scrapy
.spiders
import CrawlSpider
import scrapy
from urllib
.parse
import urljoin
class MovieDetailSpider(CrawlSpider):
    """Crawl endata.com.cn's box-office movie listing page by page.

    The listing site paginates via JavaScript without changing the URL, so a
    Selenium downloader middleware does the actual rendering.  This spider only
    emits "logical" requests: one per listing page (same URL, dont_filter=True,
    page number carried in meta) plus one per movie detail link found.
    """

    name = "flim"  # NOTE(review): probably a typo for "film"; kept — it is the public spider name
    allowed_domains = ["endata.com.cn"]

    # JS-paginated listing page; the URL is identical for every page.
    START_URL = 'https://www.endata.com.cn/BoxOffice/MovieStock/movies.html'

    def start_requests(self):
        """Seed the crawl with listing page 1."""
        self.page = 1
        # Upper bound only; the middleware reads the real last-page number
        # from the pager widget when it renders page 1.
        self.max_page = 500
        yield scrapy.Request(
            self.START_URL,
            self.parse,
            dont_filter=True,  # same URL every time, so dupe filtering must be off
            meta={'page': self.page},
        )

    def parse(self, response):
        """Yield detail-page requests for this listing page, then request the next page."""
        for li_movie_info in response.css('ul.movies-list-box li')[:2]:  # sample: first 2 per page
            relative_url = li_movie_info.css('a::attr(href)').extract_first()
            if not relative_url:
                # BUG FIX: extract_first() returns None when an <li> has no
                # link; the original called .strip() on it and crashed.
                continue
            movie_url = urljoin(response.url, relative_url.strip())
            yield scrapy.Request(movie_url, callback=self.movie_detail_page,
                                 dont_filter=False)

        self.page += 1
        if self.page < self.max_page:
            yield scrapy.Request(
                self.START_URL,
                self.parse,
                dont_filter=True,
                meta={'page': self.page},
            )

    def movie_detail_page(self, response):
        """Placeholder callback: parse one movie detail page into an item dict."""
        movie_dict = {}
        yield movie_dict
middleware 文件:
class HandlessMiddleware(object):
    """Downloader middleware that renders every request in one headless Chrome.

    Tab 0 holds the JS-paginated listing page (pagination is done by clicking
    the "next" button, since the URL never changes); tab 1 is reused for movie
    detail pages.  NOTE(review): the class name looks like a typo for
    "HeadlessMiddleware"; kept because settings reference it by dotted path.
    """

    # The one URL whose requests mean "listing page" (page number in meta).
    LISTING_URL = 'https://www.endata.com.cn/BoxOffice/MovieStock/movies.html'
    # Seconds to let a freshly loaded / repaginated page settle before
    # snapshotting page_source.
    PAGE_SETTLE_SECONDS = 2

    def __init__(self):
        super(HandlessMiddleware, self).__init__()
        option = webdriver.ChromeOptions()
        option.add_argument('headless')
        # Skip images and stylesheets to speed up rendering.
        prefs = {
            "profile.managed_default_content_settings.images": 2,
            'permissions.default.stylesheet': 2,
        }
        option.add_experimental_option("prefs", prefs)
        self.browser = webdriver.Chrome(chrome_options=option)
        self.browser.implicitly_wait(5)
        # BUG FIX: give max_page a safe default so process_request cannot hit
        # an AttributeError if the first listing request is not page 1.
        self.max_page = 1
        # Open the second tab, reserved for detail pages.
        # NOTE(review): the browser is never quit anywhere visible — consider
        # hooking spider_closed to call self.browser.quit().
        self.browser.execute_script('window.open("","_blank");')

    def process_request(self, request, spider):
        """Render the request in Chrome and return the DOM as an HtmlResponse.

        Returns None (i.e. lets Scrapy download normally) once the requested
        listing page number is past the last page.
        """
        if request.url == self.LISTING_URL:
            self.browser.switch_to.window(self.browser.window_handles[0])
            if request.meta['page'] == 1:
                self.browser.get(request.url)
                # Read the real last-page number from the pager widget.
                self.max_page = int(
                    self.browser.find_element_by_id('TableList_Paging')
                    .find_element_by_css_selector('a.layui-laypage-last').text)
            else:
                if request.meta['page'] <= self.max_page:
                    print("MAIN PAGE CHANGE : " + str(request.meta['page'])
                          + " / " + str(self.max_page))
                    self.browser.find_element_by_id('TableList_Paging') \
                        .find_element_by_class_name('layui-laypage-next').click()
                    # BUG FIX: the original grabbed page_source immediately
                    # after the click, returning the stale previous page; wait
                    # for the JS pagination to swap the table in.
                    time.sleep(self.PAGE_SETTLE_SECONDS)
                else:
                    return None
        else:
            print("NEW PAGE GET : " + request.url)
            self.browser.switch_to.window(self.browser.window_handles[1])
            self.browser.get(request.url)
            time.sleep(self.PAGE_SETTLE_SECONDS)
        return HtmlResponse(url=self.browser.current_url,
                            body=self.browser.page_source,
                            encoding="utf-8", request=request)
在setting中注册
# Ignore robots.txt and skip cookie handling: the Selenium middleware does the
# real fetching, so Scrapy's own download machinery should stay out of the way.
ROBOTSTXT_OBEY = False
COOKIES_ENABLED = False

DOWNLOADER_MIDDLEWARES = {
    # BUG FIX: the original read "UsserAgentMiddleware" (double 's'); Scrapy
    # cannot import that path and would crash at startup.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'movie_data.middlewares.HandlessMiddleware': 200,
}