URL: https://movie.douban.com/top250? Inspecting the site shows 250 movies in total, 25 per page, so 10 pages. First work out the URL pattern of those 10 pages and compose the page URLs, then work out how to extract the information from each page.
Disguising as a browser
import requests
from bs4 import BeautifulSoup

# Disguise as a browser
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
}

# Analyze the list pages
def getUrls():
    '''250 movies, 25 per page, so we need the URLs of 10 pages'''
    url_init = 'https://movie.douban.com/top250?start={0}&filter='
    urls = [url_init.format(index * 25) for index in range(10)]
    return urls

def get_all_book_links(urls):
    '''Walk the ten list pages and collect the URL of every movie's detail page'''
    all_book_links = []
    for url in urls:
        r = requests.get(url, headers=headers)
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'html.parser')
        ol_list = soup.find_all('ol')
        # print(r.status_code)
        li_list = ol_list[0].find_all('li')
        book_hrefs = [li.find('a')['href'] for li in li_list]
        all_book_links += book_hrefs
    return all_book_links
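As a quick sanity check (a minimal sketch; the throwaway name links is just for illustration), collect the links and confirm all 250 were found:

links = get_all_book_links(getUrls())
print(len(links))   # should be 250 if every list page parsed cleanly
print(links[0])     # detail-page URL of the top-ranked movie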
Next, analyze a detail page step by step and parse the HTML to pull out the text, poster image and other fields you need:

import time

def get_info(book_url):
    '''Scrape info from a movie's detail page: title, director, cast, genres, country/region, synopsis, rating'''
    r = requests.get(book_url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    img_info = soup.h1.next_sibling.next_sibling
    info = img_info.div.div.div.div.div.next_sibling.next_sibling
    # Title
    movie = soup.title.string.replace('(豆瓣)', '').strip()
    # Director
    spans_1 = info.find('span')
    director = spans_1.find('a').string
    # Cast
    spans_2 = spans_1.find_next_sibling('span').find_next_sibling('span')
    main_characters = '|'.join([x.string for x in spans_2.find_all('a')])
    # Genres
    spans_3 = spans_2.find_next_sibling('span')
    types_list = [spans_3.find_next_sibling('span').string,
                  spans_3.find_next_sibling('span').find_next_sibling('span').string]
    types = '|'.join(types_list)
    # Synopsis
    synopsis = img_info.find('h2').next_sibling.next_sibling.find('span', property="v:summary").get_text()
    synopsis = synopsis.replace('\n', '').replace('\u3000', '').replace(' ', '')
    # Rating
    score = img_info.div.div.div.find('strong', property="v:average").string
    # Poster
    img = img_info.div.div.div.div.div.find('img')['src']
    # URL (str has no remove() method; strip any stray whitespace instead)
    url = book_url.strip()
    infos = [movie, director, main_characters, types, synopsis, score, img, url]
    return infos

all_book_links = get_all_book_links(getUrls())

# Too many requests in a short time will get the IP banned
info_list = []  # collected records
for i, book_url in enumerate(all_book_links):
    time.sleep(1.0 / 3)
    try:
        info_list.append(get_info(book_url))
        print(f"[{'#' * int(i / 250 * 100)}=>{' ' * (100 - int(i / 250 * 100))}] {int(i / 250 * 100)}%", end='\r')
    except Exception:
        print(f'Failed to fetch info for movie #{i}')

Common ways for a crawler to get around such restrictions are covered at https://www.jianshu.com/p/5e7f8d75edbe.
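One countermeasure from that list is to vary the request fingerprint: pick a random User-Agent and a random pause per request. A minimal sketch; the polite_get helper and the extra user-agent strings are illustrative additions, not part of the original code:

import random
import time
import requests

# A small pool of user-agent strings to rotate through (illustrative samples)
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0",
]

def polite_get(url):
    '''GET a page with a randomly chosen User-Agent and a random delay,
    so successive requests look less like a single scripted client.'''
    time.sleep(random.uniform(0.5, 2.0))           # random pause between requests
    ua = {'User-Agent': random.choice(USER_AGENTS)}
    return requests.get(url, headers=ua, timeout=10)

# Usage: r = polite_get('https://movie.douban.com/top250?start=0&filter=')

If the IP itself gets banned, requests.get also accepts a proxies= mapping, so the same helper could rotate through a proxy pool as well.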