import os

import requests
from bs4 import BeautifulSoup
def down_save_pic(name, pic_urls):
    '''
    Download every image in the link list pic_urls and save the files
    in a folder named after `name`.
    '''
    # '/Desktop/' is an absolute path from the filesystem root and usually
    # does not exist; resolve the folder under the user's home directory.
    path = os.path.join(os.path.expanduser('~/Desktop'), name)
    if not os.path.exists(path):
        os.makedirs(path)
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            filename = str(i + 1) + '.jpg'
            with open(os.path.join(path, filename), 'wb') as f:
                f.write(pic.content)
            print('Downloaded image %s: %s' % (i + 1, pic_url))
        except Exception as e:
            print('Failed to download image %s: %s' % (i + 1, pic_url))
            print(e)
            continue
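
# A hypothetical usage sketch for down_save_pic; the URLs below are
# placeholders, not links taken from the article scraped further down:
#
#   down_save_pic('test', [
#       'https://example.com/1.jpg',
#       'https://example.com/2.jpg',
#   ])
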
def crawl_wiki_data():
    """
    Fetch the Baijiahao article introducing the guests of
    "Sisters Who Make Waves" and download the photos embedded in the page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    #url = 'https://baijiahao.baidu.com/s?id=1617072149388636691&wfr=spider&for=pc'
    #url = 'https://baijiahao.baidu.com/s?id=1596087680236569271&wfr=spider&for=pc'
    url = 'https://baijiahao.baidu.com/s?id=1679432499353970214&wfr=spider&for=pc'
    try:
        response = requests.get(url, headers=headers)
        # Passing the raw HTML string to the BeautifulSoup constructor
        # yields a parsed document object.
        soup = BeautifulSoup(response.text, 'lxml')
        # Collect every element with class "img-container"; each one
        # wraps an <img> tag pointing at a photo.
        imgs = soup.find_all(class_="img-container")
        pic_urls = []
        for i in imgs:
            pic_urls.append(i.img['src'])
        #down_save_pic('黄龄', pic_urls)
        #down_save_pic('郁可唯', pic_urls)
        down_save_pic('沈梦辰', pic_urls)
    except Exception as e:
        print(e)
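
# A minimal, offline sketch of the parsing step in crawl_wiki_data, run
# against an inline HTML snippet (an assumed stand-in for the article
# markup, not copied from the live page) so the selector logic can be
# checked without a network request.
def _demo_parse_img_containers():
    html = (
        '<div class="img-container"><img src="https://example.com/1.jpg"></div>'
        '<div class="img-container"><img src="https://example.com/2.jpg"></div>'
    )
    soup = BeautifulSoup(html, 'lxml')
    # Returns ['https://example.com/1.jpg', 'https://example.com/2.jpg'].
    return [div.img['src'] for div in soup.find_all(class_="img-container")]
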
if __name__ == '__main__':
    crawl_wiki_data()