import os

import requests
from bs4 import BeautifulSoup
def down_save_pic(name, pic_urls):
    '''
    Download every image in the link list pic_urls and save the files
    in a folder named after `name`.
    '''
    # '/Desktop/' is an absolute path from the filesystem root and usually
    # does not exist; resolve the folder under the user's home directory.
    path = os.path.join(os.path.expanduser('~/Desktop'), name)
    if not os.path.exists(path):
        os.makedirs(path)
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            filename = str(i + 1) + '.jpg'
            with open(os.path.join(path, filename), 'wb') as f:
                f.write(pic.content)
            print('Downloaded image %s: %s' % (i + 1, pic_url))
        except Exception as e:
            print('Failed to download image %s: %s' % (i + 1, pic_url))
            print(e)
            continue
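
# A hypothetical usage sketch for down_save_pic; the URLs below are
# placeholders, not links taken from the article scraped further down:
#
#   down_save_pic('test', [
#       'https://example.com/1.jpg',
#       'https://example.com/2.jpg',
#   ])
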
def crawl_wiki_data():
    """
    Fetch the Baijiahao article introducing the guests of
    "Sisters Who Make Waves" and download the photos embedded in the page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    #url = 'https://baijiahao.baidu.com/s?id=1617072149388636691&wfr=spider&for=pc'
    #url = 'https://baijiahao.baidu.com/s?id=1596087680236569271&wfr=spider&for=pc'
    url = 'https://baijiahao.baidu.com/s?id=1679432499353970214&wfr=spider&for=pc'
    try:
        response = requests.get(url, headers=headers)
        # Passing the raw HTML string to the BeautifulSoup constructor
        # yields a parsed document object.
        soup = BeautifulSoup(response.text, 'lxml')
        # Collect every element with class "img-container"; each one
        # wraps an <img> tag pointing at a photo.
        imgs = soup.find_all(class_="img-container")
        pic_urls = []
        for i in imgs:
            pic_urls.append(i.img['src'])
        #down_save_pic('黄龄', pic_urls)
        #down_save_pic('郁可唯', pic_urls)
        down_save_pic('沈梦辰', pic_urls)
    except Exception as e:
        print(e)
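
# A minimal, offline sketch of the parsing step in crawl_wiki_data, run
# against an inline HTML snippet (an assumed stand-in for the article
# markup, not copied from the live page) so the selector logic can be
# checked without a network request.
def _demo_parse_img_containers():
    html = (
        '<div class="img-container"><img src="https://example.com/1.jpg"></div>'
        '<div class="img-container"><img src="https://example.com/2.jpg"></div>'
    )
    soup = BeautifulSoup(html, 'lxml')
    # Returns ['https://example.com/1.jpg', 'https://example.com/2.jpg'].
    return [div.img['src'] for div in soup.find_all(class_="img-container")]
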
if __name__ == '__main__':
    crawl_wiki_data()