爬取地址:https://www.gupiaoxuexi.com/post/1319 源码:
"""Scrape https://www.gupiaoxuexi.com/post/1319.

Saves the index page and every linked second-level page as local HTML files
under ./pac/1/, then downloads every lazy-loaded image (the ``data-original``
attribute used by the site) referenced by those pages, mirroring each image's
server-side path under ./pac/1/.

Fixes relative to the original script:
* Files were opened in append mode ("a"), so re-running the script duplicated
  the saved HTML; they are now opened in write mode ("w").
* The image step ran AFTER the page loop on the stale loop variable, so only
  the LAST sub-page's images were downloaded; images are now fetched for
  every sub-page, matching the original comment's stated intent.
* The image folder was derived from a magic fixed slice (url[27:53]) that
  breaks for URLs of any other length; the relative path is now taken from
  the parsed URL.
* Output directories are created up front instead of assumed to exist.
"""
import os
from urllib.parse import urlparse

from requests_html import HTMLSession

# All output (HTML pages and images) is mirrored under this directory.
BASE_DIR = './pac/1/'
HEADERS = {
    'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/84.0.4147.89 Safari/537.36')
}
INDEX_URL = 'https://www.gupiaoxuexi.com/post/1319'


def _save_html(path, html):
    """Write ``html`` to ``path``, overwriting any previous run's output."""
    with open(path, "w", encoding="UTF-8") as f:
        f.write(html)


def _download_images(session, page_response):
    """Download every lazy-loaded image referenced by one fetched page.

    Images carry their real URL in the ``data-original`` attribute (lazy
    loading); each image is stored under BASE_DIR, preserving the URL path,
    e.g. .../wp-content/uploads/2020/07/x.png -> ./pac/1/wp-content/uploads/2020/07/x.png
    """
    xpath_pic = '//div[@class="entry-content clearfix"]//img/@data-original'
    for pic_url in page_response.html.xpath(xpath_pic):
        # Mirror the image's server-side directory under BASE_DIR instead of
        # the original's fragile fixed slice pic_url[27:53].
        rel_dir = os.path.dirname(urlparse(pic_url).path).lstrip('/')
        folder = os.path.join(BASE_DIR, rel_dir)
        os.makedirs(folder, exist_ok=True)
        pic_name = pic_url.split('/')[-1]  # filename = last URL component
        res = session.get(pic_url)
        with open(os.path.join(folder, pic_name), "wb") as f:
            f.write(res.content)


def main():
    """Fetch the index page, all second-level pages, and their images."""
    session = HTMLSession()
    os.makedirs(BASE_DIR, exist_ok=True)

    # Save the index page itself.
    response = session.get(INDEX_URL, headers=HEADERS)
    _save_html(os.path.join(BASE_DIR, '1.html'), response.html.html)

    # Locate all second-level page URLs on the index page.
    xpath_a = ('//*[@id="post-1319"]'
               '//div[@class="entry-content clearfix"]/p/a/@href')
    sub_urls = response.html.xpath(xpath_a)

    for i, sub_url in enumerate(sub_urls, start=1):
        page_name = sub_url.split('/')[-1] + '.html'
        response2 = session.get(sub_url, headers=HEADERS)
        _save_html(os.path.join(BASE_DIR, page_name), response2.html.html)
        print("正在写入第 " + str(i) + " 页 ")
        # Download this page's images inside the loop: the original ran this
        # step after the loop, so only the last page's images were saved.
        _download_images(session, response2)


# NOTE (from the original author, translated): the script still has issues —
# on the index page the domain has not yet been stripped/replaced in the
# saved HTML; to be improved later.
if __name__ == "__main__":
    main()