# -*- coding: utf-8 -*-
"""
Project: spider
Creator: kingS
Create time: 2020-07-20 12:13
IDE: PyCharm
Introduction: scraper for "three-line love poems" (三行情诗) pages.
"""
import re

import requests
from lxml import etree
def parse_page(url):
    """Fetch one poem detail page and append its poem lines to a local txt file.

    :param url: absolute URL of a single poem detail page.
    :returns: None; side effect is appending cleaned lines to ./三行情诗.txt.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/83.0.4103.116 Safari/537.36'
    }
    # BUG FIX: headers must be passed as a keyword argument; the original
    # positional call requests.get(url, headers) sent the dict as query
    # params, so the User-Agent header was never actually set.
    response = requests.get(url, headers=headers)
    # The site serves GBK-encoded pages; ignore stray undecodable bytes
    # rather than crash on a single bad character.
    text = response.content.decode('gbk', errors='ignore')

    # NOTE(review): the original also ran three re.findall() passes for
    # titles / classes / views, but their patterns were corrupted in this
    # copy and the code consuming them was commented out, so they are
    # dropped here as dead computation.

    html = etree.HTML(text)
    # Poem body lines live in the text nodes of <div class="ct">.
    contents = html.xpath("//div[@class='ct']/text()")
    content_list = [content.strip() for content in contents]

    # Append (not overwrite) so successive pages accumulate in one file;
    # 'with' guarantees the handle is closed even on error.
    with open('./三行情诗.txt', 'a+', encoding='utf-8') as f:
        for line in content_list:
            # Strip list-style punctuation left over from scraping;
            # keep or drop these replacements depending on the data.
            s = line.replace('[', '').replace(']', '')
            s = s.replace("'", '').replace(',', '')
            f.write(s + '\n')
    print("保存文件成功")
def main():
    """Crawl the index page, collect every poem detail URL, and scrape each one."""
    url = 'http://www.ai5v.com/sanhangqingshu/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/83.0.4103.116 Safari/537.36'
    }
    # BUG FIX: headers as keyword argument (positionally it would become
    # the 'params' query-string argument, not HTTP headers).
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    # Each poem link sits under div.pl > span.st > a on the index page.
    hrefs = html.xpath("//div[@class='pl']//span[@class='st']/a")
    url_list = [href.get('href') for href in hrefs]
    # enumerate replaces the original hand-maintained i counter.
    for i, page_url in enumerate(url_list, start=1):
        parse_page(page_url)
        print('第%d次爬取' % i)
# BUG FIX: the dunder underscores were lost in this copy; `name` and
# 'main' would raise NameError / never match. Restore the standard guard.
if __name__ == '__main__':
    main()