1. Requirements
Target URL: https://www.gushiwen.org/
Requirements:
(1) Get the [Type] entries from the sidebar;
(2) For each type, get the detail pages of its poems and prose;
(3) Extract the detail-page data: title, author, dynasty, type, content, and translation & notes;
(4) Save the data to a CSV file (a sketch of the resulting columns follows this list).
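Requirement (3) fixes the columns of the output file. The save() function in the script below appends raw values without a header row, so the column order is simply the order in which the item dictionary is filled in get_data(). As a minimal sketch (writing a header at all is my addition, not part of the original script; the column names just mirror the item keys), the header could be written once before scraping starts:

import csv

# One-time header write (hypothetical): the column order matches the order
# in which get_data() fills the item dictionary before calling save().
with open("./古诗词.csv", "w", encoding="utf-8", newline="") as f:
    csv.writer(f).writerow([
        "first_type_name",   # first-level type from the sidebar
        "second_type_name",  # second-level type heading (may be empty)
        "poetry_name",       # title
        "poetry_author",     # author and dynasty
        "poetry_content",    # body text
        "poetry_explain",    # translation and notes (may be empty)
    ])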
2. Code Implementation
import requests
import csv
from lxml import etree

start_url = "https://so.gushiwen.cn/shiwen/"
base_url = "https://so.gushiwen.cn"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}


def parse_url(url):
    """Request a URL and return the decoded response body."""
    response = requests.get(url=url, headers=headers)
    return response.content.decode("utf-8")


def parse_html(html):
    """Parse an HTML string and return an lxml element for XPath queries."""
    return etree.HTML(html)


def get_first_type():
    """Collect the first-level types from the sidebar (the /gushi/ and /wenyan/ links)."""
    first_type_list = []
    html = parse_url(start_url)
    etree_obj = parse_html(html)
    first_type_name_list = etree_obj.xpath('(//a[contains(@href,"/gushi/")]|//a[contains(@href,"/wenyan/")])/text()')
    first_type_url_list = etree_obj.xpath('(//a[contains(@href,"/gushi/")]|//a[contains(@href,"/wenyan/")])/@href')
    for name, url in zip(first_type_name_list, first_type_url_list):
        first_type_list.append({"name": name, "url": url})
    return first_type_list


def get_data(first_type):
    """Walk one first-level type page, then scrape every detail page listed on it."""
    url = base_url + first_type["url"]
    first_type_name = first_type["name"]
    html = parse_url(url)
    etree_obj = parse_html(html)
    div_list = etree_obj.xpath('//div[@class="typecont"]')
    for div in div_list:
        # Second-level type is the <strong> heading of each block; it may be absent.
        second_type_name = div.xpath(".//strong/text()")
        second_type_name = second_type_name[0] if second_type_name else ""
        poetry_name_list = div.xpath(".//span/a/text()")
        poetry_url_list = div.xpath(".//span/a/@href")
        for poetry_name, poetry_href in zip(poetry_name_list, poetry_url_list):
            item = {
                "first_type_name": first_type_name,
                "second_type_name": second_type_name,
                "poetry_name": poetry_name,
            }
            # Fetch the detail page of this work.
            poetry_url = base_url + poetry_href
            html = parse_url(poetry_url)
            etree_obj = parse_html(html)
            # Author and dynasty live in the <p class="source"> element.
            poetry_author = etree_obj.xpath('//p[@class="source"]')[0].xpath(".//text()")
            item["poetry_author"] = "".join(poetry_author).strip()
            # The content div id is "contson" plus a per-page hash, so match on the
            # prefix instead of hard-coding one id.
            poetry_content = etree_obj.xpath('//div[contains(@id,"contson")]/text()')
            item["poetry_content"] = "".join(poetry_content).strip()
            # Translation and notes (译文及注释); not every work has them.
            if etree_obj.xpath('//div[@class="contyishang"]'):
                poetry_explain = etree_obj.xpath('//div[@class="contyishang"]')[0].xpath(".//text()")
                item["poetry_explain"] = "".join(poetry_explain).strip()
            else:
                item["poetry_explain"] = ""
            print(item)
            save(item)


def save(item):
    """Append one record to the CSV file."""
    # newline="" prevents the csv module from writing blank lines on Windows.
    with open("./古诗词.csv", "a", encoding="utf-8", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(item.values())


def start():
    first_type_list = get_first_type()
    for first_type in first_type_list:
        get_data(first_type)


if __name__ == "__main__":
    start()
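To spot-check what was written, the rows can be read back with the same csv module. This is only a quick verification sketch and assumes the script above has already run and produced 古诗词.csv:

import csv

# Print the first three saved records to confirm the CSV layout.
with open("./古诗词.csv", encoding="utf-8", newline="") as f:
    for i, row in enumerate(csv.reader(f)):
        print(row)
        if i >= 2:
            break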
3. Results
Saved data: