Source code
import requests
import os
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
}

# Create the output directory on first run
if not os.path.exists('resume_template'):
    os.mkdir('resume_template')

url_cnt = download_cnt = 0
template_url_list = []
template_name = []
download_url_list = []

# List pages: the first page has no page number, the rest follow free_%d.html
url = 'http://sc.chinaz.com/jianli/free_%d.html'
for i in range(1, 6):
    if i == 1:
        entrance_url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        entrance_url = format(url % i)
    entrance_page = requests.get(url=entrance_url, headers=headers)
    entrance_page.encoding = 'utf-8'
    tree = etree.HTML(entrance_page.text)
    # Each <a> under the main div is one template: its text is the name,
    # its href is the address of the detail page
    entrance_list = tree.xpath('//div[@id="main"]//p/a')
    for item in entrance_list:
        name = item.xpath('./text()')[0]
        addr = item.xpath('./@href')[0]
        # print(name)
        # print(addr)
        template_url_list.append(addr)
        template_name.append(name)
        url_cnt += 1
print(url_cnt)

for i in range(len(template_url_list)):
    template_url = template_url_list[i]
    print(template_url)
    template_page = requests.get(url=template_url, headers=headers).text
    tree = etree.HTML(template_page)
    # The 10th <li> in the download block is one of the mirror links
    download_url = tree.xpath('//div[@class="down_wrap"]//ul/li[10]/a/@href')[0]
    path = './resume_template/' + template_name[i]
    # if not os.path.exists(path):
    #     os.mkdir(path)
    #     print("make")
    # .content returns the raw bytes of the archive
    template = requests.get(url=download_url, headers=headers).content
    # z = zipfile.ZipFile(io.BytesIO(template))
    # z.extractall(path)
    with open(path + '.rar', 'wb') as f:
        f.write(template)
    download_cnt += 1
    download_url_list.append(download_url)
print(download_cnt)
First, I am not very fluent with XPath syntax.
XPath syntax is actually quite simple: a quick systematic read-through is enough, and once the meaning of each symbol is clear there is basically nothing to it. A small sketch follows below.
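For reference, a minimal XPath refresher with lxml, covering only the symbols used in the script above (the sample HTML is made up, not taken from the actual page):

from lxml import etree

html = etree.HTML('''
<div id="main">
  <p><a href="/jianli/1.html">Template A</a></p>
  <p><a href="/jianli/2.html">Template B</a></p>
</div>
''')

# //   search anywhere below, regardless of depth
# @    select an attribute value
# ./   start from the current node
for a in html.xpath('//div[@id="main"]//p/a'):
    print(a.xpath('./text()')[0], a.xpath('./@href')[0])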
Second, when the URL points to a zip file, I did not know how to download it so that the file saved to disk is the archive itself; I eventually got it working by referring to "python爬虫爬取站长素材免费简历模板".
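The key point is that requests.get(...).content returns the raw bytes of the response, so writing those bytes straight to disk yields the archive itself; alternatively, a zip can be unpacked in memory with zipfile and io.BytesIO, which is what the commented-out lines in the script sketch. A hedged example (the URL and file names here are placeholders, not real links):

import io
import zipfile

import requests

headers = {'User-Agent': 'Mozilla/5.0'}                    # same idea as in the script above
download_url = 'http://example.com/template.zip'           # hypothetical URL

data = requests.get(url=download_url, headers=headers).content  # raw bytes of the archive

# Option 1: write the bytes as-is; the saved file *is* the compressed archive
with open('template.zip', 'wb') as f:
    f.write(data)

# Option 2: unpack a zip in memory first (this is what the commented-out
# zipfile/io lines do; it does not work for .rar files)
with zipfile.ZipFile(io.BytesIO(data)) as z:
    z.extractall('template')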
Run result