A simple Python novel scraper: scrape whatever you like, with a txt config file. Day 01 (half-finished, scratch that) … Day 02 (finished). Book: 史上第一剑修, site: 笔趣阁 (the URL is in the code).
#####################################################
# import
import requests
import os
from fake_useragent import UserAgent
from lxml import etree
import random
# --------------------------------------------------
# UA: pick a random Chrome User-Agent for every run
ua = UserAgent().chrome
heads = {'User-Agent': ua}
# --------------------------------------------------
# URL: book index page and site root
url_s = 'https://www.bqg66.com/book/131950/'
url_h = 'https://www.bqg66.com'
# ---------------------------------------------------
# random ip addr (proxy helper, defined but not used yet)
def random_ip():
    a1 = str(random.randint(11, 240))
    a2 = str(random.randint(11, 240))
    a3 = str(random.randint(11, 240))
    a4 = str(random.randint(11, 240))
    ip = '%s.%s.%s.%s:8080' % (a1, a2, a3, a4)
    ip_addr_1 = {'https': ip}
    return ip_addr_1
#####################################################
# fetch the book index page, retrying until HTTP 200
p1_net_stu = False
while p1_net_stu == False:
    page_1 = requests.get(url=url_s, headers=heads)
    if page_1.status_code == 200:
        p1_net_stu = True
p1_net_stu = False
page_1 = page_1.text.encode('utf-8')
tree = etree.HTML(page_1)
# book title and author
title = []
title_tree = tree.xpath('//div[@class="w100"]/h1/text()')[0]
author_tree = tree.xpath('//div[@class="w100 dispc"]/span/a/text()')[0]
title.append(title_tree)
title.append(author_tree)
# chapter urls
chapter_url_lib = []
# xpath: //div[@class="container border3-2 mt8 mb20"]/div[2]/a/@href
chapter_urls = tree.xpath('//div[@class="container border3-2 mt8 mb20"]/div[2]/a/@href')
for per_url in chapter_urls:
    chapter_url_lib.append(url_h + per_url)
# chapter text
p3_net_status = False
file = open('./史上第一剑修.text', 'a', encoding='utf-8')
for dl_url in chapter_url_lib:
    # retry each chapter page until HTTP 200
    while p3_net_status == False:
        texts = requests.get(url=dl_url, headers=heads)
        if texts.status_code == 200:
            p3_net_status = True
    p3_net_status = False
    texts = texts.text
    texts_tree = etree.HTML(texts)
    textc = texts_tree.xpath('//article[@id="article"]/p/text()')
    for ch in textc:
        file.write(ch)
        file.write('\n')
    print(dl_url + ' ' + 'ok')
file.close()

(The code was originally posted as a screenshot; open it in a Notepad-style editor to view.)
Day 02, revised version: adds a request timeout with retry and error handling (if a page fails to download, it keeps retrying until it succeeds), so the whole book is scraped with no missing chapters and no duplicates.
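The core of the fix is a retry-on-timeout loop wrapped around every requests.get call. Here is a minimal sketch of that pattern on its own; fetch and max_retries are my own names, and the bounded retry count is my addition (the script below simply retries forever):

import requests

def fetch(url, heads, timeout=5, max_retries=10):
    # retry on timeouts and connection errors instead of crashing,
    # but give up after max_retries so a dead link cannot loop forever
    for attempt in range(max_retries):
        try:
            resp = requests.get(url=url, headers=heads, timeout=timeout)
            if resp.status_code == 200:
                return resp
        except requests.exceptions.RequestException:
            print("timeout " + str(timeout))
    raise RuntimeError("failed to fetch " + url)

The day 02 script below inlines this same loop three times: once for the index page, once for each chapter page, and once for a chapter's extra pages.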
#####################################################
# import
import requests
import os
from fake_useragent import UserAgent
from lxml import etree
import random
# --------------------------------------------------
# UA: random Chrome User-Agent
ua = UserAgent().chrome
heads = {'User-Agent': ua}
# --------------------------------------------------
# URL: book index page and site root
url_s = 'https://www.bqg66.com/book/131950/'
url_h = 'https://www.bqg66.com'
# --------------------------------------------------
# timeout (seconds) for every request
timeout = 5
# ---------------------------------------------------
# random ip addr (proxy helper, still unused)
def random_ip():
    a1 = str(random.randint(11, 240))
    a2 = str(random.randint(11, 240))
    a3 = str(random.randint(11, 240))
    a4 = str(random.randint(11, 240))
    ip = '%s.%s.%s.%s:8080' % (a1, a2, a3, a4)
    ip_addr_1 = {'https': ip}
    return ip_addr_1
#####################################################
# fetch the book index page, retrying on timeout until HTTP 200
p1_net_stu = False
while p1_net_stu == False:
    try:
        page_1 = requests.get(url=url_s, headers=heads, timeout=timeout)
        if page_1.status_code == 200:
            p1_net_stu = True
    except requests.exceptions.RequestException as p1ex:
        print("timeout " + str(timeout))
        continue
p1_net_stu = False
page_1 = page_1.text.encode('utf-8')
tree = etree.HTML(page_1)
# book title and author
title = []
title_tree = tree.xpath('//div[@class="w100"]/h1/text()')[0]
author_tree = tree.xpath('//div[@class="w100 dispc"]/span/a/text()')[0]
title.append(title_tree)
title.append(author_tree)
# chapter urls
chapter_url_lib = []
chapter_urls = tree.xpath('//div[@class="container border3-2 mt8 mb20"]/div[2]/a/@href')
for per_url in chapter_urls:
    chapter_url_lib.append(url_h + per_url)
# chapter text
p3_net_status = False
p3_net_p_sta = False
text_p_url = []
file = open('./史上第一剑修.text', 'a', encoding='utf-8')
for dl_url in chapter_url_lib:
    # first page of the chapter, retrying on timeout
    while p3_net_status == False:
        try:
            texts = requests.get(url=dl_url, headers=heads, timeout=timeout)
            if texts.status_code == 200:
                p3_net_status = True
        except requests.exceptions.RequestException as p2ex:
            print("timeout " + str(timeout))
            continue
    p3_net_status = False
    texts = texts.text
    texts_tree = etree.HTML(texts)
    textc = texts_tree.xpath('//article[@id="article"]/p/text()')
    for ch in textc:
        file.write(ch)
        file.write('\n')
    # if the chapter is split over several pages, also follow the "下一页" link
    if '下一页' in (texts_tree.xpath('//div[@class="read_nav reader-bottom"]/a[3]/text()')[0]):
        text_p_url = texts_tree.xpath('//div[@class="read_nav reader-bottom"]/a[@id="next_url"]/@href')
        for p_url in text_p_url:
            while p3_net_p_sta == False:
                try:
                    textsp = requests.get(url=url_h + p_url, headers=heads, timeout=timeout)
                    if textsp.status_code == 200:
                        p3_net_p_sta = True
                    else:
                        print("over time")
                except requests.exceptions.RequestException as p3ex:
                    print("timeout " + str(timeout))
                    continue
            textsp = textsp.text
            textsp_tree = etree.HTML(textsp)
            textcp = textsp_tree.xpath('//article[@id="article"]/p/text()')
            for chp in textcp:
                file.write(chp)
                file.write('\n')
            p3_net_p_sta = False
    print(dl_url + ' ok')
file.close()

(Result screenshot.)
(The code is also in the screenshots; open them with Notepad to view.)
The code surely still has many problems and shortcomings; comments, corrections, and improvements are very welcome!
Here is a scraper for another novel site: give it a URL and it scrapes that site's novel. Site name: 笔趣趣, url: …biququ.info. It generates the output file name automatically and downloads chapter by chapter.
#####################################################
# import
import requests
import os
from fake_useragent import UserAgent
from lxml import etree
import random
# --------------------------------------------------
# UA: random Chrome User-Agent
ua = UserAgent().chrome
heads = {'User-Agent': ua}
# --------------------------------------------------
# URL: the input is the book index page, url_h is the site root
url_s = input("url!:")  # e.g. 'http://www.biququ.info/html/745/'
url_h = 'http://www.biququ.info'
# --------------------------------------------------
# timeout
timeout = 5
#####################################################
def gethtml(url_s, heads, timeout):
    # keep retrying the request until an HTTP 200 response comes back
    net_stu = False
    while net_stu == False:
        try:
            html = requests.get(url=url_s, headers=heads, timeout=timeout)
            if html.status_code == 200:
                net_stu = True
                print("->")
        except requests.exceptions.RequestException as hte:
            print("timeout :" + str(timeout))
            continue
    net_stu = False
    return html
####################################################
# book index page
res_1 = gethtml(url_s=url_s, heads=heads, timeout=timeout)
page_1 = res_1.text
tree = etree.HTML(page_1)
# chapter urls
chapter_url_lib = []
chapter_urls = tree.xpath('//div[@id="list"]//dd/a/@href')
for per_url in chapter_urls:
    chapter_url_lib.append(url_h + per_url)
# chapter text: the output file is named after the book title
filename = tree.xpath('//div[@id="info"]/h1/text()')
file = open('./' + filename[0] + '.txt', 'a', encoding='utf-8')
for dl_url in chapter_url_lib:
    content = gethtml(url_s=dl_url, heads=heads, timeout=timeout)
    c_text = content.text
    ct_tree = etree.HTML(c_text)
    cont_text = ct_tree.xpath('//div[@id="content"]/p/text()')
    cp = ct_tree.xpath('//div[@class="bookname"]/h1/text()')
    file.write(cp[0])
    file.write("\n")
    for teext in cont_text:
        file.write(teext)
        file.write("\n")
    file.write("\n\n\n\n\n\n")
    print(cp[0])
file.close()

New day: added a configuration file, so the scraper can crawl according to the rules in a txt config file. The main module:
#####################################################
# import
import requests
import os
from fake_useragent import UserAgent
from lxml import etree
import time
# --------------------------------------------------
# UA
ua = UserAgent().chrome
heads = {'User-Agent': ua}
# --------------------------------------------------
# timeout
timeout = 5
#####################################################
class novel_set:
    def gethtml(url_s, heads, timeout, sleep):
        # wait `sleep` seconds between attempts and retry until HTTP 200
        net_stu = False
        while net_stu == False:
            time.sleep(sleep)
            try:
                html = requests.get(url=url_s, headers=heads, timeout=timeout)
                if html.status_code == 200:
                    net_stu = True
                    print("->")
            except requests.exceptions.RequestException as hte:
                print("timeout :" + str(timeout))
                continue
        net_stu = False
        return html
    ####################################################
    def downloader(filename, code, url_s, next_page, next_url, charpt_name_xpath, text_xpath,
                   novel_dir_url, next_url_statu, have, have2, url_h, timeout, gap):
        # output file
        file = open('./' + filename + '.txt', 'a', encoding='utf-8')
        cp_url = url_s
        while True:
            content = novel_set.gethtml(url_s=cp_url, heads=heads, timeout=timeout, sleep=gap)
            content.encoding = code
            c_text = content.text
            ct_tree = etree.HTML(c_text)
            # the text of the "next" link decides whether to keep going
            sg = ct_tree.xpath(next_page)
            sgc = sg[0]
            print(sgc)
            if have in sgc or have2 in sgc:
                if have2 in sgc:
                    # "下一章": chapter boundary, insert blank lines as a separator
                    file.write("\n\n\n\n\n")
                cp_title = ct_tree.xpath(charpt_name_xpath)
                file.write(cp_title[0])
                file.write("\n")
                print(cp_title[0])
                print("ff")
                cont_text = ct_tree.xpath(text_xpath)
                for teext in cont_text:
                    if teext == '\n':
                        continue
                    if code == 'gbk':
                        file.write(teext)
                    else:
                        file.write(teext.encode('utf-8', 'ignore').decode('utf-8', 'ignore'))
                if len(novel_dir_url) > 0:  # does the book-directory path need to be spliced into the next url?
                    dir_url = ct_tree.xpath(novel_dir_url)
                    cp_url = url_h + dir_url[0] + ct_tree.xpath(next_url)[0]
                elif 'fool' in next_url_statu:
                    # the next href is already a complete url
                    cp_url = ct_tree.xpath(next_url)[0]
                else:
                    cp_url = url_h + ct_tree.xpath(next_url)[0]
            else:
                # no "next page" / "next chapter" link left: the book is finished
                break
    #########################################################################################################################
    def dler(url_s, novel_rule, filename):
        gap = 1
        i = 0
        sitelist = []
        # pick the rule whose key appears in the input url
        for key in novel_rule:
            sitelist.append(key)
            if key in url_s:
                print(key)
                site = key
                i = 1
                break
        if i == 1:
            i = 0
            print("ok")
        else:
            print("no such site")
            return  # no matching rule, stop here
        url_h = novel_rule[site]['url']                      # site root, e.g. 'http://wap.qiqint.la'
        title_xpath = novel_rule[site]['title']              # book title, e.g. '//div[@class="block_txt2"]/h2'
        text_xpath = novel_rule[site]['conten']              # chapter content, e.g. '//div[@id="nr1"]/text()'
        charpt_name_xpath = novel_rule[site]['charpt_name']  # chapter name, e.g. '//h1/text()'
        novel_dir_url = novel_rule[site]['novel_dir_url']
        have = novel_rule[site]['have']                      # '下一页'
        have2 = novel_rule[site]['have2']                    # '下一章'
        next_url = novel_rule[site]['next_url']              # e.g. '//a[@id="pb_next"]/@href'
        next_page = novel_rule[site]['next_page']            # e.g. '//a[@id="pb_next"]/text()'
        next_url_statu = novel_rule[site]['next_url_statu']
        code = novel_rule[site]['code']
        novel_set.downloader(filename, code, url_s, next_page, next_url, charpt_name_xpath, text_xpath,
                             novel_dir_url, next_url_statu, have, have2, url_h, timeout, gap)
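The least obvious part of downloader is how the next page's URL is built from the rule entry. Here is a minimal standalone sketch of that decision, assuming an lxml element tree for the current page and one rule dict from the config below; resolve_next_url is a hypothetical name, not part of the original code:

def resolve_next_url(ct_tree, rule, url_h):
    # href of the "next page / next chapter" link on the current page
    raw = ct_tree.xpath(rule['next_url'])[0]
    if len(rule['novel_dir_url']) > 0:
        # some sites link relative to the book's directory page, so the
        # directory path is spliced in between the site root and the href
        dir_part = ct_tree.xpath(rule['novel_dir_url'])[0]
        return url_h + dir_part + raw
    if 'fool' in rule['next_url_statu']:
        # 'fool' marks sites whose hrefs are already complete URLs
        return raw
    # default: the href is site-relative, so prepend the site root
    return url_h + raw

In downloader this same three-way choice sets cp_url at the end of every loop iteration.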
The launcher:

# import all1.no
from all1 import no
# read the scraping rules from the config file
config = open('./novel.txt', 'r+', encoding='utf-8')
novel_rule = eval(config.read())
config.close()
# ask for the first chapter url and the output file name, then start downloading
url_s = input("url_s")
filename = input("filename")
no.novel_set.dler(url_s=url_s, filename=filename, novel_rule=novel_rule)
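Because the launcher loads the rule file with eval, anything written in novel.txt would be executed as Python. Since the rules are just nested dicts of strings, a slightly safer loading sketch uses the standard-library ast module instead (a suggestion, not what the launcher above actually does):

import ast

# ast.literal_eval only accepts Python literals (dicts, lists, strings, numbers),
# so a broken or malicious novel.txt cannot run arbitrary code
with open('./novel.txt', 'r', encoding='utf-8') as config:
    novel_rule = ast.literal_eval(config.read())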
The config file, novel.txt (labelled "JSON format" in the original post, but it is actually a Python dict literal, since the launcher reads it with eval), with one rule entry per site:

{
'fenghuaju': {'code': "gbk", 'url': 'http://www.fenghuaju.cc', 'title': '//*[@id="wrapper"]/div[4]/div/div[1]/a[3]/text()', 'conten': '//*[@id="content"]/text()', 'charpt_name': '//div[@class="bookname"]/h1/text()', 'next_page': '//div[@class="bottem1"]/a[3]/text()', 'next_url': '//div[@class="bottem1"]/a[3]/@href', 'next_url_statu': '', 'novel_dir_url': '//div[@class="bottem1"]/a[2]/@href', 'have': '下一页', 'have2': '下一章'},
'xinxs84': {'code': "gbk", 'url': 'http://m.xinxs84.com', 'title': '//span[@class="title"]/text()', 'conten': '//div[@id="chaptercontent"]/text()', 'charpt_name': '//span[@class="title"]/text()', 'next_page': '//a[@id="pt_next"]/text()', 'next_url': '//a[@id="pt_next"]/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'jyyxs': {'code': "gbk", 'url': 'http://www.jyyxs.com/', 'title': '', 'conten': '//div[@id="content"]/text()', 'charpt_name': '//div[@class="bookname"]/h1/text()', 'next_page': '//div[@class="bottem2"]/a[4]/text()', 'next_url': '//div[@class="bottem2"]/a[4]/@href', 'next_url_statu': 'fool', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'aixiawx': {'code': "utf-8", 'url': 'http://www.aixiawx.com', 'title': '', 'conten': '//div[@id="content"]/text()', 'charpt_name': '//div[@class="bookname"]/h1/text()', 'next_page': '//div[@class="bottem2"]/a[4]/text()', 'next_url': '//div[@class="bottem2"]/a[4]/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'iqishu': {'code': "utf-8", 'url': 'http://www.iqishu.la', 'title': '', 'conten': '//div[@id="content1"]/text()', 'charpt_name': '//div[@class="txt_cont"]/h1/text()', 'next_page': '//div[@class="txt_lian2"]/a[4]/text()', 'next_url': '//div[@class="txt_lian2"]/a[4]/@href', 'next_url_statu': '', 'novel_dir_url': '//div[@class="txt_lian2"]/a[3]/@href', 'have': '下一页', 'have2': '下一章'},
'7kzw': {'code': "utf-8", 'url': 'https://www.7kzw.com', 'title': '', 'conten': '//div[@id="content"]/text()', 'charpt_name': '//div[@class="bookname"]/h1/text()', 'next_page': '//a[@class="next"]/text()', 'next_url': '//a[@class="next"]/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'xs98': {'code': "utf-8", 'url': 'https://www.xs98.me/', 'title': '', 'conten': '//div[@id="content"]/text()', 'charpt_name': '//div[@class="bookname"]/h1/text()', 'next_page': '//a[@class="next"]/text()', 'next_url': '//a[@class="next"]/@href', 'next_url_statu': '', 'novel_dir_url': '//a[@id="pager_current"]/@href', 'have': '下一页', 'have2': '下一章'},
'xuehongxs': {'code': "gbk", 'url': 'https://www.xuehongxs.com/', 'title': '', 'conten': '//div[@id="content"]/text()', 'charpt_name': '//div[@class="bookname"]/h1/text()', 'next_page': '//div[@class="bottem2"]/a[4]/text()', 'next_url': '//div[@class="bottem2"]/a[4]/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'yueshu': {'code': "gbk", 'url': 'https://www.yueshu.org', 'title': '', 'conten': '//div[@id="htmlContent"]/text()', 'charpt_name': '//div[@id="content"]/h1/text()', 'next_page': '//span[@class="yd_butp1"][2]/a[1]/text()', 'next_url': '//span[@class="yd_butp1"][2]/a[1]/@href', 'next_url_statu': 'fool', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'u33': {'code': "utf-8", 'url': 'http://wap.u33.me/', 'title': '', 'conten': '//div[@id="chaptercontent"]/text()', 'charpt_name': '//span[@class="title"]/text()', 'next_page': '//a[@id="pb_next"]/text()', 'next_url': '//a[@id="pb_next"]/@href', 'next_url_statu': '', 'novel_dir_url': '//a[@id="pb_mulu"]/@href', 'have': '下一页', 'have2': '下一章'},
'qitxt': {'code': "utf-8", 'url': 'https://m.qitxt.com', 'title': '', 'conten': '//p/text()', 'charpt_name': '//div[@id="txt"]/p[1]/text()', 'next_page': '//a[@id="pt_next"]/text()', 'next_url': '//a[@id="pt_next"]/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'keepshu': {'code': "gbk", 'url': 'https://m.keepshu.com/', 'title': '', 'conten': '//div[@id="content"]/text()', 'charpt_name': '//div[@class="title"]/text()', 'next_page': '//a[5]/i[1]/text()', 'next_url': '//a[5]/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'qq717': {'code': "utf-8", 'url': 'https://m.qq717.com/', 'title': '', 'conten': '//div[@id="nr1"]/text()', 'charpt_name': '//div[@id="nr_title"]/text()', 'next_page': '//a[@id="pt_next"]/text()', 'next_url': '//a[@id="pt_next"]/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'7wx': {'code': "gbk", 'url': 'https://www.7wx.org/', 'title': '', 'conten': '//div[@id="TextContent"]//text()', 'charpt_name': '//div[@id="mlfy_main_text"]/h1/text()', 'next_page': '//p[@class="mlfy_page"]/a[3]/text()', 'next_url': '//p[@class="mlfy_page"]/a[3]/@href', 'next_url_statu': '', 'novel_dir_url': '//p[@class="mlfy_page"]/a[2]/@href', 'have': '下一页', 'have2': '下一章'},
'youkong': {'code': "gbk", 'url': 'https://m.youkong.cc/', 'title': '', 'conten': '//p[@id="BookText"]/text()', 'charpt_name': '//span[@class="h-title"]/text()', 'next_page': '//div[@id="toolbar"][2]//div[@class="flex_item"][4]/a/text()', 'next_url': '//div[@id="toolbar"][2]//div[@class="flex_item"][4]/a/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'uu234w': {'code': "UTF-8", 'url': 'http://www.uu234w.net', 'title': '', 'conten': '//div[@class="content"]/text()', 'charpt_name': '//div[@class="dirwraps readbg"]/h1/text()', 'next_page': '//div[@class="readdown"]/a/text()', 'next_url': '//div[@class="readdown"]/a/@href', 'next_url_statu': 'fool', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'duquanben': {'code': "gbk", 'url': 'https://www.duquanben.com', 'title': '', 'conten': '//div[@id="htmlContent"]/text()', 'charpt_name': '//div[@class="h1title"]/h1/text()', 'next_page': '//div[@class="h1title"]//div[@class="chapter_Turnpage"]/a[3]/text()', 'next_url': '//div[@class="h1title"]//div[@class="chapter_Turnpage"]/a[3]/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'}
}

Put these three files in the same folder and run co.py. Later you can add parsing rules to your taste and scrape more sites. Enter the URL of the novel's first chapter and the whole book will be scraped from there.
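To add a new site, append one more entry to the dict in novel.txt. Below is a hypothetical template: the key 'mysite' and every XPath are placeholders to be replaced with values read off the target site's chapter pages, and the key must be a substring of the chapter URLs you enter, because dler picks the rule by checking key in url_s:

'mysite': {'code': 'utf-8',                          # page encoding: 'gbk' or 'utf-8'
           'url': 'http://www.example.com',          # site root, prepended to relative hrefs
           'title': '',                              # book-title xpath (read but not used by downloader)
           'conten': '//div[@id="content"]/text()',  # chapter body text
           'charpt_name': '//h1/text()',             # chapter title
           'next_page': '//a[@id="next"]/text()',    # text of the "next" link ('下一页' / '下一章')
           'next_url': '//a[@id="next"]/@href',      # href of the "next" link
           'next_url_statu': '',                     # set to 'fool' if that href is already a complete url
           'novel_dir_url': '',                      # xpath of the book-directory href, if it must be spliced in
           'have': '下一页', 'have2': '下一章'},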