A simple Python novel scraper: scrape whatever you like, with a txt config file. Day 01 (half-finished, scratch that) … Day 02 (finished). Book: 史上第一剑修, site: 笔趣阁 (the URL is in the code).
#####################################################
# import
import requests
import os
from fake_useragent import UserAgent
from lxml import etree
import random
# --------------------------------------------------
# UA: pick a random Chrome User-Agent for every run
ua = UserAgent().chrome
heads = {'User-Agent': ua}
# --------------------------------------------------
# URL: book index page and site root
url_s = 'https://www.bqg66.com/book/131950/'
url_h = 'https://www.bqg66.com'
# ---------------------------------------------------
# random ip addr (proxy helper, defined but not used yet)
def random_ip():
    a1 = str(random.randint(11, 240))
    a2 = str(random.randint(11, 240))
    a3 = str(random.randint(11, 240))
    a4 = str(random.randint(11, 240))
    ip = '%s.%s.%s.%s:8080' % (a1, a2, a3, a4)
    ip_addr_1 = {'https': ip}
    return ip_addr_1
#####################################################
# fetch the book index page, retrying until HTTP 200
p1_net_stu = False
while p1_net_stu == False:
    page_1 = requests.get(url=url_s, headers=heads)
    if page_1.status_code == 200:
        p1_net_stu = True
p1_net_stu = False
page_1 = page_1.text.encode('utf-8')
tree = etree.HTML(page_1)
# book title and author
title = []
title_tree = tree.xpath('//div[@class="w100"]/h1/text()')[0]
author_tree = tree.xpath('//div[@class="w100 dispc"]/span/a/text()')[0]
title.append(title_tree)
title.append(author_tree)
# chapter urls
chapter_url_lib = []
# xpath: //div[@class="container border3-2 mt8 mb20"]/div[2]/a/@href
chapter_urls = tree.xpath('//div[@class="container border3-2 mt8 mb20"]/div[2]/a/@href')
for per_url in chapter_urls:
    chapter_url_lib.append(url_h + per_url)
# chapter text
p3_net_status = False
file = open('./史上第一剑修.text', 'a', encoding='utf-8')
for dl_url in chapter_url_lib:
    # retry each chapter page until HTTP 200
    while p3_net_status == False:
        texts = requests.get(url=dl_url, headers=heads)
        if texts.status_code == 200:
            p3_net_status = True
    p3_net_status = False
    texts = texts.text
    texts_tree = etree.HTML(texts)
    textc = texts_tree.xpath('//article[@id="article"]/p/text()')
    for ch in textc:
        file.write(ch)
        file.write('\n')
    print(dl_url + ' ' + 'ok')
file.close()

(The code was originally posted as a screenshot; open it in a Notepad-style editor to view.)
Day 02, revised version: adds a request timeout with retry and error handling (if a page fails to download, it keeps retrying until it succeeds), so the whole book is scraped with no missing chapters and no duplicates.
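The core of the fix is a retry-on-timeout loop wrapped around every requests.get call. Here is a minimal sketch of that pattern on its own; fetch and max_retries are my own names, and the bounded retry count is my addition (the script below simply retries forever):

import requests

def fetch(url, heads, timeout=5, max_retries=10):
    # retry on timeouts and connection errors instead of crashing,
    # but give up after max_retries so a dead link cannot loop forever
    for attempt in range(max_retries):
        try:
            resp = requests.get(url=url, headers=heads, timeout=timeout)
            if resp.status_code == 200:
                return resp
        except requests.exceptions.RequestException:
            print("timeout " + str(timeout))
    raise RuntimeError("failed to fetch " + url)

The day 02 script below inlines this same loop three times: once for the index page, once for each chapter page, and once for a chapter's extra pages.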
#####################################################
# import
import requests
import os
from fake_useragent import UserAgent
from lxml import etree
import random
# --------------------------------------------------
# UA: random Chrome User-Agent
ua = UserAgent().chrome
heads = {'User-Agent': ua}
# --------------------------------------------------
# URL: book index page and site root
url_s = 'https://www.bqg66.com/book/131950/'
url_h = 'https://www.bqg66.com'
# --------------------------------------------------
# timeout (seconds) for every request
timeout = 5
# ---------------------------------------------------
# random ip addr (proxy helper, still unused)
def random_ip():
    a1 = str(random.randint(11, 240))
    a2 = str(random.randint(11, 240))
    a3 = str(random.randint(11, 240))
    a4 = str(random.randint(11, 240))
    ip = '%s.%s.%s.%s:8080' % (a1, a2, a3, a4)
    ip_addr_1 = {'https': ip}
    return ip_addr_1
#####################################################
# fetch the book index page, retrying on timeout until HTTP 200
p1_net_stu = False
while p1_net_stu == False:
    try:
        page_1 = requests.get(url=url_s, headers=heads, timeout=timeout)
        if page_1.status_code == 200:
            p1_net_stu = True
    except requests.exceptions.RequestException as p1ex:
        print("timeout " + str(timeout))
        continue
p1_net_stu = False
page_1 = page_1.text.encode('utf-8')
tree = etree.HTML(page_1)
# book title and author
title = []
title_tree = tree.xpath('//div[@class="w100"]/h1/text()')[0]
author_tree = tree.xpath('//div[@class="w100 dispc"]/span/a/text()')[0]
title.append(title_tree)
title.append(author_tree)
# chapter urls
chapter_url_lib = []
chapter_urls = tree.xpath('//div[@class="container border3-2 mt8 mb20"]/div[2]/a/@href')
for per_url in chapter_urls:
    chapter_url_lib.append(url_h + per_url)
# chapter text
p3_net_status = False
p3_net_p_sta = False
text_p_url = []
file = open('./史上第一剑修.text', 'a', encoding='utf-8')
for dl_url in chapter_url_lib:
    # first page of the chapter, retrying on timeout
    while p3_net_status == False:
        try:
            texts = requests.get(url=dl_url, headers=heads, timeout=timeout)
            if texts.status_code == 200:
                p3_net_status = True
        except requests.exceptions.RequestException as p2ex:
            print("timeout " + str(timeout))
            continue
    p3_net_status = False
    texts = texts.text
    texts_tree = etree.HTML(texts)
    textc = texts_tree.xpath('//article[@id="article"]/p/text()')
    for ch in textc:
        file.write(ch)
        file.write('\n')
    # if the chapter is split over several pages, also follow the "下一页" link
    if '下一页' in (texts_tree.xpath('//div[@class="read_nav reader-bottom"]/a[3]/text()')[0]):
        text_p_url = texts_tree.xpath('//div[@class="read_nav reader-bottom"]/a[@id="next_url"]/@href')
        for p_url in text_p_url:
            while p3_net_p_sta == False:
                try:
                    textsp = requests.get(url=url_h + p_url, headers=heads, timeout=timeout)
                    if textsp.status_code == 200:
                        p3_net_p_sta = True
                    else:
                        print("over time")
                except requests.exceptions.RequestException as p3ex:
                    print("timeout " + str(timeout))
                    continue
            textsp = textsp.text
            textsp_tree = etree.HTML(textsp)
            textcp = textsp_tree.xpath('//article[@id="article"]/p/text()')
            for chp in textcp:
                file.write(chp)
                file.write('\n')
            p3_net_p_sta = False
    print(dl_url + ' ok')
file.close()

(Result screenshot.)
(The code is also in the screenshots; open them with Notepad to view.)
The code surely still has many problems and shortcomings; comments, corrections, and improvements are very welcome!
Here is a scraper for another novel site: give it a URL and it scrapes that site's novel. Site name: 笔趣趣, url: …biququ.info. It generates the output file name automatically and downloads chapter by chapter.
#####################################################
# import
import requests
import os
from fake_useragent import UserAgent
from lxml import etree
import random
# --------------------------------------------------
# UA: random Chrome User-Agent
ua = UserAgent().chrome
heads = {'User-Agent': ua}
# --------------------------------------------------
# URL: the input is the book index page, url_h is the site root
url_s = input("url!:")  # e.g. 'http://www.biququ.info/html/745/'
url_h = 'http://www.biququ.info'
# --------------------------------------------------
# timeout
timeout = 5
#####################################################
def gethtml(url_s, heads, timeout):
    # keep retrying the request until an HTTP 200 response comes back
    net_stu = False
    while net_stu == False:
        try:
            html = requests.get(url=url_s, headers=heads, timeout=timeout)
            if html.status_code == 200:
                net_stu = True
                print("->")
        except requests.exceptions.RequestException as hte:
            print("timeout :" + str(timeout))
            continue
    net_stu = False
    return html
####################################################
# book index page
res_1 = gethtml(url_s=url_s, heads=heads, timeout=timeout)
page_1 = res_1.text
tree = etree.HTML(page_1)
# chapter urls
chapter_url_lib = []
chapter_urls = tree.xpath('//div[@id="list"]//dd/a/@href')
for per_url in chapter_urls:
    chapter_url_lib.append(url_h + per_url)
# chapter text: the output file is named after the book title
filename = tree.xpath('//div[@id="info"]/h1/text()')
file = open('./' + filename[0] + '.txt', 'a', encoding='utf-8')
for dl_url in chapter_url_lib:
    content = gethtml(url_s=dl_url, heads=heads, timeout=timeout)
    c_text = content.text
    ct_tree = etree.HTML(c_text)
    cont_text = ct_tree.xpath('//div[@id="content"]/p/text()')
    cp = ct_tree.xpath('//div[@class="bookname"]/h1/text()')
    file.write(cp[0])
    file.write("\n")
    for teext in cont_text:
        file.write(teext)
        file.write("\n")
    file.write("\n\n\n\n\n\n")
    print(cp[0])
file.close()

New day: added a configuration file, so the scraper can crawl according to the rules in a txt config file. The main module:
#####################################################
# import
import requests
import os
from fake_useragent import UserAgent
from lxml import etree
import time
# --------------------------------------------------
# UA
ua = UserAgent().chrome
heads = {'User-Agent': ua}
# --------------------------------------------------
# timeout
timeout = 5
#####################################################
class novel_set:
    def gethtml(url_s, heads, timeout, sleep):
        # wait `sleep` seconds between attempts and retry until HTTP 200
        net_stu = False
        while net_stu == False:
            time.sleep(sleep)
            try:
                html = requests.get(url=url_s, headers=heads, timeout=timeout)
                if html.status_code == 200:
                    net_stu = True
                    print("->")
            except requests.exceptions.RequestException as hte:
                print("timeout :" + str(timeout))
                continue
        net_stu = False
        return html
    ####################################################
    def downloader(filename, code, url_s, next_page, next_url, charpt_name_xpath, text_xpath,
                   novel_dir_url, next_url_statu, have, have2, url_h, timeout, gap):
        # output file
        file = open('./' + filename + '.txt', 'a', encoding='utf-8')
        cp_url = url_s
        while True:
            content = novel_set.gethtml(url_s=cp_url, heads=heads, timeout=timeout, sleep=gap)
            content.encoding = code
            c_text = content.text
            ct_tree = etree.HTML(c_text)
            # the text of the "next" link decides whether to keep going
            sg = ct_tree.xpath(next_page)
            sgc = sg[0]
            print(sgc)
            if have in sgc or have2 in sgc:
                if have2 in sgc:
                    # "下一章": chapter boundary, insert blank lines as a separator
                    file.write("\n\n\n\n\n")
                cp_title = ct_tree.xpath(charpt_name_xpath)
                file.write(cp_title[0])
                file.write("\n")
                print(cp_title[0])
                print("ff")
                cont_text = ct_tree.xpath(text_xpath)
                for teext in cont_text:
                    if teext == '\n':
                        continue
                    if code == 'gbk':
                        file.write(teext)
                    else:
                        file.write(teext.encode('utf-8', 'ignore').decode('utf-8', 'ignore'))
                if len(novel_dir_url) > 0:  # does the book-directory path need to be spliced into the next url?
                    dir_url = ct_tree.xpath(novel_dir_url)
                    cp_url = url_h + dir_url[0] + ct_tree.xpath(next_url)[0]
                elif 'fool' in next_url_statu:
                    # the next href is already a complete url
                    cp_url = ct_tree.xpath(next_url)[0]
                else:
                    cp_url = url_h + ct_tree.xpath(next_url)[0]
            else:
                # no "next page" / "next chapter" link left: the book is finished
                break
    #########################################################################################################################
    def dler(url_s, novel_rule, filename):
        gap = 1
        i = 0
        sitelist = []
        # pick the rule whose key appears in the input url
        for key in novel_rule:
            sitelist.append(key)
            if key in url_s:
                print(key)
                site = key
                i = 1
                break
        if i == 1:
            i = 0
            print("ok")
        else:
            print("no such site")
            return  # no matching rule, stop here
        url_h = novel_rule[site]['url']                      # site root, e.g. 'http://wap.qiqint.la'
        title_xpath = novel_rule[site]['title']              # book title, e.g. '//div[@class="block_txt2"]/h2'
        text_xpath = novel_rule[site]['conten']              # chapter content, e.g. '//div[@id="nr1"]/text()'
        charpt_name_xpath = novel_rule[site]['charpt_name']  # chapter name, e.g. '//h1/text()'
        novel_dir_url = novel_rule[site]['novel_dir_url']
        have = novel_rule[site]['have']                      # '下一页'
        have2 = novel_rule[site]['have2']                    # '下一章'
        next_url = novel_rule[site]['next_url']              # e.g. '//a[@id="pb_next"]/@href'
        next_page = novel_rule[site]['next_page']            # e.g. '//a[@id="pb_next"]/text()'
        next_url_statu = novel_rule[site]['next_url_statu']
        code = novel_rule[site]['code']
        novel_set.downloader(filename, code, url_s, next_page, next_url, charpt_name_xpath, text_xpath,
                             novel_dir_url, next_url_statu, have, have2, url_h, timeout, gap)
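The least obvious part of downloader is how the next page's URL is built from the rule entry. Here is a minimal standalone sketch of that decision, assuming an lxml element tree for the current page and one rule dict from the config below; resolve_next_url is a hypothetical name, not part of the original code:

def resolve_next_url(ct_tree, rule, url_h):
    # href of the "next page / next chapter" link on the current page
    raw = ct_tree.xpath(rule['next_url'])[0]
    if len(rule['novel_dir_url']) > 0:
        # some sites link relative to the book's directory page, so the
        # directory path is spliced in between the site root and the href
        dir_part = ct_tree.xpath(rule['novel_dir_url'])[0]
        return url_h + dir_part + raw
    if 'fool' in rule['next_url_statu']:
        # 'fool' marks sites whose hrefs are already complete URLs
        return raw
    # default: the href is site-relative, so prepend the site root
    return url_h + raw

In downloader this same three-way choice sets cp_url at the end of every loop iteration.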
The launcher:

# import all1.no
from all1 import no
# read the scraping rules from the config file
config = open('./novel.txt', 'r+', encoding='utf-8')
novel_rule = eval(config.read())
config.close()
# ask for the first chapter url and the output file name, then start downloading
url_s = input("url_s")
filename = input("filename")
no.novel_set.dler(url_s=url_s, filename=filename, novel_rule=novel_rule)
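Because the launcher loads the rule file with eval, anything written in novel.txt would be executed as Python. Since the rules are just nested dicts of strings, a slightly safer loading sketch uses the standard-library ast module instead (a suggestion, not what the launcher above actually does):

import ast

# ast.literal_eval only accepts Python literals (dicts, lists, strings, numbers),
# so a broken or malicious novel.txt cannot run arbitrary code
with open('./novel.txt', 'r', encoding='utf-8') as config:
    novel_rule = ast.literal_eval(config.read())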
The config file, novel.txt (labelled "JSON format" in the original post, but it is actually a Python dict literal, since the launcher reads it with eval), with one rule entry per site:

{
'fenghuaju': {'code': "gbk", 'url': 'http://www.fenghuaju.cc', 'title': '//*[@id="wrapper"]/div[4]/div/div[1]/a[3]/text()', 'conten': '//*[@id="content"]/text()', 'charpt_name': '//div[@class="bookname"]/h1/text()', 'next_page': '//div[@class="bottem1"]/a[3]/text()', 'next_url': '//div[@class="bottem1"]/a[3]/@href', 'next_url_statu': '', 'novel_dir_url': '//div[@class="bottem1"]/a[2]/@href', 'have': '下一页', 'have2': '下一章'},
'xinxs84': {'code': "gbk", 'url': 'http://m.xinxs84.com', 'title': '//span[@class="title"]/text()', 'conten': '//div[@id="chaptercontent"]/text()', 'charpt_name': '//span[@class="title"]/text()', 'next_page': '//a[@id="pt_next"]/text()', 'next_url': '//a[@id="pt_next"]/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'jyyxs': {'code': "gbk", 'url': 'http://www.jyyxs.com/', 'title': '', 'conten': '//div[@id="content"]/text()', 'charpt_name': '//div[@class="bookname"]/h1/text()', 'next_page': '//div[@class="bottem2"]/a[4]/text()', 'next_url': '//div[@class="bottem2"]/a[4]/@href', 'next_url_statu': 'fool', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'aixiawx': {'code': "utf-8", 'url': 'http://www.aixiawx.com', 'title': '', 'conten': '//div[@id="content"]/text()', 'charpt_name': '//div[@class="bookname"]/h1/text()', 'next_page': '//div[@class="bottem2"]/a[4]/text()', 'next_url': '//div[@class="bottem2"]/a[4]/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'iqishu': {'code': "utf-8", 'url': 'http://www.iqishu.la', 'title': '', 'conten': '//div[@id="content1"]/text()', 'charpt_name': '//div[@class="txt_cont"]/h1/text()', 'next_page': '//div[@class="txt_lian2"]/a[4]/text()', 'next_url': '//div[@class="txt_lian2"]/a[4]/@href', 'next_url_statu': '', 'novel_dir_url': '//div[@class="txt_lian2"]/a[3]/@href', 'have': '下一页', 'have2': '下一章'},
'7kzw': {'code': "utf-8", 'url': 'https://www.7kzw.com', 'title': '', 'conten': '//div[@id="content"]/text()', 'charpt_name': '//div[@class="bookname"]/h1/text()', 'next_page': '//a[@class="next"]/text()', 'next_url': '//a[@class="next"]/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'xs98': {'code': "utf-8", 'url': 'https://www.xs98.me/', 'title': '', 'conten': '//div[@id="content"]/text()', 'charpt_name': '//div[@class="bookname"]/h1/text()', 'next_page': '//a[@class="next"]/text()', 'next_url': '//a[@class="next"]/@href', 'next_url_statu': '', 'novel_dir_url': '//a[@id="pager_current"]/@href', 'have': '下一页', 'have2': '下一章'},
'xuehongxs': {'code': "gbk", 'url': 'https://www.xuehongxs.com/', 'title': '', 'conten': '//div[@id="content"]/text()', 'charpt_name': '//div[@class="bookname"]/h1/text()', 'next_page': '//div[@class="bottem2"]/a[4]/text()', 'next_url': '//div[@class="bottem2"]/a[4]/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'yueshu': {'code': "gbk", 'url': 'https://www.yueshu.org', 'title': '', 'conten': '//div[@id="htmlContent"]/text()', 'charpt_name': '//div[@id="content"]/h1/text()', 'next_page': '//span[@class="yd_butp1"][2]/a[1]/text()', 'next_url': '//span[@class="yd_butp1"][2]/a[1]/@href', 'next_url_statu': 'fool', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'u33': {'code': "utf-8", 'url': 'http://wap.u33.me/', 'title': '', 'conten': '//div[@id="chaptercontent"]/text()', 'charpt_name': '//span[@class="title"]/text()', 'next_page': '//a[@id="pb_next"]/text()', 'next_url': '//a[@id="pb_next"]/@href', 'next_url_statu': '', 'novel_dir_url': '//a[@id="pb_mulu"]/@href', 'have': '下一页', 'have2': '下一章'},
'qitxt': {'code': "utf-8", 'url': 'https://m.qitxt.com', 'title': '', 'conten': '//p/text()', 'charpt_name': '//div[@id="txt"]/p[1]/text()', 'next_page': '//a[@id="pt_next"]/text()', 'next_url': '//a[@id="pt_next"]/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'keepshu': {'code': "gbk", 'url': 'https://m.keepshu.com/', 'title': '', 'conten': '//div[@id="content"]/text()', 'charpt_name': '//div[@class="title"]/text()', 'next_page': '//a[5]/i[1]/text()', 'next_url': '//a[5]/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'qq717': {'code': "utf-8", 'url': 'https://m.qq717.com/', 'title': '', 'conten': '//div[@id="nr1"]/text()', 'charpt_name': '//div[@id="nr_title"]/text()', 'next_page': '//a[@id="pt_next"]/text()', 'next_url': '//a[@id="pt_next"]/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'7wx': {'code': "gbk", 'url': 'https://www.7wx.org/', 'title': '', 'conten': '//div[@id="TextContent"]//text()', 'charpt_name': '//div[@id="mlfy_main_text"]/h1/text()', 'next_page': '//p[@class="mlfy_page"]/a[3]/text()', 'next_url': '//p[@class="mlfy_page"]/a[3]/@href', 'next_url_statu': '', 'novel_dir_url': '//p[@class="mlfy_page"]/a[2]/@href', 'have': '下一页', 'have2': '下一章'},
'youkong': {'code': "gbk", 'url': 'https://m.youkong.cc/', 'title': '', 'conten': '//p[@id="BookText"]/text()', 'charpt_name': '//span[@class="h-title"]/text()', 'next_page': '//div[@id="toolbar"][2]//div[@class="flex_item"][4]/a/text()', 'next_url': '//div[@id="toolbar"][2]//div[@class="flex_item"][4]/a/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'uu234w': {'code': "UTF-8", 'url': 'http://www.uu234w.net', 'title': '', 'conten': '//div[@class="content"]/text()', 'charpt_name': '//div[@class="dirwraps readbg"]/h1/text()', 'next_page': '//div[@class="readdown"]/a/text()', 'next_url': '//div[@class="readdown"]/a/@href', 'next_url_statu': 'fool', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'},
'duquanben': {'code': "gbk", 'url': 'https://www.duquanben.com', 'title': '', 'conten': '//div[@id="htmlContent"]/text()', 'charpt_name': '//div[@class="h1title"]/h1/text()', 'next_page': '//div[@class="h1title"]//div[@class="chapter_Turnpage"]/a[3]/text()', 'next_url': '//div[@class="h1title"]//div[@class="chapter_Turnpage"]/a[3]/@href', 'next_url_statu': '', 'novel_dir_url': '', 'have': '下一页', 'have2': '下一章'}
}

Put these three files in the same folder and run co.py. Later you can add parsing rules to your taste and scrape more sites. Enter the URL of the novel's first chapter and the whole book will be scraped from there.
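To add a new site, append one more entry to the dict in novel.txt. Below is a hypothetical template: the key 'mysite' and every XPath are placeholders to be replaced with values read off the target site's chapter pages, and the key must be a substring of the chapter URLs you enter, because dler picks the rule by checking key in url_s:

'mysite': {'code': 'utf-8',                          # page encoding: 'gbk' or 'utf-8'
           'url': 'http://www.example.com',          # site root, prepended to relative hrefs
           'title': '',                              # book-title xpath (read but not used by downloader)
           'conten': '//div[@id="content"]/text()',  # chapter body text
           'charpt_name': '//h1/text()',             # chapter title
           'next_page': '//a[@id="next"]/text()',    # text of the "next" link ('下一页' / '下一章')
           'next_url': '//a[@id="next"]/@href',      # href of the "next" link
           'next_url_statu': '',                     # set to 'fool' if that href is already a complete url
           'novel_dir_url': '',                      # xpath of the book-directory href, if it must be spliced in
           'have': '下一页', 'have2': '下一章'},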