The earlier version was multi-threaded; this one is coroutine-based and runs in a single thread.
By now this is a whole-site crawler.
Updated: 10.21 / 11.13 / 12.1
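Before the full script, a minimal sketch of the pattern it is built on: a semaphore caps how many coroutines may hold a request at once, and all of them share one aiohttp session on a single event loop. The URLs and the semaphore size here are placeholders, not values from the crawler itself.

# Minimal sketch (placeholder URLs, not part of the crawler):
# the semaphore limits how many requests are in flight on the single-threaded loop.
import asyncio
import aiohttp

async def fetch_one(url, session, sem):
    async with sem:                      # at most N coroutines enter at a time
        async with session.get(url) as resp:
            return await resp.text()

async def demo():
    sem = asyncio.Semaphore(10)          # concurrency cap (placeholder value)
    async with aiohttp.ClientSession() as session:
        urls = ['https://example.com'] * 3   # placeholder URLs
        pages = await asyncio.gather(*(fetch_one(u, session, sem) for u in urls))
        print([len(p) for p in pages])

if __name__ == '__main__':
    asyncio.run(demo())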
from pyquery import PyQuery
import aiohttp
import re
import asyncio
import time
from urllib import parse
import urllib.request
from lxml import etree
import os

stopping = False
# BASE_DIR = os.getcwd()
BASE_DIR = 'H:\\'
base_url = 'https://zh.qqhentai.com/'
artist_hot_url = 'https://zh.qqhentai.com/artists/hot/page/'
sem = asyncio.Semaphore(500)   # cap on concurrent requests
timeout = 15


async def create_dir(path, name):
    # create path\name if it does not exist yet
    tar = os.path.exists(path + '\\' + name)
    if not tar:
        print('new dir !!!!!!!!!', name)
        try:
            os.mkdir(path + '\\' + name)
        except Exception as e:
            print('in creating dir: ', e)


async def fetch(url, session):
    # GET a page and return its text; None on error or non-2xx status
    async with sem:
        try:
            async with session.get(url) as resp:
                # print('url status:{} {}'.format(url, resp.status))
                if resp.status in [200, 201]:
                    data = await resp.text()
                    return data
        except Exception as e:
            print(e)


async def fetch_file(url, session):
    # download a binary file; if the status is bad, swap the jpg/png extension and retry once
    global timeout
    async with sem:
        try:
            async with session.get(url, timeout=timeout) as resp:
                # print('url status:{} {}'.format(url, resp.status))
                if resp.status in [200, 201]:
                    data = await resp.read()
                    return data
                else:
                    tpe = url.split('.')[-1]
                    # print(tpe)
                    if tpe == 'png':
                        url = url.replace('png', 'jpg')
                    elif tpe == 'jpg':
                        url = url.replace('jpg', 'png')
                    # print(url)
                    if resp.status not in [200, 201]:
                        async with session.get(url, timeout=timeout) as resp2:
                            # print('url status:{} {}'.format(url, resp2.status))
                            if resp2.status in [200, 201]:
                                data = await resp2.read()
                                return data
        except Exception as e:
            print(e)


# write down your logic
def extract_urls(html):
    # gallery page thumbnails -> per-image page URLs
    urls = []
    pq = PyQuery(html)
    pq = pq('#thumbnail-container')
    for link in pq.items('a'):
        url = link.attr('href')
        url = base_url + url
        # print(url)
        urls.append(url)
    return urls


def extract_artists_links(html):
    # artist list page -> {sanitized artist name: artist URL}
    pq = PyQuery(html)
    pq = pq('#tag-container')
    # print(pq)
    urls = {}
    for link in pq.items('a'):
        url = link.attr('href')
        name = link.text().split(' (')[0].replace('/', '-').replace('\\', '-').replace('\n', '-').replace('?', '[ask]').replace('|', '[竖杠]').replace('+', '[add]').replace('{', '[left]').replace('}', '[right]').replace(':', '[mao_hao]').replace('"', '[双引]').replace('*', '[star]').replace('>', '[b-than]').replace('<', '[s-than]')
        # print(url)
        urls.update({name: parse.urljoin(base_url, url)})
    print(urls)
    return urls


def extract_comics_links(html):
    # artist's comic list page -> {comic URL: comic name}
    pq = PyQuery(html)
    # print(pq)
    pq = pq('.container.index-container')  # the class attribute contains a space (two classes)!
    urls_names = {}
    for link in pq.items('a'):
        url = link.attr('href')
        name = link('img').attr('alt')
        # print(url, name)
        urls_names.update({parse.urljoin(base_url, url): name})
    # print(urls_names)
    return urls_names


def extract_rec_links(html):
    # recent-comics page -> absolute /g/<id>/ gallery URLs
    pat = r'/g/\d+/'
    urls = re.findall(pat, html)
    urls_f = []
    for url in urls:
        url = parse.urljoin(base_url, url)
        urls_f.append(url)
    print(len(urls), urls)
    print(len(urls_f), urls_f)
    return urls_f


async def get_jpeg_url(url, session, path):
    # per-image page -> real image URL -> download
    html = await fetch(url, session)
    try:
        tree = etree.HTML(html)
        real_url = tree.xpath('//section[@id="image-container"]/a/img/@src')
        await get_jpeg(real_url[0], session, path)
    except Exception as e:
        print(e)


async def get_jpeg(url, session, path):
    global timeout
    num = url.split('/')[-1]
    path_t = path + '\\' + num
    tar = os.path.exists(path_t)
    # print(url)
    if not tar:
        data = await fetch_file(url, session)
        if data:
            # print(type(data))
            with open(path_t, 'wb') as f:
                print(path_t, timeout)
                f.write(data)
        else:
            print('fail on page ', url, 'timeout', timeout)


async def comic_handler(url, path_o, session):
    # download one gallery into path_o\<sanitized name>, skipping pages already on disk
    global timeout
    html = await fetch(url, session)
    pq = PyQuery(html)
    pq = pq('#info')
    # print(pq)
    for i in range(10):
        name = pq('h' + str(i)).text()
        if name:
            break
    pat = r'<div>.*?(\d+).*?</div>'
    page = re.findall(pat, str(pq))[0]
    # print(page)
    name = str(name).replace('/', '-').replace('\\', '-').replace('\n', '-').replace('?', '[ask]').replace('|', '[竖杠]').replace('+', '[add]').replace('{', '[left]').replace('}', '[right]').replace(':', '[mao_hao]').replace('"', '[双引]').replace('*', '[star]').replace('>', '[b-than]').replace('<', '[s-than]')
    if name == '':
        name = 'None'
    print(path_o.split('\\')[-1])
    name = re.compile(r'1-\d+').sub('1-x', name)
    name = re.compile(r'1~\d+').sub('1~x', name)
    # create the target directory
    await create_dir(path_o, name)
    path = path_o + '\\' + name
    print(name, url, path)
    '''html-detail'''
    with open(path + '\\html.txt', 'w', encoding='utf-8') as f:
        f.write(str(pq))
    rec = os.listdir(path)
    try:
        del rec[rec.index('html.txt')]
    except Exception as e:
        print(e)
    for item in rec:
        rec[rec.index(item)] = item.split('.')[0]
    print(sorted(list(map(int, rec))), int(page), len(rec), int(page) - len(rec))
    if int(page) - len(rec) >= 50:
        timeout = 60
        print('timeout', timeout)
    else:
        timeout = 15
        print('timeout', timeout)
    if str(len(rec)) == page:
        return
    else:
        urls_t = extract_urls(html)
        tasks1 = []
        for item in urls_t:
            p = item.split('/')[-2]
            # print(p, item)
            if p in rec:
                # print('in')
                continue
            tasks1.append(get_jpeg_url(item, session, path))
        await asyncio.gather(*tasks1)


async def comic_handler_r(url, root_path, session):
    # like comic_handler, but first groups the gallery under its artist's folder
    global timeout
    html = await fetch(url, session)
    # the artist name sits between the '>' of the /artist/ link and the '<' on the next line
    pat = '/artist/.*?>(.*?)\n<'
    arti = re.findall(pat, html)
    try:
        arti = arti[0].replace('/', '-').replace('\\', '-').replace('\n', '-').replace('?', '[ask]').replace('|', '[竖杠]').replace('+', '[add]').replace('{', '[left]').replace('}', '[right]').replace(':', '[mao_hao]').replace('"', '[双引]').replace('*', '[star]').replace('>', '[b-than]').replace('<', '[s-than]')
    except Exception as e:
        print('in comic handler r:', e)
        arti = 'no_name'
    if arti and arti != 'con':
        await create_dir(root_path, arti)
        path = root_path + '\\' + arti
    else:
        await create_dir(root_path, 'no_name')
        path = root_path + '\\no_name'
    await comic_handler(url, path, session)


async def page_handler(comics_urls, arti_path, session):
    for comic_link in comics_urls:  # choose one of his works
        print(time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())))
        # try:
        await comic_handler(comic_link, arti_path, session)
        # except Exception as e:
        #     print(e)


async def unique_comic_names(arti_base_link, session):
    # walk the artist's chinese pages until one fails; return the count of distinct names and the {url: name} map
    i = 0
    urls_names = {}
    while True:
        try:
            html = await fetch(arti_base_link + 'chinese/page/' + str(i), session)
            urls_t_names_t = extract_comics_links(html)
            urls_names.update(urls_t_names_t)
            i += 1
        except Exception as e:
            print('in unique', e)
            break
    return len(list(set([s.lower().replace(' ', '') for s in urls_names.values()]))), urls_names


async def check_arti(num, arti_path, urls_names, session):
    # compare what is on disk for this artist with what the site reports;
    # re-download incomplete galleries; return True if the artist is complete
    rec = os.listdir(arti_path)
    print('need:', num, 'now comics:', len(rec))
    if len(rec) == 0:
        return False
    if len(rec) >= num:
        for name in rec:
            comic_path = arti_path + '\\' + name
            recc = os.listdir(comic_path)
            with open(comic_path + '\\html.txt', 'r', encoding='utf8') as f:
                t = f.read()
            pat = '共 (.*?) 頁'
            patt = r'<h\d>(.*?)</h\d>'
            try:
                k = int(re.findall(pattern=pat, string=t)[0])
            except Exception as e:
                print('in', name, e)
                k = int(re.findall(pattern='共(.*?)頁', string=t)[0])
            kk = re.findall(pattern=patt, string=t)
            print('need:', k, 'now:', len(recc) - 1, name)
            f = 0
            if len(recc) - 1 < k:
                for n in kk:
                    if f:
                        break
                    for u, nn in urls_names.items():
                        if f:
                            break
                        if n == nn:
                            print(time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())))
                            await comic_handler(u, arti_path, session)
                            f += 1
                print(f)
                if not f:
                    return False
    else:
        return False
    return True


async def main(start):  # 'start' (the loop passed in below) is not used
    async with aiohttp.ClientSession() as session:
        '''
        choose a crawl mode below
        ||
        \/
        '''
        '''get-all-site`s-work'''
        # lock = Lock()
        # for i in range(331682, 1, -1):
        #     await lock.acquire()
        #     await comic_handler('https://nyahentai.club/g/'+str(i)+'/', session)
        #     lock.release()

        '''only-one'''
        # await comic_handler('https://zh.qqhentai.com/g/333292/', BASE_DIR, session)

        '''get specific artist'''
        # arti_name = 'ratatatat74'
        # arti_base_link = 'https://zh.yyhentai.com/artist/ratatatat74-mr-skull/chinese/'
        # await create_dir(BASE_DIR, 'all_co')
        # root_path = BASE_DIR + '\\all_co'
        # await create_dir(root_path, arti_name)
        # arti_path = root_path + '\\' + arti_name
        # i = 1
        # while (True):  # choose one of his pages
        #     try:
        #         html_all_comics = await fetch(arti_base_link + 'page/' + str(i), session)
        #         print(arti_base_link + 'page/' + str(i))
        #         comics_urls = extract_comics_links(html_all_comics)
        #
        #         await page_handler(comics_urls, arti_path, session)
        #         # for comic_link in comics_urls:  # choose one of his works
        #         #     await comic_handler(comic_link, arti_path, session)
        #         i += 1
        #
        #     except Exception as e:
        #         print('fail or end on artist', arti_name, e)
        #         break

        '''get recent ch comics'''
        await create_dir(BASE_DIR, 'all_co')
        root_path = BASE_DIR + '\\all_co'
        for j in range(1, 1000):
            print('pg:', j)
            rec_html = await fetch(base_url + 'language/chinese/page/' + str(j), session)
            comic_urls = extract_rec_links(rec_html)
            for url in comic_urls:
                await comic_handler_r(url, root_path, session)

        # '''get-all-site`s-hot-artists-work'''
        # await create_dir(BASE_DIR, 'all_co')
        # root_path = BASE_DIR + '\\all_co'
        #
        # for j in range(15, 180):  # artists page
        #     print(j)
        #     html_all_artists = await fetch(artist_hot_url + str(j), session)
        #
        #     artists_base_urls = extract_artists_links(html_all_artists)  # one page
        #
        #     for arti_name, arti_base_link in artists_base_urls.items():  # choose one artist
        #
        #         await create_dir(root_path, arti_name)
        #         arti_path = root_path + '\\' + arti_name
        #
        #         # try:
        #
        #         k, comics_urls_names = await unique_comic_names(arti_base_link, session)
        #         print(arti_name, 'artist works', k)
        #
        #         if await check_arti(int(k), arti_path, comics_urls_names, session):
        #             print('next artist===============================================================')
        #             continue
        #
        #         await page_handler(comics_urls_names.keys(), arti_path, session)
        #         print('next artist===============================================================')
        #
        #         # except Exception as e:
        #         # print('fail or end on artist', arti_name, e)
        #         # break


if __name__ == "__main__":
    # print(BASE_DIR)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop))
    # tha_main()
# =========================================================
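One note on the code: the same Windows-filename sanitizing chain of .replace() calls appears three times (artist names in extract_artists_links, gallery names in comic_handler, and again in comic_handler_r). A small helper would keep the copies in sync; this is only a sketch, and the name sanitize_name is mine, not something the script defines.

# Optional refactor sketch (sanitize_name is a hypothetical helper, not in the
# original script): one table instead of three identical .replace() chains.
_BAD_CHARS = {
    '/': '-', '\\': '-', '\n': '-', '?': '[ask]', '|': '[竖杠]', '+': '[add]',
    '{': '[left]', '}': '[right]', ':': '[mao_hao]', '"': '[双引]',
    '*': '[star]', '>': '[b-than]', '<': '[s-than]',
}

def sanitize_name(raw):
    # swap every character Windows rejects in a file name for the same
    # markers the script already uses
    for bad, repl in _BAD_CHARS.items():
        raw = raw.replace(bad, repl)
    return raw

Each call site would then shrink to name = sanitize_name(name).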