Python coroutine async aiohttp crawler: 喵绅士, latest crawler as of 2020.8

    Tech · 2025-02-05

    All I will say is: those who know, know.

    The earlier version was multi-threaded; this one uses coroutines, so everything runs on a single thread.
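    The whole script rests on one pattern: a shared aiohttp ClientSession, an asyncio.Semaphore that caps how many requests are in flight, and asyncio.gather to run the downloads concurrently on that single thread. Below is a minimal, self-contained sketch of just that pattern; the URLs and the limit of 3 are placeholders for illustration, not values from the crawler.

import asyncio
import aiohttp

sem = asyncio.Semaphore(3)  # cap concurrent requests (the crawler below uses 500)

async def fetch(url, session):
    # All coroutines share one thread; they yield to each other at every await,
    # and the semaphore keeps at most 3 requests open at the same time.
    async with sem:
        async with session.get(url) as resp:
            if resp.status == 200:
                return await resp.text()

async def main():
    async with aiohttp.ClientSession() as session:
        # placeholder URLs, only meant to show the fan-out
        urls = ['https://example.com/page/' + str(i) for i in range(1, 4)]
        pages = await asyncio.gather(*(fetch(u, session) for u in urls))
        print([len(p) if p else 0 for p in pages])

asyncio.run(main())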

    The world is at your feet,

    and the "means of production" lie just ahead.

    In every still, late night,

    the things you know (and that know you) are still there to keep you company.

    This is now a full-site crawler.
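    "Full site" here means walking the numeric gallery IDs one by one, which is what the commented-out get-all-site`s-work branch inside main() further down does. A rough sketch of that mode, reusing comic_handler, base_url and BASE_DIR from the script below (the starting ID is simply the one that commented-out branch happens to use):

async def crawl_whole_site(session, start_id=331682):
    # Walk gallery IDs downward; each base_url + 'g/<id>/' page is one comic,
    # and comic_handler() downloads it straight into BASE_DIR.
    for i in range(start_id, 1, -1):
        await comic_handler(base_url + 'g/' + str(i) + '/', BASE_DIR, session)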


    Updated 10.21, 11.13 and 12.1.

from pyquery import PyQuery
import aiohttp
import re
import asyncio
import time
from urllib import parse
import urllib.request
from lxml import etree
import os

stopping = False
# BASE_DIR = os.getcwd()
BASE_DIR = 'H:\\'
base_url = 'https://zh.qqhentai.com/'
artist_hot_url = 'https://zh.qqhentai.com/artists/hot/page/'
sem = asyncio.Semaphore(500)
timeout = 15


async def create_dir(path, name):
    # create path\name if it does not exist yet
    tar = os.path.exists(path + '\\' + name)
    if not tar:
        print('new dir !!!!!!!!!', name)
        try:
            os.mkdir(path + '\\' + name)
        except Exception as e:
            print('in creating dir: ', e)


async def fetch(url, session):
    # fetch a page as text, with the semaphore capping concurrency
    async with sem:
        try:
            async with session.get(url) as resp:
                # print('url status:{} {}'.format(url, resp.status))
                if resp.status in [200, 201]:
                    data = await resp.text()
                    return data
        except Exception as e:
            print(e)


async def fetch_file(url, session):
    # fetch a binary file; on a bad status retry once with the image extension swapped
    global timeout
    async with sem:
        try:
            async with session.get(url, timeout=timeout) as resp:
                # print('url status:{} {}'.format(url, resp.status))
                if resp.status in [200, 201]:
                    data = await resp.read()
                    return data
                else:
                    tpe = url.split('.')[-1]
                    # print(tpe)
                    if tpe == 'png':
                        url = url.replace('png', 'jpg')
                    elif tpe == 'jpg':
                        url = url.replace('jpg', 'png')
                    # print(url)
                    if resp.status not in [200, 201]:
                        async with session.get(url, timeout=timeout) as resp2:
                            # print('url status:{} {}'.format(url, resp2.status))
                            if resp2.status in [200, 201]:
                                data = await resp2.read()
                                return data
        except Exception as e:
            print(e)


# write down your logic
def extract_urls(html):
    # page links of a single comic
    urls = []
    pq = PyQuery(html)
    pq = pq('#thumbnail-container')
    for link in pq.items('a'):
        url = link.attr('href')
        url = base_url + url
        # print(url)
        urls.append(url)
    return urls


def extract_artists_links(html):
    # artist name -> artist page url
    pq = PyQuery(html)
    pq = pq('#tag-container')
    # print(pq)
    urls = {}
    for link in pq.items('a'):
        url = link.attr('href')
        name = link.text().split(' (')[0].replace('/', '-').replace('\\', '-').replace('\n', '-').replace('?', '[ask]').replace('|', '[竖杠]').replace('+', '[add]').replace('{', '[left]').replace('}', '[right]').replace(':', '[mao_hao]').replace('"', '[双引]').replace('*', '[star]').replace('>', '[b-than]').replace('<', '[s-than]')
        # print(url)
        urls.update({name: parse.urljoin(base_url, url)})
    print(urls)
    return urls


def extract_comics_links(html):
    pq = PyQuery(html)
    # print(pq)
    pq = pq('.container.index-container')  # the class attribute contains a space!
    urls_names = {}
    for link in pq.items('a'):
        url = link.attr('href')
        name = link('img').attr('alt')
        # print(url, name)
        urls_names.update({parse.urljoin(base_url, url): name})
    # print(urls_names)
    return urls_names


def extract_rec_links(html):
    # gallery links on a "recent" listing page
    pat = r'/g/\d+/'
    urls = re.findall(pat, html)
    urls_f = []
    for url in urls:
        url = parse.urljoin(base_url, url)
        urls_f.append(url)
    print(len(urls), urls)
    print(len(urls_f), urls_f)
    return urls_f


async def get_jpeg_url(url, session, path):
    # resolve the real image url on a single page, then download it
    html = await fetch(url, session)
    try:
        tree = etree.HTML(html)
        real_url = tree.xpath('//section[@id="image-container"]/a/img/@src')
        await get_jpeg(real_url[0], session, path)
    except Exception as e:
        print(e)


async def get_jpeg(url, session, path):
    global timeout
    num = url.split('/')[-1]
    path_t = path + '\\' + num
    tar = os.path.exists(path_t)
    # print(url)
    if not tar:
        data = await fetch_file(url, session)
        if data:
            # print(type(data))
            with open(path_t, 'wb') as f:
                print(path_t, timeout)
                f.write(data)
        else:
            print('fail on page ', url, 'timeout', timeout)


async def comic_handler(url, path_o, session):
    # download one comic: read its info block, create its folder,
    # then fetch every page that is not on disk yet
    global timeout
    html = await fetch(url, session)
    pq = PyQuery(html)
    pq = pq('#info')
    # print(pq)
    for i in range(10):
        name = pq('h' + str(i)).text()
        if name:
            break
    pat = r'<div>.*?(\d+).*?</div>'
    page = re.findall(pat, str(pq))[0]
    # print(page)
    name = str(name).replace('/', '-').replace('\\', '-').replace('\n', '-').replace('?', '[ask]').replace('|', '[竖杠]').replace('+', '[add]').replace('{', '[left]').replace('}', '[right]').replace(':', '[mao_hao]').replace('"', '[双引]').replace('*', '[star]').replace('>', '[b-than]').replace('<', '[s-than]')
    if name == '':
        name = 'None'
    print(path_o.split('\\')[-1])
    name = re.compile(r'1-\d+').sub('1-x', name)
    name = re.compile(r'1~\d+').sub('1~x', name)
    # create the target folder
    await create_dir(path_o, name)
    path = path_o + '\\' + name
    print(name, url, path)
    '''html-detail'''
    with open(path + '\\html.txt', 'w', encoding='utf-8') as f:
        f.write(str(pq))
    rec = os.listdir(path)
    try:
        del rec[rec.index('html.txt')]
    except Exception as e:
        print(e)
    for item in rec:
        rec[rec.index(item)] = item.split('.')[0]
    print(sorted(list(map(int, rec))), int(page), len(rec), int(page) - len(rec))
    if int(page) - len(rec) >= 50:
        timeout = 60
        print('timeout', timeout)
    else:
        timeout = 15
        print('timeout', timeout)
    if str(len(rec)) == page:
        return
    else:
        urls_t = extract_urls(html)
        tasks1 = []
        for item in urls_t:
            p = item.split('/')[-2]
            # print(p, item)
            if p in rec:
                # print('in')
                continue
            tasks1.append(get_jpeg_url(item, session, path))
        await asyncio.gather(*tasks1)


async def comic_handler_r(url, root_path, session):
    # like comic_handler, but first sorts the comic into a per-artist folder
    global timeout
    html = await fetch(url, session)
    pat = '/artist/.*?>(.*?) <'  # artist name sits between '>' and ' <' in the link markup
    arti = re.findall(pat, html)
    try:
        arti = arti[0].replace('/', '-').replace('\\', '-').replace('\n', '-').replace('?', '[ask]').replace('|', '[竖杠]').replace('+', '[add]').replace('{', '[left]').replace('}', '[right]').replace(':', '[mao_hao]').replace('"', '[双引]').replace('*', '[star]').replace('>', '[b-than]').replace('<', '[s-than]')
    except Exception as e:
        print('in comic handler r:', e)
        arti = 'no_name'
    if arti and arti != 'con':
        await create_dir(root_path, arti)
        path = root_path + '\\' + arti
    else:
        await create_dir(root_path, 'no_name')
        path = root_path + '\\no_name'
    await comic_handler(url, path, session)


async def page_handler(comics_urls, arti_path, session):
    for comic_link in comics_urls:  # choose one of his works
        print(time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())))
        # try:
        await comic_handler(comic_link, arti_path, session)
        # except Exception as e:
        #     print(e)


async def unique_comic_names(arti_base_link, session):
    # walk an artist's chinese listing pages and count distinct comic names
    i = 0
    urls_names = {}
    while True:
        try:
            html = await fetch(arti_base_link + 'chinese/page/' + str(i), session)
            urls_t_names_t = extract_comics_links(html)
            urls_names.update(urls_t_names_t)
            i += 1
        except Exception as e:
            print('in unique', e)
            break
    return len(list(set([s.lower().replace(' ', '') for s in urls_names.values()]))), urls_names


async def check_arti(num, arti_path, urls_names, session):
    # check whether an artist folder is complete; re-download the first
    # incomplete comic found, and return True only if nothing is missing
    rec = os.listdir(arti_path)
    print('need:', num, 'now comics:', len(rec))
    if len(rec) == 0:
        return False
    if len(rec) >= num:
        for name in rec:
            comic_path = arti_path + '\\' + name
            recc = os.listdir(comic_path)
            with open(comic_path + '\\html.txt', 'r', encoding='utf8') as f:
                t = f.read()
            pat = '共 (.*?) 頁'  # "共 X 頁" = "X pages in total" in the saved html
            patt = r'<h\d>(.*?)</h\d>'
            try:
                k = int(re.findall(pattern=pat, string=t)[0])
            except Exception as e:
                print('in', name, e)
                k = int(re.findall(pattern='共(.*?)頁', string=t)[0])
            kk = re.findall(pattern=patt, string=t)
            print('need:', k, 'now:', len(recc) - 1, name)
            f = 0
            if len(recc) - 1 < k:
                for n in kk:
                    if f:
                        break
                    for u, nn in urls_names.items():
                        if f:
                            break
                        if n == nn:
                            print(time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())))
                            await comic_handler(u, arti_path, session)
                            f += 1
                print(f)
                if not f:
                    return False
    else:
        return False
    return True


async def main(start):
    async with aiohttp.ClientSession() as session:
        '''
        choose a crawl mode below
        ||
        \/
        '''
        '''get-all-site`s-work'''
        # lock = Lock()
        # for i in range(331682, 1, -1):
        #     await lock.acquire()
        #     await comic_handler('https://nyahentai.club/g/' + str(i) + '/', session)
        #     lock.release()

        '''only-one'''
        # await comic_handler('https://zh.qqhentai.com/g/333292/', BASE_DIR, session)

        '''get specific artist'''
        # arti_name = 'ratatatat74'
        # arti_base_link = 'https://zh.yyhentai.com/artist/ratatatat74-mr-skull/chinese/'
        # await create_dir(BASE_DIR, 'all_co')
        # root_path = BASE_DIR + '\\all_co'
        # await create_dir(root_path, arti_name)
        # arti_path = root_path + '\\' + arti_name
        # i = 1
        # while (True):  # choose one of his pages
        #     try:
        #         html_all_comics = await fetch(arti_base_link + 'page/' + str(i), session)
        #         print(arti_base_link + 'page/' + str(i))
        #         comics_urls = extract_comics_links(html_all_comics)
        #
        #         await page_handler(comics_urls, arti_path, session)
        #         # for comic_link in comics_urls:  # choose one of his works
        #         #     await comic_handler(comic_link, arti_path, session)
        #         i += 1
        #
        #     except Exception as e:
        #         print('fail or end on artist', arti_name, e)
        #         break

        '''get recent ch comics'''
        await create_dir(BASE_DIR, 'all_co')
        root_path = BASE_DIR + '\\all_co'
        for j in range(1, 1000):
            print('pg:', j)
            rec_html = await fetch(base_url + 'language/chinese/page/' + str(j), session)
            comic_urls = extract_rec_links(rec_html)
            for url in comic_urls:
                await comic_handler_r(url, root_path, session)

        # '''get-all-site`s-hot-artists-work'''
        # await create_dir(BASE_DIR, 'all_co')
        # root_path = BASE_DIR + '\\all_co'
        #
        # for j in range(15, 180):  # artists page
        #     print(j)
        #     html_all_artists = await fetch(artist_hot_url + str(j), session)
        #
        #     artists_base_urls = extract_artists_links(html_all_artists)  # one page
        #
        #     for arti_name, arti_base_link in artists_base_urls.items():  # choose one artist
        #
        #         await create_dir(root_path, arti_name)
        #         arti_path = root_path + '\\' + arti_name
        #
        #         # try:
        #
        #         k, comics_urls_names = await unique_comic_names(arti_base_link, session)
        #         print(arti_name, 'artist works', k)
        #
        #         if await check_arti(int(k), arti_path, comics_urls_names, session):
        #             print('next artist===============================================================')
        #             continue
        #
        #         await page_handler(comics_urls_names.keys(), arti_path, session)
        #         print('next artist===============================================================')
        #
        #         # except Exception as e:
        #         #     print('fail or end on artist', arti_name, e)
        #         #     break


if __name__ == "__main__":
    # print(BASE_DIR)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop))
    # tha_main()

# =========================================================
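    A small note on the entry point: asyncio.get_event_loop() plus run_until_complete() works on the Python versions this was written for, but newer releases discourage creating the loop by hand. On Python 3.7+ the same run can be started with asyncio.run; main() never uses its argument, so anything can be passed:

if __name__ == "__main__":
    # Equivalent entry point on Python 3.7+; lets asyncio manage the event loop.
    asyncio.run(main(None))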