1. 目标网站:https://www.1point3acres.com/bbs/forum-28-1.html (从该页开始的若干页)
2. 首先创建两个队列:一个页面队列和一个用于 I/O 的队列。顺便创建一个锁,防止多个线程同时写入时出问题。
# Queue of list-page URLs waiting to be downloaded by the spider threads.
page_queue = Queue()
# Queue of (title, link) rows waiting to be written by the writer threads.
joke_queue = Queue()
# Lock handed to every writer thread; it is held around each CSV write.
gLock = threading.Lock()

# 3. Store the data in a CSV file
# Append mode keeps the rows collected by earlier runs.  The handle stays open
# because the writer threads use ``writer`` later; close ``fp`` once they are
# done.
fp = open('asd.csv', 'a+', newline='', encoding='utf-8')
url = 'https://www.1point3acres.com/bbs/forum-28-1.html'
writer = csv.writer(fp)
# A file opened for appending starts positioned at its end, so offset 0 means
# the file is empty.  Writing the header only then avoids a duplicate header
# row on every subsequent run.
if fp.tell() == 0:
    writer.writerow(('标题', '链接'))

# 4. Find the largest page number
def find_max_page(url):
    """Return the page count shown in the pager of the list page at *url*.

    The page is fetched and parsed by ``comp(url)``.  The first text node
    found under ``//div[@class="pg"]//span`` is searched for a run of digits
    (presumably the "total pages" label - confirm against the live markup).

    Returns:
        The first number in that label as an ``int``, or ``None`` when the
        page could not be parsed, has no pager, or the label has no digits.
    """
    selector = comp(url)
    if selector is None:
        return None
    labels = selector.xpath('//div[@class="pg"]//span/text()')
    if not labels:
        return None
    # Raw string: '\d' in a normal string literal is an invalid escape.
    digits = re.findall(r'\d+', labels[0])
    if not digits:
        return None
    return int(digits[0])


# Called from main() with ``url`` set to the first list page.
max_page = find_max_page(url)

# 5. Loop over the pages and push every page URL onto the queue
# Queue every list page from 1 up to and including max_page.  range() excludes
# its stop value, so the original range(1, max_page) never queued the last
# page (this assumes max_page is the number of the last page).
for x in range(1, max_page + 1):
    url = 'https://www.1point3acres.com/bbs/forum-28-{}.html'.format(x)
    page_queue.put(url)

# Four downloader/parser threads feed joke_queue ...
for x in range(4):
    t = BSSpider(page_queue, joke_queue)
    t.start()

# ... and four writer threads drain it into the CSV file.
for x in range(4):
    t = BSWriter(joke_queue, writer, gLock)
    t.start()

# 6. The parsing-thread code is as follows:
class BSSpider(threading.Thread):
    """Producer thread: download forum list pages and queue their threads.

    Each worker repeatedly takes a list-page URL from ``page_queue``,
    downloads it and puts one ``(title, link)`` tuple per
    ``<a class="... xst ...">`` anchor onto ``joke_queue``.  The worker exits
    as soon as ``page_queue`` is empty.
    """

    # Static request headers; the User-Agent is added per request in
    # _request_headers().  The original spelled several names with embedded
    # spaces ('accept - encoding'), which are not the real header names.
    # 'br' is left out of Accept-Encoding because requests can only decode
    # Brotli when an optional brotli package is installed.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Referer': 'https://www.1point3acres.com/bbs/',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
    }

    # Proxy pool.  The original wrote these as one dict literal that repeated
    # the 'http' key, so only the last address survived, and an 'http'-only
    # mapping is never applied to the https:// URLs fetched here.
    # NOTE(review): these are hard-coded public proxies and may be stale.
    proxy_pool = (
        '123.54.44.60:9999',
        '182.101.207.11:8080',
        '121.232.148.231:9000',
        '183.166.163.61:9999',
        '175.44.108.179:9999',
        '175.43.155.36:9999',
        '39.108.59.34:8118',
        '219.159.38.207:56210',
        '113.194.48.14:9999',
        '163.125.220.175:8118',
        '123.149.136.180:9999',
        '121.232.194.37:9000',
        '1.85.5.66:8060',
        '125.108.100.20:9000',
        '114.101.252.37:3000',
    )

    # Seconds to wait for one HTTP request.
    request_timeout = 100
    # Download attempts per page; each attempt picks a fresh proxy.  This
    # replaces the assignment to requests.adapters.DEFAULT_RETRIES, which has
    # no effect once requests has been imported.
    max_attempts = 3

    def __init__(self, page_queue, joke_queue, *args, **kwargs):
        """Store the two queues; extra arguments go to ``threading.Thread``."""
        super(BSSpider, self).__init__(*args, **kwargs)
        # Base URL that relative thread links are resolved against.
        self.base_domain = 'https://www.1point3acres.com/bbs/'
        self.page_queue = page_queue
        self.joke_queue = joke_queue

    @classmethod
    def _request_headers(cls):
        """Return a copy of ``headers`` with a freshly chosen User-Agent."""
        merged = dict(cls.headers)
        merged['User-Agent'] = get_ua()
        return merged

    @classmethod
    def _pick_proxies(cls):
        """Return a requests ``proxies`` mapping for one random pool entry."""
        address = 'http://' + random.choice(cls.proxy_pool)
        return {'http': address, 'https': address}

    @classmethod
    def fetch(cls, url, timeout=None):
        """Download *url* and return the body text, or ``None`` on failure.

        Up to ``max_attempts`` requests are made; an HTTP error status counts
        as a failure.  *timeout* defaults to ``request_timeout``.
        """
        if timeout is None:
            timeout = cls.request_timeout
        last_error = None
        for _ in range(cls.max_attempts):
            try:
                response = requests.get(
                    url,
                    headers=cls._request_headers(),
                    proxies=cls._pick_proxies(),
                    timeout=timeout,
                )
                response.raise_for_status()
                return response.text
            except requests.RequestException as exc:
                last_error = exc
        print('下载失败,跳过该页: {} ({})'.format(url, last_error))
        return None

    def _extract_threads(self, page_source):
        """Return the ``(title, absolute_link)`` pairs found in *page_source*."""
        from urllib.parse import urljoin

        if not page_source:
            return []
        document = etree.HTML(page_source)
        if document is None:
            return []
        found = []
        # Read title and href from the same anchor so the two can never get
        # out of step, which separate text()/@href queries zipped together can.
        for anchor in document.xpath('//a[contains(@class,"xst")]'):
            href = anchor.get('href')
            if not href:
                continue
            title = ''.join(anchor.itertext()).strip()
            # urljoin leaves absolute hrefs alone and resolves relative ones.
            found.append((title, urljoin(self.base_domain, href)))
        return found

    def run(self):
        """Process list pages until ``page_queue`` is empty."""
        from queue import Empty

        while True:
            # get_nowait() closes the race in "if empty(): break; get()":
            # another worker could take the last URL between the two calls
            # and leave this one blocked forever.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            print(url)
            page_source = self.fetch(url)
            if page_source is None:
                continue
            for row in self._extract_threads(page_source):
                # Hand the row over to the I/O (writer) threads.
                self.joke_queue.put(row)
            print('完成一页')

# 6. The I/O thread code is as follows:
class BSWriter(threading.Thread):
    """Consumer thread: write queued ``(title, link)`` rows to a CSV writer.

    The thread stops when it receives ``None`` from the queue (an explicit
    stop marker) or when no row arrives within ``idle_timeout`` seconds.
    """

    # Seconds to wait for a new row before assuming the producers are done.
    idle_timeout = 40

    def __init__(self, joke_queue, writer, gLock, *args, **kwargs):
        """Store the queue, CSV writer and lock; extras go to ``Thread``."""
        super(BSWriter, self).__init__(*args, **kwargs)
        self.joke_queue = joke_queue
        self.writer = writer
        # Lock shared by all writer threads; held around every write.
        self.lock = gLock

    def run(self):
        """Drain ``joke_queue`` into the CSV writer until told to stop."""
        from queue import Empty

        while True:
            # Only the idle timeout ends the loop.  The original bare
            # ``except:`` also hid malformed rows and write errors.
            try:
                joke_info = self.joke_queue.get(timeout=self.idle_timeout)
            except Empty:
                break
            if joke_info is None:
                break
            title, link = joke_info
            # ``with`` releases the lock even if writerow() raises.
            with self.lock:
                self.writer.writerow((title, link))

# 7. The complete code is as follows:
# -*- encoding: utf-8 -*-
#@Time: 15:40
#@Software:PyCharm
"""Multi-threaded scraper for the 1point3acres forum list pages.

Spider threads download list pages and queue ``(title, link)`` rows; writer
threads append those rows to ``asd.csv``.
"""
import csv
import random
import re
import ssl
import threading
import time
from queue import Empty, Queue
from urllib.parse import urljoin

import requests
from lxml import etree

# NOTE(review): this disables certificate verification for code that uses the
# standard library's default HTTPS context.  It is kept from the original;
# confirm it is still wanted.
ssl._create_default_https_context = ssl._create_unverified_context

# Desktop Chrome User-Agent strings that get_ua() picks from.
USER_AGENTS = (
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
    "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
    "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
)


def get_ua():
    """Return one User-Agent string chosen at random from USER_AGENTS.

    This function was commented out in the original listing although
    ``BSSpider`` called it, so the script failed with ``NameError`` on import.
    """
    return random.choice(USER_AGENTS)


class BSSpider(threading.Thread):
    """Producer thread: download forum list pages and queue their threads.

    Each worker repeatedly takes a list-page URL from ``page_queue``,
    downloads it and puts one ``(title, link)`` tuple per
    ``<a class="... xst ...">`` anchor onto ``joke_queue``.  The worker exits
    as soon as ``page_queue`` is empty.
    """

    # Static request headers; the User-Agent is added per request in
    # _request_headers().  The original spelled several names with embedded
    # spaces ('accept - encoding'), which are not the real header names.
    # 'br' is left out of Accept-Encoding because requests can only decode
    # Brotli when an optional brotli package is installed.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Referer': 'https://www.1point3acres.com/bbs/',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
    }

    # Proxy pool.  The original wrote these as one dict literal that repeated
    # the 'http' key, so only the last address survived, and an 'http'-only
    # mapping is never applied to the https:// URLs fetched here.
    # NOTE(review): these are hard-coded public proxies and may be stale.
    proxy_pool = (
        '123.54.44.60:9999',
        '182.101.207.11:8080',
        '121.232.148.231:9000',
        '183.166.163.61:9999',
        '175.44.108.179:9999',
        '175.43.155.36:9999',
        '39.108.59.34:8118',
        '219.159.38.207:56210',
        '113.194.48.14:9999',
        '163.125.220.175:8118',
        '123.149.136.180:9999',
        '121.232.194.37:9000',
        '1.85.5.66:8060',
        '125.108.100.20:9000',
        '114.101.252.37:3000',
    )

    # Seconds to wait for one HTTP request.
    request_timeout = 100
    # Download attempts per page; each attempt picks a fresh proxy.  This
    # replaces the assignment to requests.adapters.DEFAULT_RETRIES, which has
    # no effect once requests has been imported.
    max_attempts = 3

    def __init__(self, page_queue, joke_queue, *args, **kwargs):
        """Store the two queues; extra arguments go to ``threading.Thread``."""
        super(BSSpider, self).__init__(*args, **kwargs)
        # Base URL that relative thread links are resolved against.
        self.base_domain = 'https://www.1point3acres.com/bbs/'
        self.page_queue = page_queue
        self.joke_queue = joke_queue

    @classmethod
    def _request_headers(cls):
        """Return a copy of ``headers`` with a freshly chosen User-Agent."""
        merged = dict(cls.headers)
        merged['User-Agent'] = get_ua()
        return merged

    @classmethod
    def _pick_proxies(cls):
        """Return a requests ``proxies`` mapping for one random pool entry."""
        address = 'http://' + random.choice(cls.proxy_pool)
        return {'http': address, 'https': address}

    @classmethod
    def fetch(cls, url, timeout=None):
        """Download *url* and return the body text, or ``None`` on failure.

        Up to ``max_attempts`` requests are made; an HTTP error status counts
        as a failure.  *timeout* defaults to ``request_timeout``.
        """
        if timeout is None:
            timeout = cls.request_timeout
        last_error = None
        for _ in range(cls.max_attempts):
            try:
                response = requests.get(
                    url,
                    headers=cls._request_headers(),
                    proxies=cls._pick_proxies(),
                    timeout=timeout,
                )
                response.raise_for_status()
                return response.text
            except requests.RequestException as exc:
                last_error = exc
        print('下载失败,跳过该页: {} ({})'.format(url, last_error))
        return None

    def _extract_threads(self, page_source):
        """Return the ``(title, absolute_link)`` pairs found in *page_source*."""
        if not page_source:
            return []
        document = etree.HTML(page_source)
        if document is None:
            return []
        found = []
        # Read title and href from the same anchor so the two can never get
        # out of step, which separate text()/@href queries zipped together can.
        for anchor in document.xpath('//a[contains(@class,"xst")]'):
            href = anchor.get('href')
            if not href:
                continue
            title = ''.join(anchor.itertext()).strip()
            # urljoin leaves absolute hrefs alone and resolves relative ones.
            found.append((title, urljoin(self.base_domain, href)))
        return found

    def run(self):
        """Process list pages until ``page_queue`` is empty."""
        while True:
            # get_nowait() closes the race in "if empty(): break; get()":
            # another worker could take the last URL between the two calls
            # and leave this one blocked forever.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            print(url)
            page_source = self.fetch(url)
            if page_source is None:
                continue
            for row in self._extract_threads(page_source):
                # Hand the row over to the I/O (writer) threads.
                self.joke_queue.put(row)
            print('完成一页')


class BSWriter(threading.Thread):
    """Consumer thread: write queued ``(title, link)`` rows to a CSV writer.

    The thread stops when it receives ``None`` from the queue (an explicit
    stop marker) or when no row arrives within ``idle_timeout`` seconds.
    """

    # Seconds to wait for a new row before assuming the producers are done.
    idle_timeout = 40

    def __init__(self, joke_queue, writer, gLock, *args, **kwargs):
        """Store the queue, CSV writer and lock; extras go to ``Thread``."""
        super(BSWriter, self).__init__(*args, **kwargs)
        self.joke_queue = joke_queue
        self.writer = writer
        # Lock shared by all writer threads; held around every write.
        self.lock = gLock

    def run(self):
        """Drain ``joke_queue`` into the CSV writer until told to stop."""
        while True:
            # Only the idle timeout ends the loop.  The original bare
            # ``except:`` also hid malformed rows and write errors.
            try:
                joke_info = self.joke_queue.get(timeout=self.idle_timeout)
            except Empty:
                break
            if joke_info is None:
                break
            title, link = joke_info
            # ``with`` releases the lock even if writerow() raises.
            with self.lock:
                self.writer.writerow((title, link))


def main():
    """Scrape every list page of forum 28 into ``asd.csv``."""
    page_queue = Queue()
    joke_queue = Queue()
    gLock = threading.Lock()
    url = 'https://www.1point3acres.com/bbs/forum-28-1.html'

    # The with-block closes (and flushes) the file once all writers are done;
    # the original never closed it.
    with open('asd.csv', 'a+', newline='', encoding='utf-8') as fp:
        writer = csv.writer(fp)
        # Append mode starts at the end of the file, so offset 0 means the
        # file is empty.  Only then is a header row needed.
        if fp.tell() == 0:
            writer.writerow(('标题', '链接'))

        max_page = find_max_page(url)
        if max_page is None:
            # No pager found: scrape at least the first page.
            max_page = 1
        # + 1 because range() excludes its stop value; the original skipped
        # the last page (assumes max_page is the number of the last page).
        for x in range(1, max_page + 1):
            page_queue.put(
                'https://www.1point3acres.com/bbs/forum-28-{}.html'.format(x)
            )

        spiders = [BSSpider(page_queue, joke_queue) for _ in range(4)]
        writers = [BSWriter(joke_queue, writer, gLock) for _ in range(4)]
        for worker in spiders + writers:
            worker.start()

        for worker in spiders:
            worker.join()
        # All rows are queued now; one stop marker per writer lets each exit
        # right after the remaining rows instead of waiting for its timeout.
        for _ in writers:
            joke_queue.put(None)
        for worker in writers:
            worker.join()


def find_max_page(url):
    """Return the page count shown in the pager of the list page at *url*.

    The first text node found under ``//div[@class="pg"]//span`` is searched
    for a run of digits (presumably the "total pages" label - confirm against
    the live markup).

    Returns:
        The first number in that label as an ``int``, or ``None`` when the
        page could not be fetched, has no pager, or the label has no digits.
    """
    selector = comp(url)
    if selector is None:
        return None
    labels = selector.xpath('//div[@class="pg"]//span/text()')
    if not labels:
        return None
    # Raw string: '\d' in a normal string literal is an invalid escape.
    digits = re.findall(r'\d+', labels[0])
    if not digits:
        return None
    return int(digits[0])


def comp(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    This function was commented out in the original listing although
    ``find_max_page`` called it.  It reuses ``BSSpider.fetch`` so headers and
    proxy handling stay in one place, with the original 10-second timeout.

    Returns:
        The root element, or ``None`` when the download failed or the body
        was empty.
    """
    page_source = BSSpider.fetch(url, timeout=10)
    if not page_source:
        return None
    return etree.HTML(page_source)


if __name__ == '__main__':
    main()