The Kuaidaili free proxy list is at: https://www.kuaidaili.com/free
Note: this is only a record of my own learning!!! Do not use it for commercial purposes.
The proxy pools I found online are not very beginner-friendly, so I built an IP proxy pool of my own. Now I don't have to worry about my IP being banned by anti-scraping measures!!!
Key points:
- Use faker to randomize the User-Agent
- Save the scraped data to MongoDB so it can be reused at any time
- Pick a random IP from the database to stand in for the local IP

```python
# -*- coding: utf-8 -*-
# @Time: 2020-10-07 15:20
# @Author: 来瓶安慕嘻
# @File: 搭建代理池.py
# @Have a great day @Q_Q@
"""
Overview:
1. Build your own IP proxy pool from Kuaidaili's free proxy list
2. Store the validated, working IPs in MongoDB
3. Pick a random working IP out of MongoDB

How to call this module:
1. First call get_proxy(page_num) and assign the result to a variable
2. Then call read_ip() (no arguments) and assign the result to a variable
"""
import random
import time

import requests
from faker import Factory
from lxml import etree
from pymongo import MongoClient


# ------------------------- Generate random User-Agent request headers -------------------------
def get_user_agent(num):
    """Return a list of num header dicts, each with a random faker-generated User-Agent."""
    factory = Factory.create()
    user_agent = []
    for i in range(num):
        user_agent.append({'User-Agent': factory.user_agent()})
    return user_agent


# ------------------------- Scrape proxy IPs, mainly from Kuaidaili -------------------------
def get_proxy(page_num):
    """
    Scrape proxy IPs and check that they work.
    :param page_num: number of pages to scrape
    :return: proxies_list_use: the scraped proxies that passed the check
    """
    headers = get_user_agent(5)
    # each record has the shape {'http_type': 'http', 'ip_port': 'ip:port'},
    # matching what read_ip() expects to find in MongoDB
    proxies_list = []
    for i in range(1, page_num + 1):
        print('Scraping all proxy IPs on page {}'.format(i))
        # pick one header at random; do not overwrite the headers list itself
        header = random.choice(headers)
        base_url = 'https://www.kuaidaili.com/free/inha/{}/'.format(i)
        page_text = requests.get(url=base_url, headers=header).text
        tree1 = etree.HTML(page_text)
        tr_list = tree1.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')
        for tr in tr_list:
            http_type = tr.xpath('./td[@data-title="类型"]/text()')[0]
            ip = tr.xpath('./td[@data-title="IP"]/text()')[0]
            port = tr.xpath('./td[@data-title="PORT"]/text()')[0]
            # lowercase the scheme ('HTTP' -> 'http') so requests can match it
            proxies_list.append({'http_type': http_type.lower(),
                                 'ip_port': ip + ':' + port})
        time.sleep(1)
    proxies_list_use = check_ip(proxies_list)
    save_ip(proxies_list_use)
    return proxies_list_use


# ------------------------- Check the scraped proxies against Baidu -------------------------
def check_ip(proxies_list):
    """
    Test proxy quality by requesting Baidu through each scraped proxy,
    with the response timeout set to 0.1 seconds.
    """
    headers = get_user_agent(5)
    header = random.choice(headers)
    can_use = []
    for proxy in proxies_list:
        try:
            # request with the same scheme as the proxy so the proxy is actually exercised
            url = '{}://www.baidu.com'.format(proxy['http_type'])
            response = requests.get(url, headers=header,
                                    proxies={proxy['http_type']: proxy['ip_port']},
                                    timeout=0.1)
            if response.status_code == 200:
                can_use.append(proxy)
        except Exception as e:
            print('Proxy IP error:', e)
    return can_use


# --------------- Persist to MongoDB so the working proxies can be fetched later ---------------
def save_ip(ip_list):
    """
    Store the scraped proxy IPs in MongoDB.
    :param ip_list: the working proxies (a list of dicts)
    """
    client = MongoClient()
    time_info = time.strftime("%Y-%m-%d", time.localtime())
    collection = client['快代理'][time_info + '爬取的代理IP']
    if ip_list:  # insert_many raises on an empty list
        collection.insert_many(ip_list)


# ------- Read a working IP from MongoDB, to disguise the local IP when scraping other sites -------
def read_ip():
    """
    Fetch one random working IP from MongoDB.
    :return: proxy, a random working proxy from the database
    """
    client = MongoClient()
    time_info = time.strftime("%Y-%m-%d", time.localtime())
    collection = client['快代理'][time_info + '爬取的代理IP']
    ip_list = list(collection.find())
    record = random.choice(ip_list)
    proxy = {record['http_type']: record['ip_port']}
    return proxy


# ------------------------- main() -------------------------
def main():
    useful_ip = get_proxy(4)  # scrape and validate 4 pages of proxies
    proxy = read_ip()
    print(proxy)


if __name__ == '__main__':
    main()
```

The documents stored in MongoDB look like this (collection 2020-10-07爬取的代理IP): [screenshot of the stored documents omitted]

Once you dive into web scraping it is as deep as the sea; the learning cost is really high!!!
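To close the loop, here is a minimal usage sketch of the two calls described in the module docstring. It assumes MongoDB is running locally and that today's collection gets populated by the get_proxy() call; httpbin.org is just a stand-in target URL for illustration, swap in whatever site you actually want to scrape:

```python
# Minimal usage sketch, assuming MongoDB runs on localhost and the module
# above is saved as 搭建代理池.py (per its @File header).
import requests

from 搭建代理池 import get_proxy, read_ip

get_proxy(4)       # scrape + validate 4 pages and store the survivors in MongoDB
proxy = read_ip()  # e.g. {'http': '1.2.3.4:8080'}

# httpbin.org/ip echoes back the requester's IP; it is only a stand-in target
resp = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5)
print(resp.text)   # should report the proxy's IP rather than your own
```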