【爬虫】爬取B站小黑屋

    科技2026-03-01  6

    爬取B站小黑屋信息

    由于b站更新了反爬虫策略,现在爬取B站可以采用模拟浏览器操作进行爬取。需要安装以下python模块:

    pip3 install selenium
    pip3 install bs4

    使用selenium模拟浏览器操作,对小黑屋进行模拟下拉操作,可以设置下拉次数(这里要注意每次下拉后要sleep一段时间,否则网页会加载不完)。等获取到足够的页面后再进行数据清洗。

    from selenium import webdriver from bs4 import BeautifulSoup import time import json import re class BSpider(): def __init__(self): # 设置无界面模式 options = webdriver.FirefoxOptions() options.add_argument('--headless') self.browser = webdriver.Firefox(options = options) self.blackroom_page = 'https://www.bilibili.com/blackroom/ban' self.count = 0 # 获取页面 def get_page(self): self.browser.get(self.blackroom_page) # 只获取弹幕内容 self.browser.find_element_by_xpath('//*[@id="app"]/div/div/div/div[2]/div[1]/div[2]/div[1]/i').click() time.sleep(0.5) self.browser.find_element_by_xpath('//*[@id="app"]/div/div/div/div[2]/div[1]/div[2]/div[2]/p[3]').click() time.sleep(0.5) # 下拉页面, 下拉300次 index, max_count = 0, 300 while index < max_count: print("scroll down: %d ..." % (index)) self.browser.execute_script( 'window.scrollTo(0,document.body.scrollHeight)' ) time.sleep(0.8) index = index + 1 # 字符串找中文字符 def find_chinese(self, article): pattern = re.compile(r'[^\u4e00-\u9fa5]') chinese = re.sub(pattern, '', article) return chinese # 删除星号* def delete_star(self, article): pattern = re.compile(r'[*]') no_star = re.sub(pattern, '', article) return no_star # 解析页面,对数据进行清洗 在这里只获取账号封禁时间(永久/15天/7天......)和发的弹幕 def paser_page(self): html = BeautifulSoup(self.browser.page_source) output_data = [] for dl in html.find_all('dl'): sub_output_data = {} black_cube = dl.parent try: temp_type = (black_cube.find(class_='jc').get_text()) first_p_text = self.delete_star(dl.dt.p.text) # first_p_text = dl.dt.p.text except Exception as e: print(e) # sub_output_data["reason"] = temp_reson sub_output_data["type"] = temp_type sub_output_data['article'] = first_p_text if first_p_text != '': output_data.append(sub_output_data) # print(output_data) # 存储数据 print('dump to json file ...') with open(r'2020\ML\ML_action\3.NaiveBayes\data\blackroom.json', 'w', encoding='utf-8') as f: json.dump(output_data, f, ensure_ascii=False,sort_keys=False, indent=4) print('dump file done.') b = BSpider() print("init....") b.get_page() 
b.paser_page()
    Processed: 0.011, SQL: 9