1、环境
pycharm+selenium+pyquery+openpyxl
2、功能说明
通过 openpyxl 从 Excel 中读取关键词,爬取亚马逊上指定关键词的商品信息,并将获取到的信息通过 openpyxl 写回到 Excel 中
3、完整代码
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.by import By
from pyquery import PyQuery as Pq
import openpyxl
def parse_detail(page_source, page_index, key):
    """Parse one search-result page and append each product to the worksheet.

    Relies on two module-level globals set up by the script entry point:
    ``worksheet`` (the openpyxl sheet rows are written to) and ``write_row``
    (the last row already written; incremented before every new row).

    Args:
        page_source: HTML of the search-result page.
        page_index: 1-based page number, used to build the position label.
        key: Search keyword the page belongs to; written to column 1.

    Returns:
        True when the page contains a ``.a-disabled.a-last`` element with
        non-empty text (presumably the disabled "Next" pagination button,
        i.e. this is the last page), otherwise False.
    """
    global write_row
    doc = Pq(page_source)

    # Decide whether this is the last page; the caller stops paging when it is.
    last = doc('.a-disabled.a-last').text()
    is_end = last.strip() != ""
    print("last text = ",last," isEnd = ",is_end)

    # Every direct child of the main result slot is a candidate product.
    main_list = doc('.s-main-slot.s-result-list.s-search-results.sg-row')
    normal_count = 0
    for child in main_list.children().items():
        # Children without a `data-asin` attribute (attr() returns None) or
        # with an empty one are not regular product entries; skip them
        # instead of crashing on None.strip().
        asin = child.attr('data-asin')
        if not asin or not asin.strip():
            continue
        normal_count += 1

        # Sponsored label text (empty for organic results).
        sponsor = child('.s-label-popover-default .a-size-mini.a-color-secondary').text()
        # Product title.
        title = child('.a-size-base-plus.a-color-base.a-text-normal').text()
        # Discounted items list several prices; keep only the first one.
        price = child('.a-price .a-offscreen').text().split(" ")[0]
        # Position label in the form "<page>-<index among regular products>".
        cur_pos = str(page_index)+"-"+str(normal_count)
        print(key,title, ", ", price, ", ", sponsor, ", ", cur_pos)

        # Append the product as a new row in the worksheet.
        write_row += 1
        row_values = (key, title, price, sponsor, cur_pos)
        for column, value in enumerate(row_values, start=1):
            worksheet.cell(write_row, column, value)

    print(normal_count)
    return is_end
if __name__ == '__main__':
    # Script entry point: read keywords from the first column of the workbook,
    # crawl Amazon search results for each keyword, and append the scraped
    # rows to the same worksheet.

    # Local import keeps this block self-contained (stdlib only).
    from urllib.parse import quote_plus

    filepath = "C:/Users/45906/Desktop/关键词.xlsx"
    # Maximum number of result pages crawled per keyword.
    max_pages = 10
    header_text = "关键词"

    # Load the keywords.
    key_list = []
    write_row = 1
    try:
        # The workbook must already exist.
        workbook = openpyxl.load_workbook(filepath)
        sheet_names = workbook.sheetnames
        worksheet = workbook[sheet_names[0]]
        write_row = worksheet.max_row+1
        for row in worksheet.iter_rows():
            key_list.append(row[0].value)
        print(write_row)
        print(key_list)
        # Drop blank cells and header cells, and de-duplicate while keeping
        # order. Results are appended to this same sheet with the keyword in
        # column 1, so a re-run would otherwise read earlier output rows
        # (and their header) back in as keywords.
        cleaned = []
        for value in key_list:
            if value is None:
                continue
            text = str(value).strip()
            if text and text != header_text and text not in cleaned:
                cleaned.append(text)
        key_list = cleaned
        print(key_list)
    except Exception as exc:
        print("excel Abnormal operation")
        print(exc)
        raise SystemExit(1)

    if not key_list:
        print("no keywords found in", filepath)
        raise SystemExit(0)

    # Header row for the scraped data; parse_detail() writes below it.
    excel_title = ["关键词", "标题", "价格", "模式", "自然位置"]
    for column, heading in enumerate(excel_title, start=1):
        worksheet.cell(write_row, column, heading)

    # Make get() return immediately instead of waiting for the full page load.
    # NOTE(review): this mutates the shared DesiredCapabilities.CHROME dict
    # and must stay before ChromeOptions() is created; whether the options
    # object picks it up depends on the installed selenium version - confirm.
    desired_capabilities = DesiredCapabilities.CHROME
    desired_capabilities["pageLoadStrategy"] = "none"
    chrome_options = webdriver.ChromeOptions()
    # Headless mode (disabled).
    # chrome_options.add_argument('--headless')
    # Disable GPU acceleration to avoid heavy CPU usage.
    chrome_options.add_argument('--disable-gpu')
    # Turn off the web security policy.
    chrome_options.add_argument("disable-web-security")
    # Do not load images.
    chrome_options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
    # Hide the "Chrome is being controlled by automated software" info bar.
    chrome_options.add_argument('disable-infobars')
    # Exclude the automation switch so the webdriver property looks normal.
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Override the user agent string.
    chrome_options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"')
    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Explicit-wait helper with a 20 second timeout.
        wait = WebDriverWait(driver, 20)
        # driver.maximize_window()
        print(time.strftime("start %Y-%m-%d %H:%M:%S", time.localtime()))
        # The URL template is fixed; only the keyword and page number vary.
        search_page_url = 'https://www.amazon.com/s?k={}&page={}'
        for key in key_list:
            # URL-encode the keyword so spaces, '&', non-ASCII text etc.
            # cannot corrupt the query string.
            encoded_key = quote_plus(key)
            for i in range(1, max_pages + 1):
                page_url = search_page_url.format(encoded_key, i)
                print("正在爬取", page_url)
                driver.get(page_url)
                time.sleep(3)
                # Wait for the result list to appear, then parse the page.
                # A failure on one page is reported and the next page is tried.
                try:
                    wait.until(ec.presence_of_element_located((By.CSS_SELECTOR, "div.s-result-list")))
                    isEnd = parse_detail(driver.page_source, i, key)
                except Exception as exc:
                    print("url: "+page_url+"获取失败")
                    print(exc)
                    continue
                if isEnd:
                    break
        print(time.strftime("end %Y-%m-%d %H:%M:%S", time.localtime()))
    finally:
        # Always release the browser and persist whatever was scraped, even
        # when the crawl is interrupted or crashes part-way through.
        try:
            driver.quit()
        finally:
            try:
                # Saving fails if the workbook is currently open in Excel.
                workbook.save(filepath)
                workbook.close()
            except Exception as exc:
                print("excel save failed:", exc)
4、结果
5、缺点
从我的角度看,这个脚本还不够灵活:
文件的打开和存储路径是写死的,应该由用户选择;捕捉的内容也是写死的,应该可以增加或删除;查询的地址也不可选。