from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import requests
import json
import random
import pandas as pd
import numpy as np
import re
import os
from pyquery import PyQuery as pq
import jieba
def get_one_page(url, timeout=10):
    """Fetch *url* with a browser-like header set.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The ``requests.Response`` when the status code is 200, otherwise
        ``None`` (non-200 status, timeout, or any other request failure).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'Keep-Alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures are reported the same way as a bad status:
        # the caller gets None, and the reason is printed for diagnosis.
        print('get_one_page failed for', url, '->', exc)
        return None
    if response.status_code == 200:
        return response
    return None
def get_city(text, pattern):
    """Return every 7-character word run in *text* that precedes punctuation.

    A match is seven consecutive ``\\w`` characters immediately followed by
    one of the characters in the ``[?。,]`` class of the built-in regex.

    Args:
        text: String to search.
        pattern: Currently unused; the built-in regex is always applied. Kept
            so existing call sites keep working.

    Returns:
        A list of the captured 7-character strings, in order of appearance
        (empty when nothing matches).
    """
    # Raw string so that ``\w`` is not treated as a string escape sequence.
    patterns = re.compile(r'.*?(\w{7})[?。,]', re.S)
    # Return the matches themselves rather than echoing the input back.
    return re.findall(patterns, text)
def get_text(html, pattern):
    """Parse *html* with PyQuery and return the selection for *pattern*.

    Args:
        html: Markup handed to ``PyQuery``.
        pattern: Selector applied to the parsed document.

    Returns:
        The PyQuery selection object produced by the selector.
    """
    return pq(html)(pattern)
def get_word_s(texts):
    """Return the second token of *texts* as cut by ``jieba.lcut_for_search``.

    Raises:
        IndexError: If segmentation yields fewer than two tokens.
    """
    tokens = jieba.lcut_for_search(texts)
    second = tokens[1]
    return second
def get_word_j(texts):
    """Return the second token of *texts* as cut by ``jieba.lcut``.

    Raises:
        IndexError: If segmentation yields fewer than two tokens.
    """
    tokens = jieba.lcut(texts)
    second = tokens[1]
    return second
def get_word_q(texts):
    """Return the second token of *texts* as cut by ``jieba.lcut(cut_all=True)``.

    Raises:
        IndexError: If segmentation yields fewer than two tokens.
    """
    tokens = jieba.lcut(texts, cut_all=True)
    second = tokens[1]
    return second
def open_edge_getsuoece(
    url,
    driver_path="C:\\Users\\Public\\Documents\\Python Scripts\\msedgedriver.exe",
    timeout=15,
):
    """Load *url* in Edge and return the rendered page source.

    The page source is captured only after an element matching
    ``.article-content`` is present in the DOM.

    Args:
        url: Page to open.
        driver_path: Location of the ``msedgedriver`` executable.
        timeout: Maximum seconds to wait for ``.article-content`` to appear.

    Returns:
        The browser's ``page_source`` string.

    Raises:
        selenium.common.exceptions.TimeoutException: If the element does not
            appear within *timeout* seconds.
    """
    # NOTE(review): `executable_path` is the Selenium 3 calling convention;
    # newer Selenium releases expect a Service object instead — confirm the
    # installed version before upgrading.
    browser = webdriver.Edge(executable_path=driver_path)
    try:
        browser.get(url)
        WebDriverWait(browser, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.article-content'))
        )
        text = browser.page_source
    finally:
        # quit() ends the whole driver session, and running it in `finally`
        # keeps a timed-out wait from leaving a browser instance behind.
        browser.quit()
    print('open_edge_getsuoece ok')
    return text
def get_dz(text):
    """Collect distinct words taken from the 【…】-bracketed segments of *text*.

    Every ``【…】`` segment is located, the segments are visited from last to
    first, each one is reduced to a word with ``get_word_j``, and duplicates
    are dropped while keeping first-seen order.

    Args:
        text: String to scan for bracketed segments.

    Returns:
        A list of unique words, ordered by the reversed segment order.
    """
    # No leading wildcard: requiring a character in front of the opening
    # bracket would skip a segment at the very start of the text or one that
    # directly follows another segment.
    pattern = re.compile(r'【.*?】', re.S)
    segments = re.findall(pattern, text)[::-1]
    print(segments)
    seen = set()  # O(1) membership test alongside the ordered result list
    words = []
    for segment in segments:
        word = get_word_j(str(segment))
        if word in seen:
            continue
        seen.add(word)
        words.append(word)
    return words
# Script body: render the article in Edge, pull the lazily loaded image URLs
# out of its <img> tags and save every image as <n>.jpg in the target folder.
url = "http://baijiahao.baidu.com/s?id=1670368210129917683"
html = open_edge_getsuoece(url)
pation = "img"
text = get_text(html, pation)
# Only <img> tags carrying the data-loadfunc / data-loaded markers are taken.
pation1 = re.compile('.*?data-loadfunc="0" src="(.*?)" data-loaded="0"', re.S)
urls = re.findall(pation1, str(text))
index = 0
print(urls)
save_dir = "C:\\Users\\西木康\\Desktop\\123\\"
# Make sure the destination exists so the first write cannot fail on a
# missing folder.
os.makedirs(save_dir, exist_ok=True)
for i in urls:
    html1 = get_one_page(i)
    # Progress marker every 20 images.
    if index % 20 == 0:
        print(index)
    index += 1
    if html1 is None:
        # get_one_page returns None on failure; skip instead of crashing on
        # `.content`. The counter still advances so file numbers keep
        # matching the position of the URL in the page.
        print('skip', index, i)
        continue
    dizhi = save_dir + str(index) + ".jpg"
    with open(dizhi, "wb") as f:
        f.write(html1.content)
# When reposting, please credit the original source: https://blackberry.8miu.com/read-27906.html