目录
python爬虫环境准备预备知识UrllibBeautifulSoup正则表达式xlwt表格操作
sqlite3数据库操作获取数据解析数据保存数据到表格中保存数据到数据库中数据可视化Flask框架Echarts应用WordCloud应用
完整代码
python爬虫
网络爬虫:按照一定规则,自动抓取互联网信息的程序或者脚本 学习视频地址:https://www.bilibili.com/video/BV12E411A7ZQ/?p=28
环境准备
导入包配置
import sys
from bs4
import BeautifulSoup
import re
import urllib
.request
, urllib
.error
import xlwt
import sqlite3
基础文件结构
"""
Scrape the Douban Top 250 movie ranking.
"""


def main():
    """Fetch the ranking data and hand the output path to the save step."""
    url = 'https://movie.douban.com/top250?start=0'
    savepath = '.\\doubanTop250.xls'
    datalist = get_data(url)
    # sava_data requires the output path; calling it with no arguments
    # raised TypeError.
    sava_data(savepath)


def get_data(url):
    """Return the records scraped from *url* (skeleton: always an empty list)."""
    datalist = []
    return datalist


def sava_data(savepath):
    """Save the records to *savepath* (skeleton: only prints a marker)."""
    print("save")


if __name__ == '__main__':
    main()
预备知识
Urllib
import urllib.parse
import urllib.request

# Plain GET request. The context manager closes the connection once the body
# has been read; the bare urlopen() calls never released it.
with urllib.request.urlopen("http://www.baidu.com") as response:
    print(response.read().decode('utf-8'))

# POST request: form fields are url-encoded, converted to bytes and passed
# as `data`.
data = bytes(urllib.parse.urlencode({"hello": "word"}), encoding="utf-8")
with urllib.request.urlopen("http://httpbin.org/post", data=data) as response:
    print(response.read().decode('utf-8'))

# Timeout handling: a slow or unreachable server surfaces as an OSError
# subclass (urllib.error.URLError, socket timeouts), so only those are caught.
try:
    with urllib.request.urlopen("http://httpbin.org/get", timeout=1) as response:
        print(response.read().decode('utf-8'))
except OSError as error:
    print(error)

# Request with a custom User-Agent header.
url = "http://douban.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36 Edg/85.0.564.68"
}
req = urllib.request.Request(url=url, headers=headers, method="POST")
with urllib.request.urlopen(req, timeout=3) as response:
    print(response.status)
    print(response.read().decode('utf-8'))
BeautifulSoup
将复杂的html转换成一个复杂的树形结构,每个节点都是python对象,所有对象可以分为四种:
Tag:标签及其内容(但只取第一个匹配的标签);NavigableString:标签里的内容;BeautifulSoup:自身,即整个文档;Comment:注释,是一个特殊的NavigableString,输出的内容不包含注释符号
from bs4 import BeautifulSoup

# Read the saved page as bytes. The context manager closes the file handle,
# which the bare open() call never did.
with open("./../baidu.html", "rb") as file:
    html = file.read()
bs = BeautifulSoup(html, "html.parser")

# Attribute access by tag name returns the first matching tag.
print(bs.title)
print(bs.a)
print(bs.a.attrs)           # the tag's attributes
print(type(bs.a))

# .string gives the content inside a tag.
print(bs.title.string)
print(type(bs.title.string))

# The BeautifulSoup object itself represents the whole document.
print(bs.name)
print(type(bs))

# NOTE(review): presumably the first <a> of the saved page holds an HTML
# comment, which .string returns without the comment markers - confirm
# against baidu.html.
print(bs.a.string)
文档的遍历和查找
# Traversal: .contents lists a tag's children.
print(bs.head.contents[1])
print("---------------------------------")

# find_all with a tag name.
a_list = bs.find_all("a")
print(a_list)
print("---------------------------------")

# find_all with a compiled regular expression matched against tag names.
import re
a_list1 = bs.find_all(re.compile("a"))
print(a_list1)
print("---------------------------------")


# find_all with a predicate: keep tags for which the function returns True.
def name_is_exists(tag):
    """Return True when *tag* carries a ``name`` attribute."""
    return tag.has_attr("name")


a_list2 = bs.find_all(name_is_exists)
print(a_list2)
print("---------------------------------")

# Search by text content. `string=` is the current name of the argument that
# older Beautiful Soup releases called `text=`.
# The exact-match result is overwritten right away; it is kept only to show
# that form next to the regex form.
a_list3 = bs.find_all(string="新闻")
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
a_list3 = bs.find_all(string=re.compile(r"\d"), limit=2)
for item in a_list3:
    print(item)
print("---------------------------------")

# CSS selectors.
print(bs.select("title"))                   # by tag name
print(bs.select(".classname"))              # by class
print(bs.select("#idname"))                 # by id
print(bs.select("a[class='classname']"))    # by attribute value
print(bs.select("head>title"))              # direct child
正则表达式
参考文档链接:https://www.runoob.com/regexp/regexp-syntax.html
python中的re库
import re

# search(): first match anywhere in the string, via a pre-compiled pattern.
pat = re.compile("AA")
a = pat.search("AAA")
print(a)

# The same search with the pattern and the subject given in one expression.
a = re.compile("abc").search("aabcc")
print(a)

# findall(): every non-overlapping match, returned as a list of strings.
a = re.compile("[a-z]+").findall("aABCXabuhua")
print(a)

# sub(): replace every match of the pattern ("a") with "A".
a = re.compile("a").sub("A", "abcdefg")
print(a)
xlwt表格操作
简单操作
import xlwt

# Create a workbook, add one sheet, write a single cell and save the file.
work_book = xlwt.Workbook(encoding="utf-8")
work_sheet = work_book.add_sheet("sheet1")

# write(row, column, value): put 'hello' into the top-left cell.
row, column, value = 0, 0, 'hello'
work_sheet.write(row, column, value)

work_book.save('student.xls')
sqlite3数据库操作
建表语句
import sqlite3

# Table for the scraped ranking; "if not exists" makes the script re-runnable.
sql = '''
    create table if not exists doubanTop250 (
        id integer not null primary key autoincrement,
        find_link text not null,
        find_image text,
        find_ctitle char(100),
        find_otitle char(100),
        find_score int(4),
        find_judge_number int(10),
        find_sign char(200),
        find_description text
    )
'''

connet = sqlite3.connect("test.db")
try:
    c = connet.cursor()
    c.execute(sql)
    connet.commit()
finally:
    # Close the connection even when the statement fails.
    connet.close()
插入语句
import sqlite3

# Values are bound through "?" placeholders instead of being written into
# the statement as double-quoted literals.
sql = '''
    insert into doubanTop250 (find_link,find_image,find_ctitle,find_otitle,find_score,find_judge_number,find_sign,find_description)
    values (?, ?, ?, ?, ?, ?, ?, ?)
'''
row = ("1", "1", "1", "1", 1, 1, "1", "1")

connet = sqlite3.connect("test.db")
try:
    c = connet.cursor()
    c.execute(sql, row)
    connet.commit()
finally:
    # Close the connection even when the insert fails.
    connet.close()
查询语句
import sqlite3

sql = '''
    select * from doubanTop250;
'''

connet = sqlite3.connect("test.db")
try:
    c = connet.cursor()
    response = c.execute(sql)
    for i in response:
        # Print every column of the row. The fixed range(0, 8) skipped the
        # ninth column (find_description) of the nine-column table.
        for j in i:
            print(j)
    # A read-only query changes nothing, so no commit is needed.
finally:
    connet.close()
获取数据
根据一个URL获取一个页面的html数据
def ask_url(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    The request carries a browser-like ``User-Agent`` header. Failures are
    reported on stdout and never raised to the caller.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait before giving up on the request.

    Returns:
        The decoded page text, or ``""`` when the request failed.
    """
    # head imitates a browser's request headers; "User-Agent" identifies
    # the client to the server.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0"
    }
    request = urllib.request.Request(url=url, headers=head)
    html = ""
    try:
        # The context manager closes the response after the body is read.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            html = response.read().decode("utf-8")
    except Exception as e:
        # Best-effort boundary: a failed page must not abort the caller.
        reported = False
        if hasattr(e, "code"):
            print(e.code)
            reported = True
        if hasattr(e, "reason"):
            print(e.reason)
            reported = True
        if not reported:
            # Exceptions without code/reason used to vanish silently.
            print(e)
    return html
解析数据
豆瓣排名分析
# Compiled patterns applied to the HTML of a single ranking entry.
# re.DOTALL (the long name of re.S) lets "." match newlines as well.

# Address in an <a href="..."> tag.
find_link = re.compile(r'<a href="(.*?)">')
# Value of the src attribute of an <img> tag.
find_image = re.compile(r'<img.*src="(.*?)"', re.DOTALL)
# Content of a <span class="title"> element.
find_title = re.compile(r'<span class="title">(.*)</span>')
# Content of the rating_num span.
find_score = re.compile(
    r'<span class="rating_num" property="v:average">(.*)</span>'
)
# Digits in front of "人评价".
find_judge_number = re.compile(r'<span>(\d*)人评价</span>')
# Content of a <span class="inq"> element.
find_sign = re.compile(r'<span class="inq">(.*)</span>')
# Content of a <p class=""> paragraph, which may span several lines.
find_description = re.compile(r'<p class="">(.*?)</p>', re.DOTALL)
def get_data(url):
    """Scrape ten ranking pages and return one record per movie entry.

    The page offset (0, 25, ..., 225) is appended verbatim to *url*.

    Args:
        url: Base address of the ranking page.

    Returns:
        A list of 8-item lists: link, image link, Chinese title, foreign
        title, score, number of ratings, tagline, description.
    """
    datalist = []
    for i in range(0, 10):
        html = ask_url(url + str(i * 25))
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = _parse_item(str(item))
            print(data)
            datalist.append(data)
    return datalist


def _parse_item(item):
    """Extract the eight record fields from the HTML text of one entry."""
    data = [
        re.findall(find_link, item)[0],
        re.findall(find_image, item)[0],
    ]

    title = re.findall(find_title, item)
    if len(title) >= 2:
        data.append(title[0])
        data.append(title[1].replace("/", ""))
    else:
        # Only one title: store the string itself. The previous code
        # appended the whole findall() list, which the save steps cannot
        # handle.
        data.append(title[0] if title else " ")
        data.append(" ")

    data.append(re.findall(find_score, item)[0])
    data.append(re.findall(find_judge_number, item)[0])

    # Entries without a tagline get a single space as placeholder.
    sign = re.findall(find_sign, item)
    data.append(sign[0].replace("。", "") if sign else " ")

    description = re.findall(find_description, item)[0]
    # Drop <br/> tags with their trailing whitespace (raw string so that
    # \s is a regex escape, not a string escape), then the "/" separators.
    description = re.sub(r'<br(\s+)?/>(\s+)?', "", description)
    description = description.replace("/", "")
    data.append(description.strip())
    return data
保存数据到表格中
def sava_data(savepath, datalist):
    """Write the scraped records to an .xls workbook at *savepath*.

    Row 0 holds the column headings; each record of *datalist* follows on
    its own row.

    Args:
        savepath: Path of the workbook file to create.
        datalist: Records with at least eight fields each, in the order of
            the column headings.
    """
    print('------------save------------')
    work_book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    work_sheet = work_book.add_sheet("doubanTop250", cell_overwrite_ok=True)
    col = ("电影详情链接", "图片链接" , "影片中文名", "影片外文名", "评分", "评价数", "概况", "详情")
    for j, heading in enumerate(col):
        work_sheet.write(0, j, heading)
    # Iterate over the records actually present. The fixed range(0, 250)
    # raised IndexError whenever fewer than 250 entries were scraped.
    for i, data in enumerate(datalist):
        print("第%d条" % i)
        for j in range(len(col)):
            work_sheet.write(i + 1, j, data[j])
    work_book.save(savepath)
保存数据到数据库中
保存数据到数据库中
def sava_data_sqlite(datalist):
    """Insert the scraped records into the doubanTop250 table of test.db.

    Args:
        datalist: Records with at least eight fields each, in the column
            order of the insert statement below.
    """
    print("------------SQLite------------")
    init_sqlite()
    # Values are bound through placeholders. Building the statement by
    # string concatenation broke on any field containing a double quote and
    # also rewrote the caller's records in place.
    # NOTE(review): score and rating count are passed as the scraped values
    # and rely on SQLite's column type affinity for numeric storage -
    # confirm this is the desired behavior.
    sql = '''
        insert into doubanTop250(find_link,find_image,find_ctitle,find_otitle,find_score,find_judge_number,find_sign,find_description)
        values(?, ?, ?, ?, ?, ?, ?, ?)'''
    connet = sqlite3.connect("test.db")
    try:
        c = connet.cursor()
        c.executemany(sql, (tuple(data[:8]) for data in datalist))
        connet.commit()
        c.close()
    finally:
        # Close the connection even when an insert fails.
        connet.close()
def init_sqlite(db_path="test.db"):
    """Create the doubanTop250 table if it does not exist yet.

    Args:
        db_path: SQLite database file to open; defaults to "test.db".
    """
    sql = '''
        create table if not exists doubanTop250 (
            id integer not null primary key autoincrement,
            find_link text not null,
            find_image text,
            find_ctitle char(100),
            find_otitle char(100),
            find_score int(4),
            find_judge_number int(10),
            find_sign char(200),
            find_description text
        )
    '''
    connet = sqlite3.connect(db_path)
    try:
        c = connet.cursor()
        c.execute(sql)
        connet.commit()
    finally:
        # Close the connection even when the statement fails.
        connet.close()
数据可视化
Flask框架
web框架
新建一个flask框架
app.py的测试文件:
import datetime

from flask import Flask, render_template, request

app = Flask(__name__)


@app.route('/index/<name>')
def hello_word(name):
    """Return a greeting containing the string taken from the URL."""
    return 'hello word %s' % name


@app.route('/index/<int:id>')
def hello_word2(id):
    """Return a greeting containing the integer taken from the URL."""
    # The parameter name must match the <int:id> URL variable.
    return 'hello %d' % id


@app.route('/')
def index():
    """Render index.html with a date, a list and a dict as sample data."""
    time = datetime.date.today()
    name = [1, 2, 3]
    dic = {"name": "黄耀辉", "age": "18"}
    return render_template("index.html", time=time, name=name, dic=dic)


@app.route('/register')
def register():
    """Render the registration form."""
    return render_template("register.html")


@app.route('/result', methods=['POST'])
def register_result():
    """Render result.html with the submitted form fields."""
    # The route only accepts POST, so the former request.method check and
    # its error.html branch could never run and were removed.
    result = request.form
    return render_template("result.html", result=result)


if __name__ == '__main__':
    app.run(debug=True)
html中的数据操作
<body>
    <div>豆瓣爬虫</div>
    <div>时间:{{ time }}</div>
    <div>
        测试列表:
        <!-- li elements need a list container as parent -->
        <ul>
            {% for i in name %}
            <li>{{ i }}</li>
            {% endfor %}
        </ul>
    </div>
    <div>
        测试字典:
        <table>
            {% for key, value in dic.items() %}
            <tr>
                <td>{{ key }}</td>
                <td>{{ value }}</td>
            </tr>
            {% endfor %}
        </table>
    </div>
</body>
表单提交
<!-- url_for takes the endpoint name, which is the view function name
     (register_result); 'result' is not a registered endpoint. -->
<form action="{{ url_for('register_result') }}" method="post">
    <p>姓名:<input type="text" name="name"></p>
    <p>性别:<input type="text" name="sex"></p>
    <p>年龄:<input type="text" name="age"></p>
    <p>地址:<input type="text" name="address"></p>
    <p><input type="submit"></p>
</form>
Echarts应用
百度数据可视化应用 官网地址链接
WordCloud应用
词云 官网地址
完整代码
app.py
from flask
import Flask
, render_template
, request
import sqlite3
import jieba
from matplotlib
import pyplot
as plt
from wordcloud
import WordCloud
from PIL
import Image
import numpy
as np
app
= Flask
(__name__
)
@app.route('/')
def home():
    """Render home.html for the site root."""
    page = render_template("home.html")
    return page
@app.route('/movie')
def movie():
    """Render movie.html with every row of the doubanTop250 table."""
    con = sqlite3.connect("./douban/test.db")
    try:
        cur = con.cursor()
        # fetchall() yields the same list of row tuples the manual append
        # loop built.
        movies = cur.execute("select * from doubanTop250").fetchall()
        cur.close()
    finally:
        # Release the connection even when the query fails.
        con.close()
    return render_template("movie.html", movies=movies)
@app.route('/score')
def score():
    """Render score.html with each distinct score and its number of movies."""
    sql = "select find_score,count(find_score) from doubanTop250 group by find_score"
    con = sqlite3.connect("./douban/test.db")
    try:
        cur = con.cursor()
        rows = cur.execute(sql).fetchall()
        cur.close()
    finally:
        # Release the connection even when the query fails.
        con.close()
    # Split the (score, count) pairs into the two parallel lists that the
    # template receives.
    find_score = [row[0] for row in rows]
    find_number = [row[1] for row in rows]
    return render_template("score.html", score=find_score, number=find_number)
@app.route('/word')
def word():
    """Build the word-cloud image from the stored taglines and render word.html.

    The find_sign values are read from the database, segmented with jieba,
    drawn as a word cloud masked by ./static/timg.jpg and saved to
    ./static/word.jpg.
    """
    con = sqlite3.connect("./douban/test.db")
    try:
        cur = con.cursor()
        rows = cur.execute("select find_sign from doubanTop250").fetchall()
        cur.close()
    finally:
        # Release the connection even when the query fails.
        con.close()

    # Join once instead of growing a string in a loop; a NULL tagline
    # contributes nothing instead of raising TypeError.
    text = "".join(row[0] or "" for row in rows)
    string = " ".join(jieba.cut(text))

    img_array = np.array(Image.open("./static/timg.jpg"))
    word_cloud = WordCloud(
        background_color="#E4E7ED",
        mask=img_array,
        font_path="STKAITI.TTF",
    ).generate_from_text(string)

    fig = plt.figure(1)
    try:
        plt.imshow(word_cloud)
        plt.axis('off')
        plt.savefig("./static/word.jpg", dpi=500)
    finally:
        # Without closing, figure 1 stays alive and every later request
        # draws onto the same figure again.
        plt.close(fig)
    return render_template("word.html")
@app.route('/author')
def author():
    """Render author.html."""
    page = render_template("author.html")
    return page
# Run the app with debug mode enabled, only when this file is executed
# directly rather than imported.
if __name__ == '__main__':
    app.run(debug=True)
前端页面home.html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>豆瓣Top250</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="description" content="Premium Bootstrap 4 Landing Page Template" />
<meta name="keywords" content="bootstrap 4, premium, marketing, multipurpose" />
<meta content="Shreethemes" name="author" />
<link rel="shortcut icon" href="images/favicon.ico">
<link href="static/css/bootstrap.min.css" rel="stylesheet" type="text/css" />
<link href="static/css/materialdesignicons.min.css" rel="stylesheet" type="text/css" />
<link rel="stylesheet" href="static/css/unicons.css" />
<link rel="stylesheet" type="text/css" href="static/css/pe-icon-7.css">
<link href="static/css/magnific-popup.css" rel="stylesheet" type="text/css" />
<link href="static/css/style.css" rel="stylesheet" type="text/css" />
<link href="static/css/default.css" rel="stylesheet" id="color-opt">
</head>
<body>
<header id="topnav" class="defaultscroll sticky" style="background: #409EFF">
<div class="container">
<div>
<a class="logo" href="/" target="_blank" style="color: #0b0b0b">
豆瓣Top250
</a>
</div>
<div id="navigation">
<ul class="navigation-menu">
<li class="has-submenu">
<a href="/">首页
</a>
</li>
<li class="has-submenu">
<a href="/movie">电影
</a>
</li>
<li class="has-submenu">
<a href="/score">评分
</a>
</li>
<li class="has-submenu">
<a href="/word">词云
</a>
</li>
<li class="has-submenu">
<a href="/author">作者
</a>
</li>
</ul>
</div>
</div>
</header>
<section class="section" id="price" style="background: #E4E7ED;min-height: 600px">
<div class="container">
<div class="row justify-content-center">
<div class="col-12">
<div class="section-title text-center mb-4 pb-2">
<h4 class="title mb-4">豆瓣电影Top250数据分析
</h4>
<p class="text-muted para-desc mx-auto mb-0">应用Python爬虫、Flask框架、Echarts、Word Cloud等技术实现
</p>
</div>
</div>
</div>
<div class="row">
<div class="col-lg-3 col-md-6 col-12 mt-4 pt-2">
<a href="/movie">
<div class="card service-wrapper rounded border-0 shadow px-4 py-5">
<div class="icon text-center text-primary h1 shadow rounded bg-white">
<i class="uim uim-airplay"></i>
</div>
<div class="content mt-4">
<h5 class="title">经典电影
</h5>
</div>
</div>
</a>
</div>
<div class="col-lg-3 col-md-6 col-12 mt-4 pt-2">
<div class="card service-wrapper rounded border-0 shadow px-4 py-5">
<a href="/score">
<div class="icon text-center text-primary h1 shadow rounded bg-white">
<i class="uim uim-circle-layer"></i>
</div>
<div class="content mt-4">
<h5 class="title">评分统计
</h5>
</div>
</a>
</div>
</div>
<div class="col-lg-3 col-md-6 col-12 mt-4 pt-2">
<div class="card service-wrapper rounded border-0 shadow px-4 py-5">
<a href="/word">
<div class="icon text-center text-primary h1 shadow rounded bg-white">
<i class="uim uim-signal-alt-3"></i>
</div>
<div class="content mt-4">
<h5 class="title">词汇统计
</h5>
</div>
</a>
</div>
</div>
<div class="col-lg-3 col-md-6 col-12 mt-4 pt-2">
<div class="card service-wrapper rounded border-0 shadow px-4 py-5">
<a href="/author">
<div class="icon text-center text-primary h1 shadow rounded bg-white">
<i class="uim uim-flip-h-alt"></i>
</div>
<div class="content mt-4">
<h5 class="title">作者信息
</h5>
</div>
</a>
</div>
</div>
</div>
</div>
</section>
<footer class="bg-dark footer-bar py-4">
<div class="container">
<div class="row justify-content-center">
<div class="col-12 text-center">
<p class="foot-color mb-0">persistenthuang@163.com
</p>
</div>
</div>
</div>
</footer>
<script src="static/js/jquery.min.js"></script>
<script src="static/js/bootstrap.bundle.min.js"></script>
<script src="static/js/jquery.easing.min.js"></script>
<script src="static/js/scrollspy.min.js"></script>
<script src="static/js/jquery.magnific-popup.min.js"></script>
<script src="static/js/magnific.init.js"></script>
<script src="static/js/parallax.js"></script>
<script src="static/js/bundle.js"></script>
<script src="static/js/feather.min.js"></script>
<script src="static/js/contact.js"></script>
<script src="static/js/app.js"></script>
</body>
</html>
movie.html
<section class="section" id="price" style="background: #E4E7ED;min-height: 600px">
<div class="container">
<div class="row justify-content-center">
<div class="col-12">
<div class="section-title text-center mb-4 pb-2">
<h4 class="title mb-4">豆瓣电影Top250电影
</h4>
</div>
</div>
</div>
<table class="table table-hover table-light">
<tr>
<td>排名
</td>
<td>中文名称
</td>
<td>外文名称
</td>
<td>评分
</td>
<td>人数
</td>
<td>一句话描述
</td>
<td>其他信息
</td>
</tr>
{%for movie in movies%}
<tr>
<td>{{movie[0]}}
</td>
<td>
<a href="{{movie[1]}}" target="_blank">
{{movie[3]}}
</a>
</td>
<td>{{movie[4]}}
</td>
<td>{{movie[5]}}
</td>
<td>{{movie[6]}}
</td>
<td>{{movie[7]}}
</td>
<td>{{movie[8]}}
</td>
</tr>
{%endfor%}
</table>
</div>
</section>
<footer class="bg-dark footer-bar py-4">
<div class="container">
<div class="row justify-content-center">
<div class="col-12 text-center">
<p class="foot-color mb-0">persistenthuang@163.com
</p>
</div>
</div>
</div>
</footer>
score.html
<section class="section" id="price" style="background: #E4E7ED;min-height: 600px">
<div class="container">
<div class="row justify-content-center">
<div class="col-12">
<div class="section-title text-center mb-4 pb-2">
<h4 class="title mb-4">豆瓣电影Top250评分分布图
</h4>
</div>
</div>
</div>
<div id="main" style="width: 100%;height:450px;margin: 0 auto;"></div>
</div>
</section>
<footer class="bg-dark footer-bar py-4">
<div class="container">
<div class="row justify-content-center">
<div class="col-12 text-center">
<p class="foot-color mb-0">persistenthuang@163.com
</p>
</div>
</div>
</div>
</footer>
<script type="text/javascript">
var myChart = echarts.init( document.getElementById('main'));
var dataAxis = {{ score }};
var data = {{ number }};
var yMax = 50;
var dataShadow = [];
for (var i = 0; i < data.length; i++) {
dataShadow.push(yMax);
}
option = {
xAxis: {
type: 'category',
data: dataAxis,
axisLabel: {
inside: true,
textStyle: {
color: '#1c1b1b'
}
},
axisTick: {
show: false
},
axisLine: {
show: false
},
z: 10
},
yAxis: {
axisLine: {
show: false
},
axisTick: {
show: false
},
axisLabel: {
textStyle: {
color: '#999'
}
}
},
dataZoom: [
{
type: 'inside'
}
],
series: [
{
type: 'bar',
itemStyle: {
color: 'rgba(0,0,0,0.05)'
},
barGap: '-100%',
barCategoryGap: '40%',
data: dataShadow,
animation: false
},
{
type: 'bar',
itemStyle: {
color: new echarts.graphic.LinearGradient(
0, 0, 0, 1,
[
{offset: 0, color: '#83bff6'},
{offset: 0.5, color: '#188df0'},
{offset: 1, color: '#188df0'}
]
)
},
emphasis: {
itemStyle: {
color: new echarts.graphic.LinearGradient(
0, 0, 0, 1,
[
{offset: 0, color: '#2378f7'},
{offset: 0.7, color: '#2378f7'},
{offset: 1, color: '#83bff6'}
]
)
}
},
data: data
}
]
};
var zoomSize = 6;
myChart.on('click', function (params) {
console.log(dataAxis[Math.max(params.dataIndex - zoomSize / 2, 0)]);
myChart.dispatchAction({
type: 'dataZoom',
startValue: dataAxis[Math.max(params.dataIndex - zoomSize / 2, 0)],
endValue: dataAxis[Math.min(params.dataIndex + zoomSize / 2, data.length - 1)]
});
});
myChart.setOption(option);
</script>
word.html
<section class="section" id="service" style="background: #E4E7ED;min-height: 600px">
    <div class="container mt-60 mt-5">
        <div class="row align-items-center">
            <div class="col-lg-8 col-md-8">
                <div class="mr-lg-5">
                    <img src="./../static/word.jpg" class="img-fluid" alt="">
                </div>
            </div><!--end col-->
            <div class="col-lg-4 col-md-4 mt-4 mt-sm-0 pt-2 pt-sm-0">
                <div class="section-title">
                    <h2 class="text-primary">
                        <i class="uim uim-google-play"></i>
                    </h2>
                    <h4 class="title mt-3 mb-4">词频统计</h4>
                    <p class="text-blue para-desc">根据250部电影的一句话概述,提取的词云树,让我们了解一下经典电影都有什么相同点</p>
                </div>
            </div><!--end col-->
        </div><!--end row-->
    </div><!--end container-->
</section><!--end section-->
<!-- Services End -->
<footer class="bg-dark footer-bar py-4">
    <div class="container">
        <div class="row justify-content-center">
            <div class="col-12 text-center">
                <p class="foot-color mb-0">persistenthuang@163.com</p>
            </div>
        </div>
    </div>
</footer>