一、Python基础语法
1标识符
首字母是字母或者下划线,其余可以是字母、下划线、数字;大小写敏感
2赋值
python可以实现多个数值的赋值
# Tuple unpacking binds several names in a single assignment.
a, b = 2, 1.11
print(a)
print(b)
3数据类型
3.1整型
整数类型在32位机器上的取值:-2^31 ~ 2^31-1
整数类型在64位机器上的取值:-2^63 ~ 2^63-1
python基本就没有整数类型溢出的问题了(Python 3 的 int 是任意精度整数)
3.2布尔型
True 是用 1 存储的
False 是用 0 存储的
# bool values convert to the integers 1 (True) and 0 (False).
a = True
b = False
print(int(a))
print(int(b))
3.3浮点型
# Scientific notation: 9.8e3 is 9.8 * 10**3, -2e-2 is -2 * 10**-2.
a = 9.8e3
print(a)
b = -2e-2
print(b)
其中e3就表示10^3(例如 9.8e3 = 9.8 × 10^3 = 9800.0)
3.4复数型
# A complex literal uses the suffix j for the imaginary part.
a = 2 + 3j
print(a)
print(a.real)         # real part, as a float
print(a.imag)         # imaginary part, as a float
print(a.conjugate())  # complex conjugate
4包 package
5函数
进制转换
十进制转二进制 bin
print(bin(83))
十进制转八进制 oct
print(oct(83))
print('%o' % 83)
print(oct(83)[2:])
十进制转十六进制 hex
print(hex(83))
print('%x' % 83)
二进制转十进制 0b1010
print(0b1010)
八进制转十进制 0o1010
print(0o1010)
十六进制转十进制 0x1010
print(0x1010)
>>> int(4.5) # 结果向零取整(直接截去小数部分,例如 int(-4.5) 得到 -4)
4
>>> int('123') # 默认为十进制
123
isinstance() 判断某个元素是不是某种类型
a = int(3.14)
# Print the check; as a bare expression its result would be discarded
# when this runs as a script.
print(isinstance(a, int))
map() 映射函数
# map() returns a lazy, one-shot iterator; list()/tuple() consume it.
a = map(str, range(5))
b = map(float, range(9))
print(list(a))
print(tuple(b))
6字典
# fromkeys() builds a dict whose keys all share one value; the keys may
# come from any iterable (a tuple or a list here).
aDic = {}.fromkeys(('John', 'Rose', 'Jiw', 'SunY'), 3000)
aDic2 = {}.fromkeys(['John', 'Rose', 'Jiw', 'SunY'], 3000)
print(sorted(aDic))  # sorted() over a dict yields its keys in order

# zip() pairs the two lists element-wise and dict() consumes the pairs.
names = ['John', 'Rose', 'Jiw', 'SunY']
salaries = [3000, 2000, 1000, 4000]
print(dict(zip(names, salaries)))

# Build a code -> price mapping from (code, company, price) records.
PList = [('AXP', 'American Express Company', '78.51'),
         ('BA', 'The Boeing Company', '184.76'),
         ('CAT', 'Caterpillar', '96.39')]
d = {}
for item in PList:
    d[item[0]] = item[2]
print(d)

names = ['jiwei', 'sunyan', 'liuzixin', 'jihong']
age = [24, 22, 5, 31]
dic = dict(zip(names, age))
print(dic.keys())
print(dic.values())
for i, j in dic.items():
    print(i, j)
字典的查找
inf = {'jiwei': 24, 'sunyan': 22}
# get() returns None for a missing key instead of raising KeyError.
print(inf.get('licai'))
字典的删除
json文件与字典的关系和转换
一般都是将字典转换成json格式,然后发到网页上,
而数据挖掘和分析是将网上的json文件转换成字典格式,再使用Frame或者Series格式去分析。
json格式–>字典;解码
json.loads()(从字符串解码;json.load() 用于从文件对象读取)
字典–>json格式;编码
json.dumps()
import json

x = {'jiwei': 24, 'sunyan': 22, 'john': {'a': 'A', 'b': 22}}
# dumps() encodes the dict as a JSON string; loads() decodes it back.
x_encode = json.dumps(x)
print(x_encode)
print(json.loads(x_encode))
7 集合
创建集合
set()frozenset()
集合的运算符
in
==
< 和 >
aSet = set('sunnise')
bSet = set('sunset')
print('u' in aSet)        # membership
print(aSet == bSet)       # equality
print(aSet < bSet)        # proper subset
print(set('sun') < aSet)
& 交集
| 并集
-减
^两个集合中不同的元素
aSet = set('sunnise')
bSet = set('sunset')
print(aSet & bSet)  # intersection
print(aSet | bSet)  # union
print(aSet - bSet)  # difference
print(aSet ^ bSet)  # symmetric difference
集合的函数
面向所有集合的函数
# These methods exist on both set and frozenset and never mutate.
aSet = set('sunnise')
bSet = set('sunset')
print(aSet.issubset(bSet))
print(aSet.intersection(bSet))
print(aSet.difference(bSet))
print(aSet.copy())
面向可变集合的函数
# Mutating methods are only available on set, not frozenset.
aSet = set('sunnise')
aSet.add('!')
print(aSet)
aSet.remove('!')
print(aSet)
aSet.update('Yeah')  # adds each character of the string
print(aSet)
aSet.clear()
print(aSet)
Python变量和数据类型
# Escape sequences (\" and \n) inside an ordinary string literal.
s = 'Python was started in 1989 by \"Guido\".\n\nPython is free and easy to learn.'
print(s)

# Raw triple-quoted string: spans lines and needs no escaping of quotes.
s = r'''"To be, or not to be": that is the question.
Whether it's nobler in the mind to suffer.'''
print(s)

# Raw string: backslashes are kept literally.
s2 = r'\(~_~)/ \(~_~)/'
print(s2)

s3 = '''Python is created by "Guido".\nIt is free and easy to learn.
Let's start learn Python in imooc!
'''
print(s3)

# `x or default` evaluates to x when x is truthy, otherwise to default.
a = 'python'
print('hello, ', a or 'world')
b = ''
print('hello,', b or 'world')
变量作用域
全局变量
局部变量
如果全局变量和局部变量同名:内层屏蔽外层
a = 3


def f(x):
    """Print the global ``a``, rebind it to 5, then print ``a + x``.

    The ``global`` declaration makes every use of ``a`` in this function
    refer to the module-level variable, so the assignment stays visible
    after the call returns.
    """
    global a
    print(a)
    a = 5
    print(a + x)


f(8)
print(a)  # 5: the call rebound the module-level name
异常(Exception)
# Each failure mode is handled by its own except clause.
try:
    num1 = int(input('enter the first number:'))
    num2 = int(input('enter the second number:'))
    print(num1 / num2)
except ValueError:
    # int() could not parse the text that was typed.
    print('Please input a digit!')
except ZeroDivisionError as err:
    print('The second number cannot be zero!')
    print(err)
当不知道哪里出错,可以使用如下捕获所有异常,一了百了
try:
    num1 = int(input('enter the first number:'))
    num2 = int(input('enter the second number:'))
    print(num1 / num2)
# NOTE: a bare except is kept because it is what this example demonstrates.
# It also swallows KeyboardInterrupt and SystemExit; real code should catch
# a specific exception class instead.
except:
    print('Something went wrong!')
上面的方法只能判断出错了,但是不知道具体出现什么错误!可以用使用如下方式
try:
    num1 = int(input('enter the first number:'))
    num2 = int(input('enter the second number:'))
    print(num1 / num2)
except Exception as err:
    # Binding the exception makes the concrete error message available.
    print('Something went wrong!')
    print(err)
异常处理还可以与else语句一起使用;对于多个异常可以如下使用:!!!
try:
    num1 = int(input('enter the first number:'))
    num2 = int(input('enter the second number:'))
    print(num1 / num2)
except (ValueError, ZeroDivisionError):
    # A tuple of classes handles several exception types in one clause.
    print('Invalid input!')
else:
    # Runs only when the try body raised nothing.
    print('Aha, everthing is OK!')
循环,只有当输入格式正确的时候才输出结果,跳出循环
# Keep asking until both inputs are valid and the division succeeds.
while True:
    try:
        num1 = int(input('enter the first number:'))
        num2 = int(input('enter the second number:'))
        print(num1 / num2)
        break  # reached only when nothing above raised
    except ValueError:
        print('Please input a digit!')
    except ZeroDivisionError:
        print('The second number cannot be zero!')
finally子句
不管出不出错,finally里面的子句都会执行!!!
def finally_test():
    """Read two integers, print their quotient and report success.

    Returns:
        1 when the division was printed, 0 when any exception was caught
        (its message is printed first).

    The ``finally`` clause runs on both paths, before the return value
    is handed back to the caller.
    """
    try:
        num1 = int(input('enter the first number:'))
        num2 = int(input('enter the second number:'))
        print(num1 / num2)
        return 1
    except Exception as err:
        print(err)
        return 0
    finally:
        print('It is a finally clause')


result = finally_test()
print(result)
二、常用标准库函数
re
1re.match
2re.search
3re.findall
math
import math

print(dir(math))  # every name the module exposes
print(math.pi)
print(math.e)
print(math.ceil(3.6))     # round up
print(math.floor(3.6))    # round down
print(math.pow(2, 3))     # always returns a float
print(math.sqrt(4))
print(math.degrees(3.14))  # radians -> degrees
print(math.radians(180))   # degrees -> radians
os
import os

print(os.getcwd())
path = 'G:\\test'
os.chdir(path)  # change the current working directory
print(os.getcwd())
os.rename('test.txt', 'test1.txt')
os.remove('test1.txt')
# Print the results; as bare expressions they would be silently discarded.
print(os.listdir(path))
# `filename` was never defined in the original snippet (NameError).
filename = 'test.txt'
print(os.path.join(path, filename))
datetime
import datetime

print(dir(datetime))

from datetime import date
print(date.today())

from datetime import time
tm = time(23, 20, 25)
print(tm)

# NOTE: from here on the name `datetime` refers to the class, shadowing
# the module imported at the top of this snippet.
from datetime import datetime
dt = datetime.now()
print(datetime(2020, 5, 31, 20, 16))
print(dt)
print(dt.strftime('%a, %b %d %Y %H:%M'))
random
import random

print(random.random())       # float in [0.0, 1.0)
print(random.uniform(1, 5))  # float between 1 and 5
print(random.randint(1, 5))  # int in [1, 5], both ends included
a = list(range(1, 10))
print(random.choice(a))
random.shuffle(a)            # shuffles in place and returns None
print(a)
print(random.sample(range(100), 10))  # 10 distinct values
print(random.randrange(1, 10))        # int in [1, 10)
# random.random() takes no arguments (random.random(10) is a TypeError),
# so draw ten values explicitly.
print([random.random() for _ in range(10)])
NumPy
ndarray(N维数组)
# This snippet used `np` without importing it first.
import numpy as np

a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(a[0])          # first row
print(a[0][0])       # first element of the first row
print(a[:, [0, 1]])  # columns 0 and 1 of every row
print(a[::2])        # every second row
print(a[:, ::2])     # every second column
print(a[::-1])       # rows in reverse order
print(a[:, ::-1])    # columns in reverse order
import numpy as np

a = np.array([[1, 2], [3, 4], [5, 6]])
# reshape() returns a new array object; `a` keeps its own shape.
b = a.reshape(2, 3)
print(b.shape)
print(a.shape)
# resize() changes `a` itself, in place.
a.resize(6, 1)
print(a)
import numpy as np

# A 10x10 array of ones whose interior is zeroed, leaving a border of ones.
x = np.ones((10, 10))
x[1:-1, 1:-1] = 0
print(x)
print('===================================')
# full(): every element set to the given fill value.
x2 = np.full((10, 10), np.pi, dtype=np.float64)
print(x2)
print('===================================')
# full_like(): same shape and dtype as x2, filled with 3.
x3 = np.full_like(x2, 3)
print(x3)
print('===================================')
x4 = np.eye(3)
print(x4)
print('=================================')
print(np.identity(3))
print('=====================================')
# k=-2 places the ones on the second diagonal below the main one.
print(np.eye(8, k=-2))
import numpy as np

X = np.arange(1, 101)
print(X)
print('===================')
print(X <= 50)       # element-wise comparison gives a boolean array
print('===================')
print(X[X <= 50])    # a boolean mask selects the matching elements
print('=====================')
# Combine masks with & and parenthesise each comparison.
print(X[(X > 50) & (X % 2 == 0)])
print('=====================')
X[(X > 50) & (X % 2 == 0)] = -1  # assignment through a mask edits X in place
print(X)
print('======================')
# where(cond, a, b) picks a where cond holds and b elsewhere.
print(np.where((X % 2 == 0) & (X > 50), -1, X))
数组的矢量化运算和广播!!!
import numpy as np

score = np.array([[79, 80, 92], [67, 89, 99]])
# keepdims=True keeps the result as shape (2, 1), so it broadcasts
# against the (2, 3) score array row by row.
mean_score = score.mean(axis=1, keepdims=True)
print(mean_score)
print(score - mean_score)
Pandas
Series(变长字典)
import numpy as np
import pandas as pd

# A Series pairs values with index labels; without an explicit index the
# labels default to 0..n-1.
print(pd.Series([11, 12], index=['北京', '上海']))
print(pd.Series(np.arange(3, 6), index=['A', 'B', 'C']))
print(pd.Series(['jiwei', 'sunyan', 'liuzixin']))
print(pd.Series(np.array([12, 222, 32]), index=['A', 'B', 'C']))
import numpy as np
import pandas as pd

x = pd.Series(np.random.rand(4), index=range(1, 5))
print(x)
print(x.values)
print(x.keys())
print(x[1])  # label lookup: the index labels run from 1 to 4
print('===================================')
x2 = pd.Series([3, 5, 1, 9], index=['A', 'B', 'C', 'D'])
print(x2['C'])
x2['C'] = 8
print(x2)
print(x2[['A', 'B', 'C']])  # a list of labels selects a sub-Series
x2.name = 'Pop'
x2.index.name = 'INDEX'
print('=========================')
print(x2)
print('===================')
# Arithmetic aligns on index labels; x and x2 share none.
print(x + x2)
import pandas as pd
data = {'A': 12, 'B': 22, 'C': 33}
# 'D' has no entry in the dict, so reindexing to these labels fills that
# position with NaN.
p = pd.Series(data).reindex(['A', 'B', 'D', 'C'])
print(p)
print(p.isnull())  # element-wise check for missing (null) values
A 12.0
B 22.0
D NaN
C 33.0
dtype: float64
A False
B False
D True
C False
dtype: bool
DataFrame(数据框)
import numpy as np
import pandas as pd

# Build a DataFrame from a list of rows with explicit column and row labels.
data = [['Alex', 10], ['Bob', 20], ['John', 30]]
x = pd.DataFrame(data, columns=['name', 'numbers'], index=range(1, 4))
print(x)
print('=====================================')
# Build a DataFrame from a dict of equal-length columns.
data2 = {'state': ['beijing', 'shanghai', 'shenzhen', 'shandong'],
         'year': [2001, 2000, 2005, 2010],
         'pop': [2.1, 4.2, 1.0, 5]}
x2 = pd.DataFrame(data2)
print(x2)
print('========================================')
data3 = data2
# `columns` fixes the column order; 'debt' is not a key of the dict.
x3 = pd.DataFrame(data3, columns=['year', 'state', 'pop', 'debt'])
print(x3)
print('========================================')
print(x3.columns)
print(x3['state'])
print('=======================================')
x3['debt'] = np.arange(4)
print(x3)
print('====================================')
data4 = data3
x4 = pd.DataFrame(data4, columns=['state', 'year', 'pop', 'debt'],
                  index=['one', 'two', 'three', 'four'])
x4['debt'] = np.random.rand(4)
print(x4)
print(x4.iloc[:2, -2:])  # first two rows, last two columns
print(x4['pop'].max())
# Compare against a number: 'pop' holds floats, and comparing them with
# the string '2.5' raises TypeError.
print(x4['pop'] >= 2.5)
print('=================')
# NOTE(review): `frame` is not defined anywhere in this excerpt; presumably
# it is a DataFrame with a numeric 'pay' column -- confirm before running.
for x in frame['pay']:
    if int(x) >= 5000:
        print(x)
print(frame['pay'] >= 5000)                  # boolean mask
print((frame['pay'])[frame['pay'] >= 5000])  # values selected by the mask
SciPy
三、数据的获取与表示
1本地数据获取
文件的打开、读写和关闭
文件的打开
相对路径绝对路径
# Relative path; the mode defaults to 'r' (read text).
f = open('infile.txt')
# Absolute path.
f1 = open('G:/3.0 哈尔滨师范大学/MarkDown学习/Python/file.txt')
# 'w' truncates an existing file or creates a new one.
f2 = open('infile.txt', 'w')
# Binary write; a buffering value of 0 is only allowed in binary mode.
f3 = open('outfile.txt', 'wb', 0)
# `encoding` only applies to text mode; combining it with 'rb' raises
# ValueError, so this file is opened as text.
f4 = open('infile.txt', 'r', encoding='UTF-8')

# Files opened without `with` have to be closed explicitly.
for handle in (f, f1, f2, f3, f4):
    handle.close()
文件格式
r:读文件模式,文件必须存在
w:写文件模式,清空文件内容或者是新建一个文件
a:追加,在文件的尾部加内容
r+ = r + w 读写模式
w+ = w + r 读写模式(清空原内容)
a+ = a + r 读和追加模式
rb : rb+ 后面加b就表示二进制文件的读写和追加
wb :wb+
ab:ab+
文件相关函数
open()函数返回一个文件file对象 f = open('file.txt', 'r+')
对象名.方法名
f.read() 将文件内全部数据读出来作为一个字符串返回
with open('infile.txt') as f:
    print(f.read(5))  # at most the first 5 characters
    print(f.read())   # everything after the current position
f.write()
下面的写法并不推荐
# Deliberately shown as the discouraged form: if write() raises, close()
# is never reached and the file stays open.
f = open('infile.txt', 'w')
f.write('Hello World !')
f.close()
正常应该以下面的写法去操作
# The with statement closes the file on exit, even if write() raises.
with open('infile.txt', 'w') as f:
    f.write('Hello!')
f.readline() 读取一行数据
f.readlines() 读取多行数据,返回的是一个列表
f.writelines() 写入多行数据
with open('infile.txt') as f1:
    data = f1.readlines()

# Prefix every line with its 1-based line number.
# NOTE(review): the original joined number and text with an empty string,
# so no separator is written; use str(number) + ' ' + line if a gap is wanted.
data = [str(number) + line for number, line in enumerate(data, start=1)]

with open('outfile.txt', 'w') as f2:
    f2.writelines(data)
f.close() 关闭文件对象
当使用with open() as f:语句就不用每次都关闭文件对象了!
f.seek()
s = 'Tencent Technology Company Limited'
# 'a+' opens for appending and reading; writes go to the end of the file.
with open('outfile.txt', 'a+') as f:
    f.writelines('\n')
    f.writelines(s)
    f.seek(0)  # move back to the start so the whole file can be read
    print(f.readlines())
小案例:查看文件有几行
try:
    with open('infile.txt') as f:
        data = f.readlines()
except FileNotFoundError:
    print('file not find!')
else:
    # Count only when the read succeeded; after a FileNotFoundError `data`
    # is unbound and len(data) would raise NameError.
    lens = len(data)
    print('file has {} lines.'.format(lens))
和os库结合起来使用
import os
def countLines(fname):
    """Print how many lines the text file at ``fname`` contains.

    Args:
        fname: Path of the file to read.

    Prints ``'<file name> has <n> lines'`` on success, or
    ``'not found file'`` when the file does not exist. Returns None.
    """
    try:
        with open(fname) as f:
            data = f.readlines()
    except FileNotFoundError:
        print('not found file')
        # Stop here: `data` is unbound, so counting would raise NameError.
        return
    lens = len(data)
    # basename() works with either path separator, whereas splitting on
    # '\\' fails for paths that contain no backslash.
    print(os.path.basename(fname) + ' has ' + str(lens) + ' lines')
path = 'C:/python_course/ImoocNanjingUniversity/pythonbase'
# os.listdir() returns the names of all entries directly under `path`.
for fname in os.listdir(path):
    # Only files whose name ends with .txt are counted.
    if fname.endswith('.txt'):
        # Join the directory and the file name into a full path.
        file_path = os.path.join(path, fname)
        # Report the line count of that file.
        countLines(file_path)
2网络数据获取
网络数据如何获取?(爬虫)
抓取 requests库解析 bs4库的BeautifulSoup
如何查看网页能不能被爬取呢?
这个时候可以在网站域名的后面加上 /robots.txt 查看该站点的爬虫协议,其中的 Allow / Disallow 规则说明了哪些路径允许或禁止被爬取(能访问到 robots.txt 并不代表整个网站都可以爬取)
例如https://blog.csdn.net/robots.txt
requests库
import requests

# Without a timeout, requests can wait forever on an unresponsive server.
r = requests.get('https://blog.csdn.net/', timeout=10)
print(r.status_code)  # HTTP status code of the response
print(r.text)         # response body decoded as text
BeautifulSoup库
此时注意!!!解析的时候必须要有lxml解析器
不然会报错bs4.FeatureNotFound: Couldn’t find a tree builder with the features you requested: lxml. Do you need to install a parser library
可以安装lxml库,解决问题。
from bs4 import BeautifulSoup

markup = '<p class="title"><b>The Little Prince</b></p>'
soup = BeautifulSoup(markup, features='lxml')
print(soup.b)        # first <b> tag in the document
print(type(soup.b))
tag = soup.p
print(tag.name)      # the tag's name, here 'p'
print(soup.find_all('b'))  # every <b> tag
综合起来使用实例
import requests
from bs4 import BeautifulSoup

# Without a timeout, requests can wait forever on an unresponsive server.
r = requests.get('https://image.baidu.com/search/index?isource=infinity&iname=baidu&tn=baiduimage&word=caizhuoyan', timeout=10)
soup = BeautifulSoup(r.text, 'lxml')
pattern = soup.find_all('p')  # every <p> tag in the page
print(pattern)
3拓展学习
正则表达式
正则表达式验证的网站:
www.regex101.com
输入和输出
1如何输入获得多个字符串?
字符串格式
# split() without arguments splits the input on whitespace.
x, y = input('input:').split()
print('x:', x)
print('y:', y)
# split(',') splits the input on commas instead.
a, b = input('input:').split(',')
print('a:{}, b:{}'.format(a, b))
2如何输入获得多个整数?
整数类型
# Typing "1, 2, 3" makes eval() return a tuple that is unpacked here.
# NOTE: eval() executes whatever expression is typed, so this is only
# acceptable for a classroom demo; use ast.literal_eval for untrusted input.
x, y, z = eval(input('input:'))
print('x:{}, y:{}, z:{}'.format(x, y, z))
3如何输入一个元素后获得一个元素均为数值型的列表?
# NOTE: eval() executes whatever expression is typed, so this is only
# acceptable for a classroom demo; use ast.literal_eval for untrusted input.
# Typing "1, 2, 3" evaluates to a tuple, which list() converts.
st = list(eval(input('input:')))
print(st)
# Typing "[1, 2, 3]" evaluates to a list directly.
st2 = eval(input('input:'))
print(st2)
4多个元素输出,以逗号分隔开
x, y, z = 1, 3, 'a'
# sep replaces the default single space between printed values.
print(x, y, z, sep=',')
5对于循环中的输出,将所有输出数据放在一行输出
# end replaces the trailing newline, so all values land on one line.
for i in range(10):
    print(i, end=',')
列表解析
data = input('input:').split(',')
print(data)
# NOTE: eval() executes whatever expression is typed, so this is only
# acceptable for a classroom demo; use ast.literal_eval for untrusted input.
print([eval(i) for i in data])
函数式编程
用函数式编程完成上面的列表解析
data = input('input:').split(',')
# NOTE: eval() executes whatever expression is typed, so this is only
# acceptable for a classroom demo; use ast.literal_eval for untrusted input.
print(list(map(eval, data)))
主要由3个基本函数和1个算子构成
map()函数
# map() applies the function to every element of the iterable.
lst = [2, 1, 8, 9]
print(list(map(lambda x: x * 2, lst)))
data = list(range(10))
print(list(map(str, data)))
data2 = ['asd', 'open', 'hello', 'python']
print(list(map(lambda x: x.upper(), data2)))
filter()函数
# filter() keeps the elements for which the function returns a true value.
lst2 = [1, 2, 3, 4, 5, 6]
print(list(filter(lambda x: x % 2 == 0, lst2)))
reduce()函数
from functools import reduce

# reduce() folds the sequence from the left: ((((1 + 2) + 3) + 4) + 5).
lst3 = [1, 2, 3, 4, 5]
print(reduce(lambda x, y: x + y, lst3))
算子(operator):lambda
浅拷贝和深拷贝
赋值如x = [1, 2, 3] y = x 表示x和y共用一块内存地址
a = [1, 2, 3, [4, 4]]
# Plain assignment copies nothing: both names refer to the same list.
b = a
a[0], a[3][1] = 9, 9
print(a)
print(b)  # shows the same changes as a
浅拷贝:只复制了父对象,而不复制内部子对象;copy()
a = [1, 2, 3, [4, 4]]
# copy() makes a new outer list that still shares the nested list.
c = a.copy()
a[0], a[3][0] = 9, 9
print(a)
print(c)  # c[0] is unchanged, but the shared inner list shows the 9
深拷贝:不仅复制父对象,而且复制内部子对象;copy.deepcopy()
import copy

a = [1, 2, 3, [4, 4]]
# deepcopy() also copies the nested list, so nothing is shared.
d = copy.deepcopy(a)
a[0], a[3][0] = 9, 9
print(a)
print(d)  # completely unaffected by the changes to a