Pandas基础
文章目录
Pandas基础一、Series二、DataFrame三、索引值四、索引和选取loc和iloc函数讲解
五、行和列的操作map、apply、applymap函数讲解Pandas的函数应用、层级索引、统计计算
六、pandas数据可视化1.使用series绘制线性图2.使用DataFrame绘制线型图3.使用series绘制柱状图4.使用DataFrame绘制柱状图5.使用DataFrame的hist方法生成直方图
七、pandas文件操作八、pandas数据清洗与整理1.数据清洗2.重复数据duplicated,drop_duplicates函数讲解3.替换值4.虚拟变量pandas的get_dummies函数讲解5.数据合并和重塑merge函数讲解concat函数讲解6.数据重塑stack和unstack函数讲解
九、综合案例——小费数据集
一、Series
Series讲解
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Build a Series from a plain list; no index is given, so pandas supplies
# the default integer one.
obj = Series([1, -2, 3, -4])
type(obj)

# Same values, this time with explicit string labels.
obj2 = Series([1, -2, 3, -4], index=['a', 'b', 'c', 'd'])
obj2.values
obj2.index

# Positional access goes through .iloc: a bare integer key on a
# string-labelled Series is deprecated in recent pandas releases.
obj2.iloc[2]
obj2[['c']]
obj2['c'] = 23

# Boolean filtering and element-wise arithmetic keep the labels attached.
obj2[obj2 < 0]
obj2 * 2
np.abs(obj2)

# A dict becomes a Series whose index is the dict's keys.
data = {
    '张三': 92,
    '李四': 78,
    '王五': 68,
    '小明': 82,
}
obj3 = Series(data)

# NOTE(review): `names` was never defined in the article text, so the call
# below raised NameError. The four dict keys are assumed here -- confirm
# against the original material.
names = ['张三', '李四', '王五', '小明']
obj4 = Series(data, index=names)

# Both the Series and its index can carry a name.
obj4.name = 'math'
obj4.index.name = 'students'
二、DataFrame
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

# Column-oriented input: each key is a column, each list holds its values.
data = {
    'name': ['张三', '李四', '王五', '小明'],
    'sex': ['female', 'female', 'male', 'male'],
    'year': [2001, 2001, 2003, 2002],
    'city': ['北京', '上海', '广州', '北京'],
}
df = DataFrame(data)

# `columns` fixes the column order explicitly.
df = DataFrame(data, columns=['name', 'sex', 'year', 'city'])

# `index` replaces the default integer row labels.
df = DataFrame(
    data,
    columns=['name', 'sex', 'year', 'city'],
    index=['a', 'b', 'c', 'd'],
)

# Membership tests against the column labels and the row labels.
'sex' in df.columns
'f' in df.index
三、索引值
obj = Series([1, -2, 3, -4], index=['b', 'a', 'c', 'd'])

# reindex reorders by label; a label missing from the source ('e') gets NaN.
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

# Forward-filling needs a monotonic index that is comparable with the new
# labels, so the string-labelled `obj` above cannot be used with range(6).
# NOTE(review): the integer-labelled series below is a reconstruction of a
# definition the article appears to have dropped -- confirm the labels.
obj_int = Series([1, -2, 3, -4], index=[0, 2, 3, 5])
obj2 = obj_int.reindex(range(6), method='ffill')
# `df` is the frame from the DataFrame section (row labels 'a'..'d').
# fill_value is only used for labels that are absent from the source frame.
df2 = df.reindex(['a', 'b', 'c', 'd'], fill_value=0.0)

# drop=True discards the old labels instead of keeping them as a column.
df3 = df2.reset_index(drop=True)

# Promote the 'name' column to be the row index.
df2 = df.set_index('name')
四、索引和选取
loc和iloc函数讲解
# Series: pick several labels at once, or a label slice (both ends included).
obj[['a', 'c']]
obj['a':'c']

# DataFrame: a list of column names returns a sub-frame.
df[['city', 'sex']]

# .loc selects rows by label, .iloc by integer position.
df2.loc['张三']
df2.iloc[1]

# Boolean masks are combined with & and need parentheses around each test.
df2[(df2['sex'] == 'female') & (df2['city'] == '北京')]
五、行和列的操作
map、apply、applymap函数讲解
Pandas的函数应用、层级索引、统计计算
new_data = {
    'city': '武汉',
    'name': '小李',
    'sex': 'male',
    'year': 2002,
}

# DataFrame.append was removed in pandas 2.0. Concatenating a one-row frame
# is the supported replacement; ignore_index=True renumbers the rows 0..n-1.
df = pd.concat([df, DataFrame([new_data])], ignore_index=True)
# axis=0 drops a row by label.
new_df = df.drop(2, axis=0)

# axis=1 drops a column by name.
# NOTE(review): no 'class' or 'math' column is created anywhere in the
# visible text before this point; the article presumably added them in a
# step that was lost -- confirm.
new_df = new_df.drop('class', axis=1)

# Relabel rows 3 -> 2 and 4 -> 3, and rename one column, modifying new_df
# in place.
new_df.rename(index={3: 2, 4: 3}, columns={'math': 'MATH'}, inplace=True)
# NOTE(review): `obj1` and a frame with a column named 'b' are not defined
# in the visible text; these calls presumably relied on example objects
# that were lost -- confirm.

# Sort by index labels, ascending and then descending.
obj1.sort_index()
obj1.sort_index(ascending=False)

# Sort by the stored values, largest first.
obj1.sort_values(ascending=False)

# Sort a frame's rows by one column, largest first.
df2.sort_values(by='b', ascending=False)
# Summary statistics for the frame's columns.
df.describe()

# Distinct values of a Series, and how often each one occurs.
obj.unique()
obj.value_counts()
# Passing two parallel label lists builds a two-level (hierarchical) index.
obj = Series(
    np.random.randn(9),
    index=[
        ['one', 'one', 'one', 'two', 'two', 'two', 'three', 'three', 'three'],
        ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c'],
    ],
)

# Select on the inner level: every outer label, inner label 'a'.
obj[:, 'a']
# Both axes carry a two-level index.
df = DataFrame(
    np.arange(16).reshape(4, 4),
    index=[['one', 'one', 'two', 'two'], ['a', 'b', 'a', 'b']],
    columns=[
        ['apple', 'apple', 'orange', 'orange'],
        ['red', 'green', 'red', 'green'],
    ],
)

# Swap the outer and inner row levels.
df.swaplevel(0, 1)

# The `level=` argument of sum() was removed in pandas 2.0; aggregating per
# index level is done through groupby(level=...) instead.
df.groupby(level=0).sum()

# Same aggregation over the inner column level: transpose, group the
# (former) column labels, then transpose back.
df.T.groupby(level=1).sum().T
六、pandas数据可视化
1.使用series绘制线性图
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
# Ten normally distributed samples labelled 'a'..'j'.
s = Series(
    np.random.normal(size=10),
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'],
)

# Series.plot() draws a line chart by default, with the index on the x axis.
s.plot()
plt.show()
2.使用DataFrame绘制线型图
# One column per distribution, 100 samples each.
df = DataFrame({
    'normal': np.random.normal(size=100),
    'gamma': np.random.gamma(1, size=100),
    'poisson': np.random.poisson(size=100),
})
df.describe()

# DataFrame.plot() draws one line per column.
df.plot()
plt.show()
# NOTE(review): the `df` assigned most recently in the text holds only the
# 'normal', 'gamma' and 'poisson' columns, so this presumably targets the
# earlier student frame that has a 'sex' column -- confirm.
df['sex'].value_counts()

# Horizontal bar chart of the category counts.
df['sex'].value_counts().plot(kind='barh')
3.使用series绘制柱状图
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Two stacked axes: vertical bars on top, horizontal bars below.
fig, axes = plt.subplots(2, 1)

# Despite its name, `df` holds a Series here (16 random values, one per
# character of the label string).
df = pd.Series(np.random.rand(16), index=list('abcdefgijkpolikj'))
df.plot.bar(ax=axes[0], color='r', alpha=0.7)
df.plot.barh(ax=axes[1], color='r', alpha=0.7)
plt.show()
4.使用DataFrame绘制柱状图
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 4x4 random frame whose column index is itself named 'bar'.
df = pd.DataFrame(
    np.random.rand(4, 4),
    index=['one', 'two', 'three', 'four'],
    columns=pd.Index(['A', 'B', 'C', 'D'], name='bar'),
)

# One group of bars per row, one bar per column.
df.plot.bar()
plt.show()
5.使用DataFrame的hist方法生成直方图
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt

# Two columns of 1000 standard-normal samples each.
df = pd.DataFrame(
    {'a': np.random.randn(1000), 'b': np.random.randn(1000)},
    columns=['a', 'b'],
)

# Histogram of both columns using 20 bins.
df.plot.hist(bins=20)
plt.show()
七、pandas文件操作
# Semicolon-separated file.
data = pd.read_csv('05_Regression_5.2_logreg_credit_scores.csv', sep=';')

# Comma-separated file read without treating the first line as a header.
data = pd.read_table('iris.data', sep=',', header=None)
data.head(6)

# Write the frame back out as CSV.
data.to_csv('iris.csv')
八、pandas数据清洗与整理
1.数据清洗
# Small mixed-type frame with one NaN in each column.
df1 = DataFrame([
    [3, 5, 3],
    [1, 6, np.nan],
    ['lili', np.nan, 'pop'],
    [np.nan, 'a', 'b'],
])

# Element-wise missing / present masks.
df1.isnull()
df1.notnull()

# Missing count per column, then for the whole frame.
df1.isnull().sum()
df1.isnull().sum().sum()

# Does each column contain a NaN? Does the frame contain any NaN at all?
df1.isnull().any()
df1.isnull().values.any()

# info() prints a per-column summary including non-null counts.
df1.info()

# Drop every row that contains at least one NaN.
df1.dropna()
# NOTE(review): the numeric frame this snippet operates on is not defined
# in the article text. A 3x4 frame matches the row label 2 and the column
# labels 0..3 used below -- confirm against the original material.
df2 = DataFrame(np.arange(12, dtype=float).reshape(3, 4))

# `.ix` was removed in pandas 1.0; label-based assignment uses `.loc`.
df2.loc[2, :] = np.nan
df2[3] = np.nan

# how='all' drops only rows (or, with axis=1, columns) that are entirely NaN.
df2.dropna(how='all')
df2.dropna(how='all', axis=1)

# Fill with one constant, or with a separate constant per column.
df2.fillna(0)
df2.fillna({0: 1, 1: 6, 2: 9, 3: 11})
df2.fillna({1: 6, 3: 0}, inplace=True)

# fillna(method='ffill') is deprecated; ffill() is the direct equivalent.
df2.ffill()

# Replace the NaNs in column 0 with that column's mean.
df2[0] = df2[0].fillna(df2[0].mean())
2.重复数据
duplicated,drop_duplicates函数讲解
data = {
    'name': ['张三', '李四', '张三', '小明'],
    'sex': ['female', 'male', 'female', 'male'],
    'year': [2001, 2002, 2001, 2002],
    'city': ['北京', '上海', '北京', '北京'],
}
df1 = DataFrame(data)

# True for every row that repeats an earlier row in full.
df1.duplicated()

# Remove the fully duplicated rows from df1 itself.
df1.drop_duplicates(inplace=True)

# Judge duplicates on a subset of columns; the first occurrence is kept
# unless keep='last' is given.
df1.drop_duplicates(['sex', 'year'])
df1.drop_duplicates(['sex', 'year'], keep='last')
3.替换值
data = {
    'name': ['张三', '李四', '王五', '小明'],
    'sex': ['female', 'male', '', 'male'],
    'year': [2001, 2003, 2001, 2002],
    'city': ['北京', '上海', '', '北京'],
}
df1 = DataFrame(data)

# Two parallel lists: the i-th old value becomes the i-th new value.
df1.replace(['', 2001], ['不详', 2002])

# The same mapping written as a dict of old -> new.
df1.replace({'': '不详', 2001: 2002})
def f(x):
    """Return the grade label for a numeric score.

    Scores of 90 and above map to '优秀', scores in [70, 90) to '良好',
    scores in [60, 70) to '合格', and everything else to '不合格'.
    """
    if x >= 90:
        return '优秀'
    elif 70 <= x < 90:
        return '良好'
    elif 60 <= x < 70:
        return '合格'
    else:
        return '不合格'
# NOTE(review): assumes df2 has a numeric 'math' column; no such column is
# created in the visible text -- confirm.

# Series.map applies f to each score and stores the labels in a new column.
df2['class'] = df2['math'].map(f)
del df2['class']

# Series.apply produces the same column for an element-wise function.
df2['class'] = df2['math'].apply(f)
4.虚拟变量
pandas的get_dummies函数讲解
df = DataFrame({
    '朝向': ['东', '南', '东', '西', '北'],
    '价格': [1200, 2100, 2300, 2900, 1400],
})

# One indicator column per distinct value of the categorical column.
pd.get_dummies(df['朝向'])
5.数据合并和重塑
merge函数讲解
price = DataFrame({
    'fruit': ['apple', 'banana', 'orange'],
    'price': [23, 32, 45],
})
amount = DataFrame({
    'fruit': ['apple', 'banana', 'apple', 'apple', 'banana', 'pear'],
    'amount': [5, 3, 6, 3, 5, 7],
})

# Default inner join on the shared key: fruits present in only one of the
# two frames ('pear', 'orange') do not appear in the result.
pd.merge(amount, price, on='fruit')
concat函数讲解
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3], index=['c', 'd'])
s3 = Series([4, 5], index=['e', 'f'])

# Stack the three Series end to end, keeping each one's own labels.
pd.concat([s1, s2, s3])
6.数据重塑
stack和unstack函数讲解
# stack() pivots the column labels into an inner row level, giving a Series.
result = df.stack()

# unstack() reverses it, moving the inner row level back out to columns.
result.unstack()
九、综合案例——小费数据集
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
import seaborn as sns

# Load seaborn's example dataset named 'tips'.
tips = sns.load_dataset('tips')

# First look: leading rows, dimensions, summary statistics, column info.
tips.head()
tips.shape
tips.describe()
tips.info()

# Scatter plot of tip against total bill.
tips.plot(kind='scatter', x='total_bill', y='tip')
# Mean tip over the rows of each sex.
male_tip = tips[tips['sex'] == 'Male']['tip'].mean()
female_tip = tips[tips['sex'] == 'Female']['tip'].mean()

# Collect the two means into a labelled Series and draw a bar chart.
s = Series([male_tip, female_tip], index=['male', 'female'])
s.plot(kind='bar')
# Distinct values of the 'day' column.
tips['day'].unique()

# Mean tip over the rows of each day.
sun_tip = tips[tips['day'] == 'Sun']['tip'].mean()
sat_tip = tips[tips['day'] == 'Sat']['tip'].mean()
thur_tip = tips[tips['day'] == 'Thur']['tip'].mean()
fri_tip = tips[tips['day'] == 'Fri']['tip'].mean()

# Arrange the means Thursday to Sunday and draw a bar chart.
s = Series(
    [thur_tip, fri_tip, sat_tip, sun_tip],
    index=['Thur', 'Fri', 'Sat', 'Sun'],
)
s.plot(kind='bar')
# Tip as a fraction of the total paid (bill plus tip).
tips['percent_tip'] = tips['tip'] / (tips['total_bill'] + tips['tip'])
tips.head(10)

# Histogram of that fraction using 50 bins.
tips['percent_tip'].hist(bins=50)