Python学习日记-pandas模块学习
根据需求筛选数据绘制柱状图-利用pandas绘制绘制柱状图-利用matplotlib绘制绘制柱状图-两组数据比较绘制柱状图-叠加柱状图绘制饼状图绘制折线图绘制散点图绘制直方图绘制密度图和打印相关性联立两张sheet进行数据验证进行数据拆分按行/列求和/均值删除表中重复的数据表格转置读取csv/tsv/txt文件成dataframe数据透视表根据原始数据拟合曲线数据行的增、删、改、插入数据列的增、删、改、插入、并联、串联对列数据进行复杂的计算
根据需求筛选数据
import pandas
as pd
def age_18_to_30(age
):
return 18<=age
<30
def leave_A(score
):
return 85<=score
<=100
stu
=pd
.read_excel
('E:\\工作日志\\Python\\ExcelResource\\filter.xlsx',index_col
='ID')
stu
=stu
.loc
[stu
['Age'].apply(age_18_to_30
)]
stu
=stu
.loc
[stu
.Age
.apply(age_18_to_30
)]
stu
=stu
.loc
[stu
['Age'].apply(age_18_to_30
)].loc
[stu
['Score'].apply(leave_A
)]
stu
=stu
.loc
[stu
['Age'].apply(lambda age
:18<=age
<30)].loc
[stu
['Score'].apply(lambda score
:85<=score
<=100)]
print(stu
)
绘制柱状图-利用pandas绘制
import pandas
as pd
import matplotlib
.pyplot
as pl
stu
=pd
.read_excel
('E:\\工作日志\\Python\\ExcelResource\\field.xlsx')
stu
.sort_values
(by
='Number',inplace
=True,ascending
=False)
stu
.plot
.bar
(x
='Field',y
='Number',color
='orange',title
='Students Field')
pl
.show
()
绘制柱状图-利用matplotlib绘制
import pandas
as pd
import matplotlib
.pyplot
as pl
stu
=pd
.read_excel
('E:\\工作日志\\Python\\ExcelResource\\field.xlsx')
stu
.sort_values
(by
='Number',inplace
=True,ascending
=False)
pl
.bar
(stu
.Field
,stu
.Number
,color
='orange')
pl
.xticks
(stu
.Field
,rotation
='90')
pl
.xlabel
('Field')
pl
.ylabel
('Number')
pl
.title
('Students Field',fontSize
=16)
pl
.show
()
绘制柱状图-两组数据比较
import pandas
as pd
import matplotlib
.pyplot
as pl
stu
=pd
.read_excel
('E:\\工作日志\\Python\\ExcelResource\\compare.xlsx')
stu
.sort_values
(by
='2016Year',inplace
=True,ascending
=True)
stu
.plot
.bar
(x
='Field',y
=['2016Year','2017Year'],color
=['green','red'])
pl
.title
('Students Field',fontsize
=16)
pl
.xlabel
('Field')
pl
.ylabel
('Number')
pl
.gca
().set_xticklabels
(stu
.Field
,rotation
='45',ha
='right')
pl
.gcf
().subplots_adjust
(left
=0.2,bottom
=0.4)
pl
.show
()
绘制柱状图-叠加柱状图
import pandas
as pd
import matplotlib
.pyplot
as pl
user
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/overlay.xlsx',index_col
='ID')
user
['Total']=user
['Oct']+user
['Nov']+user
['Dec']
user
.sort_values
(by
='Total',inplace
=True,ascending
=False)
user
.plot
.bar
(x
='Name',y
=['Oct','Nov','Dec'],stacked
=True)
user
.plot
.barh
(x
='Name',y
=['Oct','Nov','Dec'],stacked
=True)
pl
.show
()
print(user
)
绘制饼状图
import pandas
as pd
import matplotlib
.pyplot
as pl
stu
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/pie.xlsx',index_col
='Name')
stu
['Number'].plot
.pie
(fontsize
=8,counterclock
=True)
pl
.ylabel
('Number')
pl
.title
('Student',fontsize
=16)
pl
.show
()
print(stu
)
绘制折线图
import pandas
as pd
import matplotlib
.pyplot
as pl
week
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/Orders.xlsx',index_col
='Week')
print(week
)
print(week
.columns
)
week
.plot
(y
=['Accessories', 'Bikes', 'Clothing', 'Components'])
week
.plot
.area
(y
=['Accessories', 'Bikes', 'Clothing', 'Components'])
pl
.title
('Sale Weekly Trand',fontsize
=16)
pl
.ylabel
('Total',fontsize
=8,fontweight
='bold')
pl
.xticks
(week
.index
,fontsize
=6)
pl
.show
()
绘制散点图
import pandas
as pd
import matplotlib
.pyplot
as pl
pd
.options
.display
.max_columns
=999
homes
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/home_data.xlsx')
print(homes
.head
())
homes
.plot
.scatter
(x
='sqft_living',y
='price')
pl
.title
('HomeData',fontsize
=8,fontweight
='bold')
homes
.plot
.scatter
(x
='price',y
='sqft_living')
pl
.title
('HomeData',fontsize
=8,fontweight
='bold')
pl
.show
()
绘制直方图
import pandas
as pd
import matplotlib
.pyplot
as pl
homes
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/home_data.xlsx')
homes
.sqft_living
.plot
.hist
(bins
=100)
pl
.xticks
(range(0,max(homes
.sqft_living
),500),fontsize
=8,rotation
=90)
pl
.show
()
绘制密度图和打印相关性
import pandas
as pd
import matplotlib
.pyplot
as pl
homes
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/home_data.xlsx')
homes
.sqft_living
.plot
.hist
(bins
=100)
pl
.xticks
(range(0,max(homes
.sqft_living
),500),fontsize
=8,rotation
=90)
homes
.sqft_living
.plot
.kde
()
pl
.xticks
(range(0,max(homes
.sqft_living
),500),fontsize
=8,rotation
=90)
pl
.show
()
pd
.options
.display
.max_columns
=999
print(homes
.corr
())
联立两张sheet
import pandas
as pd
import matplotlib
.pyplot
as pl
students
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/Student_Score.xlsx',sheet_name
='Students')
scores
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/Student_Score.xlsx',sheet_name
='Scores')
table
=students
.merge
(scores
,on
='ID')
table
=students
.merge
(scores
,on
='ID',how
='left').fillna
(0)
table
.Score
=table
.Score
.astype
(int)
table
=students
.join
(scores
,on
='ID')
print(table
)
进行数据验证
import pandas
as pd
import matplotlib
.pyplot
as pl
def score_validation1(row
):
try:
assert 0<=row
.Score
<=100
except:
print('#{0} student {1} has an invalid score {2}'.format(row
.ID
,row
.Name
,row
.Score
))
def score_validation2(row
):
if not 0<=row
.Score
<=100:
print('#{0} student {1} has an invalid score {2}'.format(row
.ID
, row
.Name
, row
.Score
))
stu
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/Students.xlsx')
stu
.apply(score_validation1
,axis
=1)
进行数据拆分
import pandas
as pd
employees
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/Employees.xlsx')
df_temp
=employees
['Full Name'].str.split
(pat
=' ',expand
=True)
employees
['First Name']=df_temp
[0]
employees
['Last Name']=df_temp
[1]
print(employees
)
按行/列求和/均值
import pandas
as pd
students
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/Students.xlsx',index_col
='ID')
temp
=students
[['Test_1','Test_2','Test_3']]
row_sum
=temp
.sum(axis
=1)
row_mean
=temp
.mean
(axis
=1)
students
['Total']=row_sum
students
['Average']=row_mean
col_mean
=students
[['Test_1','Test_2','Test_3','Total','Average']].mean
()
col_mean
['Name']='Summary'
students
=students
.append
(col_mean
,ignore_index
=True)
print(students
)
删除表中重复的数据
import pandas
as pd
students
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/Students_Duplicates.xlsx')
repeat
=students
.duplicated
(subset
='Name')
repeat_ID
=repeat
[repeat
==True]
repeat_students
=students
.iloc
[repeat_ID
.index
]
students
.drop_duplicates
(subset
='Name',inplace
=True,keep
='first')
print(students
)
表格转置
import pandas
as pd
pd
.options
.display
.max_columns
=999
video
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/Videos.xlsx',index_col
='Month')
table
=video
.transpose
()
print(table
)
读取csv/tsv/txt文件成dataframe
import pandas
as pd
table1
=pd
.read_csv
('E:/工作日志/Python/ExcelResource/资料/Students.csv',index_col
='ID')
table2
=pd
.read_csv
('E:/工作日志/Python/ExcelResource/资料/Students.tsv',sep
='\t',index_col
='ID')
table3
=pd
.read_csv
('E:/工作日志/Python/ExcelResource/资料/Students.txt',sep
='\t',index_col
='ID')
数据透视表
import pandas
as pd
import numpy
as np
pd
.options
.display
.max_columns
=999
orders
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/Orders2.xlsx')
orders
['Year']=pd
.DatetimeIndex
(orders
['Date']).year
pt1
=orders
.pivot_table
(index
='Category',columns
='Year',values
='Total',aggfunc
=np
.sum)
print(pt1
)
groups
=orders
.groupby
(['Category','Year'])
s
=groups
['Total'].sum()
c
=groups
['ID'].count
()
pt2
=pd
.DataFrame
({'Summary':s
,'Count':c
})
print(pt2
)
writer
=pd
.ExcelWriter
('E:/工作日志/Python/ExcelResource/newtb.xlsx')
pt1
.to_excel
(writer
,sheet_name
='pt1')
pt2
.to_excel
(writer
,sheet_name
='pt2')
writer
.save
()
writer
.close
()
根据原始数据拟合曲线
import pandas
as pd
import matplotlib
.pyplot
as pl
import scipy
.stats
as sci
sales
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/Sales.xlsx',dtype
={'Month':str})
slope
,intercept
,r
,p
,std_err
=sci
.linregress
(sales
.index
,sales
['Revenue'])
exp
=slope
*sales
.index
+intercept
pl
.scatter
(sales
.index
,sales
['Revenue'])
pl
.plot
(sales
.index
,exp
,color
='red')
pl
.xticks
(sales
.index
,sales
['Month'],rotation
=90)
pl
.title
(f
'y={round(slope,4)}*x+{round(intercept,4)}')
pl
.show
()
数据行的增、删、改、插入
import pandas
as pd
page1
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/Students.xlsx',sheet_name
='Page_001')
page2
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/Students.xlsx',sheet_name
='Page_002')
new_page
=page1
.append
(page2
).reset_index
(drop
=True)
new_row1
=pd
.Series
({'ID':41,'Name':'Tom','Score':100})
new_page
=new_page
.append
(new_row1
,ignore_index
=True)
new_page
.at
[36,'Name']='Tony'
new_page
.at
[36,'Score']=97
new_row2
=pd
.Series
({'ID':24,'Name':'Jack','Score':89})
new_page
.iloc
[24]=new_row2
new_row3
=pd
.Series
({'ID':101,'Name':'Anny','Score':92})
part1
=new_page
[:20]
part2
=new_page
[20:]
new_page
=part1
.append
(new_row3
,ignore_index
=True).append
(part2
).reset_index
(drop
=True)
new_page
.drop
(index
=[7,9,28],inplace
=True)
missing
=new_page
.loc
[new_page
['Name']=='']
new_page
.drop
(index
=missing
.index
,inplace
=True)
new_page
=new_page
.reset_index
(drop
=True)
print(new_page
)
数据列的增、删、改、插入、并联、串联
import pandas
as pd
import numpy
as np
page1
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/Students.xlsx',sheet_name
='Page_001')
page2
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/Students.xlsx',sheet_name
='Page_002')
new_page1
=pd
.concat
([page1
,page2
],axis
=0)
new_page2
=pd
.concat
([page1
,page2
],axis
=1)
new_page1
['newRow']=np
.arange
(0,len(new_page1
))
new_page1
.drop
(columns
=['newRow','Score'],inplace
=True)
new_page1
.insert
(1,column
='Insert',value
=np
.arange
(0,len(new_page1
)))
new_page1
.rename
(columns
={'Insert':'InsertRow','Name':'NameModify'},inplace
=True)
new_page1
.dropna
(inplace
=True)
print(new_page1
)
对列数据进行复杂的计算
import pandas
as pd
import numpy
as np
def get_circumcircle_area(l
,h
):
r
=np
.sqrt
(l
*l
+h
*h
)/2
return round(np
.pi
*r
*r
,3)
rectangle
=pd
.read_excel
('E:/工作日志/Python/ExcelResource/资料/Rectangles.xlsx',index_col
='ID')
rectangle
['CA']=rectangle
.apply(lambda row
:get_circumcircle_area
(row
['Length'],row
['Height']),axis
=1)
print(rectangle
)
转载请注明原文地址:https://blackberry.8miu.com/read-18189.html