文章目录
1.读取数据 · 2.查看数据集基本信息 · 3.pandas数据结构分 DataFrame 和 Series · 4.Pandas查询数据(推荐.loc,既能查询,又能覆盖写入) · 5.新增数据列 · 6.数据统计函数 · 7.缺失值处理过程 · 8.数据排序 · 9.字符串处理 · 10.处理日期时间数据 · 11.groupby分组 · 12.groupby后其他列数据处理 · 13.按行遍历DataFrame的3种方法 · 14.DataFrame赋值方法 · 15.数据转换函数map、apply、applymap
1.读取数据
import pandas as pd

# Read a plain CSV file.
fpath = "./datas/ml-latest-small/ratings.csv"
df = pd.read_csv(fpath)

# Read a tab-separated file with no header row; supply the column names.
fpath = "./datas/crazyant/access_pvuv.txt"
df = pd.read_csv(fpath, sep="\t", header=None, names=['pdate', 'pv', 'uv'])

# Read an Excel sheet, skipping the first two rows above the data.
fpath = "./datas/crazyant/access_pvuv.xlsx"
df = pd.read_excel(fpath, skiprows=2)
import pymysql

# Read a whole table from MySQL into a DataFrame.
# NOTE(review): MySQL's 'utf8' is the 3-byte subset; 'utf8mb4' is usually
# what you want — confirm against the server schema before changing.
conn = pymysql.connect(
    host='127.0.0.1',
    user='root',
    password='test',
    database='test',
    charset='utf8'
)
try:
    mysql_df = pd.read_sql("select * from table_name", con=conn)
finally:
    # fix: the connection was never released in the original.
    conn.close()
2.查看数据集基本信息
# Quick inspection of a freshly loaded DataFrame.
type(df)          # pandas.core.frame.DataFrame
df.head()         # first 5 rows
df.tail()         # last 5 rows
df.shape          # (n_rows, n_cols)
df.count()        # non-null count per column
df.columns        # column index
df.index          # row index
df.dtypes         # dtype per column
df.values         # underlying ndarray
df.describe()     # summary statistics for numeric columns
df.info()         # dtypes, non-null counts, memory usage
3.pandas数据结构分 DataFrame 和 Series
DataFrame是一个表格型的数据结构
1. 每列可以是不同的值类型(数值、字符串、布尔值等)
2. 既有行索引index,也有列索引columns
3. 可以被看做由Series组成的字典
4. 一行一列为Series,多行多列为DataFrame
# A Series with an explicit label index.
ser_data = [1, 'a', 5.2, 7]
ser = pd.Series(data=ser_data, index=['d', 'b', 'a', 'c'])

# A Series built from a dict: the keys become the index.
ser_dict_data = {'Ohio': 35, 'Texas': 72, 'Oregon': 16, 'Utah': 50}
ser2 = pd.Series(data=ser_dict_data)

# A Series can carry a name (it shows up as the column name in a DataFrame).
ser.name = 'new_name'

# A DataFrame from a dict of equal-length columns.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
df = pd.DataFrame(data=data)
4.Pandas查询数据(推荐.loc,既能查询,又能覆盖写入)
df 可以直接做条件筛选或切片操作:df[条件或切片操作];df.loc 方法,根据行、列的标签值查询;df.iloc 方法,根据行、列的数字位置查询(只能用整数);df.where 方法;df.query 方法
# Single brackets -> Series; double brackets -> DataFrame.
df['bWendu']
df.loc[:, 'bWendu']
df[['bWendu']]
df.loc[:, ['bWendu']]

# 1. Query by a single label value.
df.loc['2018-01-03', 'bWendu']
df.loc['2018-01-03', ['bWendu', 'yWendu']]

# 2. Batch query with a list of labels.
df.loc[['2018-01-03', '2018-01-04', '2018-01-05'], 'bWendu']
df.loc[['2018-01-03', '2018-01-04', '2018-01-05'], ['bWendu', 'yWendu']]

# 3. Range query with label slices (both ends inclusive for .loc).
df.loc['2018-01-03':'2018-01-05', 'bWendu']
df.loc['2018-01-03', 'bWendu':'fengxiang']
df.loc['2018-01-03':'2018-01-05', 'bWendu':'fengxiang']

# 4. Query with a boolean condition.
df.loc[(df["bWendu"] <= 30) & (df["yWendu"] >= 15) & (df["tianqi"] == '晴'), :]

# 5. Query with a callable (pandas passes the DataFrame in).
df.loc[lambda d: (d["bWendu"] <= 30) & (d["yWendu"] >= 15), :]
def query_my_data(df):
    """Boolean mask: rows indexed in September 2018 whose aqiLevel equals 1."""
    in_september = df.index.str.startswith("2018-09")
    level_ok = df["aqiLevel"] == 1
    return in_september & level_ok
# A named function works the same as the lambda above: .loc calls it
# with the DataFrame and uses the returned boolean mask.
df.loc[query_my_data, :]
iloc() 和 at() 、iat()、ix()
# .iloc: pure position-based indexing (integers only).
df.iloc[:, :2]              # all rows, first two columns
df.iloc[:2, ]               # first two rows
df.iloc[0, 2]               # scalar at row 0, column 2
df.iloc[1:4, [0, 2]]
df.iloc[[1, 3, 5], [0, 2]]
df.iloc[[1, 3, 5], 0:2]
# fix: .ix was removed in pandas 1.0 — use .loc for labels (or .iloc for
# positions) instead.
df.loc[1:3, ['a', 'b']]
df.loc[[1, 3, 5], ['a', 'b']]
# .at / .iat: fast scalar access; .at takes labels, .iat takes positions only.
df.at[3, 'a']
df.iat[3, 0]
# fix: the original `df.iat[3, 'a']` raises — .iat rejects label keys;
# a label-based scalar lookup is .at.
df.at[3, 'a']
5.新增数据列
直接赋值
# Direct assignment creates a new column from existing ones.
df.loc[:, "wencha"] = df["bWendu"].sub(df["yWendu"])
df.apply 方法 Apply a function along an axis of the DataFrame.
def get_wendu_type(x):
    """Classify one row by its temperatures: hot / cold / normal."""
    high = x["bWendu"]
    low = x["yWendu"]
    if high > 33:
        return '高温'
    return '低温' if low < -10 else '常温'
# axis=1 applies the function row by row; each row arrives as a Series.
df.loc[:, "wendu_type"] = df.apply(get_wendu_type, axis=1)
df.assign 方法 Assign new columns to a DataFrame. Returns a new object with all original columns in addition to new ones.
# df.assign returns a NEW DataFrame: all original columns plus the new ones.
df.assign(
    # Celsius -> Fahrenheit for both temperature columns.
    yWendu_huashi=lambda d: d["yWendu"] * 9 / 5 + 32,
    bWendu_huashi=lambda d: d["bWendu"] * 9 / 5 + 32,
)
按条件选择分组分别赋值
# Initialise the column, then fill each condition-group via .loc.
df['wencha_type'] = ''
wencha = df["bWendu"] - df["yWendu"]
df.loc[wencha > 10, "wencha_type"] = "温差大"
df.loc[wencha <= 10, "wencha_type"] = "温差正常"
6.数据统计函数
协方差:衡量同向反向程度,如果协方差为正,说明X,Y同向变化,协方差越大说明同向程度越高;如果协方差为负,说明X,Y反向运动,协方差越小说明反向程度越高。相关系数:衡量相似度程度,当他们的相关系数为1时,说明两个变量变化时的正向相似度最大,当相关系数为-1时,说明两个变量变化的反向相似度最大
df.describe()                    # summary statistics for numeric columns
df["bWendu"].mean()
df["bWendu"].max()
df["bWendu"].min()
df["fengxiang"].unique()         # distinct values
df["fengxiang"].value_counts()   # frequency of each value
df.cov()                         # covariance matrix
df.corr()                        # correlation matrix
df["aqi"].corr(df["bWendu"])     # correlation between two columns
7.缺失值处理过程
步骤1:读取excel的时候,忽略前几个空行(也可利用dropna删除 空行)
# Skip the leading blank rows while reading.
studf = pd.read_excel("./datas/student_excel/student_excel.xlsx", skiprows=2)

# Step 2: detect null values.
studf.isnull()
studf.loc[studf["分数"].notnull(), :]

# Step 3: drop columns that are entirely null.
studf.dropna(axis="columns", how='all', inplace=True)

# Step 4: drop rows that are entirely null.
studf.dropna(axis="index", how='all', inplace=True)

# Step 5: fill the nulls that should have values.
studf.loc[:, '分数'] = studf['分数'].fillna(value=0)
# fix: fillna(method="ffill") is deprecated (pandas 2.1+) — use .ffill().
studf.loc[:, '姓名'] = studf['姓名'].ffill()

# Step 6: save the cleaned data.
studf.to_excel("./datas/student_excel/student_excel_clean.xlsx", index=False)
8.数据排序
Series 排序: Series.sort_values(ascending=True, inplace=False)
# Sort a Series, largest first.
df["aqi"].sort_values(ascending=False)
# Sort a DataFrame on several keys with per-key directions:
# aqiLevel ascending, then bWendu descending within equal levels.
df.sort_values(by=["aqiLevel", "bWendu"], ascending=[True, False])
9.字符串处理
使用方法:先获取 Series 的 str 属性,然后在属性上调用函数;只能在字符串列上使用,不能在数字列上使用;DataFrame 上没有 str 属性和处理方法,只有 Series 有。Series.str 并不是 Python 原生字符串,而是 pandas 自己的一套方法,不过大部分和原生 str 很相似。
df["bWendu"].str.replace("℃", "")      # strip the unit suffix
df["bWendu"].str.isnumeric()
df["bWendu"].str.len()

# Boolean conditions built from string predicates.
condition = df["ymd"].str.startswith("2018-03")
condition = df["ymd"].str.contains("2018")
df[condition].head()

# str methods chain; each call returns a new Series.
df["ymd"].str.replace("-", "").str.slice(0, 6)
df["ymd"].str.replace("-", "").str[0:6]

df["中文日期"].str.replace("年", "").str.replace("月", "").str.replace("日", "")
# fix: since pandas 2.0 str.replace treats the pattern literally by
# default — pass regex=True so "[年月日]" works as a character class.
df["中文日期"].str.replace("[年月日]", "", regex=True)
10.处理日期时间数据
# Use a DatetimeIndex so date strings can be used for label slicing.
df.set_index(pd.to_datetime(df["ymd"]), inplace=True)
df.index
df.index[0]

# Partial-string indexing on a DatetimeIndex.
df.loc['2018-01-05']                 # a single day
df.loc['2018-01-05':'2018-01-10']    # a day range (inclusive)
df.loc['2018-03']                    # a whole month
df.loc["2018-07":"2018-09"]          # a month range
df.loc["2018"]                       # a whole year

# Calendar attributes of the index.
# fix: DatetimeIndex.week was removed in pandas 2.0 — use isocalendar().week.
df.index.isocalendar().week
df.index.month
df.index.quarter

# Max temperature aggregated per week / month / quarter, plotted.
df.groupby(df.index.isocalendar().week)["bWendu"].max().plot()
df.groupby(df.index.month)["bWendu"].max().plot()
df.groupby(df.index.quarter)["bWendu"].max().plot()
处理日期索引的缺失的一般方法
# Rebuild the index as datetimes, then reindex against a complete date
# range, filling the missing days with 0.
df_date = df.set_index("pdate")
df_date = df_date.set_index(pd.to_datetime(df_date.index))
pdates = pd.date_range(start="2019-12-01", end="2019-12-05")
df_date_new = df_date.reindex(pdates, fill_value=0)

# Alternative: resample to daily frequency (resample changes the time
# frequency of the data) and fill the gaps with 0.
df_new2 = df.set_index(pd.to_datetime(df["pdate"])).drop("pdate", axis=1)
df_new2 = df_new2.resample("D").mean().fillna(0)
说明: resample的含义:改变数据的时间频率
11.groupby分组
所有的聚合统计,都是在 DataFrame 和 Series 上进行的。1. 常用用法
# Basic groupby aggregations.
df.groupby('A').sum()
df.groupby(['A', 'B']).mean()
df.groupby(['A', 'B'], as_index=False).mean()   # keys stay as columns

# Multiple aggregations at once.
# fix: passing numpy callables (np.sum, np.mean, np.std) to agg is
# deprecated in pandas 2.x — and `np` was never imported in this file.
# The string names are equivalent.
df.groupby('A').agg(["sum", "mean", "std"])
df.groupby('A')['C'].agg(["sum", "mean", "std"])
df.groupby('A').agg({"C": "sum", "D": "mean"})
理解过程
# A GroupBy object is iterable: it yields (group_key, sub-DataFrame) pairs.
g = df.groupby('A')
g
for key, chunk in g:
    print(key)
    print(chunk)
    print()
bar
A B C D
1 bar one
-0.375789 -0.345869
3 bar three
-1.564748 0.081163
5 bar two
-0.202403 0.701301
foo
A B C D
0 foo one
0.542903 0.788896
2 foo two
-0.903407 0.428031
# Fetch one group's rows by key.
g.get_group('bar')

# Grouping by several columns: the group key is a tuple.
g = df.groupby(['A', 'B'])
for key, chunk in g:
    print(key)
    print(chunk)
    print()
g.get_group(('foo', 'one'))

# Selecting a column of a GroupBy yields a SeriesGroupBy.
g['C']
for key, chunk in g['C']:
    print(key)
    print(chunk)
    print(type(chunk))
    print()
12.groupby后其他列数据处理
# Mean rating per movie.
df.groupby("MovieID")["Rating"].mean()

# Named aggregations on a single column.
# fix: np.min inside agg is deprecated (and `np` is not imported) — "min".
df.groupby("MovieID")["Rating"].agg(mean="mean", max="max", min="min")

# Dict form: a list of aggregations per column.
df.groupby("MovieID").agg({"Rating": ['mean', 'max', 'min']})

# Named aggregation over several columns.
df.groupby("MovieID").agg(
    rating_max=("Rating", "max"),
    user_count=("UserID", lambda x: x.nunique()),
    # fix: the original read `LTB=("LTB": ...)`, a SyntaxError — named
    # aggregation takes a (column, func) tuple.
    LTB=("LTB", lambda x: ",".join(x.unique())),
)
# Dict aggregation, then flatten and rename.
# fix: the original split the method chain across bare statements
# (a SyntaxError) — wrap the whole chain in parentheses instead.
(
    df.groupby("MovieID").agg(
        {
            "Rating": ['mean', 'min', 'max'],
            "UserID": lambda x: x.nunique(),
        }
    )
    .reset_index()
    # NOTE(review): "ymd" is not a column of this frame, so this rename
    # is a no-op here — verify the intended column name.
    .rename(columns={"ymd": "月份"})
)
13. 按行遍历DataFrame的3种方法
# Method 1 — iterrows: yields (index, row-as-Series); convenient but slow.
for idx, row in df.iterrows():
    print(idx, row)
    print(idx, row["A"], row["B"], row["C"], row["D"])

# Method 2 — itertuples: yields namedtuples; much faster than iterrows.
for row in df.itertuples():
    print(row)
    print(row.Index, row.A, row.B, row.C, row.D)

# Method 3 — zip over the columns you need; fastest for a few columns.
for A, B in zip(df["A"], df["B"]):
    print(A, B)
14. DataFrame赋值方法
# Scalar writes by position / by label.
df.iloc[2, 2] = 1111
df.loc['20130101', 'B'] = 2222

# fix: `df.B[df.A > 4] = 0` is chained indexing — it may write to a
# temporary copy (SettingWithCopyWarning; an error under pandas
# copy-on-write). A single .loc call writes reliably.
df.loc[df.A > 4, 'B'] = 0
df.loc[df.col1 == 'a', 'col1'] = 'm'

# Whole-column assignment. A Series aligns on the index: positions whose
# index labels do not match become NaN.
df['F'] = np.nan
df['E'] = pd.Series([1, 2, 3, 4, 5, 6],
                    index=pd.date_range('20130101', periods=6))
15. 数据转换函数map、apply、applymap
map:只用于 Series,实现每个值到值的映射;apply:用于 Series 时处理每个值,用于 DataFrame 时处理某个轴上的 Series;applymap:只能用于 DataFrame,用于处理该 DataFrame 的每个元素。
map用于Series值的转换
# Mapping table: ticker -> Chinese company name.
dict_company_names = {
    "bidu": "百度",
    "baba": "阿里巴巴",
    "iq": "爱奇艺",
    "jd": "京东",
}

# map accepts a dict (values missing from the dict become NaN) ...
stocks["公司中文1"] = stocks["公司"].str.lower().map(dict_company_names)
# ... or a function applied to each value.
stocks["公司中文2"] = stocks["公司"].map(lambda s: dict_company_names[s.lower()])
apply用于Series和DataFrame的转换
# Series.apply: the function receives each value.
stocks["公司中文3"] = stocks["公司"].apply(
    lambda s: dict_company_names[s.lower()])
# DataFrame.apply with axis=1: the function receives each row as a Series.
stocks["公司中文4"] = stocks.apply(
    lambda row: dict_company_names[row["公司"].lower()],
    axis=1)
applymap用于DataFrame所有值的转换
# applymap runs the function on every single element of the DataFrame.
# NOTE(review): applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map — confirm the target pandas version before relying on it.
sub_df.applymap(lambda v: int(v))