# 提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
# Import the players' card-game data ----
# Alternative import that also declares the missing-value marker explicitly:
# player <- read.csv("./data/chapter05/玩家玩牌数据.csv", FALSE, na.strings = "NA")
# The file is read with header = FALSE, so the column names are assigned below.
player <- read.csv("./data/chapter05/玩家玩牌数据.csv", header = FALSE)
head(player)
str(player)

# Column names, in file order: user id, gender, level, in-site friend count,
# experience, points, total logins, games played, games won, currency held.
player_col_names <- c(
  "用户id", "性别", "等级", "站内好友数", "经验值",
  "积分", "登录总次数", "玩牌局数", "赢牌局数", "身上货币量"
)

# Assign the variable names and check them
colnames(player) <- player_col_names
colnames(player)

# Inspect the first six rows
head(player)
# Missing-value summary for the "玩牌局数" (games played) column ----
# Flag, element by element, whether each value is missing
is.na(player[["玩牌局数"]])

# Count missing vs. non-missing values
table(is.na(player[["玩牌局数"]]))

# sum() and mean() of the logical flags give the number and the share of
# missing values respectively
# Number of missing values
sum(is.na(player[["玩牌局数"]]))
# Proportion of missing values
mean(is.na(player[["玩牌局数"]]))

# Number of complete rows (no missing value in any column)
sum(complete.cases(player))
# Inspect the missing-value pattern of player with md.pattern() ----
# Install mice only when it is absent, then attach it unconditionally:
# install.packages() alone does not attach the package, so md.pattern()
# would not be found on the first run without the library() call.
if (!requireNamespace("mice", quietly = TRUE)) {
  install.packages("mice")
}
library(mice)
md.pattern(player)
# Delete samples with missing values ----
# Number of rows that contain at least one missing value
sum(!complete.cases(player))
player_full <- na.omit(player)
# Number of rows with missing values after removal
sum(!complete.cases(player_full))
# Replace missing values ----
# Work on Sepal.Length (column 1) and Species (column 5) of iris only
iris1 <- iris[, c(1, 5)]
head(iris1)
table(iris1$Species)

# Set Sepal.Length of samples 40, 80 and 120 to missing
iris1[c(40, 80, 120), 1] <- NA

# Fill the gaps with the overall mean of the remaining values
Sepal.Length.mean <- mean(iris1$Sepal.Length, na.rm = TRUE)
Sepal.Length.mean
iris1[c(40, 80, 120), 1] <- round(Sepal.Length.mean, 1)
iris1[c(40, 80, 120), 1]

# Compare the original values with the imputed ones
iris[c(40, 80, 120), 1]
iris1[c(40, 80, 120), 1]

# Fill missing values with the mean of the same class (Species) ----
# Set Sepal.Length of samples 40, 80 and 120 to missing
iris2 <- iris[, c(1, 5)]
iris2[c(40, 80, 120), 1] <- NA

# Samples 40, 80 and 120 are filled with the setosa, versicolor and
# virginica means respectively. Rows are selected with iris2's own Species
# column so this step does not depend on iris1.
iris2[40, 1] <- round(
  mean(iris2[iris2$Species == "setosa", "Sepal.Length"], na.rm = TRUE), 1
)
iris2[80, 1] <- round(
  mean(iris2[iris2$Species == "versicolor", "Sepal.Length"], na.rm = TRUE), 1
)
iris2[120, 1] <- round(
  mean(iris2[iris2$Species == "virginica", "Sepal.Length"], na.rm = TRUE), 1
)

# Compare original values, overall-mean imputation and class-mean imputation
iris[c(40, 80, 120), 1]
iris1[c(40, 80, 120), 1]
iris2[c(40, 80, 120), 1]
# Import the data ----
rawdata <- read.csv("D://小学期/数据转换数据.csv")
# Inspect the first six rows and the structure
head(rawdata)
str(rawdata)

# Convert the registration date to Date ----
# The characters at positions 1-4, 5-6 and 7-8 are taken as year, month and
# day, joined with "-" and parsed as %Y-%m-%d.
rawdata$registration <- as.Date(
  paste(
    substr(rawdata$registration, 1, 4),
    substr(rawdata$registration, 5, 6),
    substr(rawdata$registration, 7, 8),
    sep = "-"
  ),
  format = "%Y-%m-%d"
)
head(rawdata)
str(rawdata)

# Convert the first-payment date to Date the same way ----
rawdata$firstpaydate <- as.Date(
  paste(
    substr(rawdata$firstpaydate, 1, 4),
    substr(rawdata$firstpaydate, 5, 6),
    substr(rawdata$firstpaydate, 7, 8),
    sep = "-"
  ),
  format = "%Y-%m-%d"
)

# Inspect the first six rows and the structure
head(rawdata)
str(rawdata)
# Add ispay: 0 = non-paying user, 1 = paying user ----
# A user counts as paying when firstpaydate is not missing.
rawdata$ispay <- ifelse(!is.na(rawdata$firstpaydate), 1, 0)
head(rawdata)

# Add isnewpay: 0 = did not pay on the registration day, 1 = paid on it ----
rawdata$isnewpay <- ifelse(rawdata$registration == rawdata$firstpaydate, 1, 0)
head(rawdata)
# The comparison above yields NA where it cannot be evaluated (e.g. a missing
# firstpaydate); those rows are set to 0.
rawdata[is.na(rawdata$isnewpay), "isnewpay"] <- 0
# Inspect the first rows (head() shows six rows by default)
head(rawdata)

# Standardise with (x - mu) / std, the same effect as scale() ----
# install.packages("caret")
library(caret)
?preProcess
standard <- preProcess(iris)
head(predict(standard, iris))
head(scale(iris[, 1:4]))

# Standardise with (x - min(x)) / (max(x) - min(x)) ----
standard <- preProcess(iris, method = "range")
head(predict(standard, iris))
# Hand-written min-max rescaling for comparison with preProcess()
fun <- function(x) (x - min(x)) / (max(x) - min(x))
head(sapply(iris[, 1:4], fun))
# Bin days (active days) ----
head(rawdata)
# NOTE(review): cut() builds right-closed intervals by default, so a value of
# exactly 0 falls outside (0, 30] and becomes NA -- confirm 0 cannot occur.
rawdata$days_interval <- cut(
  rawdata$days,
  breaks = c(0, 30, 60, 90, Inf),
  labels = c("一个月内", "31~60天", "61~90天", "三个月以上")
)
head(rawdata)

# Bin lifetime (life cycle) ----
rawdata$lifetime_interval <- cut(
  rawdata$lifetime,
  breaks = c(0, 7, 21, 30, 90, Inf),
  labels = c(
    "小于一周", "小于三周", "小于一个月",
    "小于三个月", "三个月以上"
  )
)
# Inspect the first six rows
head(rawdata)
# Standardise with (x - mu) / std, the same effect as scale() ----
# install.packages("caret")
library(caret)
?preProcess
standard <- preProcess(iris)
head(predict(standard, iris))
head(scale(iris[, 1:4]))

# Standardise with (x - min(x)) / (max(x) - min(x)) ----
standard <- preProcess(iris, method = "range")
head(predict(standard, iris))
# Hand-written min-max rescaling for comparison with preProcess()
fun <- function(x) (x - min(x)) / (max(x) - min(x))
head(sapply(iris[, 1:4], fun))
# Build the customers data set ----
customers <- data.frame(
  id = c(10, 20, 30, 40, 50),
  gender = c("male", "female", "female", "male", "female"),
  mood = c("happy", "sad", "happy", "sad", "happy"),
  outcome = c(1, 1, 0, 0, 0)
)
customers

# Create the new data frame customers.new from the id and outcome columns
customers.new <- customers[, c("id", "outcome")]
customers.new

# Dummy-code gender by hand: one 0/1 indicator column per level ----
customers.new$gender.male <- ifelse(customers$gender == "male", 1, 0)
customers.new$gender.female <- ifelse(customers$gender == "female", 1, 0)
# Keep the original gender column next to its indicators
customers.new$gender <- customers$gender
customers.new

# Dummy-code mood by hand: one 0/1 indicator column per level ----
customers.new$mood.happy <- ifelse(customers$mood == "happy", 1, 0)
customers.new$mood.sad <- ifelse(customers$mood == "sad", 1, 0)
customers.new
# Dummy-code customers with caret::dummyVars() ----
library(caret)
str(customers)
# Encode every column of customers
dmy <- dummyVars(~ ., data = customers)
trsf <- data.frame(predict(dmy, newdata = customers))
trsf

# Convert outcome to a factor, then encode every column again
customers$outcome <- as.factor(customers$outcome)
dmy <- dummyVars(~ ., data = customers)
trsf <- data.frame(predict(dmy, newdata = customers))
trsf

# Encode the gender column only
dmy.gender <- dummyVars(~ gender, data = customers)
trsf.gender <- data.frame(predict(dmy.gender, newdata = customers))
trsf.gender
# Rebuild customers and encode it with levelsOnly = TRUE and fullRank = TRUE.
# Per the caret documentation, levelsOnly drops the variable name from the
# generated column names and fullRank omits one level per factor so the
# indicator columns are not linearly dependent.
customers <- data.frame(
  id = c(10, 20, 30, 40, 50),
  gender = c("male", "female", "female", "male", "female"),
  mood = c("happy", "sad", "happy", "sad", "happy"),
  outcome = c(1, 1, 0, 0, 0)
)
dmy <- dummyVars(~ ., data = customers, levelsOnly = TRUE, fullRank = TRUE)
trsf <- data.frame(predict(dmy, newdata = customers))
trsf
# Same encoding as before, with an extra character column `test` holding the
# values "1", "2", "3" and "4", to see how a multi-level column is expanded
# under levelsOnly = TRUE and fullRank = TRUE.
customers <- data.frame(
  id = c(10, 20, 30, 40, 50),
  gender = c("male", "female", "female", "male", "female"),
  mood = c("happy", "sad", "happy", "sad", "happy"),
  outcome = c(1, 1, 0, 0, 0),
  test = c("1", "2", "1", "3", "4")
)
dmy <- dummyVars(~ ., data = customers, levelsOnly = TRUE, fullRank = TRUE)
trsf <- data.frame(predict(dmy, newdata = customers))
trsf