数据分析案例——客户流失分析与预测

    科技2022-07-17  234

    客户流失分析与预测

    一、数据来源 https://www.kaggle.com/blastchar/telco-customer-churn

    二、数据整理 1、导入函数包

    import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns

    2、导入数据并展示

    # Load the Telco customer churn CSV from the author's local path.
    data = pd.read_csv(r"D:\百度网盘\数据分析—实例\运营商客户流失分析与预测\WA_Fn-UseC_-Telco-Customer-Churn.csv")
    # Dataset size; the article reports (7043, 21).
    data.shape
    # Show every column when a frame is displayed.
    pd.set_option('display.max_columns', None)
    # Preview the first five rows.
    data.head()

    3、异常值检查

    # Check gender for unexpected categories.
    # Reported result: array(['Female', 'Male'], dtype=object)
    pd.unique(data.gender)
    # Check tenure for implausibly large values (> 120). The original note
    # calls this "age", but the column tested is tenure.
    # Reported result: 0 rows × 21 columns
    data.loc[data.tenure > 120]
    # Count nulls per column. The frame loaded above is `data`; the original
    # referenced `customerDF`, which is not defined at this point.
    pd.isnull(data).sum()

    4、数据类型查看及转换

    # Print the frame summary (column dtypes and non-null counts).
    data.info()

    将‘TotalCharges’总消费额的数据类型转换为浮点型

    # Attempt 1 (kept for the record, not executed):
    #   data.TotalCharges.astype(float)
    #   -> ValueError: could not convert string to float:
    # Attempt 2 (kept for the record, not executed):
    #   data['TotalCharges'].convert_objects(convert_numeric=True)
    #   -> AttributeError: 'Series' object has no attribute 'convert_objects'
    #
    # Working conversion: values that cannot be parsed become NaN
    # (errors='coerce'); the next step inspects those rows.
    data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
    # Confirm the new dtype.
    data.info()

    5、查看转换后缺失值

    # Rows whose TotalCharges is null after the numeric conversion.
    data[data.TotalCharges.isnull()]

    6、修改数据

    # Fill missing TotalCharges with the same row's MonthlyCharges.
    # The result is assigned back: an inplace call on a `.loc[:, col]`
    # selection is not guaranteed to modify `data` itself.
    data['TotalCharges'] = data['TotalCharges'].fillna(data['MonthlyCharges'])
    # Change tenure from 0 to 1.
    data['tenure'] = data['tenure'].replace(to_replace=0, value=1)

    7、数据分析

    # Descriptive statistics of the cleaned frame.
    data.describe()

    三、数据展示 根据一般经验,将用户特征划分为用户属性、服务属性、合同属性,并从这三个维度进行可视化分析。

    1、查看流失用户数量和占比。

    # Pie chart of churned vs. retained customers (count and percentage).
    churn_counts = data['Churn'].value_counts()
    plt.figure(figsize=(6, 6))
    plt.pie(churn_counts, labels=churn_counts.index, autopct='%0.2f%%', explode=(0.1, 0))
    plt.title('Churn(Yes/No) Ratio')
    plt.show()

    # Bar chart of customer counts per Churn value.
    # The original read from `churnDf`, which is never created in the visible
    # text; the counts are derived here directly from `data`.
    churn_counts = data['Churn'].value_counts()
    x = churn_counts.index
    y = churn_counts.values
    plt.bar(x, y, width=0.5, color='g')
    plt.title('Churn(Yes/No) Num')
    # Write each count just above its bar.
    for a, b in zip(x, y):
        plt.text(a, b + 10, '%.0f' % b, ha='center', va='bottom')
    plt.show()

    2、用户属性分析

    def barplot_percentages(feature, orient='v', axis_name="percentage of customers"):
        """Plot each (feature value, Churn) group's share of all customers.

        Args:
            feature: Name of the column in ``data`` to group by.
            orient: 'v' draws vertical bars with the feature on the x axis;
                any other value draws horizontal bars.
            axis_name: Name given to the computed share, used as axis label.
        """
        # Local import so the snippet does not depend on extra top-level imports.
        from matplotlib.ticker import PercentFormatter

        shares = data.groupby(feature)["Churn"].value_counts() / len(data)
        # Name the value column explicitly instead of renaming a column called
        # "Churn": the name value_counts() gives its result differs between
        # pandas versions.
        g = shares.reset_index(name=axis_name)
        # A formatter keeps labels in sync with the ticks, unlike labels
        # computed once from get_yticks()/get_xticks().
        as_percent = PercentFormatter(xmax=1.0, decimals=0)
        if orient == 'v':
            ax = sns.barplot(x=feature, y=axis_name, hue='Churn', data=g, orient=orient)
            ax.yaxis.set_major_formatter(as_percent)
            plt.rcParams.update({'font.size': 13})
        else:
            ax = sns.barplot(x=axis_name, y=feature, hue='Churn', data=g, orient=orient)
            ax.xaxis.set_major_formatter(as_percent)
            plt.legend(fontsize=10)
        plt.title('Churn(Yes/No) Ratio as {0}'.format(feature))
        plt.show()


    barplot_percentages("SeniorCitizen")
    barplot_percentages("gender")

    # Side-by-side bars: customer share by Churn for Partner and Dependents.
    fig, axis = plt.subplots(1, 2, figsize=(12, 4))
    axis[0].set_title("Has Partner")
    axis[1].set_title("Has Dependents")
    axis_y = "percentage of customers"

    # Partner column. reset_index(name=...) names the share column explicitly,
    # independent of the name value_counts() assigns to its result.
    gp_partner = (data.groupby('Partner')["Churn"].value_counts() / len(data)).reset_index(name=axis_y)
    ax1 = sns.barplot(x='Partner', y=axis_y, hue='Churn', data=gp_partner, ax=axis[0])
    ax1.legend(fontsize=10)

    # Dependents column.
    gp_dep = (data.groupby('Dependents')["Churn"].value_counts() / len(data)).reset_index(name=axis_y)
    ax2 = sns.barplot(x='Dependents', y=axis_y, hue='Churn', data=gp_dep, ax=axis[1])

    # Set the global font size.
    plt.rcParams.update({'font.size': 20})
    ax2.legend(fontsize=10)
    plt.show()

    # Kernel density estimation
    def kdeplot(feature, xlabel):
        """Overlay the KDE of ``feature`` for retained and churned customers.

        Args:
            feature: Numeric column of ``data`` to estimate the density of.
            xlabel: Label for the x axis.

        Opens a new figure but does not call ``plt.show()``; the caller does.
        """
        plt.figure(figsize=(9, 4))
        plt.title("KDE for {0}".format(feature))
        # NOTE(review): newer seaborn releases rename `shade` to `fill`;
        # confirm against the installed version.
        sns.kdeplot(data[data['Churn'] == 'No'][feature].dropna(), color='navy', label='Churn: No', shade=True)
        sns.kdeplot(data[data['Churn'] == 'Yes'][feature].dropna(), color='orange', label='Churn: Yes', shade=True)
        plt.xlabel(xlabel)
        # Set the global font size.
        plt.rcParams.update({'font.size': 20})
        plt.legend(fontsize=10)


    kdeplot('tenure', 'tenure')
    plt.show()

    总结 (1)有伴侣的用户流失占比低于无伴侣用户; (2)有家属的用户较少; (3)有家属的用户流失占比低于无家属用户; (4)在网时长越久,流失率越低,符合一般经验。

    3、服务属性分析

    # Horizontal share-of-customers bars for MultipleLines.
    plt.figure(figsize=(9, 4.5))
    barplot_percentages("MultipleLines", orient='h')

    # Horizontal share-of-customers bars for InternetService.
    plt.figure(figsize=(9, 4.5))
    barplot_percentages("InternetService", orient="h")

    # Count, per service column, the customers with each service value,
    # restricted to customers whose InternetService is not "No".
    cols = ["PhoneService", "MultipleLines", "OnlineSecurity", "OnlineBackup",
            "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]
    # melt() stacks the service columns into (variable, value) pairs.
    df1 = pd.melt(data[data["InternetService"] != "No"][cols])
    df1.rename(columns={'value': 'Has service'}, inplace=True)
    plt.figure(figsize=(20, 8))
    ax = sns.countplot(data=df1, x='variable', hue='Has service')
    ax.set(xlabel='Internet Additional service', ylabel='Num of customers')
    plt.rcParams.update({'font.size': 20})
    plt.legend(labels=['No Service', 'Has Service'], fontsize=15)
    plt.title('Num of Customers as Internet Additional Service')
    plt.show()

    # Same count plot, restricted to churned customers whose InternetService
    # is not "No". Reuses `cols` from the previous snippet.
    plt.figure(figsize=(20, 8))
    df1 = data[(data.InternetService != "No") & (data.Churn == "Yes")]
    df1 = pd.melt(df1[cols])
    df1.rename(columns={'value': 'Has service'}, inplace=True)
    ax = sns.countplot(data=df1, x='variable', hue='Has service', hue_order=['No', 'Yes'])
    ax.set(xlabel='Internet Additional service', ylabel='Churn Num')
    plt.rcParams.update({'font.size': 20})
    plt.legend(labels=['No Service', 'Has Service'], fontsize=15)
    plt.title('Num of Churn Customers as Internet Additional Service')
    plt.show()

    总结 (1)电话服务整体对用户流失影响较小; (2)单光纤用户的流失占比较高; (3)光纤用户绑定了安全、备份、保护、技术支持服务的流失率较低; (4)光纤用户附加流媒体电视、电影服务的流失率占比较高。

    4、合同属性分析

    # Horizontal share-of-customers bars for PaymentMethod.
    plt.figure(figsize=(9, 4.5))
    barplot_percentages("PaymentMethod", orient='h')

    # Churn indicator by Contract type, one panel per PaperlessBilling value.
    # The plot reads a numeric `churn_rate` column that is never created in
    # the visible text, so a 0/1 indicator is derived here (1 where Churn is
    # 'Yes'). assign() returns a new frame and leaves `data` unchanged.
    churn_frame = data.assign(churn_rate=(data['Churn'] == 'Yes').astype(int))
    g = sns.FacetGrid(churn_frame, col="PaperlessBilling", height=6, aspect=.9)
    ax = g.map(sns.barplot, "Contract", "churn_rate", palette="Blues_d",
               order=['Month-to-month', 'One year', 'Two year'])
    plt.rcParams.update({'font.size': 18})
    plt.show()

    # Density of monthly and total charges, split by Churn.
    kdeplot('MonthlyCharges', 'MonthlyCharges')
    kdeplot('TotalCharges', 'TotalCharges')
    plt.show()

    总结 (1)采用电子支票支付的用户流失率最高,推测该方式的使用体验较为一般; (2)签订合同方式对客户流失率影响为:按月签订 > 按一年签订 > 按两年签订,证明长期合同最能保留客户; (3)月消费额大约在70-110之间用户流失率较高; (4)长期来看,用户总消费越高,流失率越低,符合一般经验。

    四、数据预测 1、特征编码

    # NOTE(review): `cateCols` and `dfCate` are not defined anywhere in the
    # visible text. Presumably `cateCols` lists the categorical columns and
    # `dfCate` is a copy of those columns taken from `data` — confirm and
    # define both before running this snippet.
    for col in cateCols:
        if dfCate[col].nunique() == 2:
            # Two distinct values: encode as integer codes.
            dfCate[col] = pd.factorize(dfCate[col])[0]
        else:
            # Otherwise: one-hot encode into `<col>_<value>` columns.
            dfCate = pd.get_dummies(dfCate, columns=[col])

    # Carry the numeric columns over from `data` unchanged.
    for numeric_col in ('tenure', 'MonthlyCharges', 'TotalCharges'):
        dfCate[numeric_col] = data[numeric_col]

    2、查看关联性

    # Correlation of every encoded column with Churn, sorted descending.
    plt.figure(figsize=(16, 8))
    dfCate.corr()['Churn'].sort_values(ascending=False).plot(kind='bar')
    plt.show()

    3、特征提取

    # Feature selection: columns to drop from the encoded frame.
    # The original also listed the `<service>_No` dummies (OnlineSecurity_No,
    # OnlineBackup_No, DeviceProtection_No, TechSupport_No, StreamingTV_No,
    # StreamingMovies_No) but left them commented out, so they are kept.
    dropFea = [
        'gender', 'PhoneService',
        'OnlineSecurity_No internet service',
        'OnlineBackup_No internet service',
        'DeviceProtection_No internet service',
        'TechSupport_No internet service',
        'StreamingTV_No internet service',
        'StreamingMovies_No internet service',
    ]
    dfCate.drop(dropFea, inplace=True, axis=1)
    # Label vector.
    target = dfCate['Churn'].values
    # All remaining column names: the features plus the label column.
    columns = dfCate.columns.tolist()

    4、构造数据集

    from sklearn.model_selection import train_test_split

    # Feature names only: remove the label column from the list.
    columns.remove('Churn')
    # Feature matrix.
    features = dfCate[columns].values
    # Hold out 30% as the test set; the rest is the training set.
    # random_state=1 keeps the split identical across runs.
    # stratify=target keeps the label proportions the same in both splits.
    train_x, test_x, train_y, test_y = train_test_split(
        features, target, test_size=0.30, stratify=target, random_state=1)

    5、构造模型

    # The classifier classes are never imported in the visible text.
    from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    # Candidate classifiers.
    classifiers = [
        SVC(random_state=1, kernel='rbf'),
        DecisionTreeClassifier(random_state=1, criterion='gini'),
        RandomForestClassifier(random_state=1, criterion='gini'),
        KNeighborsClassifier(metric='minkowski'),
        AdaBoostClassifier(random_state=1),
    ]
    # Pipeline step name for each classifier, in the same order.
    classifier_names = [
        'svc',
        'decisiontreeclassifier',
        'randomforestclassifier',
        'kneighborsclassifier',
        'adaboostclassifier',
    ]
    # Parameter grids, in the same order. GridSearchCV expects keys of the
    # form "<step name>__<parameter name>".
    classifier_param_grid = [
        {'svc__C': [0.1], 'svc__gamma': [0.01]},
        {'decisiontreeclassifier__max_depth': [6, 9, 11]},
        {'randomforestclassifier__n_estimators': range(1, 11)},
        {'kneighborsclassifier__n_neighbors': [4, 6, 8]},
        {'adaboostclassifier__n_estimators': [70, 80, 90]},
    ]

    6、参数调优

    # These names are used below but never imported in the visible text.
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline


    def GridSearchCV_work(pipeline, train_x, train_y, test_x, test_y, param_grid, score='accuracy'):
        """Tune one pipeline with GridSearchCV and score it on the test set.

        Args:
            pipeline: Estimator (here a Pipeline) to tune.
            train_x, train_y: Training features and labels.
            test_x, test_y: Test features and labels.
            param_grid: Grid passed to GridSearchCV.
            score: Scorer name passed as ``scoring``. The original default
                'accuracy_score' is not a scorer name; 'accuracy' is.

        Returns:
            dict with 'predict_y' (test-set predictions) and 'accuracy_score'
            (accuracy of those predictions).
        """
        gridsearch = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, scoring=score)
        # Search for the best parameters and the best cross-validated score.
        search = gridsearch.fit(train_x, train_y)
        print("GridSearch 最优参数:", search.best_params_)
        print("GridSearch 最优分数: %0.4lf" % search.best_score_)
        # Predict the test labels with the fitted search object.
        predict_y = gridsearch.predict(test_x)
        test_accuracy = accuracy_score(test_y, predict_y)
        print(" 准确率 %0.4lf" % test_accuracy)
        return {'predict_y': predict_y, 'accuracy_score': test_accuracy}


    for model, model_name, model_param_grid in zip(classifiers, classifier_names, classifier_param_grid):
        # The original leaves the StandardScaler and PCA steps commented out,
        # so each pipeline holds only the classifier. The step name must match
        # the prefix used in the parameter-grid keys.
        pipeline = Pipeline([
            (model_name, model),
        ])
        result = GridSearchCV_work(pipeline, train_x, train_y, test_x, test_y, model_param_grid, score='accuracy')

    五、结论和建议 根据以上分析,得到高流失率用户的特征: 用户属性:老年用户,未婚用户,无亲属用户更容易流失; 服务属性:在网时长小于半年,有电话服务,光纤用户/光纤用户附加流媒体电视、电影服务,无互联网增值服务; 合同属性:签订的合同期较短,采用电子支票支付,使用电子账单,月租费约70-110元的客户容易流失; 其它属性对用户流失影响较小,以上特征保持独立。 针对上述结论,从业务角度给出相应建议: 根据预测模型,构建一个高流失率的用户列表。通过用户调研推出一个最小可行化产品功能,并邀请种子用户进行试用。 用户方面:针对老年用户、无亲属、无伴侣用户的特征推出定制服务如亲属套餐、温暖套餐等,一方面加强与其它用户关联度,另一方面对特定用户提供个性化服务。 服务方面:针对新注册用户,推送半年优惠如赠送消费券,以渡过用户流失高峰期。针对光纤用户和附加流媒体电视、电影服务用户,重点在于提升网络体验、增值服务体验,一方面推动技术部门提升网络指标,另一方面对用户承诺免费网络升级和赠送电视、电影等包月服务以提升用户黏性。针对在线安全、在线备份、设备保护、技术支持等增值服务,应重点对用户进行推广介绍,如首月/半年免费体验。 合同方面:针对单月合同用户,建议推出年合同付费折扣活动,将月合同用户转化为年合同用户,提高用户在网时长,以达到更高的用户留存。 针对采用电子支票支付用户,建议定向推送其它支付方式的优惠券,引导用户改变支付方式。

    参考连接添加链接描述

    Processed: 0.011, SQL: 8