"""Linear-regression house-price models built on usa_housing_price.csv.

Task: based on the usa_housing_price.csv data, build linear regression
models to predict a reasonable house price.

1. With sqft_living as the input variable, build a single-factor model,
   evaluate its performance, and visualize the regression prediction.
2. With sqft_living, sqft_lot, sqft_above, yr_built and lat as input
   variables, build a multi-factor model and evaluate its performance.
3. Predict the reasonable price for sqft_living=1180, sqft_lot=5650,
   sqft_above=1180, yr_built=1955, lat=47.5112.

This file was exported from a notebook. Cell outputs are preserved below as
comments; tables were reconstructed from the flattened export.
"""

# load the data
import pandas as pd
import numpy as np

data = pd.read_csv('usa_housing_price.csv')
print(data.head())
# Notebook output (reconstructed):
#       price  sqft_living  sqft_lot  sqft_above  yr_built      lat
# 0  221900.0         1180      5650        1180      1955  47.5112
# 1  538000.0         2570      7242        2170      1951  47.7210
# 2  180000.0          770     10000         770      1933  47.7379
# 3  604000.0         1960      5000        1050      1965  47.5208
# 4  510000.0         1680      8080        1680      1987  47.6168

# %matplotlib inline  (IPython magic: only valid inside a notebook cell)
from matplotlib import pyplot as plt

# Scatter each candidate feature against price on a 2x3 grid.
fig = plt.figure(figsize=(10, 10))
fig1 = plt.subplot(231)
plt.scatter(data.loc[:, 'sqft_lot'], data.loc[:, 'price'])
plt.title('price vs sqft_lot')
fig2 = plt.subplot(232)
plt.scatter(data.loc[:, 'sqft_living'], data.loc[:, 'price'])
plt.title('price vs sqft_living')
fig3 = plt.subplot(233)
plt.scatter(data.loc[:, 'sqft_above'], data.loc[:, 'price'])
plt.title('price vs sqft_above')
fig4 = plt.subplot(234)
plt.scatter(data.loc[:, 'yr_built'], data.loc[:, 'price'])
# Fixed: this panel plots yr_built but was titled 'price vs sqft_living'.
plt.title('price vs yr_built')
fig5 = plt.subplot(235)
plt.scatter(data.loc[:, 'lat'], data.loc[:, 'price'])
plt.title('price vs lat')
plt.show()

# define X and y
X = data.loc[:, 'sqft_living']
y = data.loc[:, 'price']
print(y.head())
# Notebook output:
# 0    221900.0
# 1    538000.0
# 2    180000.0
# 3    604000.0
# 4    510000.0
# Name: price, dtype: float64

# scikit-learn expects a 2-D (n_samples, n_features) input, so the single
# feature column is reshaped into one column.
X = np.array(X).reshape(-1, 1)
print(X.shape)
# Notebook output: (21613, 1)

# set up the linear regression model
from sklearn.linear_model import LinearRegression

LR1 = LinearRegression()
# train the model
LR1.fit(X, y)

# calculate the predicted price from sqft_living
y_predict_1 = LR1.predict(X)
print(y_predict_1)
# Notebook output:
# [287484.29258296 677805.59158496 172353.54971186 ...
#  242555.22219424 405423.10235335 242555.22219424]

# evaluate the model
from sklearn.metrics import mean_squared_error, r2_score

mean_squared_error_1 = mean_squared_error(y, y_predict_1)
r2_score_1 = r2_score(y, y_predict_1)
print(mean_squared_error_1, r2_score_1)
# Notebook output: 68437189845.45986 0.4928653865220143

# Overlay the fitted line (red) on the raw sqft_living/price scatter.
fig6 = plt.figure(figsize=(8, 5))
plt.scatter(X, y)
plt.plot(X, y_predict_1, 'r')
plt.show()

# define X_multi
# Select the features by name so the column set and order are explicit and
# cannot drift if the CSV gains extra columns.
FEATURES = ['sqft_living', 'sqft_lot', 'sqft_above', 'yr_built', 'lat']
X_multi = data.loc[:, FEATURES]
print(X_multi)
# Notebook output (reconstructed):
#        sqft_living  sqft_lot  sqft_above  yr_built      lat
# 0             1180      5650        1180      1955  47.5112
# 1             2570      7242        2170      1951  47.7210
# 2              770     10000         770      1933  47.7379
# 3             1960      5000        1050      1965  47.5208
# 4             1680      8080        1680      1987  47.6168
# ...            ...       ...         ...       ...      ...
# 21608         1530      1131        1530      2009  47.6993
# 21609         2310      5813        2310      2014  47.5107
# 21610         1020      1350        1020      2009  47.5944
# 21611         1600      2388        1600      2004  47.5345
# 21612         1020      1076        1020      2008  47.5941
# 21613 rows x 5 columns

# set up 2nd linear model
LR_multi = LinearRegression()
# train the model
LR_multi.fit(X_multi, y)

# make prediction
y_predict_multi = LR_multi.predict(X_multi)
print(y_predict_multi)
# Notebook output:
# [279081.29418243 832377.00440778 346082.19661749 ... 176687.80346459
#  325324.21431742 178529.30559747]

mean_squared_error_multi = mean_squared_error(y, y_predict_multi)
r2_score_multi = r2_score(y, y_predict_multi)
print(mean_squared_error_multi, r2_score_multi)
# Notebook output: 55814504752.37977 0.5864022564634701

# Actual vs. predicted price for the multi-factor model.
fig7 = plt.figure(figsize=(8, 5))
plt.scatter(y, y_predict_multi)
plt.show()

# Predict the price of the single house described in the task.
X_test = [1180, 5650, 1180, 1955, 47.5112]
X_test = np.array(X_test).reshape(1, -1)
print(X_test)
# Notebook output: [[1180.     5650.     1180.     1955.       47.5112]]

# The model was fit on a DataFrame, so query it with matching column names;
# passing the bare ndarray makes scikit-learn warn about missing feature names.
X_test_frame = pd.DataFrame(X_test, columns=FEATURES)
y_test_predict = LR_multi.predict(X_test_frame)
print(y_test_predict)
# Notebook output: [279081.29418243]