Linear, Ridge and Lasso Regression Analysis on Diabetes Dataset¶
In [1]:
from sklearn.datasets import load_diabetes
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import matplotlib.pyplot as plt
In [2]:
feature_names = load_diabetes().feature_names
feature_names
Out[2]:
['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
In [3]:
X, y = load_diabetes(return_X_y=True)
In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
In [5]:
print(f'Shape for X Training Data: {X_train.shape}')
print(f'Shape for X Testing Data: {X_test.shape}')
Shape for X Training Data: (353, 10)
Shape for X Testing Data: (89, 10)
In [6]:
# create dictionaries to hold metrics for comparison
cv_scores = {}
cv_mean = {}
r2_training = {}
r2_testing = {}
Linear Regression¶
In [7]:
model_pipeline = Pipeline([
    ('scale', None),
    ('model', LinearRegression())
])
display(model_pipeline)
model_name = 'LinRegression'
# run cross-validation once and reuse the scores for both metrics
scores = cross_val_score(model_pipeline, X_train, y_train, scoring='r2')
cv_scores[model_name] = scores.round(2)
cv_mean[model_name] = scores.mean().round(2)
model_pipeline.fit(X_train, y_train)
r2_training[model_name] = r2_score(y_train, model_pipeline.predict(X_train)).round(2)
r2_testing[model_name] = r2_score(y_test, model_pipeline.predict(X_test)).round(2)
print(f'R2 Training Score: {r2_training[model_name]}')
print(f'R2 Testing Score: {r2_testing[model_name]}')
print('\nRegression formula:')
(
    list(zip(feature_names, model_pipeline.named_steps.model.coef_.round(2)))
    + [('intercept', model_pipeline.named_steps.model.intercept_.round(2))]
)
Pipeline(steps=[('scale', None), ('model', LinearRegression())])
R2 Training Score: 0.55
R2 Testing Score: 0.33

Regression formula:
Out[7]:
[('age', -35.55), ('sex', -243.17), ('bmi', 562.76), ('bp', 305.46), ('s1', -662.7), ('s2', 324.21), ('s3', 24.75), ('s4', 170.32), ('s5', 731.64), ('s6', 43.03), ('intercept', 152.54)]
Ridge Regression¶
In [8]:
model_pipeline.set_params(scale=StandardScaler(), model=Ridge(alpha=15))
display(model_pipeline)
model_name = 'Ridge'
# run cross-validation once and reuse the scores for both metrics
scores = cross_val_score(model_pipeline, X_train, y_train, scoring='r2')
cv_scores[model_name] = scores.round(2)
cv_mean[model_name] = scores.mean().round(2)
model_pipeline.fit(X_train, y_train)
r2_training[model_name] = r2_score(y_train, model_pipeline.predict(X_train)).round(2)
r2_testing[model_name] = r2_score(y_test, model_pipeline.predict(X_test)).round(2)
print(f'R2 Training Score: {r2_training[model_name]}')
print(f'R2 Testing Score: {r2_testing[model_name]}')
print('\nRegression formula:')
(
    list(zip(feature_names, model_pipeline.named_steps.model.coef_.round(2)))
    + [('intercept', model_pipeline.named_steps.model.intercept_.round(2))]
)
Pipeline(steps=[('scale', StandardScaler()), ('model', Ridge(alpha=15))])
R2 Training Score: 0.55
R2 Testing Score: 0.33

Regression formula:
Out[8]:
[('age', -1.31), ('sex', -10.89), ('bmi', 26.95), ('bp', 13.82), ('s1', -7.14), ('s2', -3.38), ('s3', -9.27), ('s4', 5.42), ('s5', 25.22), ('s6', 2.73), ('intercept', 151.61)]
Note that Ridge regression shrinks the coefficient magnitudes toward zero, but never exactly to zero. The R2 training and testing scores are the same as those of Linear Regression.
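To see this shrinkage directly, a minimal sketch like the one below (the alpha grid is an arbitrary choice for illustration, and ridge_pipe is a hypothetical throwaway name so the main model_pipeline is left untouched) refits a scaled Ridge pipeline at several alphas and prints the largest coefficient magnitude, which falls as alpha grows but never hits zero:

In [ ]:
# Sketch: Ridge coefficients shrink toward zero as alpha grows,
# but never reach exactly zero. The alpha grid is arbitrary.
for alpha in [0.1, 1, 15, 100, 1000]:
    ridge_pipe = Pipeline([('scale', StandardScaler()), ('model', Ridge(alpha=alpha))])
    ridge_pipe.fit(X_train, y_train)
    print(f'alpha={alpha}: max |coef| = {abs(ridge_pipe.named_steps.model.coef_).max():.2f}')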
Lasso Regression¶
In [9]:
model_pipeline.set_params(scale=StandardScaler(), model=Lasso(alpha=15))
display(model_pipeline)
model_name = 'Lasso'
# run cross-validation once and reuse the scores for both metrics
scores = cross_val_score(model_pipeline, X_train, y_train, scoring='r2')
cv_scores[model_name] = scores.round(2)
cv_mean[model_name] = scores.mean().round(2)
model_pipeline.fit(X_train, y_train)
r2_training[model_name] = r2_score(y_train, model_pipeline.predict(X_train)).round(2)
r2_testing[model_name] = r2_score(y_test, model_pipeline.predict(X_test)).round(2)
print(f'R2 Training Score: {r2_training[model_name]}')
print(f'R2 Testing Score: {r2_testing[model_name]}')
print('\nRegression formula:')
(
    list(zip(feature_names, model_pipeline.named_steps.model.coef_.round(2)))
    + [('intercept', model_pipeline.named_steps.model.intercept_.round(2))]
)
Pipeline(steps=[('scale', StandardScaler()), ('model', Lasso(alpha=15))])
R2 Training Score: 0.45
R2 Testing Score: 0.3

Regression formula:
Out[9]:
[('age', 0.0), ('sex', -0.0), ('bmi', 22.87), ('bp', 2.07), ('s1', -0.0), ('s2', -0.0), ('s3', -1.44), ('s4', 0.0), ('s5', 19.18), ('s6', 0.0), ('intercept', 151.61)]
In Lasso regression, as alpha increases, more coefficients are driven exactly to zero. This makes Lasso well suited for feature selection.
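To make the selection effect concrete, a short sketch over the Lasso pipeline fitted above can filter out the zeroed coefficients and keep only the surviving features (with alpha=15 that should be bmi, bp, s3, and s5, per the output above):

In [ ]:
# Sketch: features that survive Lasso's shrinkage (nonzero coefficients)
# from the model_pipeline fitted above with alpha=15.
kept = [(name, round(float(coef), 2))
        for name, coef in zip(feature_names, model_pipeline.named_steps.model.coef_)
        if coef != 0]
print('Features selected by Lasso:', kept)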
Graphs¶
In [10]:
plt.subplots(figsize=(7,5))
plt.boxplot(cv_scores.values(), labels=cv_scores.keys())
plt.title('Cross Validation Scores for Different Models')
plt.xlabel('Model Name')
plt.ylabel('R2 Score')
plt.show()
During cross validation, Linear Regression and Ridge regression give essentially the same R2 scores, because apart from the scaling step the two models behave almost identically here.
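The numbers behind the boxplot bear this out; a quick loop over the cv_mean dictionary populated in the cells above prints the mean cross-validation score per model:

In [ ]:
# Sketch: mean cross-validation R2 score per model, from the
# cv_mean dictionary collected earlier.
for name, score in cv_mean.items():
    print(f'{name}: mean CV R2 = {score}')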
In [27]:
plt.subplots(figsize=(7,5))
plt.scatter(y=r2_training.values(), x=r2_training.keys(), label='Training R2')
plt.scatter(y=r2_testing.values(), x=r2_testing.keys(), label='Testing R2')
plt.legend(loc=[1.1,0.5])
plt.title('R2 Scores for Testing and Training')
plt.xlabel('Model Name')
plt.ylabel('R2 Scores')
plt.show()