[PYTHON] Regression analysis method

Loading the Boston house price data

from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

boston = load_boston()

df = pd.DataFrame(boston["data"], columns = boston["feature_names"])

df["PRICE"] = boston["target"]

df.head()

Implemented with scikit-learn (hyperparameters are rough, untuned values)

#How to use scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.ensemble import GradientBoostingRegressor

#Input data
X = df.drop("PRICE", axis=1)
Y = df["PRICE"]

#Divided into train data and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

#Standardize values
#Use separate scalers for the features and the target, fitted on the training data only
sc_x = StandardScaler()
sc_y = StandardScaler()
X_train_std = sc_x.fit_transform(X_train)
Y_train_std = sc_y.fit_transform(Y_train.values.reshape(-1, 1))
X_test_std = sc_x.transform(X_test)
Y_test_std = sc_y.transform(Y_test.values.reshape(-1, 1))
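#The standardized arrays are not used by the models below, which are fit on the raw features.
#A minimal sketch of how they could be used: train a scale-sensitive model such as SVR on the
#standardized data and map its predictions back to the original price scale (illustrative only)
model_svr_std = svm.SVR(C=1.0, kernel='linear', epsilon=0.1)
model_svr_std.fit(X_train_std, Y_train_std.ravel())
Y_pred_std = model_svr_std.predict(X_test_std).reshape(-1, 1)
Y_pred_orig = sc_y.inverse_transform(Y_pred_std)  #predictions on the original price scale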


#Linear regression
print("***Linear regression***")
model_linear = LinearRegression()
model_linear.fit(X_train, Y_train)
print("Correlation coefficient of training data:", model_linear.score(X_train, Y_train))
print("Correlation coefficient of validation data:", model_linear.score(X_test, Y_test))
Y_train_pred = model_linear.predict(X_train)
Y_test_pred = model_linear.predict(X_test)
plt.scatter(Y_train,Y_train_pred, label = "train_data")
plt.scatter(Y_test,Y_test_pred, label = "test_data")
plt.legend()
plt.show()
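#A minimal sketch of inspecting the fitted linear model: coef_ holds one weight per feature
#and intercept_ is the bias term (the printing below is illustrative only)
coefficients = pd.Series(model_linear.coef_, index=X.columns)
print("Intercept:", model_linear.intercept_)
print(coefficients.sort_values())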

#Linear kernel SVM regression
print("***SVM regression***")
#Regularization parameters=1, use linear kernel
model_svm = svm.SVR(C=1.0, kernel='linear', epsilon=0.1)
model_svm.fit(X_train, Y_train)
print("Correlation coefficient of training data:", model_svm.score(X_train, Y_train))
print("Correlation coefficient of validation data:", model_svm.score(X_test, Y_test))
Y_train_pred = model_svm.predict(X_train)
Y_test_pred = model_svm.predict(X_test)
plt.scatter(Y_train,Y_train_pred, label = "train_data")
plt.scatter(Y_test,Y_test_pred, label = "test_data")
plt.legend()
plt.show()

#Ridge regression
print("***Ridge regression***")
model_ridge = Ridge(alpha=1.0, fit_intercept=True, 
                           copy_X=True, 
                           max_iter=None, tol=0.001, random_state=0)
model_ridge.fit(X_train, Y_train)
print("Correlation coefficient of training data:", model_ridge.score(X_train, Y_train))
print("Correlation coefficient of validation data:", model_ridge.score(X_test, Y_test))
Y_train_pred = model_ridge.predict(X_train)
Y_test_pred = model_ridge.predict(X_test)
plt.scatter(Y_train,Y_train_pred, label = "train_data")
plt.scatter(Y_test,Y_test_pred, label = "test_data")
plt.legend()
plt.show()

#Lasso regression
print("***Lasso regression***")
model_lasso = Lasso(alpha=1.0, fit_intercept=True, 
                           copy_X=True, 
                           max_iter=1000, tol=0.0001, 
                           warm_start=False, positive=False, 
                           random_state=None, selection="cyclic")
model_lasso.fit(X_train, Y_train)
print("Correlation coefficient of training data:", model_lasso.score(X_train, Y_train))
print("Correlation coefficient of validation data:", model_lasso.score(X_test, Y_test))
Y_train_pred = model_lasso.predict(X_train)
Y_test_pred = model_lasso.predict(X_test)
plt.scatter(Y_train,Y_train_pred, label = "train_data")
plt.scatter(Y_test,Y_test_pred, label = "test_data")
plt.legend()
plt.show()
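#A minimal sketch of inspecting the L1-regularized fit: Lasso drives some coefficients to
#exactly zero, and the count of zeroed features can be checked directly (illustrative only)
lasso_coef = pd.Series(model_lasso.coef_, index=X.columns)
print("Number of coefficients set to zero:", (lasso_coef == 0).sum())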

#Elastic net regression
print("***Elastic net regression***")
model_lasso_elasticnet = ElasticNet(alpha=1.0, l1_ratio=0.5, 
                                fit_intercept=True, 
                                max_iter=1000, copy_X=True, 
                                tol=0.0001, warm_start=False, 
                                positive=False, random_state=None, 
                                selection='cyclic')
model_lasso_elasticnet.fit(X_train, Y_train)
print("Correlation coefficient of training data:", model_lasso_elasticnet.score(X_train, Y_train))
print("Correlation coefficient of validation data:", model_lasso_elasticnet.score(X_test, Y_test))
Y_train_pred = model_lasso_elasticnet.predict(X_train)
Y_test_pred = model_lasso_elasticnet.predict(X_test)
plt.scatter(Y_train,Y_train_pred, label = "train_data")
plt.scatter(Y_test,Y_test_pred, label = "test_data")
plt.legend()
plt.show()

#Random forest regression
print("***Random forest regression***")
model_randomforest = RandomForestRegressor(bootstrap=True, criterion='squared_error', max_depth=None,
           max_features=1.0, max_leaf_nodes=None,
           min_impurity_decrease=0.0,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=2525, verbose=0, warm_start=False)
model_randomforest.fit(X_train, Y_train)
print("Correlation coefficient of training data:", model_randomforest.score(X_train, Y_train))
print("Correlation coefficient of validation data:", model_randomforest.score(X_test, Y_test))
Y_train_pred = model_randomforest.predict(X_train)
Y_test_pred = model_randomforest.predict(X_test)
plt.scatter(Y_train,Y_train_pred, label = "train_data")
plt.scatter(Y_test,Y_test_pred, label = "test_data")
plt.legend()
plt.show()
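#A minimal sketch of inspecting the fitted forest: feature_importances_ gives one score per
#feature, here sorted in descending order (illustrative only)
importances = pd.Series(model_randomforest.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False))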

#Gradient boosting regression
print("Gradient boosting regression")
model_gbc = GradientBoostingRegressor(random_state=0)
model_gbc.fit(X_train, Y_train)
print("Correlation coefficient of training data:", model_gbc.score(X_train, Y_train))
print("Correlation coefficient of validation data:", model_gbc.score(X_test, Y_test))
Y_train_pred = model_gbc.predict(X_train)
Y_test_pred = model_gbc.predict(X_test)
plt.scatter(Y_train,Y_train_pred, label = "train_data")
plt.scatter(Y_test,Y_test_pred, label = "test_data")
plt.legend()
plt.show()

How to create polynomial terms (second order, third order, etc.)


from sklearn.preprocessing import PolynomialFeatures
df1 = pd.DataFrame([[1,2,3], [4,5,6], [7,8,9]] ,columns=["col_a", "col_b", "col_c"])
print(df1)
pf = PolynomialFeatures(degree=2, include_bias=False)
df2 = pd.DataFrame(pf.fit_transform(df1), columns=pf.get_feature_names_out(df1.columns))
print(df2)
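A minimal sketch of feeding the generated polynomial terms into a regression model: chain PolynomialFeatures with LinearRegression in a pipeline and fit it on the Boston train/test split from above (the pipeline itself is an assumption for illustration).


from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

#Degree-2 polynomial features followed by an ordinary least-squares fit
model_poly = make_pipeline(PolynomialFeatures(degree=2, include_bias=False), LinearRegression())
model_poly.fit(X_train, Y_train)
print("R^2 for training data:", model_poly.score(X_train, Y_train))
print("R^2 for test data:", model_poly.score(X_test, Y_test))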

Evaluation method


#Mean squared error (the mean of the squared residuals)
from sklearn.metrics import mean_squared_error
mean_squared_error(y, y_pred)

#Coefficient of determination (evaluates the goodness of the model fit, usually between 0.0 and 1.0)
#However, it can also be negative, which indicates a poor fit
from sklearn.metrics import r2_score
r2_score(y, y_pred)
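A minimal sketch applying both metrics to the linear model fitted earlier (model_linear and the train/test split from above are assumed to be in scope):


from sklearn.metrics import mean_squared_error, r2_score

Y_test_pred = model_linear.predict(X_test)
print("MSE:", mean_squared_error(Y_test, Y_test_pred))
print("R^2:", r2_score(Y_test, Y_test_pred))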

Cross-validation

・ If "f1" is specified for scoring, the model is evaluated with the F1 score.


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
cv = KFold(5, shuffle=True)
model_rfc_1 = RandomForestClassifier()
cross_val_score(model_rfc_1, X, y, cv=cv, scoring='accuracy')
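The snippet above is written for a classifier; a minimal sketch of the same idea for the regression setting of this post, scoring with R^2 on the Boston data loaded earlier:


from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor

cv = KFold(5, shuffle=True, random_state=0)
model_rfr = RandomForestRegressor(n_estimators=10, random_state=0)
scores = cross_val_score(model_rfr, X, Y, cv=cv, scoring='r2')
print(scores, scores.mean())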

Grid search


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
cv = KFold(5, shuffle=True)
param_grid = {'max_depth': [5, 10, 15], 'n_estimators': [10, 20, 30]}
model_rfc_2 = RandomForestClassifier()
grid_search = GridSearchCV(model_rfc_2, param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X, y)
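After fitting, the best hyperparameter combination and the refit model can be read off the standard GridSearchCV attributes; a minimal usage sketch:


print(grid_search.best_params_)   #best combination found by the search
print(grid_search.best_score_)    #mean cross-validated score for that combination
best_model = grid_search.best_estimator_   #estimator refit on the whole data with the best parameters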
