[Python] Multivariable regression model with scikit-learn: SVR comparison and verification

Introduction

I'm trying to build a multivariable regression model, and I want to try out several machine learning methods and compare and verify their accuracy.

The Python machine learning library scikit-learn provides convenient implementations of many such methods, so I gave it a quick try.
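All scikit-learn regressors expose the same fit / predict interface, which is what makes this kind of comparison convenient: you mostly just swap estimator objects. Below is a rough sketch of that pattern; the toy data and the choice of models are placeholders for illustration, not the experiment that follows.


import numpy as np
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Placeholder data: 40 samples of a single feature
X = np.sort(5 * np.random.rand(40, 1), axis=0)
y = np.sin(X).ravel()

# Any scikit-learn regressor could be dropped into this dict
models = {'RBF': SVR(kernel='rbf', C=1e3, gamma=0.1),
          'Linear': SVR(kernel='linear', C=1e3)}

for name, model in models.items():
    pred = model.fit(X, y).predict(X)
    print("%s MSE: %f" % (name, mean_squared_error(y, pred)))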

First demo

The following demo is introduced in the scikit-learn Examples (Support Vector Regression):


import numpy as np
from sklearn.svm import SVR
import matplotlib.pyplot as plt
%matplotlib inline

#Generate input with random numbers
X = np.sort(5 * np.random.rand(40, 1), axis=0)
#The output is a sin function
y = np.sin(X).ravel()

#Add noise to the output
y[::5] += 3 * (0.5 - np.random.rand(8))

#RBF kernel, linear, polynomial fitting
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_lin = SVR(kernel='linear', C=1e3)
svr_poly = SVR(kernel='poly', C=1e3, degree=2)
y_rbf = svr_rbf.fit(X, y).predict(X)
y_lin = svr_lin.fit(X, y).predict(X)
y_poly = svr_poly.fit(X, y).predict(X)

#Create a diagram
plt.figure(figsize=[10, 5])
plt.scatter(X, y, c='k', label='data')
# (plt.hold has been removed from matplotlib; overlaying plots is now the default)
plt.plot(X, y_rbf, c='g', label='RBF model')
plt.plot(X, y_lin, c='r', label='Linear model')
plt.plot(X, y_poly, c='b', label='Polynomial model')
plt.xlabel('data')
plt.ylabel('target')
plt.title('Support Vector Regression')
plt.legend()
plt.show()

Result: svm.png

I actually tried it

Conditions


import numpy as np
from sklearn.svm import SVR
import matplotlib.pyplot as plt

# Generate the four inputs with random numbers (ranges chosen arbitrarily)
X1 = np.sort(5 * np.random.rand(40), axis=0)
X2 = np.sort(3 * np.random.rand(40), axis=0)
X3 = np.sort(9 * np.random.rand(40), axis=0)
X4 = np.sort(4 * np.random.rand(40), axis=0)

# Stack the inputs into a single (40, 4) feature matrix
X = np.c_[X1, X2, X3, X4]

# Compute the output as a combination of sin and cos of the inputs
y = np.sin(X1).ravel() + np.cos(X2).ravel() + np.sin(X3).ravel() - np.cos(X4).ravel()

y_o = y.copy()  # keep a copy of the noise-free output

#Add noise
y[::5] += 3 * (0.5 - np.random.rand(8))

# Fit SVR models with RBF, linear, and polynomial kernels
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_lin = SVR(kernel='linear', C=1e3)
svr_poly = SVR(kernel='poly', C=1e3, degree=3)
y_rbf = svr_rbf.fit(X, y).predict(X)
y_lin = svr_lin.fit(X, y).predict(X)
y_poly = svr_poly.fit(X, y).predict(X)

#Prepare test data
test_X1 = np.sort(5 * np.random.rand(40, 1).reshape(40), axis=0)
test_X2 = np.sort(3 * np.random.rand(40, 1).reshape(40), axis=0)
test_X3 = np.sort(9 * np.random.rand(40, 1).reshape(40), axis=0)
test_X4 = np.sort(4 * np.random.rand(40, 1).reshape(40), axis=0)

test_X = np.c_[test_X1, test_X2, test_X3, test_X4]
test_y = np.sin(test_X1).ravel() + np.cos(test_X2).ravel() + np.sin(test_X3).ravel() - np.cos(test_X4).ravel()

# Predict on the test data
test_rbf = svr_rbf.predict(test_X)
test_lin = svr_lin.predict(test_X)
test_poly = svr_poly.predict(test_X)

Below is the verification code:


from sklearn.metrics import mean_squared_error
from math import sqrt

#Correlation coefficient calculation
rbf_corr = np.corrcoef(test_y, test_rbf)[0, 1]
lin_corr = np.corrcoef(test_y, test_lin)[0, 1]
poly_corr = np.corrcoef(test_y, test_poly)[0, 1]

#Calculate RMSE
rbf_rmse = sqrt(mean_squared_error(test_y, test_rbf))
lin_rmse = sqrt(mean_squared_error(test_y, test_lin))
poly_rmse = sqrt(mean_squared_error(test_y, test_poly))

print "RBF: RMSE %f \t\t Corr %f" % (rbf_rmse, rbf_corr)
print "Linear: RMSE %f \t Corr %f" % (lin_rmse, lin_corr)
print "Poly: RMSE %f \t\t Corr %f" % (poly_rmse, poly_corr)

I got the following results:



RBF: RMSE 0.707305 		 Corr 0.748894
Linear: RMSE 0.826913 	 Corr 0.389720
Poly: RMSE 2.913726 	 Corr -0.614328
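
The RBF kernel gives the lowest RMSE and the highest correlation here. As an optional visual check (a rough sketch that assumes test_y, test_rbf, test_lin, and test_poly from the code above are still in scope), the predictions can also be plotted against the true test values; a perfect model would put every point on the diagonal.


import matplotlib.pyplot as plt

# Predicted vs. true values on the test data; the dashed line is the ideal fit
plt.figure(figsize=[6, 6])
plt.scatter(test_y, test_rbf, c='g', label='RBF model')
plt.scatter(test_y, test_lin, c='r', label='Linear model')
plt.scatter(test_y, test_poly, c='b', label='Polynomial model')
lims = [test_y.min(), test_y.max()]
plt.plot(lims, lims, 'k--', label='ideal')
plt.xlabel('true target')
plt.ylabel('predicted target')
plt.title('Predicted vs. true (test data)')
plt.legend()
plt.show()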
