Multivariate analysis spelled out in Python: 8-3. K-nearest neighbor method [cross-validation]

import numpy as np
import pandas as pd

#sklearn-based libraries
from sklearn import datasets #data set
from sklearn.model_selection import train_test_split #Data split
from sklearn.neighbors import KNeighborsClassifier #Classification model
from sklearn.neighbors import KNeighborsRegressor #Regression model

#matplotlib libraries
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

!pip install japanize-matplotlib #Module for displaying Japanese text in matplotlib
import japanize_matplotlib

Regression model: Boston house prices

⑴ Data creation

#Get dataset
boston = datasets.load_boston()

#Extract the explanatory variable (column 5 = RM, average number of rooms) and the target variable
X = boston.data[:, 5].reshape(len(boston.data), 1)
y = boston.target.reshape(len(boston.target), 1)
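
Note: load_boston was removed in scikit-learn 1.2. On newer versions, a minimal sketch of an equivalent setup, assuming the OpenML copy of the dataset (name 'boston', version 1) keeps the original column names:

#Alternative for scikit-learn >= 1.2, where load_boston no longer exists
from sklearn.datasets import fetch_openml

boston = fetch_openml(name='boston', version=1, as_frame=True)
X = boston.data[['RM']].to_numpy(dtype=float) #RM = average number of rooms
y = boston.target.to_numpy(dtype=float).reshape(-1, 1)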

#Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

⑵ Cross-validation

#k parameter
n_neighbors = 14

#List to store the scores (R^2 for regression)
score = []

for w in ['uniform', 'distance']:
    #Model generation
    model = KNeighborsRegressor(n_neighbors, weights=w)
    model = model.fit(X_train, y_train)

    #R^2 score on the training data
    r_train = model.score(X_train, y_train)
    score.append(r_train)

    #R^2 score on the test data
    r_test = model.score(X_test, y_test)    
    score.append(r_test)

#Show the scores as a data frame
score = np.array(score)
pd.DataFrame(score.reshape(2,2), 
             columns = ['train', 'test'],
             index = ['uniform', 'distance'])

[Output: train/test R^2 scores for weights='uniform' and weights='distance']
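
Strictly speaking, what is done above is a single holdout split rather than k-fold cross-validation. As a minimal sketch of the latter (my addition), cross_val_score can average the regressor's default R^2 score over, say, 5 folds of the full data:

#k-fold cross-validation: average R^2 over 5 folds instead of one split
from sklearn.model_selection import cross_val_score

for w in ['uniform', 'distance']:
    model = KNeighborsRegressor(n_neighbors, weights=w)
    cv_scores = cross_val_score(model, X, y.ravel(), cv=5) #R^2 per fold
    print(w, round(cv_scores.mean(), 3))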

⑶ Visualization

#k parameter
n_neighbors = 14

#Instance generation
model_u = KNeighborsRegressor(n_neighbors, weights='uniform')
model_d = KNeighborsRegressor(n_neighbors, weights='distance')

#Model generation
model_u = model_u.fit(X_train, y_train)
model_d = model_d.fit(X_train, y_train)

#Forecast
y_u = model_u.predict(X_test)
y_d = model_d.predict(X_test)

plt.figure(figsize=(14,6))

#Scatter plot
plt.scatter(X_test, y_u, color='slateblue', lw=1, label='Predicted value (uniform)')
plt.scatter(X_test, y_d, color='tomato', lw=1, label='Predicted value (distance)')
plt.scatter(X_test, y_test, color='lightgrey', label='Measured value (test)')

plt.legend(fontsize=15)
plt.xlim(3, 9.5)
plt.show()

[Figure: predicted values (uniform, distance) and measured test values plotted against RM]

Classification model: iris sepal length and width

  • The dataset "iris" bundled with sklearn is used, but to simplify the problem we narrow the target variable "species" down to two classes (versicolor = 1, virginica = 2) and treat the task as binary classification.
  • For these two species, individuals are intermingled, so it is hard to draw a boundary between them with the two given variables. Knowing that, observe the difference between uniform and distance.

⑴ Data creation

#Get dataset
iris = datasets.load_iris()

#Extract the explanatory variables and the target variable
X = iris.data[:, :2]
y = iris.target
y = y.reshape(-1, 1) #Shape conversion

#Keep only the two target classes, then reassign the variables
data = np.hstack([X, y]) #Combine X and y
data = data[data[:, 2] != 0] #Drop class 0 (setosa), keeping classes 1 and 2
X = data[:, :2]
y = data[:, -1]

#Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 0)

⑵ Cross-validation

  • Following the previous article, the k parameter (number of neighbors) is kept at 15 (a sketch of choosing k by cross-validation follows the score table below).

#k parameter
n_neighbors = 15

#List to store the accuracy scores
score = []

for w in ['uniform', 'distance']:
    #Model generation
    model = KNeighborsClassifier(n_neighbors, weights=w)
    model = model.fit(X_train, y_train)

    #Accuracy on the training data
    r_train = model.score(X_train, y_train)
    score.append(r_train)

    #Accuracy on the test data
    r_test = model.score(X_test, y_test)    
    score.append(r_test)

#Show the scores as a data frame
score = np.array(score)
pd.DataFrame(score.reshape(2,2), 
             columns = ['train', 'test'],
             index = ['uniform', 'distance'])

          train   test
uniform   0.747  0.600
distance  0.920  0.520

  • uniform scores 74.7% on the training data and 60.0% on the test data.
  • distance, on the other hand, scores a high 92.0% on the training data, but drops by 40 percentage points to 52.0% on the test data.
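
Since k = 15 was simply carried over from the previous article, here is a minimal sketch (my addition) of choosing k by cross-validation instead, scanning odd candidate values on the training data:

#Scan candidate k values by 5-fold cross-validation on the training data
from sklearn.model_selection import cross_val_score

for k in range(1, 31, 2):
    model = KNeighborsClassifier(k, weights='uniform')
    acc = cross_val_score(model, X_train, y_train, cv=5).mean() #mean accuracy over folds
    print(k, round(acc, 3))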

⑶ Visualization

  • The mesh spacing passed to the model when drawing the boundary is 0.02, the same as last time.

#k parameter
n_neighbors = 15

#Mesh spacing
h = 0.02

#Generate a color map for mapping
cmap_surface = ListedColormap(['mistyrose', 'lightcyan'])
cmap_dot = ListedColormap(['tomato', 'slateblue'])

  • Pass the same test data to the two models, one with weights='uniform' and one with weights='distance', draw each decision boundary, and plot the measured test data on top.

plt.figure(figsize=(18,6))

for j, w in enumerate(['uniform', 'distance']):
    #Generate model
    model = KNeighborsClassifier(n_neighbors, weights = w)
    model = model.fit(X_train, y_train)

    #Set test data
    X, y = X_test, y_test

    #Get the minimum and maximum values of the x and y axes
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

    #Generate a grid of points at the specified mesh interval
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), 
                         np.arange(y_min, y_max, h))

    #Predict by passing the grid sequence to the model
    z = np.c_[xx.ravel(), yy.ravel()] #Flatten to one dimension and then join
    Z = model.predict(z) #Forecast
    Z = Z.reshape(xx.shape) #Shape conversion

    #drawing
    plt.subplot(1, 2, j + 1)
    plt.pcolormesh(xx, yy, Z, cmap=cmap_surface, shading='auto') #Color plot ('auto' avoids a shape error on newer matplotlib)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_dot, s=30)

    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xlabel('Sepal length', fontsize=12)
    plt.ylabel('Sepal width', fontsize=12)
    plt.title("'%s'" % (w), fontsize=18)

plt.show()

[Figure: decision boundaries and test data for 'uniform' (left) and 'distance' (right)]

  • The distance model's boundary is complicated by overfitting to the training data; isolated patches appear here and there, but they do not fit the test data and are meaningless.
  • The idea behind uniform is that the positional relationship between the classes is constant (it always holds), while the exact distances between points are accidental (they just happened to turn out that way).
  • distance, which treats such contingencies as information and strongly reflects the idiosyncrasies of the sample, therefore seems unsuitable for building models that need to generalize.
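
To make the difference concrete, a minimal sketch of the two weighting rules (my own illustration of the idea, not sklearn's internal code): 'uniform' counts each of the k neighbors equally, while 'distance' weights each neighbor's vote by the inverse of its distance, so a near coincidence speaks loudly.

#Toy example: 3 nearest neighbors with distances d and class labels
d = np.array([0.5, 1.0, 2.0])
labels = np.array([1, 2, 2])

for name, w in [('uniform', np.ones_like(d)), ('distance', 1.0 / d)]:
    votes = {c: w[labels == c].sum() for c in np.unique(labels)}
    print(name, votes) #uniform: class 2 wins 2 to 1; distance: class 1 wins 2.0 to 1.5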
