Following on from Chapter 4, this post reproduces in Python the execution examples of "First Pattern Recognition" (Hajipata). This time it is Chapter 5, "k-Nearest Neighbor Method (kNN Method)".
However, I gave up on reproducing the execution examples that use the ETL1 handwritten digit database (Execution Examples 5.1 and 5.2, and Figure 5.9 (a)), because they involve a step in which the data is randomly sampled. The code below was run with numpy 1.16.2, pandas 0.24.2, and scikit-learn 0.20.3.
The k-nearest neighbor method (kNN method) is a classification method that finds the k templates nearest to a given sample and assigns the sample to the class that the majority of those templates belong to. Below is the code.
--Loading the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
from sklearn.preprocessing import StandardScaler
--Implementation of the k-nearest neighbor method
class KNearestNeighborsClassifier:
    def __init__(self, k=5):
        self._k = k
        self._templates = None
        self._classes = None

    def fit(self, templates, classes):
        self._templates = templates
        self._classes = classes

    def predict(self, X):
        pred = np.zeros(len(X))
        for i in range(len(X)):  # Process each sample with a for loop for readability
            distance = np.sum((X[i, :] - self._templates)**2, axis=1)  # Squared Euclidean distance between X[i] and each template (skipping the square root does not change the ranking)
            idx = np.argsort(distance)[:self._k]  # Indices of the k templates closest to X[i]
            values, counts = np.unique(self._classes[idx], return_counts=True)  # Count how many of the k templates belong to each class
            cls = values[counts == np.max(counts)]  # Classes that attain the maximum count
            if len(cls) == 1:
                pred[i] = cls[0]  # If a single class has the most templates, return that class
            else:
                pred[i] = np.nan  # If several classes tie for the most templates, reject by returning np.nan
        return pred

    def error_rate(self, y_true, y_pred):
        # Error rate; comparisons with np.nan are False, so rejected samples are not counted as errors
        return np.sum(((y_true == 1) & (y_pred == 0)) | ((y_true == 0) & (y_pred == 1))) / len(y_pred)

    def reject_rate(self, y_pred):
        return np.sum(np.isnan(y_pred)) / len(y_pred)  # Reject rate
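--A quick check on toy data
A minimal smoke test of the classifier on hand-made 2D points (the data below is made up purely for illustration):
toy_X = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
toy_y = np.array([0, 0, 1, 1])
knn = KNearestNeighborsClassifier(k=3)
knn.fit(toy_X, toy_y)
print(knn.predict(np.array([[0.1, 0.2], [0.9, 0.8]])))  # Expected output: [0. 1.]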
--Reading the Pima Indians data
The data is taken from the Rdatasets repository (the Pima.tr and Pima.te datasets of R's MASS package).
pima_train = pd.read_csv('https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/MASS/Pima.tr.csv')
pima_test = pd.read_csv('https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/MASS/Pima.te.csv')
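A quick look at the loaded data; the shapes in the comment are what I would expect for Pima.tr and Pima.te, so treat them as an assumption:
print(pima_train.shape, pima_test.shape)  # Expected: (200, 9) and (332, 9)
print(pima_train.columns.tolist())        # Seven numeric features plus 'Unnamed: 0' and 'type'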
--Calculate the error rate by the holdout method
# Prepare the training data (X_train, Y_train) and validation data (X_test, Y_test)
sc = StandardScaler()
X_train = sc.fit_transform(pima_train.drop(['Unnamed: 0', 'type'], axis=1))  # Fit the scaler on the training data only
Y_train = pima_train['type'].map({'Yes':1, 'No':0}).values
X_test = sc.transform(pima_test.drop(['Unnamed: 0', 'type'], axis=1))  # Apply the training-data scaling to the test data
Y_test = pima_test['type'].map({'Yes':1, 'No':0}).values
# Calculate the error rate as k (odd values only) is varied
hold_out_error_rate_list = np.empty((0, 2), float)
for k in range(1, 120, 2):
    knn = KNearestNeighborsClassifier(k=k)
    knn.fit(X_train, Y_train)
    Y_pred = knn.predict(X_test)
    error_rate = knn.error_rate(y_true=Y_test, y_pred=Y_pred)
    hold_out_error_rate_list = np.append(hold_out_error_rate_list, np.array([[k, error_rate]]), axis=0)
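The best k can then be read off directly, and a single point of the sweep can be cross-checked against scikit-learn. A minimal sketch; note that sklearn.neighbors.KNeighborsClassifier breaks majority ties instead of rejecting, so even-k results may differ from the implementation above:
print(hold_out_error_rate_list[np.argmin(hold_out_error_rate_list[:, 1])])  # [best k, its error rate]
from sklearn.neighbors import KNeighborsClassifier
sk_knn = KNeighborsClassifier(n_neighbors=5)
sk_knn.fit(X_train, Y_train)
print(np.mean(sk_knn.predict(X_test) != Y_test))  # Should match the k=5 error rate in the sweep above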
--Calculate the error rate by the leave-one-out method
#Data preparation
pima = pd.concat([pima_train, pima_test])
sc = StandardScaler()
X = sc.fit_transform(pima.drop(['Unnamed: 0', 'type'], axis=1))
Y = pima['type'].map({'Yes':1, 'No':0}).values
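Note that fitting the scaler on the whole dataset lets each held-out sample influence the standardization. A stricter leave-one-out would refit the scaler inside the loop below; a sketch, where X_raw stands for the hypothetical unscaled feature matrix:
# sc_i = StandardScaler().fit(X_raw[idx])                        # Scaler fitted without sample i
# knn.fit(sc_i.transform(X_raw[idx]), Y[idx])
# Y_pred = knn.predict(sc_i.transform(X_raw[i].reshape(1, -1)))  # Scale the held-out sample with that scaler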
# Calculate the error rate as k (odd values only) is varied
leave_one_out_error_rate_list = np.empty((0, 2), float)
for k in range(1, 120, 2):
    error_list = []
    for i in range(len(X)):
        idx = np.ones(len(X), dtype=bool)
        idx[i] = False  # The sample at index i is left out of training
        knn = KNearestNeighborsClassifier(k=k)
        knn.fit(X[idx], Y[idx])  # Train on all samples except index i
        Y_pred = knn.predict(X[i].reshape([1, -1]))
        error = knn.error_rate(y_true=Y[i], y_pred=Y_pred)  # Validate on the held-out sample i
        error_list.append(error)
    error_rate = np.mean(error_list)
    leave_one_out_error_rate_list = np.append(leave_one_out_error_rate_list, np.array([[k, error_rate]]), axis=0)
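The double loop above refits the classifier once per sample for every k, which is slow. A faster sketch, assuming the full pairwise distance matrix fits in memory (for the roughly 500 Pima samples it easily does): compute the distances once and reuse them for every k. This reproduces the leave-one-out error for odd k only, where no ties occur:
D = np.sum((X[:, None, :] - X[None, :, :])**2, axis=2)  # All pairwise squared Euclidean distances
np.fill_diagonal(D, np.inf)                             # A sample must never be its own neighbor
order = np.argsort(D, axis=1)                           # Neighbors of every sample, nearest first
for k in range(1, 120, 2):
    votes = Y[order[:, :k]]                                 # Classes of the k nearest neighbors
    Y_pred = (2 * np.sum(votes, axis=1) > k).astype(float)  # Majority vote; odd k, so no ties
    print(k, np.mean(Y_pred != Y))                          # Leave-one-out error rate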
--Change in the error rate as k varies --Fig. 5.9 (b)
plt.scatter(hold_out_error_rate_list[:, 0], hold_out_error_rate_list[:, 1], label='hold out')
plt.scatter(leave_one_out_error_rate_list[:, 0], leave_one_out_error_rate_list[:, 1], label='leave one out')
plt.legend()
plt.xlabel('k')
plt.ylabel('error rate')
plt.show()
--Calculation of error rate
# Calculate the holdout error rate as k is varied over odd values
odd_error_rate_list = np.empty((0, 2), float)
for k in range(1, 62, 2):
    knn = KNearestNeighborsClassifier(k=k)
    knn.fit(X_train, Y_train)
    Y_pred = knn.predict(X_test)
    error_rate = knn.error_rate(y_true=Y_test, y_pred=Y_pred)
    odd_error_rate_list = np.append(odd_error_rate_list, np.array([[k, error_rate]]), axis=0)
# Calculate the holdout error rate as k is varied over even values
even_error_rate_list = np.empty((0, 2), float)
for k in range(2, 62, 2):
    knn = KNearestNeighborsClassifier(k=k)
    knn.fit(X_train, Y_train)
    Y_pred = knn.predict(X_test)
    error_rate = knn.error_rate(y_true=Y_test, y_pred=Y_pred)
    even_error_rate_list = np.append(even_error_rate_list, np.array([[k, error_rate]]), axis=0)
--Error rate --Fig. 5.12 (a)
plt.scatter(odd_error_rate_list[:, 0], odd_error_rate_list[:, 1], label='odd')
plt.scatter(even_error_rate_list[:, 0], even_error_rate_list[:, 1], label='even')
plt.legend()
plt.xlabel('k')
plt.ylabel('error rate')
plt.show()
--Calculation of rejection rate
reject_rate_list = np.empty((0, 2), float)
for k in range(2, 62, 2):
    knn = KNearestNeighborsClassifier(k=k)
    knn.fit(X_train, Y_train)
    Y_pred = knn.predict(X_test)
    reject_rate = knn.reject_rate(y_pred=Y_pred)
    reject_rate_list = np.append(reject_rate_list, np.array([[k, reject_rate]]), axis=0)
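Since this is a two-class problem, a majority tie among the k neighbors can only occur for even k, so odd k should never produce rejects. A quick check under that assumption:
knn = KNearestNeighborsClassifier(k=5)
knn.fit(X_train, Y_train)
assert knn.reject_rate(y_pred=knn.predict(X_test)) == 0.0  # No rejects expected for odd k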
--Reject rate --Fig. 5.12 (b)
plt.scatter(reject_rate_list[:, 0], reject_rate_list[:, 1])
plt.xlabel('k')
plt.ylabel('reject rate')
plt.show()
I have reproduced the execution examples of Chapter 5 of Hajipata in Python. If you find mistakes in the code or know a more efficient implementation, I would appreciate an edit request.