PROGRAMMING

  minte9
learningjourney




S R Q

K-Nearest

K is the number of nearest neighbors to use.
 
""" KNN Supervised machine learning (ML) algorithm
K is the number of nearest neighbors to use
Provide a training set of data points and their labels
Create a KNN classifier with K=3
Predict the label of a new data point
"""

from sklearn.neighbors import KNeighborsClassifier

# Training set: four 2-D points on the diagonal with alternating class labels.
X = [[0, 0], [1, 1], [2, 2], [3, 3]]
y = [0, 1, 0, 1]

# Build the classifier and train it in one step (fit() returns the estimator).
knn = KNeighborsClassifier(n_neighbors=3).fit(X, y)

# predict() takes a list of samples, so the single query point is wrapped.
# NOTE: [0,0] and [3,3] are both at squared distance 5 from [1,2], so the
# third neighbour is a tie between label 0 and label 1; the printed label
# depends on how the library breaks that tie.
new_point = [1, 2]
prediction = knn.predict([new_point])
print(prediction) # [0]

Pandas

p36 Transform the dataset into a DataFrame with the pandas library.
 
"""KNN Fruit classification (height, width / type)
Learn a function f that maps any combination 
of height and width of a fruit to a (predicted) fruit type
"""

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

# Measurements of 20 labelled fruits; position i in every list describes fruit i.
data = {
    'height': [
        3.91, 7.09, 10.48, 9.21, 7.95,
        7.62, 7.95, 4.69, 7.50, 7.11,
        4.15, 7.29, 8.49, 7.44, 7.86,
        3.93, 4.40, 5.5, 8.10, 8.69,
    ],
    'width': [
        5.76, 7.69, 7.32, 7.20, 5.90,
        7.51, 5.32, 6.19, 5.99, 7.02,
        5.60, 8.38, 6.52, 7.89, 7.60,
        6.12, 5.90, 4.5, 6.15, 5.82,
    ],
    'fruit': [
        'Mandarin', 'Apple', 'Lemon', 'Lemon', 'Lemon',
        'Apple', 'Mandarin', 'Mandarin', 'Lemon', 'Apple',
        'Mandarin', 'Apple', 'Lemon', 'Apple', 'Apple',
        'Apple', 'Mandarin', 'Lemon', 'Lemon', 'Lemon',
    ],
}

# Turn the column dictionary into a table and show it.
df = pd.DataFrame(data)
print(df)

# Features are the two measurements; the target is the fruit name.
feature_columns = ['height', 'width']
X = df[feature_columns].values
y = df['fruit'].values

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X, y)

# Classify the first sample on its own, then the whole batch of five.
samples = [[9, 3], [4, 5], [2, 5], [8, 9], [5, 7]]
prediction = knn.predict(samples[:1])
predictions = knn.predict(samples)

print(prediction)  # Lemon
print(predictions) # Lemon Mandarin Mandarin Apple Mandarin

Accuracy

p42 Evaluate the model score on the training and test datasets for k=3
 
"""KNN Evaluation
Evaluate the model on the training and test dataset
The score is the fraction of predicted labels that match the actual labels
1.0 means the model correctly predicted all (100%)
"""

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Training dataset: 20 labelled fruits (position i in every list is fruit i).
A = pd.DataFrame({
    'height': [
        3.91, 7.09, 10.48, 9.21, 7.95,
        7.62, 7.95, 4.69, 7.50, 7.11,
        4.15, 7.29, 8.49, 7.44, 7.86,
        3.93, 4.40, 5.5, 8.10, 8.69,
    ],
    'width': [
        5.76, 7.69, 7.32, 7.20, 5.90,
        7.51, 5.32, 6.19, 5.99, 7.02,
        5.60, 8.38, 6.52, 7.89, 7.60,
        6.12, 5.90, 4.5, 6.15, 5.82,
    ],
    'fruit': [
        'Mandarin', 'Apple', 'Lemon', 'Lemon', 'Lemon',
        'Apple', 'Mandarin', 'Mandarin', 'Lemon', 'Apple',
        'Mandarin', 'Apple', 'Lemon', 'Apple', 'Apple',
        'Apple', 'Mandarin', 'Lemon', 'Lemon', 'Lemon',
    ],
})

# Test dataset: five labelled fruits the model is not trained on.
B = pd.DataFrame({
    'height': [4, 4.47, 6.49, 7.51, 8.34],
    'width': [6.5, 7.13, 7, 5.01, 4.23],
    'fruit': ['Mandarin', 'Mandarin', 'Apple', 'Lemon', 'Lemon'],
})

# Split each table into its feature matrix and its label vector.
feature_columns = ['height', 'width']
X, y = A[feature_columns].values, A['fruit'].values
X2, y2 = B[feature_columns].values, B['fruit'].values

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X, y)

# Score the fitted model first on the training data, then on the test data.
# accuracy_score is the fraction of predictions equal to the actual labels;
# it is printed as a percentage. The original comments report 85% for the
# training set and 100% for the test set.
for samples, actual in ((X, y), (X2, y2)):
    predictions = knn.predict(samples)
    score = metrics.accuracy_score(actual, predictions)
    print(score * 100)
Score Graph

Score Graph

p50 Models between k=3 and k=7 perform optimally on the test set.
 
"""KNN plot score
Models between k=3 and k=7 perform optimally on the test set
They optimally balance overfitting and underfitting
"""
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import matplotlib.pyplot as plt

# Training dataset: 20 labelled fruits (position i in every list is fruit i).
A = pd.DataFrame({
    'height': [
        3.91, 7.09, 10.48, 9.21, 7.95, 7.62, 7.95, 4.69, 7.50, 7.11,
        4.15, 7.29, 8.49, 7.44, 7.86, 3.93, 4.40, 5.5, 8.10, 8.69,
    ],
    'width': [
        5.76, 7.69, 7.32, 7.20, 5.90, 7.51, 5.32, 6.19, 5.99, 7.02,
        5.60, 8.38, 6.52, 7.89, 7.60, 6.12, 5.90, 4.5, 6.15, 5.82,
    ],
    'fruit': [
        'Mandarin', 'Apple', 'Lemon', 'Lemon', 'Lemon', 'Apple', 'Mandarin',
        'Mandarin', 'Lemon', 'Apple', 'Mandarin', 'Apple', 'Lemon', 'Apple',
        'Apple', 'Apple', 'Mandarin', 'Lemon', 'Lemon', 'Lemon',
    ],
})

# Test dataset: five labelled fruits the models are not trained on.
B = pd.DataFrame({
    'height': [4, 4.47, 6.49, 7.51, 8.34],
    'width': [6.5, 7.13, 7, 5.01, 4.23],
    'fruit': ['Mandarin', 'Mandarin', 'Apple', 'Lemon', 'Lemon'],
})

X = A[['height', 'width']].values
X2 = B[['height', 'width']].values

y = A.fruit.values
y2 = B.fruit.values

k = []       # neighbour counts tried
score = []   # training accuracy (percent) for each k
score2 = []  # test accuracy (percent) for each k

# Fit one model per k, from 1 up to the number of training samples, and
# record its accuracy on both datasets.
for _k in range(1, len(X) + 1):
    clf = KNeighborsClassifier(n_neighbors=_k)
    clf.fit(X, y)

    _score = metrics.accuracy_score(y, clf.predict(X)) * 100
    _score2 = metrics.accuracy_score(y2, clf.predict(X2)) * 100

    k.append(_k)
    score.append(_score)
    score2.append(_score2)

    print(f'k={_k} | score: {_score} | score2: {_score2}')


# Plot train score
plt.scatter(k, score)  # data points
plt.plot(k, score, '-', label='train')  # line connecting the points

# Plot test score
plt.scatter(k, score2)
plt.plot(k, score2, '-', label='test')

# Plot configurations
# The x-limits run from max(k) down to min(k)+1, so the axis is reversed
# (large k on the left) and k=1 lies outside the visible range.
# NOTE(review): confirm that hiding the k=1 point is intended.
plt.axis([max(k), min(k) + 1, 0, 100])
plt.xlabel('number of nearest neighbours (k)', size=13)
plt.ylabel('accuracy score', size=13)
plt.title('Model Performance vs Complexity', size=20)
plt.legend()

# Output
plt.show()

Boundaries

p46 Decision boundaries of KNN on a graph (optimal fit for k=5)
Questions    
Last update: 2 days ago
Scikit, Linear Model