PROGRAMMING

  minte9
learningjourney




S R Q

Iris Dataset

p13 Predict the species of a new iris based on petal length and width.
 
""" Iris Species Classifier
Learn model that predicts the species of a new iris
based on known measurements (length and width of petals).

It's difficult to plot datasets with more than 2-3 features.
Pair plots use all possible pairs of features.

The data points are colored according to the species the iris belongs to.
From the plots, we can see that the three classes are well separated.
This means that an ML model will be able to learn to separate them.
"""

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt

dataset = load_iris()

# Overview of what the loaded dataset object exposes
print('Keys: ', dataset.keys())
    # 'data', 'target', 'frame', 'target_names',
    # 'DESCR', 'feature_names', 'filename', 'data_module'

# Only the head of the description is printed
print(dataset['DESCR'][:193])
    # Number of Instances: 150 (50 in each of three classes)
    # Number of Attributes: 4 numeric

# Class names first, then the measured features
for key in ('target_names', 'feature_names'):
    print(dataset[key])
    # ['setosa' 'versicolor' 'virginica']
    #
    # sepal length (cm)
    # sepal width (cm)
    # petal length (cm)
    # petal width (cm)

features, labels = dataset.data, dataset.target

print(features.shape)  # (number of samples, number of features)
    # (150, 4)

print(features[:2])
    # [5.1 3.5 1.4 0.2]
    # [4.9 3.  1.4 0.2]

# Species are encoded as integers, 0 to 2
print(labels[:2])
print(labels[148:])
    # [0, 0]
    # [2, 2]

# random_state=0 fixes the seed, so the split is the same on every run
X1, X2, y1, y2 = train_test_split(features, labels, random_state=0)

for label, part in (('X1', X1), ('X2', X2), ('y1', y1), ('y2', y2)):
    print(label + ' shape: ', part.shape)
    # X1 shape:  (112, 4)
    # X2 shape:  (38, 4)
    # y1 shape:  (112,)
    # y2 shape:  (38,)

# Pair plot of the training samples, colored by species label
df = pd.DataFrame(X1, columns=dataset['feature_names'])
scatter_options = dict(
    c=y1,
    figsize=(15, 15),
    marker='o',
    s=60,
    alpha=.8,
    diagonal='none',
)
pd.plotting.scatter_matrix(df, **scatter_options)

plt.suptitle('Iris features matrix')
plt.show()

KNN Classifier

p20 Build the actual ML model with the KNeighborsClassifier algorithm.
 
""" Iris Species KNN Classifier
The most important parameter is the number of neighbors (k)

Our model predicts that this new iris belongs to class 0, 
meaning its species is setosa.
"""

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import matplotlib.pyplot as plt

dataset = load_iris()
X1, X2, y1, y2 = train_test_split(dataset.data, dataset.target, random_state=0)

# Fit a nearest-neighbor model (k=1) on the training split
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X1, y1)

# One new sample, shaped (1, 4): a single row with four features
X_new = np.array([[5, 2.9, 1, 0.2]])
y_new = knn.predict(X_new)
print("Prediction:", y_new)
print("Predicted target:", dataset['target_names'][y_new])
    # [0]
    # setosa

df = pd.DataFrame(X1, columns=dataset['feature_names'])

# One figure per flower part: training samples colored by label,
# with the new sample drawn on top as a red cross.
# Each entry: title, x column, y column, x index and y index in X_new.
part_plots = (
    ("Petals", 'petal length (cm)', 'petal width (cm)', 2, 3),
    ("Sepals", 'sepal length (cm)', 'sepal width (cm)', 0, 1),
)
for title, x_column, y_column, x_idx, y_idx in part_plots:
    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.set_xlabel('length (cm)')
    ax.set_ylabel('width (cm)')
    ax.scatter(df[x_column], df[y_column], c=y1)
    ax.scatter(X_new[0, x_idx], X_new[0, y_idx], c='r', marker='x', s=100)
    ax.grid()

# Scatter matrix of the training data, then the new point on every
# off-diagonal cell (column feature on x, row feature on y)
matrix_options = dict(
    c=y1,
    figsize=(15, 15),
    marker='o',
    s=60,
    alpha=.8,
    diagonal='none',
)
axes = pd.plotting.scatter_matrix(df, **matrix_options)

n_features = X_new.shape[1]
for row in range(n_features):
    for col in range(n_features):
        if row != col:
            axes[row, col].scatter(
                X_new[:, col], X_new[:, row], c='r', marker='x', s=200
            )

plt.show()

Model Evaluation

p23 This is where the test set is useful.
 
""" Iris species Model Evaluation
We make a prediction for each iris in the test dataset and
compare it against its known label.

For this model the test set accuracy is 0.97.
This means that we made the right prediction for 97% 
of the irises in the test dataset.
"""

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

dataset = load_iris()
X1, X2, y1, y2 = train_test_split(dataset.data, dataset.target, random_state=0)

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X1, y1)

# Accuracy by hand: the fraction of test predictions equal to the known labels
y_new = knn.predict(X2)
matches = y_new == y2
score = matches.mean()
print(round(score, 2))
    # 0.97

# Same figure obtained from the classifier's own score method
score = knn.score(X2, y2)
print(round(score, 2))
    # 0.97

Questions    
Last update: 48 days ago
Datasets, Diabetes