Cross Validation
Random Forests try to fix overfitting by using multiple decision trees.
""" Random Forest / Classifier (Tic Tac Toe)
Random Forests try to fix this overfitting by using multiple decision trees
that are slightly different and averaging the results.
Random Forests are using subsets of the dataset, randomly selected.
The generated trees have unique set of data.
The data is selected randomly for the original dataset, with replacement.
Each one of the tree has the same size as the original data.
With Cross-validation we put aside 25% of the data, before train.
Those test data will be use to measure how good the model is.
"""
import pathlib
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# Dataset
DIR = pathlib.Path(__file__).resolve().parent
df = pd.read_csv(DIR / 'data/ttt_dataset.csv')
# Encode labels
df_encoded = pd.DataFrame()
for col in df.columns:
    df_encoded[col] = LabelEncoder().fit_transform(df[col])
# Train and test data
X = df_encoded.drop(columns=["score", "is_terminal"])
Y = df_encoded['score']
X1, X2, Y1, Y2 = train_test_split(X, Y, test_size=0.25, random_state=0)
# ---------------------------------------------------
# Fitting the model
forest_model = RandomForestClassifier(n_estimators=30)
forest_model.fit(X1, Y1)
forest_score = forest_model.score(X2, Y2)
# ---------------------------------------------------
# Prediction
x_new = X2.iloc[0]
x_new = pd.DataFrame([x_new], columns=X2.columns)
y_pred = forest_model.predict(x_new)[0]
assert y_pred == 2
print("Test Data:"); print(X2, "\n")
print("Encoded:"); print(Y2, "\n")
print("Unknown:"); print(x_new, "\n")
print("Prediction:", y_pred)
print("Score:", round(forest_score,2))
"""
Test Data:
V1 V2 V3 V4 V5 V6 V7 V8 V9
4534 2 2 2 0 0 1 1 0 0
3544 1 2 0 1 0 0 2 0 2
5287 1 2 2 2 0 2 1 1 1
427 2 1 2 2 2 1 0 1 1
5969 2 1 2 2 2 0 1 1 1
... .. .. .. .. .. .. .. .. ..
28369 1 2 1 0 1 2 0 2 2
2253 1 2 2 2 0 0 0 1 0
2323 1 2 2 2 1 1 2 1 2
21741 2 1 0 1 0 0 2 2 1
10353 2 0 1 2 2 2 1 1 0
[7265 rows x 9 columns]
Encoded:
4534 2
3544 2
5287 0
427 2
5969 0
..
28369 0
2253 0
2323 1
21741 1
10353 2
Name: score
Unknown:
V1 V2 V3 V4 V5 V6 V7 V8 V9
4534 2 2 2 0 0 1 1 0 0
Prediction: 2
Score: 0.97
"""
Decision Trees
Random Forests use randomly selected subsets of the dataset.
""" Random Forest / Classifier (Play Tenis) - Trees
Random Forests try to fix this overfitting by using multiple decision trees.
Decistion Tree picks a criteria and a thresold.
Criteria specifies where to split, for instance length or with.
The thresold specifies what value of the criteria to split.
"""
import pathlib
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
# Dataset
DIR = pathlib.Path(__file__).resolve().parent
df = pd.read_csv(DIR / 'data/play_tennis.csv')
# Encode labels
df_encoded = pd.DataFrame()
for col in df.columns:
    df_encoded[col] = LabelEncoder().fit_transform(df[col])
# Train and test data
X = df_encoded.drop(columns=["play"])
Y = df_encoded['play']
X1, X2, Y1, Y2 = train_test_split(X, Y, random_state=42)
# Fitting the model
forest_model = RandomForestClassifier(max_depth=3, random_state=66)
forest_model.fit(X1, Y1)
forest_score = forest_model.score(X2, Y2)
# Prediction
x_new = X2.iloc[0] # expect 1
x_new = pd.DataFrame([x_new], columns=X2.columns)
y_pred = forest_model.predict(x_new)[0]
assert y_pred == 1
# ---------------------------------------------------------------
# Output
output_trees = []
for t in forest_model.estimators_:
    out_tree = tree.export_text(t, feature_names=list(X.columns))
    output_trees.append(out_tree)
# ---------------------------------------------------------------
print("Test Data:"); print(X2, "\n")
print("Encoded:"); print(Y2, "\n")
for i in range(3):
    print("DecisionTree", i+1); print(output_trees[i])
print("Unknown:"); print(x_new, "\n")
print("Prediction:", y_pred)
print("Score:", round(forest_score,2))
"""
Test Data:
outlook temp humidity windy
9 1 2 1 0
11 0 2 0 1
0 2 1 0 0
12 0 1 1 0
Encoded:
9 1
11 1
0 0
12 1
Name: play, dtype: int64
DecisionTree 1
|--- outlook <= 0.50
| |--- class: 1.0
|--- outlook > 0.50
| |--- temp <= 1.00
| | |--- windy <= 0.50
| | | |--- class: 1.0
| | |--- windy > 0.50
| | | |--- class: 0.0
| |--- temp > 1.00
| | |--- windy <= 0.50
| | | |--- class: 1.0
| | |--- windy > 0.50
| | | |--- class: 0.0
DecisionTree 2
|--- windy <= 0.50
| |--- outlook <= 1.50
| | |--- class: 1.0
| |--- outlook > 1.50
| | |--- class: 0.0
|--- windy > 0.50
| |--- temp <= 1.50
| | |--- class: 0.0
| |--- temp > 1.50
| | |--- class: 1.0
DecisionTree 3
|--- humidity <= 0.50
| |--- temp <= 1.50
| | |--- class: 1.0
| |--- temp > 1.50
| | |--- windy <= 0.50
| | | |--- class: 0.0
| | |--- windy > 0.50
| | | |--- class: 0.0
|--- humidity > 0.50
| |--- class: 1.0
Unknown:
outlook temp humidity windy
9 1 2 1 0
Prediction: 1
Score: 0.75
"""
Best Params
Use GridSearchCV to get information about the best estimator, best score, etc.
""" Random Forest / Classifier (Play Tennis) - Best Params
"""
import pathlib
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Dataset
DIR = pathlib.Path(__file__).resolve().parent
df = pd.read_csv(DIR / 'data/play_tennis.csv')
# Encode labels
df_encoded = pd.DataFrame()
for col in df.columns:
    df_encoded[col] = LabelEncoder().fit_transform(df[col])
# Train and test data
X = df_encoded.drop(columns=["play"])
Y = df_encoded['play']
X1, X2, Y1, Y2 = train_test_split(X, Y, test_size=0.2, random_state=42)
# --------------------------------------------------------------------
# Best parameters
parameters = {
    'max_depth': [2, 3, 4, 10],
    'n_estimators': [5, 10, 20]
}
model = RandomForestClassifier(random_state=42)
grid = GridSearchCV(model, parameters, cv=3)
grid.fit(X1, Y1)
print("Best Parameters:", grid.best_params_) # max_depth: 2, n_estimators: 10
print("Best Score:", round(grid.best_score_, 2), "\n")
# --------------------------------------------------------------------
# Fitting best model
forest_model = RandomForestClassifier(n_estimators=10, max_depth=2, random_state=0)
forest_model.fit(X1, Y1)
forest_score = forest_model.score(X2, Y2)
# Prediction
x_new = X2.iloc[2] # expect 0
x_new = pd.DataFrame([x_new], columns=X2.columns)
y_pred = forest_model.predict(x_new)[0]
assert y_pred == 0
print("Test Data:"); print(X2, "\n")
print("Encoded:"); print(Y2, "\n")
print("Unknown:"); print(x_new, "\n")
print("Prediction:", y_pred)
print("Score:", round(forest_score,2))
"""
Best Parameters: {'max_depth': 2, 'n_estimators': 10}
Best Score: 0.72
Test Data:
outlook temp humidity windy
9 1 2 1 0
11 0 2 0 1
0 2 1 0 0
Encoded:
9 1
11 1
0 0
Name: play
Unknown:
outlook temp humidity windy
0 2 1 0 0
Prediction: 0
Score: 1.0
"""
Gradient Boosting
Trees are built in a serial manner; each tree tries to correct the mistakes of the previous one.
""" Gradient Boostring Classifier (cancer)
In contract to the random forest, gradient boosting works by
building trees in a serial manner, where each tree tries to correct
the mistakes of the previous one.
By default, it uses 100 trees, maxim depth 3 and learning rate 0.1
With the default params we get 100% accuracy on train data, which could
lead to overfitting.
We use a stronger pre-prunning by limiting the maximum depth.
"""
import pathlib
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Dataset
DIR = pathlib.Path(__file__).resolve().parent
df = pd.read_csv(DIR / 'data/play_tennis.csv')
# Encode labels
df_encoded = pd.DataFrame()
for col in df.columns:
    df_encoded[col] = LabelEncoder().fit_transform(df[col])
# Train and test data
X = df_encoded.drop(columns=["play"])
Y = df_encoded['play']
X1, X2, Y1, Y2 = train_test_split(X, Y, random_state=42)
# ------------------------------------------------------------
# Default params
model = GradientBoostingClassifier(random_state=0)
model.fit(X1, Y1)
score1 = model.score(X1, Y1)
score2 = model.score(X2, Y2)
# Reduce overfitting
model = GradientBoostingClassifier(random_state=0, max_depth=1) # Look Here
model.fit(X1, Y1)
score3 = model.score(X1, Y1)
score4 = model.score(X2, Y2)
# ------------------------------------------------------------
# Prediction
x_new = X2.iloc[0] # expect 1
x_new = pd.DataFrame([x_new], columns=X2.columns)
y_pred = model.predict(x_new)[0]
assert y_pred == 1
# Output
print("Test Data:"); print(X2, "\n")
print("Encoded:"); print(Y2, "\n")
print("GradientBoostingClassifier(max_depth=3)")
print(" Training set:", score1)
print(" Test set:", round(score2, 2), "\n")
print("GradientBoostingClassifier(max_depth=1)")
print(" Training score:", round(score3, 2))
print(" Test score:", round(score4, 2), "\n")
print("Unknown:"); print(x_new, "\n")
print("Prediction:", y_pred)
"""
Test Data:
outlook temp humidity windy
9 1 2 1 0
11 0 2 0 1
0 2 1 0 0
12 0 1 1 0
Encoded:
9 1
11 1
0 0
12 1
GradientBoostingClassifier(max_depth=3)
Training set: 1.0
Test set: 0.75
GradientBoostingClassifier(max_depth=1)
Training score: 0.9
Test score: 1.0
Unknown:
outlook temp humidity windy
9 1 2 1 0
Prediction: 1
"""