Algorithms / Decision Tree
What is Entropy? What is a Probability Distribution? What is the Gini Index? What is Information Gain?
Entropy
Entropy is a measure of how disordered a collection is.
""" Decision Tree / Entropy
Entropy tells us how disordered a collection of data is.
The more impure a feature is, the higher its entropy.
A probability distribution gives the relative frequency of each unique value.
"""
import pandas as pd
import numpy as np
# Dataset
A = ['apple']*1 + ['orange']*2 + ['banana']*2
A = pd.Series(A)
B = ['apple']*5 + ['orange']*2 + ['banana']*0
B = pd.Series(B)
# Probability distribution (by hand or with pandas)
P = [2/5, 2/5, 1/5]  # distribution of A, computed by hand
PA = A.value_counts(normalize=True)
PB = B.value_counts(normalize=True)
# ------------------------------------
# Entropy (Shannon model)
EA = -1 * np.sum(PA * np.log2(PA))
EB = -1 * np.sum(PB * np.log2(PB))
assert EB < EA
# ------------------------------------
# Output
outputs = [
    ["A =", A.values],
    ["B =", B.values],
    ["PA =", PA.values],
    ["PB =", PB.values],
    ["EA =", EA],
    ["EB =", EB],
]
for v in outputs:
    print(v[0], v[1])
"""
A = ['apple' 'orange' 'orange' 'banana' 'banana']
B = ['apple' 'apple' 'apple' 'apple' 'apple' 'orange' 'orange']
PA = [0.4 0.4 0.2]
PB = [0.71428571 0.28571429]
EA = 1.5219280948873621
EB = 0.863120568566631
"""
$$ H(X) = -\sum_{i=1}^{n} P(x_i) \log_2 P(x_i) $$
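As a quick cross-check (a minimal sketch, assuming SciPy is installed; the example above does not need it), `scipy.stats.entropy` with `base=2` reproduces the same values from the probability distributions:
""" Decision Tree / Entropy (cross-check with SciPy)
A minimal sketch, assuming SciPy is available; not required by the example above.
"""
import pandas as pd
from scipy.stats import entropy
A = pd.Series(['apple']*1 + ['orange']*2 + ['banana']*2)
B = pd.Series(['apple']*5 + ['orange']*2)
PA = A.value_counts(normalize=True)
PB = B.value_counts(normalize=True)
# entropy() with base=2 applies the same Shannon formula as above
print(entropy(PA, base=2))  # ~1.522, matches EA
print(entropy(PB, base=2))  # ~0.863, matches EB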
Gini index
Gini Index is a measure of impurity or diversity within a set of elements.
""" Decision Tree / Gini Index
Both entropy and Gini index can be used as impurity measures
for decision tree algorithms.
The Gini index is bounded between 0 and 1 (at most 1 - 1/k for k classes),
which makes it easy to compare Gini values across different features.
The Gini index is often preferred for its computational efficiency (no logarithm),
while entropy can be more sensitive to changes in the class probabilities.
"""
import pandas as pd
import numpy as np
# Dataset
A = ['apple']*1 + ['orange']*2 + ['banana']*2
A = pd.Series(A)
B = ['apple']*5 + ['orange']*2 + ['banana']*0
B = pd.Series(B)
# Probability distribution
PA = A.value_counts(normalize=True)
PB = B.value_counts(normalize=True)
# ------------------------------------
# Gini Index
giniA = 1 - np.sum(np.square(PA)) # Look Here
giniB = 1 - np.sum(np.square(PB))
assert giniB < giniA
# ------------------------------------
outputs = [
    ["A =", A.values],
    ["B =", B.values],
    ["PA =", PA.values],
    ["PB =", PB.values],
    ["giniA =", giniA],
    ["giniB =", giniB],
]
for v in outputs:
    print(v[0], v[1])
"""
A = ['apple' 'orange' 'orange' 'banana' 'banana']
B = ['apple' 'apple' 'apple' 'apple' 'apple' 'orange' 'orange']
PA = [0.4 0.4 0.2]
PB = [0.71428571 0.28571429]
giniA = 0.6399999999999999
giniB = 0.40816326530612246
"""
$$ \text{Gini}(X) = 1 - \sum_{x \in X} P(x)^2 $$
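To see how the two impurity measures compare, the sketch below (an added illustration, not part of the original example) evaluates both for a two-class distribution [p, 1-p]: the Gini index peaks at 0.5 and the entropy at 1.0 when the classes are perfectly mixed, and both drop to 0 for a pure set.
""" Decision Tree / Gini vs Entropy (illustrative sketch)
For a binary distribution [p, 1-p], Gini peaks at 0.5 and entropy at 1.0
when p = 0.5; both are 0 for a pure set (p = 0 or p = 1).
"""
import numpy as np
def gini(P):
    return 1 - np.sum(np.square(np.asarray(P)))
def shannon(P):
    P = np.asarray(P)
    P = P[P > 0]  # ignore zero-probability classes
    return -np.sum(P * np.log2(P))
for p in [0.1, 0.3, 0.5]:  # p=0.5 -> gini=0.500, entropy=1.000
    P = [p, 1 - p]
    print(f"p={p:.1f}  gini={gini(P):.3f}  entropy={shannon(P):.3f}")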
Information gain
Information gain measures the reduction in entropy obtained by splitting a dataset on an attribute.
""" Decision Trees / Info Gain
Information gain is a measure of the reduction in entropy.
As the entropy of an attribute increases, the information gain decreases.
We can find which attribute is the most useful or informative
and split the dataset based on that attribute.
Example for the windy attribute (8 rows with windy=False/weak, 6 with windy=True/strong):
IG = H - (8/14)*H_weak - (6/14)*H_strong
   = 0.940 - (8/14)*0.811 - (6/14)*1.00 = 0.048
Machine epsilon is the upper bound on the relative error due to floating-point rounding.
Here it is added to denominators and log arguments to avoid division by zero and log(0).
"""
import numpy as np
import pandas as pd
import pathlib
# Dataset
DIR = pathlib.Path(__file__).resolve().parent
df = pd.read_csv(DIR / 'data/play_tennis.csv')
# Train data
X = df.drop(['play'], axis=1)
y = df['play']
# Total entropy
def dataset_entropy():
    E = 0
    N = df['play'].value_counts()  # yes: 9, no: 5
    for v in df['play'].unique():  # yes/no
        P = N[v]/len(df['play'])   # probability
        E += -P*np.log2(P)
    return E
# Entropy for each attribute
def attribute_entropy(attr):
    E = 0
    eps = np.finfo(float).eps   # machine epsilon for the float
    targets = df.play.unique()  # class labels: yes/no
    values = df[attr].unique()  # attribute values, e.g. hot/mild/cool
    for v in values:
        ent = 0
        for t in targets:
            num = len(df[(df[attr] == v) & (df.play == t)])  # rows with this value and class
            den = len(df[df[attr] == v])                     # rows with this value
            fraction = num/(den + eps)
            ent += -fraction*np.log2(fraction + eps)  # entropy for one attribute value
        E += -(den/len(df))*ent  # weighted sum of all entropies (abs() below flips the sign)
    return abs(E)
# -------------------------------
# Attribute names
attributes = df.keys()[:-1]
# Entropy (for each attribute)
E = {}
for k in attributes:
    E[k] = attribute_entropy(k)
# Information gain (for each attribute)
IG = {}
for k in E:
    IG[k] = dataset_entropy() - E[k]
# One-liners (equivalent dict comprehensions)
E = {k: attribute_entropy(k) for k in attributes}
IG = {k: (dataset_entropy() - E[k]) for k in E}
assert E['outlook'] < E['humidity']
assert IG['outlook'] > IG['humidity'] # Look Here
# -------------------------------
outputs = [
    ["Dataset:", df],
    ["Describe:", df.describe()],
    ["Entropy:", dataset_entropy()],
    ["Entropy for each attribute:", E],
    ["Information gains:", IG],
]
for v in outputs:
    print("\n", v[0], "\n ", v[1])
"""
Dataset:
    outlook temp humidity windy play
 0    sunny  hot     high False   no
 1    sunny  hot     high  True   no
 2 overcast  hot     high False  yes
 3    rainy mild     high False  yes
 4    rainy cool   normal False  yes
 5    rainy cool   normal  True   no
 6 overcast cool   normal  True  yes
 7    sunny mild     high False   no
 8    sunny cool   normal False  yes
 9    rainy mild   normal False  yes
10    sunny mild   normal  True  yes
11 overcast mild     high  True  yes
12 overcast  hot   normal False  yes
13    rainy mild     high  True   no
Describe:
       outlook temp humidity windy play
count       14   14       14    14   14
unique       3    3        2     2    2
top      sunny mild     high False  yes
freq         5    6        7     8    9
Entropy:
0.9402859586706311
Entropy for each attribute:
{ 'outlook': 0.6935361388961914,
'temp': 0.9110633930116756,
'humidity': 0.7884504573082889,
'windy': 0.892158928262361 }
Information gains:
{ 'outlook': 0.24674981977443977,
'temp': 0.029222565658955535,
'humidity': 0.15183550136234225,
'windy': 0.048127030408270155 }
"""
$$ \text{IG}(X, A) = \text{H}(X) - \sum_{v \in \text{Values}(A)} \frac{|X_v|}{|X|} \cdot \text{H}(X_v) $$
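As a quick numeric check of the worked example in the docstring above (the windy attribute splits the 14 rows into 8 with windy=False and 6 with windy=True), the sketch below recomputes the information gain directly from the class counts read off the dataset printout:
""" Decision Trees / Info Gain (numeric check of the worked example)
Class counts taken from the dataset above:
whole dataset: 9 yes / 5 no, windy=False: 6 yes / 2 no, windy=True: 3 yes / 3 no.
"""
import numpy as np
def H(counts):
    P = np.array(counts) / np.sum(counts)
    return -np.sum(P * np.log2(P))
H_total = H([9, 5])  # ~0.940
H_false = H([6, 2])  # ~0.811 (the "weak" branch)
H_true = H([3, 3])   # 1.000 (the "strong" branch)
IG_windy = H_total - (8/14)*H_false - (6/14)*H_true
print(round(IG_windy, 3))  # ~0.048, matching IG['windy'] above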
Algorithm
We build a decision tree recursively, always splitting on the attribute with the highest information gain.
""" Decision Trees / ID3 Algorithm
Iterative Dichotomiser 3 (ID3) is a classification algorithm that follows a
greedy approach: it builds the decision tree top-down, always splitting on the
attribute with the highest information gain.
1. Calculate the entropy of the dataset
2. For each attribute:
   Calculate the entropy for each of its categorical values
   Calculate the information gain for the attribute
3. Select the attribute with the maximum information gain and split on it
4. Repeat on each branch until the subsets are pure
The `outlook` attribute has the highest information gain (0.247), so we select it as
the root node for the first level of splitting.
"""
import numpy as np
import pandas as pd
import pathlib
# Dataset
DIR = pathlib.Path(__file__).resolve().parent
df = pd.read_csv(DIR / 'data/play_tennis.csv')
# Train data
X = df.drop(['play'], axis=1)
y = df['play']
# Total entropy (for current dataframe)
def dataset_entropy(df):
    E = 0
    N = df['play'].value_counts()  # yes: 9, no: 5
    values = df['play'].unique()
    for v in values:              # yes/no
        P = N[v]/len(df['play'])  # probability
        E += -P*np.log2(P)
    return E
# Entropy for each attribute
def attribute_entropy(df, attr):
    E = 0
    eps = np.finfo(float).eps   # machine epsilon for the float
    targets = df.play.unique()  # class labels: yes/no
    values = df[attr].unique()  # attribute values, e.g. hot/mild/cool
    for v in values:
        ent = 0
        for t in targets:
            num = len(df[(df[attr] == v) & (df.play == t)])  # rows with this value and class
            den = len(df[df[attr] == v])                     # rows with this value
            fraction = num/(den + eps)
            ent += -fraction*np.log2(fraction + eps)  # entropy for one attribute value
        E += -(den/len(df))*ent  # weighted sum of all entropies (abs() below flips the sign)
    return abs(E)
# ---------------------------------------------------
# Find attribute with maximum information gain
def find_winner(df):
    IG = {}
    attributes = df.keys()[:-1]
    # Loop over the attributes in the dataframe and compute the info gains
    for attr in attributes:
        IG[attr] = dataset_entropy(df) - attribute_entropy(df, attr)
    winner = attributes[np.argmax(list(IG.values()))]  # attribute with maximum info gain
    return winner
# ---------------------------------------------------
# Construct the decision tree (dictionary)
def buildTree(df):
    tree = {}
    # Target column
    Class = df.keys()[-1]  # play
    # Attribute with maximum info gain
    node = find_winner(df)  # e.g. outlook
    tree[node] = {}
    # Distinct values of that attribute
    values = np.unique(df[node])  # e.g. overcast/rainy/sunny
    # Loop through the attribute values
    for value in values:
        subtable = df[df[node] == value].reset_index(drop=True)
        attr_values, counts = np.unique(subtable[Class], return_counts=True)
        if len(counts) == 1:  # pure subset -> leaf
            tree[node][value] = attr_values[0]
        else:
            subtable = subtable.drop(node, axis=1)
            tree[node][value] = buildTree(subtable)  # Recursive case
    return tree
decision_tree = buildTree(df)
# Print dictionary tree (recursion in case of subtrees)
def print_tree(tree, attr=None, i=0):
    if not attr:
        attr = next(iter(tree))  # attribute at the current tree node
    for key, subval in tree[attr].items():
        if isinstance(subval, str):  # Base case: leaf
            print(i*" ", attr, "=", key, ":", subval)
            continue
        print(i*" ", attr, "=", key, ":")
        print_tree(subval, i=i+1)  # Recursive case: subtree
    return
# Predict an unknown sample (only for attribute values seen in the train dataset)
def predict(X, tree):
    key = next(iter(tree))
    val = X[key]
    subval = tree[key][val]
    if isinstance(subval, str):  # Base case: leaf
        return subval
    subval = predict(X, subval)  # Recursive case: subtree
    return subval
print(decision_tree, "\n")
print_tree(decision_tree)
# Example usage
x = {'outlook': 'sunny', 'temp': 'mild', 'humidity': 'high', 'windy': False}
y = predict(x, decision_tree)
print("\nAttributes:", x)
print("Prediction:", y)
# Example usage 2
x = {'outlook': 'rainy', 'temp': 'mild', 'humidity': 'normal', 'windy': False}
y = predict(x, decision_tree)
print("\nAttributes:", x)
print("Prediction:", y)
"""
{'outlook': {'overcast': 'yes', 'rainy': {'windy': ...
outlook = overcast: yes
outlook = rainy:
  windy = False: yes
  windy = True: no
outlook = sunny:
  humidity = high: no
  humidity = normal: yes
Attributes: {
  'outlook': 'sunny',
  'temp': 'mild',
  'humidity': 'high',
  'windy': False}
Prediction: no
Attributes: {
  'outlook': 'rainy',
  'temp': 'mild',
  'humidity': 'normal',
  'windy': False}
Prediction: yes
"""