# minte9 LearnRemember

### Entropy

p61 Entropy is a measure of how disordered a collection is.

""" Decision Tree / Entropy

Entropy tells us how disordered a collection of data is.
The more impure a feature is, the higher the entropy.
The probability distribution is the frequency of the unique values.
"""

import pandas as pd
import numpy as np

# Dataset
A = ['apple']*1 + ['orange']*2 + ['banana']*2
A = pd.Series(A)

B = ['apple']*5 + ['orange']*2 + ['banana']*0
B = pd.Series(B)

# Probability distribution (by hand or with pandas)
P = [2/5, 2/5, 1/5]
PA = A.value_counts(normalize=True)
PB = B.value_counts(normalize=True)

# ------------------------------------

# Entropy (Shannon model)
EA = -1 * np.sum(PA * np.log2(PA))
EB = -1 * np.sum(PB * np.log2(PB))
assert EB < EA

# ------------------------------------

# Output
outputs = [
    ["A  =", A.values],
    ["B  =", B.values],
    ["PA =", PA.values],
    ["PB =", PB.values],
    ["EA =", EA],
    ["EB =", EB],
]
for v in outputs:
    print(v[0], v[1])

"""
A  = ['apple' 'orange' 'orange' 'banana' 'banana']
B  = ['apple' 'apple' 'apple' 'apple' 'apple' 'orange' 'orange']
PA = [0.4 0.4 0.2]
PB = [0.71428571 0.28571429]
EA = 1.5219280948873621
EB = 0.863120568566631
"""

$$H(X) = -\sum_{i=1}^{n} P(x_i) \log_2 P(x_i)$$
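
As a quick cross-check (assuming scipy is available), scipy.stats.entropy with base=2 computes the same Shannon entropy:

from scipy.stats import entropy

print(entropy([2/5, 2/5, 1/5], base=2)) # 1.5219... (EA)
print(entropy([5/7, 2/7], base=2))      # 0.8631... (EB)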

### Gini index

p61 Gini Index is a measure of impurity or diversity within a set of elements.

""" Decision Tree / Gini Index

Both entropy and Gini index can be used as impurity measures
for decision tree algorithms.

The Gini index is between 0 and 1, which makes it easy to compare across
different features. While the Gini index is often preferred for its
computational efficiency, entropy may be more sensitive to changes
in class probabilities.
"""

import pandas as pd
import numpy as np

# Dataset
A = ['apple']*1 + ['orange']*2 + ['banana']*2
A = pd.Series(A)

B = ['apple']*5 + ['orange']*2 + ['banana']*0
B = pd.Series(B)

# Probability distribution
PA = A.value_counts(normalize=True)
PB = B.value_counts(normalize=True)

# ------------------------------------

# Gini Index
giniA = 1 - np.sum(np.square(PA)) # Look Here
giniB = 1 - np.sum(np.square(PB))
assert giniB < giniA

# ------------------------------------

outputs = [
    ["A  =", A.values],
    ["B  =", B.values],
    ["PA =", PA.values],
    ["PB =", PB.values],
    ["giniA =", giniA],
    ["giniB =", giniB],
]
for v in outputs:
    print(v[0], v[1])

"""
A  = ['apple' 'orange' 'orange' 'banana' 'banana']
B  = ['apple' 'apple' 'apple' 'apple' 'apple' 'orange' 'orange']
PA = [0.4 0.4 0.2]
PB = [0.71428571 0.28571429]
giniA = 0.6399999999999999
giniB = 0.40816326530612246
"""

$$\text{Gini}(X) = 1 - \sum_{x \in X} P(x)^2$$
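
A minimal sketch of the bounds claim above: the Gini index is 0 for a pure set, and its maximum for k equally likely classes is 1 - 1/k, approaching 1 as k grows:

import numpy as np

def gini(p):
    return 1 - np.sum(np.square(p))

print(gini([1.0]))      # 0.0  (pure set)
print(gini([1/2, 1/2])) # 0.5  (two-class maximum)
print(gini([1/4]*4))    # 0.75 (four-class maximum, 1 - 1/4)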

### Information gain

p66 We can measure the reduction in entropy by splitting a dataset based on an attribute.

""" Decision Trees / Info Gain

Information gain is a measure of the reduction in entropy.
As the entropy of an attribute increases, the information gain decreases.

We can find which attribute is the most useful or informative
and split the dataset based on that attribute.

IG = H - (8/14)H_weak - (6/14)H_strong
IG = 0.940 - (8/14)(0.811) - (6/14)(1.000) = 0.048

Machine epsilon is the upper bound on the relative error due to rounding.
It is a small value added to the denominator in order to avoid division by zero.
"""

import numpy as np
import pandas as pd
import pathlib

# Dataset (classic play-tennis data; file name assumed)
DIR = pathlib.Path(__file__).resolve().parent
df = pd.read_csv(DIR / 'playtennis.csv')

# Train data
X = df.drop(['play'], axis=1)
y = df['play']

# Total entropy
def dataset_entropy():
    E = 0
    N = df['play'].value_counts() # yes: 9, no: 5
    for v in df['play'].unique(): # yes/no
        P = N[v]/len(df['play'])  # probability
        E += -P*np.log2(P)
    return E

# Entropy for each attribute
def attribute_entropy(attr):
    E = 0
    eps = np.finfo(float).eps # machine epsilon, avoids division by zero
    targets = df.play.unique()
    values = df[attr].unique()
    for v in values: # e.g. cool/mild/hot for temp
        ent = 0
        for t in targets: # yes/no
            num = len(df[(df[attr] == v) & (df.play == t)]) # numerator
            den = len(df[df[attr] == v]) # denominator
            fraction = num/(den + eps)
            ent += -fraction*np.log2(fraction + eps) # entropy for one value
        E += -(den/len(df))*ent # weighted sum over values
    return abs(E)

# -------------------------------

# Attribute names
attributes = df.keys()[:-1]

# Entropy (for each attribute)
E = {}
for k in attributes:
    E[k] = attribute_entropy(k)

# Information gain (for each attribute)
IG = {}
for k in E:
    IG[k] = dataset_entropy() - E[k]

# One-liners (equivalent)
E  = {k: attribute_entropy(k) for k in attributes}
IG = {k: (dataset_entropy() - E[k]) for k in E}

assert E['outlook']  < E['humidity']
assert IG['outlook'] > IG['humidity'] # Look Here

# -------------------------------

outputs = [
    ["Dataset:", df],
    ["Describe:", df.describe()],
    ["Entropy:", dataset_entropy()],
    ["Entropy for each attribute:", E],
    ["Information gains:", IG],
]
for v in outputs:
    print("\n", v[0], "\n ", v[1])

"""
Dataset:
outlook  temp humidity  windy play
0      sunny   hot     high  False   no
1      sunny   hot     high   True   no
2   overcast   hot     high  False  yes
3      rainy  mild     high  False  yes
4      rainy  cool   normal  False  yes
5      rainy  cool   normal   True   no
6   overcast  cool   normal   True  yes
7      sunny  mild     high  False   no
8      sunny  cool   normal  False  yes
9      rainy  mild   normal  False  yes
10     sunny  mild   normal   True  yes
11  overcast  mild     high   True  yes
12  overcast   hot   normal  False  yes
13     rainy  mild     high   True   no

Describe:
outlook  temp humidity  windy play
count       14    14       14     14   14
unique       3     3        2      2    2
top      sunny  mild     high  False  yes
freq         5     6        7      8    9

Entropy:
0.9402859586706311

Entropy for each attribute:
{ 'outlook':     0.6935361388961914,
'temp':        0.9110633930116756,
'humidity':    0.7884504573082889,
'windy':       0.892158928262361 }

Information gains:
{ 'outlook':     0.24674981977443977,
'temp':        0.029222565658955535,
'humidity':    0.15183550136234225,
'windy':       0.048127030408270155 }
"""

$$\text{IG}(X, A) = \text{H}(X) - \sum_{v \in \text{Values}(A)} \frac{|X_v|}{|X|} \cdot \text{H}(X_v)$$
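
The hand calculation for the windy attribute can be verified directly; the counts come from the dataset above (windy=False: 6 yes / 2 no, windy=True: 3 yes / 3 no):

import numpy as np

def H(p):
    p = np.asarray(p)
    return -np.sum(p * np.log2(p))

H_total  = H([9/14, 5/14]) # 0.940
H_weak   = H([6/8, 2/8])   # 0.811 (windy = False)
H_strong = H([3/6, 3/6])   # 1.000 (windy = True)

print(H_total - (8/14)*H_weak - (6/14)*H_strong) # ~0.048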

### Algorithm

We build a decision tree recursively, giving priority to the attributes with the highest information gain.

""" Decision Trees / ID3 Algorithm

ID3 (Iterative Dichotomiser 3) is a classification algorithm that follows
a greedy approach, building a decision tree by giving priority to the
attribute with the highest information gain.

1. Calculate the entropy of the dataset
2. For each attribute:
   calculate the entropy for all its categorical values,
   then the information gain for the attribute
3. Split on the attribute with maximum information gain
4. Repeat recursively for each branch

The outlook attribute has the highest information gain (0.247), so we
select it as the root node for the first level of splitting.
"""

import numpy as np
import pandas as pd
import pathlib

# Dataset (same play-tennis data; file name assumed)
DIR = pathlib.Path(__file__).resolve().parent
df = pd.read_csv(DIR / 'playtennis.csv')

# Train data
X = df.drop(['play'], axis=1)
y = df['play']

# Total entropy (for current dataframe)
def dataset_entropy(df):
    E = 0
    N = df['play'].value_counts() # yes: 9, no: 5
    values = df['play'].unique()
    for v in values: # yes/no
        P = N[v]/len(df['play'])  # probability
        E += -P*np.log2(P)
    return E

# Entropy for each attribute
def attribute_entropy(df, attr):
    E = 0
    eps = np.finfo(float).eps # machine epsilon, avoids division by zero
    targets = df.play.unique()
    values = df[attr].unique()
    for v in values: # e.g. cool/mild/hot for temp
        ent = 0
        for t in targets: # yes/no
            num = len(df[(df[attr] == v) & (df.play == t)]) # numerator
            den = len(df[df[attr] == v]) # denominator
            fraction = num/(den + eps)
            ent += -fraction*np.log2(fraction + eps) # entropy for one value
        E += -(den/len(df))*ent # weighted sum over values
    return abs(E)

# ---------------------------------------------------

# Find the attribute with maximum information gain
def find_winner(df):
    IG = {}
    attributes = df.keys()[:-1]

    # Loop over the attributes and compute info gains
    for attr in attributes:
        IG[attr] = dataset_entropy(df) - attribute_entropy(df, attr)

    winner = attributes[np.argmax(list(IG.values()))] # max info gain
    return winner
# ---------------------------------------------------

# Construct the decision tree (dictionary)
def buildTree(df):
    tree = {}

    # Target column
    Class = df.keys()[-1] # play

    # Attribute with maximum info gain
    node = find_winner(df) # outlook
    tree[node] = {}

    # Distinct values of that attribute
    values = np.unique(df[node]) # overcast/rainy/sunny

    # Loop through the attribute values
    for value in values:
        subtable = df[df[node] == value].reset_index(drop=True)
        attr_values, counts = np.unique(subtable[Class], return_counts=True)

        if len(counts) == 1: # pure subset
            tree[node][value] = attr_values[0]
        else:
            subtable = subtable.drop(node, axis=1)
            tree[node][value] = buildTree(subtable) # recursive case

    return tree

decision_tree = buildTree(df)

# Print the dictionary tree (recursing into subtrees)
def print_tree(tree, attr=None, i=0):
    if not attr:
        attr = next(iter(tree)) # attribute in the current tree node

    for key, subval in tree[attr].items():

        if isinstance(subval, str): # base case (leaf)
            print(i*" ", attr, "=", key, ":", subval)
            continue

        print(i*" ", attr, "=", key, ":")
        print_tree(subval, i=i+1) # recursive case

    return

# Predict an unseen sample (only for attribute values seen in training)
def predict(X, tree):
    key = next(iter(tree))
    val = X[key]
    subval = tree[key][val]

    if isinstance(subval, str): # base case (leaf)
        return subval

    return predict(X, subval) # recursive case

print(decision_tree, "\n")
print_tree(decision_tree)

# Example usage
x = {'outlook': 'sunny', 'temp': 'mild', 'humidity': 'high', 'windy': False}
y = predict(x, decision_tree)
print("\nAttributes:", x)
print("Prediction:", y)

# Example usage 2
x = {'outlook': 'rainy', 'temp': 'mild', 'humidity': 'normal', 'windy': False}
y = predict(x, decision_tree)
print("\nAttributes:", x)
print("Prediction:", y)

"""
{'outlook': {'overcast': 'yes', 'rainy': {'temp': {'cool': {'humidity': ...

outlook = overcast: yes
outlook = rainy:
temp = cool:
humidity = normal:
windy = False: yes
windy = True: no
temp = mild:
humidity = high:
windy = False: yes
windy = True: no
humidity = normal: yes
outlook = sunny:
temp = cool: yes
temp = hot: no
temp = mild:
humidity = high: no
humidity = normal: yes

Attributes: {
'outlook': 'sunny',
'temp': 'mild',
'humidity': 'high',
'windy': False}
Prediction: no

Attributes: {
'outlook': 'rainy',
'temp': 'mild',
'humidity': 'normal',
'windy': True}
Prediction: yes
"""

