
Entropy

Entropy is a measure of how disordered a collection is.
 
""" Decision Tree / Entropy

Entropy tells us how disordered a collection of data is.
The more impure a feature is, the higher its entropy.
The probability distribution is given by the relative frequencies of the unique values.
"""

import pandas as pd
import numpy as np

# Dataset
A = ['apple']*1 + ['orange']*2 + ['banana']*2
A = pd.Series(A)

B = ['apple']*5 + ['orange']*2 + ['banana']*0
B = pd.Series(B)

# Probability distribution (by hand or pandas)
P  = [2/5, 2/5, 1/5]  # for A, computed by hand
PA = A.value_counts(normalize=True)
PB = B.value_counts(normalize=True)

# ------------------------------------

# Entropy (Shannon model)
EA = -1 * np.sum(PA * np.log2(PA))
EB = -1 * np.sum(PB * np.log2(PB))
assert EB < EA

# ------------------------------------

# Output
outputs = [
    ["A  =", A.values],
    ["B  =", B.values],
    ["PA =", PA.values],
    ["PB =", PB.values],
    ["EA =", EA],
    ["EB =", EB],
]
for v in outputs: 
    print(v[0], v[1])

"""
    A  = ['apple' 'orange' 'orange' 'banana' 'banana']
    B  = ['apple' 'apple' 'apple' 'apple' 'apple' 'orange' 'orange']
    PA = [0.4 0.4 0.2]
    PB = [0.71428571 0.28571429]
    EA = 1.5219280948873621
    EB = 0.863120568566631
"""
$$ H(X) = -\sum_{i=1}^{n} P(x_i) \log_2 P(x_i) $$
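As a quick cross-check (a sketch of mine, assuming scipy is installed), the same entropies can be obtained with scipy.stats.entropy, which computes the Shannon entropy of a probability distribution:

# Cross-check with scipy (sketch; assumes scipy is available)
import pandas as pd
from scipy.stats import entropy

A = pd.Series(['apple']*1 + ['orange']*2 + ['banana']*2)
B = pd.Series(['apple']*5 + ['orange']*2)

PA = A.value_counts(normalize=True)  # [0.4, 0.4, 0.2]
PB = B.value_counts(normalize=True)  # [5/7, 2/7]

EA = entropy(PA, base=2)  # ~1.522
EB = entropy(PB, base=2)  # ~0.863
assert EB < EA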

Gini index

Gini Index is a measure of impurity or diversity within a set of elements.
 
""" Decision Tree / Gini Index

Both entropy and Gini index can be used as impurity measures 
for decision tree algorithms. 

The Gini index is bounded between 0 and 1, which makes it easy to compare across different features.
While the Gini index is often preferred for its computational efficiency,
entropy may be more sensitive to changes in class probabilities.
"""

import pandas as pd
import numpy as np

# Dataset
A = ['apple']*1 + ['orange']*2 + ['banana']*2
A = pd.Series(A)

B = ['apple']*5 + ['orange']*2 + ['banana']*0
B = pd.Series(B)

# Probability distribution
PA = A.value_counts(normalize=True)
PB = B.value_counts(normalize=True)

# ------------------------------------

# Gini Index
giniA = 1 - np.sum(np.square(PA)) # Look Here
giniB = 1 - np.sum(np.square(PB))
assert giniB < giniA

# ------------------------------------

outputs = [
    ["A  =", A.values],
    ["B  =", B.values],
    ["PA =", PA.values],
    ["PB =", PB.values],
    ["giniA =", giniA],
    ["giniB =", giniB],
]
for v in outputs: 
    print(v[0], v[1])

"""
    A  = ['apple' 'orange' 'orange' 'banana' 'banana']
    B  = ['apple' 'apple' 'apple' 'apple' 'apple' 'orange' 'orange']
    PA = [0.4 0.4 0.2]
    PB = [0.71428571 0.28571429]
    giniA = 0.6399999999999999
    giniB = 0.40816326530612246
"""
$$ \text{Gini}(X) = 1 - \sum_{x \in X} P(x)^2 $$
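To make the comparison between the two impurity measures concrete (a small sketch of mine, not from the source), the snippet below evaluates both for a binary class probability p; Gini peaks at 0.5 and entropy at 1.0, both when p = 0.5:

# Gini vs entropy for a binary distribution [p, 1-p] (sketch)
import numpy as np

for p in [0.1, 0.3, 0.5, 0.7, 0.9]:
    P = np.array([p, 1 - p])
    gini = 1 - np.sum(np.square(P))  # max 0.5 at p = 0.5
    ent  = -np.sum(P * np.log2(P))   # max 1.0 at p = 0.5
    print(f"p={p:.1f}  gini={gini:.3f}  entropy={ent:.3f}")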

Information gain

We can measure the reduction in entropy obtained by splitting a dataset based on an attribute.
 
""" Decision Trees / Info Gain

Information gain is a measure of the reduction in entropy.
As the entropy of an attribute increases, the information gain decreases.

We can find which attribute is the most useful or informative 
and split the dataset based on that attribute.

IG(windy) = H - (8/14)*H_false - (6/14)*H_true
IG(windy) = 0.940 - (8/14)*0.811 - (6/14)*1.000 = 0.048

Machine epsilon is the upper bound on the relative error due to rounding.
Here it is a small value added to the denominator (and inside the log)
in order to avoid division by zero and log(0).
"""

import numpy as np
import pandas as pd
import pathlib

# Dataset
DIR = pathlib.Path(__file__).resolve().parent
df = pd.read_csv(DIR / 'data/play_tennis.csv')

# Train data
X = df.drop(['play'], axis=1)
y = df['play']

# Total entropy
def dataset_entropy():
    E = 0
    N = df['play'].value_counts() # yes: 9, no: 5
    for v in df['play'].unique(): # yes/no
        P = N[v]/len(df['play'])  # probability
        E += -P*np.log2(P)
    return E

# Entropy for each attribute
def attribute_entropy(attr):
    E = 0
    eps = np.finfo(float).eps   # machine epsilon for the float 
    targets = df.play.unique()
    values = df[attr].unique()
    for v in values: # cool/hot
        ent = 0
        for t in targets: # yes,no
            num = len(df[attr][df[attr] == v][df.play == t]) # rows where attr == v and play == t
            den = len(df[attr][df[attr] == v])               # rows where attr == v
            fraction = num/(den + eps)
            ent += -fraction*np.log2(fraction + eps) # entropy of the subset where attr == v
        E += -(den/len(df))*ent # weighted sum of the subset entropies (abs below fixes the sign)
    return abs(E)

# -------------------------------

# Attributes names
attributes = df.keys()[:-1]

# Entropy (for each attribute)
E = {} 
for k in attributes:
    E[k] = attribute_entropy(k)

# Information gain (for each attribute)
IG = {}
for k in E:
    IG[k] = dataset_entropy() - E[k]

# Oneliners
E  = {k:attribute_entropy(k) for k in attributes}
IG = {k:(dataset_entropy() - E[k]) for k in E} 

assert E['outlook']  < E['humidity']
assert IG['outlook'] > IG['humidity'] # Look Here

# -------------------------------

outputs = [
    ["Dataset:", df],
    ["Describe:", df.describe()],
    ["Entropy:", dataset_entropy()],
    ["AttrEntropy:", E],
    ["Information gains:", IG],
]
for v in outputs: 
    print("\n", v[0], "\n ", v[1])

"""
    Dataset: 
         outlook  temp humidity  windy play
    0      sunny   hot     high  False   no
    1      sunny   hot     high   True   no
    2   overcast   hot     high  False  yes
    3      rainy  mild     high  False  yes
    4      rainy  cool   normal  False  yes
    5      rainy  cool   normal   True   no
    6   overcast  cool   normal   True  yes
    7      sunny  mild     high  False   no
    8      sunny  cool   normal  False  yes
    9      rainy  mild   normal  False  yes
    10     sunny  mild   normal   True  yes
    11  overcast  mild     high   True  yes
    12  overcast   hot   normal  False  yes
    13     rainy  mild     high   True   no 

    Describe: 
            outlook  temp humidity  windy play
    count       14    14       14     14   14
    unique       3     3        2      2    2
    top      sunny  mild     high  False  yes
    freq         5     6        7      8    9 

    Entropy: 
        0.9402859586706311
    
    AttrEntropy: 
        { 'outlook':     0.6935361388961914, 
          'temp':        0.9110633930116756, 
          'humidity':    0.7884504573082889, 
          'windy':       0.892158928262361 }

    Information gains: 
        { 'outlook':     0.24674981977443977, 
          'temp':        0.029222565658955535, 
          'humidity':    0.15183550136234225, 
          'windy':       0.048127030408270155 }
"""
$$ \text{IG}(X, A) = \text{H}(X) - \sum_{v \in \text{Values}(A)} \frac{|X_v|}{|X|} \cdot \text{H}(X_v) $$
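The worked example in the docstring above can be verified by hand for the windy attribute (a sketch assuming the same play_tennis.csv counts: 8 rows with windy=False, 6 with windy=True):

# Verify IG(windy) by hand (sketch; uses the counts from play_tennis.csv)
import numpy as np

def H(probs):
    probs = np.array(probs)
    return -np.sum(probs * np.log2(probs))

H_total = H([9/14, 5/14])  # 9 yes, 5 no               ~0.940
H_false = H([6/8, 2/8])    # windy=False: 6 yes, 2 no  ~0.811
H_true  = H([3/6, 3/6])    # windy=True:  3 yes, 3 no   1.000

IG_windy = H_total - (8/14)*H_false - (6/14)*H_true
print(round(IG_windy, 3))  # ~0.048, matching the output above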

Algorithm

We build a decision tree recursively, giving priority to the attributes with the highest information gain.
 
""" Decision Trees / ID3 Algorithm

ID3 (Iterative Dichotomiser 3) is a classification algorithm that follows a
greedy approach: it builds the decision tree by giving priority, at each split,
to the attribute with the highest information gain.

1. Calculate entropy for dataset
2. For each attribute:
   Calculate entropy for all categorical values
   Calculate information gain for the current attribute
3. Find the attribute with the maximum information gain and split on it
4. Repeat on each subset until every branch ends in a pure class

The `outlook` attribute has the highest info gain (about 0.247), so we select it
as the root node for the first level of splitting.
"""

import numpy as np
import pandas as pd
import pathlib

# Dataset
DIR = pathlib.Path(__file__).resolve().parent
df = pd.read_csv(DIR / 'data/play_tennis.csv')

# Train data
X = df.drop(['play'], axis=1)
y = df['play']


# Total entropy (for current dataframe)
def dataset_entropy(df):
    E = 0
    N = df['play'].value_counts() # yes: 9, no: 5
    values = df['play'].unique()
    for v in values: # yes/no
        P = N[v]/len(df['play'])  # probability
        E += -P*np.log2(P)
    return E

# Entropy for each attribute
def attribute_entropy(df, attr):
    E = 0
    eps = np.finfo(float).eps   # machine epsilon for the float 
    targets = df.play.unique()
    values = df[attr].unique()
    for v in values: # cool/hot
        ent = 0
        for t in targets: # yes,no
            num = len(df[attr][df[attr] == v][df.play == t]) # rows where attr == v and play == t
            den = len(df[attr][df[attr] == v])               # rows where attr == v
            fraction = num/(den + eps)
            ent += -fraction*np.log2(fraction + eps) # entropy of the subset where attr == v
        E += -(den/len(df))*ent # weighted sum of the subset entropies (abs below fixes the sign)
    return abs(E)

# ---------------------------------------------------

# Find attribute with maximum information gain
def find_winner(df):
    IG = {}
    attributes = df.keys()[:-1]

    # Loop over the attributes in the dataframe and compute info gains
    for attr in attributes: 
        IG[attr] = dataset_entropy(df) - attribute_entropy(df, attr)
    winner = attributes[np.argmax(list(IG.values()))] # attribute with maximum info gain
    return winner
# ---------------------------------------------------

# Construct the decision tree (dictionary)
def buildTree(df):
    tree = {}

    # Target column
    Class = df.keys()[-1] # play
    
    # Maximum info gain
    node = find_winner(df) # outlook
    tree[node] = {}

    # Distinct values
    values = np.unique(df[node]) # overcast/rainy/sunny

    # Loop through the attribute values
    for value in values:
        subtable = df[df[node] == value].reset_index(drop=True)
        attr_values, counts = np.unique(subtable[Class], return_counts=True)

        if len(counts) == 1: # pure subset
            tree[node][value] = attr_values[0]
        else:
            subtable = subtable.drop(node, axis=1)
            tree[node][value] = buildTree(subtable) # Recursive case
            
    return tree

decision_tree = buildTree(df)


# Print the dictionary tree (recursing into subtrees)
def print_tree(tree, attr=None, i=0):
    if not attr:
        attr = next(iter(tree)) # attribute at the current tree node

    for key, subval in tree[attr].items():

        if isinstance(subval, str): # Base case
            print(i*" ", attr, "=", key, ":", subval)
            continue
        
        print(i*" ", attr, "=", key, ":")
        print_tree(subval, i=i+1) # Recursive

    return

# Predict an unseen sample (only handles attribute values present in the training data)
def predict(X, tree):
    key = next(iter(tree))
    val = X[key]
    subval = tree[key][val]

    if isinstance(subval, str): # Base case
        return subval
        
    subval = predict(X, subval) # Recursive
    return subval


print(decision_tree, "\n")
print_tree(decision_tree)

# Example usage
x = {'outlook': 'sunny', 'temp': 'mild', 'humidity': 'high', 'windy': False}
y = predict(x, decision_tree)
print("\nAttributes:", x)
print("Prediction:", y)

# Example usage 2
x = {'outlook': 'rainy', 'temp': 'mild', 'humidity': 'normal', 'windy': True}
y = predict(x, decision_tree)
print("\nAttributes:", x)
print("Prediction:", y)


"""
    {'outlook': {'overcast': 'yes', 'rainy': {'windy': ...

    outlook = overcast: yes
    outlook = rainy:
      windy = False: yes
      windy = True: no
    outlook = sunny:
      humidity = high: no
      humidity = normal: yes

    Attributes: {
        'outlook': 'sunny', 
        'temp': 'mild', 
        'humidity': 'high', 
        'windy': False}
    Prediction: no

    Attributes: {
        'outlook': 'rainy', 
        'temp': 'mild', 
        'humidity': 'normal', 
        'windy': True}
    Prediction: no
"""
