# minte9 LearnRemember

### Probability

Probability is the chance of something to happen. When you flip a coin, there is a probability of 0.5 (or 50% chance) to land on heads. It's like asking, "What are the chances of something to happen?" Probability is a number between 0 and 1, where 0 means "no way" and 1 means "definitely happening". $$P(\text{Heads}) = \frac{1}{2} = 0.5$$

# Coin Flip events

# Probability distribution (coin flip):
print('Tail =', 1/len(events))

"""
Tail = 0.5
"""


### Probability Distribution

Now, imagine you're not just flipping a coin but rolling a dice. There are more outcomes (1 through 6), each with its own probability. A probability distribution is a list with all these probabilities. It's like a map with all the possible outcomes and how likely they are. $$P(\text{Rolling a 4}) = \frac{1}{6}$$

import pandas as pd
from icecream import ic

# Datasets
A = ['apple']*1 + ['orange']*2 + ['banana']*2
B = ['apple']*5 + ['orange']*2 + ['banana']*0
ic(A, B)

# Probability distribution (by hand)
P1 = [{'apple': 1/5}, {'orange': 2/5}, {'banana': 2/5}]
P2 = [{'apple': 5/7}, {'orange': 2/7}]
ic(P1, P2)

# With pandas
P1 = pd.Series(A).value_counts(normalize=True)
P2 = pd.Series(B).value_counts(normalize=True)
ic(P1, P2);

"""
ic| A: ['apple', 'orange', 'orange', 'banana', 'banana']
B: ['apple', 'apple', 'apple', 'apple', 'apple', 'orange', 'orange']
ic| P1: [{'apple': 0.2}, {'orange': 0.4}, {'banana': 0.4}]
P2: [{'apple': 0.7142857142857143}, {'orange': 0.2857142857142857}]
ic| P1: orange    0.4
banana    0.4
apple     0.2
dtype: float64
P2: apple     0.714286
orange    0.285714
dtype: float64
"""


### Entropy

Entropy is a measure of how disordered a collection is. The more impure the feature is, the higher the entropy. Probability distribution is the frequency of the unique values. It turns out that a logarithm of the number of states is perfect for compute entropy. $$H(X) = -\sum_{i=1}^{n} P(x_i) \log_2 P(x_i)$$

import pandas as pd
import numpy as np
from icecream import ic

# Set the initial traning data
A = ['apple']*1 + ['orange']*2 + ['banana']*2
B = ['apple']*5 + ['orange']*2 + ['banana']*0
ic(A, B)

# Probability
P1 = pd.Series(A).value_counts(normalize=True)
P2 = pd.Series(B).value_counts(normalize=True)
ic(P1, P2)

# Entropy (Shannon model)
P1 = P1.values
P2 = P2.values
H1 = -1 * np.sum(P1 * np.log2(P1))
H2 = -1 * np.sum(P2 * np.log2(P2))
ic(H1, H2);

assert H1 > H2

ic("A entropy > B entropy | There is more disorder in A than B")
ic("Assertion passed");

"""
ic| A: ['apple', 'orange', 'orange', 'banana', 'banana']
B: ['apple', 'apple', 'apple', 'apple', 'apple', 'orange', 'orange']
ic| P1: orange    0.4
banana    0.4
apple     0.2
dtype: float64
P2: apple     0.714286
orange    0.285714
dtype: float64
ic| H1: 1.5219280948873621, H2: 0.863120568566631
ic| 'A entropy > B entropy | There is more disorder in A than B'
ic| 'Assertion passed'
"""


### Information gain

Information gain is a measure of the reduction in entropy. As the entropy of an attribute increases, the information gain decreases. $$\text{IG}(X, A) = \text{H}(X) - \sum_{v \in \text{Values}(A)} \frac{|X_v|}{|X|} \cdot \text{H}(X_v)$$

""" Decision Trees / Info Gain
Target: Play Tennis or not

Information gain example (for feature 'wind'):
IG = H - (8/14)H_weak - (6/14)H_strong
IG = 0.940 - (8/14)0.811 - (6/14)1.00 = 0.048
"""

import numpy as np
import pandas as pd
import pathlib

# Load the dataset from a CSV file
DIR = pathlib.Path(__file__).resolve().parent

# Split the dataset into features (X) and target (y), play tennis yes/no
X = df.drop(['play'], axis=1)
y = df['play']

# Total entropy
def dataset_entropy(df):
E = 0
N = df['play'].value_counts() # yes: 9, no: 5
values = df['play'].unique()
for v in values: # yes/no
P = N[v]/len(df['play'])  # probability
E += -P*np.log2(P)
return E

# Entropy for each attribute
def attribute_entropy(attr):
E = 0
eps = np.finfo(float).eps   # machine epsilon for the float
targets = df.play.unique()
values = df[attr].unique()
for v in values: # cool/hot
ent = 0
for t in targets: # yes,no
num = len(df[attr][df[attr] == v][df.play == t]) # numerator
den = len(df[attr][df[attr] == v])
fraction = num/(den + eps)
ent += -fraction*np.log2(fraction + eps) # entropy for one feature
E += -(den/len(df))*ent # sum of all entropies
return abs(E)

total_entropy  = dataset_entropy(df)

# Get the names of attributes (excluding the target variable)
attributes = df.keys()[:-1]

# Calculate entropy for each attribute and store it in a dictionary
E = {}
for k in attributes:
E[k] = attribute_entropy(k)

# Calculate information gain for each attribute and store it in a dictionary
IG = {}
for k in E:
IG[k] = total_entropy - E[k]

# Asserts
assert E['outlook']  < E['humidity']
assert IG['outlook'] > IG['humidity']

# Output results
print("\n Dataset:"); print(df)
print("\n Describe:"); print(df.describe())
print("\n Entropy:"); print(total_entropy)
print("\n AttrEntropy:"); print(E)
print("\n Information gains:"); print(IG)

"""
Dataset:
outlook  temp humidity  windy play
0      sunny   hot     high  False   no
1      sunny   hot     high   True   no
2   overcast   hot     high  False  yes
3      rainy  mild     high  False  yes
4      rainy  cool   normal  False  yes
5      rainy  cool   normal   True   no
6   overcast  cool   normal   True  yes
7      sunny  mild     high  False   no
8      sunny  cool   normal  False  yes
9      rainy  mild   normal  False  yes
10     sunny  mild   normal   True  yes
11  overcast  mild     high   True  yes
12  overcast   hot   normal  False  yes
13     rainy  mild     high   True   no

Describe:
outlook  temp humidity  windy play
count       14    14       14     14   14
unique       3     3        2      2    2
top      sunny  mild     high  False  yes
freq         5     6        7      8    9

Entropy:
0.9402859586706311

AttrEntropy:
{'outlook': 0.6935361388961914,
'temp': 0.9110633930116756,
'humidity': 0.7884504573082889,
'windy': 0.892158928262361}

Information gains:
{'outlook': 0.24674981977443977,
'temp': 0.029222565658955535,
'humidity': 0.15183550136234225,
'windy': 0.048127030408270155}
"""


### Decision Tree ID3

We build a decision tree recursively giving priority to the attributes with the higher IG. Iterative Dichotomiser 3 is a classification algorithm. It follows a greedy approach of building a decision tree. It gives priority to the attributes with the higher information gain.

""" Decision Trees / ID3 Algorithm

1. Calculate entropy for dataset
2. For each attribute:
Calculate entropy for all categorical values
Calculate information gain for the current attribute
3. Find the feture with maximum information gain
4. Repeat

For example, the outlook has the highest info gain (0.24).
It we will select it as the root node for the start level of splitting.
"""

import numpy as np
import pandas as pd
import pathlib
from sklearn import tree

# Dataset
DIR = pathlib.Path(__file__).resolve().parent

# Training data
X = df.drop(['play'], axis=1)
y = df['play']

# Total entropy (for current dataframe)
def dataset_entropy(df):
E = 0
N = df['play'].value_counts() # yes: 9, no: 5
values = df['play'].unique()
for v in values: # yes/no
P = N[v]/len(df['play'])  # probability
E += -P*np.log2(P)
return E

# Entropy for each attribute
def attribute_entropy(df, attr):
E = 0
eps = np.finfo(float).eps   # machine epsilon for the float
targets = df.play.unique()
values = df[attr].unique()
for v in values: # cool/hot
ent = 0
for t in targets: # yes,no
num = len(df[attr][df[attr] == v][df.play == t]) # numerator
den = len(df[attr][df[attr] == v])
fraction = num/(den + eps)
ent += -fraction*np.log2(fraction + eps) # entropy for one feature
E += -(den/len(df))*ent # sum of all entropies
return abs(E)

# Find attribute with maximum information gain
def find_winner(df):
IG = {}
attributes = df.keys()[:-1]

# Loop for attributes in dataframe and compute info gains
for attr in attributes:
IG[attr] = dataset_entropy(df) - attribute_entropy(df, attr)
winner = attributes[np.argmax(IG)] # maxim info gains
return winner

# Construct the decision tree (dictionary)
def buildTree(df):
tree = {}

# Target column
Class = df.keys()[-1] # play

# Maximum info gain
node = find_winner(df) # outlook
tree[node] = {}

# Distinct values
values = np.unique(df[node]) # overcast/rain

# Loop throw the attribute values
for value in values:
subtable = df[df[node] == value].reset_index(drop=True)
attr_values, counts = np.unique(subtable[Class], return_counts=True)

if len(counts) == 1: # pure subset
tree[node][value] = attr_values[0]
else:
subtable = subtable.drop(node, axis=1)
tree[node][value] = buildTree(subtable) # Recursive case

return tree

decision_tree = buildTree(df)

# Print dictionary tree (recursion  in case of subtrees)
def print_tree(tree, attr=None, i=0):
if not attr:
attr = next(iter(tree)) # attrribute in the current tree node

for key, subval in tree[attr].items():

if isinstance(subval, str): # Base case
print(i*" ", attr, "=", key, ":", subval)
continue

print(i*" ", attr, "=", key, ":")
print_tree(subval, i=i+1) # Recursive

return

# Predict unknow (only for cases included in the train dataset)
def predict(X, tree):
key = next(iter(tree))
val = X[key]
subval = tree[key][val]

if isinstance(subval, str): # Base case
return subval

subval = predict(X, subval) # Recursive
return subval

print("\n Decistion Tree:")
print(decision_tree, "\n")
print_tree(decision_tree)

print("\n Example usage 1:")
x = {'outlook': 'sunny', 'temp': 'mild', 'humidity': 'high', 'windy': False}
y = predict(x, decision_tree)
print("Attributes:", x)
print("Prediction:", y)

print("\n Example usage 2:")
x = {'outlook': 'rainy', 'temp': 'mild', 'humidity': 'normal', 'windy': True}
y = predict(x, decision_tree)
print("Attributes:", x)
print("Prediction:", y)

"""
Decistion Tree:
{'outlook': {'overcast': 'yes', 'rainy': {'temp': {'cool': {'humidity': ...

outlook = overcast: yes
outlook = rainy:
temp = cool:
humidity = normal:
windy = False: yes
windy = True: no
temp = mild:
humidity = high:
windy = False: yes
windy = True: no
humidity = normal: yes
outlook = sunny:
temp = cool: yes
temp = hot: no
temp = mild:
humidity = high: no
humidity = normal: yes

Example usage 1:
Attributes: {
'outlook': 'sunny',
'temp': 'mild',
'humidity': 'high',
'windy': False}
Prediction: no

Example usage 2:
Attributes: {
'outlook': 'rainy',
'temp': 'mild',
'humidity': 'normal',
'windy': True}
Prediction: yes
"""


Last update: 179 days ago