Generic cost function

p64 To start, let's pretend that the intercept is known: b = -18
 
""" Parameterized SSR (cost function)
Measure the goodness-of-fit (SSR)
For start, let's pretend that intercept is known b = -18
f(x) = ax + -18
SSR(a) = sum(R^2)
"""

import matplotlib.pyplot as plt
import numpy as np

# Training Dataset
X = np.array([30, 46, 60, 65, 77, 95]).reshape(6,1)
Y = np.array([31, 30, 80, 49, 70, 118])

# ------------------------------------------------

# Range of slope values
A = np.linspace(-2, 4.5, 13) # 13 values

# Calculate SSR for each a
SSR = []
for a in A:
    P = []  # predictions
    SR = [] # squared residuals
    for x in X:
        P.append(-18 + a*x)
    for i in range(len(X)):
        SR.append((Y[i] - P[i])**2)
    SSR.append(np.sum(SR).round())

# Generic cost function SSR(a) = J
def J(a, b=-18):
    J = 0
    for i in range(len(X)): # number of train points
        J += (Y[i] - (a*X[i] + b))**2
    return J

# ------------------------------------------------

# Plot the training points
fig, ax = plt.subplots()
plt.ylim(0, 140)
plt.xlim(0, 140)
ax.plot(X, Y, 'o', color='g', label='training data') # points

for i in range(len(A)):
    msg ='f(x) = -18 + %sx' % A[i].round(1)
    ax.plot(X, -18 + A[i]*X, label = msg) # f(x) = -18 + -2.0x

plt.xlabel("x")
plt.ylabel("f(x)")  
plt.legend()

# Plot J(a, -18)
fig, ax = plt.subplots()
ax.plot(A, J(A)) # J(a)
for a in A:
    msg ='J(%.1f, -18)' % a
    ax.plot(a, J(a), 'o', label = msg) # points
plt.xlabel("a")
plt.ylabel("SSR(a)")  
plt.legend()

# Plot J(a, b) 3D
from mpl_toolkits.mplot3d.axes3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(1,1,1,projection='3d')
a = np.linspace(-1, 4, 20)
b = np.linspace(-100, 100, 10)
aa, bb = np.meshgrid(a, b)
ax.plot_surface(aa, bb, J(aa, bb)) # surface
ax.view_init(50,-150)

plt.show()

# Print results
print("Slope range: \n", A)
print("SSR(a -18): \n", SSR)

"""
    Slope range: 
     [-2.  -1.45833333  -0.91666667 -0.375  0.16666667  0.70833333
      1.25  1.79166667  2.33333333  2.875  3.41666667  3.95833333
      4.5]
    SSR(a, -18): 
     [282654.0, 197923.0, 128329.0, 73872.0, 34552.0, 10368.0, 1320.0, 7409.0, 
      28635.0, 64998.0, 116497.0, 183133.0, 264906.0]
"""
Gradient descent

p76 Finding the optimal value for the coefficient.
 
""" Gradient descent
Algorithm starts with a random value of the parameter a, b=-18
Then, it finds the direction in which the function
descrease faster and takes a step in that direction, then repeat.
"""

import matplotlib.pyplot as plt
import numpy as np

# Training Dataset
X = np.array([30, 46, 60, 65, 77, 95]).reshape(6,1)
Y = np.array([31, 30, 80, 49, 70, 118])

# ------------------------------------------------------------

# Cost function
def J(a):
    J = 0
    for i in range(len(X)): # number of train points
        J += (Y[i] - (a*X[i] + -18))**2
    return J

# Derivative of the cost function
def dJ(a):
    dJ = 0
    for i in range(len(X)):
        dJ += -2*X[i]*(Y[i] - (a*X[i] + -18)) # d(x^2) = 2x
    return dJ.item()

# Gradient descent
def gradient_descent(X, Y, b=-18, lr=0.00001, loops=15):
    a = 0
    for i in range(loops):

        # J(a) derivative is used to find where the SSR is the lowest
        d = dJ(a)
        a = a - d*lr
        
        # print(f'Step {i+1} a = {round(a, 5)}')
    return round(a, 5)

# Result
optim_a = gradient_descent(X, Y)

# ------------------------------------------------------------

# Compute values to print and plot
a = 0       # start value
l = 0.00001 # learning rate

a0 = 0
a1 = a  - l * dJ(a)  # step 1
a2 = a1 - l * dJ(a1) # step 2
a3 = a2 - l * dJ(a2) # step 3

# Plot lines SSR curve
fig, ax = plt.subplots()
A = np.linspace(-2, 4.5, 23) # 23 values
ax.plot(A, J(A), label='J(a) = sum(R(X)^2)') # J(a)

# Minimum SSR(a), or optimal a
ax.plot(optim_a, J(optim_a), 'o', color='g', label='optim_a = 1.3029')

# Draw points (as gradient descends)
ax.plot(a0, J(0), 'o', color='r')
ax.plot(a1, J(a1), 'o', color='r')
ax.plot(a2, J(a2), 'o', color='r')
ax.plot(a3, J(a3), 'o', color='r')

# Draw lines to minimum
ax.plot([a0,  a1], [J(0), J(a1)], color='r')
ax.plot([a1, a2], [J(a1), J(a2)], color='r')
ax.plot([a2, a3], [J(a2), J(a3)], color='r')

# Show figure
plt.xlim(-2, 5)
plt.ylim(-10000, 70000)
plt.xlabel("a")
plt.ylabel("SSR(a)")  

ax.axhline(y=0, color='k')
ax.axvline(x=0, color='k')
plt.legend()

plt.show()

# Print results
print('Derivative of cost function dJ(0) =', dJ(0))
print('Step 1 a =', round(a1, 5))
print('Step 2 a =', round(a2, 5))
print('Step 3 a =', round(a3, 5), "\n")
print("Gradient descent optim_a slope: \n", round(optim_a, 4))

"""
    Derivative of cost function dJ(0) = -67218
    Step 1 a = 0.67218
    Step 2 a = 0.99758
    Step 3 a = 1.15511 

    Gradient descent optim_a slope: 
        1.3029
"""
Learning (a, b)

p76 Finding the optimal values for both the coefficient and the intercept.
 
""" Gradient descent (two params, a and b)
Algorithm starts with a random value of the parameter a, b
Then, it finds the direction in which the function
descrease faster and takes a step in that direction, then repeat
"""

import matplotlib.pyplot as plt
import numpy as np

# --------------------------------------------------------------

# The model (linear); predictions are rounded to whole numbers
def predict(X, a, b):
    Y = X*a + b
    return np.round(Y) # f(x) = ax + b

# Cost function
def J(a, b):
    J = np.sum((Y - predict(X, a, b))**2)
    return J

# Derivatives
def dJ(a, b):
    da = np.sum(-2 * X * (Y - predict(X, a, b))) # b fixed
    db = np.sum(-2 * 1 * (Y - predict(X, a, b))) # a fixed
    return da, db

# Gradient descent
def gradient_descent(X, Y, lr=0.00001, loops=1000):
    a = 0
    b = 0
    for i in range(loops):
        da, db = dJ(a, b)
        a = a - lr * da
        b = b - lr * loops * db # larger effective step for b (same as repeating the small update 'loops' times)
    return round(a, 1), round(b, 1)

# --------------------------------------------------------------

# Train dataset 1
X = np.array([30, 46, 60, 65, 77, 95])
Y = np.array([31, 30, 80, 49, 70, 118])
print("\nLearning 1")

# Learning a,b
a, b = gradient_descent(X, Y)
print('a =', a, ' b =', b)
print('Predictions:', f'f(x) = {a}x + {b}')

# Predictions
x = 33; y = predict(x, a, b); print("f(%s) =" %x, y)
x = 45; y = predict(x, a, b); print("f(%s) =" %x, y)
x = 62; y = predict(x, a, b); print("f(%s) =" %x, y)

fig, ax = plt.subplots()
ax.set_xlabel('x')
ax.set_ylabel('f(x)')
ax.grid(True, which='both')
ax.axhline(y=0, color='k')
ax.axvline(x=0, color='k')

# Draw dataset 1
ax.plot(X, Y, 'x', color='g', label='training data')
ax.plot(X, a*X + b, label=f'f(x) = {b} + {a}x') # line
ax.plot(55, predict(55, a, b), 'o', color='r')
plt.legend(loc='upper right')

# --------------------------------------------------------------

# Train dataset 2
X = np.array([15, 18, 20, 21, 23, 25, 27, 28, 29, 30, 32, 34, 35, 36])
Y = np.array([23, 74, 65, 82, 135, 321, 440, 400, 290, 620, 630, 610, 560, 568])
print("\nLearning 2")

# Learning a,b
a, b = gradient_descent(X, Y)
print('a =', a, ' b =', b)
print('Predictions:', f'f(x) = {a}x + {b}')

x = 20; y = predict(x, a, b); print("f(%s) =" %x, y)
x = 24; y = predict(x, a, b); print("f(%s) =" %x, y)
x = 33; y = predict(x, a, b); print("f(%s) =" %x, y)

# Draw dataset 2
ax.plot(X, Y, 'x', color='g')
ax.plot(X, a*X + b, label=f'f(x) = {b} + {a}x') # line
ax.plot(33, predict(33, a, b), 'o', color='r')
plt.legend(loc='upper right')

plt.show()

"""
    Learning 1
     a = 1.3  b = -17.3
    Predictions: f(x) = 1.3x + -17.3
     f(33) = 26.0
     f(45) = 41.0
     f(62) = 63.0

    Learning 2
     a = 32.9  b = -533.1
    Predictions: f(x) = 32.9x + -533.1
     f(20) = 125.0
     f(24) = 256.0
     f(33) = 553.0
"""
Algorithm

Can be used to optimize the parameters of any ML model.
\( J(\theta_{0}, \theta_{1}) = \frac{1}{2m} \sum_{i=1}^{m} (h_{\theta}(x^{(i)}) - y^{(i)})^2 \)
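The update step in the code below uses its partial derivatives:
\( \frac{\partial J}{\partial \theta_{j}} = \frac{1}{m} \sum_{i=1}^{m} (h_{\theta}(x^{(i)}) - y^{(i)}) \, x_{j}^{(i)} \)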
 
""" Gradient descent Algorithm
Can be use to optimizes the parameters of any ML model, 
not just linear regresssion.

1. Initialize the parameters
    select initial set of params for the model

2. Compute the cost function
    differences between predictions and actual values

3. Compute the gradients
    partial derivatives of cost function

4. Update the parameters
    use the gradients and a learning rate

5. Repeat steps 2-4
"""

import numpy as np

def cost(theta, x, y):
    y_pred = np.dot(x, theta)
    error = y_pred - y
    return (1 / (2 * len(y))) * np.dot(error.T, error).item() # scalar cost

def gradient_descent(x, y, theta, lr, num_iterations):

    cost_history = np.zeros(num_iterations)
    for i in range(num_iterations):

        y_pred = np.dot(x, theta)
        error = y_pred - y

        theta = theta - (lr/len(y)) * np.dot(x.T, error)
        cost_history[i] = cost(theta, x, y)

    return theta, cost_history

x = np.array([[1, 2], [1, 3], [1, 4], [1, 5]])
y = np.array([[7], [6], [5], [7]])

theta = np.random.randn(2, 1)
lr = 0.01
num_iterations = 1000

theta, cost_history = gradient_descent(x, y, theta, lr, num_iterations)
print("Theta: ", theta) 
    # [[4.55230192] [0.43431721]]
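As a cross-check (a sketch, using the same x and y), the closed-form least-squares solution can be obtained with np.linalg.lstsq:

# Exact least-squares solution, for comparison with gradient descent
theta_exact, *_ = np.linalg.lstsq(x, y, rcond=None)
print("Exact: ", theta_exact) # [[6.6] [-0.1]]; more iterations bring theta closer to this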
