
Cost function

p64 Build an algorithm to find the best-fit parameter for f(x).
 
""" Parameterized SSR (cost function)
Measure the goodness-of-fit (SSR)
For start, let's pretend that intercept is known b = -18
f(x) = ax + -18
SSR(a) = sum(R^2)
"""

import matplotlib.pyplot as plt
import numpy as np

# Training Dataset
X = np.array([30, 46, 60, 65, 77, 95]).reshape(6,1)
Y = np.array([31, 30, 80, 49, 70, 118])

fig, ax = plt.subplots()
plt.ylim(0, 140)
plt.xlim(0, 140)
ax.plot(X, Y, 'o', color='g', label='training data') # points

# Plot lines for some A range values
A = np.linspace(-2, 4.5, 13) # 13 slope values

for i in range(len(A)):
    msg ='f(x) = -18 + %sx' % A[i].round(1)
    ax.plot(X, -18 + A[i]*X, label = msg) # f(x) = -18 + -2.0x

plt.xlabel("x")
plt.ylabel("f(x)")  
plt.legend()

# Calculate SSR for each a
SSR = []
for a in A:
    P = []  # predictions
    SR = [] # squared residuals
    for x in X:
        P.append(-18 + a*x)
    for i in range(len(X)):
        SR.append((Y[i] - P[i])**2)
    SSR.append(np.sum(SR).round())

print(SSR)
    # [282654, 197923, 128329, 73872, 34552, 10368,
    #  1320,   <-- minimum on this grid
    #  7409, 28635, 64998, 116497, 183133, 264906]
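
# Reading the best value off the grid (an addition, not in the original):
# the smallest SSR on this grid is 1320, at the grid point a = 1.2
# (the closest grid value to the true optimum, about 1.3).
i_best = int(np.argmin(SSR))
print('Best grid a =', A[i_best].round(1), ' SSR =', SSR[i_best])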

# Generic cost function J = SSR(a)
def J(a, b=-18):
    total = 0
    for i in range(len(X)): # sum over training points
        total += (Y[i] - (a*X[i] + b))**2
    return total
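
# A vectorized equivalent (an addition, not in the original): NumPy
# broadcasting replaces the Python loop over training points; J_vec is
# a hypothetical name and returns the same values as J above.
def J_vec(a, b=-18):
    R = Y[:, None] - (a * X + b) # residuals, one column per value of a
    return np.sum(R**2, axis=0)

print(np.allclose(J_vec(A), J(A))) # True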

# Draw J(a)
fig, ax = plt.subplots()
ax.plot(A, J(A)) # J(a)
for a in A:
    msg ='J(%.1f, -18)' % a
    ax.plot(a, J(a), 'o', label = msg) # points
plt.xlabel("a")
plt.ylabel("SSR(a)")  
plt.legend()

# Draw J(a,b)
from mpl_toolkits.mplot3d.axes3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(1,1,1,projection='3d')
a = np.linspace(-1, 4, 20)
b = np.linspace(-100, 100, 10)
aa, bb = np.meshgrid(a, b)
ax.plot_surface(aa, bb, J(aa, bb)) # surface
ax.view_init(50,-150)
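
# Locating the grid minimum of J(a, b) (an addition, not in the original):
# the coarse grid puts it near the true optimum (a, b) = (1.3, -18).
ZZ = J(aa, bb)
k = np.unravel_index(np.argmin(ZZ), ZZ.shape)
print('Grid minimum near a =', aa[k].round(2), ' b =', bb[k].round(1))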
plt.show()
Gradient descent

p76 Finding the optimal value for the coefficient.
 
""" Gradient descent
Algorithm starts with a random value of the parameter a, b=-18
Then, it finds the direction in which the function
descrease faster and takes a step in that direction, then repeat
"""

import matplotlib.pyplot as plt
import numpy as np

# Training Dataset
X = np.array([30, 46, 60, 65, 77, 95]).reshape(6,1)
Y = np.array([31, 30, 80, 49, 70, 118])

# Cost function
def J(a):
    total = 0
    for i in range(len(X)): # sum over training points
        total += (Y[i] - (a*X[i] - 18))**2
    return total

# Derivative of the cost function
def dJ(a):
    total = 0
    for i in range(len(X)):
        total += -2*X[i]*(Y[i] - (a*X[i] - 18)) # chain rule: d(u^2)/da = 2u * du/da
    return total.item()

d = dJ(0)
print('Derivative dJ(0) =', d) # -67218
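
# Sanity check (an addition, not in the original): a central finite
# difference of J should match dJ; since J is quadratic in a, the two
# agree up to floating point error.
h = 1e-4
d_num = (J(0 + h) - J(0 - h)) / (2 * h)
print('Numeric derivative =', round(d_num.item(), 2)) # approx. -67218.0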


# Gradient descent (algorithm)
# J(a) derivative is used to find where the SSR is the lowest

a = 0 # start value
lr = 0.00001 # learning rate

a0 = a
a1 = a0 - lr * dJ(a0) # step 1
a2 = a1 - lr * dJ(a1) # step 2
a3 = a2 - lr * dJ(a2) # step 3

print('Step 1 a =', round(a1, 5)) # 0.67218
print('Step 2 a =', round(a2, 5)) # 0.99758
print('Step 3 a =', round(a3, 5)) # 1.15511


# Gradient descent (implementation)
def gradient_descent(X, Y, lr=0.00001, loops=15):
    a = 0
    for i in range(loops):
        d = dJ(a)
        a = a - d*lr
        # print(f'Step {i+1} a = {round(a, 5)}')
    return round(a, 5)

a = gradient_descent(X, Y)
print(round(a, 4)) # 1.3029
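
# Closed-form check (an addition, not in the original): with b fixed at
# -18, solving dJ/da = 0 gives the exact minimizer directly:
# a* = sum(x*(y+18)) / sum(x^2)
x_flat = X.ravel()
a_best = np.sum(x_flat * (Y + 18)) / np.sum(x_flat**2)
print('Exact a =', round(a_best, 4)) # 1.3029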


# Graphics
fig, ax = plt.subplots()
A = np.linspace(-2, 4.5, 23) # 23 slope values
ax.plot(A, J(A), label='J(a) = sum(R(X)^2)') # J(a)

# Minimum SSR(a), i.e. the optimal a
ax.plot(a, J(a), 'o', color='g', label='a = 1.3029')

# Draw points (as gradient descends)
ax.plot(a0, J(a0), 'o', color='r')
ax.plot(a1, J(a1), 'o', color='r')
ax.plot(a2, J(a2), 'o', color='r')
ax.plot(a3, J(a3), 'o', color='r')

# Draw lines to minimum
ax.plot([a0, a1], [J(a0), J(a1)], color='r')
ax.plot([a1, a2], [J(a1), J(a2)], color='r')
ax.plot([a2, a3], [J(a2), J(a3)], color='r')

# Show figure
plt.xlim(-2, 5)
plt.ylim(-10000, 70000)
plt.xlabel("a")
plt.ylabel("SSR(a)")  

ax.axhline(y=0, color='k')
ax.axvline(x=0, color='k')
plt.legend()
plt.show()
Learning (a, b)

p76 Finding the optimal values for both the coefficient and the intercept.
 
""" Gradient descent (two params, a and b)
Algorithm starts with a random value of the parameter a, b
Then, it finds the direction in which the function
descrease faster and takes a step in that direction, then repeat
"""

import matplotlib.pyplot as plt
import numpy as np

# The model (linear)
def predict(X, a, b):
    Y = X*a + b # f(x) = ax + b
    return np.round(Y) # note: predictions are rounded (also inside the gradients below)

# Cost function
def J(a, b):
    J = np.sum((Y - predict(X, a, b))**2)
    return J

# Partial derivatives
def dJ(a, b):
    da = np.sum(-2 * X * (Y - predict(X, a, b))) # dJ/da (b held fixed)
    db = np.sum(-2 * 1 * (Y - predict(X, a, b))) # dJ/db (a held fixed)
    return da, db

# Gradient descent
def gradient_descent(X, Y, lr=0.00001, loops=1000):
    a = 0
    b = 0
    for i in range(loops):
        da, db = dJ(a, b)
        a = a - lr * da
        b = b - lr * loops * db # bigger effective step: the intercept moves on a much larger scale than the slope
    return round(a, 1), round(b, 1)
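
# A minimal alternative (an addition, not in the original): standardize
# x so that a single learning rate suits both parameters, then map the
# fit back to the original scale; gradient_descent_scaled is a
# hypothetical name. On dataset 1 below it also gives about (1.3, -18.0).
def gradient_descent_scaled(X, Y, lr=0.01, loops=1000):
    mu, sigma = X.mean(), X.std()
    Xs = (X - mu) / sigma # standardized inputs
    a_s, b_s = 0.0, 0.0
    for i in range(loops):
        err = Y - (a_s * Xs + b_s)
        a_s = a_s - lr * np.sum(-2 * Xs * err)
        b_s = b_s - lr * np.sum(-2 * err)
    # back to the original scale: f(x) = (a_s/sigma)*x + (b_s - a_s*mu/sigma)
    return round(a_s / sigma, 1), round(b_s - a_s * mu / sigma, 1)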


# Train dataset 1
X = np.array([30, 46, 60, 65, 77, 95])
Y = np.array([31, 30, 80, 49, 70, 118])
print("\nLearning 1")

# Learning a,b
a, b = gradient_descent(X, Y)
print('a =', a, ' b =', b) # 1.3, -18
print('Predictions:', f'f(x) = {a}x + {b}') # f(x) = 1.3x - 18

# Predictions
x = 33; y = predict(x, a, b); print("f(%s) =" %x, y)
x = 45; y = predict(x, a, b); print("f(%s) =" %x, y)
x = 62; y = predict(x, a, b); print("f(%s) =" %x, y)
    # f(33) = 25.0
    # f(45) = 41.0
    # f(62) = 63.0

fig, ax = plt.subplots()
ax.set_xlabel('x')
ax.set_ylabel('f(x)')
ax.grid(True, which='both')
ax.axhline(y=0, color='k')
ax.axvline(x=0, color='k')

# Draw dataset 1
ax.plot(X, Y, 'x', color='g', label='training data') # points
ax.plot(X, a*X + b, label=f'f(x) = {b} + {a}x') # function line
ax.plot(55, predict(55, a, b), 'o', color='r') # prediction point
plt.legend(loc='upper right')


# Train dataset 2
X = np.array([15, 18, 20, 21, 23, 25, 27, 28, 29, 30, 32, 34, 35, 36])
Y = np.array([23, 74, 65, 82, 135, 321, 440, 400, 290, 620, 630, 610, 560, 568])
print("\nLearning 2")

# Learning a,b
a, b = gradient_descent(X, Y)
print('a =', a, ' b =', b) # a = 32.9  b = -533.1
print('Predictions:', f'f(x) = {a}x + {b}') # f(x) = 32.9x - 533.1

x = 20; y = predict(x, a, b); print("f(%s) =" %x, y)
x = 24; y = predict(x, a, b); print("f(%s) =" %x, y)
x = 33; y = predict(x, a, b); print("f(%s) =" %x, y)

# Draw dataset 2
ax.plot(X, Y, 'x', color='g') # points
ax.plot(X, a*X + b, label=f'f(x) = {b} + {a}x') # function line
ax.plot(33, predict(33, a, b), 'o', color='r') # prediction point
plt.legend(loc='upper right')


plt.show()
Algorithm

Can be used to optimize the parameters of any ML model.
\( J(\theta_{0}, \theta_{1}) = \frac{1}{2m} \sum_{i=1}^{m} (h_{\theta}(x^{(i)}) - y^{(i)})^2 \)
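
At each step the parameters are updated simultaneously, using the gradient and a learning rate \( \alpha \) (the standard update rule, added here for completeness):

\( \theta_{j} := \theta_{j} - \alpha \frac{\partial}{\partial \theta_{j}} J(\theta_{0}, \theta_{1}) \)

For the linear model \( h_{\theta}(x) = \theta_{0} + \theta_{1}x \), the partial derivatives are \( \frac{\partial J}{\partial \theta_{0}} = \frac{1}{m} \sum_{i=1}^{m} (h_{\theta}(x^{(i)}) - y^{(i)}) \) and \( \frac{\partial J}{\partial \theta_{1}} = \frac{1}{m} \sum_{i=1}^{m} (h_{\theta}(x^{(i)}) - y^{(i)}) \, x^{(i)} \), which is exactly what the code below computes with np.dot(x.T, error).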
 
""" Gradient descent Algorithm
Can be use to optimizes the parameters of any ML model, 
not just linear regresssion.

1. Initialize the parameters
    select initial set of params for the model

2. Compute the cost function
    differences between predictions and actual values

3. Compute the gradients
    partial derivatives of cost function

4. Update the parameters
    use the gradients and a learning rate

5. Repeat steps 2-4
"""

import numpy as np

def cost(theta, x, y):

    y_pred = np.dot(x, theta)
    error = y_pred - y
    return (1 / (2 * len(y))) * np.sum(error ** 2) # scalar value of J

def gradient_descent(x, y, theta, lr, num_iterations):

    cost_history = np.zeros(num_iterations)
    for i in range(num_iterations):

        y_pred = np.dot(x, theta)
        error = y_pred - y

        theta = theta - (lr/len(y)) * np.dot(x.T, error)
        cost_history[i] = cost(theta, x, y)

    return theta, cost_history

x = np.array([[1, 2], [1, 3], [1, 4], [1, 5]])
y = np.array([[7], [6], [5], [7]])

theta = np.random.randn(2, 1)
lr = 0.01
num_iterations = 1000

theta, cost_history = gradient_descent(x, y, theta, lr, num_iterations)
print("Theta: ", theta) 
    # [[4.55230192] [0.43431721]]
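
# Sanity check (an addition, not in the original): the closed-form
# least-squares solution is the point gradient descent converges to;
# with lr = 0.01 and 1000 iterations the run above can stop short of it.
theta_exact, *_ = np.linalg.lstsq(x, y, rcond=None)
print("Exact: ", theta_exact) # approx. [[6.6] [-0.1]]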
