Generic cost function

p64 To start, let's pretend that the intercept is known: b = -18
 
""" Parameterized SSR (cost function)
Measure the goodness-of-fit (SSR)
For start, let's pretend that intercept is known b = -18
f(x) = ax + -18
SSR(a) = sum(R^2)
"""

import matplotlib.pyplot as plt
import numpy as np

# Training Dataset
X = np.array([30, 46, 60, 65, 77, 95]).reshape(6,1)
Y = np.array([31, 30, 80, 49, 70, 118])

# ------------------------------------------------

# Range of slope values
A = np.linspace(-2, 4.5, 13) # 13 values

# Calculate SSR for each a
SSR = []
for a in A:
    P = []  # predictions
    SR = [] # squared residuals
    for x in X:
        P.append(-18 + a*x)
    for i in range(len(X)):
        SR.append((Y[i] - P[i])**2)
    SSR.append(np.sum(SR).round())

# Generic cost function SSR(a) = J
def J(a, b=-18):
    J = 0
    for i in range(len(X)): # number of train points
        J += (Y[i] - (a*X[i] + b))**2
    return J

# ------------------------------------------------

# Plot the training points
fig, ax = plt.subplots()
plt.ylim(0, 140)
plt.xlim(0, 140)
ax.plot(X, Y, 'o', color='g', label='training data') # points

for i in range(len(A)):
    msg ='f(x) = -18 + %sx' % A[i].round(1)
    ax.plot(X, -18 + A[i]*X, label = msg) # f(x) = -18 + -2.0x

plt.xlabel("x")
plt.ylabel("f(x)")  
plt.legend()

# Plot J(a, -18)
fig, ax = plt.subplots()
ax.plot(A, J(A)) # J(a)
for a in A:
    msg ='J(%.1f, -18)' % a
    ax.plot(a, J(a), 'o', label = msg) # points
plt.xlabel("a")
plt.ylabel("SSR(a)")  
plt.legend()

# Plot J(a, b) 3D
from mpl_toolkits.mplot3d.axes3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(1,1,1,projection='3d')
a = np.linspace(-1, 4, 20)
b = np.linspace(-100, 100, 10)
aa, bb = np.meshgrid(a, b)
ax.plot_surface(aa, bb, J(aa, bb)) # surface
ax.view_init(50,-150)

plt.show()

# Print results
print("Slope range: \n", A)
print("SSR(a -18): \n", SSR)

"""
    Slope range: 
     [-2.  -1.45833333  -0.91666667 -0.375  0.16666667  0.70833333
      1.25  1.79166667  2.33333333  2.875  3.41666667  3.95833333
      4.5]
    SSR(a, -18): 
     [282654.0, 197923.0, 128329.0, 73872.0, 34552.0, 10368.0, 1320.0, 7409.0, 
      28635.0, 64998.0, 116497.0, 183133.0, 264906.0]
"""
Gradient descent

p76 Finding the optimal value for the coefficient.
 
""" Gradient descent
Algorithm starts with a random value of the parameter a, b=-18
Then, it finds the direction in which the function
descrease faster and takes a step in that direction, then repeat.
"""

import matplotlib.pyplot as plt
import numpy as np

# Training Dataset
X = np.array([30, 46, 60, 65, 77, 95]).reshape(6,1)
Y = np.array([31, 30, 80, 49, 70, 118])

# ------------------------------------------------------------

# Cost function
def J(a):
    J = 0
    for i in range(len(X)): # number of train points
        J += (Y[i] - (a*X[i] + -18))**2
    return J

# Derivative of the cost function
def dJ(a):
    dJ = 0
    for i in range(len(X)):
        dJ += -2*X[i]*(Y[i] - (a*X[i] + -18)) # d(x^2) = 2x
    return dJ.item()

# Gradient descent
def gradient_descent(X, Y, b=-18, lr=0.00001, loops=15):
    a = 0
    for i in range(loops):

        # J(a) derivative is used to find where the SSR is the lowest
        d = dJ(a)
        a = a - d*lr
        
        # print(f'Step {i+1} a = {round(a, 5)}')
    return round(a, 5)

# Result
optim_a = gradient_descent(X, Y)

# ------------------------------------------------------------

# Compute values to print and plot
a = 0       # start value
l = 0.00001 # learning rate

a0 = 0
a1 = a  - l * dJ(a)  # step 1
a2 = a1 - l * dJ(a1) # step 2
a3 = a2 - l * dJ(a2) # step 3

# Plot lines SSR curve
fig, ax = plt.subplots()
A = np.linspace(-2, 4.5, 23) # 23 values
ax.plot(A, J(A), label='J(a) = sum(R(X)^2)') # J(a)

# Minimum SSR(a), or optimal a
ax.plot(optim_a, J(optim_a), 'o', color='g', label='optim_a = 1.3029')

# Draw points (as gradient descends)
ax.plot(a0, J(0), 'o', color='r')
ax.plot(a1, J(a1), 'o', color='r')
ax.plot(a2, J(a2), 'o', color='r')
ax.plot(a3, J(a3), 'o', color='r')

# Draw lines to minimum
ax.plot([a0,  a1], [J(0), J(a1)], color='r')
ax.plot([a1, a2], [J(a1), J(a2)], color='r')
ax.plot([a2, a3], [J(a2), J(a3)], color='r')

# Show figure
plt.xlim(-2, 5)
plt.ylim(-10000, 70000)
plt.xlabel("a")
plt.ylabel("SSR(a)")  

ax.axhline(y=0, color='k')
ax.axvline(x=0, color='k')
plt.legend()

plt.show()

# Print results
print('Derivative of cost function dJ(0) =', dJ(0))
print('Step 1 a =', round(a1, 5))
print('Step 2 a =', round(a2, 5))
print('Step 3 a =', round(a3, 5), "\n")
print("Gradient descent optim_a slope: \n", round(optim_a, 4))

"""
    Derivative of cost function dJ(0) = -67218
    Step 1 a = 0.67218
    Step 2 a = 0.99758
    Step 3 a = 1.15511 

    Gradient descent optim_a slope: 
        1.3029
"""
Learning (a, b)

p76 Finding the optimal values for both the coefficient and the intercept.
 
""" Gradient descent (two params, a and b)
Algorithm starts with a random value of the parameter a, b
Then, it finds the direction in which the function
descrease faster and takes a step in that direction, then repeat
"""

import matplotlib.pyplot as plt
import numpy as np

# --------------------------------------------------------------

# The model (linear); predictions are rounded to whole numbers
def predict(X, a, b):
    Y = X*a + b
    return np.round(Y) # f(x) = ax + b

# Cost function
def J(a, b):
    J = np.sum((Y - predict(X, a, b))**2)
    return J

# Derivatives
def dJ(a, b):
    da = np.sum(-2 * X * (Y - predict(X, a, b))) # b fixed
    db = np.sum(-2 * 1 * (Y - predict(X, a, b))) # a fixed
    return da, db

# Gradient descent
def gradient_descent(X, Y, lr=0.00001, loops=1000):
    a = 0
    b = 0
    for i in range(loops):
        da, db = dJ(a, b)
        a = a - lr * da
        b = b - lr * loops * db # larger effective step for b (same as repeating the small update 'loops' times)
    return round(a, 1), round(b, 1)

# --------------------------------------------------------------

# Train dataset 1
X = np.array([30, 46, 60, 65, 77, 95])
Y = np.array([31, 30, 80, 49, 70, 118])
print("\nLearning 1")

# Learning a,b
a, b = gradient_descent(X, Y)
print('a =', a, ' b =', b)
print('Predictions:', f'f(x) = {a}x + {b}')

# Predictions
x = 33; y = predict(x, a, b); print("f(%s) =" %x, y)
x = 45; y = predict(x, a, b); print("f(%s) =" %x, y)
x = 62; y = predict(x, a, b); print("f(%s) =" %x, y)

fig, ax = plt.subplots()
ax.set_xlabel('x')
ax.set_ylabel('f(x)')
ax.grid(True, which='both')
ax.axhline(y=0, color='k')
ax.axvline(x=0, color='k')

# Draw dataset 1
ax.plot(X, Y, 'x', color='g', label='training data')
ax.plot(X, a*X + b, label=f'f(x) = {b} + {a}x') # line
ax.plot(55, predict(55, a, b), 'o', color='r')
plt.legend(loc='upper right')

# --------------------------------------------------------------

# Train dataset 2
X = np.array([15, 18, 20, 21, 23, 25, 27, 28, 29, 30, 32, 34, 35, 36])
Y = np.array([23, 74, 65, 82, 135, 321, 440, 400, 290, 620, 630, 610, 560, 568])
print("\nLearning 2")

# Learning a,b
a, b = gradient_descent(X, Y)
print('a =', a, ' b =', b)
print('Predictions:', f'f(x) = {a}x + {b}')

x = 20; y = predict(x, a, b); print("f(%s) =" %x, y)
x = 24; y = predict(x, a, b); print("f(%s) =" %x, y)
x = 33; y = predict(x, a, b); print("f(%s) =" %x, y)

# Draw dataset 2
ax.plot(X, Y, 'x', color='g')
ax.plot(X, a*X + b, label=f'f(x) = {b} + {a}x') # line
ax.plot(33, predict(33, a, b), 'o', color='r')
plt.legend(loc='upper right')

plt.show()

"""
    Learning 1
     a = 1.3  b = -17.3
    Predictions: f(x) = 1.3x + -17.3
     f(33) = 26.0
     f(45) = 41.0
     f(62) = 63.0

    Learning 2
     a = 32.9  b = -533.1
    Predictions: f(x) = 32.9x + -533.1
     f(20) = 125.0
     f(24) = 256.0
     f(33) = 553.0
"""
Algorithm

Can be used to optimize the parameters of any ML model.
\( J(\theta_{0}, \theta_{1}) = \frac{1}{2m} \sum_{i=1}^{m} (h_{\theta}(x^{(i)}) - y^{(i)})^2 \)
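The update step in the code below uses its partial derivatives:
\( \frac{\partial J}{\partial \theta_{j}} = \frac{1}{m} \sum_{i=1}^{m} (h_{\theta}(x^{(i)}) - y^{(i)}) \, x_{j}^{(i)} \)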
 
""" Gradient descent Algorithm
Can be use to optimizes the parameters of any ML model, 
not just linear regresssion.

1. Initialize the parameters
    select initial set of params for the model

2. Compute the cost function
    differences between predictions and actual values

3. Compute the gradients
    partial derivatives of cost function

4. Update the parameters
    use the gradients and a learning rate

5. Repeat steps 2-4
"""

import numpy as np

def cost(theta, x, y):
    y_pred = np.dot(x, theta)
    error = y_pred - y
    return (1 / (2 * len(y))) * np.dot(error.T, error).item() # scalar cost

def gradient_descent(x, y, theta, lr, num_iterations):

    cost_history = np.zeros(num_iterations)
    for i in range(num_iterations):

        y_pred = np.dot(x, theta)
        error = y_pred - y

        theta = theta - (lr/len(y)) * np.dot(x.T, error)
        cost_history[i] = cost(theta, x, y)

    return theta, cost_history

x = np.array([[1, 2], [1, 3], [1, 4], [1, 5]])
y = np.array([[7], [6], [5], [7]])

theta = np.random.randn(2, 1)
lr = 0.01
num_iterations = 1000

theta, cost_history = gradient_descent(x, y, theta, lr, num_iterations)
print("Theta: ", theta) 
    # [[4.55230192] [0.43431721]]
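As a cross-check (a sketch, using the same x and y), the closed-form least-squares solution can be obtained with np.linalg.lstsq:

# Exact least-squares solution, for comparison with gradient descent
theta_exact, *_ = np.linalg.lstsq(x, y, rcond=None)
print("Exact: ", theta_exact) # [[6.6] [-0.1]]; more iterations bring theta closer to this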
