Linear regression
p61 Learn a linear regression with a single variable.\(
h_\theta(x) = \theta_0 + \theta_1 x_1
\)

""" Linear Regression (one parameter)
h(x) = ax + b
Finding the line that best fits the data is known as ...
linear regression (one of the most popular tools in statistics)
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Training Dataset
X = np.array([30, 46, 60, 65, 77, 95]).reshape(6,1)
Y = np.array([31, 30, 80, 49, 70, 118])
# Learn a prediction function
r = LinearRegression().fit(X, Y)
a = r.coef_[0].round(1)
b = r.intercept_.round(1)
print(f'f(x) = {a}x + {b}') # f(x) = 1.3x + -18.0
# Predict unknown
x1 = 80
y1 = a*x1 + b
print(f'f({x1}) = {y1}') # f(80) = 86.0
# Draw graphics
fig, ax = plt.subplots()
plt.ylim(0, 140)
plt.xlim(0, 140)
ax.plot(X, Y, 'x', color='g', label='training data') # Draw dataset points
ax.plot(X, a*X + b, label=f'h(x) = {b} + {a}x') # Draw function line
ax.plot(x1, y1, 'o', color='r', label=f'h({x1}) = {y1}') # Draw unknown point
plt.legend(), plt.show()
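
A quick cross-check, not part of the original listing: np.polyfit with degree 1 fits the same least-squares line directly, so it should recover the same rounded coefficients on the toy dataset above.

""" Cross-check: same line via np.polyfit """
import numpy as np
X = np.array([30, 46, 60, 65, 77, 95])
Y = np.array([31, 30, 80, 49, 70, 118])
a, b = np.polyfit(X, Y, 1) # degree-1 least-squares fit: [slope, intercept]
print(f'f(x) = {a.round(1)}x + {b.round(1)}') # f(x) = 1.3x + -18.0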

Two variables
With multiple regression we can throw in more variables.\(
h_\theta(x) = \theta_0 + \theta_1 x_1 + \theta_2 x_2
\)

""" Linear Regression (two parameters)
h(x) = ax + by + c
We can predict the CO2 emission of a car based on the size of the engine,
but with multiple regression we can throw in more variables,
like the weight of the car, to make the prediction more accurate
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import pandas as pd
import pathlib
import warnings
warnings.filterwarnings("ignore", category=Warning) # silence "fitted without feature names"
DIR = pathlib.Path(__file__).resolve().parent
with open(DIR / 'data/cars.csv') as file:
    df = pd.read_csv(file)
X = df[[
    'Weight',
    'Volume',
]].values
y = df['CO2'].values
r = LinearRegression().fit(X, y)
# Draw surface
fig = plt.figure()
Ax, Ay = np.meshgrid(
    np.linspace(df.Weight.min(), df.Weight.max(), 100),
    np.linspace(df.Volume.min(), df.Volume.max(), 100)
)
onlyX = pd.DataFrame({'Weight': Ax.ravel(), 'Volume': Ay.ravel()})
fittedY = r.predict(onlyX) # already an ndarray; no np.array() needed
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df['Weight'], df['Volume'], df['CO2'], c='g', marker='x', alpha=0.5)
ax.plot_surface(Ax, Ay, fittedY.reshape(Ax.shape), color='b', alpha=0.3)
ax.set_xlabel('Weight')
ax.set_ylabel('Volume')
ax.set_zlabel('CO2')
# Predictions
X = [1600, 1252] # Honda Civic / actual CO2: 94
y = r.predict([X]) # CO2: 101.5
print(y.round(1).item())
ax.plot(X[0], X[1], y[0], 'o', color='r')
X = [1200, 780] # ?
y = r.predict([X]) # CO2: 94.8
print(y.round(1).item())
ax.plot(X[0], X[1], y[0], 's', color='g')
plt.show()
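
The 3D listing never prints the fitted coefficients. A minimal sketch, assuming the same data/cars.csv layout as above, that reads the plane h(x, y) = ax + by + c off the fitted model:

""" Inspect the fitted plane h(x, y) = ax + by + c """
import pandas as pd
import pathlib
from sklearn.linear_model import LinearRegression
DIR = pathlib.Path(__file__).resolve().parent
df = pd.read_csv(DIR / 'data/cars.csv')
r = LinearRegression().fit(df[['Weight', 'Volume']].values, df['CO2'].values)
a, b = r.coef_.round(4) # one coefficient per input variable
c = r.intercept_.round(1)
print(f'h(x, y) = {a}x + {b}y + {c}')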

Multiple variables
The multiple-variable form of the hypothesis function.\(
h_\theta(x) = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + \cdots + \theta_n x_n
\)

""" Linear Regression (multiple parameters)
h(x) = ax + by + cz + ...
"""
from sklearn.linear_model import LinearRegression
import pandas as pd
import pathlib
DIR = pathlib.Path(__file__).resolve().parent
with open(DIR / 'data/real_estate.csv') as file:
    df = pd.read_csv(file)
X = df[[
    'X1 transaction date',
    'X2 house age',
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
    'X5 latitude',
    'X6 longitude',
]].values
y = df['Y house price of unit area'].values
r = LinearRegression().fit(X, y)
# Predictions
for X in [
    [2013.17, 13, 732.85, 0, 24.98, 121.53],   # price: 39 (train data)
    [2013.58, 16.6, 323.69, 6, 24.98, 121.54], # price: 51 (train data)
    [2013.17, 33, 732.85, 0, 24.98, 121.53],   # ?
]:
    print(r.predict([X]).round(1).item())
# 38.8
# 48.5
# 33.4
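
sklearn's score() returns R², the coefficient of determination, which summarizes how much of the variance the model explains. A minimal sketch, assuming the same CSV and column names as above:

""" Goodness of fit: R^2 on the training data """
import pandas as pd
import pathlib
from sklearn.linear_model import LinearRegression
DIR = pathlib.Path(__file__).resolve().parent
df = pd.read_csv(DIR / 'data/real_estate.csv')
X = df[['X1 transaction date', 'X2 house age',
        'X3 distance to the nearest MRT station',
        'X4 number of convenience stores',
        'X5 latitude', 'X6 longitude']].values
y = df['Y house price of unit area'].values
r = LinearRegression().fit(X, y)
print(round(r.score(X, y), 3)) # 1.0 = perfect fit, 0.0 = no better than predicting the mean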
Residuals
p65 Evaluate the model fit using the SSR (sum of squared residuals).
""" Linear Regression (evaluation)
A residual is the difference between the actual data point
and the predicted (by our model) value
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Training Dataset
X = np.array([30, 46, 60, 65, 77, 95]).reshape(6,1)
Y = np.array([31, 30, 80, 49, 70, 118])
# Learn a prediction function
r = LinearRegression().fit(X, Y)
a = r.coef_[0].round(1)
b = r.intercept_.round(1)
print(f'f(x) = {a}x + {b}') # f(x) = 1.3x + -18.0
# Evaluate the model
P = []   # predictions (on the training dataset)
R = []   # residuals
SSR = 0  # sum of squared residuals
for i in range(len(X)):
    P = np.append(P, b + a*X[i]) # use the learned coefficients
    R = np.append(R, Y[i] - P[i])
    SSR += R[i] ** 2
print(R) # 10 -11.8 20 -17.5 -12.1 12.5
print(f'SSR = {SSR.round(2).item()}') # SSR = 1248.15
# Draw graphics
fig, ax = plt.subplots()
plt.ylim(0, 140)
plt.xlim(0, 140)
ax.plot(X, Y, 'x', color='g', label='training data') # dataset points
ax.plot(X, a*X + b, label=f'h(x) = {b} + {a}x') # function line
for i in range(len(X)): # residuals
    ax.plot([X[i], X[i]], [P[i], Y[i]], '-', color='c')
plt.legend(), plt.show()
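
The loop above can be cross-checked vectorially with the model's own predictions; a minimal sketch on the same toy data. Note it uses the exact (unrounded) coefficients, and least squares minimizes the SSR, so the result comes out slightly below 1248.15.

""" Vectorized SSR cross-check """
import numpy as np
from sklearn.linear_model import LinearRegression
X = np.array([30, 46, 60, 65, 77, 95]).reshape(6, 1)
Y = np.array([31, 30, 80, 49, 70, 118])
r = LinearRegression().fit(X, Y)
R = Y - r.predict(X) # residual vector, one entry per training point
print((R ** 2).sum().round(2)) # slightly below 1248.15 (exact coefficients, not 1.3 and -18)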
