PROGRAMMING

  minte9
learningjourney




S R Q

Linear regression

p61 Fit a linear regression model on a single variable.
\( h_\theta = \theta_0 + \theta_1 x_1 \)
 
""" Linear Regression (one parameter)
h(x) = ax + b
Finding the line that best fits the data is known as ...
linear regression (one of the most popular tools in statistics)
"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# --- Training data: one feature column, one target value per sample ---
X = np.array([30, 46, 60, 65, 77, 95]).reshape(-1, 1)
Y = np.array([31, 30, 80, 49, 70, 118])


# --- Fit the line, keeping slope and intercept rounded to one decimal ---
r = LinearRegression()
r.fit(X, Y)
a = r.coef_[0].round(1)
b = r.intercept_.round(1)

print(f'f(x) = {a}x + {b}')  # prints: f(x) = 1.3x + -18.0

# --- Estimate the target for an input that is not in the training set ---
x1 = 80
y1 = a*x1 + b  # uses the rounded coefficients, matching the printed formula
print(f'f({x1}) = {y1}')  # prints: f(80) = 86.0


# --- Plot the samples, the fitted line and the estimated point ---
fig, ax = plt.subplots()
ax.set_xlim(0, 140)
ax.set_ylim(0, 140)

# Training samples as green crosses
ax.plot(X, Y, 'x', color='g', label='training data')
# Fitted line evaluated at the training inputs
ax.plot(X, a*X + b, label=f'h(x) = {b} + {a}x')
# Estimated point as a red dot
ax.plot(x1, y1, 'o', color='r', label=f'h({x1}) = {y1}')

ax.legend()
plt.show()

Two variables

With multiple regression we can throw in more variables.
\( h_\theta = \theta_0 + \theta_1 x_1 + \theta_2 x_2 \)
 
""" Linear Regression (two parameters)
h(x) = ax + by + c
We can predict the CO2 emission of a car based on the size of the engine, 
but with multiple regression we can throw in more variables, 
like the weight of the car, to make the prediction more accurate
"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import pandas as pd
import pathlib
import warnings  # fitted without feature names
warnings.filterwarnings("ignore", category=Warning)

# Load the cars dataset stored next to this script.
# pd.read_csv opens and closes the file on its own, so no extra open() handle
# is created.
DIR = pathlib.Path(__file__).resolve().parent
df = pd.read_csv(DIR / 'data/cars.csv')

# Feature columns, in the order the model is trained on.
FEATURES = ['Weight', 'Volume']
X = df[FEATURES].values  # plain array: the model is fitted without feature names
y = df['CO2'].values

r = LinearRegression().fit(X, y)

# Draw surface
fig = plt.figure()
Ax, Ay = np.meshgrid(
    np.linspace(df.Weight.min(), df.Weight.max(), 100),
    np.linspace(df.Volume.min(), df.Volume.max(), 100),
)
# Predict on a plain (n, 2) array in the same column order as FEATURES.
# Passing a DataFrame with column names to a model fitted on bare arrays is
# what raises the "fitted without feature names" warning.
onlyX = np.column_stack([Ax.ravel(), Ay.ravel()])
fittedY = r.predict(onlyX)

ax = fig.add_subplot(111, projection='3d')
ax.scatter(df['Weight'], df['Volume'], df['CO2'], c='g', marker='x', alpha=0.5)
ax.plot_surface(Ax, Ay, fittedY.reshape(Ax.shape), color='b', alpha=0.3)
ax.set_xlabel('Weight')
ax.set_ylabel('Volume')
ax.set_zlabel('CO2')

# Predictions: each entry is ([weight, volume], marker, color).
# Separate names are used so the training X / y are not overwritten.
for sample, marker, color in [
    ([1600, 1252], 'o', 'r'),  # Honda Civic, actual CO2: 94 / predicted: 101.5
    ([1200, 780], 's', 'g'),   # unknown car / predicted: 94.8
]:
    co2 = r.predict([sample])
    print(co2.round(1).item())
    ax.plot(sample[0], sample[1], co2[0], marker, color=color)

plt.show()

Multiple variables

The multiple-variable form of the hypothesis function.
\( h_\theta = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + \cdots + \theta_n x_n \)
 
""" Linear Regression (multiple parameters)
h(x) = ax + by + cz + ... 
"""

from os import X_OK
import numpy as np, sys
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import pandas as pd
import pathlib

# Load the real-estate dataset stored next to this script.
# Only the CSV parsing needs the open file handle; it is released before the
# model is fitted.
DIR = pathlib.Path(__file__).resolve().parent
with open(DIR / 'data/real_estate.csv') as file:
    df = pd.read_csv(file)

# Feature columns, in the order the model is trained on.
FEATURES = [
    'X1 transaction date',
    'X2 house age',
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
    'X5 latitude',
    'X6 longitude',
]
X = df[FEATURES].values
y = df['Y house price of unit area'].values
r = LinearRegression().fit(X, y)

# Predictions: one value per feature, in FEATURES order.
# The loop variable has its own name so the training matrix X stays intact.
for sample in [
    [2013.17, 13, 732.85, 0, 24.98, 121.53],     # price: 39 (train data)
    [2013.58, 16.6, 323.69, 6, 24.98, 121.54],   # price: 51 (train data)
    [2013.17, 33, 732.85, 0, 24.98, 121.53],     # ?
]:
    print(r.predict([sample]).round(1).item())
    # 38.8
    # 48.5
    # 33.4
Residuals

Residuals

p65 Evaluate the model fit using the sum of squared residuals (SSR).
 
""" Linear Regression (evaluation)
A residual is the difference between the actual data point 
and the predicted (by our model) value
"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Training Dataset: one feature column, one target value per sample
X = np.array([30, 46, 60, 65, 77, 95]).reshape(-1, 1)
Y = np.array([31, 30, 80, 49, 70, 118])

# Learn a prediction function (slope and intercept rounded to one decimal)
r = LinearRegression().fit(X, Y)
a = r.coef_[0].round(1)
b = r.intercept_.round(1)
print(f'f(x) = {a}x + {b}') # f(x) = 1.3x + -18.0

# Evaluate the model on the training dataset.
# The fitted (rounded) a and b are used instead of hard-coded numbers, so the
# evaluation always matches the line that was printed and plotted.
P = a*X.ravel() + b   # predictions, one per training sample
R = Y - P             # residuals: actual minus predicted
SSR = np.sum(R ** 2)  # sum of squared residuals

print(R) # 10 -11.8 20 -17.5 -12.1 12.5
print(f'SSR = {SSR.round(2).item()}') # 1248.15

# Draw graphics
fig, ax = plt.subplots()
plt.ylim(0, 140)
plt.xlim(0, 140)

ax.plot(X, Y, 'x', color='g', label='training data')     # dataset points
ax.plot(X, a*X + b, label=f'h(x) = {b} + {a}x')          # function line
# One vertical segment per sample, from the predicted to the actual value
for x_i, p_i, y_i in zip(X.ravel(), P, Y):               # residuals
    ax.plot([x_i, x_i], [p_i, y_i], '-', color='c')

plt.legend()
plt.show()

Questions    
Last update: 2 days ago
Scikit, Basis Expansion