Programming

  minte9
LearnRemember




Linear regression

Simple linear regression fits a model that predicts the output from a single input variable.
\( h_\theta = \theta_0 + \theta_1 x_1 \)
 
""" Linear Regression / one parameter
h(x) = ax + b

We find the line that best fits the data.
It is one of the most popular tools in statistics.
"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Training data: six (x, y) observations. The inputs are reshaped into a
# single column because the estimator expects a 2-D feature matrix.
X = np.array([30, 46, 60, 65, 77, 95]).reshape(-1, 1)
Y = np.array([31, 30, 80, 49, 70, 118])

# Fit the line, then keep slope and intercept rounded to one decimal
r = LinearRegression()
r.fit(X, Y)
a = np.round(r.coef_[0], 1)
b = np.round(r.intercept_, 1)

# Evaluate the line at a point that is not in the training set.
# NOTE: this deliberately uses the rounded a and b, so the printed value
# matches the printed formula rather than the unrounded model.
x1 = 80
y1 = a * x1 + b

print(f'f(x) = {a}x + {b}')
print(f'f({x1}) = {y1}')

# Plot: same 0..140 window on both axes
fig, ax = plt.subplots()
ax.set_ylim(0, 140)
ax.set_xlim(0, 140)

# training samples
ax.plot(X, Y, 'x', color='g', label='training data')
# predicted point
ax.plot(x1, y1, 'o', color='r', label=f'h({x1}) = {y1}')
# fitted line
ax.plot(X, a * X + b, label=f'h(x) = {b} + {a}x')

ax.legend()
plt.show()

"""
    f(x) = 1.3x + -18.0
    f(80) = 86.0
"""

Two variables

With multiple regression we can throw in more variables.
\( h_\theta = \theta_0 + \theta_1 x_1 + \theta_2 x_2 \)
 
""" Linear Regression  / two parameters
h(x) = ax + by + c

We can predict the CO2 emission of a car based on the size of the engine. 
With multiple regression we can throw in more variables, 
like the weight of the car, to make the prediction more accurate.
"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import pandas as pd
import pathlib

# fitted without feature names
import warnings  
warnings.filterwarnings("ignore", category=Warning)

# Training Dataset
# Column order matters: every feature vector handed to the model below must
# list Weight first and Volume second, exactly like the training matrix.
FEATURES = ['Weight', 'Volume']

DIR = pathlib.Path(__file__).resolve().parent
df = pd.read_csv(DIR / 'data/cars.csv')  # read_csv opens and closes the file itself
X = df[FEATURES].values
y = df['CO2'].values

# Learn a prediction function
r = LinearRegression().fit(X, y)

# Draw surface
fig = plt.figure()
Ax, Ay = np.meshgrid(
    np.linspace(df.Weight.min(), df.Weight.max(), 100),
    np.linspace(df.Volume.min(), df.Volume.max(), 100),
)
# The model was fitted on a plain ndarray, so predict on a plain ndarray too
# (columns in FEATURES order). Mixing in a DataFrame here is what triggers
# scikit-learn's feature-name mismatch warning.
onlyX = np.column_stack([Ax.ravel(), Ay.ravel()])
fittedY = r.predict(onlyX)

ax = fig.add_subplot(111, projection='3d')
ax.scatter(df['Weight'], df['Volume'], df['CO2'], c='g', marker='x', alpha=0.5)
ax.plot_surface(Ax, Ay, fittedY.reshape(Ax.shape), color='b', alpha=0.3)
ax.set_xlabel('Weight')
ax.set_ylabel('Volume')
ax.set_zlabel('CO2')

# Predictions (feature vectors are [Weight, Volume])
X1 = [1252, 1600]    # Honda Civic (row 13): Weight 1252, Volume 1600 / actual CO2: 94
y1 = r.predict([X1])

X2 = [1200, 780]     # Unknown car: Weight 1200, Volume 780
y2 = r.predict([X2])

print(df, "\n")
print(f"Honda Civic (Weight={X1[0]}, Volume={X1[1]}) / CO2:", y1.round(1).item())
print(f"Unknown car (Weight={X2[0]}, Volume={X2[1]}) / CO2:", y2.round(1).item())

# Mark both predictions on the plot (x axis = Weight, y axis = Volume)
ax.plot(X1[0], X1[1], y1[0], 'o', color='r')
ax.plot(X2[0], X2[1], y2[0], 's', color='g')

plt.show()

"""
                Car       Model  Volume  Weight  CO2
    0       Toyoty        Aygo    1000     790   99
    1   Mitsubishi  Space Star    1200    1160   95
    2        Skoda      Citigo    1000     929   95
    3         Fiat         500     900     865   90
    4         Mini      Cooper    1500    1140  105
    5           VW         Up!    1000     929  105
    6        Skoda       Fabia    1400    1109   90
    7     Mercedes     A-Class    1500    1365   92
    8         Ford      Fiesta    1500    1112   98
    9         Audi          A1    1600    1150   99
    10     Hyundai         I20    1100     980   99
    11      Suzuki       Swift    1300     990  101
    12        Ford      Fiesta    1000    1112   99
    13       Honda       Civic    1600    1252   94
    14      Hundai         I30    1600    1326   97
    15        Opel       Astra    1600    1330   97
    16         BMW           1    1600    1365   99
    17       Mazda           3    2200    1280  104
    18       Skoda       Rapid    1600    1119  104
    19        Ford       Focus    2000    1328  105
    20        Ford      Mondeo    1600    1584   94
    21        Opel    Insignia    2000    1428   99
    22    Mercedes     C-Class    2100    1365   99
    23       Skoda     Octavia    1600    1415   99
    24       Volvo         S60    2000    1415   99
    25    Mercedes         CLA    1500    1465  102
    26        Audi          A4    2000    1490  104
    27        Audi          A6    2000    1725  114
    28       Volvo         V70    1600    1523  109
    29         BMW           5    2000    1705  114
    30    Mercedes     E-Class    2100    1605  115
    31       Volvo        XC70    2000    1746  117
    32        Ford       B-Max    1600    1235  104
    33         BMW         216    1600    1390  108
    34        Opel      Zafira    1600    1405  109
    35    Mercedes         SLK    2500    1395  120

    Honda Civic (Weight=1252, Volume=1600) / CO2: 101.6
    Unknown car (Weight=1200, Volume=780) / CO2: 94.8
"""

Multiple variables

The general form of the hypothesis function for any number of input variables:
\( h_\theta = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + \cdots + \theta_n x_n \)
 
""" Linear Regression / multiple parameters
h(x) = ax + by + cz + ... 
"""

from os import X_OK
import numpy as np, sys
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import pandas as pd
import pathlib

# The CSV is looked up relative to this script's own directory
DIR = pathlib.Path(__file__).resolve().parent

# Feature columns, in the order the sample vectors below list their values
feature_columns = [
    'X1 transaction date',
    'X2 house age',
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
    'X5 latitude',
    'X6 longitude',
]
# Label column (the value being predicted)
label_column = 'Y house price of unit area'

with open(DIR / 'data/real_estate.csv') as file:
    df = pd.read_csv(file)

X = df[feature_columns].values
y = df[label_column].values

# Train the model
r = LinearRegression()
r.fit(X, y)

# Samples to score, one value per feature column
X1 = [2013.17, 13, 732.85, 0, 24.98, 121.53]     # price: 39 (train data)
X2 = [2013.58, 16.6, 323.69, 6, 24.98, 121.54]   # price: 51 (train data)
X3 = [2013.17, 33, 732.85, 0, 24.98, 121.53]     # ?

print(df, '\n')
for caption, sample in (
    ('Predict training item1, price =', X1),
    ('Predict training item2, price =', X2),
    ('Predict unknow item, price =', X3),
):
    # predict() wants a 2-D input, hence the single-row wrapper list
    price = r.predict([sample]).round(1).item()
    print(caption, price)

"""
          No  X1 transaction date  X2 house age  ...
    0      1             2012.917          32.0  ...
    1      2             2012.917          19.5  ...
    2      3             2013.583          13.3  ...
    3      4             2013.500          13.3  ...
    4      5             2012.833           5.0  ...
    ..   ...                  ...           ...  ...
    409  410             2013.000          13.7  ...
    410  411             2012.667           5.6  ...
    411  412             2013.250          18.8  ...
    412  413             2013.000           8.1  ...
    413  414             2013.500           6.5  ...

    [414 rows x 8 columns] 

    Predict training item1, price = 38.8
    Predict training item2, price = 48.5
    Predict unknow item,    price = 33.4
"""

Residuals

Evaluate how well the model fits the data using the sum of squared residuals (SSR).
 
""" Linear Regression / residuals

A residual is the difference between the actual data point 
and the predicted (by our model) value.
"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Training Dataset
X = np.array([30, 46, 60, 65, 77, 95]).reshape(6, 1)
Y = np.array([31, 30, 80, 49, 70, 118])

# Learn a prediction function (coefficients rounded to one decimal)
r = LinearRegression().fit(X, Y)
a = r.coef_[0].round(1)
b = r.intercept_.round(1)

# Evaluate the model
# Use the fitted (rounded) coefficients rather than hard-coded numbers, so
# the residuals always belong to the line that is printed and plotted.
P = a * X.ravel() + b   # predictions (on training dataset)
R = Y - P               # residuals: actual minus predicted
SSR = np.sum(R ** 2)    # sum of squared residuals

print(f'Prediction function: f(x) = {a}x + {b}') # f(x) = 1.3x - 18
print('Residuals:', R) # 10 -11.8 20 -17.5 -12.1 12.5
print(f'SSR = {SSR.round(2).item()}') # 1248.15

# Draw graphics
fig, ax = plt.subplots()
plt.ylim(0, 140)
plt.xlim(0, 140)

ax.plot(X, Y, 'x', color='g', label='training data')     # dataset points
ax.plot(X, a*X + b, label=f'h(x) = {b} + {a}x')          # function line
# One vertical segment per sample, from the predicted to the actual value
for x, predicted, actual in zip(X.ravel(), P, Y):
    ax.plot([x, x], [predicted, actual], '-', color='c')

plt.legend()
plt.show()

"""
    Prediction function: f(x) = 1.3x + -18.0
    Residuals: [10.  -11.8  20.  -17.5 -12.1  12.5]
    SSR = 1248.15
"""





References


Related