Make datasets
Generate synthetic datasets for machine learning tasks.
""" The make_* functions in the sklearn.datasets module are used
to generate synthetic datasets for machine learning tasks.
Make blobs generates isotropic Gaussian blobs for clustering tasks.
Make classification generates a random n-class classification problem.
Make regression generates a random regression problem.
"""
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
# Make blob
features1, target1 = make_blobs(
n_samples = 100,
n_features = 2,
centers = 3, # three target classes
cluster_std = 0.5,
shuffle = True,
random_state = 1
)
# Make classification
features2, target2 = make_classification(
n_samples = 100,
n_features = 2,
n_informative = 2,
n_redundant = 0,
n_classes = 2,
weights = [.25, .75],
random_state = 1
)
# Make regression
features3, target3, coef3 = make_regression(
n_samples = 100,
n_features = 3,
n_informative = 3,
n_targets = 1,
noise = 0,
coef = True,
random_state = 1
)
# Plot blobs
plt.scatter(features1[:, 0], features1[:, 1], c=target1)
plt.title('Make blob - Simultated dataset')
plt.show()
plt.scatter(features2[:, 0], features2[:, 1], c=target2)
plt.title('Make classification - Simultated dataset')
plt.show()
plt.scatter(features3[:, 0], features3[:, 1], c=target3)
plt.title('Make regression - Simultated dataset')
plt.show()
print("Blob / Features[0:3]:\n", features1[0:3])
print("Target[:10]:", target1[:10], '\n')
print("Classification / Features[0:3]:\n", features2[0:3])
print("Target[:10]:", target2[:10], '\n')
print("Regression / Features[0:3]:\n", features3[0:3])
print("Target[:10]:\n", target3[:10], '\n')
"""
Blob / Features[0:3]:
[[ -1.22685609 3.25572052]
[ -9.57463218 -4.38310652]
[-10.71976941 -4.20558148]]
Target:[:10]: [0 1 1 1 2 2 2 1 0 0]
Classification / Features[0:3]:
[[ 1.30022717 -0.7856539 ]
[ 1.44184425 -0.56008554]
[-0.84792445 -1.36621324]]
Target:[:10]: [1 1 0 0 0 1 1 1 0 1]
Regression / Features[0:3]:
[[ 1.29322588 -0.61736206 -0.11044703]
[-2.793085 0.36633201 1.93752881]
[ 0.80186103 -0.18656977 0.0465673 ]]
Target[:10]:
[ -10.37865986 25.5124503 19.67705609 149.50205427 -121.65210879
90.29412996 214.01379719 224.74157328 -73.17331138 -195.62776209]
"""
Survey Simulation
Suppose, we have a survey among the employees of a company.
""" Regression Simulated Dataset (experience / salary)
Simulate the data for building a regression model.
Suppose, we have a survey among the employees of a company.
As a developer, often you have no access to survey data.
You need to simulate the data for building the regression model.
"""
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
# Sample dataset
X, y = make_regression(
n_samples = 100,
n_features = 1, # Employ years of experience
n_informative = 1,
n_targets = 1, # Employ's salary
noise = 10,
coef = False,
random_state = 0
)
# Scale feature and target
X = np.interp(X, (X.min(), X.max()), (0, 20))
y = np.interp(y, (y.min(), y.max()), (10000, 200000))
# Fit a linear regression model
reg = LinearRegression().fit(X, y)
# Plot dataset points
plt.scatter(X, y, label='training data')
plt.title('Simultated dataset (Experience / Salary)')
# Plot the regression line
x_line = np.linspace(np.min(X), np.max(X), 100)
y_line = reg.intercept_ + x_line * reg.coef_[0]
plt.plot(x_line, y_line, color='red', label='prediction')
plt.text(10, 25000, r'y = %0.2f + %0.2f x' % (reg.intercept_, reg.coef_[0]))
plt.xlabel('Years of experience')
plt.ylabel('Salary')
plt.legend()
plt.show()
# Print data frame
df = pd.DataFrame(data={'Experience': X.flatten(), 'Salary': y})
print(df.head(10))
"""
Experience Salary
0 9.096218 95224.004179
1 14.637429 132619.663538
2 12.255808 123760.689176
3 7.215160 98496.528556
4 6.905628 80966.199869
5 12.427999 138646.723320
6 6.534503 62290.952298
7 12.363590 129242.508929
8 11.451010 132720.442525
9 9.295277 93053.397973
"""
Last update: 308 days ago