#!/usr/bin/env python
# coding: utf-8
# In[10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# In[25]:
# loading the data
df = pd.read_csv("50_Startups-Assignment 5 (1).csv")
# In[83]:
print(df.shape)
df.describe()
# In[68]:
# Splitting the data into Regressor (the independent variables) and Target (the dependent variable)
Regressor = df.iloc[:, :-1].values
Target = df.iloc[:, 4].values
# In[27]:
# Looking at the dataset we can see that "State" is a string variable, and we cannot feed string
# values into our machine learning model. To overcome this we create dummy variables with the
# OneHotEncoder object, so we import ColumnTransformer along with OneHotEncoder.
# In[70]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), [3])],
    remainder='passthrough'
)
# Note: np.float was removed from recent NumPy releases; the builtin float works everywhere
Regressor = np.array(ct.fit_transform(Regressor), dtype=float)
# In[72]:
# Drop one dummy column to avoid the dummy variable trap (perfect multicollinearity)
Regressor = Regressor[:, 1:]
# In[73]:
# Splitting the dataset into training and test sets
from sklearn.model_selection import train_test_split
Regressor_train, Regressor_test, Target_train, Target_test = train_test_split(Regressor, Target, test_size=0.25, random_state=0)
# In[74]:
#Fitting Multiple Linear Regression to the Training Set
from sklearn.linear_model import LinearRegression
Model = LinearRegression()
Model.fit(Regressor_train, Target_train)
# In[75]:
# Predicted vector of the test set
Target_pred = Model.predict(Regressor_test)
# In[76]:
Target_pred
# In[86]:
#R-square
from sklearn.metrics import r2_score
r2_score(Target_test, Target_pred)
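# In[ ]:
# For clarity, a minimal sketch computing R-square directly from its definition,
# R^2 = 1 - SS_res / SS_tot; it should match the r2_score value above:
ss_res = np.sum((Target_test - Target_pred) ** 2)          # residual sum of squares
ss_tot = np.sum((Target_test - Target_test.mean()) ** 2)   # total sum of squares
print(1 - ss_res / ss_tot)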
# In[ ]:
# Now we will look at the prediction error with different metrics
# In[78]:
#mean squared error
from sklearn.metrics import mean_squared_error
mean_squared_error(Target_test,Target_pred)
# In[79]:
#RMSE
from math import sqrt
rms = sqrt(mean_squared_error(Target_test,Target_pred))
rms
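# In[ ]:
# Depending on the scikit-learn version, RMSE is also available directly: older releases accept
# mean_squared_error(..., squared=False), and version 1.4+ adds root_mean_squared_error.
# A hedged sketch covering both cases:
try:
    from sklearn.metrics import root_mean_squared_error  # scikit-learn >= 1.4
    print(root_mean_squared_error(Target_test, Target_pred))
except ImportError:
    print(mean_squared_error(Target_test, Target_pred, squared=False))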
# In[80]:
#MAE
from sklearn.metrics import mean_absolute_error
mean_absolute_error(Target_test,Target_pred)
# In[ ]:
#MinMax scaler
# In[81]:
from sklearn import preprocessing
mm_scaler = preprocessing.MinMaxScaler()
# Fit the scaler on the training set only, then reuse its statistics to transform the test set
Regressor_train_minmax = mm_scaler.fit_transform(Regressor_train)
Regressor_test_minmax = mm_scaler.transform(Regressor_test)
Regressor_test_minmax
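# In[ ]:
# A minimal sketch (mirroring the steps above) of bundling the scaler and the regression into a
# Pipeline, so the test set is always transformed with statistics learned from the training set:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

scaled_model = Pipeline([
    ('scaler', MinMaxScaler()),          # fitted on the training data only
    ('regression', LinearRegression()),  # trained on the scaled features
])
scaled_model.fit(Regressor_train, Target_train)
print(r2_score(Target_test, scaled_model.predict(Regressor_test)))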
# In[ ]:
# Ridge and Lasso
# Ridge regression is an extension of linear regression where the loss function is modified to
# penalize model complexity. This is done by adding a penalty term proportional to the square of
# the magnitude of the coefficients (the l2-norm). The loss function for Ridge regression is:
# Loss = OLS + alpha * sum(squared coefficient values)
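# In[ ]:
# An illustrative sketch (a hypothetical helper, not part of scikit-learn) spelling out the Ridge
# loss described above; e.g. after fitting: ridge_loss(Target_train, rr.predict(Regressor_train), rr.coef_, 0.01)
def ridge_loss(y_true, y_pred, coefficients, alpha):
    ols = np.sum((y_true - y_pred) ** 2)         # ordinary least squares term
    penalty = alpha * np.sum(coefficients ** 2)  # l2 penalty on the coefficient magnitudes
    return ols + penalty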
# In[82]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
# In[88]:
rr = Ridge(alpha=0.01)
rr.fit(Regressor_train, Target_train)
pred_train_rr = rr.predict(Regressor_train)
print(np.sqrt(mean_squared_error(Target_train, pred_train_rr)))
print(r2_score(Target_train, pred_train_rr))
pred_test_rr = rr.predict(Regressor_test)
print(np.sqrt(mean_squared_error(Target_test, pred_test_rr)))
print(r2_score(Target_test, pred_test_rr))
# Lasso regression, or the Least Absolute Shrinkage and Selection Operator, is also a modification
# of linear regression. In Lasso, the penalty term is proportional to the sum of the absolute values
# of the model coefficients (the l1-norm), which can shrink some coefficients exactly to zero.
# The loss function for Lasso regression is:
# Loss = OLS + alpha * sum(absolute coefficient values)
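# In[ ]:
# The matching sketch for the Lasso loss (again a hypothetical helper for illustration only):
def lasso_loss(y_true, y_pred, coefficients, alpha):
    ols = np.sum((y_true - y_pred) ** 2)            # ordinary least squares term
    penalty = alpha * np.sum(np.abs(coefficients))  # l1 penalty on the coefficient magnitudes
    return ols + penalty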
# In[90]:
model_lasso = Lasso(alpha=0.01)
model_lasso.fit(Regressor_train, Target_train)
pred_train_lasso = model_lasso.predict(Regressor_train)
print(np.sqrt(mean_squared_error(Target_train, pred_train_lasso)))
print(r2_score(Target_train, pred_train_lasso))
pred_test_lasso = model_lasso.predict(Regressor_test)
print(np.sqrt(mean_squared_error(Target_test, pred_test_lasso)))
print(r2_score(Target_test, pred_test_lasso))
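# In[ ]:
# The alpha=0.01 used above is an arbitrary choice. A minimal sketch of picking alpha by
# cross-validation with RidgeCV and LassoCV (the candidate alphas here are arbitrary too):
from sklearn.linear_model import RidgeCV, LassoCV

ridge_cv = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0]).fit(Regressor_train, Target_train)
lasso_cv = LassoCV(alphas=[0.01, 0.1, 1.0, 10.0], cv=5).fit(Regressor_train, Target_train)
print(ridge_cv.alpha_, lasso_cv.alpha_)  # the alpha each CV run selects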
# Conclusion:
# Linear Regression Model: test set RMSE of 35064.20 (in the same units as Profit) and R-square of 35.27 percent.
#
# Ridge Regression Model: test set RMSE of 35061.01 and R-square of 35.28 percent.
#
# Lasso Regression Model: test set RMSE of 35064.19 and R-square of 35.27 percent.
#
# All three models perform almost identically here: with alpha as small as 0.01, the penalty barely
# changes the ordinary least squares solution.
#
# In[ ]: