import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
123)
np.random.seed('ignore')
warnings.filterwarnings(%matplotlib inline
Linear Regression Walkthrough with Seoul Bike Sharing Dataset and Google Colab
LINEAR REGRESSION TUTORIAL BY RUVIMBO MAMBINGE
Abstract: The dataset contains count of public bikes rented at each hour in Seoul Bike haring System with the corresponding Weather data and Holidays information
This notebook covers:
- Loading the data
- Simple EDA and feature enginnering
- Data preprocessing and data wrangling
- Creating a simple model
- Evaluation
Import Data
Importing Data is a processing of allowing one to move data from its location to your notebook. From here data is being imported from google drive to the current notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()= GoogleAuth()
gauth = GoogleCredentials.get_application_default()
gauth.credentials = GoogleDrive(gauth) drive
= 'https://drive.google.com/file/d/1CPx8P4n8Tu6Da2QMJzkUIEsiEm79AJXy/view?usp=sharing' # The shareable link link
Preview the first five rows of the train set
# to get the id part of the file
# id = link.split("/")[-2]
= drive.CreateFile({'id':id})
downloaded 'SeoulBikeData.csv')
downloaded.GetContentFile(
= pd.read_csv('SeoulBikeData.csv' , encoding='mac_roman',)
df 10) df.head(
Date | Rented Bike Count | Hour | Temperature(∞C) | Humidity(%) | Wind speed (m/s) | Visibility (10m) | Dew point temperature(∞C) | Solar Radiation (MJ/m2) | Rainfall(mm) | Snowfall (cm) | Seasons | Holiday | Functioning Day | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 01/12/2017 | 254 | 0 | -5.2 | 37 | 2.2 | 2000 | -17.6 | 0.00 | 0.0 | 0.0 | Winter | No Holiday | Yes |
1 | 01/12/2017 | 204 | 1 | -5.5 | 38 | 0.8 | 2000 | -17.6 | 0.00 | 0.0 | 0.0 | Winter | No Holiday | Yes |
2 | 01/12/2017 | 173 | 2 | -6.0 | 39 | 1.0 | 2000 | -17.7 | 0.00 | 0.0 | 0.0 | Winter | No Holiday | Yes |
3 | 01/12/2017 | 107 | 3 | -6.2 | 40 | 0.9 | 2000 | -17.6 | 0.00 | 0.0 | 0.0 | Winter | No Holiday | Yes |
4 | 01/12/2017 | 78 | 4 | -6.0 | 36 | 2.3 | 2000 | -18.6 | 0.00 | 0.0 | 0.0 | Winter | No Holiday | Yes |
5 | 01/12/2017 | 100 | 5 | -6.4 | 37 | 1.5 | 2000 | -18.7 | 0.00 | 0.0 | 0.0 | Winter | No Holiday | Yes |
6 | 01/12/2017 | 181 | 6 | -6.6 | 35 | 1.3 | 2000 | -19.5 | 0.00 | 0.0 | 0.0 | Winter | No Holiday | Yes |
7 | 01/12/2017 | 460 | 7 | -7.4 | 38 | 0.9 | 2000 | -19.3 | 0.00 | 0.0 | 0.0 | Winter | No Holiday | Yes |
8 | 01/12/2017 | 930 | 8 | -7.6 | 37 | 1.1 | 2000 | -19.8 | 0.01 | 0.0 | 0.0 | Winter | No Holiday | Yes |
9 | 01/12/2017 | 490 | 9 | -6.5 | 27 | 0.5 | 1928 | -22.4 | 0.23 | 0.0 | 0.0 | Winter | No Holiday | Yes |
# Check the shape of the train set
df.shape
(8760, 14)
#Description of data
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 8760 non-null object
1 Rented Bike Count 8760 non-null int64
2 Hour 8760 non-null int64
3 Temperature(∞C) 8760 non-null float64
4 Humidity(%) 8760 non-null int64
5 Wind speed (m/s) 8760 non-null float64
6 Visibility (10m) 8760 non-null int64
7 Dew point temperature(∞C) 8760 non-null float64
8 Solar Radiation (MJ/m2) 8760 non-null float64
9 Rainfall(mm) 8760 non-null float64
10 Snowfall (cm) 8760 non-null float64
11 Seasons 8760 non-null object
12 Holiday 8760 non-null object
13 Functioning Day 8760 non-null object
dtypes: float64(6), int64(4), object(4)
memory usage: 958.2+ KB
None
df.describe()
Rented Bike Count | Hour | Temperature(∞C) | Humidity(%) | Wind speed (m/s) | Visibility (10m) | Dew point temperature(∞C) | Solar Radiation (MJ/m2) | Rainfall(mm) | Snowfall (cm) | |
---|---|---|---|---|---|---|---|---|---|---|
count | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 |
mean | 704.602055 | 11.500000 | 12.882922 | 58.226256 | 1.724909 | 1436.825799 | 4.073813 | 0.569111 | 0.148687 | 0.075068 |
std | 644.997468 | 6.922582 | 11.944825 | 20.362413 | 1.036300 | 608.298712 | 13.060369 | 0.868746 | 1.128193 | 0.436746 |
min | 0.000000 | 0.000000 | -17.800000 | 0.000000 | 0.000000 | 27.000000 | -30.600000 | 0.000000 | 0.000000 | 0.000000 |
25% | 191.000000 | 5.750000 | 3.500000 | 42.000000 | 0.900000 | 940.000000 | -4.700000 | 0.000000 | 0.000000 | 0.000000 |
50% | 504.500000 | 11.500000 | 13.700000 | 57.000000 | 1.500000 | 1698.000000 | 5.100000 | 0.010000 | 0.000000 | 0.000000 |
75% | 1065.250000 | 17.250000 | 22.500000 | 74.000000 | 2.300000 | 2000.000000 | 14.800000 | 0.930000 | 0.000000 | 0.000000 |
max | 3556.000000 | 23.000000 | 39.400000 | 98.000000 | 7.400000 | 2000.000000 | 27.200000 | 3.520000 | 35.000000 | 8.800000 |
=['O']) df.describe(include
Date | Seasons | Holiday | Functioning Day | |
---|---|---|---|---|
count | 8760 | 8760 | 8760 | 8760 |
unique | 365 | 4 | 2 | 2 |
top | 13/11/2018 | Summer | No Holiday | Yes |
freq | 24 | 2208 | 8328 | 8465 |
Check whether there are null values in the data set.
sum() df.isnull().
Date 0
Rented Bike Count 0
Hour 0
Temperature(∞C) 0
Humidity(%) 0
Wind speed (m/s) 0
Visibility (10m) 0
Dew point temperature(∞C) 0
Solar Radiation (MJ/m2) 0
Rainfall(mm) 0
Snowfall (cm) 0
Seasons 0
Holiday 0
Functioning Day 0
dtype: int64
Data preparation
This where we transform our raw data.
- Convert Categorical values to numeric using Label Encoder conversion
- Convert Date to a Date format that brings out insights when visualisation is done and makes it easy for algorithms to understand and brings out insights when visualisation is done.
- One Hot Encoding conversion, creating dummy variables to convert categorical into numeric values
#import preprocessing module
from sklearn.preprocessing import LabelEncoder
# Label Encoder conversion
= LabelEncoder()
le "Holiday"] = le.fit_transform(df["Holiday"])
df["Functioning Day"] = le.fit_transform(df["Functioning Day"]) df[
df.head()
Date | Rented Bike Count | Hour | Temperature(∞C) | Humidity(%) | Wind speed (m/s) | Visibility (10m) | Dew point temperature(∞C) | Solar Radiation (MJ/m2) | Rainfall(mm) | Snowfall (cm) | Seasons | Holiday | Functioning Day | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 01/12/2017 | 254 | 0 | -5.2 | 37 | 2.2 | 2000 | -17.6 | 0.0 | 0.0 | 0.0 | Winter | 1 | 1 |
1 | 01/12/2017 | 204 | 1 | -5.5 | 38 | 0.8 | 2000 | -17.6 | 0.0 | 0.0 | 0.0 | Winter | 1 | 1 |
2 | 01/12/2017 | 173 | 2 | -6.0 | 39 | 1.0 | 2000 | -17.7 | 0.0 | 0.0 | 0.0 | Winter | 1 | 1 |
3 | 01/12/2017 | 107 | 3 | -6.2 | 40 | 0.9 | 2000 | -17.6 | 0.0 | 0.0 | 0.0 | Winter | 1 | 1 |
4 | 01/12/2017 | 78 | 4 | -6.0 | 36 | 2.3 | 2000 | -18.6 | 0.0 | 0.0 | 0.0 | Winter | 1 | 1 |
'Date'] = pd.to_datetime(df['Date'])
df[= df.assign(
df =df.Date.dt.day,
day=df.Date.dt.month,
month=df.Date.dt.year) year
df.head()
Date | Rented Bike Count | Hour | Temperature(∞C) | Humidity(%) | Wind speed (m/s) | Visibility (10m) | Dew point temperature(∞C) | Solar Radiation (MJ/m2) | Rainfall(mm) | Snowfall (cm) | Seasons | Holiday | Functioning Day | day | month | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017-01-12 | 254 | 0 | -5.2 | 37 | 2.2 | 2000 | -17.6 | 0.0 | 0.0 | 0.0 | Winter | 1 | 1 | 12 | 1 | 2017 |
1 | 2017-01-12 | 204 | 1 | -5.5 | 38 | 0.8 | 2000 | -17.6 | 0.0 | 0.0 | 0.0 | Winter | 1 | 1 | 12 | 1 | 2017 |
2 | 2017-01-12 | 173 | 2 | -6.0 | 39 | 1.0 | 2000 | -17.7 | 0.0 | 0.0 | 0.0 | Winter | 1 | 1 | 12 | 1 | 2017 |
3 | 2017-01-12 | 107 | 3 | -6.2 | 40 | 0.9 | 2000 | -17.6 | 0.0 | 0.0 | 0.0 | Winter | 1 | 1 | 12 | 1 | 2017 |
4 | 2017-01-12 | 78 | 4 | -6.0 | 36 | 2.3 | 2000 | -18.6 | 0.0 | 0.0 | 0.0 | Winter | 1 | 1 | 12 | 1 | 2017 |
# One Hot Encoding conversion, creating dummy variables to convert categorical into numeric values
= list(df.select_dtypes(include=['object']).columns)
Seasons = pd.get_dummies(df[Seasons], prefix=Seasons)
dummies =1, inplace = True)
df.drop(Seasons, axis
= pd.concat([df, dummies], axis = 1) df
df.head()
Date | Rented Bike Count | Hour | Temperature(∞C) | Humidity(%) | Wind speed (m/s) | Visibility (10m) | Dew point temperature(∞C) | Solar Radiation (MJ/m2) | Rainfall(mm) | Snowfall (cm) | Holiday | Functioning Day | day | month | year | Seasons_Autumn | Seasons_Spring | Seasons_Summer | Seasons_Winter | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017-01-12 | 254 | 0 | -5.2 | 37 | 2.2 | 2000 | -17.6 | 0.0 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
1 | 2017-01-12 | 204 | 1 | -5.5 | 38 | 0.8 | 2000 | -17.6 | 0.0 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
2 | 2017-01-12 | 173 | 2 | -6.0 | 39 | 1.0 | 2000 | -17.7 | 0.0 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
3 | 2017-01-12 | 107 | 3 | -6.2 | 40 | 0.9 | 2000 | -17.6 | 0.0 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
4 | 2017-01-12 | 78 | 4 | -6.0 | 36 | 2.3 | 2000 | -18.6 | 0.0 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
=df.drop(['Date'], axis=1) df
20) df.head(
Rented Bike Count | Hour | Temperature(∞C) | Humidity(%) | Wind speed (m/s) | Visibility (10m) | Dew point temperature(∞C) | Solar Radiation (MJ/m2) | Rainfall(mm) | Snowfall (cm) | Holiday | Functioning Day | day | month | year | Seasons_Autumn | Seasons_Spring | Seasons_Summer | Seasons_Winter | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 254 | 0 | -5.2 | 37 | 2.2 | 2000 | -17.6 | 0.00 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
1 | 204 | 1 | -5.5 | 38 | 0.8 | 2000 | -17.6 | 0.00 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
2 | 173 | 2 | -6.0 | 39 | 1.0 | 2000 | -17.7 | 0.00 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
3 | 107 | 3 | -6.2 | 40 | 0.9 | 2000 | -17.6 | 0.00 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
4 | 78 | 4 | -6.0 | 36 | 2.3 | 2000 | -18.6 | 0.00 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
5 | 100 | 5 | -6.4 | 37 | 1.5 | 2000 | -18.7 | 0.00 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
6 | 181 | 6 | -6.6 | 35 | 1.3 | 2000 | -19.5 | 0.00 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
7 | 460 | 7 | -7.4 | 38 | 0.9 | 2000 | -19.3 | 0.00 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
8 | 930 | 8 | -7.6 | 37 | 1.1 | 2000 | -19.8 | 0.01 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
9 | 490 | 9 | -6.5 | 27 | 0.5 | 1928 | -22.4 | 0.23 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
10 | 339 | 10 | -3.5 | 24 | 1.2 | 1996 | -21.2 | 0.65 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
11 | 360 | 11 | -0.5 | 21 | 1.3 | 1936 | -20.2 | 0.94 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
12 | 449 | 12 | 1.7 | 23 | 1.4 | 2000 | -17.2 | 1.11 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
13 | 451 | 13 | 2.4 | 25 | 1.6 | 2000 | -15.6 | 1.16 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
14 | 447 | 14 | 3.0 | 26 | 2.0 | 2000 | -14.6 | 1.01 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
15 | 463 | 15 | 2.1 | 36 | 3.2 | 2000 | -11.4 | 0.54 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
16 | 484 | 16 | 1.2 | 54 | 4.2 | 793 | -7.0 | 0.24 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
17 | 555 | 17 | 0.8 | 58 | 1.6 | 2000 | -6.5 | 0.08 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
18 | 862 | 18 | 0.6 | 66 | 1.4 | 2000 | -5.0 | 0.00 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
19 | 600 | 19 | 0.0 | 77 | 1.7 | 2000 | -3.5 | 0.00 | 0.0 | 0.0 | 1 | 1 | 12 | 1 | 2017 | 0 | 0 | 0 | 1 |
# Look at correlation with target. Correlation explains how one or more variables are related to each other.
# Here we checking how the target variable relates to other variables.
= df.corr()
corr_matrix "Rented Bike Count"].sort_values(ascending=False) corr_matrix[
Rented Bike Count 1.000000
Temperature(∞C) 0.538558
Hour 0.410257
Dew point temperature(∞C) 0.379788
Seasons_Summer 0.296549
Solar Radiation (MJ/m2) 0.261837
year 0.215162
Functioning Day 0.203943
Visibility (10m) 0.199280
Wind speed (m/s) 0.121108
Seasons_Autumn 0.102753
Holiday 0.072338
month 0.070861
day 0.046849
Seasons_Spring 0.022888
Rainfall(mm) -0.123074
Snowfall (cm) -0.141804
Humidity(%) -0.199780
Seasons_Winter -0.424925
Name: Rented Bike Count, dtype: float64
#Separate training features from target
= df.drop(['Rented Bike Count', 'Humidity(%)', 'Seasons_Winter', 'Snowfall (cm)',
X_train 'Rainfall(mm)', 'Holiday','month','day','Seasons_Spring'], axis=1)
= df['Rented Bike Count'] y_train
# Split train_data
from sklearn.model_selection import train_test_split
= train_test_split(
train_X, test_X, train_y, test_y =0.3)
X_train, y_train, test_size train_X.shape
(6132, 10)
Create a Model
- Used Linear Regression to create the model.
#import regression algorithm here
from sklearn.linear_model import LinearRegression
# create models
= LinearRegression()
lg_model
#fitting the models
lg_model.fit(X_train, y_train)
LinearRegression()
Train the Model
# import evaluation metrics
from sklearn import metrics
# evaluate the model
= lg_model.predict(test_X)
lg_y_pred
# print the coefficients
print(lg_model.intercept_)
print(lg_model.coef_)
-37427.781096961895
[ 2.64280800e+01 5.71179303e+01 1.47637751e+01 4.79006794e-02
-2.66713058e+01 -8.01931235e+01 9.37059471e+02 1.79459806e+01
1.82507218e+02 -4.70284541e+01]
# pair the feature names with the coefficients
list(zip(X_train, lg_model.coef_))
[('Hour', 26.4280800075682),
('Temperature(∞C)', 57.11793034670109),
('Wind speed (m/s)', 14.763775070526737),
('Visibility (10m)', 0.0479006793964835),
('Dew point temperature(∞C)', -26.67130584610349),
('Solar Radiation (MJ/m2)', -80.19312354619825),
('Functioning Day', 937.0594705456832),
('year', 17.945980589708437),
('Seasons_Autumn', 182.50721754158653),
('Seasons_Summer', -47.02845406960871)]
Evaluation
# Get Mean Absolute Error, Root Mean Squared Error
print('Mean Absolute Error:', metrics.mean_absolute_error(test_y, lg_y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(test_y, lg_y_pred)))
Mean Absolute Error: 333.6620242647687
Root Mean Squared Error: 442.28942454735244
Items with a higher coeffcient have better chances of bikes being hired
#Calculating coefficients
from pandas import Series, DataFrame
= DataFrame(X_train.columns)
coeff 'Coefficient Estimate']= Series(lg_model.coef_)
coeff[ coeff
0 | Coefficient Estimate | |
---|---|---|
0 | Hour | 26.428080 |
1 | Temperature(∞C) | 57.117930 |
2 | Wind speed (m/s) | 14.763775 |
3 | Visibility (10m) | 0.047901 |
4 | Dew point temperature(∞C) | -26.671306 |
5 | Solar Radiation (MJ/m2) | -80.193124 |
6 | Functioning Day | 937.059471 |
7 | year | 17.945981 |
8 | Seasons_Autumn | 182.507218 |
9 | Seasons_Summer | -47.028454 |
MODEL EVALUATION
#Calculating r-square
lg_model.score(test_X, test_y)
0.5139926793868411
Making Predictions
= lg_model.predict(test_X) y_pred
= pd.DataFrame({'Actual': test_y, 'Predicted': y_pred})
df 10) df.head(
Actual | Predicted | |
---|---|---|
3771 | 249 | 432.883930 |
2680 | 752 | 856.066439 |
4280 | 2194 | 796.015953 |
1409 | 178 | 591.771773 |
340 | 69 | -165.616933 |
8140 | 107 | 376.548148 |
4900 | 375 | 858.975057 |
1352 | 322 | -198.832826 |
6117 | 1337 | 1271.321273 |
299 | 251 | 127.903183 |