Let's import all necessary libraries!
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import glob
Of the 105 months of available NYC Citi Bike data (June 2013 through February 2022), the first 80% (84 months, June 2013 to May 2020) will be kept for training and the remaining 20% (21 months, June 2020 to February 2022) for testing.
# Training window: one timestamp per month start ("MS"), from June 2013
# through May 2020 inclusive.
dates_trn = pd.date_range("2013-06-01", "2020-05-01", freq="MS")

# Single-column frame keyed by a default integer index; further columns are
# attached to it later in the notebook.
df_training = pd.Series(dates_trn, name="dates").to_frame()
df_training.head()
dates | |
---|---|
0 | 2013-06-01 |
1 | 2013-07-01 |
2 | 2013-08-01 |
3 | 2013-09-01 |
4 | 2013-10-01 |
# Test window: one timestamp per month start ("MS"), from June 2020
# through February 2022 inclusive.
dates_test = pd.date_range("2020-06-01", "2022-02-01", freq="MS")

# Single-column frame keyed by a default integer index; further columns are
# attached to it later in the notebook.
df_test = pd.Series(dates_test, name="dates").to_frame()
df_test.head()
dates | |
---|---|
0 | 2020-06-01 |
1 | 2020-07-01 |
2 | 2020-08-01 |
3 | 2020-09-01 |
4 | 2020-10-01 |
# Count the rows of every monthly CSV in the training folder and store the
# counts as the "trips" column of df_training.
path_trn = "data_bikes/training/"

# Files are processed in sorted filename order and matched to df_training
# rows by position.
# NOTE(review): assumes filenames sort chronologically - confirm.
all_files = sorted(glob.glob(path_trn + "*.csv"))

# Fail before any file is parsed if the folder does not hold exactly one
# file per month; otherwise the mismatch only shows up as a generic pandas
# length error after every file has been read.
if len(all_files) != len(df_training):
    raise ValueError(
        f"Expected {len(df_training)} monthly CSV files in {path_trn!r}, "
        f"found {len(all_files)}"
    )

# Only the number of rows is needed, so parse a single column instead of
# loading every column of each file into memory.
num_trips = [len(pd.read_csv(filename, usecols=[0])) for filename in all_files]

df_training["trips"] = num_trips
df_training
dates | trips | |
---|---|---|
0 | 2013-06-01 | 577703 |
1 | 2013-07-01 | 843416 |
2 | 2013-08-01 | 1001958 |
3 | 2013-09-01 | 1034359 |
4 | 2013-10-01 | 1037712 |
... | ... | ... |
79 | 2020-01-01 | 1240596 |
80 | 2020-02-01 | 1146830 |
81 | 2020-03-01 | 1068457 |
82 | 2020-04-01 | 682762 |
83 | 2020-05-01 | 1487890 |
84 rows × 2 columns
# Count the rows of every monthly CSV in the test folder and store the
# counts as the "trips" column of df_test.
path_tst = "data_bikes/test/"

# Files are processed in sorted filename order and matched to df_test rows
# by position.
# NOTE(review): assumes filenames sort chronologically - confirm.
all_files_02 = sorted(glob.glob(path_tst + "*.csv"))

# Fail before any file is parsed if the folder does not hold exactly one
# file per month; otherwise the mismatch only shows up as a generic pandas
# length error after every file has been read.
if len(all_files_02) != len(df_test):
    raise ValueError(
        f"Expected {len(df_test)} monthly CSV files in {path_tst!r}, "
        f"found {len(all_files_02)}"
    )

# Only the number of rows is needed, so parse a single column instead of
# loading every column of each file into memory.
num_trips_tst = [
    len(pd.read_csv(filename_tst, usecols=[0])) for filename_tst in all_files_02
]

df_test["trips"] = num_trips_tst
df_test
dates | trips | |
---|---|---|
0 | 2020-06-01 | 1882273 |
1 | 2020-07-01 | 2105808 |
2 | 2020-08-01 | 2329514 |
3 | 2020-09-01 | 2488225 |
4 | 2020-10-01 | 2248869 |
5 | 2020-11-01 | 1736704 |
6 | 2020-12-01 | 1088929 |
7 | 2021-01-01 | 1095346 |
8 | 2021-02-01 | 649983 |
9 | 2021-03-01 | 1531094 |
10 | 2021-04-01 | 2067669 |
11 | 2021-05-01 | 2724165 |
12 | 2021-06-01 | 3177517 |
13 | 2021-07-01 | 3084537 |
14 | 2021-08-01 | 3072478 |
15 | 2021-09-01 | 3280221 |
16 | 2021-10-01 | 3069239 |
17 | 2021-11-01 | 2159283 |
18 | 2021-12-01 | 1748287 |
19 | 2022-01-01 | 1052349 |
20 | 2022-02-01 | 1233714 |
def _first_row_as_header(raw):
    """Return *raw* transposed, with its first transposed row used as labels.

    The frame is transposed, the first row of the transposed frame becomes
    the column labels, the index is replaced by a default integer index and
    the row that supplied the labels (position 0) is dropped, so the result
    is indexed from 1.
    """
    flipped = raw.transpose()
    flipped.columns = flipped.iloc[0]
    return flipped.reset_index(drop=True).drop(0)


# Raw weather tables as stored on disk.
# NOTE(review): the displayed result suggests one row per year with a leading
# "Year" column followed by twelve monthly columns - confirm against the CSVs.
rain_raw = pd.read_csv("weather_data/monthly_rainfall_average_nyc.csv")
temp_raw = pd.read_csv("weather_data/monthly_average_temp_nyc.csv")

# Same reshaping for both tables.
rain = _first_row_as_header(rain_raw)
temp = _first_row_as_header(temp_raw)
rain
Year | 2013.0 | 2014.0 | 2015.0 | 2016.0 | 2017.0 | 2018.0 | 2019.0 | 2020.0 | 2021.0 | 2022.0 |
---|---|---|---|---|---|---|---|---|---|---|
1 | 2.76 | 2.79 | 5.23 | 4.41 | 4.83 | 2.18 | 3.58 | 1.93 | 2.31 | 4.29 |
2 | 4.25 | 5.48 | 2.04 | 4.40 | 2.48 | 5.83 | 3.14 | 2.54 | 5.13 | 3.23 |
3 | 2.90 | 3.67 | 4.72 | 1.17 | 5.25 | 5.17 | 3.87 | 3.78 | 3.41 | 0.00 |
4 | 1.31 | 7.85 | 2.08 | 1.61 | 3.84 | 5.78 | 4.55 | 4.49 | 2.69 | 0.00 |
5 | 8.00 | 4.37 | 1.86 | 3.75 | 6.38 | 3.53 | 6.82 | 1.65 | 4.36 | 0.00 |
6 | 10.10 | 4.26 | 4.79 | 2.60 | 4.76 | 3.11 | 5.46 | 1.76 | 2.62 | 0.00 |
7 | 2.84 | 5.59 | 3.98 | 7.02 | 4.19 | 7.45 | 5.77 | 6.58 | 11.09 | 0.00 |
8 | 2.85 | 2.25 | 2.35 | 1.97 | 3.34 | 8.59 | 3.70 | 5.03 | 10.32 | 0.00 |
9 | 2.95 | 1.21 | 3.28 | 2.79 | 2.00 | 6.19 | 0.95 | 3.94 | 10.03 | 0.00 |
10 | 0.36 | 5.77 | 3.91 | 4.15 | 4.18 | 3.59 | 6.15 | 5.05 | 5.26 | 0.00 |
11 | 3.15 | 4.51 | 2.01 | 5.41 | 1.58 | 7.62 | 1.95 | 3.99 | 1.12 | 0.00 |
12 | 4.85 | 6.04 | 4.72 | 2.89 | 2.21 | 6.51 | 7.09 | 4.61 | 1.39 | 0.00 |
# Export the transposed weather tables, reload the per-split weather series,
# attach them to the trip frames and save the final datasets.
#
# All paths are relative to the project root, matching the relative paths
# used for the other reads in this notebook ("data_bikes/...",
# "weather_data/..."), so the notebook no longer depends on one user's home
# directory.
rain.to_csv('weather_data/rain_m.csv', index = False, header = True)
temp.to_csv('weather_data/temp_m.csv', index = False, header = True)

# NOTE(review): the four files below are not written anywhere in the visible
# code; presumably they are prepared outside this notebook from the exports
# above - confirm.
rain_training = pd.read_csv('weather_data/rain_training.csv')
rain_test = pd.read_csv('weather_data/rain_test.csv')
# NOTE(review): "temp_traing.csv" looks like a misspelling of
# "temp_training.csv"; kept unchanged because it must match the name on disk.
temp_training = pd.read_csv('weather_data/temp_traing.csv')
temp_test = pd.read_csv('weather_data/temp_test.csv')

# Column assignment aligns on the index: rows of the weather series are
# matched to rows of the trip frames by index label, and any label missing
# from the weather series yields NaN rather than an error.
df_training['Rain'] = rain_training['month_average_training']
df_training['Temperature'] = temp_training['monthly_av_training']
df_test['Rain'] = rain_test['month_average_test']
df_test['Temperature'] = temp_test['monthly_av_test']

# Persist the final training and test tables without the integer index.
df_training.to_csv('training.csv', index = False, header = True)
df_test.to_csv('test.csv', index = False, header = True)