Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- import matplotlib.pyplot as plt
- import seaborn as sns
- from sklearn.model_selection import train_test_split
# Load the ride data from the working directory. Bare expressions such as
# `df.head()` are notebook-style inspections; they display output in a
# notebook cell and have no effect when run as a plain script.
df = pd.read_csv('./uber.csv')
df.info()
df.shape
df.head()
df.isnull()

# Drop two columns that are not used anywhere later in this script.
df.drop(columns=["Unnamed: 0", "key"], inplace=True)
df.head()
df.isnull().sum()

# Fill missing drop-off coordinates (latitude with the column mean, longitude
# with the column median). The result is assigned back to the frame:
# calling `fillna(..., inplace=True)` on a column selection operates on an
# intermediate object, which pandas warns about and which does not update
# `df` when Copy-on-Write is enabled.
df['dropoff_latitude'] = df['dropoff_latitude'].fillna(df['dropoff_latitude'].mean())
df['dropoff_longitude'] = df['dropoff_longitude'].fillna(df['dropoff_longitude'].median())
df.dtypes

# Parse the pickup timestamp so its calendar parts can be extracted.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df.dtypes

# Expand the timestamp into separate numeric features.
pickup_parts = df['pickup_datetime'].dt
df = df.assign(
    hour=pickup_parts.hour,
    day=pickup_parts.day,
    month=pickup_parts.month,
    year=pickup_parts.year,
    dayofweek=pickup_parts.dayofweek,
)
df

# The raw timestamp is no longer needed once its parts are columns.
df = df.drop(columns=["pickup_datetime"])
df
- from math import *
def distance_formula(longitude1, latitude1, longitude2, latitude2, radius_km=6371):
    """Return haversine great-circle distances between paired coordinates.

    Args:
        longitude1: Sequence of start longitudes, in degrees.
        latitude1: Sequence of start latitudes, in degrees.
        longitude2: Sequence of end longitudes, in degrees.
        latitude2: Sequence of end latitudes, in degrees.
        radius_km: Sphere radius the central angle is scaled by. The default
            of 6371 matches the constant this function has always used.

    Returns:
        A list of floats, one distance per input position, in the same unit
        as ``radius_km``.
    """
    # Convert all four inputs to radian arrays so the whole computation runs
    # as array arithmetic instead of a Python-level loop over rows.
    lon1, lat1, lon2, lat2 = (
        np.radians(np.asarray(values, dtype=float))
        for values in (longitude1, latitude1, longitude2, latitude2)
    )
    half_dlat = (lat2 - lat1) / 2
    half_dlon = (lon2 - lon1) / 2
    a = np.sin(half_dlat) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (for
    # example near antipodal points), which would make the arcsine invalid.
    a = np.clip(a, 0.0, 1.0)
    central_angle = 2 * np.arcsin(np.sqrt(a))
    # Callers receive a plain list, as before.
    return (central_angle * radius_km).tolist()
# Compute the trip distance for every row from the pickup and drop-off
# coordinates and store it as a new column.
coordinate_columns = (
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
)
coordinate_arrays = [df[name].to_numpy() for name in coordinate_columns]
df['dist_travel_km'] = distance_formula(*coordinate_arrays)

# Draw one box plot per column on a 6x2 grid.
df.plot(kind="box", subplots=True, layout=(6, 2), figsize=(15, 20))
plt.show()
def remove_outlier(df1, col, whisker=1.5):
    """Clip one column of ``df1`` to its interquartile-range whisker bounds.

    Values below ``Q1 - whisker * IQR`` are raised to that bound and values
    above ``Q3 + whisker * IQR`` are lowered to it; no rows are removed.

    Args:
        df1: DataFrame holding the column. It is modified in place.
        col: Label of the column to clip.
        whisker: Multiple of the IQR used for the bounds (default 1.5).

    Returns:
        The same ``df1`` object, with ``col`` clipped.
    """
    q1 = df1[col].quantile(0.25)
    q3 = df1[col].quantile(0.75)
    iqr = q3 - q1
    lower_whisker = q1 - whisker * iqr
    upper_whisker = q3 + whisker * iqr
    # Write to the frame that was passed in, not to a module-level `df`,
    # so the function works on any DataFrame.
    df1[col] = np.clip(df1[col], lower_whisker, upper_whisker)
    return df1
def treat_outliers_all(df1, col_list):
    """Apply ``remove_outlier`` to each listed column of ``df1``.

    Args:
        df1: DataFrame to process.
        col_list: Iterable of column labels to treat.

    Returns:
        The frame returned by the last ``remove_outlier`` call, or ``df1``
        unchanged when ``col_list`` is empty.
    """
    for column in col_list:
        # Pass the frame given to this function rather than a module-level
        # `df`, so the helper works on any DataFrame.
        df1 = remove_outlier(df1, column)
    return df1
# Clip outliers in every column. Column labels are passed directly instead
# of a full DataFrame slice; iterating either yields the same labels.
df = treat_outliers_all(df, list(df.columns))

# Box plots after clipping, one per column on a 7x2 grid.
df.plot(kind="box", subplots=True, layout=(7, 2), figsize=(15, 20))
plt.show()

# Compute the correlation matrix once and reuse it for the heatmap.
corr = df.corr()
corr
fig, axis = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, annot=True, ax=axis)

# Feature matrix and target used by the models.
feature_columns = [
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
    'passenger_count', 'hour', 'day', 'month', 'year', 'dayofweek', 'dist_travel_km',
]
df_x = df[feature_columns]
df_y = df['fare_amount']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=1)
df
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Fit a linear regression on the training split and predict the held-out rows.
reg = LinearRegression()
reg.fit(x_train, y_train)
y_pred_lin = reg.predict(x_test)
print(y_pred_lin)

# Fit a random forest with 100 trees. The seed matches the one used for the
# train/test split so the forest, and therefore its metrics, are the same
# on every run.
rf = RandomForestRegressor(n_estimators=100, random_state=1)
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)
print(y_pred_rf)
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import r2_score

# Column order of the comparison table.
cols = ['Model', 'RMSE', 'R-Squared']

# Linear regression: root-mean-squared error and R^2 on the held-out rows.
reg_RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred_lin))
reg_squared = r2_score(y_test, y_pred_lin)
full_metrics = pd.Series({'Model': "Linear Regression", 'RMSE' : reg_RMSE, 'R-Squared' : reg_squared})

# Build the table from scratch each time. Re-running this code therefore
# never accumulates duplicate rows, and no row-dropping loop is needed.
result_tabulation = pd.DataFrame([full_metrics], columns=cols)
print(result_tabulation)

# Random forest: the same two metrics.
rf_RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred_rf))
rf_squared = r2_score(y_test, y_pred_rf)
full_metrics_rf = pd.Series({'Model': "Random Forest", 'RMSE': rf_RMSE, 'R-Squared': rf_squared})

# Keep both models in the table so they can be compared side by side.
result_tabulation = pd.DataFrame([full_metrics, full_metrics_rf], columns=cols)
print(result_tabulation)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement