Advertisement
amu2002

uber using linear regression

Nov 20th, 2023
55
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.04 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. import matplotlib.pyplot as plt
  4. import seaborn as sns
  5. from sklearn.model_selection import train_test_split
  6.  
  7. df = pd.read_csv('./uber.csv')
  8. df.info()
  9.  
  10. df.shape
  11.  
  12. df.head()
  13.  
  14. df.isnull()
  15.  
  16. df.drop(columns=["Unnamed: 0", "key"], inplace=True)
  17. df.head()
  18.  
  19. df.isnull().sum()
  20.  
  21. df['dropoff_latitude'].fillna(value=df['dropoff_latitude'].mean(),inplace = True)
  22. df['dropoff_longitude'].fillna(value=df['dropoff_longitude'].median(),inplace = True)
  23.  
  24. df.dtypes
  25.  
  26. df.pickup_datetime = pd.to_datetime(df.pickup_datetime)
  27. df.dtypes
  28.  
  29. df = df.assign(hour = df.pickup_datetime.dt.hour,
  30.                day = df.pickup_datetime.dt.day,
  31.                month = df.pickup_datetime.dt.month,
  32.                year = df.pickup_datetime.dt.year,
  33.                dayofweek = df.pickup_datetime.dt.dayofweek)
  34.  
  35. df
  36.  
  37. df = df.drop(["pickup_datetime"], axis =1)
  38. df
  39.  
  40. from math import *
  41.  
  42. def distance_formula(longitude1, latitude1, longitude2, latitude2):
  43.     travel_dist = []
  44.  
  45.     for pos in range (len(longitude1)):
  46.         lon1, lan1, lon2, lan2 = map(radians, [longitude1[pos], latitude1[pos], longitude2[pos], latitude2[pos]])
  47.         dist_lon = lon2 - lon1
  48.         dist_lan = lan2 - lan1
  49.  
  50.         a = sin(dist_lan/2)**2 + cos(lan1) * cos(lan2) * sin(dist_lon/2)**2
  51.         c = 2 * asin(sqrt(a)) * 6371
  52.         travel_dist.append(c)
  53.  
  54.     return  travel_dist
  55.  
  56. df['dist_travel_km'] = distance_formula(df.pickup_longitude.to_numpy(), df.pickup_latitude.to_numpy(),
  57.                                         df.dropoff_longitude.to_numpy(), df.dropoff_latitude.to_numpy())
  58.  
  59. df.plot(kind = "box",subplots = True,layout = (6,2),figsize=(15,20))
  60. plt.show()
  61.  
  62. def remove_outlier(df1 , col):
  63.     Q1 = df1[col].quantile(0.25)
  64.     Q3 = df1[col].quantile(0.75)
  65.     IQR = Q3 - Q1
  66.     lower_whisker = Q1-1.5*IQR
  67.     upper_whisker = Q3+1.5*IQR
  68.     df[col] = np.clip(df1[col] , lower_whisker , upper_whisker)
  69.     return df1
  70.  
  71. def treat_outliers_all(df1 , col_list):
  72.     for c in col_list:
  73.         df1 = remove_outlier(df , c)
  74.     return df1
  75.  
  76. df = treat_outliers_all(df , df.iloc[: , 0::])
  77.  
  78. df.plot(kind = "box",subplots = True,layout = (7,2),figsize=(15,20))
  79. plt.show()
  80.  
  81. corr = df.corr()
  82. corr
  83.  
  84. fig,axis = plt.subplots(figsize = (10,6))
  85. sns.heatmap(df.corr(),annot = True)
  86.  
  87. df_x = df[['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude',
  88.            'passenger_count','hour','day','month','year','dayofweek','dist_travel_km']]
  89. df_y = df['fare_amount']
  90.  
  91. x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=1)
  92.  
  93. df
  94.  
  95. from sklearn.linear_model import LinearRegression
  96.  
  97. reg = LinearRegression()
  98.  
  99. reg.fit(x_train, y_train)
  100.  
  101. y_pred_lin = reg.predict(x_test)
  102. print(y_pred_lin)
  103.  
  104. from sklearn.ensemble import RandomForestRegressor
  105.  
  106. rf = RandomForestRegressor(n_estimators=100)
  107. rf.fit(x_train,y_train)
  108.  
  109. y_pred_rf = rf.predict(x_test)
  110. print(y_pred_rf)
  111.  
  112. cols = ['Model', 'RMSE', 'R-Squared']
  113.  
  114. result_tabulation = pd.DataFrame(columns = cols)
  115.  
  116. import pandas as pd
  117. import numpy as np
  118. from sklearn import metrics
  119. from sklearn.metrics import r2_score
  120.  
  121. result_tabulation = pd.DataFrame()
  122. reg_RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred_lin))
  123. reg_squared = r2_score(y_test, y_pred_lin)
  124.  
  125. full_metrics = pd.Series({'Model': "Linear Regression", 'RMSE' : reg_RMSE, 'R-Squared' : reg_squared})
  126. for i in range(0,len(result_tabulation)):
  127.     result_tabulation = result_tabulation.drop(i)
  128.  
  129. result_tabulation = pd.concat([result_tabulation, full_metrics.to_frame().transpose()], ignore_index=True)
  130.  
  131. print(result_tabulation)
  132.  
  133. rf_RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred_rf))
  134. rf_squared = r2_score(y_test, y_pred_rf)
  135.  
  136. full_metrics_rf = pd.Series({'Model': "Random Forest", 'RMSE': rf_RMSE, 'R-Squared': rf_squared})
  137.  
  138. for i in range(0,len(result_tabulation)):
  139.     result_tabulation = result_tabulation.drop(i)
  140.  
  141. result_tabulation = pd.concat([result_tabulation, full_metrics_rf.to_frame().transpose()], ignore_index=True)
  142.  
  143. print(result_tabulation)
  144.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement