Advertisement
VisualPaul

Untitled

Apr 8th, 2016
357
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.98 KB | None | 0 0
import numpy as np
from geopy.distance import vincenty
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
  4. smallKMeans = KMeans(n_clusters=32).fit(array([[ 5.57193379e+01, 3.75042493e+01], [ 5.99548735e+01, 3.03414877e+01], [ 5.63016261e+01, 4.39558484e+01], [ 5.16780537e+01, 3.91903597e+01], [ -1.42108547e-14, 7.10542736e-15], [ 3.74616427e+01, 5.58080684e+01], [ 5.57937074e+01, 4.91418695e+01], [ 5.57978899e+01, 3.78080745e+01], [ 5.57613385e+01, 3.75826331e+01], [ 5.58280793e+01, 3.73813172e+01], [ 5.38598600e+01, 7.90272750e+01], [ 5.54182033e+01, 3.78960873e+01], [ 5.56160826e+01, 3.76962533e+01], [ 5.57240184e+01, 3.76440402e+01], [ 5.58629747e+01, 3.75753680e+01], [ 5.56272181e+01, 3.73043914e+01], [ 5.58686034e+01, 3.76740503e+01], [ 5.59518665e+01, 3.74080757e+01], [ 5.56780265e+01, 3.75789472e+01], [ 5.57753068e+01, 3.76353879e+01], [ 5.57929992e+01, 3.75085978e+01], [ 5.57305654e+01, 3.74187758e+01], [ 5.25622300e+01, 1.03875951e+02], [ 5.57015057e+01, 3.78589313e+01], [ 5.56551281e+01, 3.74975318e+01], [ 5.98439744e+01, 3.02816403e+01], [ 5.57762242e+01, 3.77056910e+01], [ 5.58036530e+01, 3.71794222e+01], [ 5.55786622e+01, 3.75734711e+01], [ 5.58036379e+01, 3.79957685e+01], [ 5.58540031e+01, 3.74722757e+01], [ 5.56906482e+01, 3.77552389e+01]]))
  5. holidays = "1.01,2.01,3.01,4.01,5.01,6.01,7.01,8.01,23.02,8.03,9.03,10.03,1.05,2.05,3.05,4.05,9.05,10.05,11.05,12.06,13.06,14.06,15.06".split(',')
  6. holidays = set(tuple(map(int, x.split('.'))) for x in holidays)
  7.  
  8. def get_features(data):
  9. dist = data.dist.values
  10. lat, lon = data.lat.values, data.lon.values
  11. weekday, month = data.day_of_week.values, data.month.values
  12. hourx, houry = cos(data.hour / 23), sin(data.hour / 23)
  13. hour = data.hour
  14. hota, hotb, hotc = zeros_like(hour, dtype=float32), zeros_like(hour, dtype=float32), zeros_like(hour, dtype=float32)
  15.  
  16. hota[(data.f_class == 'econom').values] += 1.00
  17. hota[(data.s_class == 'econom').values] += 0.50
  18. hota[(data.t_class == 'econom').values] += 0.25
  19.  
  20. hotb[(data.f_class == 'business').values] += 1.00
  21. hotb[(data.s_class == 'business').values] += 0.50
  22. hotb[(data.t_class == 'business').values] += 0.25
  23.  
  24. hotc[(data.f_class == 'vip').values] += 1.00
  25. hotc[(data.s_class == 'vip').values] += 0.50
  26. hotc[(data.t_class == 'vip').values] += 0.25
  27.  
  28. isHoliday = array([[1 if x in holidays else 0] for x in zip(data['day'], data['month'])])
  29. city = smallKMeans.predict(ds[['lat', 'lon']])
  30. smallClusters = OneHotEncoder().fit_transform(city.reshape(-1, 1)).toarray()
  31. cityDistance = array([vincenty(smallKMeans.cluster_centers_[city[i]], (data['lat'][i], data['lon'][i])).meters for i in range(len(data))])
  32. weekday = OneHotEncoder().fit_transform(ds['day_of_week'].reshape(-1, 1)).toarray()
  33. features = array(list(zip(dist, lat, lon, month, hourx, houry, hour, hota, hotb, hotc)))
  34. features = hstack((features, smallClusters, isHoliday, weekday, cityDistance.reshape(-1, 1)))
  35. return features
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement