Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from sklearn.cluster import KMeans
- from sklearn.preprocessing import OneHotEncoder
- from geopy.distance import vincenty
# K-means model over 32 hard-coded reference coordinates, one per requested
# cluster. get_features() uses it to assign every row to its nearest centre.
# random_state is pinned so that the numbering of the cluster labels (and
# therefore the order of the one-hot cluster columns) is reproducible from
# one run to the next.
# NOTE(review): most rows look like (lat, lon) pairs, but row 6 appears to be
# swapped and row 5 is ~(0, 0) -- confirm these are intentional.
smallKMeans = KMeans(n_clusters=32, random_state=0).fit(array([
    [5.57193379e+01, 3.75042493e+01],
    [5.99548735e+01, 3.03414877e+01],
    [5.63016261e+01, 4.39558484e+01],
    [5.16780537e+01, 3.91903597e+01],
    [-1.42108547e-14, 7.10542736e-15],
    [3.74616427e+01, 5.58080684e+01],
    [5.57937074e+01, 4.91418695e+01],
    [5.57978899e+01, 3.78080745e+01],
    [5.57613385e+01, 3.75826331e+01],
    [5.58280793e+01, 3.73813172e+01],
    [5.38598600e+01, 7.90272750e+01],
    [5.54182033e+01, 3.78960873e+01],
    [5.56160826e+01, 3.76962533e+01],
    [5.57240184e+01, 3.76440402e+01],
    [5.58629747e+01, 3.75753680e+01],
    [5.56272181e+01, 3.73043914e+01],
    [5.58686034e+01, 3.76740503e+01],
    [5.59518665e+01, 3.74080757e+01],
    [5.56780265e+01, 3.75789472e+01],
    [5.57753068e+01, 3.76353879e+01],
    [5.57929992e+01, 3.75085978e+01],
    [5.57305654e+01, 3.74187758e+01],
    [5.25622300e+01, 1.03875951e+02],
    [5.57015057e+01, 3.78589313e+01],
    [5.56551281e+01, 3.74975318e+01],
    [5.98439744e+01, 3.02816403e+01],
    [5.57762242e+01, 3.77056910e+01],
    [5.58036530e+01, 3.71794222e+01],
    [5.55786622e+01, 3.75734711e+01],
    [5.58036379e+01, 3.79957685e+01],
    [5.58540031e+01, 3.74722757e+01],
    [5.56906482e+01, 3.77552389e+01],
]))
# Set of (day, month) integer tuples treated as holidays, parsed from a
# comma-separated list of "D.MM" tokens.
holidays = {
    tuple(int(part) for part in token.split('.'))
    for token in "1.01,2.01,3.01,4.01,5.01,6.01,7.01,8.01,23.02,8.03,9.03,10.03,1.05,2.05,3.05,4.05,9.05,10.05,11.05,12.06,13.06,14.06,15.06".split(',')
}
def get_features(data):
    """Build the numeric feature matrix for a table of records.

    Parameters
    ----------
    data
        Table (used here like a pandas DataFrame) with the columns ``dist``,
        ``lat``, ``lon``, ``day``, ``month``, ``day_of_week``, ``hour``,
        ``f_class``, ``s_class`` and ``t_class``.

    Returns
    -------
    2-D array with one row per input row and, in order, these columns:
    ``dist, lat, lon, month, cos(hour / 23), sin(hour / 23), hour``, the
    weighted ``econom`` / ``business`` / ``vip`` class scores, the one-hot
    cluster assignment, the holiday flag, the one-hot day of week, and the
    distance in meters to the assigned cluster centre.
    """
    dist = data.dist.values
    lat, lon = data.lat.values, data.lon.values
    weekday, month = data.day_of_week.values, data.month.values
    hour = data.hour.values

    # NOTE(review): hour / 23 spans only about one radian, so this is not a
    # full-circle cyclic encoding. Kept as-is because changing it would alter
    # the feature values any already-trained model relies on.
    hourx, houry = cos(hour / 23), sin(hour / 23)

    # Each class label is scored by where it appears among the first, second
    # and third class columns, with decreasing weight.
    class_weights = (('f_class', 1.00), ('s_class', 0.50), ('t_class', 0.25))
    scores = {}
    for label in ('econom', 'business', 'vip'):
        score = zeros_like(hour, dtype=float32)
        for column, weight in class_weights:
            score[(data[column] == label).values] += weight
        scores[label] = score

    # 1 when the row's (day, month) pair is in the module-level holiday set.
    isHoliday = array([
        1 if day_month in holidays else 0
        for day_month in zip(data['day'], data['month'])
    ]).reshape(-1, 1)

    # Nearest reference centre for every row. The argument is read from
    # `data`; the original referenced an unrelated global named `ds`.
    city = smallKMeans.predict(data[['lat', 'lon']].values)

    # NOTE(review): both encoders are fitted on this call's data, so the
    # number of one-hot columns depends on which clusters / weekdays are
    # actually present -- confirm train and predict inputs cover the same set.
    smallClusters = OneHotEncoder().fit_transform(city.reshape(-1, 1)).toarray()

    # Positional pairing of each row with its assigned centre, so the result
    # does not depend on the labels of the table's index.
    centres = smallKMeans.cluster_centers_[city]
    cityDistance = array([
        vincenty(centre, point).meters
        for centre, point in zip(centres, zip(lat, lon))
    ])

    # Reshape the underlying array: Series objects no longer offer .reshape.
    weekday = OneHotEncoder().fit_transform(weekday.reshape(-1, 1)).toarray()

    features = array([
        dist, lat, lon, month, hourx, houry, hour,
        scores['econom'], scores['business'], scores['vip'],
    ]).T
    features = hstack((
        features,
        smallClusters,
        isHoliday,
        weekday,
        cityDistance.reshape(-1, 1),
    ))
    return features
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement