Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from sklearn.cluster import KMeans
- from sklearn.preprocessing import OneHotEncoder
- from geopy.distance import vincenty
- from sklearn.neighbors import BallTree
# Coarse "which region is this point in" model: three reference (lat, lon)
# points, one cluster per point. With exactly three distinct samples and
# three clusters, every sample ends up as its own cluster centre.
# random_state is pinned so that the cluster label assigned to each reference
# point (and therefore the order of the one-hot columns derived from it) is
# the same on every run instead of depending on the random initialisation.
smallKMeans = KMeans(n_clusters=3, random_state=0).fit(array([
    [55.73008165, 37.59531199],
    [59.91301691, 30.31944249],
    [55.67814337, 46.11249841],
]))
# Holiday calendar as a set of (day, month) integer pairs.
# The literal is a comma-separated list of "D.MM" tokens; each token is split
# on the dot and both halves are converted to int, so "9.05" -> (9, 5).
holidays = {
    tuple(int(part) for part in token.split('.'))
    for token in "1.01,2.01,3.01,4.01,5.01,6.01,7.01,8.01,23.02,8.03,9.03,10.03,1.05,2.05,3.05,4.05,9.05,10.05,11.05,12.06,13.06,14.06,15.06".split(',')
}
def _class_affinity(data, class_name):
    """Return a float32 score per row for one service class.

    A row gets 1.00 if `class_name` is its `f_class`, 0.50 if it is its
    `s_class` and 0.25 if it is its `t_class`; the contributions add up.
    """
    score = zeros_like(data.hour.values, dtype=float32)
    for column, weight in (('f_class', 1.00), ('s_class', 0.50), ('t_class', 0.25)):
        score[(data[column] == class_name).values] += weight
    return score


def get_features(data):
    """Build the numeric feature matrix for a table of records.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns `dist`, `lat`, `lon`, `day`, `month`,
        `day_of_week`, `hour`, `f_class`, `s_class` and `t_class`.

    Returns
    -------
    numpy.ndarray
        One row per input row. Columns, in order: dist, lat, lon, month,
        cos(hour / 23), sin(hour / 23), hour, the 'econom' / 'business' /
        'vip' class scores, one-hot cluster membership from `smallKMeans`,
        holiday flag, one-hot day of week, distance in metres to the
        assigned cluster centre, and the summed distance in metres to the
        nearest rows of the same table.

    Notes
    -----
    Relies on the module-level `smallKMeans` and `holidays` objects and on
    numpy names (`array`, `cos`, ...) being available in the module
    namespace.
    """
    dist = data.dist.values
    lat, lon = data.lat.values, data.lon.values
    month = data.month.values
    hour = data.hour.values

    # NOTE(review): hour / 23 spans only about one radian, so this is not a
    # full-circle encoding of the hour. Left unchanged because altering it
    # would change the features existing models were trained on - confirm.
    hourx, houry = cos(hour / 23), sin(hour / 23)

    hota = _class_affinity(data, 'econom')
    hotb = _class_affinity(data, 'business')
    hotc = _class_affinity(data, 'vip')

    # Column vector: 1 where the row's (day, month) pair is a listed holiday.
    isHoliday = array(
        [[1 if date in holidays else 0] for date in zip(data['day'], data['month'])]
    )

    # (n, 2) array of positions, taken by position so the frame's index
    # labels do not matter.
    coord = data[['lat', 'lon']].values

    # Use the frame passed in, not a global: assign each row to a cluster.
    city = smallKMeans.predict(coord)
    # NOTE(review): the encoder is fitted on this call's data, so the number
    # of one-hot columns depends on which values occur in `data` - confirm
    # that training and prediction frames cover the same values.
    smallClusters = OneHotEncoder().fit_transform(city.reshape(-1, 1)).toarray()

    # NOTE(review): `vincenty` is not available in newer geopy releases
    # (`geodesic` is the replacement) - confirm the installed version.
    centers = smallKMeans.cluster_centers_
    cityDistance = array([
        vincenty(tuple(centers[label]), (row_lat, row_lon)).meters
        for label, row_lat, row_lon in zip(city, lat, lon)
    ])

    # Nearest rows are selected by the tree's default metric on raw
    # (lat, lon) values; the distances that get summed are geodesic metres.
    # Each row is its own nearest neighbour and contributes 0.
    ballTree = BallTree(coord)
    neighbours = min(5, len(coord))  # query() rejects k larger than the table
    # query() returns (distances, indices); the indices are what is needed.
    _, neighbour_idx = ballTree.query(coord, neighbours)
    # A list (not a generator) is summed so the result is the same whether
    # `sum` is the builtin or numpy's.
    sumDist = array([
        sum([vincenty(point, coord[j]).meters for j in row])
        for point, row in zip(coord, neighbour_idx)
    ])

    weekday = OneHotEncoder().fit_transform(
        data['day_of_week'].values.reshape(-1, 1)
    ).toarray()

    features = array(list(zip(dist, lat, lon, month, hourx, houry, hour, hota, hotb, hotc)))
    features = hstack((
        features,
        smallClusters,
        isHoliday,
        weekday,
        cityDistance.reshape(-1, 1),
        sumDist.reshape(-1, 1),
    ))
    return features
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement