Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import pandas as pd
- from sklearn.model_selection import train_test_split, cross_val_score
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.metrics import precision_score, recall_score, f1_score
- from sklearn.preprocessing import LabelEncoder
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.metrics import classification_report
- from sklearn.model_selection import GridSearchCV
# Load the raw video-game sales data set.
vgsales = pd.read_csv("vgsales.csv")
#print(vgsales)

# Keep only the columns relevant for classification.
keep_cols = ["Rank", "Platform", "Year", "Genre", "Publisher",
             "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales", "Global_Sales"]
vgsales = vgsales.loc[:, vgsales.columns.isin(keep_cols)]
#print(vgsales)

# Drop rows containing missing values.
vgsales = vgsales.dropna()

# Binary target: genre 'Sports' -> 0, every other genre -> 1.
vgsales['Genre'] = np.where(vgsales['Genre'] == 'Sports', 0, 1)

# Encode the remaining string-valued columns as integers
# (one independent encoder per column).
platform_encoder = LabelEncoder()
vgsales['Platform'] = platform_encoder.fit_transform(vgsales['Platform'])
publisher_encoder = LabelEncoder()
vgsales['Publisher'] = publisher_encoder.fit_transform(vgsales['Publisher'])

# Alias kept because the modelling code below refers to it.
vgsales_test = vgsales
# Hold out a third of the data for testing the decision-tree baseline
# (the binarized Genre column is the label).
X_training, X_testing, y_training, y_testing = train_test_split(
    vgsales_test.drop(['Genre'], axis=1),
    vgsales_test['Genre'],
    test_size=0.33,
    random_state=42,
)

# Fit a single decision tree and predict on the held-out split.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_training, y_training)
y_predict = model.predict(X_testing)

# Evaluate the classifier on the test split.
precision = precision_score(y_testing, y_predict)
recall = recall_score(y_testing, y_predict)
f1 = f1_score(y_testing, y_predict)
for label, value in (('precision:', precision), ('recall:', recall), ('f1:', f1)):
    print(label, value)

# Depth the unconstrained tree actually grew to.
depth = model.tree_.max_depth
print("Depth:", depth)
# Fresh 70/30 split for the random-forest model.
T_training, T_testing, u_training, u_testing = train_test_split(
    vgsales_test.drop(['Genre'], axis=1),
    vgsales_test['Genre'],
    test_size=0.3,
    random_state=42,
)

# Fit an ensemble of trees and score it on the held-out 30%.
model2 = RandomForestClassifier(random_state=42)
model2.fit(T_training, u_training)
u_predict = model2.predict(T_testing)

precision_2 = precision_score(u_testing, u_predict)
recall_2 = recall_score(u_testing, u_predict)
f1_2 = f1_score(u_testing, u_predict)
for label, value in (('precision:', precision_2), ('recall:', recall_2), ('f1:', f1_2)):
    print(label, value)
# Hyper-parameter grid to search over for the random forest.
param_grid = {
    "n_estimators": [50, 100, 150, 200],
    "max_depth": [None, 5, 10, 15],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"],
}

# Exhaustive grid search with 3-fold cross-validation.
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, n_jobs=-1)
# Fit on the TRAINING split (the original comment wrongly said "test data").
grid_search.fit(T_training, u_training)

# BUG FIX: the original report scored u_predict — the predictions of the
# *untuned* model2 — so the grid search result was never evaluated.
# GridSearchCV refits the best estimator on the full training split by
# default, so predict through it and report on the held-out data.
best_predict = grid_search.predict(T_testing)
print(classification_report(u_testing, best_predict))
print("Best parameters:", grid_search.best_params_)
- '''
- #Случайный лес основан на использовании нескольких классификаторах решающего дерева, которые обучаются на некоторых подмножествах обучающей выборки, а потом дают независимые результаты,
- после их ответы объединяются, чтобы определить результат
- # Метрики precision, recall, F1
- 1) precision:
- P = TP / (TP + FP)
- 2) recall:
- R = TP / (TP + FN)
3) F1:
F1 = 2 * P * R / (P + R)
'''
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement