Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import math
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
- from sklearn import tree
- from sklearn.preprocessing import OneHotEncoder
# Opening banner, then load the weather observations and echo them.
# NOTE(review): the banner announces "Entropy - Information Gain" although the
# first section printed afterwards is the GINI index -- confirm this is wanted.
for banner_line in (
    "********************************",
    "Entropy - Information Gain",
    "********************************",
):
    print(banner_line)

# Feature columns that are compared against the "Play" target further down.
names = ["Outlook", "Temperature", "Humidity"]

# The file is read relative to the current working directory.
weather = pd.read_csv("./weather.txt")

print("DataFrame: ")
print(weather, end="\n\n")
# ************************************************************************
# GINI Index
# ************************************************************************
print("********************************")
print("GINI Index")
print("********************************")


def _frequency_tables(feature, target):
    """Return the three crosstabs used to score a split on ``feature``.

    Returns a tuple ``(counts, shares, weights)``:
      * ``counts``  -- absolute class counts per feature value,
      * ``shares``  -- the same table normalised per row, i.e. the class
        distribution inside every feature value,
      * ``weights`` -- fraction of all samples that has each feature value.
    """
    counts = pd.crosstab(feature, target)
    shares = pd.crosstab(feature, target, normalize="index")
    weights = pd.crosstab(feature, target, normalize="all").sum(axis=1)
    return counts, shares, weights


def _print_table(label, table):
    """Print ``label = `` followed by the table and an empty line."""
    print(label + " = ")
    print(table)
    print()


def _gini_impurity(class_shares):
    """Return the Gini impurity ``1 - sum(p ** 2)`` of a single node.

    ``class_shares`` is an iterable holding the relative frequency of every
    class inside the node; any number of classes is supported.
    """
    impurity = 1
    for share in class_shares:
        impurity -= share ** 2
    return impurity


def _gini_of_split(shares, weights):
    """Return ``(per_value, total)`` for a split described by two tables.

    ``per_value`` is a Series with the impurity of every feature value and
    ``total`` is their sum weighted by ``weights``.  Every row of ``shares``
    takes part in the sum, so a feature with more than two values is scored
    correctly instead of silently dropping the extra values.
    """
    per_value = shares.apply(_gini_impurity, axis=1)
    total = sum(
        weights.loc[value] * per_value.loc[value] for value in per_value.index
    )
    return per_value, total


# GINI Index - Outlook
# freq / freqSum are reused by the entropy section further down.
absfreq, freq, freqSum = _frequency_tables(weather.Outlook, weather.Play)
_print_table("absfreq", absfreq)
_print_table("freq", freq)
_print_table("freqSum", freqSum)
GINI_by_outlook, GINI_Outlook = _gini_of_split(freq, freqSum)
GINI_Sunny = GINI_by_outlook.loc["Sunny"]
GINI_Rainy = GINI_by_outlook.loc["Rainy"]
print("GINI_Sunny = " + str(GINI_Sunny))
print("GINI_Rainy = " + str(GINI_Rainy))
print("GINI_Outlook = " + str(GINI_Outlook))
print()

# GINI Index - Temperature
# temp / tempSum are reused by the entropy section further down.
abstemp, temp, tempSum = _frequency_tables(weather.Temperature, weather.Play)
_print_table("abstemp", abstemp)
_print_table("temp", temp)
_print_table("tempSum", tempSum)
GINI_by_temperature, GINI_Temperature = _gini_of_split(temp, tempSum)
GINI_Hot = GINI_by_temperature.loc["Hot"]
GINI_Cool = GINI_by_temperature.loc["Cool"]
print("GINI_Hot = " + str(GINI_Hot))
print("GINI_Cool = " + str(GINI_Cool))
print("GINI_Temperature = " + str(GINI_Temperature))
print()

# GINI Index - Humidity
# hum / humSum are reused by the entropy section further down.
abshum, hum, humSum = _frequency_tables(weather.Humidity, weather.Play)
_print_table("abshum", abshum)
_print_table("hum", hum)
_print_table("humSum", humSum)
GINI_by_humidity, GINI_Humidity = _gini_of_split(hum, humSum)
GINI_High = GINI_by_humidity.loc["High"]
GINI_Low = GINI_by_humidity.loc["Low"]
print("GINI_High = " + str(GINI_High))
print("GINI_Low = " + str(GINI_Low))
print("GINI_Humidity = " + str(GINI_Humidity))
print()

# Summary: the feature with the LOWEST weighted impurity gives the best split.
GINIs = [GINI_Outlook, GINI_Temperature, GINI_Humidity]
GINIs_df = pd.DataFrame(GINIs, names)
print(GINIs_df)
print()
MIN = min(GINIs)
MIN_INDEX = GINIs.index(MIN)
print("The most appropriate feature for classification with GINI is '" + names[MIN_INDEX] + "' with GINI = " + str(MIN))
print()
print()
print()
# ************************************************************************
# Entropy - Information Gain
# ************************************************************************
print("********************************")
print("Entropy - Information Gain")
print("********************************")


def _entropy(class_shares):
    """Return the Shannon entropy, in bits, of a single node.

    ``class_shares`` is an iterable holding the relative frequency of every
    class inside the node.  A share of 0 contributes nothing (the limit of
    ``p * log2(p)`` for ``p -> 0`` is 0); skipping it also keeps a pure node
    from raising ``ValueError`` in ``math.log2(0)``.
    """
    entropy = 0.0
    for share in class_shares:
        if share > 0:
            entropy -= share * math.log2(share)
    return entropy


def _information_gain(parent_entropy, shares, weights):
    """Return ``(per_value, gain)`` for a split described by two tables.

    ``shares`` is a row-normalised crosstab (one row per feature value, one
    column per class) and ``weights`` the fraction of all samples in each
    row.  ``per_value`` is a Series with the entropy of every feature value;
    ``gain`` is ``parent_entropy`` minus their weighted sum over all values.
    """
    per_value = shares.apply(_entropy, axis=1)
    remainder = sum(
        weights.loc[value] * per_value.loc[value] for value in per_value.index
    )
    return per_value, parent_entropy - remainder


# First, the entropy of the whole data set (class distribution of "Play").
freq_tot = pd.crosstab("Play", weather.Play, normalize="index")
print(freq_tot)
# Work on the scalar shares of the single "Play" row instead of handing a
# one-element Series to math.log2.
Entropy_All = _entropy(freq_tot.loc["Play"])
print("Entropy_All = " + str(Entropy_All))
print()

# Entropy - Outlook (freq / freqSum come from the GINI section)
Entropy_by_outlook, GAIN_Outlook = _information_gain(Entropy_All, freq, freqSum)
Entropy_Sunny = Entropy_by_outlook.loc["Sunny"]
Entropy_Rainy = Entropy_by_outlook.loc["Rainy"]
print("Entropy_Sunny = " + str(Entropy_Sunny))
print("Entropy_Rainy = " + str(Entropy_Rainy))
print("GAIN_Outlook = " + str(GAIN_Outlook))
print()

# Entropy - Temperature (temp / tempSum come from the GINI section)
Entropy_by_temperature, GAIN_Temperature = _information_gain(Entropy_All, temp, tempSum)
Entropy_Hot = Entropy_by_temperature.loc["Hot"]
Entropy_Cool = Entropy_by_temperature.loc["Cool"]
print("Entropy_Hot = " + str(Entropy_Hot))
print("Entropy_Cool = " + str(Entropy_Cool))
print("GAIN_Temperature = " + str(GAIN_Temperature))
print()

# Entropy - Humidity (hum / humSum come from the GINI section)
Entropy_by_humidity, GAIN_Humidity = _information_gain(Entropy_All, hum, humSum)
Entropy_High = Entropy_by_humidity.loc["High"]
Entropy_Low = Entropy_by_humidity.loc["Low"]
print("Entropy_High = " + str(Entropy_High))
print("Entropy_Low = " + str(Entropy_Low))
print("GAIN_Humidity = " + str(GAIN_Humidity))
print()

# Summary: unlike the GINI index, information gain is to be MAXIMISED -- the
# best feature is the one that removes the most entropy.
GAINs = [GAIN_Outlook, GAIN_Temperature, GAIN_Humidity]
GAINs_df = pd.DataFrame(GAINs, names)
print(GAINs_df)
print()
MAX = max(GAINs)
MAX_INDEX = GAINs.index(MAX)
print("The most appropriate feature for classification with GAIN is '" + names[MAX_INDEX] + "' with GAIN = " + str(MAX))
print()
print()
print()
# ************************************************************************
# Tree Creation
# ************************************************************************
# Encoder: the tree needs numeric input, so every categorical feature is
# expanded into one 0/1 column per value.
feature_columns = ["Outlook", "Temperature", "Humidity"]
try:
    # Keyword used by current scikit-learn releases.
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    # Older scikit-learn releases only know the former keyword name.
    encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
encoder.fit(weather.loc[:, feature_columns])
transformed = encoder.transform(weather.loc[:, feature_columns])

# Classification with trees
clf = tree.DecisionTreeClassifier()
clf = clf.fit(transformed, weather.loc[:, "Play"])

# Plots
fig = plt.figure(figsize=(10, 9))
# Take the class labels from the fitted model so that their order always
# matches the order the classifier uses internally.
tree.plot_tree(clf, class_names=[str(label) for label in clf.classes_], filled=True)
plt.show()

# Text representation
text_representation = tree.export_text(clf)
print(text_representation)

# Prediction of new data
# The temperature values referenced elsewhere in this script are "Hot" and
# "Cool".  The sample previously used "Cold", which handle_unknown="ignore"
# encodes as all zeros without any warning, so the prediction was made for a
# row with no temperature information at all.
new_data = pd.DataFrame({"Outlook": ["Sunny"], "Temperature": ["Cool"], "Humidity": ["High"]})
transformed_new_data = encoder.transform(new_data)
print(clf.predict(transformed_new_data))
print(clf.predict_proba(transformed_new_data))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement