makispaiktis

Kaggle 4 - Overfitting and underfitting

Jul 13th, 2023
import matplotlib.pyplot as plt
# Set Matplotlib defaults
# (on Matplotlib >= 3.6 this style is named 'seaborn-v0_8-whitegrid')
plt.style.use('seaborn-whitegrid')
plt.rc('figure', autolayout=True)
plt.rc('axes', labelweight='bold', labelsize='large', titleweight='bold', titlesize=18, titlepad=10)
plt.rc('animation', html='html5')

import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GroupShuffleSplit
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping



# 0. Auxiliary Function
def group_split(X, y, group, train_size=0.75):
    """
    A grouped split that keeps all of an artist's songs in one split or the
    other, to prevent signal leakage between training and validation.
    """
    splitter = GroupShuffleSplit(train_size=train_size)
    train, test = next(splitter.split(X, y, groups=group))
    return (X.iloc[train], X.iloc[test], y.iloc[train], y.iloc[test])
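
# Illustrative sanity check, not part of the original exercise: with a grouped
# split, every group must land entirely in train or entirely in test.
# The _demo_* names are hypothetical, made up just for this demonstration.
_demo_X = pd.DataFrame({'feature': range(6)})
_demo_y = pd.Series(range(6))
_demo_groups = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'])
_dX_tr, _dX_te, _dy_tr, _dy_te = group_split(_demo_X, _demo_y, _demo_groups, train_size=0.5)
assert set(_demo_groups[_dX_tr.index]).isdisjoint(set(_demo_groups[_dX_te.index]))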



# 1. Read the dataset
spotify = pd.read_csv('../input/dl-course-data/spotify.csv')
X = spotify.copy().dropna()
y = X.pop('track_popularity')

# Target and predictors (12 numerical features and 1 categorical feature)
artists = X['track_artist']
features_num = ['danceability', 'energy', 'key', 'loudness', 'mode',
                'speechiness', 'acousticness', 'instrumentalness',
                'liveness', 'valence', 'tempo', 'duration_ms']
features_cat = ['playlist_genre']


# 2. Preprocessor for all the columns (scaler + one-hot encoder)
preprocessor = make_column_transformer((StandardScaler(), features_num),
                                       (OneHotEncoder(), features_cat))
X_train, X_valid, y_train, y_valid = group_split(X, y, artists)

X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)
# Popularity is on a 0-100 scale, so rescale it to 0-1
y_train = y_train / 100
y_valid = y_valid / 100

input_shape = [X_train.shape[1]]
print("Input shape: {}".format(input_shape))
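
# Optional inspection, not in the original exercise (assumes scikit-learn >= 1.0,
# where ColumnTransformer provides get_feature_names_out): the transformed
# columns are the 12 scaled numeric features plus one one-hot indicator column
# per playlist_genre category.
print(preprocessor.get_feature_names_out())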



# 3. Create a linear model with 1 neuron and plot the losses
model = keras.Sequential([layers.Dense(1, input_shape=input_shape)])

model.compile(optimizer='adam', loss='mae')

history = model.fit(X_train, y_train,
                    validation_data=(X_valid, y_valid),
                    batch_size=512,
                    epochs=50,
                    verbose=0)  # suppress the per-epoch output

history_df = pd.DataFrame(history.history)
history_df.loc[0:, ['loss', 'val_loss']].plot()
print("Minimum Validation Loss: {:0.4f}".format(history_df['val_loss'].min()))



# 4. Zoom in after the 10th epoch
history_df.loc[10:, ['loss', 'val_loss']].plot()
print("Minimum Validation Loss: {:0.4f}".format(history_df['val_loss'].min()))
print("The gap between these curves is quite small and the validation loss never increases, so it's more likely that the network is underfitting than overfitting. It would be worth experimenting with more capacity to see if that's the case.", end="\n\n\n")
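
# Illustrative check, not part of the original exercise: a single number for
# the diagnosis above, the train/validation gap at the final epoch.
final_gap = history_df['val_loss'].iloc[-1] - history_df['loss'].iloc[-1]
print("Final train/validation gap: {:0.4f}".format(final_gap))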



# 5. Create a more complex network with more neurons
model = keras.Sequential([layers.Dense(128, activation='relu', input_shape=input_shape),
                          layers.Dense(64, activation='relu'),
                          layers.Dense(1)])
model.compile(optimizer='adam', loss='mae')

history = model.fit(X_train, y_train,
                    validation_data=(X_valid, y_valid),
                    batch_size=512,
                    epochs=50)

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
print("Minimum Validation Loss: {:0.4f}".format(history_df['val_loss'].min()))
print("Now the validation loss begins to rise very early, while the training loss continues to decrease. This indicates that the network has begun to overfit. At this point, we would need to try something to prevent it, either by reducing the number of units or through a method like early stopping.", end="\n\n\n")



# 6. Create an EarlyStopping callback
early_stopping = EarlyStopping(min_delta=0.001,            # minimum change that counts as an improvement
                               patience=5,                 # epochs to wait before stopping
                               restore_best_weights=True)  # monitors 'val_loss' by default

model = keras.Sequential([layers.Dense(128, activation='relu', input_shape=input_shape),
                          layers.Dense(64, activation='relu'),
                          layers.Dense(1)])
model.compile(optimizer='adam', loss='mae')

history = model.fit(X_train, y_train,
                    validation_data=(X_valid, y_valid),
                    batch_size=512,
                    epochs=50,
                    callbacks=[early_stopping])

history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
print("Minimum Validation Loss: {:0.4f}".format(history_df['val_loss'].min()))
print("The early stopping callback did stop the training once the network began overfitting. Moreover, by including restore_best_weights we still get to keep the model where the validation loss was lowest.", end="\n\n\n")
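
# Optional check, not part of the original exercise: because
# restore_best_weights=True, the weights left in `model` after fit() are those
# of the best epoch, so evaluating now should roughly match the minimum above.
restored_val_mae = model.evaluate(X_valid, y_valid, verbose=0)
print("Validation MAE of the restored model: {:0.4f}".format(restored_val_mae))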