Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import torch
- import torch.nn as nn
- import numpy as np
- import pandas as pd
- from sklearn.preprocessing import LabelEncoder
- from sklearn.model_selection import train_test_split
- from sklearn.preprocessing import MinMaxScaler
- # Generator Model
class Generator(nn.Module):
    """Fully connected generator network.

    Maps an ``input_dim``-sized noise vector to an ``output_dim``-sized
    sample through two hidden ReLU layers (128 and 256 units). The final
    Tanh squashes every output feature into [-1, 1].
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        layers = []
        width_in = input_dim
        # Hidden stack: Linear -> ReLU for each hidden width, in order.
        for width_out in (128, 256):
            layers.append(nn.Linear(width_in, width_out))
            layers.append(nn.ReLU())
            width_in = width_out
        # Output projection followed by Tanh (bounded output).
        layers.append(nn.Linear(width_in, output_dim))
        layers.append(nn.Tanh())
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` (a batch of noise vectors) through the network."""
        generated = self.model(x)
        return generated
- # Discriminator Model
class Discriminator(nn.Module):
    """Fully connected discriminator network.

    Takes an ``input_dim``-sized sample and passes it through two hidden
    LeakyReLU(0.2) layers (256 and 128 units). A final Sigmoid yields a
    single value in [0, 1] per sample, used as a probability.
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = []
        width_in = input_dim
        # Hidden stack: Linear -> LeakyReLU(0.2) for each hidden width.
        for width_out in (256, 128):
            layers.append(nn.Linear(width_in, width_out))
            layers.append(nn.LeakyReLU(0.2))
            width_in = width_out
        # Single-logit head turned into a probability by Sigmoid.
        layers.append(nn.Linear(width_in, 1))
        layers.append(nn.Sigmoid())
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the per-sample probability produced for batch ``x``."""
        probability = self.model(x)
        return probability
- # GAN-based resampling method
def gan_resample(X_train, y_train, num_epochs=5000, batch_size=64):
    """Oversample the rarest class with rows produced by a small GAN.

    A ``Generator``/``Discriminator`` pair is trained on the rows of the
    least frequent class. Enough synthetic rows are then generated to bring
    that class up to the size of the most frequent class, and appended to
    the original data.

    Args:
        X_train: 2-D array-like of numeric features, one row per sample.
        y_train: 1-D array-like of class labels aligned with ``X_train``.
        num_epochs: Number of GAN training iterations.
        batch_size: Number of real and fake samples per training iteration.

    Returns:
        Tuple ``(X_balanced, y_balanced)``: the original rows followed by
        the synthetic minority rows, and the matching labels. When the data
        is empty or the classes are already the same size, copies of the
        inputs are returned and no training takes place.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    # np.unique only reports labels that actually occur. np.bincount would
    # also count absent label values as zero (so argmin could select a class
    # with no rows) and rejects float or negative labels.
    classes, counts = np.unique(y_train, return_counts=True)
    if classes.size == 0:
        return X_train.copy(), y_train.copy()

    minority_class = classes[np.argmin(counts)]
    num_samples_needed = int(counts.max() - counts.min())
    if num_samples_needed == 0:
        # Already balanced (or a single class): nothing to generate, so
        # skip the training loop entirely.
        return X_train.copy(), y_train.copy()

    X_minority = X_train[y_train == minority_class].astype(np.float64)

    # The Generator ends in Tanh, so it can only emit values in [-1, 1].
    # Train on minority features rescaled to that range and map the
    # generated rows back afterwards, so synthetic samples live in the same
    # per-feature range as the real minority rows.
    feature_min = X_minority.min(axis=0)
    feature_span = X_minority.max(axis=0) - feature_min
    # Constant features have zero span; divide by 1 there to avoid 0/0.
    safe_span = np.where(feature_span > 0, feature_span, 1.0)
    scaled_minority = 2.0 * (X_minority - feature_min) / safe_span - 1.0
    minority_tensor = torch.as_tensor(scaled_minority, dtype=torch.float32)
    num_minority = minority_tensor.shape[0]

    # GAN dimensions
    input_dim = minority_tensor.shape[1]
    latent_dim = 32  # Size of the random noise vector
    output_dim = input_dim

    # Initialize Generator and Discriminator
    generator = Generator(latent_dim, output_dim)
    discriminator = Discriminator(input_dim)

    # Loss and optimizers
    criterion = nn.BCELoss()
    g_optimizer = torch.optim.Adam(generator.parameters(), lr=0.0002)
    d_optimizer = torch.optim.Adam(discriminator.parameters(), lr=0.0002)

    # Target labels never change, so build them once.
    real_labels = torch.ones(batch_size, 1)
    fake_labels = torch.zeros(batch_size, 1)

    # Training loop for GAN
    for epoch in range(num_epochs):
        # Draw a batch of real minority rows (with replacement).
        batch_idx = np.random.randint(0, num_minority, batch_size)
        real_samples = minority_tensor[torch.as_tensor(batch_idx, dtype=torch.long)]

        # Train Discriminator: real rows -> 1, generated rows -> 0.
        d_optimizer.zero_grad()
        z = torch.randn(batch_size, latent_dim)
        real_loss = criterion(discriminator(real_samples), real_labels)
        # detach() keeps discriminator gradients out of the generator.
        fake_samples = generator(z).detach()
        fake_loss = criterion(discriminator(fake_samples), fake_labels)
        d_loss = real_loss + fake_loss
        d_loss.backward()
        d_optimizer.step()

        # Train Generator: try to make the discriminator answer "real".
        g_optimizer.zero_grad()
        z = torch.randn(batch_size, latent_dim)
        generated_samples = generator(z)
        g_loss = criterion(discriminator(generated_samples), real_labels)
        g_loss.backward()
        g_optimizer.step()

        if epoch % 1000 == 0:
            print(f"Epoch {epoch}/{num_epochs}, d_loss: {d_loss.item()}, g_loss: {g_loss.item()}")

    # Generate new synthetic samples from the trained Generator
    with torch.no_grad():
        z = torch.randn(num_samples_needed, latent_dim)
        generated = generator(z).numpy().astype(np.float64)
    # Undo the [-1, 1] scaling. Multiplying by the true span (zero for
    # constant features) pins constant features to their original value.
    synthetic_samples = (generated + 1.0) / 2.0 * feature_span + feature_min

    # Concatenate original and synthetic samples
    X_balanced = np.vstack([X_train, synthetic_samples])
    y_balanced = np.hstack([y_train, np.full(num_samples_needed, minority_class)])
    return X_balanced, y_balanced
- # Preprocessing function modified for GAN resampling
def preprocess_data_with_gan(input_file, dataset_percent, random_state=None):
    """Load a CSV, encode and scale it, subsample it and balance the classes.

    The last column of the CSV is used as the target; all other columns are
    features. Missing numeric values are replaced by the column median,
    object-typed columns are label-encoded, and features are MinMax-scaled.
    A stratified subset of ``dataset_percent`` percent of the rows is then
    passed to ``gan_resample``.

    Args:
        input_file: Path (or file-like object) accepted by ``pd.read_csv``.
        dataset_percent: Percentage of rows to keep, in the range (0, 100].
        random_state: Optional seed forwarded to ``train_test_split`` so the
            subsample can be reproduced. Defaults to ``None`` (unseeded).

    Returns:
        Tuple ``(X_balanced, y_balanced)`` as returned by ``gan_resample``.

    Raises:
        ValueError: If ``dataset_percent`` is not within (0, 100].
    """
    # Validate before doing any I/O so a bad argument fails fast.
    if not 0 < dataset_percent <= 100:
        raise ValueError(
            f"dataset_percent must be in the range (0, 100], got {dataset_percent!r}"
        )

    data = pd.read_csv(input_file)

    # Handle missing values by replacing them with the column median.
    # numeric_only=True is required on pandas >= 2.0, where median() raises
    # on frames that still contain object columns.
    # NOTE(review): missing values in object columns are left as-is and
    # reach LabelEncoder unchanged; confirm that is acceptable for the data.
    data = data.fillna(data.median(numeric_only=True))

    # Convert categorical data
    for col in data.select_dtypes(include=['object']).columns:
        data[col] = LabelEncoder().fit_transform(data[col])

    # Separate features and target
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values

    # MinMax scaling
    X = MinMaxScaler().fit_transform(X)

    if dataset_percent < 100:
        # Keep a stratified subset; the remainder is discarded.
        X_train, _, y_train, _ = train_test_split(
            X,
            y,
            train_size=dataset_percent / 100,
            stratify=y,
            random_state=random_state,
        )
    else:
        # train_test_split rejects train_size=1.0, so use every row directly.
        X_train, y_train = X, y

    # Balance the dataset using the GAN-based resampling method
    return gan_resample(X_train, y_train)
- # Usage example
# Example run: read the input dataset, keep 10% of its rows and balance
# the classes of that subset.
input_file, dataset_percent = 'input.csv', 10
X_balanced, y_balanced = preprocess_data_with_gan(
    input_file,
    dataset_percent,
)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement