Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import torch
- import numpy as np
- import pandas as pd
- from torch.utils.data import Dataset, DataLoader
- import torch.nn as nn
- import re
- from random import shuffle
- from time import time
- import pickle
- import wandb
- from dawgz import job, schedule
- import os
- import json
def tokenize(review):
    """Lowercase *review*, strip non-word characters, and return its word tokens."""
    cleaned = re.sub(r'[^a-zA-Z0-9_ ]', '', review.lower())
    # split(' ') yields empty strings between consecutive spaces; drop them.
    return [token for token in cleaned.split(' ') if token != '']
def load_voc():
    """Load the pickled vocabulary file ``voc`` into module-level lookup tables.

    Side effects: sets globals ``i2t`` (index -> token), ``t2i`` (token -> index),
    and the special token ids ``eos_token_id`` / ``pad_token_id`` / ``unk_token_id``.
    """
    global i2t
    global t2i
    global eos_token_id
    global pad_token_id
    global unk_token_id
    with open('voc', 'rb') as fp:
        i2t = pickle.load(fp)
    # Invert the index->token sequence into a token->index mapping.
    t2i = {token: index for index, token in enumerate(i2t)}
    eos_token_id = t2i['<eos>']
    pad_token_id = t2i['<pad>']
    unk_token_id = t2i['<unk>']
class SteamReviewsDataset(Dataset):
    """Paired (tokenized review, score) samples for a PyTorch DataLoader."""

    def __init__(self, reviews, scores):
        # reviews: sequence of per-review token-id tensors; scores: matching labels.
        self.reviews = reviews
        self.scores = scores

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, index):
        return self.reviews[index], self.scores[index]
def collate_fn(data):
    """Collate (review, score) pairs into a left-padded batch.

    Every review gets an <eos> token appended, then is left-padded with <pad>
    tokens up to the longest review in the batch (plus the <eos>). Relies on
    the module globals ``eos_token_id`` and ``pad_token_id`` set by load_voc().
    """
    reviews, scores = zip(*data)
    # +1 reserves room for the trailing <eos> token.
    target_len = max(len(review) for review in reviews) + 1

    def _pad(review):
        with_eos = torch.cat([review, torch.tensor([eos_token_id])])
        missing = target_len - len(with_eos)
        if missing == 0:
            return with_eos
        left_pad = torch.tensor([pad_token_id] * missing)
        return torch.cat([left_pad, with_eos])

    padded = [_pad(review) for review in reviews]
    return torch.stack(padded), torch.stack(scores)
def load_from_csv(N: int, file_name: str):
    """Read up to *N* rows from ``{file_name}.csv`` and encode the reviews.

    Returns a list of 1-D token-id tensors (words absent from the vocabulary
    map to ``unk_token_id``) and a float32 tensor of review scores. Relies on
    the module globals ``t2i`` / ``unk_token_id`` set by load_voc().
    """
    frame: pd.DataFrame = pd.read_csv(f'{file_name}.csv', nrows=N)
    texts: pd.Series = frame['review_text'].astype(str).apply(tokenize)
    encoded = [
        torch.tensor([t2i.get(word, unk_token_id) for word in review])
        for review in texts
    ]
    labels = torch.tensor(frame['review_score'].to_numpy()).to(torch.float32)
    del frame
    return encoded, labels
def make_data(N_total: int, N_train: int, batch_size: int, t2i: dict, unk_token_id: int, device: str = 'cpu'):
    """Build shuffled train/test DataLoaders from the positive/negative CSVs.

    Parameters
    ----------
    N_total : upper bound on samples to load (half positive, half negative).
    N_train : number of (shuffled) samples assigned to the training split;
        the remainder becomes the test split.
    batch_size : DataLoader batch size.
    t2i, unk_token_id, device : accepted for interface compatibility but not
        used here — encoding happens in load_from_csv via the module-level
        vocabulary globals.

    Returns
    -------
    (train_dl, test_dl) : DataLoaders yielding padded (reviews, scores) batches.
    """
    Xp, yp = load_from_csv(N_total // 2, 'data_positive_50to500')
    Xn, yn = load_from_csv(N_total // 2, 'data_negative_50to500')
    X = Xp + Xn
    y = torch.cat([yp, yn])
    # read_csv(nrows=N) is an upper bound: permute over the rows actually
    # loaded, not N_total, so shorter CSV files don't cause an index error.
    n_loaded = len(X)
    perm = torch.randperm(n_loaded)
    X, y = [X[i] for i in perm], y[perm]
    X_train, y_train = X[:N_train], y[:N_train]
    X_test, y_test = X[N_train:], y[N_train:]
    dataset_train = SteamReviewsDataset(X_train, y_train)
    dataset_test = SteamReviewsDataset(X_test, y_test)
    train_dl = DataLoader(dataset_train, collate_fn=collate_fn, batch_size=batch_size, shuffle=True, num_workers=0, generator=torch.Generator(y.device))
    test_dl = DataLoader(dataset_test, collate_fn=collate_fn, batch_size=batch_size, shuffle=True, num_workers=0, generator=torch.Generator(y.device))
    return train_dl, test_dl
class SentimentClassifier(nn.Module):
    """LSTM sentiment classifier over batches of token-id sequences.

    Input: integer tensor of shape (batch, seq_len) of vocabulary indices.
    Output: tensor of shape (batch,) of sigmoid probabilities.
    """

    def __init__(self,
                 emb_size=128,
                 hidden_size=256,
                 bidirectional=False,
                 mode='last',
                 voc_size=10_000):
        """
        mode: 'last', 'mean'
        """
        super().__init__()
        self.mode = mode
        # Raw embedding matrix, indexed directly in forward().
        self.embedding = nn.Parameter(torch.randn(voc_size, emb_size))
        self.lstm = nn.LSTM(
            emb_size,
            hidden_size,
            bidirectional=bidirectional,
            batch_first=True,
        )
        self.dropout = nn.Dropout(0.15)
        D = 2 if bidirectional else 1
        out_size = hidden_size * D
        self.out_mlp = nn.Sequential(
            nn.Linear(out_size, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        emb = self.embedding[x]
        output, _ = self.lstm(emb)
        if self.mode == 'mean':
            # Average the LSTM outputs over the time dimension.
            output = output.mean(dim=1)
        elif self.mode == 'last':
            # Last timestep; with left padding (see collate_fn) this is <eos>.
            output = output[:, -1, :]
        output = self.dropout(output)
        output = self.out_mlp(output)
        # Fix: squeeze(-1), not squeeze() — a bare squeeze() also collapses a
        # batch dimension of size 1, yielding a 0-d tensor that breaks BCELoss
        # against a (1,)-shaped target.
        return output.squeeze(-1)
# Hyper-parameter grid: 2 modes x 5 embedding sizes x 5 hidden sizes x
# 5 learning rates = 250 runs; the job-array index selects one entry.
CONFIGS = [
    {
        'name': f'{mode} emb={emb} h={hidden} lr={lr}',
        'emb_size': emb,
        'hidden_size': hidden,
        'lr': lr,
        'mode': mode,
    }
    for mode in ('last', 'mean')
    for emb in (32, 64, 128, 256, 512)
    for hidden in (32, 64, 128, 256, 512)
    for lr in (1e-6, 1e-5, 1e-4, 1e-3, 1e-2)
]
@job(
    # array=1,
    array=len(CONFIGS),
    partition="a5000,tesla,quadro,2080ti",
    cpus=4,
    gpus=1,
    ram="16GB",
    time="24:00:00",
    name="fctlstm",
)
def train(i: int):
    """Train and evaluate one SentimentClassifier configuration.

    Runs as one element of a Slurm job array (via dawgz): *i* indexes into
    CONFIGS. Logs train/test metrics and qualitative samples to wandb, then
    saves the run config and model weights under ``models/<run name>/``.
    """
    torch.set_default_device('cpu')
    run_config = CONFIGS[i]
    load_voc()  # populates i2t/t2i and the special token ids (module globals)
    # Train
    sc = SentimentClassifier(
        emb_size=run_config['emb_size'],
        hidden_size=run_config['hidden_size'],
        voc_size=len(i2t),
        mode=run_config['mode'],
    )
    nb_params = sum(p.numel() for p in sc.parameters())
    lr=run_config['lr']
    batch_size=64
    opt = torch.optim.Adam(sc.parameters(), lr=lr)
    loss = nn.BCELoss()
    def acc(preds, gt):
        # Overall accuracy in percent (predictions thresholded at 0.5 by round()).
        preds, gt = preds.squeeze(), gt.squeeze()
        return (torch.sum(torch.round(preds) == gt) / len(preds) * 100).item()
    def acc_neg(preds, gt):
        # Accuracy in percent restricted to negative (score 0.) samples;
        # returns 100.0 when the batch contains no negatives.
        preds, gt = preds.squeeze(), gt.squeeze()
        preds = preds[gt == 0.]
        gt = gt[gt == 0.]
        if len(preds) == 0:
            return 100.0
        return (torch.sum(torch.round(preds) == gt) / len(preds) * 100).item()
    def acc_pos(preds, gt):
        # Accuracy in percent restricted to positive (score 1.) samples;
        # returns 100.0 when the batch contains no positives.
        preds, gt = preds.squeeze(), gt.squeeze()
        preds = preds[gt == 1.]
        gt = gt[gt == 1.]
        if len(preds) == 0:
            return 100.0
        return (torch.sum(torch.round(preds) == gt) / len(preds) * 100).item()
    wandb_enabled = True
    if wandb_enabled:
        wandb.init(
            project='sentiment',
            name=run_config['name'],
            config={
                'lr': lr,
                'batch_size': batch_size,
                'nb_params': nb_params,
                'mode': sc.mode,
                'experiment-name': 'alan',
                'emb_size': sc.embedding.shape[1],
                'hidden_size': sc.lstm.hidden_size,
                'bidirectional': sc.lstm.bidirectional,
                'dropout': sc.dropout.p,
            },
            reinit=True,
        )
        # Custom step metrics so train and test curves use their own x-axes.
        wandb.define_metric("iter")
        wandb.define_metric("train/*", step_metric="iter")
        wandb.define_metric("testiter")
        wandb.define_metric("test/*", step_metric="train/step")
    print(f"Starting training with device {sc.embedding.device}.")
    print(f"Trainable parameters: {nb_params}.")
    train_data, test_data = make_data(N_total=200_000, N_train=195_000, batch_size=batch_size, t2i=t2i, unk_token_id=unk_token_id)
    step = 0
    test_interval = 1000
    for epoch in range(n_epochs := 10):
        for reviews, scores in train_data:
            start = time()
            preds = sc(reviews)
            l = loss(preds, scores)
            opt.zero_grad()
            l.backward()
            opt.step()
            log_dict = {}
            if wandb_enabled:
                log_dict.update({
                    'train/loss': l.item(),
                    'train/seq_size': reviews.shape[1],
                    'train/time': time() - start,
                    'iter': step,
                })
                if step % test_interval == 0:
                    # Periodic evaluation over the full test split.
                    with torch.no_grad():
                        sc.eval()
                        test_losses = torch.zeros(len(test_data))
                        test_accuracies = torch.zeros(len(test_data))
                        test_accuracies_pos = []
                        test_accuracies_neg = []
                        # NOTE(review): this loop variable shadows the config
                        # index ``i`` (which is no longer needed at this point).
                        for i, (reviews, scores) in enumerate(test_data):
                            preds = sc(reviews)
                            l = loss(preds, scores)
                            test_losses[i] = l
                            test_accuracies[i] = acc(preds, scores)
                            acc_negative = acc_neg(preds, scores)
                            acc_positive = acc_pos(preds, scores)
                            # NOTE(review): acc_neg/acc_pos return 100.0, never
                            # None, so both guards below always pass.
                            if acc_negative != None:
                                test_accuracies_neg.append(acc_negative)
                            if acc_positive != None:
                                test_accuracies_pos.append(acc_positive)
                        log_dict.update({
                            'test/loss': test_losses.mean().item(),
                            'test/acc': test_accuracies.mean().item(),
                            'test/acc_positive': torch.tensor(test_accuracies_pos).mean().item(),
                            'test/acc_negative': torch.tensor(test_accuracies_neg).mean().item(),
                            'testiter': step,
                        })
                        # TODO: Qualitative examples.
                        X_t, y_t = next(iter(test_data))
                        X_t, y_t = X_t[:8], y_t[:8]
                        y_hat = sc(X_t)
                        # Convert the X_t token ids back to text for the wandb table.
                        X_t_text = []
                        for x in X_t:
                            x_text = []
                            for i in x:
                                if i.item() == pad_token_id:
                                    continue
                                if i.item() == unk_token_id:
                                    x_text.append("<??>")
                                    continue
                                x_text.append(i2t[i.item()])
                            x_text = " ".join(x_text)
                            X_t_text.append(x_text)
                        y_t = y_t.tolist()
                        y_hat = y_hat.tolist()
                        test_samples = [[X_t_text[i], y_hat[i], y_t[i]] for i in range(len(X_t_text))]
                        log_dict.update({
                            "test/samples": wandb.Table(
                                data=test_samples,
                                columns=["Review", "Predicted Label", "True Label"]
                            )})
                        # Back to training mode (re-enables dropout).
                        sc.train()
                wandb.log(log_dict)
            step += 1
    if wandb_enabled:
        wandb.finish()
    # Persist this run's config and final weights.
    os.makedirs(f'models/{run_config["name"]}', exist_ok=True)
    with open(f'models/{run_config["name"]}/config.json', 'w') as fp:
        json.dump(run_config, fp)
    torch.save(sc.state_dict(), f'models/{run_config["name"]}/model.pt')
if __name__ == '__main__':
    # Submit one Slurm job per CONFIGS entry: dawgz expands the @job-decorated
    # ``train`` into a job array of len(CONFIGS) elements.
    schedule(
        train,
        backend='slurm',
        export="ALL",
        shell="/bin/sh",
        env=["export WANDB_SILENT=true"],
    )
    print(f"Scheduled {len(CONFIGS)} jobs.")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement