# Training a transformer on the sequence of the 128 most recent clusters
# to predict a set of 20 clusters.
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
import math
import gc

str_device = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(str_device)

input_path = './input'
PATH_PL = 'sber-tran-9.ckpt'
class params:
    n_classes = 8_000  # total number of cluster ids
    emsize = 300       # embedding dimension
    nhead = 6          # number of heads in the multi-head attention layers
    nhid = 200         # dimension of the feedforward network in nn.TransformerEncoder
    nlayers = 2        # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    dropout = 0.1      # dropout value
    batch_size = 1_024
    seq_size = 128
    lr = 0.001         # 1e-3
    test_size = 0.1
    random_state = 12345
    epochs = 6
train_data = pd.read_parquet(f'{input_path}/train.parquet')

# Per-cluster weights; clusters missing from the weights table default to 0.1.
cluster_weights = pd.read_parquet(f'{input_path}/cluster-weights.parquet')
cluster_weights = cluster_weights.sort_values(by='cluster_id', ascending=True)
w_cluster = cluster_weights['w'].values
print(w_cluster.shape, w_cluster)
idx_cluster = cluster_weights['cluster_id'].values
print(idx_cluster.shape, idx_cluster)

w = np.zeros(params.n_classes) + 0.1
w[idx_cluster] = w_cluster
print(w[:10], w.shape)
# Pivot cluster histories into one column per month (m_6 ... m_9).
train_data['month'] = train_data['completed_at'].dt.month
df_m = train_data.groupby(['id', 'month'])['cluster_id'].apply(list)
df_month_h = df_m.unstack(fill_value='').reset_index()
df_month_h.columns = ['id', 'm_6', 'm_7', 'm_8', 'm_9']
del train_data
gc.collect()
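# Quick look at the pivoted frame (a sanity check, not part of the pipeline;
# assumes the training data covers months 6-9 as the column names suggest):
print(df_month_h.head())
print(df_month_h.shape)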
def add_prev(r):
    """Collect unique clusters from previous months until seq_size is reached."""
    a = r.prev
    if isinstance(a, str):  # empty-string fill means no history for that month
        a = []
    a = list(set(a))
    if len(a) >= params.seq_size:
        return a[:params.seq_size]
    # Walk back through earlier months (month-2 down to 6) and merge their clusters.
    for i in range(r.month - 2, 5, -1):
        clusters = r[f'm_{i}']
        if isinstance(clusters, str):  # empty-string fill, skip
            continue
        if len(clusters) > 0:
            a = list(set(a).union(set(clusters)))
            if len(a) >= params.seq_size:
                return a[:params.seq_size]
    return a[:params.seq_size]
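# Toy illustration of add_prev (hypothetical values, not from the data; the
# set order of the result is unspecified, so it is sorted for display):
_toy = pd.Series({'prev': [1, 2], 'month': 8,
                  'm_6': [3, 4], 'm_7': [1, 2], 'm_8': [5], 'm_9': ''})
print(sorted(add_prev(_toy)))  # -> [1, 2, 3, 4]: m_7 history extended with m_6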
def get_train_df():
    df = None
    for i in range(7, 10):
        print('month', i)
        # Rows with activity in month i: the previous months form the input,
        # month i is the target.
        df_temp = df_month_h[df_month_h[f'm_{i}'] != ''].copy()
        df_temp['prev'] = df_temp[f'm_{i-1}']
        df_temp['target'] = df_temp[f'm_{i}']
        df_temp['month'] = i
        if i > 7:
            df_temp['prev'] = df_temp.apply(add_prev, axis=1)
        else:
            df_temp['prev'] = df_temp['prev'].apply(lambda x: x if not isinstance(x, str) else [])
        if df is None:
            df = df_temp
        else:
            df = pd.concat([df, df_temp])
        del df_temp
        gc.collect()
    return df[['id', 'prev', 'target']]
df = get_train_df()

df_train, df_val = train_test_split(
    df,
    test_size=params.test_size,
    random_state=params.random_state,
)
print(df_train.shape, df_val.shape)

del df_month_h
del df
gc.collect()
class MyDummyDataset(Dataset):
    def __init__(self, df):
        self.data = df

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prev = self.data.iloc[idx]['prev']
        target_s = set(self.data.iloc[idx]['target'])
        # Input: up to seq_size previous clusters, padded with the service index.
        x_l = prev[:params.seq_size]
        if len(x_l) < params.seq_size:
            x_l = x_l + [params.n_classes] * (params.seq_size - len(x_l))
        # Target: clusters that are new this month...
        diff_s = target_s - set(prev)
        ost_s = target_s - diff_s  # clusters repeated from prev
        target_idx = list(diff_s)
        if len(diff_s) < 20:
            # ...padded up to 20 with the highest-weight repeated clusters.
            ost_l = list(ost_s)
            if len(ost_l) > 0:
                df_w = pd.DataFrame({'c': ost_l, 'w': w[ost_l]})
                df_w = df_w.sort_values(by='w', ascending=False)
                target_idx = list(diff_s) + list(df_w.head(20 - len(diff_s))['c'].values)
        # float32 target for BCE-with-logits, long indices for nn.Embedding.
        Y = np.zeros(params.n_classes, dtype=np.float32)
        Y[target_idx] = 1.0
        return torch.tensor(x_l).long(), Y
train = MyDummyDataset(df_train)
train = DataLoader(train, batch_size=params.batch_size, num_workers=0)
val = MyDummyDataset(df_val)
val = DataLoader(val, batch_size=params.batch_size, num_workers=0)
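# Draw one batch to verify tensor shapes and dtypes (a sanity check; the exact
# batch dimension assumes df_train holds at least batch_size rows):
xb, yb = next(iter(train))
print(xb.shape, xb.dtype)  # torch.Size([1024, 128]) torch.int64
print(yb.shape, yb.dtype)  # torch.Size([1024, 8000]) torch.float32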
class MyTransformer(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.src_mask = None
        encoder_layers = nn.TransformerEncoderLayer(d_model=params.emsize,
                                                    nhead=params.nhead,
                                                    dim_feedforward=params.nhid,
                                                    dropout=params.dropout,
                                                    # batch_first=True,
                                                    )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer=encoder_layers,
                                                         num_layers=params.nlayers,
                                                         )
        # +1 embedding row for the service (padding) index 8_000.
        self.encoder = nn.Embedding(params.n_classes + 1, params.emsize)
        self.emsize = params.emsize
        # Average over the sequence dimension: (batch, seq, emb) -> (batch, 1, emb).
        self.pool = nn.AvgPool2d((params.seq_size, 1))
        self.decoder = nn.Linear(params.emsize, params.n_classes)
        self.w_1 = w
        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        # The input is an unordered set of clusters, so no causal masking is
        # applied: an all-zeros mask lets every position attend to every other.
        mask = torch.zeros(sz, sz)
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)
    def forward(self, x):
        # (Re)build the attention mask when the sequence length or device changes.
        if (self.src_mask is None
                or self.src_mask.size(0) != x.size(1)
                or self.src_mask.device != x.device):
            self.src_mask = self._generate_square_subsequent_mask(x.size(1)).to(x.device)
        src = self.encoder(x) * math.sqrt(self.emsize)
        src = src.permute(1, 0, 2)  # (batch, seq, emb) -> (seq, batch, emb)
        output = self.transformer_encoder(src, self.src_mask)
        output = output.permute(1, 0, 2)  # back to (batch, seq, emb)
        output = self.pool(output)        # (batch, 1, emb)
        x_hat = self.decoder(output)      # (batch, 1, n_classes)
        return x_hat
    def training_step(self, batch, batch_idx):
        x, y = batch
        x_hat = self(x)
        w_all = self.get_w()
        loss = F.binary_cross_entropy_with_logits(x_hat.view(-1, params.n_classes), y, weight=w_all)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        x_hat = self(x)
        w_all = self.get_w()
        loss = F.binary_cross_entropy_with_logits(x_hat.view(-1, params.n_classes), y, weight=w_all)
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def get_w(self):
        # Per-class weights for the BCE loss, moved to the module's device.
        w_all = torch.tensor(self.w_1).float().to(self.device)
        return w_all

    def test_step(self, batch, batch_idx):
        return self.validation_step(batch, batch_idx)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=params.lr)
        return optimizer
model = MyTransformer()
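# Optional dry run on random indices before training (a sketch; the input
# values are arbitrary, only the shapes matter):
with torch.no_grad():
    dummy = torch.randint(0, params.n_classes, (2, params.seq_size))
    print(model(dummy).shape)  # torch.Size([2, 1, 8000])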
trainer = pl.Trainer(gpus=1 if str_device == "cuda" else 0,
                     max_epochs=params.epochs,
                     progress_bar_refresh_rate=20)

# Train the model and save the checkpoint.
trainer.fit(model, train, val)
trainer.save_checkpoint(PATH_PL)
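# Inference sketch: reload the checkpoint and predict 20 clusters for one user.
# A minimal example under stated assumptions: the first validation row stands
# in for new data, and its 'prev' column is preprocessed exactly as in training.
model = MyTransformer.load_from_checkpoint(PATH_PL)
model.eval().to(device)
prev_clusters = list(df_val.iloc[0]['prev'])[:params.seq_size]
pad = [params.n_classes] * (params.seq_size - len(prev_clusters))
x = torch.tensor([prev_clusters + pad], dtype=torch.long, device=device)
with torch.no_grad():
    logits = model(x).view(-1, params.n_classes)
top20 = torch.topk(logits, k=20, dim=1).indices  # predicted set of 20 clusters
print(top20)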