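# --- Context sketch (assumed; these definitions are not in the paste itself) ---
# The cycle() method below relies on a device `d`, a mixed-precision GradScaler
# `scaler`, and a model class `Agent` that are defined elsewhere in the original
# module. The placeholders here only make the snippet self-contained; the layer
# sizes in Agent are illustrative, not the real architecture.
import copy
import gc

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm

d = torch.device("cuda" if torch.cuda.is_available() else "cpu")
scaler = torch.cuda.amp.GradScaler(enabled=(d.type == "cuda"))


class Agent(nn.Module):
    # Placeholder evaluation network: two linear layers and a scalar output head.
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(64, 256), nn.ReLU(), nn.Linear(256, 1))

    def forward(self, x):
        return self.net(x)
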
# In the original project cycle() is written as a class method; `self` is unused below.
def cycle(self, X_train, y_train, X_val, y_val, best_score, l1_lambda=0.001, l2_lambda=0.001):
    model = Agent().to(d)
    X_train, y_train, X_val, y_val = (
        X_train.to(d), y_train.to(d), X_val.to(d), y_val.to(d)
    )

    # Weight initialization: resume from saved weights if present,
    # otherwise Xavier-initialize the linear layers.
    try:
        weights_path = "./zlv7_full.pt"
        state_dict = torch.load(weights_path, map_location=d)
        model.load_state_dict(state_dict)
    except FileNotFoundError:
        for m in model.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    # Loss function and optimizer
    loss_fn = nn.MSELoss()  # mean squared error
    # loss_fn2 = nn.HuberLoss()
    # loss_fn3 =
    # AdamW's weight_decay adds its own decoupled L2 penalty; set it to 0 to rely
    # only on the explicit L2 term computed below.
    optimizer = optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.003)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.98, patience=3, verbose=True
    )

    n_epochs = 300
    batch_size = 8192  # size of each batch
    batch_start = torch.arange(0, len(X_train), batch_size)

    # Hold the best model
    best_mse = np.inf  # initialise as infinity so the first epoch always improves on it
    best_weights = None
    history = []
    accumulation_steps = 2  # accumulate gradients over 2 batches (GPU path only)

    for _ in tqdm.tqdm(range(n_epochs), desc="Epochs"):
        model.train()
        epoch_loss = 0.0
        for i, batch_idx in enumerate(batch_start):
            batch_X = X_train[batch_idx: batch_idx + batch_size].to(dtype=torch.float32)
            batch_y = y_train[batch_idx: batch_idx + batch_size].to(dtype=torch.float32)

            y_pred = model(batch_X)
            loss = loss_fn(y_pred, batch_y.view(-1, 1))

            # L1 regularization: sum of L1 norms of the weight matrices
            l1_reg = torch.tensor(0.).to(d)
            for name, param in model.named_parameters():
                if 'weight' in name:
                    l1_reg += torch.norm(param, 1)
            loss += l1_lambda * l1_reg

            # L2 regularization: sum of L2 norms of the weight matrices
            l2_reg = torch.tensor(0.).to(d)
            for name, param in model.named_parameters():
                if 'weight' in name:
                    l2_reg += torch.norm(param, 2)
            loss += l2_lambda * l2_reg

            if d == torch.device("cuda"):
                # GPU path: scale the loss and accumulate gradients over several
                # batches; gradients are only zeroed after an optimizer step so
                # the accumulation actually takes effect.
                scaler.scale(loss).backward()
                if (i + 1) % accumulation_steps == 0 or (i + 1) == len(batch_start):
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
            else:
                # CPU path: plain backward pass and optimizer step every batch.
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

            epoch_loss += loss.item() * batch_X.shape[0]

        epoch_loss /= len(X_train)
        scheduler.step(epoch_loss)
        history.append(epoch_loss)

        if epoch_loss < best_mse:
            best_mse = epoch_loss
            print("MSE: %.2f" % best_mse)
            print("RMSE: %.2f" % np.sqrt(best_mse))
            plt.plot(history)
            plt.title("Epoch loss for ZL")
            plt.xlabel("Number of Epochs")
            plt.ylabel("Epoch Loss")
            plt.draw()
            plt.savefig("ai-eval-losses.jpg")
            best_weights = copy.deepcopy(model.state_dict())
            torch.save(best_weights, "zlv7_full.pt")

    # Save again if this run finished better than the score passed in by the caller.
    if best_score > epoch_loss:
        best_weights = copy.deepcopy(model.state_dict())
        torch.save(best_weights, "zlv7_full.pt")

    # Free GPU memory and the large training tensors before returning.
    if d == torch.device("cuda"):
        torch.cuda.empty_cache()
    del X_train, X_val, y_train, y_val
    gc.enable()
    gc.collect()
    gc.disable()
    return epoch_loss
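
# Usage sketch (hypothetical): train on random tensors just to exercise the loop.
# The feature width (64) matches the placeholder Agent above, not the real encoding,
# and None is passed for the unused `self` parameter.
if __name__ == "__main__":
    X = torch.randn(20_000, 64)   # fake positions, 64 features each
    y = torch.randn(20_000)       # fake evaluation targets
    X_tr, y_tr = X[:18_000], y[:18_000]
    X_va, y_va = X[18_000:], y[18_000:]
    final_loss = cycle(None, X_tr, y_tr, X_va, y_va, best_score=np.inf)
    print("final epoch loss: %.4f" % final_loss)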