%%capture
!pip install transformers datasets

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# The paste never instantiates the tokenizer or the model it trains;
# "t5-small" is an assumed checkpoint, substitute the T5 variant you are fine-tuning.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

def preprocessor(examples):
    # Tokenize source sentences as encoder inputs and paraphrases as labels.
    model_inputs = tokenizer(examples["source"], max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(examples["paraphrase"], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
%%capture
dataset = load_dataset("mhdank/trainT5")

# Work with small slices of each split for a quick run.
train_data = Dataset.from_dict(dataset["train"][:100])
test_data = Dataset.from_dict(dataset["test"][:10])
val_data = Dataset.from_dict(dataset["validation"][:10])

train_data_processed = train_data.map(preprocessor, batched=True)
test_data_processed = test_data.map(preprocessor, batched=True)
val_data_processed = val_data.map(preprocessor, batched=True)
training_args = Seq2SeqTrainingArguments(
    output_dir="/t5-paraphrase",
    num_train_epochs=2,
    learning_rate=2e-5,
    optim="adamw_torch_fused",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    warmup_ratio=0.03,
    push_to_hub=False,
    report_to="none",
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data_processed,
    eval_dataset=val_data_processed,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

trainer.train()
# Ensure the trained model is on the selected device before running inference.
model.to(device)
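
# --- Inference sketch (not part of the original paste) ---
# A minimal example of generating a paraphrase with the fine-tuned model. It feeds
# the raw sentence with no task prefix, matching how preprocessor() tokenizes the
# "source" field above; the sample sentence and generation settings are arbitrary.
model.eval()
sample = "The quick brown fox jumps over the lazy dog."
inputs = tokenizer(sample, return_tensors="pt", max_length=128, truncation=True).to(device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_length=128, num_beams=4)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))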