# -*- coding: utf-8 -*-
"""Untitled6.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/11megvyfcr49Oy4FGK7kteQ2iMdxZYp4L
"""
# Install dependencies (shell command in a Colab cell)
!pip install transformers datasets sentence-transformers evaluate scikit-learn
from google.colab import files
uploaded = files.upload()  # upload both train.csv and eval.csv in one go
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict  # Dataset is needed by load_csv_datasets below
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import torch
import numpy as np
def load_csv_datasets(train_path, eval_path):
    # Read the two CSVs and wrap them in a DatasetDict with 'train' and 'eval' splits.
    train_df = pd.read_csv(train_path)
    eval_df = pd.read_csv(eval_path)
    dataset = DatasetDict({
        'train': Dataset.from_pandas(train_df),
        'eval': Dataset.from_pandas(eval_df)
    })
    return dataset
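# A minimal sketch of what load_csv_datasets expects. The column names and the example row
# are assumptions inferred from preprocess(), which reads 'input' and 'target'; the real
# files are whatever was uploaded above (kept commented out so nothing gets overwritten).
# pd.DataFrame({
#     "input": ["generate question: Paris is the capital of <hl> France <hl>."],
#     "target": ["Which country has Paris as its capital?"]
# }).to_csv("train.csv", index=False)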
def preprocess(example):
    input_text = example['input']
    target_text = example['target']
    model_inputs = tokenizer(input_text, max_length=512, padding='max_length', truncation=True)
    labels = tokenizer(target_text, max_length=64, padding='max_length', truncation=True)
    # Mask padding positions in the labels with -100 so they are ignored by the loss
    # (compute_metrics below maps -100 back to the pad token before decoding).
    model_inputs["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs
# Load the pretrained question-generation checkpoint and its tokenizer.
model_name = "valhalla/t5-base-qg-hl"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

dataset = load_csv_datasets("train.csv", "eval.csv")
tokenized_dataset = dataset.map(preprocess, batched=True)

# Batches examples and pads inputs/labels as needed.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
training_args = TrainingArguments(
    output_dir="./qg_finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="cosine",  # matches the key returned by compute_metrics below
    greater_is_better=True
)
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # With a plain Trainer the predictions are logits (batch, seq_len, vocab_size),
    # so take the argmax over the vocabulary to get token ids.
    if predictions.ndim == 3:
        predictions = np.argmax(predictions, axis=-1)
    # Replace -100 in labels, as those positions can't be decoded.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Drop any token ids outside the tokenizer vocabulary before decoding.
    valid_vocab_size = tokenizer.vocab_size
    filtered_predictions = [
        [token_id for token_id in pred_seq if 0 <= token_id < valid_vocab_size]
        for pred_seq in predictions.tolist()
    ]
    decoded_preds = tokenizer.batch_decode(filtered_predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Sentence-embedding cosine similarity between predicted and reference questions.
    # Note: the embedder is reloaded on every evaluation; it could be created once at module level.
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings_pred = embedder.encode(decoded_preds, convert_to_tensor=True)
    embeddings_label = embedder.encode(decoded_labels, convert_to_tensor=True)
    cosine_scores = util.cos_sim(embeddings_pred, embeddings_label).diagonal()
    avg_cosine = cosine_scores.mean().item()
    return {"cosine": avg_cosine}
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

results = trainer.evaluate()
print("Evaluation Results:", results)