---
license: afl-3.0
datasets:
- HuggingFaceTB/cosmopedia
metrics:
- accuracy
library_name: adapter-transformers
pipeline_tag: text-classification
tags:
- code
---

# Install the necessary libraries

The script below fine-tunes RoBERTa and XLNet on a binary text-classification dataset and compares them on accuracy, precision, recall, and F1.

```python
!pip install transformers
!pip install torch
!pip install scikit-learn  # also required: used for the split and the metrics below
```

```python
import torch
from torch.utils.data import DataLoader
from transformers import (
    RobertaTokenizer, RobertaForSequenceClassification,
    XLNetTokenizer, XLNetForSequenceClassification,
    Trainer, TrainingArguments,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Example dataset for text classification (replace with your own dataset)
texts = [...]   # List of input texts
labels = [...]  # List of corresponding labels (0 or 1 for binary classification)

# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Define the tokenizer and model for RoBERTa
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base")

# Define the tokenizer and model for XLNet
xlnet_tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
xlnet_model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased")

# Tokenize and encode the training and testing sets
train_encodings_roberta = roberta_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_roberta = roberta_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_xlnet = xlnet_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_xlnet = xlnet_tokenizer(test_texts, truncation=True, padding=True)


# Wrap the encodings and labels in a torch Dataset so the Trainer can consume them
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset_roberta = MyDataset(train_encodings_roberta, train_labels)
test_dataset_roberta = MyDataset(test_encodings_roberta, test_labels)

train_dataset_xlnet = MyDataset(train_encodings_xlnet, train_labels)
test_dataset_xlnet = MyDataset(test_encodings_xlnet, test_labels)

# Shared training configuration (output_dir is required by TrainingArguments)
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
)

# Fine-tune RoBERTa model
trainer_roberta = Trainer(
    model=roberta_model,
    args=training_args,
    train_dataset=train_dataset_roberta,
    eval_dataset=test_dataset_roberta,
)
trainer_roberta.train()

# Fine-tune XLNet model
trainer_xlnet = Trainer(
    model=xlnet_model,
    args=training_args,
    train_dataset=train_dataset_xlnet,
    eval_dataset=test_dataset_xlnet,
)
trainer_xlnet.train()


# Evaluate models: iterate over a DataLoader so the model receives batched 2-D inputs
def evaluate_model(model, test_dataset, batch_size=8):
    model.eval()
    predictions, labels = [], []
    for batch in DataLoader(test_dataset, batch_size=batch_size):
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels.extend(batch['labels'].tolist())
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        predictions.extend(torch.argmax(outputs.logits, dim=1).tolist())
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    return accuracy, precision, recall, f1


accuracy_roberta, precision_roberta, recall_roberta, f1_roberta = evaluate_model(
    roberta_model, test_dataset_roberta
)
accuracy_xlnet, precision_xlnet, recall_xlnet, f1_xlnet = evaluate_model(
    xlnet_model, test_dataset_xlnet
)

print("RoBERTa Model Evaluation:")
print(f"Accuracy: {accuracy_roberta}")
print(f"Precision: {precision_roberta}")
print(f"Recall: {recall_roberta}")
print(f"F1 Score: {f1_roberta}")

print("\nXLNet Model Evaluation:")
print(f"Accuracy: {accuracy_xlnet}")
print(f"Precision: {precision_xlnet}")
print(f"Recall: {recall_xlnet}")
print(f"F1 Score: {f1_xlnet}")
```
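Once training finishes, the fine-tuned models can be used directly for prediction. Below is a minimal inference sketch, assuming the `roberta_model` and `roberta_tokenizer` objects from the script above; the `classify` helper and the `LABELS` mapping are illustrative placeholders, not part of the original script.

```python
import torch

LABELS = {0: "class_0", 1: "class_1"}  # hypothetical label mapping for the binary task

def classify(text, model, tokenizer):
    """Return the predicted label for a single input string."""
    model.eval()
    enc = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(model.device)
    with torch.no_grad():
        logits = model(**enc).logits
    return LABELS[int(torch.argmax(logits, dim=1))]

# Example usage with the fine-tuned RoBERTa model
print(classify("An example sentence to classify.", roberta_model, roberta_tokenizer))
```

The same helper works for the XLNet model by passing `xlnet_model` and `xlnet_tokenizer` instead.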