---
license: afl-3.0
datasets:
- HuggingFaceTB/cosmopedia
metrics:
- accuracy
library_name: adapter-transformers
pipeline_tag: text-classification
tags:
- code
---
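The script below fine-tunes two pretrained transformers, RoBERTa (`roberta-base`) and XLNet (`xlnet-base-cased`), on the same binary text-classification data and compares them on accuracy, precision, recall, and F1.
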
```python
# Install the necessary libraries (scikit-learn is also needed for the metrics below)
!pip install transformers torch scikit-learn

import torch
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, XLNetTokenizer, XLNetForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Example dataset for text classification (replace with your own dataset)
texts = [...]  # List of input texts
labels = [...]  # List of corresponding labels (0 or 1 for binary classification)

# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Define the tokenizer and model for RoBERTa
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base")

# Define the tokenizer and model for XLNet
xlnet_tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
xlnet_model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased")

# Tokenize and encode the training and testing sets
train_encodings_roberta = roberta_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_roberta = roberta_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_xlnet = xlnet_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_xlnet = xlnet_tokenizer(test_texts, truncation=True, padding=True)

class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset_roberta = MyDataset(train_encodings_roberta, train_labels)
test_dataset_roberta = MyDataset(test_encodings_roberta, test_labels)

train_dataset_xlnet = MyDataset(train_encodings_xlnet, train_labels)
test_dataset_xlnet = MyDataset(test_encodings_xlnet, test_labels)

# Fine-tune RoBERTa model
training_args = TrainingArguments(
    output_dir='./results',          # required: where checkpoints are written
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
)

trainer_roberta = Trainer(
    model=roberta_model,
    args=training_args,
    train_dataset=train_dataset_roberta,
    eval_dataset=test_dataset_roberta,
)

trainer_roberta.train()

# Fine-tune XLNet model
trainer_xlnet = Trainer(
    model=xlnet_model,
    args=training_args,
    train_dataset=train_dataset_xlnet,
    eval_dataset=test_dataset_xlnet,
)

trainer_xlnet.train()
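
# Optional: persist the fine-tuned weights and tokenizers so they can be reloaded
# later via from_pretrained(). The output directory names here are illustrative,
# not part of the original script.
trainer_roberta.save_model('./roberta-finetuned')
roberta_tokenizer.save_pretrained('./roberta-finetuned')
trainer_xlnet.save_model('./xlnet-finetuned')
xlnet_tokenizer.save_pretrained('./xlnet-finetuned')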

# Evaluate models. Iterate with a DataLoader so the model receives batched
# (2-D) inputs; single dataset items lack the batch dimension the model expects.
# Stacking works here because each split was padded to a uniform length above.
def evaluate_model(model, test_dataset, batch_size=8):
    model.eval()
    loader = DataLoader(test_dataset, batch_size=batch_size)
    predictions = []
    labels = []
    for batch in loader:
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels.extend(batch['labels'].tolist())
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        predictions.extend(torch.argmax(outputs.logits, dim=-1).tolist())
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    return accuracy, precision, recall, f1

accuracy_roberta, precision_roberta, recall_roberta, f1_roberta = evaluate_model(roberta_model, test_dataset_roberta)
accuracy_xlnet, precision_xlnet, recall_xlnet, f1_xlnet = evaluate_model(xlnet_model, test_dataset_xlnet)

print("RoBERTa Model Evaluation:")
print(f"Accuracy: {accuracy_roberta}")
print(f"Precision: {precision_roberta}")
print(f"Recall: {recall_roberta}")
print(f"F1 Score: {f1_roberta}")

print("\nXLNet Model Evaluation:")
print(f"Accuracy: {accuracy_xlnet}")
print(f"Precision: {precision_xlnet}")
print(f"Recall: {recall_xlnet}")
print(f"F1 Score: {f1_xlnet}")