Spaces:
No application file
No application file
Commit
•
6e53961
1
Parent(s):
a005360
Upload text_class.py
Browse files- text_class.py +88 -0
text_class.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
!pip install datasets
|
2 |
+
pip install transformers[torch]
|
3 |
+
|
4 |
+
from datasets import load_dataset, load_metric

# WikiQA: question/answer pairs with a binary 'label' (1 = answer is correct).
raw_datasets = load_dataset("wiki_qa")

# Repurpose the original test split: 67% of it becomes the new test set and
# the remaining 33% replaces the validation split (seeded for reproducibility).
dataset = raw_datasets['test'].train_test_split(train_size=0.67, seed=42)
raw_datasets["validation"] = dataset.pop("test")
raw_datasets['test'] = dataset['train']

print(raw_datasets)

# Temporarily view the splits as pandas to inspect the label distribution.
# Fixed: the original first literal was 'n\n\n\n' (missing backslash), which
# printed a stray 'n' instead of a newline.
raw_datasets.set_format('pandas')
print('\n\n\n\ntraining_labels:\n', raw_datasets['train']['label'].value_counts(), '\n\n',
      'validation_labels:\n', raw_datasets['validation']['label'].value_counts(), '\n\n',
      'testing_labels:\n', raw_datasets['test']['label'].value_counts())
raw_datasets.reset_format()
|
20 |
+
|
21 |
+
from transformers import GPT2Config, GPT2ForSequenceClassification, GPT2Tokenizer

# Tokenizer and configuration for the pretrained GPT-2 checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
config = GPT2Config.from_pretrained("gpt2")

# Binary relevance classification. GPT-2 ships without a pad token, so the
# EOS token doubles as padding on both the config and the tokenizer.
config.num_labels = 2
config.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

# Pretrained GPT-2 backbone with a freshly initialized classification head.
model = GPT2ForSequenceClassification.from_pretrained("gpt2", config=config)
|
38 |
+
|
39 |
+
def tokenize_function(examples):
    """Tokenize question/answer pairs for GPT-2 sequence classification.

    Fix: the original tokenized question and answer separately and stored the
    answer under 'answer_input_ids'/'answer_attention_mask' — keys that
    GPT2ForSequenceClassification never reads — so the classifier only ever
    saw the question. Encoding the pair in a single call puts both texts into
    'input_ids'. The redundant return_tensors='pt' is also dropped: inside a
    batched datasets.map, lists are the expected output.

    Args:
        examples: batched dataset slice with 'question' and 'answer' lists.

    Returns:
        dict with 'input_ids' and 'attention_mask' covering question + answer.
    """
    # GPT-2 adds no special separator tokens, so the pair is concatenated;
    # fixed-length padding/truncation keeps batches rectangular (800 < GPT-2's
    # 1024-token context window).
    return tokenizer(
        examples['question'],
        examples['answer'],
        padding='max_length',
        truncation=True,
        max_length=800,
    )


# Tokenize the train, validation, and test splits in batches.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
|
56 |
+
|
57 |
+
|
58 |
+
from transformers import Trainer, TrainingArguments


# Fine-tuning hyper-parameters. Evaluation, checkpointing, and logging all
# fire every 200 optimizer steps; at most two checkpoints are kept on disk.
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    logging_steps=200,
    fp16=True,  # mixed precision — requires a CUDA-capable GPU
)
|
74 |
+
|
75 |
+
# Trainer
|
76 |
+
trainer = Trainer(
|
77 |
+
model=model,
|
78 |
+
args=training_args,
|
79 |
+
train_dataset=tokenized_datasets['train'],
|
80 |
+
eval_dataset=tokenized_datasets['validation'],
|
81 |
+
)
|
82 |
+
|
83 |
+
# Train the model
|
84 |
+
trainer.train()
|
85 |
+
|
86 |
+
# Evaluate on the test dataset
|
87 |
+
results = trainer.evaluate(tokenized_datasets['test'])
|
88 |
+
print(results)
|