!pip install datasets
!pip install transformers[torch]
from datasets import load_dataset

# wiki_qa ships with train/validation/test splits; here the stock test split
# is re-split into new validation and test sets.
raw_datasets = load_dataset("wiki_qa")
dataset = raw_datasets['test'].train_test_split(train_size=0.67, seed=42)
raw_datasets['validation'] = dataset.pop('test')
raw_datasets['test'] = dataset['train']
print(raw_datasets)
# Inspect the label distribution of each split (wiki_qa is heavily imbalanced).
raw_datasets.set_format('pandas')
print('\n\n\ntraining_labels:\n', raw_datasets['train']['label'].value_counts(), '\n\n',
      'validation_labels:\n', raw_datasets['validation']['label'].value_counts(), '\n\n',
      'testing_labels:\n', raw_datasets['test']['label'].value_counts())
raw_datasets.reset_format()
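# Optional rebalancing sketch (not in the original script): wiki_qa is heavily
# skewed toward label 0, so one option is to downsample negatives before
# fine-tuning. The 0.1 keep ratio below is an arbitrary assumption.
import random
random.seed(42)
balanced_train = raw_datasets['train'].filter(
    lambda ex: ex['label'] == 1 or random.random() < 0.1
)
print(balanced_train)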
from transformers import GPT2Config, GPT2ForSequenceClassification, GPT2Tokenizer

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load the GPT-2 configuration
config = GPT2Config.from_pretrained("gpt2")

# Modify the configuration for sequence classification
config.num_labels = 2  # number of classes for the classification task
config.pad_token_id = tokenizer.eos_token_id  # GPT-2 has no pad token, so reuse EOS

# Initialize the GPT-2 model with a sequence-classification head
model = GPT2ForSequenceClassification.from_pretrained("gpt2", config=config)
tokenizer.pad_token = tokenizer.eos_token
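# Quick sanity check (not in the original): GPT2ForSequenceClassification keeps
# its classification head in the `score` attribute, and padding must be wired
# to the EOS token because the base GPT-2 model has no pad token.
print(model.score)                                          # Linear(768 -> 2)
print(model.config.pad_token_id == tokenizer.eos_token_id)  # True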
def tokenize_function(examples):
    # Encode each question/answer pair as a single sequence so the model sees
    # both texts; GPT-2 has no separator token, so the two are concatenated.
    return tokenizer(
        examples['question'],
        examples['answer'],
        padding='max_length',
        truncation=True,
        max_length=800,
    )

# Tokenize the train, validation, and test datasets
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
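# Quick check (not in the original) that the question and answer were encoded
# into one sequence; only the first few tokens are decoded here.
sample = tokenized_datasets['train'][0]
print(tokenizer.decode(sample['input_ids'][:40]))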
from transformers import Trainer, TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=3,
    evaluation_strategy="steps",
    save_total_limit=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=200,
    eval_steps=200,
    logging_steps=200,
    fp16=True,  # requires a CUDA GPU
)
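# The Trainer below only reports eval loss by default. A metrics callback
# (a sketch, not part of the original script) could be passed to it via
# compute_metrics=compute_metrics to also report accuracy.
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}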
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)
# Train the model
trainer.train()

# Evaluate on the held-out test split
results = trainer.evaluate(tokenized_datasets['test'])
print(results)
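# Inference sketch for a new question/answer pair (the example strings below
# are made up, not taken from the dataset).
import torch

question = "what is the capital of france"
answer = "Paris is the capital and most populous city of France."
inputs = tokenizer(question, answer, return_tensors='pt', truncation=True, max_length=800)
inputs = {k: v.to(model.device) for k, v in inputs.items()}
with torch.no_grad():
    logits = model(**inputs).logits
print("predicted label:", logits.argmax(dim=-1).item())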