Spaces:
No application file
No application file
Commit
•
6e53961
1
Parent(s):
a005360
Upload text_class.py
Browse files- text_class.py +88 -0
text_class.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
!pip install datasets
|
2 |
+
pip install transformers[torch]
|
3 |
+
|
4 |
+
from datasets import load_dataset, load_metric

# WikiQA: question/answer pairs with a binary 'label' (1 = answer is correct).
raw_datasets = load_dataset("wiki_qa")

# Repurpose the original test split: 67% of it becomes the new test set and
# the remaining 33% replaces the validation split (seeded for reproducibility).
dataset = raw_datasets['test'].train_test_split(train_size=0.67, seed=42)
raw_datasets["validation"] = dataset.pop("test")
raw_datasets['test'] = dataset['train']

print(raw_datasets)

# Temporarily view the splits as pandas to inspect the label distribution.
# Fixed: the original first literal was 'n\n\n\n' (missing backslash), which
# printed a stray 'n' instead of a newline.
raw_datasets.set_format('pandas')
print('\n\n\n\ntraining_labels:\n', raw_datasets['train']['label'].value_counts(), '\n\n',
      'validation_labels:\n', raw_datasets['validation']['label'].value_counts(), '\n\n',
      'testing_labels:\n', raw_datasets['test']['label'].value_counts())
raw_datasets.reset_format()
|
20 |
+
|
21 |
+
from transformers import GPT2Config, GPT2ForSequenceClassification, GPT2Tokenizer

# Tokenizer and configuration for the pretrained GPT-2 checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
config = GPT2Config.from_pretrained("gpt2")

# Binary relevance classification. GPT-2 ships without a pad token, so the
# EOS token doubles as padding on both the config and the tokenizer.
config.num_labels = 2
config.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

# Pretrained GPT-2 backbone with a freshly initialized classification head.
model = GPT2ForSequenceClassification.from_pretrained("gpt2", config=config)
|
38 |
+
|
39 |
+
def tokenize_function(examples):
    """Tokenize question/answer pairs for GPT-2 sequence classification.

    Fix: the original tokenized question and answer separately and stored the
    answer under 'answer_input_ids'/'answer_attention_mask' — keys that
    GPT2ForSequenceClassification never reads — so the classifier only ever
    saw the question. Encoding the pair in a single call puts both texts into
    'input_ids'. The redundant return_tensors='pt' is also dropped: inside a
    batched datasets.map, lists are the expected output.

    Args:
        examples: batched dataset slice with 'question' and 'answer' lists.

    Returns:
        dict with 'input_ids' and 'attention_mask' covering question + answer.
    """
    # GPT-2 adds no special separator tokens, so the pair is concatenated;
    # fixed-length padding/truncation keeps batches rectangular (800 < GPT-2's
    # 1024-token context window).
    return tokenizer(
        examples['question'],
        examples['answer'],
        padding='max_length',
        truncation=True,
        max_length=800,
    )


# Tokenize the train, validation, and test splits in batches.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
|
56 |
+
|
57 |
+
|
58 |
+
from transformers import Trainer, TrainingArguments


# Fine-tuning hyper-parameters. Evaluation, checkpointing, and logging all
# fire every 200 optimizer steps; at most two checkpoints are kept on disk.
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    logging_steps=200,
    fp16=True,  # mixed precision — requires a CUDA-capable GPU
)
|
74 |
+
|
75 |
+
# Trainer
|
76 |
+
trainer = Trainer(
|
77 |
+
model=model,
|
78 |
+
args=training_args,
|
79 |
+
train_dataset=tokenized_datasets['train'],
|
80 |
+
eval_dataset=tokenized_datasets['validation'],
|
81 |
+
)
|
82 |
+
|
83 |
+
# Train the model
|
84 |
+
trainer.train()
|
85 |
+
|
86 |
+
# Evaluate on the test dataset
|
87 |
+
results = trainer.evaluate(tokenized_datasets['test'])
|
88 |
+
print(results)
|