!pip install datasets
!pip install transformers[torch]
from datasets import load_dataset

# wiki_qa ships with train/validation/test splits; here the stock test split
# is re-split into new validation and test sets.
raw_datasets = load_dataset("wiki_qa")
dataset = raw_datasets['test'].train_test_split(train_size=0.67, seed=42)
raw_datasets['validation'] = dataset.pop('test')
raw_datasets['test'] = dataset['train']
print(raw_datasets)
# Inspect the label distribution of each split (wiki_qa is heavily imbalanced).
raw_datasets.set_format('pandas')
print('\n\n\ntraining_labels:\n', raw_datasets['train']['label'].value_counts(), '\n\n',
      'validation_labels:\n', raw_datasets['validation']['label'].value_counts(), '\n\n',
      'testing_labels:\n', raw_datasets['test']['label'].value_counts())
raw_datasets.reset_format()
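# Optional rebalancing sketch (not in the original script): wiki_qa is heavily
# skewed toward label 0, so one option is to downsample negatives before
# fine-tuning. The 0.1 keep ratio below is an arbitrary assumption.
import random
random.seed(42)
balanced_train = raw_datasets['train'].filter(
    lambda ex: ex['label'] == 1 or random.random() < 0.1
)
print(balanced_train)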
from transformers import GPT2Config, GPT2ForSequenceClassification, GPT2Tokenizer

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load the GPT-2 configuration
config = GPT2Config.from_pretrained("gpt2")

# Modify the configuration for sequence classification
config.num_labels = 2  # number of classes for the classification task
config.pad_token_id = tokenizer.eos_token_id  # GPT-2 has no pad token, so reuse EOS

# Initialize the GPT-2 model with a sequence-classification head
model = GPT2ForSequenceClassification.from_pretrained("gpt2", config=config)
tokenizer.pad_token = tokenizer.eos_token
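# Quick sanity check (not in the original): GPT2ForSequenceClassification keeps
# its classification head in the `score` attribute, and padding must be wired
# to the EOS token because the base GPT-2 model has no pad token.
print(model.score)                                          # Linear(768 -> 2)
print(model.config.pad_token_id == tokenizer.eos_token_id)  # True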
def tokenize_function(examples):
    # Encode each question/answer pair as a single sequence so the model sees
    # both texts; GPT-2 has no separator token, so the two are concatenated.
    return tokenizer(
        examples['question'],
        examples['answer'],
        padding='max_length',
        truncation=True,
        max_length=800,
    )

# Tokenize the train, validation, and test datasets
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
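# Quick check (not in the original) that the question and answer were encoded
# into one sequence; only the first few tokens are decoded here.
sample = tokenized_datasets['train'][0]
print(tokenizer.decode(sample['input_ids'][:40]))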
from transformers import Trainer, TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=3,
    evaluation_strategy="steps",
    save_total_limit=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=200,
    eval_steps=200,
    logging_steps=200,
    fp16=True,  # requires a CUDA GPU
)
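# The Trainer below only reports eval loss by default. A metrics callback
# (a sketch, not part of the original script) could be passed to it via
# compute_metrics=compute_metrics to also report accuracy.
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}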
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)
# Train the model
trainer.train()

# Evaluate on the held-out test split
results = trainer.evaluate(tokenized_datasets['test'])
print(results)
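# Inference sketch for a new question/answer pair (the example strings below
# are made up, not taken from the dataset).
import torch

question = "what is the capital of france"
answer = "Paris is the capital and most populous city of France."
inputs = tokenizer(question, answer, return_tensors='pt', truncation=True, max_length=800)
inputs = {k: v.to(model.device) for k, v in inputs.items()}
with torch.no_grad():
    logits = model(**inputs).logits
print("predicted label:", logits.argmax(dim=-1).item())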