smeintadmin
/

image_intents

Text Classification

Inference Endpoints

Model card Files Files and versions Metrics Training metrics Community

image_intents / fineTune.py

smeintadmin's picture

Upload 16 files

1ae8986 about 1 year ago

2.57 kB

	from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
	from datasets import load_dataset, concatenate_datasets

	# Base checkpoint name; the script uses it to load the tokenizer.
	MODEL_NAME = "roberta-large"
	# Location the classifier is loaded from and saved back to after training.
	SAVE_MODEL_FOLDER = "img_intents_model"
	# Output directory handed to the Trainer (checkpoints and logs).
	OUTPUT_DIR = "./results"

	# The tokenizer is taken from the base checkpoint, while the classifier
	# weights are read from SAVE_MODEL_FOLDER (the same location the script
	# saves the model to once training finishes).
	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
	model = AutoModelForSequenceClassification.from_pretrained(SAVE_MODEL_FOLDER)

	# Read each text file with the 'text' loader and keep its 'train' split
	positives_dataset = load_dataset('text', data_files='test_positives.txt')['train']
	negatives_dataset = load_dataset('text', data_files='test_negatives.txt')['train']

	# Attach the class label to every row: 1 for positives, 0 for negatives
	positives_dataset = positives_dataset.map(lambda example: {'label': 1})
	negatives_dataset = negatives_dataset.map(lambda example: {'label': 0})

	# Stack the two labelled datasets into the single training set
	train_dataset = concatenate_datasets([positives_dataset, negatives_dataset])

	# Tokenization step applied to the dataset
	def preprocess_function(examples):
	    """Tokenize the "text" field of the given examples.

	    Inputs are truncated to at most 512 tokens and padded up to that same
	    length. Returns whatever the module-level tokenizer produces.
	    """
	    encoded = tokenizer(
	        examples["text"],
	        truncation=True,
	        max_length=512,
	        padding='max_length',
	    )
	    return encoded

	# Tokenize the whole training set in batched mode
	train_dataset = train_dataset.map(preprocess_function, batched=True)

	# Drop the raw text, expose the label under the name "labels", and have the
	# dataset hand back torch-formatted values
	train_dataset = train_dataset.remove_columns(["text"])
	train_dataset = train_dataset.rename_column("label", "labels")
	train_dataset = train_dataset.with_format("torch")

	# Training configuration.
	# No evaluation schedule is set: this script builds its Trainer with a
	# training set only, and requesting step-based evaluation without an eval
	# dataset makes the Trainer raise a ValueError.
	training_args = TrainingArguments(
	    output_dir=OUTPUT_DIR,
	    num_train_epochs=5,  # Fine-tune for a few epochs
	    per_device_train_batch_size=16,  # Decrease this if necessary
	    per_device_eval_batch_size=64,  # Only matters if evaluation is run
	    warmup_steps=500,
	    weight_decay=0.01,
	    logging_dir=OUTPUT_DIR,
	    logging_strategy='steps',
	    logging_steps=10,  # Log every 10 steps
	    save_strategy='steps',
	    save_steps=500,  # Write a checkpoint every 500 steps
	    no_cuda=False,  # Do not disable CUDA
	    gradient_accumulation_steps=2,  # If necessary
	    fp16=True,  # Use mixed precision training
	    report_to='tensorboard',
	)
	# Build the Trainer from the model, the training configuration and the
	# training set (no evaluation set is supplied)
	trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

	# Run fine-tuning
	trainer.train()

	# Write the fine-tuned model to the model folder
	trainer.save_model(SAVE_MODEL_FOLDER)

	# Save the tokenizer next to the model so that folder can be loaded on its
	# own, and keep writing the copy in OUTPUT_DIR as the script did before
	tokenizer.save_pretrained(SAVE_MODEL_FOLDER)
	tokenizer.save_pretrained(OUTPUT_DIR)