# image_intents/fineTune.py
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, concatenate_datasets
MODEL_NAME = "roberta-large"
SAVE_MODEL_FOLDER = "img_intents_model"
OUTPUT_DIR = "./results"
# Load the model and tokenizer.
# Note: the model is loaded from SAVE_MODEL_FOLDER, i.e. a previously saved checkpoint.
# To start from the base checkpoint instead, use
# AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).
model = AutoModelForSequenceClassification.from_pretrained(SAVE_MODEL_FOLDER)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load the sentences from the text files into datasets
positives_dataset = load_dataset('text', data_files='test_positives.txt')
negatives_dataset = load_dataset('text', data_files='test_negatives.txt')
# Take the default 'train' split of each file and attach binary labels (1 = positive, 0 = negative)
positives_dataset = positives_dataset['train'].map(lambda example: {'label': 1})
negatives_dataset = negatives_dataset['train'].map(lambda example: {'label': 0})
# Combine positives and negatives into a single labelled training dataset
train_dataset = concatenate_datasets([positives_dataset, negatives_dataset])
# Preprocessing function
def preprocess_function(examples):
    # Tokenize the texts, padding/truncating every example to a fixed length of 512 tokens
    return tokenizer(examples["text"], truncation=True, max_length=512, padding='max_length')
train_dataset = train_dataset.map(preprocess_function, batched=True)
# Drop the raw text column, rename 'label' to 'labels' (the column name the Trainer expects),
# and return PyTorch tensors so examples can be batched together
train_dataset = train_dataset.remove_columns(["text"]).rename_column("label", "labels").with_format("torch")
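# Optional (not part of the original script): hold out a small validation split so the
# Trainer can evaluate during training. A minimal sketch using Dataset.train_test_split;
# if enabled, pass eval_dataset=split["test"] to the Trainer below and set
# evaluation_strategy='steps' in the TrainingArguments.
# split = train_dataset.train_test_split(test_size=0.1, seed=42)
# train_dataset, eval_dataset = split["train"], split["test"]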
# TrainingArguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=5,              # Fine-tune for a few epochs
    per_device_train_batch_size=16,  # Decrease this if memory is tight
    per_device_eval_batch_size=64,   # Only used when evaluation is enabled
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=OUTPUT_DIR,
    logging_strategy='steps',        # Log every `logging_steps` training steps
    logging_steps=10,
    evaluation_strategy='no',        # No eval_dataset is passed to the Trainer;
                                     # set to 'steps' if you enable the validation split above
    eval_steps=100,                  # Only used when evaluation is enabled
    save_strategy='steps',           # Save a checkpoint every `save_steps` training steps
    save_steps=500,
    no_cuda=False,                   # Use GPU
    gradient_accumulation_steps=2,   # Effective batch size = 16 * 2 per device
    fp16=True,                       # Use mixed-precision training (requires GPU)
    report_to='tensorboard'
)
# Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)
# Fine-tune the model
trainer.train()
# Save the model
trainer.save_model(SAVE_MODEL_FOLDER)
# Save the tokenizer alongside the model so both can be reloaded from SAVE_MODEL_FOLDER
tokenizer.save_pretrained(SAVE_MODEL_FOLDER)
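# Illustrative usage sketch (assumption, not part of the original script): reload the saved
# model and tokenizer from SAVE_MODEL_FOLDER and classify a single sentence. The example
# sentence is hypothetical; label 1 corresponds to examples from test_positives.txt and
# label 0 to examples from test_negatives.txt.
# import torch
# clf_tokenizer = AutoTokenizer.from_pretrained(SAVE_MODEL_FOLDER)
# clf_model = AutoModelForSequenceClassification.from_pretrained(SAVE_MODEL_FOLDER)
# clf_model.eval()
# inputs = clf_tokenizer("show me a picture of a sunset", return_tensors="pt",
#                        truncation=True, max_length=512)
# with torch.no_grad():
#     logits = clf_model(**inputs).logits
# print(logits.argmax(dim=-1).item())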