dejanseo
/

sentiment-croatian

Text Classification

Inference Endpoints

Model card Files Files and versions Community

sentiment-croatian / train.py

dejanseo's picture

Upload 2 files

eb404b7 verified 5 months ago

history blame contribute delete

2.69 kB

	import pandas as pd
	from sklearn.model_selection import train_test_split
	from transformers import ElectraTokenizer, ElectraForSequenceClassification, Trainer, TrainingArguments
	import torch
	from datasets import Dataset
	import wandb
	from sklearn.metrics import precision_recall_fscore_support, accuracy_score

	# Read the labelled sentences; later steps in this script access a 'text'
	# and a 'label' column.
	data = pd.read_csv('sentences.csv')

	# Hold out 20% of the rows for evaluation, using a fixed seed for the split.
	train_df, eval_df = train_test_split(data, random_state=42, test_size=0.2)

	# Wrap both pandas frames as Hugging Face Datasets (train split first).
	train_dataset, eval_dataset = (
	    Dataset.from_pandas(frame) for frame in (train_df, eval_df)
	)

	# Load the tokenizer and a 3-label sequence classifier from the same checkpoint.
	model_name = 'classla/bcms-bertic'
	tokenizer = ElectraTokenizer.from_pretrained(model_name)
	model = ElectraForSequenceClassification.from_pretrained(
	    model_name,
	    num_labels=3,
	)

	# Tokenize the datasets
	def tokenize_function(examples, max_length=128):
	    """Tokenize the ``'text'`` entry of a batch with the module-level tokenizer.

	    Args:
	        examples: Mapping with a ``'text'`` entry; that entry is passed to
	            ``tokenizer`` unchanged. This script calls the function through
	            ``Dataset.map(..., batched=True)``.
	        max_length: Length that every sequence is truncated or padded to.
	            Defaults to 128, the value this script was written with.

	    Returns:
	        Whatever the tokenizer call returns for the batch.
	    """
	    return tokenizer(
	        examples['text'],
	        truncation=True,
	        padding='max_length',
	        max_length=max_length,
	    )

	# Tokenize both splits in batches, train split first.
	train_dataset, eval_dataset = (
	    split.map(tokenize_function, batched=True)
	    for split in (train_dataset, eval_dataset)
	)

	# Expose only the columns listed here, as PyTorch tensors.
	for tokenized_split in (train_dataset, eval_dataset):
	    tokenized_split.set_format(
	        type='torch',
	        columns=['input_ids', 'attention_mask', 'label'],
	    )

	# Define the compute_metrics function
	def compute_metrics(p):
	    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

	    Args:
	        p: Object exposing ``predictions`` (per-class scores, or a tuple
	            whose first item holds them) and ``label_ids``.

	    Returns:
	        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'``
	        and ``'f1'``.
	    """
	    # Some models return extra outputs alongside the class scores; in that
	    # case the scores are taken from the first tuple element.
	    scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
	    preds = scores.argmax(-1)

	    # zero_division=0 gives the same score as the default setting but does
	    # not emit a warning whenever a class is never predicted.
	    precision, recall, f1, _ = precision_recall_fscore_support(
	        p.label_ids,
	        preds,
	        average='weighted',
	        zero_division=0,
	    )
	    acc = accuracy_score(p.label_ids, preds)
	    return {
	        'accuracy': acc,
	        'precision': precision,
	        'recall': recall,
	        'f1': f1,
	    }

	# Training configuration: evaluate and checkpoint once per epoch, then load
	# the best checkpoint (judged by 'accuracy') when training ends.
	# NOTE(review): recent transformers releases spell `evaluation_strategy` as
	# `eval_strategy` — confirm against the installed version.
	training_args = TrainingArguments(
	    # Optimisation schedule
	    num_train_epochs=20,
	    learning_rate=1e-5,
	    weight_decay=0.01,
	    warmup_steps=500,
	    per_device_train_batch_size=128,
	    per_device_eval_batch_size=128,
	    # Evaluation and checkpointing
	    evaluation_strategy='epoch',
	    save_strategy='epoch',
	    save_total_limit=20,
	    load_best_model_at_end=True,
	    metric_for_best_model='accuracy',
	    output_dir='./results',
	    # Logging and experiment tracking
	    logging_dir='./logs',
	    logging_steps=10,
	    report_to='wandb',
	    run_name='sentiment-classification',
	)

	# Initialize WandB before the Trainer is created. The run name is passed
	# explicitly so the run carries the name configured in TrainingArguments.
	# NOTE(review): the Trainer's wandb integration appears to reuse an
	# already-active run instead of starting one named after `run_name` —
	# confirm against the installed transformers version.
	wandb.init(
	    project="sentiment-classification",
	    entity="dejan",
	    name=training_args.run_name,
	)

	# Define Trainer
	trainer = Trainer(
	    model=model,
	    args=training_args,
	    train_dataset=train_dataset,
	    eval_dataset=eval_dataset,
	    compute_metrics=compute_metrics,
	)

	# Train the model
	trainer.train()

	# Evaluate the model and show the final metrics instead of discarding them
	eval_metrics = trainer.evaluate()
	print(eval_metrics)

	# Finish the WandB run
	wandb.finish()

	# Save the model and tokenizer side by side in the same directory
	model.save_pretrained('./sentiment-model')
	tokenizer.save_pretrained('./sentiment-model')