Unable to reproduce the results from the paper
- opened
I have been unable to reproduce the results from the paper.
These are the results of the best epoch (out of 10):
|██████████| 16/16 [02:00<00:00, 6.33s/it]
precision recall f1-score support
0 0.79 0.82 0.81 61
1 0.89 0.85 0.87 288
2 0.76 0.82 0.79 136
accuracy 0.84 485
macro avg 0.82 0.83 0.82 485
weighted avg 0.84 0.84 0.84 485
[[ 50 7 4]
[ 12 245 31]
[ 1 23 112]]
{'eval_loss': 0.5206248760223389, 'eval_accuracy': 0.8391752577319588, 'eval_runtime': 128.6691, 'eval_samples_per_second': 3.769, 'eval_steps_per_second': 0.124, 'epoch': 3.0}
As you can see, these results are far from those reported in the paper.
And this is my code:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import torch
import evaluate
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, AutoTokenizer
#from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
import pandas as pd
### Variables ###
# Checkpoint identifier used for both the tokenizer and the classification model below.
#pre_model = '/home/emoman/Work/exploration/ipex/deberta-v3-base'
pre_model = 'ahmedrachid/FinancialBERT'
### Prepare the database ###
# The file is read without a header row, split on '@', into two columns: Sentence and Sentiment.
df = pd.read_csv('/home/emoman/Work/exploration/ipex/datasets/varia/FinancialPhraseBank-v1.0/Sentences_50Agree.txt',
header=None, sep='@', encoding='iso-8859-1', names=['Sentence', 'Sentiment'])
# Convert the textual sentiment to the integer class ids 0/1/2 used as labels.
label_encode = {'negative': 0, 'neutral': 1, 'positive': 2}
df['Sentiment'] = df['Sentiment'].map(label_encode)
df['Sentiment'] = df['Sentiment'].astype('int')
X = df['Sentence']
y = df['Sentiment']
# First split: 80% train / 20% held out, stratified on the label, with a fixed seed.
train_texts, test_texts, train_labels, test_labels = train_test_split(X, y, test_size=0.2, random_state=31416, stratify=y)
#, stratify=y
# Second split: the held-out 20% is halved into test and validation sets (also stratified).
test_texts, val_texts, test_labels, val_labels = train_test_split(test_texts, test_labels, test_size=0.5, random_state=31416, stratify=test_labels)
#, stratify=test_labels
# The tokenizer and the dataset class below work on plain Python lists.
train_texts = train_texts.to_list()
test_texts = test_texts.to_list()
val_texts = val_texts.to_list()
train_labels = train_labels.to_list()
test_labels = test_labels.to_list()
val_labels = val_labels.to_list()
### Tokenize ###
tokenizer = AutoTokenizer.from_pretrained(pre_model)
# Every split is padded/truncated to the same fixed length and returned as PyTorch tensors.
# NOTE(review): max_seq_length is not defined anywhere in the visible snippet, so these
# calls raise NameError as posted - the value actually used needs to be confirmed.
train_encodings = tokenizer(train_texts, max_length=max_seq_length, padding='max_length', truncation=True, return_attention_mask=True, return_tensors='pt')
val_encodings = tokenizer(val_texts, max_length=max_seq_length, padding='max_length', truncation=True, return_attention_mask=True, return_tensors='pt')
test_encodings = tokenizer(test_texts, max_length=max_seq_length, padding='max_length', truncation=True, return_attention_mask=True, return_tensors='pt')
### Created dataset object from encodings ###
# To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
class kaggleFinSA(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-sample labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from field name
            to a tensor that is indexed by sample position.
        labels: Sequence holding one label per sample; its length defines
            the length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict with every encoding field of sample ``idx`` plus its label.

        The label is stored under the key ``'labels'`` exactly as it appears
        in ``self.labels`` (no tensor conversion is performed here).
        """
        # clone().detach() hands out an independent copy, so callers cannot
        # modify the stored encodings through the returned tensors.
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)
# Wrap each split's encodings and labels in the dataset class defined above.
train_dataset = kaggleFinSA(train_encodings, train_labels)
val_dataset = kaggleFinSA(val_encodings, val_labels)
test_dataset = kaggleFinSA(test_encodings, test_labels)
### Fine-tune with trainer ###
# Accuracy metric object from the `evaluate` library; used by compute_metrics below.
metric = evaluate.load('accuracy')
def compute_metrics(eval_pred):
    """Print a per-class report and confusion matrix, then return the accuracy.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        The result of ``metric.compute`` for the predictions against the
        reference labels (``metric`` is the module-level accuracy metric).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Printed on every evaluation so per-class behaviour is visible in the log.
    print(metrics.classification_report(labels, predictions))
    print(metrics.confusion_matrix(labels, predictions))
    return metric.compute(predictions=predictions, references=labels)
# Training configuration.
# NOTE(review): the TrainingArguments( call is never closed in this snippet, so any
# further arguments that were used in the actual run are not visible here.
training_args = TrainingArguments(
output_dir='./results', # output directory
num_train_epochs=10, # total number of training epochs
per_device_train_batch_size=32, # batch size per device during training
# Load the checkpoint named by pre_model with 3 output labels, matching the three
# classes in label_encode.
model = AutoModelForSequenceClassification.from_pretrained(pre_model, num_labels=3)
# NOTE(review): the Trainer( call is also left unclosed; compute_metrics is not passed
# in the visible lines - confirm against the full script.
trainer = Trainer(
model=model, # the instantiated Transformers model to be trained
args=training_args, # training arguments, defined above
train_dataset=train_dataset, # training dataset
eval_dataset=val_dataset, # evaluation dataset
Any ideas on what the cause for this discrepancy may be?
Still running, but FinBERT produces better results with exactly the same code:
|██████████| 16/16 [02:00<00:00, 6.37s/it]
precision recall f1-score support
0 0.79 0.87 0.83 61
1 0.94 0.86 0.90 288
2 0.81 0.92 0.86 136
accuracy 0.88 485
macro avg 0.85 0.88 0.86 485
weighted avg 0.89 0.88 0.88 485
[[ 53 5 3]
[ 13 249 26]
[ 1 10 125]]
{'eval_loss': 0.5141590237617493, 'eval_accuracy': 0.8804123711340206, 'eval_runtime': 129.502, 'eval_samples_per_second': 3.745, 'eval_steps_per_second': 0.124, 'epoch': 4.0}
Hello @mirix, sorry for the late response. I'm not sure you will be able to reproduce the same results, as the datasets are different: I used many data sources (financial reports, news, ...).