Unable to reproduce the results from the paper

I have been unable to reproduce the results from the paper.

These are the results of the best epoch (out of 10):

███████████████████████████████████████████████████████████████████████████| 16/16 [02:00<00:00,  6.33s/it]
             precision    recall  f1-score   support

           0       0.79      0.82      0.81        61
           1       0.89      0.85      0.87       288
           2       0.76      0.82      0.79       136

    accuracy                           0.84       485
   macro avg       0.82      0.83      0.82       485
weighted avg       0.84      0.84      0.84       485

[[ 50   7   4]
 [ 12 245  31]
 [  1  23 112]]
{'eval_loss': 0.5206248760223389, 'eval_accuracy': 0.8391752577319588, 'eval_runtime': 128.6691, 'eval_samples_per_second': 3.769, 'eval_steps_per_second': 0.124, 'epoch': 3.0} 

As you can see, these results are far from those reported in the paper.

And this is my code:

import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

import torch
import evaluate

from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, AutoTokenizer
#from transformers import RobertaTokenizer, RobertaForSequenceClassification

from sklearn.model_selection import train_test_split
from sklearn import metrics

import numpy as np
import pandas as pd

### Variables ###

# Checkpoint to fine-tune; the commented-out path is an alternative local model.
#pre_model = '/home/emoman/Work/exploration/ipex/deberta-v3-base'
pre_model = 'ahmedrachid/FinancialBERT'

### Prepare the database ###

# The file is parsed as two '@'-separated columns: the sentence and its sentiment.
df = pd.read_csv(
    '/home/emoman/Work/exploration/ipex/datasets/varia/FinancialPhraseBank-v1.0/Sentences_50Agree.txt',
    sep='@',
    header=None,
    names=['Sentence', 'Sentiment'],
    encoding='iso-8859-1',
)

# Turn the textual sentiment into integer class ids.
label_encode = {'negative': 0, 'neutral': 1, 'positive': 2}
df['Sentiment'] = df['Sentiment'].map(label_encode).astype('int')

X = df['Sentence']
y = df['Sentiment']

# First carve off 20% of the data (stratified by label), then halve that
# hold-out part into the test and validation splits, again stratified.
split_seed = 31416
train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
    X, y, test_size=0.2, random_state=split_seed, stratify=y
)
test_texts, val_texts, test_labels, val_labels = train_test_split(
    holdout_texts,
    holdout_labels,
    test_size=0.5,
    random_state=split_seed,
    stratify=holdout_labels,
)

# Convert every pandas Series into a plain Python list.
train_texts, test_texts, val_texts = (
    series.to_list() for series in (train_texts, test_texts, val_texts)
)
train_labels, test_labels, val_labels = (
    series.to_list() for series in (train_labels, test_labels, val_labels)
)

### Tokenize ###

# NOTE(review): `max_seq_length` was referenced by the tokenizer calls but never
# assigned anywhere in the posted script, so they raised NameError. The value
# the author actually used is not shown; 512 is assumed here — TODO confirm.
max_seq_length = 512

tokenizer = AutoTokenizer.from_pretrained(pre_model)


def _encode(texts):
    """Tokenize a list of sentences into fixed-length PyTorch tensors.

    Every sentence is padded or truncated to ``max_seq_length`` tokens, and the
    attention mask is returned alongside the token ids.

    Args:
        texts: List of sentences to tokenize.

    Returns:
        The tokenizer output for ``texts`` with ``return_tensors='pt'``.
    """
    return tokenizer(
        texts,
        max_length=max_seq_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )


# The three splits are encoded with identical settings.
train_encodings = _encode(train_texts)
val_encodings = _encode(val_texts)
test_encodings = _encode(test_texts)

### Created dataset object from encodings ###
# To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).

class kaggleFinSA(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the parallel sequence of labels."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of detached tensor copies plus its label."""
        sample = {}
        for name, tensor in self.encodings.items():
            # clone().detach() hands out a copy instead of a view of the stored tensor.
            sample[name] = tensor[idx].clone().detach()
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each split's encodings and labels in a dataset object.
train_dataset, val_dataset, test_dataset = (
    kaggleFinSA(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

### Fine-tune with trainer ###

# Accuracy scorer from the `evaluate` library, loaded once at module level.
metric = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Print a classification report and confusion matrix, then return the accuracy.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        The result of the accuracy metric's ``compute`` for the arg-max
        predictions against ``labels``.
    """
    raw_scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(raw_scores, axis=-1)
    for summarize in (metrics.classification_report, metrics.confusion_matrix):
        print(summarize(gold, predicted))
    return metric.compute(predictions=predicted, references=gold)

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training

model = AutoModelForSequenceClassification.from_pretrained(pre_model, num_labels=3)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset



Any ideas on what the cause for this discrepancy may be?



Still running, but FinBERT produces better results with exactly the same code:

███████████████████████████████████████████████████████████████████████████| 16/16 [02:00<00:00,  6.37s/it]
             precision    recall  f1-score   support

           0       0.79      0.87      0.83        61
           1       0.94      0.86      0.90       288
           2       0.81      0.92      0.86       136

    accuracy                           0.88       485
   macro avg       0.85      0.88      0.86       485
weighted avg       0.89      0.88      0.88       485

[[ 53   5   3]
 [ 13 249  26]
 [  1  10 125]]
{'eval_loss': 0.5141590237617493, 'eval_accuracy': 0.8804123711340206, 'eval_runtime': 129.502, 'eval_samples_per_second': 3.745, 'eval_steps_per_second': 0.124, 'epoch': 4.0} 

Hello @mirix, sorry for the late response. I'm not sure you will be able to reproduce the same results, as the datasets are different: I used many data sources (financial reports, news, ...).

