Unable to reproduce the results from the paper

#3
by mirix - opened

Hello,

I have been unable to reproduce the results from the paper.

These are the results of the best epoch (out of 10):

             precision    recall  f1-score   support

           0       0.79      0.82      0.81        61
           1       0.89      0.85      0.87       288
           2       0.76      0.82      0.79       136

    accuracy                           0.84       485
   macro avg       0.82      0.83      0.82       485
weighted avg       0.84      0.84      0.84       485

[[ 50   7   4]
 [ 12 245  31]
 [  1  23 112]]
{'eval_loss': 0.5206248760223389, 'eval_accuracy': 0.8391752577319588, 'eval_runtime': 128.6691, 'eval_samples_per_second': 3.769, 'eval_steps_per_second': 0.124, 'epoch': 3.0} 

As you can see, these results are far from those reported in the paper.

And this is my code:

import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

import torch
import evaluate

from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, AutoTokenizer
#from transformers import RobertaTokenizer, RobertaForSequenceClassification

from sklearn.model_selection import train_test_split
from sklearn import metrics

import numpy as np
import pandas as pd

### Variables ###

max_seq_length=512
#pre_model = '/home/emoman/Work/exploration/ipex/deberta-v3-base'
pre_model = 'ahmedrachid/FinancialBERT'

### Prepare the database ###

df = pd.read_csv('/home/emoman/Work/exploration/ipex/datasets/varia/FinancialPhraseBank-v1.0/Sentences_50Agree.txt', 
                    header=None, sep='@', encoding='iso-8859-1', names=['Sentence', 'Sentiment'])

label_encode = {'negative': 0, 'neutral': 1, 'positive': 2}

df['Sentiment'] = df['Sentiment'].map(label_encode)
df['Sentiment'] = df['Sentiment'].astype('int')

X = df['Sentence']
y = df['Sentiment']

train_texts, test_texts, train_labels, test_labels = train_test_split(X, y, test_size=0.2, random_state=31416, stratify=y)
test_texts, val_texts, test_labels, val_labels = train_test_split(test_texts, test_labels, test_size=0.5, random_state=31416, stratify=test_labels)

train_texts = train_texts.to_list()
test_texts = test_texts.to_list()
val_texts = val_texts.to_list()

train_labels = train_labels.to_list()
test_labels = test_labels.to_list()
val_labels = val_labels.to_list()

### Tokenize ###

tokenizer = AutoTokenizer.from_pretrained(pre_model)

train_encodings = tokenizer(train_texts, max_length=max_seq_length, padding='max_length', truncation=True, return_attention_mask=True, return_tensors='pt')
val_encodings = tokenizer(val_texts, max_length=max_seq_length, padding='max_length', truncation=True, return_attention_mask=True, return_tensors='pt')
test_encodings = tokenizer(test_texts, max_length=max_seq_length, padding='max_length', truncation=True, return_attention_mask=True, return_tensors='pt')

### Create dataset objects from the encodings ###
# Note: the encodings are already tensors, so they are indexed and clone().detach()ed
# rather than wrapped in torch.tensor(), which avoids the PyTorch copy-construct warning.

class kaggleFinSA(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = kaggleFinSA(train_encodings, train_labels)
val_dataset = kaggleFinSA(val_encodings, val_labels)
test_dataset = kaggleFinSA(test_encodings, test_labels)

### Fine-tune with trainer ###

metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    print(metrics.classification_report(labels, predictions))
    print(metrics.confusion_matrix(labels, predictions))
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch',
    #eval_steps=100,
    learning_rate=2e-5,
    #weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
    overwrite_output_dir=True,
    optim='adamw_torch',
    #use_ipex=True,
    no_cuda=True,
    #bf16=True,
    #weights_prepack=False,
    #jit_mode_eval=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    #save_total_limit=2,
    save_strategy='epoch'
)

model = AutoModelForSequenceClassification.from_pretrained(pre_model, num_labels=3)
#problem_type='multi_label_classification'

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,
)

trainer.train()
#metrics=trainer.evaluate()
#print(metrics)

trainer.save_model('./model')
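
For completeness, the held-out test split can be scored with trainer.predict after training (a minimal sketch reusing the objects defined above):

# Score the held-out test split with the model loaded at the end of training
# (load_best_model_at_end=True, so this is the best epoch by eval accuracy)
test_output = trainer.predict(test_dataset)
test_preds = np.argmax(test_output.predictions, axis=-1)

print(metrics.classification_report(test_output.label_ids, test_preds))
print(metrics.confusion_matrix(test_output.label_ids, test_preds))
print(test_output.metrics)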

Any idea what the cause of this discrepancy might be?

Best,

Ed

It is still running, but FinBERT produces better results with exactly the same code:

             precision    recall  f1-score   support

           0       0.79      0.87      0.83        61
           1       0.94      0.86      0.90       288
           2       0.81      0.92      0.86       136

    accuracy                           0.88       485
   macro avg       0.85      0.88      0.86       485
weighted avg       0.89      0.88      0.88       485

[[ 53   5   3]
 [ 13 249  26]
 [  1  10 125]]
{'eval_loss': 0.5141590237617493, 'eval_accuracy': 0.8804123711340206, 'eval_runtime': 129.502, 'eval_samples_per_second': 3.745, 'eval_steps_per_second': 0.124, 'epoch': 4.0} 

Hello @mirix, sorry for the late response. I am not sure you can reproduce the same results, as the datasets are different: I used many data sources (financial reports, news, ...).
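
If it helps, one way to separate the effect of the training data from the fine-tuning setup is to score the released fine-tuned sentiment checkpoint directly on the same test split, for example with the pipeline API. This is only a minimal sketch: the model id and the label names below are assumptions, so check the model card for the exact values.

from transformers import pipeline

# Hypothetical id of the released fine-tuned sentiment checkpoint; replace with the exact name from the Hub
finetuned_id = 'ahmedrachid/FinancialBERT-Sentiment-Analysis'

clf = pipeline('text-classification', model=finetuned_id, tokenizer=finetuned_id)

# Assumes the checkpoint's labels are 'negative'/'neutral'/'positive'; adjust to the model's id2label otherwise
pred_labels = [label_encode[p['label']] for p in clf(test_texts, batch_size=32)]

print(metrics.classification_report(test_labels, pred_labels))
print(metrics.confusion_matrix(test_labels, pred_labels))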
