In [None]:
!pip install datasets transformers


In [2]:
import torch

In [4]:
from datasets import load_dataset, load_metric

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd

In [None]:
tokenizer=AutoTokenizer.from_pretrained("ProsusAI/finbert")


In [7]:
df=pd.read_csv('/content/news_prepared_to_model_wig20_th.csv', index_col=0)

In [None]:
df['sentiment'].value_counts()

In [9]:
df['sentiment']=df['sentiment'].map({1:'negative',2:'neutral',3:'positive'})

In [10]:
get_dum_labels=pd.get_dummies(df['sentiment'])

In [11]:
df=df.join(get_dum_labels)

In [12]:
df=df.rename(columns={'sentiment':'label'})

In [13]:
df=df[['text','positive','negative','neutral']]

In [14]:
df.to_csv('/content/news_prepared.csv', index=False)

In [None]:
from datasets import load_dataset, load_metric
dataset=load_dataset('csv', data_files='/content/news_prepared.csv', split='train')

In [16]:
dataset = dataset.train_test_split(test_size=0.3)

In [None]:
cols = dataset['train'].column_names

dataset = dataset.map(lambda x : {"label": [x[c] for c in cols if c != "text"]})
dataset

In [19]:
def preprocess(data):
    return tokenizer(data['text'], padding=True, truncation=True)

In [22]:
dataset=dataset.map(preprocess, batched=True, batch_size=len(dataset))

  0%|          | 0/2593 [00:00<?, ?ba/s]

  0%|          | 0/1112 [00:00<?, ?ba/s]

In [23]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'positive', 'negative', 'neutral', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5186
    })
    test: Dataset({
        features: ['text', 'positive', 'negative', 'neutral', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2223
    })
})

In [24]:
label_end_train=dataset['train']['label']
label_end_test=dataset['test']['label']
dataset['train'].remove_columns('label')
dataset['test'].remove_columns('label')

Dataset({
    features: ['text', 'positive', 'negative', 'neutral', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2223
})

In [25]:
dataset['train']=dataset['train'].remove_columns('label')

In [26]:
dataset['test']=dataset['test'].remove_columns('label')

In [27]:
dataset['test']

Dataset({
    features: ['text', 'positive', 'negative', 'neutral', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2223
})

In [28]:
dataset['train']=dataset['train'].add_column(name='label', column=label_end_train)
dataset['test']=dataset['test'].add_column(name='label', column=label_end_test)

In [29]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'positive', 'negative', 'neutral', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 5186
    })
    test: Dataset({
        features: ['text', 'positive', 'negative', 'neutral', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 2223
    })
})

In [30]:
dataset['train']=dataset['train'].remove_columns('text')
dataset['test']=dataset['test'].remove_columns('text')

In [None]:
#cast labels to float
dataset.set_format("torch")
dataset = (dataset
          .map(lambda x : {"float_labels": x["label"].to(torch.float)}, remove_columns=["label"])
          .rename_column("float_labels", "label"))

In [33]:
dataset=dataset.remove_columns(['positive','negative','neutral'])

In [34]:
dataset['train'][0]

{'input_ids': tensor([  101,  6948,  2361, 18668,  5630,  2000,  2035, 24755,  2618,  1037,
         11443,  4859,  1997, 20228,  2078,  2454,  1045,  1041, 20228,  2078,
          2566,  3745,  2004,  2012,  1996,  2154,  1997, 11092,  1037,  5813,
          2030, 20228,  2078,  2044,  2825,  7375,  1997, 10943,  2077,  1996,
          2154,   102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'label': tensor([1., 0., 0.])}

In [35]:
batch_size=16
epochs=10

In [43]:
#!pip install evaluate
import evaluate
import numpy as np
def compute_metrics(eval_pred):
  accuracy_score=evaluate.load('accuracy')
  predictions, labels = eval_pred
  predictions=np.argmax(predictions, axis=-1)
  labels=np.argmax(labels, axis=-1)
  
  return accuracy_score.compute(references=labels, predictions=predictions)
  

In [45]:
args = TrainingArguments(
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    push_to_hub=True,
    output_dir='/content/stock_sentiment_hp'
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [38]:
#according to docs we have to wrap model in function
def model_init():
    return AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert", num_labels=3)

In [46]:
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--ProsusAI--finbert/snapshots/54bddcea2cca580dd1df6a88d33242dcf4c61a71/config.json
Model config BertConfig {
  "_name_or_path": "ProsusAI/finbert",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522


In [None]:
#library to search hyperparameters
!pip install "ray[tune]"

In [None]:
#run hyperparameters tuning
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")

In [59]:
#best hyperparameters
for n, v in best_run.hyperparameters.items():
    
    if n=='seed':
      v=int(np.round(v))
    print(n,v)
    setattr(trainer.args, n, v)

learning_rate 1.1207606211860571e-05
num_train_epochs 4
seed 2
per_device_train_batch_size 16
