In [None]:
#install datasets and transformers
!pip install datasets transformers
#install huggingface hub to deploy model
!pip install huggingface_hub

In [None]:
#install wandb to visualize metrics, you can ommit this step and next if it is not necessary
!pip install wandb


In [None]:
import wandb
# Authenticate with Weights & Biases so that training metrics can be logged there.
wandb.login()

In [None]:
import torch
import transformers

In [None]:
# Log in to the Hugging Face Hub. First sign up at Hugging Face and create an
# access token with write permission.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from datasets import load_dataset, load_metric

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd

In [None]:
# Load the tokenizer of the pretrained FinBERT checkpoint from the Hugging Face Hub.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="ProsusAI/finbert",
)


In [None]:
# Read the prepared news data; the first CSV column becomes the DataFrame index.
df = pd.read_csv(
    filepath_or_buffer='/content/news_prepared_to_model_wig20_th.csv',
    index_col=0,
)

In [None]:
# Translate the numeric sentiment codes into their class names.
sentiment_names = {1: 'negative', 2: 'neutral', 3: 'positive'}
df['sentiment'] = df['sentiment'].map(sentiment_names)

In [None]:
# One-hot encode the sentiment labels: one indicator column per class name.
# dtype=int pins the indicators to 0/1 integers; pandas >= 2.0 would otherwise
# produce boolean columns, which would be written to the CSV as True/False.
get_dum_labels = pd.get_dummies(df['sentiment'], dtype=int)

In [None]:
# Attach the one-hot indicator columns to the frame, aligned on the index.
df = df.join(get_dum_labels, how='left')

In [None]:
# Rename the textual sentiment column to 'label'.
df = df.rename({'sentiment': 'label'}, axis='columns')

In [None]:
# Keep only the text and the three indicator columns, in this fixed order.
model_columns = ['text', 'positive', 'negative', 'neutral']
df = df.loc[:, model_columns]

In [None]:
# Persist the prepared frame (without its index) so it can be loaded as a dataset.
df.to_csv(
    path_or_buf='/content/news_prepared.csv',
    index=False,
)

In [None]:
from datasets import load_dataset, load_metric
# Load the prepared CSV as a single Dataset (the 'train' split of the csv loader).
dataset = load_dataset(
    path='csv',
    data_files='/content/news_prepared.csv',
    split='train',
)

In [None]:
# Hold out 30% of the rows as the test split. The fixed seed makes the split
# reproducible across runs (same value as the training seed).
dataset = dataset.train_test_split(test_size=0.3, seed=2)

In [None]:
cols = dataset['train'].column_names


def _collect_label(example):
    """Gather every non-text column of one example into a single 'label' list."""
    return {"label": [example[name] for name in cols if name != "text"]}


# Add a 'label' column holding the indicator values in column order.
dataset = dataset.map(_collect_label)
dataset

  0%|          | 0/5186 [00:00<?, ?ex/s]

  0%|          | 0/2223 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'positive', 'negative', 'neutral', 'label'],
        num_rows: 5186
    })
    test: Dataset({
        features: ['text', 'positive', 'negative', 'neutral', 'label'],
        num_rows: 2223
    })
})

In [None]:
# Inspect the first training example to check the text and its label vector.
dataset["train"][0]

{'text': 'orange polska have generate in the third quarter pln million of ebitda profit after take into account leasing cost by percent more than a year ago the company say in the report analyst respond by pap bizne forecast that the company would achieve the result at level pln million',
 'positive': 1,
 'negative': 0,
 'neutral': 0,
 'label': [1, 0, 0]}

In [None]:
def preprocess(data):
    """Tokenize the ``text`` field of a batch of examples.

    Texts longer than the tokenizer's maximum length are truncated. No padding
    is applied here: padding at map time would pad each example to the longest
    text of whatever batch ``map`` happens to pass in. Padding is left to the
    data collator used at training time, which pads each training batch to its
    own longest sequence.

    Args:
        data: a batch from ``Dataset.map(..., batched=True)``; only
            ``data['text']`` is read.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(data['text'], truncation=True)

In [None]:
# Tokenize both splits batch by batch.
# NOTE(review): `dataset` is the train/test container here, so len(dataset) is
# the number of splits (2), not the number of rows; the progress output
# (2593 batches for 5186 training rows) confirms batches of 2 examples.
# Confirm whether a different batch size was intended.
dataset=dataset.map(preprocess, batched=True, batch_size=len(dataset))

  0%|          | 0/2593 [00:00<?, ?ba/s]

  0%|          | 0/1112 [00:00<?, ?ba/s]

In [None]:
# Keep a copy of each split's label column so it can be appended again as the
# last column once it has been removed from the datasets.
# (remove_columns returns a new dataset rather than modifying in place, so the
# actual removal must be assigned back; calling it here without assignment
# had no effect and was dropped.)
label_end_train = dataset['train']['label']
label_end_test = dataset['test']['label']

Dataset({
    features: ['text', 'positive', 'negative', 'neutral', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2223
})

In [None]:
# Drop the label column from the training split (the result must be assigned back).
dataset['train'] = dataset['train'].remove_columns(['label'])

In [None]:
# Drop the label column from the test split (the result must be assigned back).
dataset['test'] = dataset['test'].remove_columns(['label'])

In [None]:
# Show the test split to verify that the label column is gone.
dataset['test']

Dataset({
    features: ['text', 'positive', 'negative', 'neutral', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2223
})

In [None]:
# Append the previously removed label column again, now as the last column.
for split_name, split_labels in (('train', label_end_train), ('test', label_end_test)):
    dataset[split_name] = dataset[split_name].add_column(name='label', column=split_labels)

In [None]:
# Drop the raw text column; only the tokenized fields are kept.
for split_name in ('train', 'test'):
    dataset[split_name] = dataset[split_name].remove_columns('text')

In [None]:
# Return torch tensors from the dataset from here on.
dataset.set_format("torch")


def _label_to_float(example):
    """Return the example's label tensor cast to float under a temporary name."""
    return {"float_labels": example["label"].to(torch.float)}


# Replace the 'label' column by its float version.
dataset = dataset.map(_label_to_float, remove_columns=["label"])
dataset = dataset.rename_column("float_labels", "label")

  0%|          | 0/5186 [00:00<?, ?ex/s]

  0%|          | 0/2223 [00:00<?, ?ex/s]

In [None]:
# Drop the per-class indicator columns; their values are already in 'label'.
indicator_columns = ['positive', 'negative', 'neutral']
dataset = dataset.remove_columns(indicator_columns)

In [None]:
# Batch size taken from the hyperparameter tuning results.
batch_size=16
# Number of training epochs.
epochs=10

In [None]:
!pip install evaluate
import evaluate
import numpy as np
def compute_metrics(eval_pred):
  accuracy_score=evaluate.load('accuracy')
  predictions, labels = eval_pred
  predictions=np.argmax(predictions, axis=-1)
  labels=np.argmax(labels, axis=-1)
  
  return accuracy_score.compute(references=labels, predictions=predictions)
  

In [None]:
# Set the WANDB_PROJECT environment variable (presumably the Weights & Biases
# project this run is logged under; verify against the wandb integration docs).
%env WANDB_PROJECT=stock_sentiment_hp_wb

env: WANDB_PROJECT=stock_sentiment_hp_wb


In [None]:
# Early stopping: end training once the monitored metric has failed to improve
# by at least the threshold (0.01) for 2 evaluations in a row.
early_stopping = transformers.EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.01,
)

In [None]:
# Define the training arguments.
args = TrainingArguments(
    # Evaluate and checkpoint once per epoch; both strategies must match for
    # load_best_model_at_end to work.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1.12e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Use the notebook-level constant instead of repeating the literal value.
    num_train_epochs=epochs,
    weight_decay=0.01,
    seed=2,
    # Restore the checkpoint with the best (lowest) evaluation loss after training.
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    # Publish checkpoints to the Hugging Face Hub and log metrics to wandb.
    push_to_hub=True,
    output_dir='/content/stock_sentiment_hp_wb',
    report_to="wandb"
)

PyTorch: setting up devices


In [None]:
# Load pretrained FinBERT with a 3-class classification head
# (positive / negative / neutral).
# NOTE(review): the labels were cast to float label vectors earlier, which
# presumably makes the model use its multi-label loss; confirm this is intended.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="ProsusAI/finbert",
    num_labels=3,
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--ProsusAI--finbert/snapshots/54bddcea2cca580dd1df6a88d33242dcf4c61a71/config.json
Model config BertConfig {
  "_name_or_path": "ProsusAI/finbert",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522


In [None]:
# Assemble the Trainer: model and tokenizer, training configuration,
# train/eval splits, the accuracy metric and the early-stopping callback.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

/content/stock_sentiment_hp_wb is already a clone of https://huggingface.co/slisowski/stock_sentiment_hp_wb. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
# Run fine-tuning with the configuration defined above.
trainer.train()

In [None]:
# Push the trained model to the Hugging Face Hub.
trainer.push_to_hub()

In [None]:
# Load the published model and its tokenizer from the Hub for use in a pipeline.
# NOTE(review): this repository name differs from the 'stock_sentiment_hp_wb'
# output directory used for training above; confirm which checkpoint is intended.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="slisowski/stock_sentiment_hp",
)

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="slisowski/stock_sentiment_hp",
)

In [None]:
from transformers import TextClassificationPipeline
from transformers import pipeline

In [None]:
# Build a text-classification pipeline on CPU (PyTorch backend) from the
# published model and the FinBERT tokenizer.
pipe_stock = pipeline(
    task='text-classification',
    model="slisowski/stock_sentiment_hp",
    tokenizer='ProsusAI/finbert',
    device='cpu',
    framework='pt',
)

In [None]:
# Classify one example headline; top_k=3 requests the three best labels with
# their scores.
headline = 'The withdrawal of futures on the S&P 500 causes anxiety on the WSE'
pipe_stock(headline, top_k=3)

In [None]:
# Save the pipeline (model config, weights and tokenizer files) to this directory.
# NOTE(review): the path is under /content/drive, so this presumably requires
# Google Drive to be mounted first; no mount step is visible in this notebook.
pipe_stock.save_pretrained('/content/drive/MyDrive/colab_data/pipeline_hf_stock/')

Configuration saved in /content/drive/MyDrive/colab_data/pipeline_hf_stock/config.json
Model weights saved in /content/drive/MyDrive/colab_data/pipeline_hf_stock/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/colab_data/pipeline_hf_stock/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/colab_data/pipeline_hf_stock/special_tokens_map.json
