Spaces:

sohomghosh
/

FinLanSer_Financial_Language_Simplifier

Runtime error

File size: 3,833 Bytes

4855ea0

import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer, BertModel, BertTokenizer
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MAX_LEN = 128
BATCH_SIZE = 20
text_col_name = 'sentence'

def scoring_data_prep(dataset):
    out = []
    target = []
    mask = []
    
    for i in range(len(dataset)):
        rec = dataset[i]
        out.append(rec['ids'].reshape(-1,MAX_LEN))
        mask.append(rec['mask'].reshape(-1,MAX_LEN))

        out_stack = torch.cat(out, dim = 0)
        mask_stack = torch.cat(mask, dim =0 )
        out_stack = out_stack.to(device, dtype = torch.long)
        mask_stack = mask_stack.to(device, dtype = torch.long)

    return out_stack, mask_stack

class Triage(Dataset):
    """
    This is a subclass of torch packages Dataset class. It processes input to create ids, masks and targets required for model training. 
    """

    def __init__(self, dataframe, tokenizer, max_len, text_col_name):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text_col_name = text_col_name
        

    def __getitem__(self, index):
        title = str(self.data[self.text_col_name][index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True, #padding='max_length' #For future version use `padding='max_length'`
            return_token_type_ids=True,
            truncation=True,
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            
        }

    def __len__(self):
        return self.len

class BERTClass(torch.nn.Module):
    def __init__(self, num_class, task):
        super(BERTClass, self).__init__()
        self.num_class = num_class
        if task =="sustanability":
          self.l1 = RobertaModel.from_pretrained("roberta-base")
        else:
          self.l1 = BertModel.from_pretrained("ProsusAI/finbert")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, self.num_class)
        self.history = dict()

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

def do_predict(model, tokenizer, test_df):
  test_set = Triage(test_df, tokenizer, MAX_LEN, text_col_name)
  test_params = {'batch_size' : BATCH_SIZE, 'shuffle': False, 'num_workers':0}
  test_loader = DataLoader(test_set, **test_params)
  out_stack, mask_stack = scoring_data_prep(dataset = test_set)
  n = 0
  combined_output = []
  model.eval()
  with torch.no_grad():
      while n < test_df.shape[0]:
          output = model(out_stack[n:n+BATCH_SIZE,:],mask_stack[n:n+BATCH_SIZE,:])
          n = n + BATCH_SIZE
          combined_output.append(output)
      combined_output = torch.cat(combined_output, dim = 0)
      preds = torch.argsort(combined_output, axis = 1, descending = True)
  preds = preds.to('cpu')
  actual_predictions = [i[0] for i in preds.tolist()]
  combined_output = combined_output.to('cpu')
  prob_predictions= [i[1] for i in combined_output.tolist()]
  return (actual_predictions, prob_predictions)