---
license: mit
---
How to use this model?
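The snippet below is a minimal inference example: it loads the fine-tuned RoBERTa-based classifier and labels each input sentence as sustainable or unsustainable. It assumes that `torch`, `transformers`, and `pandas` are installed, and that the `pytorch_model.bin` checkpoint from this repository is in the working directory.
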
```python
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import RobertaModel, RobertaTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MAX_LEN = 128
BATCH_SIZE = 20
text_col_name = 'sentence'
category_col = 'label_text'

# Input: a single dataframe with one column named 'sentence' (call reset_index() on it first if needed)
test_df = pd.DataFrame({"sentence": ['We are striving to reduce the amount of waste we produce, and to reduce water as well as paper consumption.']})

def scoring_data_prep(dataset):
    """Stack the token ids and attention masks of all records into two tensors on the target device."""
    out = []
    mask = []

    for i in range(len(dataset)):
        rec = dataset[i]
        out.append(rec['ids'].reshape(-1, MAX_LEN))
        mask.append(rec['mask'].reshape(-1, MAX_LEN))

    out_stack = torch.cat(out, dim=0).to(device, dtype=torch.long)
    mask_stack = torch.cat(mask, dim=0).to(device, dtype=torch.long)

    return out_stack, mask_stack

class Triage(Dataset):
    """
    A subclass of torch's Dataset. It tokenizes each sentence into the ids, attention
    masks and (when a label column is supplied) targets required by the model.
    """

    def __init__(self, dataframe, tokenizer, max_len, text_col_name, category_col=None):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text_col_name = text_col_name
        self.category_col = category_col

    def __getitem__(self, index):
        title = str(self.data[self.text_col_name][index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        item = {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
        }
        if self.category_col is not None:  # targets are only available for labelled data
            item["targets"] = torch.tensor(self.data[self.category_col][index], dtype=torch.long)
        return item

    def __len__(self):
        return self.len

class BERTClass(torch.nn.Module):
    def __init__(self, num_class):
        super(BERTClass, self).__init__()
        self.num_class = num_class
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, self.num_class)
        self.history = dict()

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]  # embedding of the first (<s>) token
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

def do_predict(model, tokenizer):
    test_set = Triage(test_df, tokenizer, MAX_LEN, text_col_name)
    out_stack, mask_stack = scoring_data_prep(dataset=test_set)
    n = 0
    combined_output = []
    model.eval()
    with torch.no_grad():
        while n < test_df.shape[0]:
            output = model(out_stack[n:n + BATCH_SIZE, :], mask_stack[n:n + BATCH_SIZE, :])
            n = n + BATCH_SIZE
            combined_output.append(output)
    combined_output = torch.cat(combined_output, dim=0)
    preds = torch.argsort(combined_output, dim=1, descending=True)  # classes ranked by logit
    preds = preds.to('cpu')
    actual_predictions = [i[0] for i in preds.tolist()]  # highest-scoring class per sentence
    return actual_predictions

model_sus = BERTClass(2)
model_sus.load_state_dict(torch.load('pytorch_model.bin', map_location=device)['model_state_dict'])
model_sus.to(device)

tokenizer_sus = RobertaTokenizer.from_pretrained('roberta-base')
actual_predictions_sus = do_predict(model_sus, tokenizer_sus)

# Label 0 corresponds to 'sustainable', label 1 to 'unsustainable'
test_df['sustainability'] = ['sustainable' if i == 0 else 'unsustainable' for i in actual_predictions_sus]
```
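
To score your own sentences, reassign `test_df` to a dataframe with a `sentence` column before calling `do_predict`, since the function reads the module-level `test_df`. A minimal sketch; the example sentences below are illustrative, not from the model's training data:

```python
# Hypothetical input sentences, for illustration only.
test_df = pd.DataFrame({"sentence": [
    "The company installed solar panels across all of its factories.",
    "The plant discharges untreated effluent directly into the river.",
]})

actual_predictions_sus = do_predict(model_sus, tokenizer_sus)  # top class index per sentence
test_df['sustainability'] = ['sustainable' if i == 0 else 'unsustainable' for i in actual_predictions_sus]
print(test_df[['sentence', 'sustainability']])
```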

Our work can be cited as follows:

```bibtex
@inproceedings{ghosh-2022-finsim-esg,
    title = "Ranking Environment, Social And Governance Related Concepts And Assessing Sustainability Aspect Of Financial Texts",
    author = "Ghosh, Sohom and Naskar, Sudip Kumar",
    booktitle = "Proceedings of the Fourth Workshop on Financial Technology and Natural Language Processing (FinNLP@IJCAI-ECAI 2022)",
    month = "July",
    year = "2022",
    address = "Vienna, Austria",
    publisher = "-",
    url = "https://mx.nthu.edu.tw/~chungchichen/FinNLP2022_IJCAI/14.pdf",
    pages = "87--92",
}
```