File size: 4,945 Bytes
dab4a1c
 
 
1881945
9b1a74b
1881945
 
7ffaed7
1881945
 
199069b
1881945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed29696
1881945
 
 
 
 
ed29696
1881945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed29696
1881945
 
 
 
ed29696
1881945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed4339d
1881945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed29696
8a57cfb
 
1881945
 
ed4339d
1881945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
---
license: mit
---
How to use this model?
Download the pytorch_model.bin file and execute the following:

```python
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer, BertModel, BertTokenizer

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenization / batching settings and the dataframe column names used below.
MAX_LEN = 128  # maximum token length handed to the tokenizer
BATCH_SIZE = 20  # number of rows sent through the model per forward pass
text_col_name = "sentence"
category_col = "label_text"

# The input is a single dataframe, test_df, with one column whose header is
# 'sentence' (call reset_index() on it first if needed).
test_df = pd.DataFrame(
    {
        "sentence": [
            "We are striving to reduce the amount of waste we produce, and to reduce water as well as paper consumption.",
        ],
    }
)

def scoring_data_prep(dataset):
    """
    Collect every record of ``dataset`` into two stacked tensors on ``device``.

    Args:
        dataset: indexable collection supporting ``len()`` whose items are
            dicts holding tensors under the keys ``'ids'`` and ``'mask'``.

    Returns:
        Tuple ``(out_stack, mask_stack)`` of ``torch.long`` tensors, each with
        ``MAX_LEN`` columns, located on ``device``. For an empty dataset both
        tensors have shape ``(0, MAX_LEN)``.
    """
    ids_rows = []
    mask_rows = []

    # Index explicitly instead of iterating: the dataset is only required to
    # provide __len__ and __getitem__.
    for i in range(len(dataset)):
        rec = dataset[i]
        ids_rows.append(rec['ids'].reshape(-1, MAX_LEN))
        mask_rows.append(rec['mask'].reshape(-1, MAX_LEN))

    # torch.cat rejects an empty list, so return explicit empty tensors.
    if not ids_rows:
        empty = torch.empty((0, MAX_LEN), dtype=torch.long, device=device)
        return empty, empty.clone()

    # Concatenate and move to the device once, after all rows are collected,
    # rather than redoing both on every loop iteration.
    out_stack = torch.cat(ids_rows, dim=0).to(device, dtype=torch.long)
    mask_stack = torch.cat(mask_rows, dim=0).to(device, dtype=torch.long)

    return out_stack, mask_stack


class Triage(Dataset):
    """
    Torch ``Dataset`` that tokenizes one text column of a dataframe on access.

    Each item is a dict with two ``torch.long`` tensors: ``"ids"`` (token ids)
    and ``"mask"`` (attention mask), as produced by the tokenizer for that row.
    No targets are produced; this class is used for scoring only.

    Args:
        dataframe: pandas DataFrame holding the text to score.
        tokenizer: tokenizer object exposing ``encode_plus`` (Hugging Face API).
        max_len: length the tokenizer is asked to pad / truncate each text to.
        text_col_name: name of the dataframe column containing the text.
    """

    def __init__(self, dataframe, tokenizer, max_len, text_col_name):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text_col_name = text_col_name

    def __getitem__(self, index):
        # .iloc selects by position, so rows are found even when the dataframe
        # index is not 0..n-1 (e.g. after filtering without reset_index()).
        title = str(self.data[self.text_col_name].iloc[index])
        # Collapse runs of whitespace into single spaces.
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            add_special_tokens=True,
            max_length=self.max_len,
            # Replaces the deprecated pad_to_max_length=True.
            padding="max_length",
            truncation=True,
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
        }

    def __len__(self):
        return self.len
        
class BERTClass(torch.nn.Module):
    """
    Pretrained ``roberta-base`` encoder followed by a small classification head.

    The head is Linear(768, 768) -> ReLU -> Dropout(0.3) -> Linear(768, num_class)
    and is applied to the encoder output at sequence position 0.

    Args:
        num_class: number of output classes (size of the final linear layer).
    """

    def __init__(self, num_class):
        super().__init__()
        self.num_class = num_class
        # Attribute names are part of the checkpoint's state-dict keys.
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, self.num_class)
        self.history = {}

    def forward(self, input_ids, attention_mask):
        """Return the classifier output for a batch of token ids and masks."""
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, taken at position 0 of dim 1
        # (presumably the hidden state of the leading special token).
        first_token = encoder_out[0][:, 0]
        hidden = torch.relu(self.pre_classifier(first_token))
        return self.classifier(self.dropout(hidden))
        
def do_predict(model, tokenizer, test_df):
    """
    Predict a class index for every row of ``test_df``.

    Args:
        model: callable torch module taking ``(input_ids, attention_mask)`` and
            returning one row of class scores per input row.
        tokenizer: tokenizer passed through to ``Triage``.
        test_df: dataframe with the text in the ``text_col_name`` column.

    Returns:
        List of ints, one per row: the index of the highest-scoring class.
        An empty dataframe yields an empty list.
    """
    # Nothing to score; also avoids torch.cat on an empty list further down.
    if len(test_df) == 0:
        return []

    test_set = Triage(test_df, tokenizer, MAX_LEN, text_col_name)
    out_stack, mask_stack = scoring_data_prep(dataset=test_set)

    batch_outputs = []
    model.eval()  # evaluation mode for scoring
    with torch.no_grad():
        # Feed the pre-tokenized rows through the model BATCH_SIZE at a time.
        for start in range(0, out_stack.shape[0], BATCH_SIZE):
            stop = start + BATCH_SIZE
            batch_outputs.append(
                model(out_stack[start:stop, :], mask_stack[start:stop, :])
            )
        combined_output = torch.cat(batch_outputs, dim=0)
        # Only the top-1 class is needed, so argmax replaces a full argsort.
        preds = torch.argmax(combined_output, dim=1)

    return preds.to('cpu').tolist()
  
# Two-class model: below, predicted index 0 is labelled 'sustainable' and any
# other index 'unsustainable'.
model_sustain = BERTClass(2)
model_sustain.to(device)
# NOTE(security): torch.load unpickles the file; only load a pytorch_model.bin
# obtained from a source you trust.
model_sustain.load_state_dict(torch.load('pytorch_model.bin', map_location=device)['model_state_dict'])

# The encoder is roberta-base, so the RoBERTa tokenizer must be used;
# BertTokenizer expects a WordPiece vocabulary that roberta-base does not ship.
tokenizer_sus = RobertaTokenizer.from_pretrained('roberta-base')
actual_predictions_sus = do_predict(model_sustain, tokenizer_sus, test_df)

# Use the predictions computed above (the original referenced an undefined name).
test_df['sustainability'] = ['sustainable' if i == 0 else 'unsustainable' for i in actual_predictions_sus]
```

Our work can be cited as follows:

```bibtex 
@inproceedings{ghosh-2022-finsim-esg,
    title = "Ranking Environment, Social And Governance Related Concepts And Assessing Sustainability Aspect Of Financial Texts",
    author={Ghosh, Sohom and Naskar, Sudip Kumar},
    booktitle = "Proceedings of the Fourth Workshop on Financial Technology and Natural Language Processing (FinNLP@IJCAI-ECAI 2022)",
    month = "July" ,
    year = "2022",
    address = "Vienna, Austria",
    publisher = "-",
    url = "https://mx.nthu.edu.tw/~chungchichen/FinNLP2022_IJCAI/14.pdf",
    pages = "87--92",
}

```