---
license: mit
---
How to use this model?
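The snippet below is a minimal inference example: it loads the fine-tuned RoBERTa-based classifier and labels each input sentence as sustainable or unsustainable. It assumes that `torch`, `transformers`, and `pandas` are installed, and that the `pytorch_model.bin` checkpoint from this repository is in the working directory.
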
```python
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import RobertaModel, RobertaTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MAX_LEN = 128
BATCH_SIZE = 20
text_col_name = 'sentence'
category_col = 'label_text'

# Input: a single dataframe with one column named 'sentence' (call reset_index() on it first if needed)
test_df = pd.DataFrame({"sentence": ['We are striving to reduce the amount of waste we produce, and to reduce water as well as paper consumption.']})

def scoring_data_prep(dataset):
    """Stack the token ids and attention masks of all records into two tensors on the target device."""
    out = []
    mask = []

    for i in range(len(dataset)):
        rec = dataset[i]
        out.append(rec['ids'].reshape(-1, MAX_LEN))
        mask.append(rec['mask'].reshape(-1, MAX_LEN))

    out_stack = torch.cat(out, dim=0).to(device, dtype=torch.long)
    mask_stack = torch.cat(mask, dim=0).to(device, dtype=torch.long)

    return out_stack, mask_stack

class Triage(Dataset):
    """
    A subclass of torch's Dataset. It tokenizes each sentence into the ids, attention
    masks and (when a label column is supplied) targets required by the model.
    """

    def __init__(self, dataframe, tokenizer, max_len, text_col_name, category_col=None):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text_col_name = text_col_name
        self.category_col = category_col

    def __getitem__(self, index):
        title = str(self.data[self.text_col_name][index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        item = {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
        }
        if self.category_col is not None:  # targets are only available for labelled data
            item["targets"] = torch.tensor(self.data[self.category_col][index], dtype=torch.long)
        return item

    def __len__(self):
        return self.len

class BERTClass(torch.nn.Module):
    def __init__(self, num_class):
        super(BERTClass, self).__init__()
        self.num_class = num_class
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, self.num_class)
        self.history = dict()

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]  # embedding of the first (<s>) token
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

def do_predict(model, tokenizer):
    test_set = Triage(test_df, tokenizer, MAX_LEN, text_col_name)
    out_stack, mask_stack = scoring_data_prep(dataset=test_set)
    n = 0
    combined_output = []
    model.eval()
    with torch.no_grad():
        while n < test_df.shape[0]:
            output = model(out_stack[n:n + BATCH_SIZE, :], mask_stack[n:n + BATCH_SIZE, :])
            n = n + BATCH_SIZE
            combined_output.append(output)
    combined_output = torch.cat(combined_output, dim=0)
    preds = torch.argsort(combined_output, dim=1, descending=True)  # classes ranked by logit
    preds = preds.to('cpu')
    actual_predictions = [i[0] for i in preds.tolist()]  # highest-scoring class per sentence
    return actual_predictions

model_sus = BERTClass(2)
model_sus.load_state_dict(torch.load('pytorch_model.bin', map_location=device)['model_state_dict'])
model_sus.to(device)

tokenizer_sus = RobertaTokenizer.from_pretrained('roberta-base')
actual_predictions_sus = do_predict(model_sus, tokenizer_sus)

# Label 0 corresponds to 'sustainable', label 1 to 'unsustainable'
test_df['sustainability'] = ['sustainable' if i == 0 else 'unsustainable' for i in actual_predictions_sus]
```
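
To score your own sentences, reassign `test_df` to a dataframe with a `sentence` column before calling `do_predict`, since the function reads the module-level `test_df`. A minimal sketch; the example sentences below are illustrative, not from the model's training data:

```python
# Hypothetical input sentences, for illustration only.
test_df = pd.DataFrame({"sentence": [
    "The company installed solar panels across all of its factories.",
    "The plant discharges untreated effluent directly into the river.",
]})

actual_predictions_sus = do_predict(model_sus, tokenizer_sus)  # top class index per sentence
test_df['sustainability'] = ['sustainable' if i == 0 else 'unsustainable' for i in actual_predictions_sus]
print(test_df[['sentence', 'sustainability']])
```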

Our work can be cited as follows:

```bibtex
@inproceedings{ghosh-2022-finsim-esg,
    title = "Ranking Environment, Social And Governance Related Concepts And Assessing Sustainability Aspect Of Financial Texts",
    author = "Ghosh, Sohom and Naskar, Sudip Kumar",
    booktitle = "Proceedings of the Fourth Workshop on Financial Technology and Natural Language Processing (FinNLP@IJCAI-ECAI 2022)",
    month = "July",
    year = "2022",
    address = "Vienna, Austria",
    publisher = "-",
    url = "https://mx.nthu.edu.tw/~chungchichen/FinNLP2022_IJCAI/14.pdf",
    pages = "87--92",
}
```