File size: 4,882 Bytes
dab4a1c 1881945 7ffaed7 1881945 199069b 1881945 ed29696 1881945 ed29696 1881945 ed29696 1881945 ed29696 1881945 ed4339d 1881945 ed29696 8a57cfb 1881945 ed4339d 1881945 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
---
license: mit
---
How to use ths model?
```python
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer, BertModel, BertTokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LEN = 128
BATCH_SIZE = 20
text_col_name = 'sentence'
category_col = 'label_text'
#Input should be one dataframe having one column with header as 'sentence' : test_df (do reset_index() if needed)
test_df = pd.DataFrame({"sentence":['We are striving to reduce the amount of waste we produce, and to reduce water as well as paper consumption.']})
def scoring_data_prep(dataset):
out = []
target = []
mask = []
for i in range(len(dataset)):
rec = dataset[i]
out.append(rec['ids'].reshape(-1,MAX_LEN))
mask.append(rec['mask'].reshape(-1,MAX_LEN))
out_stack = torch.cat(out, dim = 0)
mask_stack = torch.cat(mask, dim =0 )
out_stack = out_stack.to(device, dtype = torch.long)
mask_stack = mask_stack.to(device, dtype = torch.long)
return out_stack, mask_stack
class Triage(Dataset):
"""
This is a subclass of torch packages Dataset class. It processes input to create ids, masks and targets required for model training.
"""
def __init__(self, dataframe, tokenizer, max_len, text_col_name):
self.len = len(dataframe)
self.data = dataframe
self.tokenizer = tokenizer
self.max_len = max_len
self.text_col_name = text_col_name
def __getitem__(self, index):
title = str(self.data[self.text_col_name][index])
title = " ".join(title.split())
inputs = self.tokenizer.encode_plus(
title,
None,
add_special_tokens=True,
max_length=self.max_len,
pad_to_max_length=True,
return_token_type_ids=True,
truncation=True,
)
ids = inputs["input_ids"]
mask = inputs["attention_mask"]
return {
"ids": torch.tensor(ids, dtype=torch.long),
"mask": torch.tensor(mask, dtype=torch.long),
}
def __len__(self):
return self.len
class BERTClass(torch.nn.Module):
def __init__(self, num_class):
super(BERTClass, self).__init__()
self.num_class = num_class
self.l1 = RobertaModel.from_pretrained("roberta-base")
self.pre_classifier = torch.nn.Linear(768, 768)
self.dropout = torch.nn.Dropout(0.3)
self.classifier = torch.nn.Linear(768, self.num_class)
self.history = dict()
def forward(self, input_ids, attention_mask):
output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
hidden_state = output_1[0]
pooler = hidden_state[:, 0]
pooler = self.pre_classifier(pooler)
pooler = torch.nn.ReLU()(pooler)
pooler = self.dropout(pooler)
output = self.classifier(pooler)
return output
def do_predict(model, tokenizer, test_df):
test_set = Triage(test_df, tokenizer, MAX_LEN, text_col_name)
test_params = {'batch_size' : BATCH_SIZE, 'shuffle': False, 'num_workers':0}
test_loader = DataLoader(test_set, **test_params)
out_stack, mask_stack = scoring_data_prep(dataset = test_set)
n = 0
combined_output = []
model.eval()
with torch.no_grad():
while n < test_df.shape[0]:
output = model(out_stack[n:n+BATCH_SIZE,:],mask_stack[n:n+BATCH_SIZE,:])
n = n + BATCH_SIZE
combined_output.append(output)
combined_output = torch.cat(combined_output, dim = 0)
preds = torch.argsort(combined_output, axis = 1, descending = True)
preds = preds.to('cpu')
actual_predictions = [i[0] for i in preds.tolist()]
return actual_predictions
model_sustain = BERTClass(2)
model_sustain.to(device)
model_sustain.load_state_dict(torch.load('pytorch_model.bin', map_location=device)['model_state_dict'])
tokenizer_sus = BertTokenizer.from_pretrained('roberta-base')
actual_predictions_sus = do_predict(model_sustain, tokenizer_sus, test_df)
test_df['sustainability'] = ['sustainable' if i==0 else 'unsustainable' for i in actual_predictions_read]
```
Our work can be cited as follows:
```bibtex
@inproceedings{ghosh-2022-finsim-esg,
title = "Ranking Environment, Social And Governance Related Concepts And Assessing Sustainability Aspect Of Financial Texts",
author={Ghosh, Sohom and Naskar, Sudip Kumar},
booktitle = "Proceedings of the Fourth Workshop on Financial Technology and Natural Language Processing (FinNLP@IJCAI-ECAI 2022)",
month = "July" ,
year = "2022",
address = "Vienna, Austria",
publisher = "-",
url = "https://mx.nthu.edu.tw/~chungchichen/FinNLP2022_IJCAI/14.pdf",
pages = "87--92",
}
``` |