sohomghosh's picture
Update README.md
9b1a74b
---
license: mit
---
How to use ths model?
Download the pytorch_model.bin file and execute the following:
```python
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer, BertModel, BertTokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LEN = 128
BATCH_SIZE = 20
text_col_name = 'sentence'
category_col = 'label_text'
#Input should be one dataframe having one column with header as 'sentence' : test_df (do reset_index() if needed)
test_df = pd.DataFrame({"sentence":['We are striving to reduce the amount of waste we produce, and to reduce water as well as paper consumption.']})
def scoring_data_prep(dataset):
out = []
target = []
mask = []
for i in range(len(dataset)):
rec = dataset[i]
out.append(rec['ids'].reshape(-1,MAX_LEN))
mask.append(rec['mask'].reshape(-1,MAX_LEN))
out_stack = torch.cat(out, dim = 0)
mask_stack = torch.cat(mask, dim =0 )
out_stack = out_stack.to(device, dtype = torch.long)
mask_stack = mask_stack.to(device, dtype = torch.long)
return out_stack, mask_stack
class Triage(Dataset):
"""
This is a subclass of torch packages Dataset class. It processes input to create ids, masks and targets required for model training.
"""
def __init__(self, dataframe, tokenizer, max_len, text_col_name):
self.len = len(dataframe)
self.data = dataframe
self.tokenizer = tokenizer
self.max_len = max_len
self.text_col_name = text_col_name
def __getitem__(self, index):
title = str(self.data[self.text_col_name][index])
title = " ".join(title.split())
inputs = self.tokenizer.encode_plus(
title,
None,
add_special_tokens=True,
max_length=self.max_len,
pad_to_max_length=True,
return_token_type_ids=True,
truncation=True,
)
ids = inputs["input_ids"]
mask = inputs["attention_mask"]
return {
"ids": torch.tensor(ids, dtype=torch.long),
"mask": torch.tensor(mask, dtype=torch.long),
}
def __len__(self):
return self.len
class BERTClass(torch.nn.Module):
def __init__(self, num_class):
super(BERTClass, self).__init__()
self.num_class = num_class
self.l1 = RobertaModel.from_pretrained("roberta-base")
self.pre_classifier = torch.nn.Linear(768, 768)
self.dropout = torch.nn.Dropout(0.3)
self.classifier = torch.nn.Linear(768, self.num_class)
self.history = dict()
def forward(self, input_ids, attention_mask):
output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
hidden_state = output_1[0]
pooler = hidden_state[:, 0]
pooler = self.pre_classifier(pooler)
pooler = torch.nn.ReLU()(pooler)
pooler = self.dropout(pooler)
output = self.classifier(pooler)
return output
def do_predict(model, tokenizer, test_df):
test_set = Triage(test_df, tokenizer, MAX_LEN, text_col_name)
test_params = {'batch_size' : BATCH_SIZE, 'shuffle': False, 'num_workers':0}
test_loader = DataLoader(test_set, **test_params)
out_stack, mask_stack = scoring_data_prep(dataset = test_set)
n = 0
combined_output = []
model.eval()
with torch.no_grad():
while n < test_df.shape[0]:
output = model(out_stack[n:n+BATCH_SIZE,:],mask_stack[n:n+BATCH_SIZE,:])
n = n + BATCH_SIZE
combined_output.append(output)
combined_output = torch.cat(combined_output, dim = 0)
preds = torch.argsort(combined_output, axis = 1, descending = True)
preds = preds.to('cpu')
actual_predictions = [i[0] for i in preds.tolist()]
return actual_predictions
model_sustain = BERTClass(2)
model_sustain.to(device)
model_sustain.load_state_dict(torch.load('pytorch_model.bin', map_location=device)['model_state_dict'])
tokenizer_sus = BertTokenizer.from_pretrained('roberta-base')
actual_predictions_sus = do_predict(model_sustain, tokenizer_sus, test_df)
test_df['sustainability'] = ['sustainable' if i==0 else 'unsustainable' for i in actual_predictions_read]
```
Our work can be cited as follows:
```bibtex
@inproceedings{ghosh-2022-finsim-esg,
title = "Ranking Environment, Social And Governance Related Concepts And Assessing Sustainability Aspect Of Financial Texts",
author={Ghosh, Sohom and Naskar, Sudip Kumar},
booktitle = "Proceedings of the Fourth Workshop on Financial Technology and Natural Language Processing (FinNLP@IJCAI-ECAI 2022)",
month = "July" ,
year = "2022",
address = "Vienna, Austria",
publisher = "-",
url = "https://mx.nthu.edu.tw/~chungchichen/FinNLP2022_IJCAI/14.pdf",
pages = "87--92",
}
```