Spaces:

sohomghosh
/

FLUEnT

Runtime error

App Files Files Community

FLUEnT / fin_readability_sustainability.py

sohomghosh

Upload fin_readability_sustainability.py

b142f60 almost 2 years ago

raw

history blame contribute delete

No virus

3.83 kB

	import torch
	import transformers
	from torch.utils.data import Dataset, DataLoader
	from transformers import RobertaModel, RobertaTokenizer, BertModel, BertTokenizer
	import pandas as pd

	# Fixed scoring configuration shared by the helpers in this module.
	MAX_LEN = 128  # token length every encoded sentence is padded/truncated to
	BATCH_SIZE = 20  # number of rows sent through the model per forward pass
	text_col_name = 'sentence'  # DataFrame column that holds the raw text

	# Run on the GPU when torch can see one, otherwise stay on the CPU.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

	def scoring_data_prep(dataset):
	    """Stack every record of ``dataset`` into two batched tensors for scoring.

	    Args:
	        dataset: Indexable collection supporting ``len()`` whose items are
	            dicts holding ``'ids'`` and ``'mask'`` tensors (the structure
	            returned by ``Triage.__getitem__``).

	    Returns:
	        An ``(ids, mask)`` tuple of ``torch.long`` tensors placed on
	        ``device``, each with ``MAX_LEN`` columns. An empty dataset yields
	        two tensors of shape ``(0, MAX_LEN)`` instead of raising.
	    """
	    # Index explicitly rather than iterating: a dataset that only defines
	    # __getitem__/__len__ is not guaranteed to end iteration with IndexError.
	    records = [dataset[i] for i in range(len(dataset))]

	    if not records:
	        # torch.cat rejects an empty list, so build the empty result directly.
	        empty = torch.empty((0, MAX_LEN), dtype=torch.long, device=device)
	        return empty, empty.clone()

	    # Each record is reshaped to rows of MAX_LEN before concatenation.
	    out_stack = torch.cat([rec['ids'].reshape(-1, MAX_LEN) for rec in records], dim=0)
	    mask_stack = torch.cat([rec['mask'].reshape(-1, MAX_LEN) for rec in records], dim=0)

	    out_stack = out_stack.to(device, dtype=torch.long)
	    mask_stack = mask_stack.to(device, dtype=torch.long)
	    return out_stack, mask_stack

	class Triage(Dataset):
	    """Torch ``Dataset`` that tokenizes one text column of a DataFrame.

	    Each item is a dict with the token ids and the attention mask of a
	    single row, padded/truncated to ``max_len``. No targets are produced,
	    so the items carry model inputs only.
	    """

	    def __init__(self, dataframe, tokenizer, max_len, text_col_name):
	        """Store the data source and the tokenization settings.

	        Args:
	            dataframe: pandas DataFrame containing the text column.
	            tokenizer: Tokenizer exposing ``encode_plus`` (e.g. a Hugging
	                Face ``BertTokenizer`` / ``RobertaTokenizer``).
	            max_len: Length every encoded sequence is padded/truncated to.
	            text_col_name: Name of the column holding the raw text.
	        """
	        self.len = len(dataframe)
	        self.data = dataframe
	        self.tokenizer = tokenizer
	        self.max_len = max_len
	        self.text_col_name = text_col_name

	    def __getitem__(self, index):
	        """Return ``{'ids': LongTensor, 'mask': LongTensor}`` for row ``index``.

	        ``index`` is positional (0 .. len-1), independent of the
	        DataFrame's own index labels.
	        """
	        # .iloc keeps the lookup positional; plain [] is label-based and
	        # fails on frames whose index is not 0..n-1 (e.g. after filtering).
	        title = str(self.data[self.text_col_name].iloc[index])
	        # Collapse any run of whitespace into a single space.
	        title = " ".join(title.split())
	        inputs = self.tokenizer.encode_plus(
	            title,
	            add_special_tokens=True,
	            max_length=self.max_len,
	            # Replaces the deprecated pad_to_max_length=True.
	            padding="max_length",
	            return_token_type_ids=True,
	            truncation=True,
	        )

	        return {
	            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
	            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
	        }

	    def __len__(self):
	        """Return the number of rows captured at construction time."""
	        return self.len

	class BERTClass(torch.nn.Module):
	    """Transformer encoder followed by a two-layer classification head.

	    The encoder is ``roberta-base`` for the sustainability task and
	    ``ProsusAI/finbert`` for every other task value. The head maps the
	    768-wide first-token representation to ``num_class`` scores.
	    """

	    def __init__(self, num_class, task):
	        """Build the encoder and the classification head.

	        Args:
	            num_class: Number of output classes of the final linear layer.
	            task: Task name. ``"sustanability"`` (historical spelling) or
	                ``"sustainability"`` selects RoBERTa; anything else
	                selects FinBERT.
	        """
	        super().__init__()
	        self.num_class = num_class
	        # Accept the correct spelling too; previously it silently fell
	        # through to the FinBERT branch.
	        if task in ("sustanability", "sustainability"):
	            self.l1 = RobertaModel.from_pretrained("roberta-base")
	        else:
	            self.l1 = BertModel.from_pretrained("ProsusAI/finbert")
	        self.pre_classifier = torch.nn.Linear(768, 768)
	        self.dropout = torch.nn.Dropout(0.3)
	        self.classifier = torch.nn.Linear(768, self.num_class)
	        # Not used inside this class; presumably filled by training code
	        # elsewhere -- confirm before relying on its contents.
	        self.history = dict()

	    def forward(self, input_ids, attention_mask):
	        """Return the classifier output for a batch of encoded sequences.

	        Args:
	            input_ids: Token id tensor passed to the encoder.
	            attention_mask: Attention mask tensor passed to the encoder.

	        Returns:
	            Output of the final linear layer (``num_class`` values per
	            sequence); no softmax is applied.
	        """
	        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
	        # First encoder output (the last hidden state for Hugging Face
	        # BERT/RoBERTa models).
	        hidden_state = output_1[0]
	        # Keep only the representation at the first token position.
	        pooler = hidden_state[:, 0]
	        pooler = self.pre_classifier(pooler)
	        # Functional ReLU avoids building a new module on every call.
	        pooler = torch.nn.functional.relu(pooler)
	        pooler = self.dropout(pooler)
	        output = self.classifier(pooler)
	        return output

	def do_predict(model, tokenizer, test_df):
	    """Score every row of ``test_df`` with ``model``.

	    Args:
	        model: Module called as ``model(input_ids, attention_mask)`` that
	            returns a 2-D tensor of per-class scores with at least two
	            columns. It is switched to eval mode.
	        tokenizer: Tokenizer handed to ``Triage`` to encode the text.
	        test_df: DataFrame whose ``text_col_name`` column holds the text.

	    Returns:
	        A tuple ``(actual_predictions, prob_predictions)``:
	        ``actual_predictions`` is the index of the highest-scoring class
	        per row; ``prob_predictions`` is the model's raw score for class
	        index 1 per row. Despite the name, no softmax is applied here.
	        Both lists are empty when ``test_df`` has no rows.
	    """
	    n_rows = test_df.shape[0]
	    if n_rows == 0:
	        # Nothing to score; torch.cat below would reject an empty list.
	        return ([], [])

	    test_set = Triage(test_df, tokenizer, MAX_LEN, text_col_name)
	    # All rows are tokenized and moved to the device up front, then fed to
	    # the model in BATCH_SIZE slices.
	    out_stack, mask_stack = scoring_data_prep(dataset=test_set)

	    batch_outputs = []
	    model.eval()
	    with torch.no_grad():
	        for start in range(0, n_rows, BATCH_SIZE):
	            stop = start + BATCH_SIZE
	            batch_outputs.append(
	                model(out_stack[start:stop, :], mask_stack[start:stop, :])
	            )

	    combined_output = torch.cat(batch_outputs, dim=0).to('cpu')
	    # Only the top-1 class is needed, so argmax replaces a full argsort.
	    actual_predictions = torch.argmax(combined_output, dim=1).tolist()
	    prob_predictions = combined_output[:, 1].tolist()
	    return (actual_predictions, prob_predictions)