Spaces:

yutingg
/

essay-main-idea

Running

App Files Files Community

essay-main-idea / main_idea_with_torch.py

yutingg

Predict main idea sentence with custom-distill-bert-for-sentence-label

ecf6936 6 months ago

raw history blame contribute delete

No virus

4.31 kB

	from nltk.tokenize import sent_tokenize
	import pandas as pd

	######################
	# prerequisite:
	# 1. pip install transformers
	# 2. Define tokenizer + MAX_LEN
	# 3. Construct DistillBERTClass_SL class
	# 4. Construct Triage_SL class
	# 5. Define predict_SL function
	# 6. Load model_SL & call eval()
	# 7. Pre_define predict_params_SL
	####################

	from transformers import DistilBertTokenizer

	# Module-level tokenizer for the cased DistilBERT checkpoint; it is handed to
	# Triage_SL when a prediction dataset is built from an essay.
	tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')


	import torch

	"""### DataSet Class -- Triage_SL"""

	from torch.utils.data import Dataset, DataLoader

	class Triage_SL(Dataset):
	    """Sentence-level dataset that tokenizes one sentence per item.

	    Each item is a dict with the token ids and attention mask of a single
	    sentence, padded/truncated to a fixed length. No target/label is
	    returned, so the dataset is suitable for prediction only.
	    """

	    def __init__(self, dataframe, tokenizer, max_len):
	        """Store the sentence table, the tokenizer and the fixed length.

	        Args:
	            dataframe: table exposing a ``sentence`` column, one sentence
	                per row.
	            tokenizer: object providing ``encode_plus`` (used in
	                ``__getitem__``).
	            max_len: length every encoded sentence is padded or truncated to.
	        """
	        self.len = len(dataframe)
	        self.data = dataframe
	        self.tokenizer = tokenizer
	        self.max_len = max_len

	    def __getitem__(self, index):
	        """Return the encoded sentence stored at position ``index``.

	        Args:
	            index: zero-based row position of the sentence.

	        Returns:
	            dict with ``'ids'`` (token ids) and ``'mask'`` (attention mask),
	            both ``torch.long`` tensors built from the tokenizer output.

	        Raises:
	            IndexError: if ``index`` is past the end of the dataset.
	        """
	        if index >= len(self):
	            # IndexError is what the sequence protocol expects; a
	            # StopIteration raised here would be turned into a RuntimeError
	            # when it escapes inside a generator (PEP 479).
	            raise IndexError(
	                f"index {index} is out of range for dataset of size {len(self)}"
	            )

	        # Positional lookup, so dataframes with a non-default index
	        # (e.g. filtered or re-indexed) still work.
	        sent = str(self.data.sentence.iloc[index])
	        # Normalize whitespace: collapse runs to single spaces and trim ends.
	        sent = " ".join(sent.split())

	        # The tokenizer splits the sentence into tokens, adds the special
	        # [CLS]/[SEP] tokens, maps tokens to ids, pads or truncates to
	        # max_len and builds the attention mask separating real tokens
	        # from padding.
	        inputs = self.tokenizer.encode_plus(
	            sent,                     # sentence to encode
	            None,                     # text_pair (unused)
	            add_special_tokens=True,  # add '[CLS]' and '[SEP]'
	            max_length=self.max_len,
	            padding='max_length',     # replaces deprecated pad_to_max_length=True
	            return_token_type_ids=True,
	            truncation=True,
	        )

	        return {
	            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
	            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
	        }

	    def __len__(self):
	        """Return the number of sentences in the dataset."""
	        return self.len


	def essay_to_sent_df(essay):
	    """Split an essay into sentences and return them as a dataframe.

	    The essay is first cut into lines on ``'\\n'``; empty lines are dropped.
	    Every remaining line is split into sentences with ``sent_tokenize`` and
	    the sentences are collected in their original order.

	    Args:
	        essay: the essay text as a single string.

	    Returns:
	        A ``pandas.DataFrame`` with one ``sentence`` column, one row per
	        sentence.
	    """
	    non_empty_lines = (line for line in essay.split('\n') if line)
	    all_sentences = [
	        sentence
	        for line in non_empty_lines
	        for sentence in sent_tokenize(line)
	    ]
	    return pd.DataFrame(all_sentences, columns=['sentence'])

	# Key settings used by the prediction code below.
	# MAX_LEN is the sequence length passed to Triage_SL as ``max_len``.
	MAX_LEN = 512
	"""### Predefine predict_params_SL"""

	# Keyword arguments unpacked into DataLoader when predicting:
	# one sentence per batch, original order kept, loading in the main process.
	PREDICT_BATCH_SIZE = 1
	predict_params_SL = dict(
	    batch_size=PREDICT_BATCH_SIZE,
	    shuffle=False,
	    num_workers=0,
	)

	"""### Predict Fn -- predict_SL"""

	# Shared sigmoid module used to turn logits into probabilities.
	sigmoid = torch.nn.Sigmoid()


	def predict_SL(model, validation_loader):
	    """Predict a binary label for every sentence yielded by the loader.

	    The model is switched to eval mode and run on CPU without gradient
	    tracking. Logits are passed through a sigmoid and thresholded at 0.5.

	    Args:
	        model: callable taking ``(ids, mask)`` and returning a mapping with
	            a ``"logits"`` tensor; must also provide ``eval()``.
	            Assumes one logit per sentence -- TODO confirm against the
	            model class, which is defined elsewhere.
	        validation_loader: iterable of batches, each a dict with ``'ids'``
	            and ``'mask'`` tensors.

	    Returns:
	        A flat list of Python floats (``1.0`` or ``0.0``), one per sentence,
	        in loader order.
	    """
	    cpu_device = 'cpu'
	    epoch_val_outputs = []
	    model.eval()
	    with torch.no_grad():
	        for data in validation_loader:
	            ids = data['ids'].to(cpu_device, dtype=torch.long)
	            mask = data['mask'].to(cpu_device, dtype=torch.long)
	            # Flatten to one value per sentence. Unlike squeeze() + item(),
	            # this works for any batch size, not just batch_size == 1.
	            logits = model(ids, mask)["logits"].reshape(-1)
	            labels = (sigmoid(logits) > 0.5).float()
	            epoch_val_outputs.extend(labels.tolist())
	    return epoch_val_outputs

	def predict_mainidea_sent_old(paragraph, model):
	    """Label every sentence of ``paragraph`` with the model's prediction.

	    The text is split into sentences, wrapped in a ``Triage_SL`` dataset and
	    fed through ``predict_SL`` on CPU. The raw label list is also printed.

	    Args:
	        paragraph: text to analyse, as a single string.
	        model: model object passed on to ``predict_SL``; it is moved to CPU.

	    Returns:
	        A ``pandas.DataFrame`` with columns ``label`` (the predicted label
	        converted to ``str``) and ``sentence``, one row per sentence.
	        Presumably a positive label marks a main-idea sentence -- confirm
	        against the model's training setup.
	    """
	    # Build the sentence table and the loader that iterates over it.
	    sentences = essay_to_sent_df(paragraph)
	    dataset = Triage_SL(sentences, tokenizer, MAX_LEN)
	    loader = DataLoader(dataset, **predict_params_SL)

	    # Inference runs on CPU.
	    model.to('cpu')

	    sent_label = predict_SL(model, loader)
	    print(sent_label)

	    rows = [
	        (str(label), sentence)
	        for label, sentence in zip(sent_label, sentences.sentence)
	    ]
	    return pd.DataFrame(rows, columns=['label', 'sentence'])