essay-main-idea / main_idea_with_torch.py
yutingg's picture
Predict main idea sentence with custom-distill-bert-for-sentence-label
ecf6936
from nltk.tokenize import sent_tokenize
import pandas as pd
######################
# Prerequisites:
# 1. pip install transformers
# 2. Define tokenizer + MAX_LEN
# 3. Construct DistillBERTClass_SL class
# 4. Construct Triage_SL class
# 5. Define predict_SL function
# 6. Load model_SL & call eval()
# 7. Predefine predict_params_SL
####################
from transformers import DistilBertTokenizer
# Module-level tokenizer shared by the prediction helpers in this file; it is
# created from the 'distilbert-base-cased' checkpoint name.
# NOTE(review): from_pretrained presumably fetches/caches the vocabulary at
# import time -- confirm this side effect is acceptable for importers.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
import torch
"""### DataSet Class -- Triage_SL"""
from torch.utils.data import Dataset, DataLoader
class Triage_SL(Dataset):
    """Sentence-level dataset that tokenizes one sentence per item.

    Args:
        dataframe: Table exposing a ``sentence`` column; one row per sentence.
        tokenizer: Object providing ``encode_plus`` whose result contains
            ``input_ids`` and ``attention_mask``.
        max_len: Length every encoded sentence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer  # used in __getitem__
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the encoded sentence stored at position ``index``.

        Returns:
            A dict with ``ids`` (token ids) and ``mask`` (attention mask),
            both ``torch.long`` tensors.

        Raises:
            IndexError: If ``index`` is outside the dataset. ``IndexError``
                (not ``StopIteration``) is what the sequence protocol expects
                and it still terminates ``for`` loops over the dataset.
        """
        if not -self.len <= index < self.len:
            raise IndexError(
                f"index {index} is out of range for dataset of size {self.len}"
            )

        # Positional lookup: works even when the dataframe index is not the
        # default 0..n-1 range (e.g. after filtering rows).
        sent = str(self.data["sentence"].iloc[index])
        # Collapse runs of whitespace into single spaces.
        sent = " ".join(sent.split())

        # Tokenize, add the special tokens, map tokens to ids, pad/truncate to
        # max_len and build the attention mask in one call.
        inputs = self.tokenizer.encode_plus(
            sent,  # sentence to encode
            None,  # text_pair
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',  # replaces deprecated pad_to_max_length=True
            return_token_type_ids=True,
            truncation=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return self.len
def essay_to_sent_df(essay):
    """Split an essay into sentences and return them as a one-column DataFrame.

    The essay is cut on newline characters, empty lines are dropped, and each
    remaining paragraph is handed to ``sent_tokenize``.

    Args:
        essay: Essay text as a string.

    Returns:
        A ``pandas.DataFrame`` with a single ``sentence`` column holding the
        sentences in their original order.
    """
    non_empty_lines = filter(None, essay.split('\n'))
    sentence_list = [
        sentence
        for paragraph in non_empty_lines
        for sentence in sent_tokenize(paragraph)
    ]
    return pd.DataFrame(sentence_list, columns=['sentence'])
# Token length handed to Triage_SL as max_len when building the prediction set.
MAX_LEN = 512
"""### Predefine predict_params_SL"""
PREDICT_BATCH_SIZE = 1
# Keyword arguments unpacked into the prediction DataLoader.
predict_params_SL = dict(
    batch_size=PREDICT_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)
"""### Predict Fn -- predict_SL"""
# Module-level sigmoid used to turn raw logits into probabilities.
sigmoid = torch.nn.Sigmoid()


def predict_SL(model, validation_loader, threshold=0.5):
    """Run the model over a loader and return one binary label per sentence.

    Args:
        model: Module called as ``model(ids, mask)`` whose result is indexed
            with ``"logits"``. It is switched to eval mode.
            NOTE(review): assumes one logit per input row -- confirm against
            the model definition.
        validation_loader: Iterable of batches; each batch is a mapping with
            ``ids`` and ``mask`` tensors.
        threshold: Probability above which a sentence is labelled ``1.0``.

    Returns:
        A flat list of floats (``0.0`` or ``1.0``), in loader order. Batches
        of any size are supported.
    """
    predictions = []

    # Send inputs to wherever the model lives; fall back to CPU when the
    # model exposes no parameters.
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    with torch.no_grad():
        for batch in validation_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            # Flatten instead of squeeze()+item(): item() only works when the
            # batch holds exactly one element.
            logits = model(ids, mask)["logits"].reshape(-1)
            labels = (sigmoid(logits) > threshold).float()
            predictions.extend(labels.tolist())
    return predictions
def predict_mainidea_sent_old(paragraph, model):
    """Label every sentence of ``paragraph`` with the model's prediction.

    The text is split into sentences, wrapped in a ``Triage_SL`` dataset and
    fed through ``predict_SL`` on the CPU. The raw predictions are also
    printed to stdout.

    Args:
        paragraph: Text to analyse.
        model: Model passed on to ``predict_SL``; it is moved to the CPU.

    Returns:
        A ``pandas.DataFrame`` with columns ``label`` (each prediction
        converted with ``str``) and ``sentence``.
    """
    # Sentence frame -> dataset -> loader.
    sentence_frame = essay_to_sent_df(paragraph)
    loader = DataLoader(
        Triage_SL(sentence_frame, tokenizer, MAX_LEN),
        **predict_params_SL,
    )

    # Inference runs on the CPU.
    model.to('cpu')

    predictions = predict_SL(model, loader)
    print(predictions)

    labelled_rows = list(zip(map(str, predictions), sentence_frame.sentence))
    return pd.DataFrame(labelled_rows, columns=['label', 'sentence'])