from nltk.tokenize import sent_tokenize
import pandas as pd

######################
# Prerequisites:
# 1. pip install transformers
# 2. Define tokenizer + MAX_LEN
# 3. Construct the DistillBERTClass_SL class
# 4. Construct the Triage_SL class
# 5. Define the predict_SL function
# 6. Load model_SL & call eval()
# 7. Predefine predict_params_SL
######################

from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

import torch

"""### DataSet Class -- Triage_SL"""

from torch.utils.data import Dataset, DataLoader

class Triage_SL(Dataset):
    # Hold the sentence-level dataframe, the tokenizer, and the max sentence length.
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer  # used in __getitem__
        self.max_len = max_len

    # __getitem__ loads and returns one sample from the dataset at the given index.
    def __getitem__(self, index):
        if index >= len(self):
            raise IndexError  # map-style datasets signal out-of-range with IndexError
        # Preprocess the sentence into a standardized single-spaced format.
        sent = str(self.data.sentence[index])
        sent = " ".join(sent.split())
        # encode_plus will:
        # 1. Split the sentence into tokens.
        # 2. Add the special [CLS] and [SEP] tokens.
        # 3. Map the tokens to their IDs.
        # 4. Pad or truncate the sentence to max_len.
        # 5. Create the attention mask, which differentiates real tokens from [PAD] tokens.
        inputs = self.tokenizer.encode_plus(
            sent,                       # sentence to encode
            None,                       # no text_pair
            add_special_tokens=True,    # add '[CLS]' and '[SEP]'
            max_length=self.max_len,
            padding='max_length',       # pad all sentences to max_len
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            # 'targets': torch.tensor(self.data.ENCODE_LABEL[index], dtype=torch.float),  # sentence label -> y value
            # 'combined_label': self.data.combined_label[index]
        }

    # __len__ returns the number of samples in the dataset.
    def __len__(self):
        return self.len

# Read in an essay and return a sentence-level dataframe.
def essay_to_sent_df(essay):
    sentences = []
    paragraphs = [l for l in essay.split('\n') if len(l) > 0]
    for para in paragraphs:
        # split each paragraph into sentences and collect them
        sentences.extend(sent_tokenize(para))
    return pd.DataFrame(sentences, columns=['sentence'])

# Defining some key variables used later on.
MAX_LEN = 512

"""### Predefine predict_params_SL"""

PREDICT_BATCH_SIZE = 1
predict_params_SL = {'batch_size': PREDICT_BATCH_SIZE,
                     'shuffle': False,
                     'num_workers': 0
                     }

"""### Predict Fn -- predict_SL"""

sigmoid = torch.nn.Sigmoid()

def predict_SL(model, validation_loader):
    epoch_val_outputs = []
    cpu_device = 'cpu'
    model.eval()
    with torch.no_grad():
        for _, data in enumerate(validation_loader, 0):
            ids = data['ids'].to(cpu_device, dtype=torch.long)
            mask = data['mask'].to(cpu_device, dtype=torch.long)
            # squeeze drops the batch dimension (batch_size is 1), leaving a scalar logit
            outputs = model(ids, mask)["logits"].squeeze()
            # threshold the sigmoid probability at 0.5 to get a hard 0/1 label
            outputs = (sigmoid(outputs) > 0.5).float()
            epoch_val_outputs.append(outputs.item())
    return epoch_val_outputs
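"""### Model Class -- DistillBERTClass_SL (sketch)"""

# Prerequisite 3 above assumes DistillBERTClass_SL is defined elsewhere in the
# notebook. The sketch below is a hypothetical reconstruction, not the original
# class: it only mirrors what predict_SL relies on, i.e. model(ids, mask)
# returning a dict with a "logits" entry holding one score per sentence.
from transformers import DistilBertModel

class DistillBERTClass_SL(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained('distilbert-base-cased')
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)  # single logit: main idea vs. not

    def forward(self, input_ids, attention_mask):
        # last hidden states, shape (batch, seq_len, 768)
        hidden_state = self.l1(input_ids=input_ids, attention_mask=attention_mask)[0]
        pooled = hidden_state[:, 0]  # embedding at the [CLS] position
        pooled = torch.relu(self.pre_classifier(pooled))
        logits = self.classifier(self.dropout(pooled))
        return {"logits": logits}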
def predict_mainidea_sent_old(paragraph, model):
    # prepare data: split the essay into sentences and wrap them in a DataLoader
    sent_df = essay_to_sent_df(paragraph)
    predicting_SL_set = Triage_SL(sent_df, tokenizer, MAX_LEN)
    predicting_SL_loader = DataLoader(predicting_SL_set, **predict_params_SL)

    # load model to device
    device = 'cpu'
    model.to(device)

    # predict a 0/1 label for each sentence
    sent_label = predict_SL(model, predicting_SL_loader)
    print(sent_label)
    return pd.DataFrame([(str(l), s) for l, s in zip(sent_label, sent_df.sentence)],
                        columns=['label', 'sentence'])
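"""### Example usage (sketch)"""

# A hypothetical end-to-end call, assuming a trained checkpoint exists at
# MODEL_SL_PATH (a placeholder name; prerequisite 6 only says to load model_SL
# and call eval()). sent_tokenize also needs the NLTK 'punkt' models, which
# can be fetched once with: import nltk; nltk.download('punkt')
MODEL_SL_PATH = 'model_SL.pt'  # placeholder path, not from the original notebook

model_SL = DistillBERTClass_SL()
model_SL.load_state_dict(torch.load(MODEL_SL_PATH, map_location='cpu'))
model_SL.eval()

essay = "The main idea is stated here. A supporting detail follows.\nA second paragraph adds more."
print(predict_mainidea_sent_old(essay, model_SL))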