from nltk.tokenize import sent_tokenize
import pandas as pd

# NOTE: sent_tokenize requires the NLTK 'punkt' model: nltk.download('punkt')

# Read in an essay and return a list of its sentences.
def essay_to_sent(essay):
    sentences = []
    paragraphs = [l for l in essay.split('\n') if len(l) > 0]
    for para in paragraphs:
        # Sentence-tokenize each paragraph and append to sentences[]
        sentences.extend(sent_tokenize(para))
    return sentences

######################
# Prerequisites:
# 1. pip install transformers
# 2. Define tokenizer + MAX_LEN
# 3. Construct the DistillBERTClass_SL class
# 4. Construct the Triage_SL class
# 5. Define the predict_SL function
# 6. Load model_SL & call eval()
# 7. Pre-define predict_params_SL
######################

from transformers import DistilBertTokenizer
from transformers import pipeline

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

# Classify every sentence of an essay as main idea (True) or not (False);
# returns a DataFrame with one row per sentence.
def predict_mainidea_sent(paragraph, model):
    # Prepare data: split the essay into individual sentences.
    sentences = essay_to_sent(paragraph)

    # Run the fine-tuned classifier over the sentences on CPU.
    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device="cpu")
    probability_scores = pipe(sentences, batch_size=8, function_to_apply="sigmoid")

    # Threshold the sigmoid probability at 0.5 to get a boolean label.
    labels = [score['score'] > 0.5 for score in probability_scores]
    return pd.DataFrame([(str(l), s) for l, s in zip(labels, sentences)],
                        columns=['label', 'sentence'])
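
# ---------------------------------------------------------------
# Example usage (a minimal sketch, not part of the original script):
# it assumes a DistilBERT sequence-classification checkpoint has
# already been fine-tuned and saved locally. The './model_SL' path
# and the sample essay below are illustrative assumptions.
# ---------------------------------------------------------------
if __name__ == '__main__':
    from transformers import DistilBertForSequenceClassification

    # Hypothetical path to the fine-tuned main-idea classifier.
    model_SL = DistilBertForSequenceClassification.from_pretrained('./model_SL')
    model_SL.eval()

    essay = (
        "Reading builds vocabulary. Children who read daily tend to "
        "write more fluently.\n"
        "Libraries therefore play a central role in early education."
    )
    df = predict_mainidea_sent(essay, model_SL)
    print(df)
    # Expected shape: one row per sentence, with a 'label' column
    # ('True'/'False') and a 'sentence' column.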