from nltk.tokenize import sent_tokenize
import pandas as pd
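# NOTE: sent_tokenize relies on the NLTK "punkt" tokenizer data; if it is not
# installed yet, run nltk.download('punkt') once before calling essay_to_sent.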


# read in an essay and return a list of its sentences
def essay_to_sent(essay):
    sentences = []
    paragraphs = [l for l in essay.split('\n') if len(l) > 0]
    for para in paragraphs:
        # split the paragraph into sentences with NLTK and collect them in sentences[]
        sentences.extend(sent_tokenize(para))
    return sentences
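
# Illustrative example of essay_to_sent (the essay text is invented for
# demonstration; sent_tokenize detects sentence boundaries, not just "."):
#
#   essay = "Dogs are loyal. They guard the house.\n\nCats nap all day."
#   essay_to_sent(essay)
#   -> ['Dogs are loyal.', 'They guard the house.', 'Cats nap all day.']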


######################
# prerequisites:
# 1. pip install transformers
# 2. Define tokenizer + MAX_LEN
# 3. Construct DistillBERTClass_SL class
# 4. Construct Triage_SL class
# 5. Define predict__SL class
# 6. Load model_SL & call eval()
# 7. Pre-define predict_params_SL
######################

from transformers import DistilBertTokenizer
from transformers import pipeline

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')


def predict_mainidea_sent(paragraph, model):
    # prepare data: split the paragraph into sentences
    sentences = essay_to_sent(paragraph)

    # score every sentence with the fine-tuned classifier
    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device="cpu")
    probability_scores = pipe(sentences, batch_size=8, function_to_apply="sigmoid")

    # a sentence counts as a main-idea sentence when its sigmoid score exceeds 0.5
    labels = [score['score'] > 0.5 for score in probability_scores]
    return pd.DataFrame([(str(l), s) for l, s in zip(labels, sentences)], columns=['label', 'sentence'])
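

# Minimal usage sketch (assumptions: the fine-tuned weights were exported with
# save_pretrained() so they reload as a standard sequence-classification
# checkpoint; the path "models/mainidea_sl" and the sample paragraph below are
# hypothetical and only illustrate how predict_mainidea_sent is called).
if __name__ == "__main__":
    from transformers import DistilBertForSequenceClassification

    model_SL = DistilBertForSequenceClassification.from_pretrained("models/mainidea_sl")
    model_SL.eval()

    sample_paragraph = (
        "Regular exercise improves concentration. "
        "Students who walk daily report better focus in class."
    )
    print(predict_mainidea_sent(sample_paragraph, model_SL))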