yutingg committed
Commit ecf6936
1 Parent(s): 134e623

Predict main idea sentence with custom-distill-bert-for-sentence-label

Files changed (3)
  1. app.py +23 -3
  2. main_idea_with_pipeline.py +39 -0
  3. main_idea_with_torch.py +119 -0
app.py CHANGED
@@ -1,7 +1,27 @@
 import gradio as gr
-
-def greet(name):
-    return "Hello " + name + "!!"
-
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+from transformers import AutoModel, AutoConfig
+from main_idea_with_torch import predict_mainidea_sent_old
+from main_idea_with_pipeline import predict_mainidea_sent
+
+config = AutoConfig.from_pretrained("yutingg/custom-distill-bert-for-sentence-label", trust_remote_code=True)
+model = AutoModel.from_pretrained("yutingg/custom-distill-bert-for-sentence-label", trust_remote_code=True, config=config)
+
+def greet(essay):
+    ret = predict_mainidea_sent(essay, model), predict_mainidea_sent_old(essay, model)
+    return ret
+
+iface = gr.Interface(fn=greet, inputs="text", outputs=[
+    gr.Dataframe(
+        label="pipeline output",
+        headers=['label: is main idea', 'sentence'],
+        datatype=["str", "str"],
+        col_count=(2, "fixed"),
+    ),
+    gr.Dataframe(
+        label="torch output with Triage",
+        headers=['label: is main idea', 'sentence'],
+        datatype=["str", "str"],
+        col_count=(2, "fixed"),
+    )
+])
 iface.launch()
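
Review note: `greet` now returns a tuple of two DataFrames, one per prediction path, which Gradio maps in order onto the two `gr.Dataframe` outputs. A minimal smoke test of the new entry point, assuming the imports and model setup load exactly as in the diff above and that NLTK's `punkt` data is installed (the sample essay is illustrative only):

```python
# Hypothetical smoke test for app.py's greet(); run after the definitions above.
import nltk

nltk.download("punkt")  # sent_tokenize (used by both prediction modules) needs this corpus

essay = "Dogs make loyal pets.\nThey bond closely with their owners."
pipeline_df, torch_df = greet(essay)  # one DataFrame per code path
print(pipeline_df.head())
print(torch_df.head())
```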
main_idea_with_pipeline.py ADDED
@@ -0,0 +1,39 @@
+from nltk.tokenize import sent_tokenize, word_tokenize
+import pandas as pd
+
+
+# read in an essay and return a list of its sentences
+def essay_to_sent(essay):
+    sentences = []
+    paragraphs = [l for l in essay.split('\n') if len(l) > 0]
+    for para in paragraphs:
+        # split each paragraph into sentences and append them to sentences[]
+        sentences.extend(sent_tokenize(para))
+    return sentences
+
+
+######################
+# prerequisite:
+# 1. pip install transformers
+# 2. Define tokenizer + MAX_LEN
+# 3. Construct DistillBERTClass_SL class
+# 4. Construct Triage_SL class
+# 5. Define predict_SL function
+# 6. Load model_SL & call eval()
+# 7. Pre-define predict_params_SL
+####################
+
+from transformers import DistilBertTokenizer
+from transformers import pipeline
+
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+
+
+def predict_mainidea_sent(paragraph, model):
+    # prepare data
+    sentences = essay_to_sent(paragraph)
+
+    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device="cpu")
+    probability_score = pipe(sentences, batch_size=8, function_to_apply="sigmoid")
+    labels = [score['score'] > 0.5 for score in probability_score]
+    return pd.DataFrame([(str(l), s) for l, s in zip(labels, sentences)], columns=['label', 'sentence'])
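
Review note: `predict_mainidea_sent` uses `pipeline(..., function_to_apply="sigmoid")`, so the model is expected to emit one logit per sentence that sigmoid maps to a main-idea probability, with 0.5 as the decision threshold. A standalone usage sketch, assuming the same Hub checkpoint that app.py loads (hypothetical driver, not part of this commit):

```python
# Hypothetical standalone driver for main_idea_with_pipeline.py.
from transformers import AutoConfig, AutoModel
from main_idea_with_pipeline import predict_mainidea_sent

repo = "yutingg/custom-distill-bert-for-sentence-label"
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
model = AutoModel.from_pretrained(repo, trust_remote_code=True, config=config)

df = predict_mainidea_sent("Reading daily builds vocabulary. It is also relaxing.", model)
print(df)  # columns: 'label' (stringified bool), 'sentence'
```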
main_idea_with_torch.py ADDED
@@ -0,0 +1,119 @@
+from nltk.tokenize import sent_tokenize
+import pandas as pd
+
+######################
+# prerequisite:
+# 1. pip install transformers
+# 2. Define tokenizer + MAX_LEN
+# 3. Construct DistillBERTClass_SL class
+# 4. Construct Triage_SL class
+# 5. Define predict_SL function
+# 6. Load model_SL & call eval()
+# 7. Pre-define predict_params_SL
+####################
+
+from transformers import DistilBertTokenizer
+
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+
+
+import torch
+
+"""### DataSet Class -- Triage_SL"""
+
+from torch.utils.data import Dataset, DataLoader
+
+class Triage_SL(Dataset):
+    # initialize with the dataframe, the tokenizer, and the max sentence length
+    def __init__(self, dataframe, tokenizer, max_len):
+        self.len = len(dataframe)
+        self.data = dataframe
+        self.tokenizer = tokenizer  # tokenizer is used in __getitem__
+        self.max_len = max_len
+
+    # __getitem__ loads and returns one sample from the dataset at the given index
+    def __getitem__(self, index):
+        if index >= len(self):
+            raise IndexError
+        # preprocess the sentence into a standardized format: words separated by single spaces
+        sent = str(self.data.sentence[index])
+        sent = " ".join(sent.split())
+        # 1. Split the sentence into tokens.
+        # 2. Add the special [CLS] and [SEP] tokens.
+        # 3. Map the tokens to their IDs.
+        # 4. Pad or truncate all sentences to the same length.
+        # 5. Create the attention masks which explicitly differentiate real tokens from [PAD] tokens.
+        inputs = self.tokenizer.encode_plus(
+            sent,                     # Sentence to encode
+            None,                     # text_pair
+            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
+            max_length=self.max_len,
+            pad_to_max_length=True,   # Pad & truncate all sentences.
+            return_token_type_ids=True,
+            truncation=True
+        )
+        ids = inputs['input_ids']
+        mask = inputs['attention_mask']
+
+        return {
+            'ids': torch.tensor(ids, dtype=torch.long),
+            'mask': torch.tensor(mask, dtype=torch.long),
+            # 'targets': torch.tensor(self.data.ENCODE_LABEL[index], dtype=torch.float),  # sentence label -> y value
+            # 'combined_label': self.data.combined_label[index]
+        }
+    # __len__ returns the number of samples in the dataset
+    def __len__(self):
+        return self.len
+
+
+# read in an essay and return a sentence-level dataframe
+def essay_to_sent_df(essay):
+    sentences = []
+    paragraphs = [l for l in essay.split('\n') if len(l) > 0]
+    for para in paragraphs:
+        # split each paragraph into sentences and append them to sentences[]
+        sentences.extend(sent_tokenize(para))
+    return pd.DataFrame(sentences, columns=['sentence'])
+
+# Define some key variables used at prediction time
+MAX_LEN = 512
+"""### Predefine predict_params_SL"""
+
+PREDICT_BATCH_SIZE = 1
+predict_params_SL = {'batch_size': PREDICT_BATCH_SIZE,
+                     'shuffle': False,
+                     'num_workers': 0
+                     }
+
+"""### Predict Fn -- predict_SL"""
+
+sigmoid = torch.nn.Sigmoid()
+
+def predict_SL(model, validation_loader):
+    epoch_val_outputs = []
+    cpu_device = 'cpu'
+    model.eval()
+    with torch.no_grad():
+        for _, data in enumerate(validation_loader, 0):
+            ids = data['ids'].to(cpu_device, dtype=torch.long)
+            mask = data['mask'].to(cpu_device, dtype=torch.long)
+            outputs = model(ids, mask)["logits"].squeeze()  # squeeze to a scalar logit (batch_size is 1)
+            outputs = (sigmoid(outputs).data > 0.5).float()
+            epoch_val_outputs.append(outputs.item())
+    return epoch_val_outputs
+
+def predict_mainidea_sent_old(paragraph, model):
+    # prepare data
+    sent_df = essay_to_sent_df(paragraph)
+    predicting_SL_set = Triage_SL(sent_df, tokenizer, MAX_LEN)
+    predicting_SL_loader = DataLoader(predicting_SL_set, **predict_params_SL)
+    # load model to device
+    device = 'cpu'
+    model.to(device)
+    # predict + round up to binary labels
+    sent_label = predict_SL(model, predicting_SL_loader)
+    print(sent_label)
+    return pd.DataFrame([(str(l), s) for l, s in zip(sent_label, sent_df.sentence)], columns=['label', 'sentence'])
+
+
+
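
Review note: `predict_SL` ends with `outputs.item()`, which only succeeds when the squeezed output is a single scalar, so correctness silently depends on `PREDICT_BATCH_SIZE = 1` (and on the model emitting one logit per sentence). If a larger batch were ever wanted, a batch-safe variant might look like the sketch below; this is hypothetical and not part of this commit, and it assumes the same `{'ids', 'mask'}` batch dict and `"logits"` output key as the code above:

```python
# Hypothetical batch-safe variant of predict_SL (assumption-laden sketch).
import torch

sigmoid = torch.nn.Sigmoid()

def predict_SL_batched(model, loader):
    outputs_all = []
    model.eval()
    with torch.no_grad():
        for data in loader:
            ids = data['ids'].to('cpu', dtype=torch.long)
            mask = data['mask'].to('cpu', dtype=torch.long)
            logits = model(ids, mask)["logits"].squeeze(-1)  # shape (batch,) instead of scalar
            preds = (sigmoid(logits) > 0.5).float()
            outputs_all.extend(preds.reshape(-1).tolist())   # works for any batch size
    return outputs_all
```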