Pragformer committed
Commit 094450f
1 Parent(s): a7a7a33

Update app.py
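This commit adds a LIME-based explanation of the classifier's decision and gates submission on the detected source language. A new predictor function exposes batched class probabilities, and lime_explain renders a LimeTextExplainer feature-weight plot in a gr.Plot whenever a work-sharing construct is predicted; activate_c and activate_button toggle the Submit button and a red error message according to DeepSCC's language prediction, so that only C-like inputs are classified.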

Files changed (1)
  1. app.py +120 -48
app.py CHANGED
@@ -1,9 +1,13 @@
 import gradio as gr
-import transformers
 from simpletransformers.classification import ClassificationModel, ClassificationArgs
+from lime.lime_text import LimeTextExplainer
 import torch
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+import re
+import transformers
 import json
 
+
 # load all models
 deep_scc_model_args = ClassificationArgs(num_train_epochs=10,max_seq_length=300,use_multiprocessing=False)
 deep_scc_model = ClassificationModel("roberta", "NTUYG/DeepSCC-RoBERTa", num_labels=19, args=deep_scc_model_args, use_cuda=False)
@@ -18,71 +22,127 @@ with_omp_str = 'Should contain a parallel work-sharing loop construct'
 without_omp_str = 'Should not contain a parallel work-sharing loop construct'
 name_file = ['bash', 'c', 'c#', 'c++','css', 'haskell', 'java', 'javascript', 'lua', 'objective-c', 'perl', 'php', 'python','r','ruby', 'scala', 'sql', 'swift', 'vb.net']
 
-
 tokenizer = transformers.AutoTokenizer.from_pretrained('NTUYG/DeepSCC-RoBERTa')
 
 with open('c_data.json', 'r') as f:
     data = json.load(f)
 
 def fill_code(code_pth):
-    pragma = data[code_pth]['pragma']
-    code = data[code_pth]['code']
-    return 'None' if len(pragma)==0 else pragma, code
-
+    pragma = data[code_pth]['pragma']
+    code = data[code_pth]['code']
+    return 'None' if len(pragma)==0 else pragma, code
+
 
 def predict(code_txt):
-    code = code_txt.lstrip().rstrip()
-    tokenized = tokenizer.batch_encode_plus(
-        [code],
-        max_length = 150,
-        pad_to_max_length = True,
-        truncation = True
-    )
-    pred = pragformer(torch.tensor(tokenized['input_ids']), torch.tensor(tokenized['attention_mask']))
+    code = code_txt.lstrip().rstrip()
+    tokenized = tokenizer.batch_encode_plus(
+        [code],
+        max_length = 150,
+        pad_to_max_length = True,
+        truncation = True
+    )
+    pred = pragformer(torch.tensor(tokenized['input_ids']), torch.tensor(tokenized['attention_mask']))
 
-    y_hat = torch.argmax(pred).item()
-    return with_omp_str if y_hat==1 else without_omp_str, torch.nn.Softmax(dim=1)(pred).squeeze()[y_hat].item()
+    y_hat = torch.argmax(pred).item()
+    return with_omp_str if y_hat==1 else without_omp_str, torch.nn.Softmax(dim=1)(pred).squeeze()[y_hat].item()
 
 
 def is_private(code_txt):
-    if predict(code_txt)[0] == without_omp_str:
-        return gr.update(visible=False)
+    if predict(code_txt)[0] == without_omp_str:
+        return gr.update(visible=False)
 
-    code = code_txt.lstrip().rstrip()
-    tokenized = tokenizer.batch_encode_plus(
-        [code],
-        max_length = 150,
-        pad_to_max_length = True,
-        truncation = True
-    )
-    pred = pragformer_private(torch.tensor(tokenized['input_ids']), torch.tensor(tokenized['attention_mask']))
+    code = code_txt.lstrip().rstrip()
+    tokenized = tokenizer.batch_encode_plus(
+        [code],
+        max_length = 150,
+        pad_to_max_length = True,
+        truncation = True
+    )
+    pred = pragformer_private(torch.tensor(tokenized['input_ids']), torch.tensor(tokenized['attention_mask']))
 
-    y_hat = torch.argmax(pred).item()
-    # if y_hat == 0:
-    #     return gr.update(visible=False)
-    # else:
-    return gr.update(value=f"Should {'not' if y_hat==0 else ''} contain private with confidence: {torch.nn.Softmax(dim=1)(pred).squeeze()[y_hat].item()}", visible=True)
+    y_hat = torch.argmax(pred).item()
+    # if y_hat == 0:
+    #     return gr.update(visible=False)
+    # else:
+    return gr.update(value=f"Should {'not' if y_hat==0 else ''} contain private with confidence: {torch.nn.Softmax(dim=1)(pred).squeeze()[y_hat].item()}", visible=True)
 
 
 def is_reduction(code_txt):
-    if predict(code_txt)[0] == without_omp_str:
-        return gr.update(visible=False)
-
-    code = code_txt.lstrip().rstrip()
+    if predict(code_txt)[0] == without_omp_str:
+        return gr.update(visible=False)
+
+    code = code_txt.lstrip().rstrip()
+    tokenized = tokenizer.batch_encode_plus(
+        [code],
+        max_length = 150,
+        pad_to_max_length = True,
+        truncation = True
+    )
+    pred = pragformer_reduction(torch.tensor(tokenized['input_ids']), torch.tensor(tokenized['attention_mask']))
+
+    y_hat = torch.argmax(pred).item()
+    # if y_hat == 0:
+    #     return gr.update(visible=False)
+    # else:
+    return gr.update(value=f"Should {'not' if y_hat==0 else ''} contain reduction with confidence: {torch.nn.Softmax(dim=1)(pred).squeeze()[y_hat].item()}", visible=True)
+
+
+def predictor(texts):
     tokenized = tokenizer.batch_encode_plus(
-        [code],
-        max_length = 150,
-        pad_to_max_length = True,
-        truncation = True
-    )
-    pred = pragformer_reduction(torch.tensor(tokenized['input_ids']), torch.tensor(tokenized['attention_mask']))
+        texts,
+        max_length = 150,
+        pad_to_max_length = True,
+        truncation = True
+    )
+    test_seq = torch.tensor(tokenized['input_ids'])
+    test_mask = torch.tensor(tokenized['attention_mask'])
+    test_y = torch.tensor([1]*len(texts))
+    test_data = TensorDataset(test_seq, test_mask, test_y)
+    test_sampler = SequentialSampler(test_seq)
+    test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = len(texts))
+    total_probas = []
+    for step, batch in enumerate(test_dataloader):
+        sent_id, mask, labels = batch
+        outputs = pragformer(sent_id, mask)
+        probas = outputs.detach().numpy()
+        total_probas.extend(probas)
+
+    return torch.nn.Softmax(dim=1)(torch.tensor(probas)).numpy()
+
+
+def lime_explain(code_txt):
+    class_names = ['Without OpenMP', 'With OpenMP']
+    SAMPLES = 40
+    exp = []
 
-    y_hat = torch.argmax(pred).item()
-    # if y_hat == 0:
-    #     return gr.update(visible=False)
-    # else:
-    return gr.update(value=f"Should {'not' if y_hat==0 else ''} contain reduction with confidence: {torch.nn.Softmax(dim=1)(pred).squeeze()[y_hat].item()}", visible=True)
+    if predict(code_txt)[0] == without_omp_str:
+        return gr.update(visible=False)
 
+    explainer = LimeTextExplainer(class_names=class_names, split_expression=r"\s+")
+    exp = explainer.explain_instance(code_txt, predictor, num_features=20, num_samples=SAMPLES)
+
+    return gr.update(visible=True, value=exp.as_pyplot_figure())
+
+
+def activate_c(lang_pred):
+    langs = lang_pred.split('\n')
+    langs = {lang[5:lang.find(':')]:float(lang[lang.find(':')+1:]) for lang in langs}
+
+    if any([lang in langs for lang in ['c', 'c++', 'c#']]) and any([val > 0.15 for val in langs.values()]):
+        return gr.update(visible=True)
+    else:
+        return gr.update(visible=False)
+
+
+def activate_button(lang_pred):
+    langs = lang_pred.split('\n')
+    langs = {lang[5:lang.find(':')]:float(lang[lang.find(':')+1:]) for lang in langs}
+
+    if any([lang in langs for lang in ['c', 'c++', 'c#']]) and any([val > 0.15 for val in langs.values()]):
+        return gr.update(visible=False)
+    else:
+        return gr.update(visible=True)
+
 
 def lang_predict(code_txt):
     res = {}
@@ -120,9 +180,13 @@ with gr.Blocks() as pragformer_gui:
         pragma = gr.Textbox(label="Original parallelization classification (if any)")
         with gr.Row():
            code_in = gr.Textbox(lines=5, label="Write some C code and see if it should contain a parallel work-sharing loop construct")
-            lang_pred = gr.Textbox(lines=5, label="DeepSCC programming language prediction")
+            lang_pred = gr.Textbox(lines=5, label="DeepScc programming language prediction (only codes written in a C-like syntax will be executed)")
 
         submit_btn = gr.Button("Submit")
+        err_msg = gr.Markdown("""
+        <div style='text-align: center;''>
+        <span style='color:red'>According to the DeepSCC prediction, the code language is not of a C-like syntax</span>
+        </div>""", visible=False)
     with gr.Column():
         gr.Markdown("## Results")
 
@@ -134,11 +198,17 @@ with gr.Blocks() as pragformer_gui:
         private = gr.Textbox(label="Data-sharing attribute clause- private", visible=False)
         reduction = gr.Textbox(label="Data-sharing attribute clause- reduction", visible=False)
 
-    code_in.change(fn=lang_predict, inputs=code_in, outputs=lang_pred)
+        explanation = gr.Plot(visible=False)
+
+
+    code_in.change(fn=lang_predict, inputs=code_in, outputs=[lang_pred])
+    lang_pred.change(fn=activate_c, inputs=lang_pred, outputs=submit_btn)
+    lang_pred.change(fn=activate_button, inputs=lang_pred, outputs=err_msg)
 
     submit_btn.click(fn=predict, inputs=code_in, outputs=[label_out, confidence_out])
     submit_btn.click(fn=is_private, inputs=code_in, outputs=private)
     submit_btn.click(fn=is_reduction, inputs=code_in, outputs=reduction)
+    submit_btn.click(fn=lime_explain, inputs=code_in, outputs=explanation)
     sample_btn.click(fn=fill_code, inputs=drop, outputs=[pragma, code_in])
 
     gr.Markdown(
@@ -179,5 +249,7 @@ with gr.Blocks() as pragformer_gui:
     """)
 
 
+
+
 pragformer_gui.launch()
 
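A note on the LIME wiring above: LimeTextExplainer is model-agnostic, and the only contract the new predictor function has to satisfy is mapping a list of perturbed texts to an array of shape (len(texts), n_classes) of class probabilities. Below is a minimal self-contained sketch of that contract with a stand-in scorer in place of the real PragFormer model; score_texts and its fake probabilities are illustrative only and are not part of this commit.

import numpy as np
from lime.lime_text import LimeTextExplainer

class_names = ['Without OpenMP', 'With OpenMP']

def score_texts(texts):
    # Stand-in for app.py's `predictor`: LIME only requires a function that
    # maps a list of perturbed texts to a (len(texts), n_classes) array of
    # class probabilities. This fake scorer reacts to the token 'for' so the
    # sketch runs without downloading any model.
    p_with = np.array([0.9 if 'for' in t.split() else 0.1 for t in texts])
    return np.column_stack([1.0 - p_with, p_with])

explainer = LimeTextExplainer(class_names=class_names, split_expression=r"\s+")
exp = explainer.explain_instance(
    "for (int i = 0; i < n; ++i) a[i] = b[i] + c[i];",
    score_texts,
    num_features=5,
    num_samples=40,  # app.py likewise uses SAMPLES = 40
)
print(exp.as_list())          # (token, weight) pairs behind the plot
fig = exp.as_pyplot_figure()  # the kind of figure lime_explain hands to gr.Plot

Read against this contract, predictor stacks the logits from a single DataLoader batch (batch_size = len(texts)), so the loop runs exactly once and softmaxing probas rather than the accumulated total_probas happens to yield the same result.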