Pragformer committed
Commit 094450f
1 Parent(s): a7a7a33

Update app.py
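This commit adds a LIME-based explanation of the classifier's decision and gates submission on the detected source language. A new predictor function exposes batched class probabilities, and lime_explain renders a LimeTextExplainer feature-weight plot in a gr.Plot whenever a work-sharing construct is predicted; activate_c and activate_button toggle the Submit button and a red error message according to DeepSCC's language prediction, so that only C-like inputs are classified.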

Files changed (1)
  1. app.py +120 -48
app.py CHANGED
@@ -1,9 +1,13 @@
 import gradio as gr
-import transformers
 from simpletransformers.classification import ClassificationModel, ClassificationArgs
+from lime.lime_text import LimeTextExplainer
 import torch
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+import re
+import transformers
 import json
 
+
 # load all models
 deep_scc_model_args = ClassificationArgs(num_train_epochs=10,max_seq_length=300,use_multiprocessing=False)
 deep_scc_model = ClassificationModel("roberta", "NTUYG/DeepSCC-RoBERTa", num_labels=19, args=deep_scc_model_args, use_cuda=False)
@@ -18,71 +22,127 @@ with_omp_str = 'Should contain a parallel work-sharing loop construct'
 without_omp_str = 'Should not contain a parallel work-sharing loop construct'
 name_file = ['bash', 'c', 'c#', 'c++','css', 'haskell', 'java', 'javascript', 'lua', 'objective-c', 'perl', 'php', 'python','r','ruby', 'scala', 'sql', 'swift', 'vb.net']
 
-
 tokenizer = transformers.AutoTokenizer.from_pretrained('NTUYG/DeepSCC-RoBERTa')
 
 with open('c_data.json', 'r') as f:
     data = json.load(f)
 
 def fill_code(code_pth):
-    pragma = data[code_pth]['pragma']
-    code = data[code_pth]['code']
-    return 'None' if len(pragma)==0 else pragma, code
-
+    pragma = data[code_pth]['pragma']
+    code = data[code_pth]['code']
+    return 'None' if len(pragma)==0 else pragma, code
+
 
 def predict(code_txt):
-    code = code_txt.lstrip().rstrip()
-    tokenized = tokenizer.batch_encode_plus(
-        [code],
-        max_length = 150,
-        pad_to_max_length = True,
-        truncation = True
-    )
-    pred = pragformer(torch.tensor(tokenized['input_ids']), torch.tensor(tokenized['attention_mask']))
+    code = code_txt.lstrip().rstrip()
+    tokenized = tokenizer.batch_encode_plus(
+        [code],
+        max_length = 150,
+        pad_to_max_length = True,
+        truncation = True
+    )
+    pred = pragformer(torch.tensor(tokenized['input_ids']), torch.tensor(tokenized['attention_mask']))
 
-    y_hat = torch.argmax(pred).item()
-    return with_omp_str if y_hat==1 else without_omp_str, torch.nn.Softmax(dim=1)(pred).squeeze()[y_hat].item()
+    y_hat = torch.argmax(pred).item()
+    return with_omp_str if y_hat==1 else without_omp_str, torch.nn.Softmax(dim=1)(pred).squeeze()[y_hat].item()
 
 
 def is_private(code_txt):
-    if predict(code_txt)[0] == without_omp_str:
-        return gr.update(visible=False)
+    if predict(code_txt)[0] == without_omp_str:
+        return gr.update(visible=False)
 
-    code = code_txt.lstrip().rstrip()
-    tokenized = tokenizer.batch_encode_plus(
-        [code],
-        max_length = 150,
-        pad_to_max_length = True,
-        truncation = True
-    )
-    pred = pragformer_private(torch.tensor(tokenized['input_ids']), torch.tensor(tokenized['attention_mask']))
+    code = code_txt.lstrip().rstrip()
+    tokenized = tokenizer.batch_encode_plus(
+        [code],
+        max_length = 150,
+        pad_to_max_length = True,
+        truncation = True
+    )
+    pred = pragformer_private(torch.tensor(tokenized['input_ids']), torch.tensor(tokenized['attention_mask']))
 
-    y_hat = torch.argmax(pred).item()
-    # if y_hat == 0:
-    #     return gr.update(visible=False)
-    # else:
-    return gr.update(value=f"Should {'not' if y_hat==0 else ''} contain private with confidence: {torch.nn.Softmax(dim=1)(pred).squeeze()[y_hat].item()}", visible=True)
+    y_hat = torch.argmax(pred).item()
+    # if y_hat == 0:
+    #     return gr.update(visible=False)
+    # else:
+    return gr.update(value=f"Should {'not' if y_hat==0 else ''} contain private with confidence: {torch.nn.Softmax(dim=1)(pred).squeeze()[y_hat].item()}", visible=True)
 
 
 def is_reduction(code_txt):
-    if predict(code_txt)[0] == without_omp_str:
-        return gr.update(visible=False)
-
-    code = code_txt.lstrip().rstrip()
+    if predict(code_txt)[0] == without_omp_str:
+        return gr.update(visible=False)
+
+    code = code_txt.lstrip().rstrip()
+    tokenized = tokenizer.batch_encode_plus(
+        [code],
+        max_length = 150,
+        pad_to_max_length = True,
+        truncation = True
+    )
+    pred = pragformer_reduction(torch.tensor(tokenized['input_ids']), torch.tensor(tokenized['attention_mask']))
+
+    y_hat = torch.argmax(pred).item()
+    # if y_hat == 0:
+    #     return gr.update(visible=False)
+    # else:
+    return gr.update(value=f"Should {'not' if y_hat==0 else ''} contain reduction with confidence: {torch.nn.Softmax(dim=1)(pred).squeeze()[y_hat].item()}", visible=True)
+
+
+def predictor(texts):
     tokenized = tokenizer.batch_encode_plus(
-        [code],
-        max_length = 150,
-        pad_to_max_length = True,
-        truncation = True
-    )
-    pred = pragformer_reduction(torch.tensor(tokenized['input_ids']), torch.tensor(tokenized['attention_mask']))
+        texts,
+        max_length = 150,
+        pad_to_max_length = True,
+        truncation = True
+    )
+    test_seq = torch.tensor(tokenized['input_ids'])
+    test_mask = torch.tensor(tokenized['attention_mask'])
+    test_y = torch.tensor([1]*len(texts))
+    test_data = TensorDataset(test_seq, test_mask, test_y)
+    test_sampler = SequentialSampler(test_seq)
+    test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = len(texts))
+    total_probas = []
+    for step, batch in enumerate(test_dataloader):
+        sent_id, mask, labels = batch
+        outputs = pragformer(sent_id, mask)
+        probas = outputs.detach().numpy()
+        total_probas.extend(probas)
+
+    return torch.nn.Softmax(dim=1)(torch.tensor(probas)).numpy()
+
+
+def lime_explain(code_txt):
+    class_names = ['Without OpenMP', 'With OpenMP']
+    SAMPLES = 40
+    exp = []
 
-    y_hat = torch.argmax(pred).item()
-    # if y_hat == 0:
-    #     return gr.update(visible=False)
-    # else:
-    return gr.update(value=f"Should {'not' if y_hat==0 else ''} contain reduction with confidence: {torch.nn.Softmax(dim=1)(pred).squeeze()[y_hat].item()}", visible=True)
+    if predict(code_txt)[0] == without_omp_str:
+        return gr.update(visible=False)
 
+    explainer = LimeTextExplainer(class_names=class_names, split_expression=r"\s+")
+    exp = explainer.explain_instance(code_txt, predictor, num_features=20, num_samples=SAMPLES)
+
+    return gr.update(visible=True, value=exp.as_pyplot_figure())
+
+
+def activate_c(lang_pred):
+    langs = lang_pred.split('\n')
+    langs = {lang[5:lang.find(':')]:float(lang[lang.find(':')+1:]) for lang in langs}
+
+    if any([lang in langs for lang in ['c', 'c++', 'c#']]) and any([val > 0.15 for val in langs.values()]):
+        return gr.update(visible=True)
+    else:
+        return gr.update(visible=False)
+
+
+def activate_button(lang_pred):
+    langs = lang_pred.split('\n')
+    langs = {lang[5:lang.find(':')]:float(lang[lang.find(':')+1:]) for lang in langs}
+
+    if any([lang in langs for lang in ['c', 'c++', 'c#']]) and any([val > 0.15 for val in langs.values()]):
+        return gr.update(visible=False)
+    else:
+        return gr.update(visible=True)
+
 
 def lang_predict(code_txt):
     res = {}
@@ -120,9 +180,13 @@ with gr.Blocks() as pragformer_gui:
         pragma = gr.Textbox(label="Original parallelization classification (if any)")
         with gr.Row():
            code_in = gr.Textbox(lines=5, label="Write some C code and see if it should contain a parallel work-sharing loop construct")
-            lang_pred = gr.Textbox(lines=5, label="DeepSCC programming language prediction")
+            lang_pred = gr.Textbox(lines=5, label="DeepScc programming language prediction (only codes written in a C-like syntax will be executed)")
 
         submit_btn = gr.Button("Submit")
+        err_msg = gr.Markdown("""
+        <div style='text-align: center;''>
+        <span style='color:red'>According to the DeepSCC prediction, the code language is not of a C-like syntax</span>
+        </div>""", visible=False)
     with gr.Column():
         gr.Markdown("## Results")
 
@@ -134,11 +198,17 @@ with gr.Blocks() as pragformer_gui:
         private = gr.Textbox(label="Data-sharing attribute clause- private", visible=False)
         reduction = gr.Textbox(label="Data-sharing attribute clause- reduction", visible=False)
 
-    code_in.change(fn=lang_predict, inputs=code_in, outputs=lang_pred)
+        explanation = gr.Plot(visible=False)
+
+
+    code_in.change(fn=lang_predict, inputs=code_in, outputs=[lang_pred])
+    lang_pred.change(fn=activate_c, inputs=lang_pred, outputs=submit_btn)
+    lang_pred.change(fn=activate_button, inputs=lang_pred, outputs=err_msg)
 
     submit_btn.click(fn=predict, inputs=code_in, outputs=[label_out, confidence_out])
     submit_btn.click(fn=is_private, inputs=code_in, outputs=private)
     submit_btn.click(fn=is_reduction, inputs=code_in, outputs=reduction)
+    submit_btn.click(fn=lime_explain, inputs=code_in, outputs=explanation)
     sample_btn.click(fn=fill_code, inputs=drop, outputs=[pragma, code_in])
 
     gr.Markdown(
@@ -179,5 +249,7 @@ with gr.Blocks() as pragformer_gui:
     """)
 
 
+
+
 pragformer_gui.launch()
 
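A note on the LIME wiring above: LimeTextExplainer is model-agnostic, and the only contract the new predictor function has to satisfy is mapping a list of perturbed texts to an array of shape (len(texts), n_classes) of class probabilities. Below is a minimal self-contained sketch of that contract with a stand-in scorer in place of the real PragFormer model; score_texts and its fake probabilities are illustrative only and are not part of this commit.

import numpy as np
from lime.lime_text import LimeTextExplainer

class_names = ['Without OpenMP', 'With OpenMP']

def score_texts(texts):
    # Stand-in for app.py's `predictor`: LIME only requires a function that
    # maps a list of perturbed texts to a (len(texts), n_classes) array of
    # class probabilities. This fake scorer reacts to the token 'for' so the
    # sketch runs without downloading any model.
    p_with = np.array([0.9 if 'for' in t.split() else 0.1 for t in texts])
    return np.column_stack([1.0 - p_with, p_with])

explainer = LimeTextExplainer(class_names=class_names, split_expression=r"\s+")
exp = explainer.explain_instance(
    "for (int i = 0; i < n; ++i) a[i] = b[i] + c[i];",
    score_texts,
    num_features=5,
    num_samples=40,  # app.py likewise uses SAMPLES = 40
)
print(exp.as_list())          # (token, weight) pairs behind the plot
fig = exp.as_pyplot_figure()  # the kind of figure lime_explain hands to gr.Plot

Read against this contract, predictor stacks the logits from a single DataLoader batch (batch_size = len(texts)), so the loop runs exactly once and softmaxing probas rather than the accumulated total_probas happens to yield the same result.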