Fix Complexity function
app.py (CHANGED)
@@ -2,7 +2,6 @@ import gradio as gr
 import re
 import json
 import nltk
-import stanza
 from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
 from sentence_transformers import CrossEncoder
 from autocorrect import Speller
@@ -10,33 +9,7 @@ from transformers import BertTokenizer, BertForSequenceClassification
 import torch
 from torch.nn.utils.rnn import pad_sequence
 import numpy as np
-
-
-# ********************* Setting up Stanford CoreNLP *********************
-
-# Download the Stanford CoreNLP package with Stanza's installation command
-# This'll take several minutes, depending on the network speed
-#corenlp_dir = './corenlp'
-#stanza.install_corenlp(dir=corenlp_dir)
-
-# Set the CORENLP_HOME environment variable to point to the installation location
-#import os
-#os.environ["CORENLP_HOME"] = corenlp_dir
-
-# Construct a CoreNLPClient with some basic annotators, a memory allocation of 4GB, and port number 9001
-#client = CoreNLPClient(
-#    annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner', 'parse'],
-#    memory='4G',
-#    endpoint='http://localhost:9001',
-#    be_quiet=True)
-#print(client)
-
-# Start the background server and wait for some time
-# Note that in practice this is totally optional, as by default the server will be started when the first annotation is performed
-#client.start()
-#import time; time.sleep(10)
-
-# ************************************************************************
+import spacy
 
 
 # ***************************** TGRL Parsing *****************************
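This hunk drops the stanza import and the commented-out Stanford CoreNLP bootstrap, leaving spaCy as the parser used for clause detection. Below is a minimal sketch of the setup the new code relies on; it is not part of the commit, and the runtime download is an assumption (on a Space the en_core_web_sm model is more typically pinned in requirements.txt).

# Not part of the commit: illustrative spaCy setup the new code assumes.
import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # assumption: downloading the model at startup is acceptable for this Space
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

doc = nlp("The system shall encrypt data and the admin shall rotate keys.")
print([(token.text, token.pos_, token.dep_) for token in doc])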
@@ -169,115 +142,14 @@ def get_long_elements(elements): # Using RegEx
     return "Long elements:\nNone."
 # #####################################
 
-'''
 # ######### Complex Sentences #########
-# Complex sentences
-
-def get_verb_phrases(t):
-    verb_phrases = []
-    num_children = len(t)
-    num_VP = sum(1 if t[i].label() == "VP" else 0 for i in range(0, num_children))
-
-    if t.label() != "VP":
-        for i in range(0, num_children):
-            if t[i].height() > 2:
-                verb_phrases.extend(get_verb_phrases(t[i]))
-    elif t.label() == "VP" and num_VP > 1:
-        for i in range(0, num_children):
-            if t[i].label() == "VP":
-                if t[i].height() > 2:
-                    verb_phrases.extend(get_verb_phrases(t[i]))
-    else:
-        verb_phrases.append(' '.join(t.leaves()))
-
-    return verb_phrases
-
-def get_pos(t):
-    vp_pos = []
-    sub_conj_pos = []
-    num_children = len(t)
-    children = [t[i].label() for i in range(0,num_children)]
-
-    flag = re.search(r"(S|SBAR|SBARQ|SINV|SQ)", ' '.join(children))
-
-    if "VP" in children and not flag:
-        for i in range(0, num_children):
-            if t[i].label() == "VP":
-                vp_pos.append(t[i].treeposition())
-    elif not "VP" in children and not flag:
-        for i in range(0, num_children):
-            if t[i].height() > 2:
-                temp1,temp2 = get_pos(t[i])
-                vp_pos.extend(temp1)
-                sub_conj_pos.extend(temp2)
-    # comment this "else" part, if want to include subordinating conjunctions
-    else:
-        for i in range(0, num_children):
-            if t[i].label() in ["S","SBAR","SBARQ","SINV","SQ"]:
-                temp1, temp2 = get_pos(t[i])
-                vp_pos.extend(temp1)
-                sub_conj_pos.extend(temp2)
-            else:
-                sub_conj_pos.append(t[i].treeposition())
-
-    return (vp_pos,sub_conj_pos)
-
-# get all clauses
-def get_clause_list(sent):
-
-    parser = client.annotate(sent, properties={"annotators":"parse","outputFormat": "json"})
-    sent_tree = nltk.tree.ParentedTree.fromstring(parser["sentences"][0]["parse"])
-    #print(sent_tree)
-    clause_level_list = ["S","SBAR","SBARQ","SINV","SQ"]
-    clause_list = []
-    sub_trees = []
-    #sent_tree.pretty_print()
-
-    # break the tree into subtrees of clauses using
-    # clause levels "S","SBAR","SBARQ","SINV","SQ"
-    for sub_tree in reversed(list(sent_tree.subtrees())):
-        if sub_tree.label() in clause_level_list:
-            if sub_tree.parent().label() in clause_level_list:
-                continue
-
-            if (len(sub_tree) == 1 and sub_tree.label() == "S" and sub_tree[0].label() == "VP"
-                and not sub_tree.parent().label() in clause_level_list):
-                continue
-
-            sub_trees.append(sub_tree)
-            del sent_tree[sub_tree.treeposition()]
-
-    #print(sub_trees)
-
-    # for each clause level subtree, extract relevant simple sentence
-    for t in sub_trees:
-        # get verb phrases from the new modified tree
-        verb_phrases = get_verb_phrases(t)
-        #print(verb_phrases)
-
-        # get tree without verb phrases (mainly subject)
-        # remove subordinating conjunctions
-        vp_pos,sub_conj_pos = get_pos(t)
-        for i in vp_pos:
-            del t[i]
-        for i in sub_conj_pos:
-            del t[i]
-
-        subject_phrase = ' '.join(t.leaves())
-
-        # update the clause_list
-        for i in verb_phrases:
-            clause_list.append(subject_phrase + " " + i)
-
-    return clause_list
-
 def get_complex_sentences(elements):
 
     complex_sentences = []
 
     for key, value in elements.items():
         for i in range(0, len(elements[key])):
-            if len(
+            if len(get_clauses_list(elements[key][i])) > 1:
                 complex_sentences.append(elements[key][i])
 
     if complex_sentences:
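The fix itself: get_complex_sentences now reports an element as complex when get_clauses_list (defined in the next hunk) returns more than one clause. The following self-contained sketch illustrates that flagging rule with a stubbed clause splitter so it runs without spaCy; the elements dict mirrors the dict-of-lists shape used in app.py.

# Illustrative only: the "> 1 clause" rule from get_complex_sentences,
# with a stand-in for the real spaCy-based get_clauses_list.
def fake_get_clauses_list(sentence):
    return [part.strip() for part in sentence.split(" and ")]

elements = {
    "goals": ["Improve security", "Encrypt data and log every access attempt"],
}

complex_sentences = []
for key, value in elements.items():
    for sentence in value:
        if len(fake_get_clauses_list(sentence)) > 1:  # same rule as the fixed line
            complex_sentences.append(sentence)

print(complex_sentences)  # ['Encrypt data and log every access attempt']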
@@ -285,8 +157,81 @@ def get_complex_sentences(elements):
         return "Complex sentences:\n" + complex_sentences
     else:
         return "Complex sentences:\nNone."
-
-
+
+def find_root_of_sentence(doc):
+    root_token = None
+    for token in doc:
+        if (token.dep_ == "ROOT"):
+            root_token = token
+    return root_token
+
+def find_other_verbs(doc, root_token):
+    other_verbs = []
+    for token in doc:
+        ancestors = list(token.ancestors)
+        if (token.pos_ == "VERB" and len(ancestors) == 1 and ancestors[0] == root_token):
+            other_verbs.append(token)
+    return other_verbs
+
+# find the token spans for each verb
+def get_clause_token_span_for_verb(verb, doc, all_verbs):
+    first_token_index = len(doc)
+    last_token_index = 0
+    this_verb_children = list(verb.children)
+    for child in this_verb_children:
+        if (child not in all_verbs):
+            if (child.i < first_token_index):
+                first_token_index = child.i
+            if (child.i > last_token_index):
+                last_token_index = child.i
+    return(first_token_index, last_token_index)
+
+def get_clauses_list(sent):
+
+    nlp = spacy.load('en_core_web_sm')
+
+    doc = nlp(sent)
+
+    # find part of speech, dependency tag, ancestors, and children of each token
+    for token in doc:
+        ancestors = [t.text for t in token.ancestors]
+        children = [t.text for t in token.children]
+        #print(token.text, "\t", token.i, "\t", token.pos_, "\t", token.dep_, "\t", ancestors, "\t", children)
+
+    # find the root token of the sentence
+    root_token = find_root_of_sentence(doc)
+
+    # find the other verbs
+    other_verbs = find_other_verbs(doc, root_token)
+
+    # put together all the verbs in one array and process each using get_clause_token_span_for_verb function
+    # this will return a tuple of start and end indices for each verb's clause
+    token_spans = []
+    all_verbs = [root_token] + other_verbs
+    for other_verb in all_verbs:
+        (first_token_index, last_token_index) = \
+            get_clause_token_span_for_verb(other_verb,
+                                           doc, all_verbs)
+        token_spans.append((first_token_index,
+                            last_token_index))
+
+    # put together token spans for each clause
+    sentence_clauses = []
+    for token_span in token_spans:
+        start = token_span[0]
+        end = token_span[1]
+        if (start < end):
+            clause = doc[start:end]
+            sentence_clauses.append(clause)
+    sentence_clauses = sorted(sentence_clauses, key=lambda tup: tup[0])
+
+    # get the final result
+    clauses_text = [clause.text for clause in sentence_clauses]
+    #print(clauses_text)
+    return clauses_text
+
+# #####################################
+
 # ########## Punctuations #########
 def get_punctuations(elements):
 
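The replacement clause extractor works on the dependency parse: find the ROOT token, collect verbs whose only ancestor is that root, and use each verb's non-verb children to delimit a clause span. A usage sketch follows, assuming app.py's new helpers are importable and en_core_web_sm is installed; the import path and the expected output are illustrative only.

# Usage sketch (hypothetical import of this Space's module; output is indicative).
from app import get_clauses_list

element = "The operator approves the request and the system archives the record"
clauses = get_clauses_list(element)
print(clauses)            # expect one span of text per verb, e.g. two clauses here
print(len(clauses) > 1)   # True -> the element would be reported as complex

One possible refinement, not done in this commit: get_clauses_list calls spacy.load on every invocation, so hoisting the model load to module level would avoid reloading it for every element.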
@@ -574,8 +519,8 @@ def identify_bad_smells(tgrl_file, selected_bad_smells):
     if 'Size' in selected_bad_smells:
         output = output + get_long_elements(elements) + "\n\n"
 
-
-
+    if 'Complexity' in selected_bad_smells:
+        output = output + get_complex_sentences(elements) + "\n\n"
 
     if 'Punctuations' in selected_bad_smells:
         output = output + get_punctuations(elements) + "\n\n"
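The 'Complexity' detector is appended to the output with the same pattern as the other checks. For comparison, here is a dispatch-table version of that if-chain; this is a possible refactoring, not what the commit does, and the lambdas merely stand in for app.py's real detector functions.

# Sketch of an equivalent dispatch-table form of identify_bad_smells' if-chain.
detectors = {
    "Size": lambda elements: "Long elements:\nNone.",
    "Complexity": lambda elements: "Complex sentences:\nNone.",
    "Punctuations": lambda elements: "Punctuations:\nNone.",
}

def run_selected(elements, selected_bad_smells):
    output = ""
    for name in selected_bad_smells:
        if name in detectors:
            output += detectors[name](elements) + "\n\n"
    return output

print(run_selected({}, ["Size", "Complexity"]))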
@@ -610,12 +555,11 @@ def identify_bad_smells(tgrl_file, selected_bad_smells):
 
 interface = gr.Interface(fn = identify_bad_smells,
                          inputs = [gr.File(label="TGRL File"),
-                                   gr.CheckboxGroup(["Size", "Punctuations", "Actors Syntax", "Goals Syntax", "Softgoals Syntax", "Tasks Syntax", "Similar Elements", "Spelling Mistakes", "Goal-Subgoal Mismatch", "Contradicting Elements"],
+                                   gr.CheckboxGroup(["Size", "Complexity", "Punctuations", "Actors Syntax", "Goals Syntax", "Softgoals Syntax", "Tasks Syntax", "Similar Elements", "Spelling Mistakes", "Goal-Subgoal Mismatch", "Contradicting Elements"],
                                    label="Which bad smells you want to detect?")],
                          outputs = ["text"],
                          title = "TGRL Bad Smells Detection",
                          description = "Upload your .xgrl file and we will find the bad smells for you!")
-#"Complexity"
 
-
-
+
+interface.launch(inline = False)
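Finally, "Complexity" is added to the CheckboxGroup choices and an explicit interface.launch(inline = False) call is appended so the app starts when app.py runs as a script. The minimal, self-contained Gradio sketch below shows that the checked labels reach the callback as a list of strings; the echo function and the shortened choice list are illustrative only.

# Minimal Gradio sketch (not part of app.py).
import gradio as gr

def echo_selection(tgrl_file, selected_bad_smells):
    # CheckboxGroup passes the checked labels as a list of strings
    return ", ".join(selected_bad_smells) if selected_bad_smells else "Nothing selected."

demo = gr.Interface(fn=echo_selection,
                    inputs=[gr.File(label="TGRL File"),
                            gr.CheckboxGroup(["Size", "Complexity"],
                                             label="Which bad smells you want to detect?")],
                    outputs=["text"])

if __name__ == "__main__":
    demo.launch(inline=False)  # same launch call the commit appends to app.py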