Fix Complexity function
app.py (CHANGED)
@@ -2,7 +2,6 @@ import gradio as gr
 import re
 import json
 import nltk
-import stanza
 from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
 from sentence_transformers import CrossEncoder
 from autocorrect import Speller
@@ -10,33 +9,7 @@ from transformers import BertTokenizer, BertForSequenceClassification
 import torch
 from torch.nn.utils.rnn import pad_sequence
 import numpy as np
-
-
-# ********************* Setting up Stanford CoreNLP *********************
-
-# Download the Stanford CoreNLP package with Stanza's installation command
-# This'll take several minutes, depending on the network speed
-#corenlp_dir = './corenlp'
-#stanza.install_corenlp(dir=corenlp_dir)
-
-# Set the CORENLP_HOME environment variable to point to the installation location
-#import os
-#os.environ["CORENLP_HOME"] = corenlp_dir
-
-# Construct a CoreNLPClient with some basic annotators, a memory allocation of 4GB, and port number 9001
-#client = CoreNLPClient(
-#    annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner', 'parse'],
-#    memory='4G',
-#    endpoint='http://localhost:9001',
-#    be_quiet=True)
-#print(client)
-
-# Start the background server and wait for some time
-# Note that in practice this is totally optional, as by default the server will be started when the first annotation is performed
-#client.start()
-#import time; time.sleep(10)
-
-# ************************************************************************
+import spacy
 
 
 # ***************************** TGRL Parsing *****************************
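This hunk drops the stanza import and the commented-out Stanford CoreNLP bootstrap, leaving spaCy as the parser used for clause detection. Below is a minimal sketch of the setup the new code relies on; it is not part of the commit, and the runtime download is an assumption (on a Space the en_core_web_sm model is more typically pinned in requirements.txt).

# Not part of the commit: illustrative spaCy setup the new code assumes.
import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # assumption: downloading the model at startup is acceptable for this Space
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

doc = nlp("The system shall encrypt data and the admin shall rotate keys.")
print([(token.text, token.pos_, token.dep_) for token in doc])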
@@ -169,115 +142,14 @@ def get_long_elements(elements): # Using RegEx
     return "Long elements:\nNone."
 # #####################################
 
-'''
 # ######### Complex Sentences #########
-# Complex sentences
-
-def get_verb_phrases(t):
-    verb_phrases = []
-    num_children = len(t)
-    num_VP = sum(1 if t[i].label() == "VP" else 0 for i in range(0, num_children))
-
-    if t.label() != "VP":
-        for i in range(0, num_children):
-            if t[i].height() > 2:
-                verb_phrases.extend(get_verb_phrases(t[i]))
-    elif t.label() == "VP" and num_VP > 1:
-        for i in range(0, num_children):
-            if t[i].label() == "VP":
-                if t[i].height() > 2:
-                    verb_phrases.extend(get_verb_phrases(t[i]))
-    else:
-        verb_phrases.append(' '.join(t.leaves()))
-
-    return verb_phrases
-
-def get_pos(t):
-    vp_pos = []
-    sub_conj_pos = []
-    num_children = len(t)
-    children = [t[i].label() for i in range(0,num_children)]
-
-    flag = re.search(r"(S|SBAR|SBARQ|SINV|SQ)", ' '.join(children))
-
-    if "VP" in children and not flag:
-        for i in range(0, num_children):
-            if t[i].label() == "VP":
-                vp_pos.append(t[i].treeposition())
-    elif not "VP" in children and not flag:
-        for i in range(0, num_children):
-            if t[i].height() > 2:
-                temp1,temp2 = get_pos(t[i])
-                vp_pos.extend(temp1)
-                sub_conj_pos.extend(temp2)
-    # comment this "else" part, if want to include subordinating conjunctions
-    else:
-        for i in range(0, num_children):
-            if t[i].label() in ["S","SBAR","SBARQ","SINV","SQ"]:
-                temp1, temp2 = get_pos(t[i])
-                vp_pos.extend(temp1)
-                sub_conj_pos.extend(temp2)
-            else:
-                sub_conj_pos.append(t[i].treeposition())
-
-    return (vp_pos,sub_conj_pos)
-
-# get all clauses
-def get_clause_list(sent):
-
-    parser = client.annotate(sent, properties={"annotators":"parse","outputFormat": "json"})
-    sent_tree = nltk.tree.ParentedTree.fromstring(parser["sentences"][0]["parse"])
-    #print(sent_tree)
-    clause_level_list = ["S","SBAR","SBARQ","SINV","SQ"]
-    clause_list = []
-    sub_trees = []
-    #sent_tree.pretty_print()
-
-    # break the tree into subtrees of clauses using
-    # clause levels "S","SBAR","SBARQ","SINV","SQ"
-    for sub_tree in reversed(list(sent_tree.subtrees())):
-        if sub_tree.label() in clause_level_list:
-            if sub_tree.parent().label() in clause_level_list:
-                continue
-
-            if (len(sub_tree) == 1 and sub_tree.label() == "S" and sub_tree[0].label() == "VP"
-                and not sub_tree.parent().label() in clause_level_list):
-                continue
-
-            sub_trees.append(sub_tree)
-            del sent_tree[sub_tree.treeposition()]
-
-    #print(sub_trees)
-
-    # for each clause level subtree, extract relevant simple sentence
-    for t in sub_trees:
-        # get verb phrases from the new modified tree
-        verb_phrases = get_verb_phrases(t)
-        #print(verb_phrases)
-
-        # get tree without verb phrases (mainly subject)
-        # remove subordinating conjunctions
-        vp_pos,sub_conj_pos = get_pos(t)
-        for i in vp_pos:
-            del t[i]
-        for i in sub_conj_pos:
-            del t[i]
-
-        subject_phrase = ' '.join(t.leaves())
-
-        # update the clause_list
-        for i in verb_phrases:
-            clause_list.append(subject_phrase + " " + i)
-
-    return clause_list
-
 def get_complex_sentences(elements):
 
     complex_sentences = []
 
     for key, value in elements.items():
         for i in range(0, len(elements[key])):
-            if len(
+            if len(get_clauses_list(elements[key][i])) > 1:
                 complex_sentences.append(elements[key][i])
 
     if complex_sentences:
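The fix itself: get_complex_sentences now reports an element as complex when get_clauses_list (defined in the next hunk) returns more than one clause. The following self-contained sketch illustrates that flagging rule with a stubbed clause splitter so it runs without spaCy; the elements dict mirrors the dict-of-lists shape used in app.py.

# Illustrative only: the "> 1 clause" rule from get_complex_sentences,
# with a stand-in for the real spaCy-based get_clauses_list.
def fake_get_clauses_list(sentence):
    return [part.strip() for part in sentence.split(" and ")]

elements = {
    "goals": ["Improve security", "Encrypt data and log every access attempt"],
}

complex_sentences = []
for key, value in elements.items():
    for sentence in value:
        if len(fake_get_clauses_list(sentence)) > 1:  # same rule as the fixed line
            complex_sentences.append(sentence)

print(complex_sentences)  # ['Encrypt data and log every access attempt']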
@@ -285,8 +157,81 @@ def get_complex_sentences(elements):
         return "Complex sentences:\n" + complex_sentences
     else:
         return "Complex sentences:\nNone."
-
-
+
+def find_root_of_sentence(doc):
+    root_token = None
+    for token in doc:
+        if (token.dep_ == "ROOT"):
+            root_token = token
+    return root_token
+
+def find_other_verbs(doc, root_token):
+    other_verbs = []
+    for token in doc:
+        ancestors = list(token.ancestors)
+        if (token.pos_ == "VERB" and len(ancestors) == 1 and ancestors[0] == root_token):
+            other_verbs.append(token)
+    return other_verbs
+
+# find the token spans for each verb
+def get_clause_token_span_for_verb(verb, doc, all_verbs):
+    first_token_index = len(doc)
+    last_token_index = 0
+    this_verb_children = list(verb.children)
+    for child in this_verb_children:
+        if (child not in all_verbs):
+            if (child.i < first_token_index):
+                first_token_index = child.i
+            if (child.i > last_token_index):
+                last_token_index = child.i
+    return(first_token_index, last_token_index)
+
+def get_clauses_list(sent):
+
+    nlp = spacy.load('en_core_web_sm')
+
+    doc = nlp(sent)
+
+    # find part of speech, dependency tag, ancestors, and children of each token
+    for token in doc:
+        ancestors = [t.text for t in token.ancestors]
+        children = [t.text for t in token.children]
+        #print(token.text, "\t", token.i, "\t", token.pos_, "\t", token.dep_, "\t", ancestors, "\t", children)
+
+    # find the root token of the sentence
+    root_token = find_root_of_sentence(doc)
+
+    # find the other verbs
+    other_verbs = find_other_verbs(doc, root_token)
+
+    # put together all the verbs in one array and process each using get_clause_token_span_for_verb function
+    # this will return a tuple of start and end indices for each verb's clause
+    token_spans = []
+    all_verbs = [root_token] + other_verbs
+    for other_verb in all_verbs:
+        (first_token_index, last_token_index) = \
+            get_clause_token_span_for_verb(other_verb,
+                                           doc, all_verbs)
+        token_spans.append((first_token_index,
+                            last_token_index))
+
+    # put together token spans for each clause
+    sentence_clauses = []
+    for token_span in token_spans:
+        start = token_span[0]
+        end = token_span[1]
+        if (start < end):
+            clause = doc[start:end]
+            sentence_clauses.append(clause)
+    sentence_clauses = sorted(sentence_clauses, key=lambda tup: tup[0])
+
+    # get the final result
+    clauses_text = [clause.text for clause in sentence_clauses]
+    #print(clauses_text)
+    return clauses_text
+
+# #####################################
+
 # ########## Punctuations #########
 def get_punctuations(elements):
 
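The replacement clause extractor works on the dependency parse: find the ROOT token, collect verbs whose only ancestor is that root, and use each verb's non-verb children to delimit a clause span. A usage sketch follows, assuming app.py's new helpers are importable and en_core_web_sm is installed; the import path and the expected output are illustrative only.

# Usage sketch (hypothetical import of this Space's module; output is indicative).
from app import get_clauses_list

element = "The operator approves the request and the system archives the record"
clauses = get_clauses_list(element)
print(clauses)            # expect one span of text per verb, e.g. two clauses here
print(len(clauses) > 1)   # True -> the element would be reported as complex

One possible refinement, not done in this commit: get_clauses_list calls spacy.load on every invocation, so hoisting the model load to module level would avoid reloading it for every element.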
@@ -574,8 +519,8 @@ def identify_bad_smells(tgrl_file, selected_bad_smells):
     if 'Size' in selected_bad_smells:
         output = output + get_long_elements(elements) + "\n\n"
 
-
-
+    if 'Complexity' in selected_bad_smells:
+        output = output + get_complex_sentences(elements) + "\n\n"
 
     if 'Punctuations' in selected_bad_smells:
         output = output + get_punctuations(elements) + "\n\n"
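The 'Complexity' detector is appended to the output with the same pattern as the other checks. For comparison, here is a dispatch-table version of that if-chain; this is a possible refactoring, not what the commit does, and the lambdas merely stand in for app.py's real detector functions.

# Sketch of an equivalent dispatch-table form of identify_bad_smells' if-chain.
detectors = {
    "Size": lambda elements: "Long elements:\nNone.",
    "Complexity": lambda elements: "Complex sentences:\nNone.",
    "Punctuations": lambda elements: "Punctuations:\nNone.",
}

def run_selected(elements, selected_bad_smells):
    output = ""
    for name in selected_bad_smells:
        if name in detectors:
            output += detectors[name](elements) + "\n\n"
    return output

print(run_selected({}, ["Size", "Complexity"]))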
@@ -610,12 +555,11 @@ def identify_bad_smells(tgrl_file, selected_bad_smells):
 
 interface = gr.Interface(fn = identify_bad_smells,
                          inputs = [gr.File(label="TGRL File"),
-                                   gr.CheckboxGroup(["Size", "Punctuations", "Actors Syntax", "Goals Syntax", "Softgoals Syntax", "Tasks Syntax", "Similar Elements", "Spelling Mistakes", "Goal-Subgoal Mismatch", "Contradicting Elements"],
+                                   gr.CheckboxGroup(["Size", "Complexity", "Punctuations", "Actors Syntax", "Goals Syntax", "Softgoals Syntax", "Tasks Syntax", "Similar Elements", "Spelling Mistakes", "Goal-Subgoal Mismatch", "Contradicting Elements"],
                                    label="Which bad smells you want to detect?")],
                          outputs = ["text"],
                          title = "TGRL Bad Smells Detection",
                          description = "Upload your .xgrl file and we will find the bad smells for you!")
-#"Complexity"
 
-
-
+
+interface.launch(inline = False)
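Finally, "Complexity" is added to the CheckboxGroup choices and an explicit interface.launch(inline = False) call is appended so the app starts when app.py runs as a script. The minimal, self-contained Gradio sketch below shows that the checked labels reach the callback as a list of strings; the echo function and the shortened choice list are illustrative only.

# Minimal Gradio sketch (not part of app.py).
import gradio as gr

def echo_selection(tgrl_file, selected_bad_smells):
    # CheckboxGroup passes the checked labels as a list of strings
    return ", ".join(selected_bad_smells) if selected_bad_smells else "Nothing selected."

demo = gr.Interface(fn=echo_selection,
                    inputs=[gr.File(label="TGRL File"),
                            gr.CheckboxGroup(["Size", "Complexity"],
                                             label="Which bad smells you want to detect?")],
                    outputs=["text"])

if __name__ == "__main__":
    demo.launch(inline=False)  # same launch call the commit appends to app.py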