nouf-sst committed
Commit 44d1a52 · 1 parent: 1e3ed9c

Fix Complexity function

Files changed (1)
  1. app.py +82 -138
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
 import re
 import json
 import nltk
- import stanza
 from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
 from sentence_transformers import CrossEncoder
 from autocorrect import Speller
@@ -10,33 +9,7 @@ from transformers import BertTokenizer, BertForSequenceClassification
 import torch
 from torch.nn.utils.rnn import pad_sequence
 import numpy as np
- from stanza.server import CoreNLPClient
-
- # ********************* Setting up Stanford CoreNLP *********************
-
- # Download the Stanford CoreNLP package with Stanza's installation command
- # This'll take several minutes, depending on the network speed
- #corenlp_dir = './corenlp'
- #stanza.install_corenlp(dir=corenlp_dir)
-
- # Set the CORENLP_HOME environment variable to point to the installation location
- #import os
- #os.environ["CORENLP_HOME"] = corenlp_dir
-
- # Construct a CoreNLPClient with some basic annotators, a memory allocation of 4GB, and port number 9001
- #client = CoreNLPClient(
- #    annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner', 'parse'],
- #    memory='4G',
- #    endpoint='http://localhost:9001',
- #    be_quiet=True)
- #print(client)
-
- # Start the background server and wait for some time
- # Note that in practice this is totally optional, as by default the server will be started when the first annotation is performed
- #client.start()
- #import time; time.sleep(10)
-
- # ************************************************************************


 # ***************************** TGRL Parsing *****************************
@@ -169,115 +142,14 @@ def get_long_elements(elements): # Using RegEx
         return "Long elements:\nNone."
 # #####################################

- '''
 # ######### Complex Sentences #########
- # Complex sentences
-
- def get_verb_phrases(t):
-     verb_phrases = []
-     num_children = len(t)
-     num_VP = sum(1 if t[i].label() == "VP" else 0 for i in range(0, num_children))
-
-     if t.label() != "VP":
-         for i in range(0, num_children):
-             if t[i].height() > 2:
-                 verb_phrases.extend(get_verb_phrases(t[i]))
-     elif t.label() == "VP" and num_VP > 1:
-         for i in range(0, num_children):
-             if t[i].label() == "VP":
-                 if t[i].height() > 2:
-                     verb_phrases.extend(get_verb_phrases(t[i]))
-     else:
-         verb_phrases.append(' '.join(t.leaves()))
-
-     return verb_phrases
-
- def get_pos(t):
-     vp_pos = []
-     sub_conj_pos = []
-     num_children = len(t)
-     children = [t[i].label() for i in range(0,num_children)]
-
-     flag = re.search(r"(S|SBAR|SBARQ|SINV|SQ)", ' '.join(children))
-
-     if "VP" in children and not flag:
-         for i in range(0, num_children):
-             if t[i].label() == "VP":
-                 vp_pos.append(t[i].treeposition())
-     elif not "VP" in children and not flag:
-         for i in range(0, num_children):
-             if t[i].height() > 2:
-                 temp1,temp2 = get_pos(t[i])
-                 vp_pos.extend(temp1)
-                 sub_conj_pos.extend(temp2)
-     # comment this "else" part, if want to include subordinating conjunctions
-     else:
-         for i in range(0, num_children):
-             if t[i].label() in ["S","SBAR","SBARQ","SINV","SQ"]:
-                 temp1, temp2 = get_pos(t[i])
-                 vp_pos.extend(temp1)
-                 sub_conj_pos.extend(temp2)
-             else:
-                 sub_conj_pos.append(t[i].treeposition())
-
-     return (vp_pos,sub_conj_pos)
-
- # get all clauses
- def get_clause_list(sent):
-
-     parser = client.annotate(sent, properties={"annotators":"parse","outputFormat": "json"})
-     sent_tree = nltk.tree.ParentedTree.fromstring(parser["sentences"][0]["parse"])
-     #print(sent_tree)
-     clause_level_list = ["S","SBAR","SBARQ","SINV","SQ"]
-     clause_list = []
-     sub_trees = []
-     #sent_tree.pretty_print()
-
-     # break the tree into subtrees of clauses using
-     # clause levels "S","SBAR","SBARQ","SINV","SQ"
-     for sub_tree in reversed(list(sent_tree.subtrees())):
-         if sub_tree.label() in clause_level_list:
-             if sub_tree.parent().label() in clause_level_list:
-                 continue
-
-             if (len(sub_tree) == 1 and sub_tree.label() == "S" and sub_tree[0].label() == "VP"
-                 and not sub_tree.parent().label() in clause_level_list):
-                 continue
-
-             sub_trees.append(sub_tree)
-             del sent_tree[sub_tree.treeposition()]
-
-     #print(sub_trees)
-
-     # for each clause level subtree, extract relevant simple sentence
-     for t in sub_trees:
-         # get verb phrases from the new modified tree
-         verb_phrases = get_verb_phrases(t)
-         #print(verb_phrases)
-
-         # get tree without verb phrases (mainly subject)
-         # remove subordinating conjunctions
-         vp_pos,sub_conj_pos = get_pos(t)
-         for i in vp_pos:
-             del t[i]
-         for i in sub_conj_pos:
-             del t[i]
-
-         subject_phrase = ' '.join(t.leaves())
-
-         # update the clause_list
-         for i in verb_phrases:
-             clause_list.append(subject_phrase + " " + i)
-
-     return clause_list
-
 def get_complex_sentences(elements):

     complex_sentences = []

     for key, value in elements.items():
         for i in range(0, len(elements[key])):
-             if len(get_clause_list(re.sub(r"(\.|,|\?|\(|\)|\[|\])"," ", elements[key][i]))) > 1:
                 complex_sentences.append(elements[key][i])

     if complex_sentences:
@@ -285,8 +157,81 @@ def get_complex_sentences(elements):
         return "Complex sentences:\n" + complex_sentences
     else:
         return "Complex sentences:\nNone."
- # #################################
- '''
 # ########## Punctuations #########
 def get_punctuations(elements):

@@ -574,8 +519,8 @@ def identify_bad_smells(tgrl_file, selected_bad_smells):
     if 'Size' in selected_bad_smells:
         output = output + get_long_elements(elements) + "\n\n"

-     #if 'Complexity' in selected_bad_smells:
-     #    output = output + get_complex_sentences(elements) + "\n\n"

     if 'Punctuations' in selected_bad_smells:
         output = output + get_punctuations(elements) + "\n\n"
@@ -610,12 +555,11 @@ def identify_bad_smells(tgrl_file, selected_bad_smells):

 interface = gr.Interface(fn = identify_bad_smells,
                          inputs = [gr.File(label="TGRL File"),
-                                    gr.CheckboxGroup(["Size", "Punctuations", "Actors Syntax", "Goals Syntax", "Softgoals Syntax", "Tasks Syntax", "Similar Elements", "Spelling Mistakes", "Goal-Subgoal Mismatch", "Contradicting Elements"],
                                                      label="Which bad smells you want to detect?")],
                          outputs = ["text"],
                          title = "TGRL Bad Smells Detection",
                          description = "Upload your .xgrl file and we will find the bad smells for you!")
- #"Complexity"

- interface.launch(inline = False)
- #interface.launch()
 
 import re
 import json
 import nltk
 from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
 from sentence_transformers import CrossEncoder
 from autocorrect import Speller

 import torch
 from torch.nn.utils.rnn import pad_sequence
 import numpy as np
+ import spacy


 # ***************************** TGRL Parsing *****************************
 
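The new import above swaps spaCy in for the Stanza/CoreNLP client removed in the first hunks. One practical note, hedged because it is not part of this commit: spacy.load('en_core_web_sm') in the clause-extraction code further down only works if the small English model is installed alongside the spaCy package. A minimal guard using spaCy's own download helper could look like this:

import spacy
from spacy.cli import download

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Model not present yet; fetch it once (equivalent to `python -m spacy download en_core_web_sm`).
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")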
         return "Long elements:\nNone."
 # #####################################

 # ######### Complex Sentences #########
 def get_complex_sentences(elements):

     complex_sentences = []

     for key, value in elements.items():
         for i in range(0, len(elements[key])):
+             if len(get_clauses_list(elements[key][i])) > 1:
                 complex_sentences.append(elements[key][i])

     if complex_sentences:

         return "Complex sentences:\n" + complex_sentences
     else:
         return "Complex sentences:\nNone."
+
+ def find_root_of_sentence(doc):
+     root_token = None
+     for token in doc:
+         if (token.dep_ == "ROOT"):
+             root_token = token
+     return root_token
+
+ def find_other_verbs(doc, root_token):
+     other_verbs = []
+     for token in doc:
+         ancestors = list(token.ancestors)
+         if (token.pos_ == "VERB" and len(ancestors) == 1 and ancestors[0] == root_token):
+             other_verbs.append(token)
+     return other_verbs
+
+ # find the token spans for each verb
+ def get_clause_token_span_for_verb(verb, doc, all_verbs):
+     first_token_index = len(doc)
+     last_token_index = 0
+     this_verb_children = list(verb.children)
+     for child in this_verb_children:
+         if (child not in all_verbs):
+             if (child.i < first_token_index):
+                 first_token_index = child.i
+             if (child.i > last_token_index):
+                 last_token_index = child.i
+     return (first_token_index, last_token_index)
+
+ def get_clauses_list(sent):
+
+     nlp = spacy.load('en_core_web_sm')
+
+     doc = nlp(sent)
+
+     # find part of speech, dependency tag, ancestors, and children of each token
+     for token in doc:
+         ancestors = [t.text for t in token.ancestors]
+         children = [t.text for t in token.children]
+         #print(token.text, "\t", token.i, "\t", token.pos_, "\t", token.dep_, "\t", ancestors, "\t", children)
+
+     # find the root token of the sentence
+     root_token = find_root_of_sentence(doc)
+
+     # find the other verbs
+     other_verbs = find_other_verbs(doc, root_token)
+
+     # put together all the verbs in one array and process each using the get_clause_token_span_for_verb function;
+     # this returns a tuple of start and end indices for each verb's clause
+     token_spans = []
+     all_verbs = [root_token] + other_verbs
+     for other_verb in all_verbs:
+         (first_token_index, last_token_index) = \
+             get_clause_token_span_for_verb(other_verb,
+                                            doc, all_verbs)
+         token_spans.append((first_token_index,
+                             last_token_index))
+
+     # put together token spans for each clause
+     sentence_clauses = []
+     for token_span in token_spans:
+         start = token_span[0]
+         end = token_span[1]
+         if (start < end):
+             clause = doc[start:end]
+             sentence_clauses.append(clause)
+     sentence_clauses = sorted(sentence_clauses, key=lambda clause: clause.start)
+
+     # get the final result
+     clauses_text = [clause.text for clause in sentence_clauses]
+     #print(clauses_text)
+     return clauses_text
+
+ # #####################################
+
 # ########## Punctuations #########
 def get_punctuations(elements):
 
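For orientation, a small usage sketch of the helpers added above. The sentence is invented and the exact spans depend on the parse en_core_web_sm produces, so treat the expected result as illustrative rather than guaranteed:

# Assumes get_clauses_list from the added code above is in scope
# and the en_core_web_sm model is installed.
clauses = get_clauses_list("The user uploads a file and the system validates it")
print(len(clauses), clauses)
# The parse has a root verb ("uploads") plus a conjoined verb ("validates"),
# so two clause spans come back; len(clauses) > 1 is what makes
# get_complex_sentences report the element as complex.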
     if 'Size' in selected_bad_smells:
         output = output + get_long_elements(elements) + "\n\n"

+     if 'Complexity' in selected_bad_smells:
+         output = output + get_complex_sentences(elements) + "\n\n"

     if 'Punctuations' in selected_bad_smells:
         output = output + get_punctuations(elements) + "\n\n"
 
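get_complex_sentences receives the same elements mapping as the other detectors. The parser that builds it sits outside this diff, so the shape below is an assumption for illustration only: element type mapped to a list of element-text strings, mirroring the loop the detector runs.

# Hypothetical input, for illustration; the real dict comes from the TGRL parsing section.
elements = {
    "goals": ["Increase revenue", "The user uploads a file and the system validates it"],
    "tasks": ["Send monthly report"],
}

# Same iteration the detector performs: only elements with more than one clause
# would end up under "Complex sentences:" in the output.
for key, texts in elements.items():
    for text in texts:
        if len(get_clauses_list(text)) > 1:
            print(key, "->", text)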

 interface = gr.Interface(fn = identify_bad_smells,
                          inputs = [gr.File(label="TGRL File"),
+                                    gr.CheckboxGroup(["Size", "Complexity", "Punctuations", "Actors Syntax", "Goals Syntax", "Softgoals Syntax", "Tasks Syntax", "Similar Elements", "Spelling Mistakes", "Goal-Subgoal Mismatch", "Contradicting Elements"],
                                                      label="Which bad smells you want to detect?")],
                          outputs = ["text"],
                          title = "TGRL Bad Smells Detection",
                          description = "Upload your .xgrl file and we will find the bad smells for you!")

+
+ interface.launch(inline = False)
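
One closing note on the wiring: gr.CheckboxGroup passes the callback the list of labels the user ticked, which is why membership tests such as 'Complexity' in selected_bad_smells switch individual detectors on and off. A stripped-down sketch with hypothetical names (not the app itself):

import gradio as gr

def identify_bad_smells_demo(tgrl_file, selected_bad_smells):
    # selected_bad_smells arrives as a list of the checked labels, e.g. ["Size", "Complexity"].
    sections = []
    if "Complexity" in selected_bad_smells:
        sections.append("Complex sentences:\nNone.")
    return "\n\n".join(sections) if sections else "Nothing selected."

demo = gr.Interface(fn = identify_bad_smells_demo,
                    inputs = [gr.File(label="TGRL File"),
                              gr.CheckboxGroup(["Size", "Complexity"], label="Which bad smells do you want to detect?")],
                    outputs = ["text"])

demo.launch(inline = False)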