import gradio as gr
import re
import json
import nltk
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
from sentence_transformers import CrossEncoder
from autocorrect import Speller
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import spacy

# ***************************** TGRL Parsing *****************************

def parse_tgrl(file_obj):
    with open(file_obj.name, 'r') as f:
        tgrl_text = f.read()
    # Strip tabs and newlines so the model can be matched as one continuous string
    tgrl_text = tgrl_text.replace('\t', '')
    tgrl_text = tgrl_text.replace('\n', '')
    return tgrl_text

def extract_elements(tgrl_text):

    # Extract actors
    actors = re.findall(r"(?:.*?actor\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s-]*)(?:\")", tgrl_text)
    # Extract goals
    goals = re.findall(r"(?:.*?goal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
    # Extract softGoals
    softGoals = re.findall(r"(?:.*?softGoal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
    # Extract tasks
    tasks = re.findall(r"(?:.*?task\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)
    # Extract resources
    resources = re.findall(r"(?:.*?resource\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s]*)(?:\")", tgrl_text)

    elements = {
        "actors": actors,
        "goals": goals,
        "softGoals": softGoals,
        "tasks": tasks,
        "resources": resources
    }

    # Get elements per actor
    elements_per_actor = {}

    for goal in goals:
        corresponding_actor = tgrl_text.rfind('actor', 0, tgrl_text.index(goal))
        corresponding_actor = re.split(' |{', tgrl_text[corresponding_actor:])[1]
        if corresponding_actor not in elements_per_actor:
            elements_per_actor[corresponding_actor] = []
        elements_per_actor[corresponding_actor].append(goal)

    for softGoal in softGoals:
        corresponding_actor = tgrl_text.rfind('actor', 0, tgrl_text.index(softGoal))
        corresponding_actor = re.split(' |{', tgrl_text[corresponding_actor:])[1]
        if corresponding_actor not in elements_per_actor:
            elements_per_actor[corresponding_actor] = []
        elements_per_actor[corresponding_actor].append(softGoal)

    for task in tasks:
        corresponding_actor = tgrl_text.rfind('actor', 0, tgrl_text.index(task))
        corresponding_actor = re.split(' |{', tgrl_text[corresponding_actor:])[1]
        if corresponding_actor not in elements_per_actor:
            elements_per_actor[corresponding_actor] = []
        elements_per_actor[corresponding_actor].append(task)

    # Get decomposed elements
    new_lines = tgrl_text
    decomposed_elements = {}

    main_elements = re.findall(r"\w+(?=\s+decomposedBy)", new_lines)

    for main_element in main_elements:

        sub_elements = []

        sub_element = re.findall(main_element + r"(?: decomposedBy )([A-Za-z\s]*)", new_lines)[0]
        sub_elements.append(sub_element)
        new_lines = new_lines.replace(sub_element + ', ', '')

        temp = main_element + " decomposedBy "
        for idx, sub_element in enumerate(sub_elements):
            if idx + 1 == len(sub_elements):
                temp = temp + sub_element + ";"
            else:
                temp = temp + sub_element + ", "

        # Keep collecting sub-elements until the reconstructed statement matches the original text
        while temp not in tgrl_text:
            sub_element = re.findall(main_element + r"(?: decomposedBy )([A-Za-z\s]*)", new_lines)[0]
            sub_elements.append(sub_element)
            new_lines = new_lines.replace(sub_element + ', ', '')

            temp = main_element + " decomposedBy "
            for idx, sub_element in enumerate(sub_elements):
                if idx + 1 == len(sub_elements):
                    temp = temp + sub_element + ";"
                else:
                    temp = temp + sub_element + ", "

        decomposed_elements[main_element] = sub_elements

    # Replace element IDs with names
    new_decomposed_elements = {}
    for key, _ in decomposed_elements.items():
        new_key = re.findall("(?:" + key + r" {\s*name\s=\s\")([A-Za-z\s]*)", tgrl_text)[0]
        new_values = []
        for element in decomposed_elements[key]:
            new_value = re.findall("(?:" + element + r" {\s*name\s=\s\")([A-Za-z\s]*)", tgrl_text)[0]
            new_values.append(new_value)
        new_decomposed_elements[new_key] = new_values

    return elements, elements_per_actor, new_decomposed_elements

# ************************************************************************
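# Illustrative sketch (hypothetical input, not from the original file): given a TGRL fragment like
#
#   actor customer { name = "Customer"
#       goal orderProcessed { name = "Order Processed" }
#       task submitOrder { name = "Submit Order" }
#       orderProcessed decomposedBy submitOrder;
#   }
#
# extract_elements would be expected to return roughly:
#   elements                 -> {"actors": ["Customer"], "goals": ["Order Processed"], "tasks": ["Submit Order"], ...}
#   elements_per_actor       -> {"customer": ["Order Processed", "Submit Order"]}
#   new_decomposed_elements  -> {"Order Processed": ["Submit Order"]}
# (exact results depend on the regular expressions above and on the file's formatting).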
# ************************* Bad Smells Detection *************************

# ########### Long Elements ###########
def get_long_elements(elements):  # Using RegEx

    long_elements = []

    for key, value in elements.items():
        for i in range(0, len(elements[key])):
            # An element is considered long if it contains more than four words
            if len(re.findall(r'\w+', elements[key][i])) > 4:
                long_elements.append(elements[key][i])

    if long_elements:
        long_elements = "\n".join(long_elements)
        return "Long elements:\n" + long_elements
    else:
        return "Long elements:\nNone."
# #####################################

# ######### Complex Sentences #########
def get_complex_sentences(elements):

    complex_sentences = []

    for key, value in elements.items():
        for i in range(0, len(elements[key])):
            # An element with more than one clause is considered complex
            if len(get_clauses_list(elements[key][i])) > 1:
                complex_sentences.append(elements[key][i])

    if complex_sentences:
        complex_sentences = "\n".join(complex_sentences)
        return "Complex sentences:\n" + complex_sentences
    else:
        return "Complex sentences:\nNone."

def find_root_of_sentence(doc):
    root_token = None
    for token in doc:
        if token.dep_ == "ROOT":
            root_token = token
    return root_token

def find_other_verbs(doc, root_token):
    other_verbs = []
    for token in doc:
        ancestors = list(token.ancestors)
        if token.pos_ == "VERB" and len(ancestors) == 1 and ancestors[0] == root_token:
            other_verbs.append(token)
    return other_verbs

# Find the token spans for each verb
def get_clause_token_span_for_verb(verb, doc, all_verbs):
    first_token_index = len(doc)
    last_token_index = 0
    this_verb_children = list(verb.children)
    for child in this_verb_children:
        if child not in all_verbs:
            if child.i < first_token_index:
                first_token_index = child.i
            if child.i > last_token_index:
                last_token_index = child.i
    return (first_token_index, last_token_index)

def get_clauses_list(sent):

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(sent)

    # Inspect part of speech, dependency tag, ancestors, and children of each token (debugging aid)
    for token in doc:
        ancestors = [t.text for t in token.ancestors]
        children = [t.text for t in token.children]
        #print(token.text, "\t", token.i, "\t", token.pos_, "\t", token.dep_, "\t", ancestors, "\t", children)

    # Find the root token of the sentence
    root_token = find_root_of_sentence(doc)

    # Find the other verbs
    other_verbs = find_other_verbs(doc, root_token)

    # Put together all the verbs in one array and process each using get_clause_token_span_for_verb;
    # this returns a tuple of start and end indices for each verb's clause
    token_spans = []
    all_verbs = [root_token] + other_verbs
    for other_verb in all_verbs:
        (first_token_index, last_token_index) = get_clause_token_span_for_verb(other_verb, doc, all_verbs)
        token_spans.append((first_token_index, last_token_index))

    # Put together token spans for each clause
    sentence_clauses = []
    for token_span in token_spans:
        start = token_span[0]
        end = token_span[1]
        if start < end:
            clause = doc[start:end]
            sentence_clauses.append(clause)
    sentence_clauses = sorted(sentence_clauses, key=lambda clause: clause.start)

    # Get the final result
    clauses_text = [clause.text for clause in sentence_clauses]
    #print(clauses_text)
    return clauses_text
# #####################################
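# Rough usage sketch (output is illustrative; the actual split depends on the en_core_web_sm parse):
#   get_clauses_list("The user submits the form and the system sends a confirmation")
#   should return two clauses, so get_complex_sentences would flag the element as complex.
# Note: get_clauses_list reloads the spaCy pipeline on every call; caching the loaded model at
# module level would avoid repeated loads when many elements are checked.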
# ########## Punctuations #########
def get_punctuations(elements):

    punctuations = []

    for key, value in elements.items():
        for i in range(0, len(elements[key])):
            # Flag any element containing a character other than whitespace, word characters, or hyphens
            if len(re.findall(r"[^\s\w\d-]", elements[key][i])) > 0:
                punctuations.append(elements[key][i])

    if punctuations:
        punctuations = "\n".join(punctuations)
        return "Punctuations:\n" + punctuations
    else:
        return "Punctuations:\nNone."
# #################################

# ########## Incorrect Actor Syntax ##########
def find_non_NPs(sentences):

    model_name = "QCRI/bert-base-multilingual-cased-pos-english"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer)

    outputs = pipeline(sentences)

    Non_NPs = []
    for idx, output in enumerate(outputs):
        # Keep the sentence if the POS tag of its first token is not a noun tag
        if not output[0]['entity'].startswith('N'):
            Non_NPs.append(sentences[idx])

    return Non_NPs

def check_actor_syntax(actors):
    incorrect_actor_syntax = find_non_NPs(actors)
    if incorrect_actor_syntax:
        incorrect_actor_syntax = "\n".join(incorrect_actor_syntax)
        return "Incorrect Actors Syntax:\n" + incorrect_actor_syntax
    else:
        return "All actors are syntactically correct."
# ############################################

# ########## Incorrect Goal Syntax ###########
def check_goal_syntax(goals):
    incorrect_goal_syntax = find_non_NPs(goals)
    if incorrect_goal_syntax:
        incorrect_goal_syntax = "\n".join(incorrect_goal_syntax)
        return "Incorrect Goals Syntax:\n" + incorrect_goal_syntax
    else:
        return "All goals are syntactically correct."
# ############################################

# ########## Incorrect Softgoal Syntax ###########
def check_softgoal_syntax(softgoals):
    incorrect_softgoal_syntax = find_non_NPs(softgoals)
    if incorrect_softgoal_syntax:
        incorrect_softgoal_syntax = "\n".join(incorrect_softgoal_syntax)
        return "Incorrect Softgoals Syntax:\n" + incorrect_softgoal_syntax
    else:
        return "All softgoals are syntactically correct."
# ############################################

# ########## Incorrect Task Syntax ###########
def find_non_VPs(sentences):

    model_name = "QCRI/bert-base-multilingual-cased-pos-english"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer)

    outputs = pipeline(sentences)

    Non_VPs = []
    for idx, output in enumerate(outputs):
        # Keep the sentence if the POS tag of its first token is not a verb tag
        if not output[0]['entity'].startswith('V'):
            Non_VPs.append(sentences[idx])

    return Non_VPs

def check_task_syntax(tasks):
    incorrect_task_syntax = find_non_VPs(tasks)
    if incorrect_task_syntax:
        incorrect_task_syntax = "\n".join(incorrect_task_syntax)
        return "Incorrect Tasks Syntax:\n" + incorrect_task_syntax
    else:
        return "All tasks are syntactically correct."
# ############################################
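# Illustrative behaviour sketch: the POS pipeline tags every token, but only the tag of the first
# token (output[0]['entity']) decides whether an element counts as a noun/verb phrase. Hypothetically:
#   find_non_NPs(["Customer satisfaction", "Process the order"])  ->  ["Process the order"]
#   find_non_VPs(["Process the order", "Customer satisfaction"])  ->  ["Customer satisfaction"]
# (actual tags come from QCRI/bert-base-multilingual-cased-pos-english and may differ).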
# ########## Similarity ###########
def get_similar_elements(elements_per_actor):

    # Load the pre-trained model
    model = CrossEncoder('cross-encoder/stsb-roberta-base')

    # Prepare sentence pair array
    sentence_pairs = []
    for key, value in elements_per_actor.items():
        for i in range(len(elements_per_actor[key])):
            for j in range(i + 1, len(elements_per_actor[key])):
                sentence_pairs.append([elements_per_actor[key][i], elements_per_actor[key][j]])

    # Predict semantic similarity
    semantic_similarity_scores = model.predict(sentence_pairs, show_progress_bar=True)

    similar_elements = []
    for index, value in enumerate(sentence_pairs):
        if semantic_similarity_scores[index] > 0.5:
            similar_elements.append(value)
            #semantic_similarity["pair_"+str(index+1)] = [value, semantic_similarity_scores[index]]

    if similar_elements:
        similar_elements = [' and '.join(ele) for ele in similar_elements]
        similar_elements = "\n".join(similar_elements)
        return "The following elements are semantically similar:\n" + similar_elements
    else:
        return "There are no similar elements."
# #################################

# ########## Misspelling ###########
def get_misspelled_words(sentence):
    spell = Speller(only_replacements=True)
    misspelled = []
    for word in sentence.split():
        correct_word = spell(word)
        if word != correct_word:
            misspelled.append([word, correct_word])
    return misspelled

def check_spelling(elements):

    spelling_mistakes = []
    spelling_mistakes_string = ""

    for key, value in elements.items():
        for i in range(0, len(elements[key])):
            if get_misspelled_words(elements[key][i]):
                spelling_mistakes.append([elements[key][i], get_misspelled_words(elements[key][i])])

    for idx, element in enumerate(spelling_mistakes):
        for spelling_mistake in element[1]:
            temp = ' should be written as '.join(spelling_mistake)
            spelling_mistakes_string = spelling_mistakes_string + "\n" + element[0] + ": " + temp

    return spelling_mistakes_string
# ##################################

# ########## NLI ###########
def do_nli(premise, hypothesis, model, tokenizer):

    # Tokenization
    token_ids = []
    seg_ids = []
    mask_ids = []

    premise_id = tokenizer.encode(premise, add_special_tokens=False)
    hypothesis_id = tokenizer.encode(hypothesis, add_special_tokens=False)
    pair_token_ids = [tokenizer.cls_token_id] + premise_id + [tokenizer.sep_token_id] + hypothesis_id + [tokenizer.sep_token_id]
    premise_len = len(premise_id)
    hypothesis_len = len(hypothesis_id)

    segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1))  # sentence 0 and sentence 1
    attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))  # mask padded values

    token_ids.append(torch.tensor(pair_token_ids))
    seg_ids.append(segment_ids)
    mask_ids.append(attention_mask_ids)

    # Forward pass
    token_ids = pad_sequence(token_ids, batch_first=True)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    seg_ids = pad_sequence(seg_ids, batch_first=True)
    with torch.no_grad():
        output = model(token_ids, token_type_ids=seg_ids, attention_mask=mask_ids)

    # Output prediction
    result = ""
    prediction = np.argmax(output.logits.cpu().numpy()).flatten().item()
    if prediction == 0:
        result = "Entailment"
        #print("Entailment")
    elif prediction == 1:
        result = "Contradiction"
        #print("Contradiction")
    elif prediction == 2:
        result = "Neutral"
        #print("Neutral")

    return result
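# Minimal usage sketch for do_nli (assuming the nouf-sst/bert-base-MultiNLI checkpoint is accessible;
# check_entailment and check_contradiction below load it with an auth token):
#   model = BertForSequenceClassification.from_pretrained("nouf-sst/bert-base-MultiNLI")
#   tokenizer = BertTokenizer.from_pretrained("nouf-sst/bert-base-MultiNLI", do_lower_case=True)
#   do_nli("The system processes orders", "Orders are processed by the system", model, tokenizer)
#   # -> "Entailment", "Contradiction", or "Neutral", per the label mapping used in do_nli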
# Entailment
def check_entailment(decomposed_elements):

    model = BertForSequenceClassification.from_pretrained(
        "nouf-sst/bert-base-MultiNLI",
        use_auth_token="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")
    tokenizer = BertTokenizer.from_pretrained(
        "nouf-sst/bert-base-MultiNLI",
        use_auth_token="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",
        do_lower_case=True)

    sentence_pairs = []
    non_matching_elements = []

    for key, value in decomposed_elements.items():
        #print(key, value)
        for i in decomposed_elements[key]:
            #print(key, i)
            sentence_pairs.append([key, i])

    # A parent element is expected to entail each of its sub-elements
    for sentence_pair in sentence_pairs:
        result = do_nli(sentence_pair[0], sentence_pair[1], model, tokenizer)
        #print(result)
        if result != "Entailment":
            non_matching_elements.append(sentence_pair)

    if non_matching_elements:
        non_matching_elements = [' and '.join(ele) for ele in non_matching_elements]
        non_matching_elements = "\n".join(non_matching_elements)
        return "The following elements are mismatched:\n" + non_matching_elements
    else:
        return "There are no mismatched elements."

# Contradiction
def check_contradiction(elements_per_actor):

    model = BertForSequenceClassification.from_pretrained(
        "nouf-sst/bert-base-MultiNLI",
        use_auth_token="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")
    tokenizer = BertTokenizer.from_pretrained(
        "nouf-sst/bert-base-MultiNLI",
        use_auth_token="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",
        do_lower_case=True)

    sentence_pairs = []
    contradicting_elements = []

    for key, value in elements_per_actor.items():
        for i in range(len(elements_per_actor[key])):
            for j in range(i + 1, len(elements_per_actor[key])):
                sentence_pairs.append([elements_per_actor[key][i], elements_per_actor[key][j]])
    #print(sentence_pairs)

    # Check contradiction
    for sentence_pair in sentence_pairs:
        result = do_nli(sentence_pair[0], sentence_pair[1], model, tokenizer)
        #print(result)
        if result == "Contradiction":
            contradicting_elements.append(sentence_pair)

    if contradicting_elements:
        contradicting_elements = [' and '.join(ele) for ele in contradicting_elements]
        contradicting_elements = "\n".join(contradicting_elements)
        return "The following elements are contradicting:\n" + contradicting_elements
    else:
        return "There are no contradicting elements."
# ##########################

# ************************* User Interface *************************

def identify_bad_smells(tgrl_file, selected_bad_smells):

    output = ""

    tgrl_text = parse_tgrl(tgrl_file)
    elements, elements_per_actor, decomposed_elements = extract_elements(tgrl_text)

    if 'Size' in selected_bad_smells:
        output = output + get_long_elements(elements) + "\n\n"
    if 'Complexity' in selected_bad_smells:
        output = output + get_complex_sentences(elements) + "\n\n"
    if 'Punctuations' in selected_bad_smells:
        output = output + get_punctuations(elements) + "\n\n"
    if 'Actors Syntax' in selected_bad_smells:
        output = output + check_actor_syntax(elements['actors']) + "\n\n"
    if 'Goals Syntax' in selected_bad_smells:
        output = output + check_goal_syntax(elements['goals']) + "\n\n"
    if 'Softgoals Syntax' in selected_bad_smells:
        output = output + check_softgoal_syntax(elements['softGoals']) + "\n\n"
    if 'Tasks Syntax' in selected_bad_smells:
        output = output + check_task_syntax(elements['tasks']) + "\n\n"
    if 'Similar Elements' in selected_bad_smells:
        output = output + get_similar_elements(elements_per_actor) + "\n\n"
    if 'Spelling Mistakes' in selected_bad_smells:
        output = output + check_spelling(elements) + "\n\n"
    if 'Goal-Subgoal Mismatch' in selected_bad_smells:
        output = output + check_entailment(decomposed_elements) + "\n\n"
    if 'Contradicting Elements' in selected_bad_smells:
        output = output + check_contradiction(elements_per_actor) + "\n\n"

    return output

interface = gr.Interface(
    fn=identify_bad_smells,
    inputs=[
        gr.File(label="TGRL File"),
        gr.CheckboxGroup(
            ["Size", "Complexity", "Punctuations",
             "Actors Syntax", "Goals Syntax", "Softgoals Syntax", "Tasks Syntax",
             "Similar Elements", "Spelling Mistakes",
             "Goal-Subgoal Mismatch", "Contradicting Elements"],
            label="Which bad smells do you want to detect?"),
    ],
    outputs=["text"],
    title="TGRL Bad Smells Detection",
    description="Upload your .xgrl file and we will find the bad smells for you!")

interface.launch(inline=False)
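# Note: inline=False only prevents embedding the app in a notebook cell; the server still starts and
# prints a local URL. For a temporary public link when running locally, Gradio also supports, e.g.:
#   interface.launch(inline=False, share=True)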