from googletrans import Translator
import spacy
import gradio as gr
import nltk
from nltk.corpus import wordnet
import wikipedia

# NLTK corpora and models fetched at start-up, downloaded in the order listed.
_NLTK_RESOURCES = (
    'maxent_ne_chunker',  # Chunker
    'stopwords',  # Stop Words List (Mainly Roman Languages)
    'words',  # 200 000+ Alphabetical order list
    'punkt',  # Tokenizer
    'verbnet',  # For Description of Verbs
    'omw',
    'omw-1.4',  # Multilingual Wordnet
    'wordnet',  # For Definitions, Antonyms and Synonyms
    'shakespeare',
    'dolch',  # Sight words
    'names',  # People Names NER
    'gazetteers',  # Location NER
    'opinion_lexicon',  # Sentiment words
    # POS tagger model needed by nltk.pos_tag (called from filter_words).
    # Without it pos_tag fails with LookupError on a machine that does not
    # already have the model. Newer NLTK releases look the model up under the
    # "_eng" name, so both ids are requested; an id the installed index does
    # not know is only reported by the downloader, it does not raise.
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)
for _resource in _NLTK_RESOURCES:
    nltk.download(_resource)


# Fetch the small English spaCy pipeline before loading it below.
spacy.cli.download("en_core_web_sm")

# Shared spaCy pipeline (used by keep_nouns_verbs) and googletrans client
# (used by FrontRevSentChunk).
nlp = spacy.load('en_core_web_sm')
translator = Translator()

def Sentencechunker(sentence):
    """Return every growing prefix of ``sentence``, separated by " | ".

    The sentence is split on single spaces, so "a b c" becomes
    "a | a b | a b c". Consecutive spaces yield empty pieces that are kept.
    """
    pieces = sentence.split(" ")
    prefixes = (" ".join(pieces[:end]) for end in range(1, len(pieces) + 1))
    return " | ".join(prefixes)

def ReverseSentenceChunker(sentence):
    """Reverse the word order of ``sentence`` and return its growing prefixes.

    Words are split on any whitespace, re-joined in reverse order with single
    spaces, and the result is handed to ``Sentencechunker``.
    """
    words_backwards = sentence.split()[::-1]
    return Sentencechunker(" ".join(words_backwards))

def three_words_chunk(sentence):
    """Return the sliding three-word windows of ``sentence`` joined by " | ".

    Words are split on any whitespace. "a b c d" becomes "a b c | b c d".
    A sentence with fewer than three words has no full window; it is returned
    as a single chunk (whitespace-normalised) instead of being dropped, so
    short inputs no longer produce an empty result.
    """
    words = sentence.split()
    if len(words) < 3:
        return " ".join(words)
    windows = (" ".join(words[start:start + 3]) for start in range(len(words) - 2))
    return " | ".join(windows)

def keep_nouns_verbs(sentence):
    """Keep only the tokens spaCy tags as NOUN, VERB or PUNCT.

    The sentence is run through the module-level ``nlp`` pipeline and the text
    of each kept token is joined with single spaces, in original order.
    """
    wanted_pos = ('NOUN', 'VERB', 'PUNCT')
    kept = [tok.text for tok in nlp(sentence) if tok.pos_ in wanted_pos]
    return " ".join(kept)

def unique_word_count(text="", state=None):
    """Count whitespace-separated words and return them most frequent first.

    Args:
        text: Text to count; split on any whitespace, case-sensitive.
        state: Optional dict of running counts. When given it is updated in
            place, so counts accumulate across calls; when omitted a fresh
            dict is used for this call only.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count. Words
        with equal counts keep their first-seen order (the sort is stable).
    """
    if state is None:
        state = {}
    word_counts = state
    for word in text.split():
        word_counts[word] = word_counts.get(word, 0) + 1
    # Return the list itself: a stray trailing comma here used to wrap it in a
    # 1-tuple, which the UI rendered as "([...],)".
    return sorted(word_counts.items(), key=lambda item: item[1], reverse=True)

def Wordchunker(word):
    """Return the list of growing prefixes of ``word`` ("abc" -> a, ab, abc)."""
    return [word[:end] for end in range(1, len(word) + 1)]

def BatchWordChunk(sentence):
    """Spell out every word of ``sentence`` as growing prefixes, one line each.

    The sentence is split on single spaces. Each word contributes a newline
    followed by its ``Wordchunker`` prefixes, every prefix followed by one
    space (so each line ends with a trailing space).
    """
    lines = []
    for word in sentence.split(" "):
        spelled = "".join(prefix + " " for prefix in Wordchunker(word))
        lines.append("\n" + spelled)
    return "".join(lines)

# Destination-language picker; the chosen code is passed as `dest` to the
# translator by FrontRevSentChunk.
langdest = gr.Dropdown(
    choices=["af", "de", "es", "ko", "ja", "zh-cn"],
    label="Choose Language",
    value="de",
)

# Chunking-strategy picker; the values match the modes FrontRevSentChunk checks.
ChunkModeDrop = gr.Dropdown(
    choices=["Chunks", "Reverse", "Three Word Chunks", "Spelling Chunks"],
    label="Choose Chunk Type",
    value="Chunks",
)

def FrontRevSentChunk(Chunkmode, Translate, Text, langdest):
    """Chunk ``Text`` with the selected mode and optionally append a translation.

    Args:
        Chunkmode: One of "Chunks", "Reverse", "Three Word Chunks" or
            "Spelling Chunks"; any other value produces an empty chunk string.
        Translate: When truthy, the chunked text is translated and the
            translation is appended on a new line.
        Text: The text to chunk.
        langdest: Destination language code handed to the translator.

    Returns:
        The chunked text, followed by "\n" and its translation when requested.
    """
    chunkers = {
        "Chunks": Sentencechunker,
        "Reverse": ReverseSentenceChunker,
        "Three Word Chunks": three_words_chunk,
        "Spelling Chunks": BatchWordChunk,
    }
    chunker = chunkers.get(Chunkmode)
    output = chunker(Text) if chunker is not None else ""

    if not Translate:
        return output

    translated = translator.translate(output, dest=langdest)
    return output + "\n" + translated.text

def filter_words(words):
    """Return the entries of ``words`` that NLTK tags as verb, noun or adjective.

    ``words`` is tagged with ``nltk.pos_tag`` and only entries whose tag is one
    of the verb (VB*), noun (NN*) or adjective (JJ*) tags listed below are
    kept, in the order the tagger returned them.
    """
    wanted_tags = frozenset((
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',  # verbs
        'NN', 'NNS', 'NNP', 'NNPS',  # nouns
        'JJ', 'JJR', 'JJS',  # adjectives
    ))
    kept = []
    for token, tag in nltk.pos_tag(words):
        if tag in wanted_tags:
            kept.append(token)
    return kept

# NOTE: a leftover demo used to call filter_words(words) and print the result
# at this point, but no module-level `words` is ever defined, so importing the
# file raised NameError before the UI could be built. The call was removed;
# to try the filter by hand, pass it a list of words from an interactive
# session.

def SepHypandSynExpansion(text):
    """List WordNet synonyms/hypernyms per token and flag tokens with none.

    Args:
        text: Text to analyse; tokenised with ``nltk.word_tokenize``.

    Returns:
        A ``(no_hits_report, expansions)`` pair of strings.
        ``no_hits_report`` names the tokens WordNet knows nothing about, after
        ``filter_words`` has reduced them to verbs, nouns and adjectives.
        ``expansions`` holds one "token: hypernyms=..., synonyms=..." entry
        for every token that has at least one synset.
    """
    tokens = nltk.word_tokenize(text)
    no_hits = []
    expansions = []

    # Find synonyms and hypernyms of each word in the text
    for token in tokens:
        synonyms = []
        hypernyms = []
        for synset in wordnet.synsets(token):
            synonyms += synset.lemma_names()
            hypernyms += [hypernym.name() for hypernym in synset.hypernyms()]
        if not synonyms and not hypernyms:
            no_hits.append(token)
        else:
            expansions.append(f"\n{token}: hypernyms={hypernyms}, synonyms={synonyms} \n")

    # De-duplicate while keeping first-seen order. Collecting the tokens in a
    # list (instead of a " | "-joined string that is split again into a set)
    # avoids feeding an empty-string token to the tagger, keeps a literal "|"
    # token intact, and makes the report order deterministic.
    unique_no_hits = list(dict.fromkeys(no_hits))
    attention_words = filter_words(unique_no_hits)
    report = "Words to pay special attention to: \n" + str(attention_words)
    return report, "".join(expansions)


def WikiSearch(term):
    """Search Wikipedia for every word in ``term`` and return the hits.

    Args:
        term: One or more whitespace-separated words; each word is searched
            separately with up to 20 results.

    Returns:
        A list of the search results for all words, in search order with
        duplicates removed. Blank input returns an empty list without
        contacting Wikipedia.
    """
    hits = []
    # Keep the results of every word; overwriting a single variable on each
    # iteration would return only the last word's results.
    for item in term.split():
        hits.extend(wikipedia.search(item, results=20))
    return list(dict.fromkeys(hits))

# Build the tabbed Gradio UI. Components are created in statement order, so
# the order of the tabs and of the widgets inside each tab follows this code.
with gr.Blocks() as lliface:
  # Landing tab: static HTML with project notes and links.
  with gr.Tab("Welcome "):
    gr.HTML("<h1> Spaces Test - Still Undercontruction </h1> <p> You only learn when you convert things you dont know to known --> Normally Repetition is the only reliable method for everybody </p>  <p> Knowledge is a Language </p> <p>LingQ is good option for per word state management</p> <p> Arrows app json creator for easy knowledge graphing and spacy POS graph? </p> <p> https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles<br>, https://huggingface.co/spaces/vumichien/whisper-speaker-diarization<br>  Maybe duplicate these, private them and then load into spaces? --> Whisper space for youtube, Clip Interrogator, load here and all my random functions esp. text to HTML </p>")
  # Static notes tab.
  with gr.Tab("LingQ Addons ideas"):
    gr.HTML("Extra functions needed - Persitent Sentence translation, UNWFWO, POS tagging and Word Count per user of words in their account")      
  # Loads an externally hosted Space by its hub path; the HTML above it gives
  # a fallback link in case the embedded interface does not work.
  with gr.Tab("Transcribe - RASMUS Whisper"):
    gr.HTML("""<p>If this tab doesnt work use the link below ⬇️</p> <a href="https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles">https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles</a>""")
    gr.Interface.load("spaces/RASMUS/Whisper-youtube-crosslingual-subtitles", title="Subtitles")
  # Sentence/word chunking helpers. The input order here must match the
  # parameter order of FrontRevSentChunk(Chunkmode, Translate, Text, langdest).
  with gr.Tab("Chunks"):
    gr.Interface(fn=FrontRevSentChunk, inputs=[ChunkModeDrop, "checkbox", "text", langdest], outputs="text")
    gr.Interface(fn=keep_nouns_verbs, inputs=["text"], outputs="text", title="Noun and Verbs only (Plus punctuation)")
  # Vocabulary helpers. Only `text` is wired into unique_word_count, so its
  # `state` parameter keeps its default on every call.
  with gr.Tab("Unique words, Hypernyms and synonyms"):
    gr.Interface(fn=unique_word_count, inputs="text", outputs="text", title="Wordcounter")
    gr.Interface(fn=SepHypandSynExpansion, inputs="text", outputs=["text", "text"], title="Word suggestions")
    gr.Interface(fn=WikiSearch, inputs="text", outputs="text", title="Unique word suggestions(wiki articles)")  
  # Embeds a CodePen page in an iframe; the inner markup is the fallback
  # content of the iframe element.
  with gr.Tab("Timing Practice"):
    gr.HTML("""<iframe height="1200" style="width: 100%;" scrolling="no" title="Memorisation Aid" src="https://codepen.io/kwabs22/embed/preview/GRXKQgj?default-tab=result&editable=true" frameborder="no" loading="lazy" allowtransparency="true" allowfullscreen="true">
  See the Pen <a href="https://codepen.io/kwabs22/pen/GRXKQgj">
  Memorisation Aid</a> by kwabs22 (<a href="https://codepen.io/kwabs22">@kwabs22</a>)
  on <a href="https://codepen.io">CodePen</a>.
</iframe>""")

# Start the app when the file is executed.
lliface.launch()