"""Gradio language-learning helper app.

Provides sentence/word chunking (optionally machine-translated), noun/verb
filtering, word counting, WordNet synonym/hypernym expansion and Wikipedia
title suggestions, exposed as tabs of a single Gradio Blocks interface.
"""

from googletrans import Translator
import spacy
import gradio as gr
import nltk
from nltk.corpus import wordnet
import wikipedia

nltk.download('maxent_ne_chunker')  # Chunker
nltk.download('stopwords')  # Stop Words List (Mainly Roman Languages)
nltk.download('words')  # 200 000+ Alphabetical order list
nltk.download('punkt')  # Tokenizer
nltk.download('verbnet')  # For Description of Verbs
nltk.download('omw')
nltk.download('omw-1.4')  # Multilingual Wordnet
nltk.download('wordnet')  # For Definitions, Antonyms and Synonyms
nltk.download('shakespeare')
nltk.download('dolch')  # Sight words
nltk.download('names')  # People Names NER
nltk.download('gazetteers')  # Location NER
nltk.download('opinion_lexicon')  # Sentiment words
# Required by nltk.pos_tag (used in filter_words); without it pos_tag raises
# LookupError the first time it is called.
nltk.download('averaged_perceptron_tagger')

spacy.cli.download("en_core_web_sm")

nlp = spacy.load('en_core_web_sm')
translator = Translator()


def Sentencechunker(sentence):
    """Return the growing prefixes of *sentence*, joined by " | ".

    The sentence is split on single spaces, so "a b c" yields
    "a | a b | a b c".
    """
    words = sentence.split(" ")
    prefixes = [" ".join(words[:end]) for end in range(1, len(words) + 1)]
    return " | ".join(prefixes)


def ReverseSentenceChunker(sentence):
    """Reverse the word order of *sentence*, then chunk it like Sentencechunker."""
    reversed_sentence = " ".join(reversed(sentence.split()))
    return Sentencechunker(reversed_sentence)


def three_words_chunk(sentence):
    """Return every sliding window of three consecutive words, joined by " | ".

    Sentences with fewer than three words produce an empty string.
    """
    words = sentence.split()
    windows = [" ".join(words[i:i + 3]) for i in range(len(words) - 2)]
    return " | ".join(windows)


def keep_nouns_verbs(sentence):
    """Return only the tokens spaCy tags as NOUN, VERB or PUNCT, space-joined."""
    wanted = {'NOUN', 'VERB', 'PUNCT'}
    return " ".join(token.text for token in nlp(sentence) if token.pos_ in wanted)


def unique_word_count(text="", state=None):
    """Count whitespace-separated words in *text*, most frequent first.

    Args:
        text: Text to count words in.
        state: Optional dict of existing counts; it is updated in place so
            counts can accumulate across calls.

    Returns:
        A 1-tuple holding the list of (word, count) pairs sorted by
        descending count. The tuple wrapper is kept for backward
        compatibility with existing callers.
    """
    word_counts = {} if state is None else state
    for word in text.split():
        word_counts[word] = word_counts.get(word, 0) + 1
    sorted_word_counts = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
    return sorted_word_counts,


def Wordchunker(word):
    """Return the growing prefixes of *word*: "cat" -> ["c", "ca", "cat"]."""
    return [word[:end] for end in range(1, len(word) + 1)]


def BatchWordChunk(sentence):
    """Spell out every word of *sentence* as growing prefixes, one word per line.

    Each line starts with a newline and every prefix is followed by a space.
    """
    lines = []
    for word in sentence.split(" "):
        spelled = "".join(chunk + " " for chunk in Wordchunker(word))
        lines.append("\n" + spelled)
    return "".join(lines)


# Target-language and chunk-mode selectors shared by the "Chunks" tab.
langdest = gr.Dropdown(choices=["af", "de", "es", "ko", "ja", "zh-cn"], label="Choose Language", value="de")

ChunkModeDrop = gr.Dropdown(choices=["Chunks", "Reverse", "Three Word Chunks", "Spelling Chunks"], label="Choose Chunk Type", value="Chunks")

# Maps each chunk-mode label offered by ChunkModeDrop to its implementation.
_CHUNKERS = {
    "Chunks": Sentencechunker,
    "Reverse": ReverseSentenceChunker,
    "Three Word Chunks": three_words_chunk,
    "Spelling Chunks": BatchWordChunk,
}


def FrontRevSentChunk(Chunkmode, Translate, Text, langdest):
    """Chunk *Text* with the selected mode and optionally append a translation.

    Args:
        Chunkmode: One of the labels in ChunkModeDrop; unknown values give
            an empty result.
        Translate: When truthy, the chunked text is translated and appended
            on a new line.
        Text: The text to chunk.
        langdest: Destination language code passed to the translator.

    Returns:
        The chunked text, followed by its translation when requested.
    """
    chunker = _CHUNKERS.get(Chunkmode)
    FinalOutput = chunker(Text) if chunker is not None else ""

    # Skip the network round-trip when there is nothing to translate.
    if Translate and FinalOutput:
        translated = translator.translate(FinalOutput, dest=langdest)
        FinalOutput += "\n" + translated.text

    return FinalOutput


def filter_words(words):
    """Keep only the verbs, nouns and adjectives from *words*.

    Args:
        words: Iterable of word strings.

    Returns:
        List of the words whose NLTK part-of-speech tag is a verb, noun or
        adjective tag.
    """
    # Use NLTK to tag each word with its part of speech
    tagged_words = nltk.pos_tag(list(words))

    # Parts of speech to keep (verbs, nouns, adjectives)
    keep_pos = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'}

    return [word for word, pos in tagged_words if pos in keep_pos]


# NOTE: a module-level demo call `filter_words(words)` used to live here; it
# referenced an undefined name and raised NameError at import, so it was
# removed.


def SepHypandSynExpansion(text):
    """List WordNet synonyms and hypernyms for every token in *text*.

    Returns:
        A pair (no_hits, expansions): no_hits is a message listing the
        verb/noun/adjective tokens WordNet had no entry for; expansions has
        one "token: hypernyms=..., synonyms=..." line per token that had
        matches.
    """
    tokens = nltk.word_tokenize(text)
    missing = []
    expansions = []

    for token in tokens:
        synonyms = []
        hypernyms = []
        for synset in wordnet.synsets(token):
            synonyms += synset.lemma_names()
            hypernyms += [hypernym.name() for hypernym in synset.hypernyms()]
        if not synonyms and not hypernyms:
            missing.append(token)
        else:
            expansions.append(f"\n{token}: hypernyms={hypernyms}, synonyms={synonyms} \n")

    # De-duplicate while keeping first-seen order so the output is stable.
    unique_missing = list(dict.fromkeys(missing))
    NoHits = "Words to pay special attention to: \n" + str(filter_words(unique_missing))
    return NoHits, "".join(expansions)


def WikiSearch(term):
    """Search Wikipedia for each word in *term* and return matching titles.

    Returns:
        List of article titles (up to 20 per word), de-duplicated in
        first-seen order. Empty input yields an empty list.
    """
    titles = []
    for item in term.split():
        titles.extend(wikipedia.search(item, results=20))
    return list(dict.fromkeys(titles))


with gr.Blocks() as lliface:
    with gr.Tab("Welcome "):
        # NOTE(review): this text appears to have lost its HTML markup at some
        # point; restore tags here if line breaks/links are wanted.
        gr.HTML("""
You only learn when you convert things you dont know to known --> Normally Repetition is the only reliable method for everybody
Knowledge is a Language
LingQ is good option for per word state management
Arrows app json creator for easy knowledge graphing and spacy POS graph?
https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles
, https://huggingface.co/spaces/vumichien/whisper-speaker-diarization
Maybe duplicate these, private them and then load into spaces? --> Whisper space for youtube, Clip Interrogator, load here and all my random functions esp. text to HTML
If this tab doesnt work use the link below ⬇️
https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles""")
        gr.Interface.load("spaces/RASMUS/Whisper-youtube-crosslingual-subtitles", title="Subtitles")
    with gr.Tab("Chunks"):
        gr.Interface(fn=FrontRevSentChunk, inputs=[ChunkModeDrop, "checkbox", "text", langdest], outputs="text")
        gr.Interface(fn=keep_nouns_verbs, inputs=["text"], outputs="text", title="Noun and Verbs only (Plus punctuation)")
    with gr.Tab("Unique words, Hypernyms and synonyms"):
        gr.Interface(fn=unique_word_count, inputs="text", outputs="text", title="Wordcounter")
        gr.Interface(fn=SepHypandSynExpansion, inputs="text", outputs=["text", "text"], title="Word suggestions")
        gr.Interface(fn=WikiSearch, inputs="text", outputs="text", title="Unique word suggestions(wiki articles)")
    with gr.Tab("Timing Practice"):
        gr.HTML("""""")

lliface.launch()