Spaces: Update app.py

app.py CHANGED
@@ -15,7 +15,7 @@ from gtts import gTTS
 from collections import Counter
 from PIL import Image, ImageDraw, ImageFont
 import numpy as np
-
+from docx import Document
 
 #Uncomment these for Huggingface
 nltk.download('maxent_ne_chunker') #Chunker

@@ -189,8 +189,8 @@ def merge_lines(roman_file, w4w_file, full_mean_file, macaronic_file):
 
     return "\n".join(merged_lines)
 
-TTSLangOptions = gr.Dropdown(choices=["en", "ja", "ko", "zh-cn"], value="en", label="choose the language of the srt")
-TTSLangOptions2 = gr.Dropdown(choices=["en", "ja", "ko", "zh-cn"], value="en", label="choose the language of the srt")
+TTSLangOptions = gr.Dropdown(choices=["en", "de", "es", "ja", "ko", "zh-cn"], value="en", label="choose the language of the srt")
+TTSLangOptions2 = gr.Dropdown(choices=["en", "de", "es", "ja", "ko", "zh-cn"], value="en", label="choose the language of the srt")
 
 def TTSforListeningPractice(text, language = "en"):
     speech = gTTS(text=text, lang=language, slow="False")
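
A hedged aside on the widened dropdowns: the added "de" and "es" choices are standard gTTS language codes, so they flow straight into the existing gTTS call above. A minimal sketch (the text and filename are illustrative, and slow is passed as a real boolean here):

    from gtts import gTTS
    # Same call shape as TTSforListeningPractice uses internally
    tts = gTTS(text="hola mundo", lang="es", slow=False)
    tts.save("listening_practice.mp3")  # illustrative output name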

@@ -290,6 +290,16 @@ def split_verbs_nouns(text):
 
 SRTLangOptions = gr.Dropdown(choices=["en", "ja", "ko", "zh-cn"], value="en", label="choose the language of the srt")
 
+def save_string_to_file(string_to_save, file_name, srtdocx):
+    with open(file_name, 'w', encoding='utf-8') as file:
+        file.write(string_to_save)
+    if srtdocx == "True":
+        with open(file_name.split('.')[0] + '.srt', 'w', encoding='utf-8') as file:
+            file.write(string_to_save)
+        srtdocument = Document()
+        srtdocument.add_paragraph(string_to_save)
+        srtdocument.save('SplitSRT.docx')
+
 def split_srt_file(text, lang): #file_path):
     # Open the SRT file and read its contents
     #with open(file_path, 'r') as f:
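
A minimal sketch of what the new save_string_to_file helper writes, assuming python-docx is installed (the sample strings are illustrative; the .docx name is hard-coded to SplitSRT.docx inside the function):

    # Note srtdocx is compared against the string "True", not a boolean
    save_string_to_file("1\n00:00:01,000 --> 00:00:02,000\nhello | world", "SplitSRT.txt", "True")   # writes SplitSRT.txt, SplitSRT.srt and SplitSRT.docx
    save_string_to_file("PRON | VERB", "SplitPOSsrt.txt", "False")                                   # writes only SplitPOSsrt.txt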

@@ -305,6 +315,7 @@ def split_srt_file(text, lang): #file_path):
     # Split the SRT file by timestamp
     srt_sections = srt_contents.split('\n\n')
     srt_sections_POSversion = []
+    subaswordlist = ""
 
     # Loop through each section of the SRT file
     for i in range(len(srt_sections)):

@@ -319,14 +330,27 @@ def split_srt_file(text, lang): #file_path):
         #subtitle_text = subtitle_text.replace(' ', ' | ')
         for token in sub_split_line:
             subtitle_text += token.text + " | "
+            subaswordlist += token.text + " "
             subtitle_textPOSversion += token.pos_ + " | "
 
         # Reconstruct the section with the updated subtitle text
         srt_sections[i] = f"{section_lines[0]}\n{timestamp}\n{subtitle_text[3:]}"
         srt_sections_POSversion.append(f"{section_lines[0]}\n{timestamp}\n{subtitle_textPOSversion[3:]}\n\n")
 
+    SplitSRT = '\n\n'.join(srt_sections)
+    SplitPOSsrt = ''.join(srt_sections_POSversion)
+    save_string_to_file(SplitSRT, "SplitSRT.txt", "True")
+    save_string_to_file(SplitPOSsrt, "SplitPOSsrt.txt", "False")
+    subaswordlist = set(subaswordlist.split(" "))
+    subaswordlistOutput = ""
+
+    for word in subaswordlist:
+        subaswordlistOutput += "\n | " + word
+
+    subaswordlistOutput = str(len(subaswordlist)) + "\n" + subaswordlistOutput
+
     # Join the SRT sections back together into a single string
-    return
+    return subaswordlistOutput, ["SplitSRT.docx", "SplitSRT.txt", "SplitSRT.srt", "SplitPOSsrt.txt"], SplitSRT, SplitPOSsrt
 
 def find_string_positions(s, string):
     positions = []
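
For orientation, a hedged sketch of how the new four-part return value of split_srt_file lines up with the outputs=["text", "file", "text", "text"] wiring in the Merged Subtitles tab further down (the sample SRT is illustrative):

    sample_srt = "1\n00:00:01,000 --> 00:00:03,000\nhello world"
    wordlist, files, SplitSRT, SplitPOSsrt = split_srt_file(sample_srt, "en")
    print(wordlist)  # unique-word count on the first line, then one "| word" entry per line
    print(files)     # ["SplitSRT.docx", "SplitSRT.txt", "SplitSRT.srt", "SplitPOSsrt.txt"]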

@@ -463,18 +487,191 @@ def add_text_to_image(input_image, text, output_image_path="output.png", border_
     img.save(output_image_path, "PNG")
     return "output.png"
 
+def UnknownTrackTexttoApp(text): #Copy of def OptimisedTtAppForUNWFWO(text):
+    #Buttons and labels autocreation
+    #Change this to spacy version so that data is from one library
+    #Javascript videos on youtube - KodeBase - Change button color Onclick; bro code - button in 5 minutes
+    #GPT3 helped guide the highlighting if statements
+
+    FinalOutput = ""
+    #sentence = "One Piece chapter 1049 spoilers Thanks to Etenboby from WG forums Chapter 1049: **\"The world we should aspire to\"** * In the cover, someone burned Niji and Yonji\u2019s book * Kaido flashback time. We see his childhood in Vodka Kingdom, and where a few years later he met Whitebeard who told him that Rocks wants to meet him * In the present, part of Raizo\u2019s water leaves the castle and flame clouds disappear. But Momo makes a new one. * Luffy says he will create a world where none of his friends would starve, then he hits Kaido and Kaido falls to the ground of the flower capital. * In another flashback, Kaido tells King that Joy Boy will be the man that can defeat him. **Additional info** *Flashback to Kaidou as a kid* *- His country tries to sell him to the marines but he escapes* *- He rampages in Hachinosu(i think it's blackbeard's island) and Rocks invites him to his crew* *- Young WB appears* *- Rocks flashback suddenly ends* *- Higurashi invites Kaidou* *- The flashback ends with Kaidou telling King he knows who Joy Boy is.* *Back to the present* \\- *Denjirou hugs Hiyori* \\- *Luffy's punch hits Kaidou* *Flashback continues* \\- *King asks: Who is it then?* \\- *Kaidou: The one who will defeat me* \\- *King: Then he will not appear* \\- *Onigashima falls near the capital* \\- *Momo falls* **BREAK NEXT WEEK** https://www.reddit.com/r/OnePiece/comments/umu2h0/one_piece_chapter_1049_spoilers/" #@param {type: "string"}
+    HTMLMainbody = ""
+
+    doc = nlp(text)
+    iIDNumber = 0
+    iVerbCount = 0
+    iNounCount = 0
+    iWords = 0
+    allverbs = ""
+    allverbslist = ""
+    allverbids = ""
+    allverbidslist = ""
+
+    for token in doc:
+        if (token.pos_ == "VERB") or (token.pos_ == "AUX"):
+            HTMLMainbody = HTMLMainbody + "<button id='btn" + str(iVerbCount) + "' onclick=HighlightWord('btn" + str(iVerbCount) + "')> " + token.text + "</button> "
+            allverbids = allverbids + str(iVerbCount) + " "
+            iVerbCount += 1
+            iWords += 1
+            allverbs = allverbs + token.text + " "
+        elif token.pos_ == "NOUN":
+            HTMLMainbody = HTMLMainbody + "<label class='Nouns' id='lbl" + token.text + "'>" + token.text + " </label>"
+            iNounCount += 1
+            iWords += 1
+        elif token.pos_ == "PUNCT":
+            HTMLMainbody = HTMLMainbody + token.text
+        else:
+            HTMLMainbody = HTMLMainbody + token.text + " "
+            iWords += 1
+        iIDNumber += 1
+
+    allverbslist = allverbs.split()
+    allverbidslist = allverbids.split()
+
+    FinalHTML = ""
+    FinalCSS = ""
+    FinalJS = ""
+
+    FinalCSS = FinalCSS + ''' <style>
+    body {
+        background-color: darksalmon;
+    }
+
+    .Nouns {
+        color: red;
+    }
+
+    .clunknown{
+        background-color: gainsboro;
+    }
+
+    .clknownl1{
+        background-color: yellow;
+    }
+
+    .clknownl2{
+        background-color: gold;
+    }
+
+    .clknownl3{
+        background-color: orange;
+    }
+
+    .PD1 {
+        text-align: center;
+        font-size: larger;
+        font-family: cursive;
+    }
+
+    .PD2 {
+        font-family: monospace;
+    }
+    </style>
+    '''
+
+    #style='background-color:Gainsboro; There is no general style attribute for buttons but you can make a class and put the style conditions
+
+    iSents = 0
+    for sent in doc.sents:
+        iSents += 1
+
+    FinalHTML = FinalHTML + "\n<div id='PD1'>Picture on mouse hover = Visual<br> Speed = End Goal ==> App Timer Functions ||| \nSentences: " + str(iSents) + " | Words: " + str(iWords) + " | App elements: " + str(iNounCount + iVerbCount) + " | Verbs: " + str(iVerbCount) + "</div>"
+    FinalHTML = FinalHTML + "\n<div><hr><progress id='myVerbProgress' value='0' max='" + str(iVerbCount) + "'></progress></div>"
+    FinalJS = FinalJS + '''\n
+    <script>
+    function HighlightWord(Button){
+        if (document.getElementById(Button).style.backgroundColor === 'orange') {
+            document.getElementById(Button).style.backgroundColor=''
+        }
+        else if (document.getElementById(Button).style.backgroundColor === 'gold') {
+            document.getElementById(Button).style.backgroundColor='orange'
+        }
+        else if (document.getElementById(Button).style.backgroundColor === 'yellow') {
+            document.getElementById(Button).style.backgroundColor='gold'
+        }
+        else {document.getElementById(Button).style.backgroundColor='yellow'
+        }
+        OnlyUnknownVerbs()
+    }
+    '''
+
+    FinalHTML = FinalHTML + "\n<div><hr>\n" + HTMLMainbody + "\n"
+    #FinalHTML = FinalHTML + '''</div><hr>
+    #<button onclick=OnlyUnknownSentences() id="btnOnlyUnknownSentences">Only Unknown Sentences Put this function in a timer to keep up to date without input</button>
+    #'''
+    FinalJS = FinalJS + '''
+    function OnlyUnknownVerbs(){
+        AllButtons = ''' + str(allverbidslist) + '''
+        AllButtonsText = ''' + str(allverbslist) + '''
+        UnknownOutput = ""
+        iUnknownCount = 0
+        AllButtons.forEach(function(item){
+            if (document.getElementById('btn'+item).style.backgroundColor === ''){
+                UnknownOutput += AllButtonsText[item] + " "
+                iUnknownCount += 1
+            }
+            document.getElementById('myVerbProgress').value = ''' + str(iVerbCount) + ''' - iUnknownCount
+        })
+        document.getElementById('PD2').textContent = 'Only Unknwon words list: ' + UnknownOutput
+    }
+
+    </script>
+    '''
+
+    FinalHTML = FinalHTML + '''<br><hr><br>
+    <div id='PD2'> Only Unknown List</div>
+    \n
+    '''
+
+    FinalOutput = FinalHTML + FinalCSS + FinalJS
+    return FinalOutput, FinalOutput
+
+#Kathryn Lingel - Pyambic Pentameter Example - PyCon US
+#Basic Language Model Code
+def build_model(source_text):
+    list_of_words = source_text.split()
+    model = {} #initialise model to empty dictionary
+
+    for i, word in enumerate(list_of_words[:-1]): #every word except last word
+        if not word in model: #If word not already in dictionary as a key we add it and initialise to empty array
+            model[word] = []
+        next_word = list_of_words[i+1]
+        model[word].append(next_word) #model = dictionary per word containing previously seen next words from ANY given text ==> even lyrics
+
+    translatestring = str(model)
+    translatestring = translatestring.replace("'", "")
+    return model, translatestring
+
+def markov_generate(source_text, num_words = 20):
+    model = build_model(source_text)[0] #build_model returns (model, translatestring); keep just the dictionary so model.keys() below works
+    seed = random.choice(list(model.keys())) #Randomly pick a word ==> Heading of the dictionary are keys aka the words
+    output = [seed] #output initialisation using random word
+    for i in range(num_words):
+        last_word = output[-1] #of the output list
+        next_word = random.choice(model[last_word]) # next word to the above word
+        output.append(next_word) #new last word in the output list
+        if next_word not in model:
+            break
+
+    return ' '.join(output) #New list into a string aka (hopefully) sentence
+# print(markov_generate("I am the egg man they are the egg men I am the wallrus goo goo g' joob"))
+
+
 # Define the Gradio interface inputs and outputs for video split
 spvvideo_file_input = gr.File(label='Video File')
 spvsubtitle_file_input = gr.File(label='Subtitle File')
 spvdownload_output = gr.File(label='Download Segmented Files')
 
+Markovlength = gr.Number(value=30, label='Length of generation')
+
 
-groupinput_text = gr.
-groupoutput_text = gr.
+groupinput_text = gr.Textbox(lines=2, label="Enter a list of words")
+groupoutput_text = gr.Textbox(label="Grouped words")
 
 with gr.Blocks() as lliface:
     gr.HTML("<p> Target 1: Dual audio at word Level while using repitition to train random recall --> Word level Time <br> Target 2: Video --> Split by sentence --> each word repeated (60) + each phrase (10) + each sentence (10) --> TTS file for practice --> State Management/Known word Tracker <hr> The trick is minimum one minute of focus on a new word --> Listening is hard because there are new word within seconds and you need repeated focus on each to learn </p> <p>Audio = best long form attention mechanism AS it is ANTICIPATION (Awareness of something before it happens like knowing song Lyrics) FOCUSED - Attention (Focused Repitition) + Exposure (Random Repitition) </p>")
-    gr.HTML("""<hr> <a href="https://translate.google.com/?hl=en&tab=TT"> -- Google Translate -- </a> | <a href='https://huggingface.co/spaces/damo-vilab/modelscope-text-to-video-synthesis'> -- Modelscope Text to Video -- </a> | <a href='https://huggingface.co/spaces/stabilityai/stable-diffusion'> -- stable-diffusion 2 -- </a> | <a href='https://huggingface.co/spaces/stabilityai/stable-diffusion-1'> -- stable-diffusion 1 -- </a>""")
+    gr.HTML("""<hr> <a href="https://translate.google.com/?hl=en&tab=TT"> -- Google Translate -- </a> | <a href='https://huggingface.co/spaces/damo-vilab/modelscope-text-to-video-synthesis'> -- Modelscope Text to Video -- </a> | <a href='https://huggingface.co/spaces/stabilityai/stable-diffusion'> -- stable-diffusion 2 -- </a> | <a href='https://huggingface.co/spaces/stabilityai/stable-diffusion-1'> -- stable-diffusion 1 -- </a> | <a href='https://huggingface.co/spaces/kakaobrain/karlo'> -- karlo 1 -- </a>""")
     with gr.Tab("Welcome"):
         gr.HTML("""<p>Spaces Test - Still Undercontruction | Knowledge is a Language but productive knowledge is find replace as well | LingQ is good option for per word state management</p> <p> Arrows app json creator for easy knowledge graphing and spacy POS graph? --> Questions? -->
         <p> ChatGPT Turns Learning into a read only what you dont know ask only what you dont know feedback loop --> All you have to do is keep track of what prompts you have asked in the past</p> """)
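
A hedged usage sketch of the two new language-model helpers, reusing the sample text from the code's own commented example (and the tuple-unpacking fix noted above):

    model, translatestring = build_model("I am the egg man they are the egg men I am the wallrus")
    print(model["the"])  # every word seen after "the": ['egg', 'egg', 'wallrus']
    print(markov_generate("I am the egg man they are the egg men I am the wallrus", 10))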

@@ -482,9 +679,14 @@ with gr.Blocks() as lliface:
         gr.Interface(fn=group_words, inputs=groupinput_text, outputs=groupoutput_text, description="Word Grouping and Rotation - Group a list of words into sets of 10 and rotate them every 60 seconds.") #.queue()
         gr.HTML("""HTML Version <hr> <iframe height="1200" style="width: 100%;" scrolling="no" title="Memorisation Aid" src="https://codepen.io/kwabs22/embed/preview/GRXKQgj?default-tab=result&editable=true" frameborder="no" loading="lazy" allowtransparency="true" allowfullscreen="true">
         See the Pen <a href="https://codepen.io/kwabs22/pen/GRXKQgj"> Memorisation Aid</a> by kwabs22 (<a href="https://codepen.io/kwabs22">@kwabs22</a>) on <a href="https://codepen.io">CodePen</a>. </iframe>""")
+    with gr.Tab("Transition is the end goal"):
+        gr.HTML("Transition is the true nature of logic i.e. like some form of non-semantic embedding that is semantic?")
+        gr.Interface(fn=build_model, inputs="text", outputs=["text", "text"], description="Create Collocation Dictionary --> Google Kathryn Lingel - Pyambic Pentameter Example - PyCon US for more")
+        gr.Interface(fn=markov_generate, inputs=["text", Markovlength], outputs="text", description="Generate Text based on the collocations in the text")
     with gr.Tab("Unknown Tracker"):
         gr.HTML("Repitition of things you know is a waste of time when theres stuff you dont know <p> In Language the goal is bigger vocab --> Knowledge equivalent = question answer pairs but to get to those you need related information pairs</p> <p> Vocab = Glossary + all non text wall(lists, diagrams, etc.)</p>")
         gr.Textbox("Placeholder for a function that creates a set list and can takes a list for known words and auto find replaces the stuff you know out of the content")
+        gr.Interface(fn=UnknownTrackTexttoApp, inputs="text", outputs=["html", "text"], description="Use the text from here to create lists you use for the TTS section")
     with gr.Tab("Unique word ID - use in Infranodus"):
         gr.Interface(fn=unique_word_count, inputs="text", outputs="text", description="Wordcounter")
         gr.Interface(fn=SepHypandSynExpansion, inputs="text", outputs=["text", "text"], description="Word suggestions - Analyse the unique words in infranodus")
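
For reference, a hedged sketch of what the new Unknown Tracker interface receives from UnknownTrackTexttoApp (assuming the module-level spacy nlp pipeline used elsewhere in app.py is loaded; the input sentence is illustrative):

    html_app, html_source = UnknownTrackTexttoApp("The quick brown fox jumps over the lazy dog")
    # Both values are the same HTML+CSS+JS bundle; Gradio renders one as live HTML
    # and exposes the other as copyable text via outputs=["html", "text"]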

@@ -504,6 +706,7 @@ with gr.Blocks() as lliface:
         gr.HTML("""<a href="https://huggingface.co/spaces/pharma/CLIP-Interrogator"> --Huggingface CLIP-Interrogator Space-- </a><br> """)
         gr.Interface(fn=removeTonalMarks, inputs="text", outputs="text", description="For text with characters use this function to remove any conflicting characters (if error below)")
         gr.Interface(fn=add_text_to_image , inputs=["image", "text"], outputs="image", description="Create Annotated images (Can create using stable diffusion and use the prompt)")
+        gr.HTML("Use Shift Enter To put text on new lines if the text doesnt fit <hr>")
     #with gr.Tab("Transcribe - RASMUS Whisper"):
         #gr.Interface.load("spaces/RASMUS/Whisper-youtube-crosslingual-subtitles", title="Subtitles")
     with gr.Tab("Advanced - LingQ Addon Ideas"):

@@ -516,7 +719,7 @@ with gr.Blocks() as lliface:
         #gr.HTML("<p>If Space not loaded its because of offline devopment errors please message for edit</p> <hr>")
     with gr.Tab("Merged Subtitles"):
         gr.HTML("Step 1 - Word for Word Translation Creation in both Directions (Paste Google Translation here)")
-        gr.Interface(fn=split_srt_file, inputs=["text", SRTLangOptions] , outputs=["text", "text"], description="SRT Contents to W4W Split SRT for Google Translate")
+        gr.Interface(fn=split_srt_file, inputs=["text", SRTLangOptions] , outputs=["text", "file", "text", "text"], description="SRT Contents to W4W Split SRT for Google Translate")
         gr.HTML("Step 2 - Pronounciation (Roman) to Subtitle Format --> GTranslate returns unformatted string")
         gr.Interface(fn=splittext, inputs="text", outputs="text", description="Text for w4w creation in G Translate")
         gr.HTML("Step 3 - Merge into one file")