Commit ca2ea21
Parent(s): 39623cd
Alexis Palmer committed

Scramble for Quechua, first version

Files changed:
- app.py                 +88 -0
- quechua.easy.filtered  +87 -0
- util.py                +79 -0
app.py
ADDED
@@ -0,0 +1,88 @@
import gradio as gr
import util
import re
import random

### load and prepare corpus
#corpus = util.load_raw_text(corpus_directory="map_avenue")

corpus = util.load_single_raw_text_file("quechua.easy.filtered")

#corpus = corpus.lower()
#word_regex = r"[a-z]+"
#def tokenize(text: str):
#    return re.findall(word_regex, text)

#words = tokenize(corpus)
words = corpus.split()
print(words)


lexicon = set()
for word in words:
    lexicon.add(word)

filtered_lexicon = set()

for word in lexicon:
    filtered_lexicon.add(word)
    # if 4 <= len(word) <= 6:
    #     filtered_lexicon.add(word)

print(len(filtered_lexicon))

def random_scramble(lexicon: set):
    lexicon = list(lexicon)

    word = random.choice(lexicon)

    # Turn the word into a list of characters
    word_chars = list(word)

    # Shuffle those characters
    random.shuffle(word_chars)

    # Re-join the characters into a string
    shuffled = ''.join(word_chars)

    return {'shuffled': shuffled, 'original': word}


def scrambler_game(current_word, guess: str):
    """
    If `guess` is the correct word, return 'Correct' and pick a new word. Otherwise, return 'Incorrect'.
    Returns (correct_label, scrambled_word, current_word)
    """
    if guess == current_word['original']:
        current_word = random_scramble(filtered_lexicon)
        return ('😀 ¡Correcto! 😀', current_word['shuffled'], current_word)
    else:
        return ('Incorrecto 😕', current_word['shuffled'], current_word)


def new_word():
    current_word = random_scramble(filtered_lexicon)
    return ('', current_word['shuffled'], current_word)


with gr.Blocks(theme=gr.themes.Soft(), title="Simita Tariy") as unscramble:
    # Start with some initial word
    current_word = gr.State(random_scramble(filtered_lexicon))

    gr.Markdown("# Simita Tariy")

    # Notice how we set the initial value based on the State
    scrambled_textbox = gr.Textbox(label="Crucigrama", interactive=False, value=current_word.value['shuffled'])

    guess_textbox = gr.Textbox(label="Adivinar - Adivina la palabra y luego aprieta en 'enviar'")
    guess_button = gr.Button(value="Enviar")

    new_word_button = gr.Button(value="Nueva Palabra")

    output_textbox = gr.Textbox(label="Resultado", interactive=False)

    guess_button.click(fn=scrambler_game, inputs=[current_word, guess_textbox], outputs=[output_textbox, scrambled_textbox, current_word])
    new_word_button.click(fn=new_word, inputs=[], outputs=[output_textbox, scrambled_textbox, current_word])

unscramble.launch(share=True)
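Note: the length filter in app.py is left commented out, so every word from quechua.easy.filtered ends up in the puzzle. A minimal standalone sketch of the filter plus the scramble step, separate from the commit itself (the sample lexicon and the 4-6 character bounds are illustrative assumptions):

import random

# Hypothetical sample lexicon; the app builds this set from quechua.easy.filtered.
lexicon = {"allqu", "puma", "kimsa", "wanquyru", "chukcha"}

# Keep only words of a comfortable puzzle length (the bounds are an assumption).
filtered_lexicon = {word for word in lexicon if 4 <= len(word) <= 6}

def random_scramble(lexicon: set):
    """Pick a random word and return it alongside a shuffled copy of its letters."""
    word = random.choice(list(lexicon))
    chars = list(word)
    random.shuffle(chars)
    return {"shuffled": "".join(chars), "original": word}

print(random_scramble(filtered_lexicon))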
quechua.easy.filtered
ADDED
@@ -0,0 +1,87 @@
allqu
chisha
puma
kimsa
ñaña
machu
llumchuy
ususi
kundur
waqta
chukcha
waaka
Usha
qasqu
huq
wiksa
Pishqu
wachwa
mulla
tayta
qillayyashqa
uma
mama
chanka
chusaq
maki
churi
Masha
turi
bukya
yana
qallu
kiru
qanchis
iskay
qusa
mishi
ucush
uturunku
qunqur
kuchi
sinqa
chusku
Chiwa
yuraq
Quechua
kucha
ñawi
Kunka
wanquyru
uqi
kukuchi
chipi
mamaku
rinri
willka
kulli
ipa
challwa
pillpis
anqas
chumpi
killmu
Amaru
hatun-uywa
yayawki
wirpa
qillu
pichqa
piwi
warmi
siwar
rikra
simi
haway
puka
suqta
Llumchuy
qumir
isqun
chaki
chunka
qurichasqa
Charapa
pachka
pusaq
util.py
ADDED
@@ -0,0 +1,79 @@
import os
import re
import unicodedata

def strip_accents(text: str) -> str:
    """Removes accents from text."""
    return ''.join(c for c in unicodedata.normalize('NFD', text)
                   if unicodedata.category(c) != 'Mn')


def load_raw_text(corpus_directory: str, file_names=None) -> str:
    """Loads all the text files in a directory into one large string"""
    corpus = ""

    for file_name in os.listdir(corpus_directory):
        # Read the file as a string
        file_path = os.path.join(corpus_directory, file_name)
        if os.path.isdir(file_path):
            continue

        # Make sure we only read text files
        if ".txt" not in file_name:
            continue

        with open(file_path, 'r') as file:
            file_contents = file.read()
            corpus += (file_contents + "\n")
    return corpus

def load_single_raw_text_file(file_name):
    """Loads a single text file into one large string"""

    corpus = ""
    with open(file_name, 'r') as file:
        file_contents = file.read()
        corpus += (file_contents + "\n")

    return corpus


word_regex = r"[\w|\']+"
def tokenize(text):
    return re.findall(word_regex, text)


def preprocess(text):
    """Tokenizes and processes text which is already separated by spaces into words. Designed for English punctuation."""
    text = strip_accents(text)
    text = text.lower()

    tokens = text.split(" ")

    tokens_filtered = []
    for token in tokens:
        # Skip any tokens with special characters
        if re.match(r"[\w|\']+|[\.|\,|\?|\!]", token):
            tokens_filtered.append(token)
    return tokens_filtered


def pad(text: list, num_padding: int):
    """Pads the given text, as a list of strings, with <s> characters between sentences."""
    padded_text = []

    # Add initial padding to the first sentence
    for _ in range(num_padding):
        padded_text.append("<s>")

    for word in text:
        padded_text.append(word)

        # Every time we see an end punctuation mark, add <s> tokens before it
        # REPLACE IF YOUR LANGUAGE USES DIFFERENT END PUNCTUATION
        if word in [".", "?", "!"]:
            for _ in range(num_padding):
                padded_text.append("<s>")

    return padded_text
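For reference, a short usage sketch of these helpers; the Quechua sentence and the hand-built token list are made-up examples, and num_padding=2 is an arbitrary illustrative choice:

import util

# tokenize() keeps runs of word characters (and apostrophes), dropping end punctuation.
print(util.tokenize("Allquqa wasipi kachkan."))
# ['Allquqa', 'wasipi', 'kachkan']

# pad() adds <s> markers at the start and after each sentence-final ".", "?", or "!".
tokens = ["allqu", "puka", ".", "puma", "hatun", "."]
print(util.pad(tokens, num_padding=2))
# ['<s>', '<s>', 'allqu', 'puka', '.', '<s>', '<s>', 'puma', 'hatun', '.', '<s>', '<s>']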