Spaces:
Sleeping
Sleeping
Alexis Palmer
committed on
Commit
•
29d61ac
1
Parent(s):
b965b6e
Scramble para Aymara, first version
Browse files- app.py +88 -0
- aymara.easy.filtered +71 -0
- util.py +79 -0
app.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
import util
import re
import random

### load and prepare corpus
# Alternative: load a whole directory of .txt files instead of one file.
#corpus = util.load_raw_text(corpus_directory="map_avenue")

corpus = util.load_single_raw_text_file("aymara.easy.filtered")

# The corpus is one word per line, so whitespace splitting is sufficient.
words = corpus.split()

# Deduplicate into the game lexicon (was a manual add-loop).
lexicon = set(words)

# Currently every word is kept; re-enable the commented filter to
# restrict the game to medium-length words.
filtered_lexicon = set(lexicon)
# filtered_lexicon = {word for word in lexicon if 4 <= len(word) <= 6}

# Report the lexicon size only — printing the full word list was debug noise.
print(len(filtered_lexicon))
def random_scramble(lexicon: set):
    """Pick a random word from *lexicon* and return it with its letters shuffled.

    Args:
        lexicon: non-empty set of candidate words.

    Returns:
        dict with keys 'shuffled' (the scrambled letters) and 'original'
        (the word the player must guess).
    """
    word = random.choice(list(lexicon))

    # Turn the word into a list of characters and shuffle them.
    word_chars = list(word)
    random.shuffle(word_chars)
    shuffled = ''.join(word_chars)

    # random.shuffle can produce the original ordering, which would show the
    # answer directly. Retry whenever a different ordering is possible
    # (i.e. the word has at least two distinct characters).
    while shuffled == word and len(set(word)) > 1:
        random.shuffle(word_chars)
        shuffled = ''.join(word_chars)

    return {'shuffled': shuffled, 'original': word}
def scrambler_game(current_word, guess: str):
    """
    If `guess` is the correct word, return 'Correct' and pick a new word.
    Otherwise, return 'Incorrect' and keep the current word.

    Args:
        current_word: dict with 'shuffled' and 'original' keys (the game state).
        guess: the player's answer from the textbox.

    Returns (correct_label, scrambled_word, current_word)
    """
    # Strip surrounding whitespace so stray spaces from typing or
    # copy/paste don't turn a right answer into a wrong one.
    if guess.strip() == current_word['original']:
        current_word = random_scramble(filtered_lexicon)
        return ('😀 ¡Correcto! 😀', current_word['shuffled'], current_word)
    else:
        return ('Incorrecto 😕', current_word['shuffled'], current_word)
def new_word():
    """Draw a fresh scrambled word and clear the result textbox.

    Returns (result_text, scrambled_word, state) matching the Gradio outputs.
    """
    state = random_scramble(filtered_lexicon)
    return ('', state['shuffled'], state)
with gr.Blocks(theme=gr.themes.Soft(), title="Aru Thaqana") as unscramble:
    # Per-session game state, seeded with an initial scrambled word.
    current_word = gr.State(random_scramble(filtered_lexicon))

    gr.Markdown("# Aru Thaqana")

    # The textbox's initial value is taken from the State created above.
    scrambled_textbox = gr.Textbox(
        label="Crucigrama",
        interactive=False,
        value=current_word.value['shuffled'],
    )

    guess_textbox = gr.Textbox(label="Adivinar - Adivina la palabra y luego aprieta en 'enviar'")
    guess_button = gr.Button(value="Enviar")

    new_word_button = gr.Button(value="Nueva Palabra")

    output_textbox = gr.Textbox(label="Resultado", interactive=False)

    # Both callbacks return (result_text, scrambled_word, state).
    guess_button.click(
        fn=scrambler_game,
        inputs=[current_word, guess_textbox],
        outputs=[output_textbox, scrambled_textbox, current_word],
    )
    new_word_button.click(
        fn=new_word,
        inputs=[],
        outputs=[output_textbox, scrambled_textbox, current_word],
    )

unscramble.launch(share=True)
aymara.easy.filtered
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
waka
|
3 |
+
kayunaka
|
4 |
+
paya
|
5 |
+
maya
|
6 |
+
qunqurinaka
|
7 |
+
ñik'uta
|
8 |
+
laka
|
9 |
+
kimsa
|
10 |
+
wank'u
|
11 |
+
kallachi
|
12 |
+
Uqi
|
13 |
+
tiyu
|
14 |
+
yuxch'a
|
15 |
+
anu
|
16 |
+
Aymara
|
17 |
+
papilanku
|
18 |
+
kusikusi
|
19 |
+
qaqilu
|
20 |
+
paqallqu
|
21 |
+
awki
|
22 |
+
ispillu
|
23 |
+
achaku
|
24 |
+
phisqa
|
25 |
+
puraka
|
26 |
+
laxra
|
27 |
+
chu'xña
|
28 |
+
kunka
|
29 |
+
chara
|
30 |
+
ampara
|
31 |
+
warmi
|
32 |
+
achila
|
33 |
+
tullqa
|
34 |
+
jamach'i
|
35 |
+
patu
|
36 |
+
puma
|
37 |
+
mallku
|
38 |
+
p'iqi
|
39 |
+
nayra
|
40 |
+
tunka
|
41 |
+
Jinchu
|
42 |
+
kimsaqallqu
|
43 |
+
Ch'umpi
|
44 |
+
larama
|
45 |
+
wallpa
|
46 |
+
tayka
|
47 |
+
awch'i
|
48 |
+
Ch'ara
|
49 |
+
Quri
|
50 |
+
pusi
|
51 |
+
phisi
|
52 |
+
khuchi
|
53 |
+
mujlli
|
54 |
+
asiru
|
55 |
+
kullaka
|
56 |
+
Janq'u
|
57 |
+
jilata
|
58 |
+
suxta
|
59 |
+
ch'usa
|
60 |
+
Ch'uri
|
61 |
+
jikhani
|
62 |
+
awicha
|
63 |
+
chacha
|
64 |
+
uwija
|
65 |
+
nasa
|
66 |
+
ñuñu
|
67 |
+
k'ulli
|
68 |
+
tiya
|
69 |
+
chawlla
|
70 |
+
wila
|
71 |
+
q'illu
|
util.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import unicodedata
|
4 |
+
|
5 |
+
def strip_accents(text: str) -> str:
    """Removes accents from text."""
    # Decompose precomposed characters (e.g. é -> e + combining accent),
    # then drop every combining mark (Unicode category 'Mn').
    decomposed = unicodedata.normalize('NFD', text)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
def load_raw_text(corpus_directory: str, file_names=None) -> str:
    """Loads all the text files in a directory into one large string.

    Args:
        corpus_directory: directory to scan for .txt files.
        file_names: optional collection of file names; when given, only
            those files are read (this parameter was previously ignored).

    Returns:
        The concatenated file contents, with a newline after each file.
    """
    pieces = []

    for file_name in os.listdir(corpus_directory):
        # Honour an explicit file list when the caller provides one.
        if file_names is not None and file_name not in file_names:
            continue

        file_path = os.path.join(corpus_directory, file_name)
        if os.path.isdir(file_path):
            continue

        # Only read text files — check the suffix, not "anywhere in the name".
        if not file_name.endswith(".txt"):
            continue

        # Explicit encoding: corpora contain non-ASCII characters, so
        # don't rely on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            pieces.append(file.read() + "\n")

    # join() instead of repeated += (quadratic string building).
    return "".join(pieces)
def load_single_raw_text_file(file_name):
    """Loads a single text file into one large string.

    Args:
        file_name: path of the text file to read.

    Returns:
        The file contents with a trailing newline appended.
    """
    # Explicit encoding: the Aymara corpus contains characters such as
    # ñ and ' that break under non-UTF-8 platform defaults.
    with open(file_name, 'r', encoding='utf-8') as file:
        corpus = file.read() + "\n"

    return corpus
# Word characters plus apostrophe (Aymara uses ' for ejective consonants).
# NOTE: the previous pattern r"[\w|\']+" put a literal '|' inside the
# character class, so pipe characters were wrongly treated as word letters.
word_regex = r"[\w']+"


def tokenize(text):
    """Split *text* into word tokens (word characters and apostrophes)."""
    return re.findall(word_regex, text)
def preprocess(text):
    """Tokenizes and processes text which is already separated by spaces into words. Designed for English punctuation.

    Strips accents, lowercases, splits on single spaces, and keeps tokens
    that start with a word/apostrophe character or an end punctuation mark.
    """
    text = strip_accents(text)
    text = text.lower()

    tokens = text.split(" ")

    tokens_filtered = []
    for token in tokens:
        # Skip any tokens with special characters.
        # Fixed: the old pattern had literal '|' inside its character
        # classes, so pipes counted as word characters.
        # NOTE(review): re.match only anchors at the start, so a token such
        # as "word?!" still passes; switch to re.fullmatch if tokens with
        # trailing special characters should really be skipped — confirm
        # intended behavior first.
        if re.match(r"[\w']+|[.,?!]", token):
            tokens_filtered.append(token)
    return tokens_filtered
def pad(text: list, num_padding: int):
    """Pads the given text, as a list of strings, with <s> characters between sentences."""
    # Reusable run of <s> sentinel tokens.
    sentinel = ["<s>"] * num_padding

    # Initial padding before the first sentence.
    padded_text = list(sentinel)

    for token in text:
        padded_text.append(token)

        # Every end punctuation mark is followed by a fresh run of <s>.
        # REPLACE IF YOUR LANGUAGE USES DIFFERENT END PUNCTUATION
        if token in (".", "?", "!"):
            padded_text.extend(sentinel)

    return padded_text