Commit ca2ea21
Parent(s): 39623cd
Alexis Palmer committed

Scramble for Quechua, first version

Files changed:
- app.py                 +88 -0
- quechua.easy.filtered  +87 -0
- util.py                +79 -0
app.py
ADDED
@@ -0,0 +1,88 @@
import gradio as gr
import util
import re
import random

### load and prepare corpus
#corpus = util.load_raw_text(corpus_directory="map_avenue")

corpus = util.load_single_raw_text_file("quechua.easy.filtered")

#corpus = corpus.lower()
#word_regex = r"[a-z]+"
#def tokenize(text: str):
#    return re.findall(word_regex, text)

#words = tokenize(corpus)
words = corpus.split()
print(words)


lexicon = set()
for word in words:
    lexicon.add(word)

filtered_lexicon = set()

for word in lexicon:
    filtered_lexicon.add(word)
    # if 4 <= len(word) <= 6:
    #     filtered_lexicon.add(word)

print(len(filtered_lexicon))

def random_scramble(lexicon: set):
    lexicon = list(lexicon)

    word = random.choice(lexicon)

    # Turn the word into a list of characters
    word_chars = list(word)

    # Shuffle those characters
    random.shuffle(word_chars)

    # Re-join the characters into a string
    shuffled = ''.join(word_chars)

    return {'shuffled': shuffled, 'original': word}


def scrambler_game(current_word, guess: str):
    """
    If `guess` is the correct word, return 'Correct' and pick a new word. Otherwise, return 'Incorrect'.
    Returns (correct_label, scrambled_word, current_word)
    """
    if guess == current_word['original']:
        current_word = random_scramble(filtered_lexicon)
        return ('😀 ¡Correcto! 😀', current_word['shuffled'], current_word)
    else:
        return ('Incorrecto 😕', current_word['shuffled'], current_word)


def new_word():
    current_word = random_scramble(filtered_lexicon)
    return ('', current_word['shuffled'], current_word)


with gr.Blocks(theme=gr.themes.Soft(), title="Simita Tariy") as unscramble:
    # Start with some initial word
    current_word = gr.State(random_scramble(filtered_lexicon))

    gr.Markdown("# Simita Tariy")

    # Notice how we set the initial value based on the State
    scrambled_textbox = gr.Textbox(label="Crucigrama", interactive=False, value=current_word.value['shuffled'])

    guess_textbox = gr.Textbox(label="Adivinar - Adivina la palabra y luego aprieta en 'enviar'")
    guess_button = gr.Button(value="Enviar")

    new_word_button = gr.Button(value="Nueva Palabra")

    output_textbox = gr.Textbox(label="Resultado", interactive=False)

    guess_button.click(fn=scrambler_game, inputs=[current_word, guess_textbox], outputs=[output_textbox, scrambled_textbox, current_word])
    new_word_button.click(fn=new_word, inputs=[], outputs=[output_textbox, scrambled_textbox, current_word])

unscramble.launch(share=True)
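Note: the length filter in app.py is left commented out, so every word from quechua.easy.filtered ends up in the puzzle. A minimal standalone sketch of the filter plus the scramble step, separate from the commit itself (the sample lexicon and the 4-6 character bounds are illustrative assumptions):

import random

# Hypothetical sample lexicon; the app builds this set from quechua.easy.filtered.
lexicon = {"allqu", "puma", "kimsa", "wanquyru", "chukcha"}

# Keep only words of a comfortable puzzle length (the bounds are an assumption).
filtered_lexicon = {word for word in lexicon if 4 <= len(word) <= 6}

def random_scramble(lexicon: set):
    """Pick a random word and return it alongside a shuffled copy of its letters."""
    word = random.choice(list(lexicon))
    chars = list(word)
    random.shuffle(chars)
    return {"shuffled": "".join(chars), "original": word}

print(random_scramble(filtered_lexicon))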
quechua.easy.filtered
ADDED
@@ -0,0 +1,87 @@
allqu
chisha
puma
kimsa
ñaña
machu
llumchuy
ususi
kundur
waqta
chukcha
waaka
Usha
qasqu
huq
wiksa
Pishqu
wachwa
mulla
tayta
qillayyashqa
uma
mama
chanka
chusaq
maki
churi
Masha
turi
bukya
yana
qallu
kiru
qanchis
iskay
qusa
mishi
ucush
uturunku
qunqur
kuchi
sinqa
chusku
Chiwa
yuraq
Quechua
kucha
ñawi
Kunka
wanquyru
uqi
kukuchi
chipi
mamaku
rinri
willka
kulli
ipa
challwa
pillpis
anqas
chumpi
killmu
Amaru
hatun-uywa
yayawki
wirpa
qillu
pichqa
piwi
warmi
siwar
rikra
simi
haway
puka
suqta
Llumchuy
qumir
isqun
chaki
chunka
qurichasqa
Charapa
pachka
pusaq
util.py
ADDED
@@ -0,0 +1,79 @@
import os
import re
import unicodedata

def strip_accents(text: str) -> str:
    """Removes accents from text."""
    return ''.join(c for c in unicodedata.normalize('NFD', text)
                   if unicodedata.category(c) != 'Mn')


def load_raw_text(corpus_directory: str, file_names=None) -> str:
    """Loads all the text files in a directory into one large string"""
    corpus = ""

    for file_name in os.listdir(corpus_directory):
        # Read the file as a string
        file_path = os.path.join(corpus_directory, file_name)
        if os.path.isdir(file_path):
            continue

        # Make sure we only read text files
        if ".txt" not in file_name:
            continue

        with open(file_path, 'r') as file:
            file_contents = file.read()
            corpus += (file_contents + "\n")
    return corpus

def load_single_raw_text_file(file_name):
    """Loads a single text file into one large string"""

    corpus = ""
    with open(file_name, 'r') as file:
        file_contents = file.read()
        corpus += (file_contents + "\n")

    return corpus


word_regex = r"[\w|\']+"
def tokenize(text):
    return re.findall(word_regex, text)


def preprocess(text):
    """Tokenizes and processes text which is already separated by spaces into words. Designed for English punctuation."""
    text = strip_accents(text)
    text = text.lower()

    tokens = text.split(" ")

    tokens_filtered = []
    for token in tokens:
        # Skip any tokens with special characters
        if re.match(r"[\w|\']+|[\.|\,|\?|\!]", token):
            tokens_filtered.append(token)
    return tokens_filtered


def pad(text: list, num_padding: int):
    """Pads the given text, as a list of strings, with <s> characters between sentences."""
    padded_text = []

    # Add initial padding to the first sentence
    for _ in range(num_padding):
        padded_text.append("<s>")

    for word in text:
        padded_text.append(word)

        # Every time we see an end punctuation mark, add <s> tokens before it
        # REPLACE IF YOUR LANGUAGE USES DIFFERENT END PUNCTUATION
        if word in [".", "?", "!"]:
            for _ in range(num_padding):
                padded_text.append("<s>")

    return padded_text
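For reference, a short usage sketch of these helpers; the Quechua sentence and the hand-built token list are made-up examples, and num_padding=2 is an arbitrary illustrative choice:

import util

# tokenize() keeps runs of word characters (and apostrophes), dropping end punctuation.
print(util.tokenize("Allquqa wasipi kachkan."))
# ['Allquqa', 'wasipi', 'kachkan']

# pad() adds <s> markers at the start and after each sentence-final ".", "?", or "!".
tokens = ["allqu", "puka", ".", "puma", "hatun", "."]
print(util.pad(tokens, num_padding=2))
# ['<s>', '<s>', 'allqu', 'puka', '.', '<s>', '<s>', 'puma', 'hatun', '.', '<s>', '<s>']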