Alexis Palmer committed on
Commit
ca2ea21
1 Parent(s): 39623cd

Scramble para Quechua, first version

Browse files
Files changed (3) hide show
  1. app.py +88 -0
  2. quechua.easy.filtered +87 -0
  3. util.py +79 -0
app.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import util
3
+ import re
4
+ import random
5
+
6
### load and prepare corpus
corpus = util.load_single_raw_text_file("quechua.easy.filtered")

# The corpus is a newline-separated word list, so whitespace splitting is
# the whole tokenization step.
words = corpus.split()

# Deduplicate into the game's lexicon.
lexicon = set(words)

# No length filtering is currently applied; kept as a separate set so a
# filter (e.g. 4 <= len(word) <= 6) can be reinstated without touching the
# game logic below.
filtered_lexicon = set(lexicon)

print(len(filtered_lexicon))
33
+
34
def random_scramble(lexicon: set):
    """Draw a random word from `lexicon` and scramble its letters.

    Returns a dict with keys 'shuffled' (the scrambled form shown to the
    player) and 'original' (the word that was drawn).
    """
    pool = list(lexicon)
    chosen = random.choice(pool)

    # Shuffle the word's characters and glue them back together.
    letters = list(chosen)
    random.shuffle(letters)
    scrambled = ''.join(letters)

    return {'shuffled': scrambled, 'original': chosen}
49
+
50
+
51
+
52
def scrambler_game(current_word, guess: str):
    """Check `guess` against the word currently in play.

    On a correct guess, draws a fresh scrambled word and reports success;
    otherwise keeps the current word and reports failure.
    Returns (result_label, scrambled_word, current_word_state).
    """
    if guess != current_word['original']:
        # Wrong guess: state is unchanged, let the player try again.
        return ('Incorrecto 😕', current_word['shuffled'], current_word)

    # Correct guess: move on to a new word.
    next_word = random_scramble(filtered_lexicon)
    return ('😀 ¡Correcto! 😀', next_word['shuffled'], next_word)
62
+
63
+
64
def new_word():
    """Draw a fresh scrambled word and clear the result textbox.

    Returns ('', scrambled_word, current_word_state) — the empty string
    wipes any previous Correct/Incorrect message.
    """
    drawn = random_scramble(filtered_lexicon)
    return ('', drawn['shuffled'], drawn)
67
+
68
+
69
# Gradio UI: a word-unscrambling game ("Simita Tariy") over the Quechua lexicon.
with gr.Blocks(theme=gr.themes.Soft(), title="Simita Tariy") as unscramble:
    # Per-session game state: dict with 'shuffled' and 'original' keys,
    # seeded with an initial random word.
    current_word = gr.State(random_scramble(filtered_lexicon))

    gr.Markdown("# Simita Tariy")

    # Read-only display of the scrambled word; initial value comes from the
    # State's seed value.
    scrambled_textbox = gr.Textbox(label="Crucigrama", interactive=False, value=current_word.value['shuffled'])

    # Player input plus submit button.
    guess_textbox = gr.Textbox(label="Adivinar - Adivina la palabra y luego aprieta en 'enviar'")
    guess_button = gr.Button(value="Enviar")

    # Skip to a different word without guessing.
    new_word_button = gr.Button(value="Nueva Palabra")

    # Shows Correcto/Incorrecto feedback.
    output_textbox = gr.Textbox(label="Resultado", interactive=False)

    # Both handlers return (result_label, scrambled_word, state) in the same
    # order as the outputs list.
    guess_button.click(fn=scrambler_game, inputs=[current_word, guess_textbox], outputs=[output_textbox, scrambled_textbox, current_word])
    new_word_button.click(fn=new_word, inputs=[], outputs=[output_textbox, scrambled_textbox, current_word])

# share=True exposes a public gradio.live URL in addition to the local server.
unscramble.launch(share=True)
quechua.easy.filtered ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ allqu
3
+ chisha
4
+ puma
5
+ kimsa
6
+ ñaña
7
+ machu
8
+ llumchuy
9
+ ususi
10
+ kundur
11
+ waqta
12
+ chukcha
13
+ waaka
14
+ Usha
15
+ qasqu
16
+ huq
17
+ wiksa
18
+ Pishqu
19
+ wachwa
20
+ mulla
21
+ tayta
22
+ qillayyashqa
23
+ uma
24
+ mama
25
+ chanka
26
+ chusaq
27
+ maki
28
+ churi
29
+ Masha
30
+ turi
31
+ bukya
32
+ yana
33
+ qallu
34
+ kiru
35
+ qanchis
36
+ iskay
37
+ qusa
38
+ mishi
39
+ ucush
40
+ uturunku
41
+ qunqur
42
+ kuchi
43
+ sinqa
44
+ chusku
45
+ Chiwa
46
+ yuraq
47
+ Quechua
48
+ kucha
49
+ ñawi
50
+ Kunka
51
+ wanquyru
52
+ uqi
53
+ kukuchi
54
+ chipi
55
+ mamaku
56
+ rinri
57
+ willka
58
+ kulli
59
+ ipa
60
+ challwa
61
+ pillpis
62
+ anqas
63
+ chumpi
64
+ killmu
65
+ Amaru
66
+ hatun-uywa
67
+ yayawki
68
+ wirpa
69
+ qillu
70
+ pichqa
71
+ piwi
72
+ warmi
73
+ siwar
74
+ rikra
75
+ simi
76
+ haway
77
+ puka
78
+ suqta
79
+ Llumchuy
80
+ qumir
81
+ isqun
82
+ chaki
83
+ chunka
84
+ qurichasqa
85
+ Charapa
86
+ pachka
87
+ pusaq
util.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import unicodedata
4
+
5
def strip_accents(text: str) -> str:
    """Return `text` with combining accent marks removed.

    Decomposes to NFD so accents become separate combining characters
    (category 'Mn'), then drops those characters.
    """
    decomposed = unicodedata.normalize('NFD', text)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
9
+
10
+
11
def load_raw_text(corpus_directory: str, file_names=None) -> str:
    """Load the ".txt" files in a directory into one large string.

    Args:
        corpus_directory: Directory to read from.
        file_names: Optional collection of file names to restrict the load to.
            (Previously this parameter was accepted but silently ignored.)

    Returns:
        The contents of every selected ".txt" file, each followed by "\n".
    """
    corpus = ""

    for file_name in os.listdir(corpus_directory):
        # Honor the optional whitelist of file names.
        if file_names is not None and file_name not in file_names:
            continue

        file_path = os.path.join(corpus_directory, file_name)
        if os.path.isdir(file_path):
            continue

        # endswith, not substring: the old check also matched "a.txt.bak".
        if not file_name.endswith(".txt"):
            continue

        # Explicit UTF-8 so results don't depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            corpus += file.read() + "\n"
    return corpus
29
+
30
def load_single_raw_text_file(file_name):
    """Load a single text file into one large string (with trailing "\n").

    Opens with explicit UTF-8 so accented characters (the Quechua corpus
    contains "ñ") load correctly regardless of the platform's default
    encoding — the old version used the locale default and could fail or
    mangle text on e.g. Windows/cp1252.
    """
    with open(file_name, 'r', encoding='utf-8') as file:
        return file.read() + "\n"
39
+
40
+
41
# NOTE: inside a character class "|" is a literal pipe, not alternation, so
# the previous pattern r"[\w|\']+" wrongly treated "|" as a word character
# (e.g. tokenize("a|b") returned ["a|b"]).
word_regex = r"[\w']+"
def tokenize(text):
    """Split `text` into word tokens: runs of letters, digits, underscores,
    and apostrophes (apostrophes keep contractions like "don't" intact)."""
    return re.findall(word_regex, text)
44
+
45
+
46
def preprocess(text):
    """Tokenize space-separated text, lowercased and accent-stripped.

    Keeps tokens that are words (optionally with one attached trailing
    punctuation mark, e.g. "world.") or standalone punctuation, and drops
    tokens containing other special characters. Designed for English
    punctuation.
    """
    text = strip_accents(text)
    text = text.lower()

    tokens_filtered = []
    for token in text.split(" "):
        # fullmatch, not match: re.match only anchors at the start, so the
        # old check r"[\w|\']+|[\.|\,|\?|\!]" let through tokens with
        # embedded special characters such as "ab@cd".
        if re.fullmatch(r"[\w']+[.,?!]?|[.,?!]", token):
            tokens_filtered.append(token)
    return tokens_filtered
59
+
60
+
61
def pad(text: list, num_padding: int):
    """Insert `num_padding` "<s>" tokens before the first sentence and after
    every sentence-final punctuation mark.

    REPLACE the punctuation set below if your language uses different end
    punctuation.
    """
    sentinel = ["<s>"] * num_padding

    # Initial padding before the first sentence.
    padded_text = list(sentinel)

    for word in text:
        padded_text.append(word)
        # After each sentence-ending mark, pad again for the next sentence.
        if word in (".", "?", "!"):
            padded_text.extend(sentinel)

    return padded_text