HeshamHaroon committed on
Commit 61a3b2f
1 Parent(s): 7db01f9

Update app.py

Files changed (1)
  1. app.py +31 -21
app.py CHANGED
@@ -1,30 +1,40 @@
-from gradio import inputs, outputs, Interface
 import aranizer
-from aranizer import aranizer_bpe32k, aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
-# Load all available tokenizers
-tokenizers = {
-    "aranizer_bpe32k": aranizer.aranizer_bpe32k.get_tokenizer(),
-    "aranizer_bpe50k": aranizer.aranizer_bpe50k.get_tokenizer(),
-    "aranizer_bpe64k": aranizer.aranizer_bpe64k.get_tokenizer(),
-    "aranizer_bpe86k": aranizer.aranizer_bpe86k.get_tokenizer(),
-    "aranizer_sp32k": aranizer.aranizer_sp32k.get_tokenizer(),
-    "aranizer_sp50k": aranizer.aranizer_sp50k.get_tokenizer(),
-    "aranizer_sp64k": aranizer.aranizer_sp64k.get_tokenizer(),
-    "aranizer_sp86k": aranizer.aranizer_sp86k.get_tokenizer(),
-}
 
 def compare_tokenizers(text):
     results = []
     for name, tokenizer in tokenizers.items():
-        tokens = tokenizer.tokenize(text)
-        encoded_output = tokenizer.encode(text, add_special_tokens=True)
-        decoded_text = tokenizer.decode(encoded_output)
-        results.append((name, tokens, encoded_output, decoded_text))
     return results
 
-inputs = inputs.Textbox(label="Enter Arabic text")
-outputs = outputs.Table(label="Results", columns=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"])
 
-iface = Interface(fn=compare_tokenizers, inputs=inputs, outputs=outputs)
 
-iface.launch()
+import gradio as gr
 import aranizer
+
+# Function to dynamically load tokenizers
+def load_tokenizers():
+    tokenizer_methods = [
+        "aranizer_bpe32k", "aranizer_bpe50k", "aranizer_bpe64k", "aranizer_bpe86k",
+        "aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k"
+    ]
+    tokenizers = {}
+    for method in tokenizer_methods:
+        try:
+            # Each tokenizer lives in a submodule of the same name,
+            # e.g. aranizer.aranizer_bpe32k, which exposes get_tokenizer()
+            tokenizers[method] = getattr(aranizer, method).get_tokenizer()
+        except AttributeError as e:
+            print(f"Error loading tokenizer {method}: {e}")
+    return tokenizers
+
+# Load all available tokenizers
+tokenizers = load_tokenizers()
 
 def compare_tokenizers(text):
     results = []
     for name, tokenizer in tokenizers.items():
+        try:
+            tokens = tokenizer.tokenize(text)
+            encoded_output = tokenizer.encode(text, add_special_tokens=True)
+            decoded_text = tokenizer.decode(encoded_output)
+            # Stringify lists so each table cell renders cleanly
+            results.append((name, " ".join(tokens), str(encoded_output), decoded_text))
+        except Exception as e:
+            results.append((name, f"Error: {e}", "", ""))
     return results
 
+# Build the UI with top-level components (gr.inputs/gr.outputs are deprecated)
+inputs_component = gr.Textbox(lines=2, placeholder="Enter Arabic text here...", label="Input Text")
+outputs_component = gr.Dataframe(label="Results", headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"])
 
+iface = gr.Interface(fn=compare_tokenizers, inputs=inputs_component, outputs=outputs_component, title="Tokenizer Comparison Tool")
 
+iface.launch()
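
For a quick sanity check of the loading pattern outside the Gradio UI, here is a minimal sketch. It assumes, as the code above does, that each aranizer submodule (e.g. aranizer.aranizer_bpe32k) exposes a get_tokenizer() factory; the sample Arabic string is purely illustrative.

```python
# Minimal sketch: exercise one tokenizer without launching the Gradio app.
# Assumes the aranizer layout used above: aranizer.<name>.get_tokenizer().
import aranizer

tokenizer = aranizer.aranizer_bpe32k.get_tokenizer()

text = "مرحبا بالعالم"  # illustrative sample ("Hello, world")
tokens = tokenizer.tokenize(text)                      # token strings
ids = tokenizer.encode(text, add_special_tokens=True)  # integer ids
roundtrip = tokenizer.decode(ids)                      # back to text

print(tokens)
print(ids)
print(roundtrip)  # should closely match the input
```

Running a check like this before wiring the function into gr.Interface makes it easier to tell tokenizer-loading failures apart from UI-level errors.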