HeshamHaroon committed on
Commit: 66b3df6
1 Parent(s): 61a3b2f

Update app.py

Files changed (1):
  app.py  (+21 -28)
app.py CHANGED
@@ -1,40 +1,33 @@
+from gradio import Interface
 import gradio as gr
 import aranizer
+from aranizer import aranizer_bpe32k, aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
 
-# Function to dynamically load tokenizers
-def load_tokenizers():
-    tokenizer_methods = [
-        "aranizer_bpe32k", "aranizer_bpe50k", "aranizer_bpe64k", "aranizer_bpe86k",
-        "aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k"
-    ]
-    tokenizers = {}
-    for method in tokenizer_methods:
-        module, function = method.rsplit("_", 1)
-        try:
-            tokenizer = getattr(getattr(aranizer, module), f'get_tokenizer_{function}')()
-            tokenizers[method] = tokenizer
-        except AttributeError as e:
-            print(f"Error loading tokenizer {method}: {e}")
-    return tokenizers
-
-# Now, let's load all available tokenizers
-tokenizers = load_tokenizers()
+# Load all available tokenizers
+tokenizers = {
+    "aranizer_bpe32k": aranizer.aranizer_bpe32k.get_tokenizer(),
+    "aranizer_bpe50k": aranizer.aranizer_bpe50k.get_tokenizer(),
+    "aranizer_bpe64k": aranizer.aranizer_bpe64k.get_tokenizer(),
+    "aranizer_bpe86k": aranizer.aranizer_bpe86k.get_tokenizer(),
+    "aranizer_sp32k": aranizer.aranizer_sp32k.get_tokenizer(),
+    "aranizer_sp50k": aranizer.aranizer_sp50k.get_tokenizer(),
+    "aranizer_sp64k": aranizer.aranizer_sp64k.get_tokenizer(),
+    "aranizer_sp86k": aranizer.aranizer_sp86k.get_tokenizer(),
+}
 
 def compare_tokenizers(text):
     results = []
     for name, tokenizer in tokenizers.items():
-        try:
-            tokens = tokenizer.tokenize(text)
-            encoded_output = tokenizer.encode(text, add_special_tokens=True)
-            decoded_text = tokenizer.decode(encoded_output)
-            results.append((name, tokens, encoded_output, decoded_text))
-        except Exception as e:
-            results.append((name, f"Error: {e}", "", ""))
+        tokens = tokenizer.tokenize(text)
+        encoded_output = tokenizer.encode(text, add_special_tokens=True)
+        decoded_text = tokenizer.decode(encoded_output)
+        results.append((name, tokens, encoded_output, decoded_text))
     return results
 
-inputs_component = gr.inputs.Textbox(lines=2, placeholder="Enter Arabic text here...", label="Input Text")
-outputs_component = gr.outputs.Table(label="Results", headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"])
+# Define the Gradio interface components properly based on the Gradio API
+inputs_component = gr.components.Textbox(lines=2, placeholder="Enter Arabic text here...", label="Input Text")
+outputs_component = gr.components.Table(label="Results", headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"])
 
-iface = gr.Interface(fn=compare_tokenizers, inputs=inputs_component, outputs=outputs_component, title="Tokenizer Comparison Tool")
+iface = Interface(fn=compare_tokenizers, inputs=inputs_component, outputs=outputs_component)
 
 iface.launch()
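
The commit replaces the getattr-based loader with a static dictionary, so each tokenizer is loaded through the same tokenize/encode/decode interface that compare_tokenizers uses. A minimal sketch of one round trip, assuming the aranizer package is installed and exposes the aranizer_bpe32k.get_tokenizer() entry point shown in the diff (the sample text is purely illustrative):

    from aranizer import aranizer_bpe32k

    # Load one of the eight tokenizers the app registers above.
    tokenizer = aranizer_bpe32k.get_tokenizer()

    text = "مرحبا بالعالم"  # illustrative Arabic input ("Hello, world")

    tokens = tokenizer.tokenize(text)                      # list of subword strings
    ids = tokenizer.encode(text, add_special_tokens=True)  # list of token ids
    roundtrip = tokenizer.decode(ids)                      # ids decoded back to text

    print(tokens)
    print(ids)
    print(roundtrip)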
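On the UI side, the commit moves from the old gr.inputs/gr.outputs namespaces (removed in Gradio 3.x) to gr.components, and calls the directly imported Interface, which is the same class as gr.Interface. One caveat: I cannot confirm a gr.components.Table component in Gradio's API; the documented table-style component is gr.Dataframe. A hedged sketch of the same interface using components that do exist (how Dataframe renders list-valued cells such as the token list may vary by Gradio version):

    import gradio as gr

    # Assumes compare_tokenizers from app.py above is in scope.
    inputs_component = gr.Textbox(lines=2, placeholder="Enter Arabic text here...", label="Input Text")
    outputs_component = gr.Dataframe(label="Results", headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"])

    iface = gr.Interface(fn=compare_tokenizers, inputs=inputs_component, outputs=outputs_component)
    iface.launch()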