HeshamHaroon committed on
Commit
ece3f89
1 Parent(s): 5357bd8

Update app.py

Files changed (1)
  1. app.py +42 -17
app.py CHANGED
@@ -2,11 +2,20 @@ from gradio import Interface
  import gradio as gr
  import aranizer
  from aranizer import aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
+ from transformers import AutoTokenizer
+
+ # Load additional tokenizers from transformers
+ gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
+ gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
+ jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")

  # List of available tokenizers and a dictionary to load them
  tokenizer_options = [
      "aranizer_bpe50k", "aranizer_bpe64k", "aranizer_bpe86k",
-     "aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k"
+     "aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k",
+     "FreedomIntelligence/AceGPT-13B",  # Previously added GPT tokenizer
+     "FreedomIntelligence/AceGPT-7B",  # Another previously added GPT tokenizer
+     "inception-mbzuai/jais-13b"  # Adding the new tokenizer to the options
  ]

  tokenizers = {
@@ -17,33 +26,49 @@ tokenizers = {
      "aranizer_sp50k": aranizer_sp50k.get_tokenizer,
      "aranizer_sp64k": aranizer_sp64k.get_tokenizer,
      "aranizer_sp86k": aranizer_sp86k.get_tokenizer,
+     "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
+     "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
+     "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer  # Adding the new Jais tokenizer
  }

  def compare_tokenizers(tokenizer_name, text):
-     # Load the selected tokenizer
-     tokenizer = tokenizers[tokenizer_name]()
-     tokens = tokenizer.tokenize(text)
-     encoded_output = tokenizer.encode(text, add_special_tokens=True)
-     decoded_text = tokenizer.decode(encoded_output)
+     # Handle the transformer tokenizers separately due to API differences
+     if tokenizer_name in ["FreedomIntelligence/AceGPT-13B", "FreedomIntelligence/AceGPT-7B", "inception-mbzuai/jais-13b"]:
+         tokenizer = tokenizers[tokenizer_name]()
+         tokens = tokenizer.tokenize(text)
+         encoded_output = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")
+         decoded_text = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
+     else:
+         # AraNizer tokenizers
+         tokenizer = tokenizers[tokenizer_name]()
+         tokens = tokenizer.tokenize(text)
+         encoded_output = tokenizer.encode(text, add_special_tokens=True)
+         decoded_text = tokenizer.decode(encoded_output)

      # Prepare the results to be displayed
      results = [(tokenizer_name, tokens, encoded_output, decoded_text)]
      return results
+
+ # Define the Gradio interface components with a dropdown for model selection
  inputs_component = [
      gr.Dropdown(choices=tokenizer_options, label="Select Tokenizer"),
-     gr.Textbox(lines=2, placeholder="Enter Arabic text here...", label="Input Text")
+     gr.Textbox(lines=2, placeholder="Enter text here...", label="Input Text")
  ]

- # Define the outputs component normally without the 'css' parameter
- outputs_component = gr.Dataframe(headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"], label="Results")
-
-
+ outputs_component = gr.Dataframe(
+     headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"],
+     label="Results",
+     type="pandas"
+ )

- # Setting up the interface with the custom theme
- iface = Interface(fn=compare_tokenizers,
-                   inputs=inputs_component,
-                   outputs=outputs_component,
-                   title="AraNizer Tokenizer Comparison")
+ # Setting up the interface
+ iface = Interface(
+     fn=compare_tokenizers,
+     inputs=inputs_component,
+     outputs=outputs_component,
+     title="Tokenizer Comparison",
+     live=True
+ )

  # Launching the Gradio app
- iface.launch()
+ iface.launch()
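
A note on the two branches this commit introduces in compare_tokenizers: the AraNizer tokenizers return a plain list of token ids from encode and decode that list directly, while the transformers tokenizers are called with return_tensors="pt" and so return a 2-D tensor whose first row must be indexed out before decoding. The sketch below is not part of the commit; it only illustrates that difference in isolation. It assumes the aranizer, transformers, and torch packages are installed, that the FreedomIntelligence/AceGPT-7B repo is reachable, and the sample string is arbitrary.

# Minimal sketch (not from the commit): the two encode/decode shapes
# that compare_tokenizers has to handle.
from aranizer import aranizer_sp50k
from transformers import AutoTokenizer

text = "مرحبا بالعالم"  # arbitrary sample string

# AraNizer path: encode returns a plain list of token ids,
# and decode accepts that list directly.
ar_tok = aranizer_sp50k.get_tokenizer()
ar_ids = ar_tok.encode(text, add_special_tokens=True)
print(ar_tok.decode(ar_ids))

# transformers path: with return_tensors="pt", encode returns a tensor of
# shape (1, seq_len), so row 0 is indexed out before decoding.
hf_tok = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
hf_ids = hf_tok.encode(text, add_special_tokens=True, return_tensors="pt")
print(hf_tok.decode(hf_ids[0], skip_special_tokens=True))

One side effect of the new module-level from_pretrained calls is that all three transformers tokenizers are downloaded when the Space starts, even if the user never selects them; moving each from_pretrained inside its lambda in the tokenizers dict would keep startup light, at the cost of a delay the first time that option is chosen.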