HeshamHaroon committed
Commit 7760bbc
1 Parent(s): e47d405

Update app.py

Files changed (1)
  1. app.py +41 -30
app.py CHANGED
@@ -1,63 +1,74 @@
from gradio import Interface
import gradio as gr
import aranizer
from transformers import AutoTokenizer
- import codecs

- # Loading tokenizer instances from Transformers
gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")

- # Assuming the existence of get_tokenizer() method for aranizer models in your setup
tokenizers = {
-     "aranizer_bpe50k": lambda: aranizer.aranizer_bpe50k.get_tokenizer(),
-     "aranizer_bpe64k": lambda: aranizer.aranizer_bpe64k.get_tokenizer(),
-     "aranizer_bpe86k": lambda: aranizer.aranizer_bpe86k.get_tokenizer(),
-     "aranizer_sp32k": lambda: aranizer.aranizer_sp32k.get_tokenizer(),
-     "aranizer_sp50k": lambda: aranizer.aranizer_sp50k.get_tokenizer(),
-     "aranizer_sp64k": lambda: aranizer.aranizer_sp64k.get_tokenizer(),
-     "aranizer_sp86k": lambda: aranizer.aranizer_sp86k.get_tokenizer(),
    "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
    "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
-     "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer,
}

- # Define tokenizer options for dropdown menu
- tokenizer_options = list(tokenizers.keys())
-
def compare_tokenizers(tokenizer_name, text):
-     # UTF-8 encoding assertion for the input text
-     text = codecs.decode(text.encode('utf-8'), 'utf-8')

-     tokenizer = tokenizers[tokenizer_name]()
-     tokens = tokenizer.tokenize(text)
-     encoded_output = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")
-     decoded_text = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
-
-     # Ensuring the tokens are iterated and converted correctly
-     tokens_utf8 = [codecs.decode(token.encode('utf-8'), 'utf-8', errors='ignore') for token in tokens]
-
-     # Preparing and returning results in UTF-8
-     results = [(tokenizer_name, tokens_utf8, encoded_output.tolist(), decoded_text)]
    return results

inputs_component = [
-     gr.Dropdown(choices=tokenizer_options, label="اختر Tokenizer"),
-     gr.Textbox(lines=2, placeholder="اكتب النص الخاص بك هنا...", label="النص المدخل")
]

outputs_component = gr.Dataframe(
    headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"],
-     label="النتائج",
)

iface = Interface(
    fn=compare_tokenizers,
    inputs=inputs_component,
    outputs=outputs_component,
    title="Arabic Tokenizer Arena",
-     live=True,
)

iface.launch()
 
from gradio import Interface
import gradio as gr
import aranizer
+ from aranizer import aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
from transformers import AutoTokenizer

+ # Load additional tokenizers from transformers
gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")

+ # List of available tokenizers and a dictionary to load them
+ tokenizer_options = [
+     "aranizer_bpe50k", "aranizer_bpe64k", "aranizer_bpe86k",
+     "aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k",
+     "FreedomIntelligence/AceGPT-13B",
+     "FreedomIntelligence/AceGPT-7B",
+     "inception-mbzuai/jais-13b"
+ ]
+
tokenizers = {
+     "aranizer_bpe50k": aranizer_bpe50k.get_tokenizer,
+     "aranizer_bpe64k": aranizer_bpe64k.get_tokenizer,
+     "aranizer_bpe86k": aranizer_bpe86k.get_tokenizer,
+     "aranizer_sp32k": aranizer_sp32k.get_tokenizer,
+     "aranizer_sp50k": aranizer_sp50k.get_tokenizer,
+     "aranizer_sp64k": aranizer_sp64k.get_tokenizer,
+     "aranizer_sp86k": aranizer_sp86k.get_tokenizer,
    "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
    "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
+     "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer
}

def compare_tokenizers(tokenizer_name, text):
+     # Handle the transformers tokenizers separately due to API differences
+     if tokenizer_name in ["FreedomIntelligence/AceGPT-13B", "FreedomIntelligence/AceGPT-7B", "inception-mbzuai/jais-13b"]:
+         tokenizer = tokenizers[tokenizer_name]()
+         tokens = tokenizer.tokenize(text)
+         encoded_output = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")
+         decoded_text = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
+     else:
+         # AraNizer tokenizers
+         tokenizer = tokenizers[tokenizer_name]()
+         tokens = tokenizer.tokenize(text)
+         encoded_output = tokenizer.encode(text, add_special_tokens=True)
+         decoded_text = tokenizer.decode(encoded_output)

+     # Prepare the results to be displayed
+     results = [(tokenizer_name, tokens, encoded_output, decoded_text)]
    return results

+ # Define the Gradio interface components with a dropdown for model selection
inputs_component = [
+     gr.Dropdown(choices=tokenizer_options, label="Select Tokenizer"),
+     gr.Textbox(lines=2, placeholder="اكتب النص هنا...", label="Input Text")
]

outputs_component = gr.Dataframe(
    headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"],
+     label="Results",
+     type="pandas"
)

+ # Setting up the interface
iface = Interface(
    fn=compare_tokenizers,
    inputs=inputs_component,
    outputs=outputs_component,
    title="Arabic Tokenizer Arena",
+     live=True
)

+ # Launching the Gradio app
iface.launch()
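
For reference, a minimal sketch of the two code paths the updated compare_tokenizers distinguishes, runnable outside the Gradio UI. It assumes the aranizer package exposes the get_tokenizer() accessors exactly as imported in app.py and that the AceGPT checkpoint is downloadable; the input string is purely illustrative:

    # Sketch of the two branches in compare_tokenizers (assumptions noted above).
    from aranizer import aranizer_bpe50k
    from transformers import AutoTokenizer

    text = "اللغة العربية"  # illustrative Arabic input

    # AraNizer path: encode() returns a plain list of ids, decoded directly.
    ar_tok = aranizer_bpe50k.get_tokenizer()
    ar_ids = ar_tok.encode(text, add_special_tokens=True)
    print(ar_tok.tokenize(text), ar_ids, ar_tok.decode(ar_ids))

    # transformers path: encode(..., return_tensors="pt") returns a tensor,
    # so the first row is decoded, skipping special tokens.
    hf_tok = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
    hf_ids = hf_tok.encode(text, add_special_tokens=True, return_tensors="pt")
    print(hf_tok.tokenize(text), hf_tok.decode(hf_ids[0], skip_special_tokens=True))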