File size: 1,446 Bytes
9715926
 
 
71be925
9715926
71be925
 
 
 
 
e5aaf6d
9715926
 
 
 
 
 
b6678bf
 
 
 
 
 
 
9715926
 
 
 
 
 
 
 
 
71be925
44c45da
71be925
9715926
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import functools

import gradio as gr
from transformers import AutoTokenizer

# Tokenizers are cached per model name so a request only pays the load cost
# the first time a given tokenizer is selected.
@functools.lru_cache(maxsize=16)
def _load_tokenizer(tokenizer_name):
    """Return the tokenizer for *tokenizer_name*, loading it on first use."""
    return AutoTokenizer.from_pretrained(tokenizer_name)


def tokenize_text(text, tokenizer_name):
    """Tokenize *text* with the selected tokenizer and describe the result.

    Args:
        text: The text to tokenize; ``None`` is treated as an empty string.
        tokenizer_name: Model identifier passed to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        A three-line string showing the tokens, their input IDs, and the text
        decoded back from those IDs. When no tokenizer name is given, a short
        prompt asking the user to select one is returned instead.
    """
    # Guard before touching the tokenizer: an empty selection would otherwise
    # reach from_pretrained and fail with an opaque error.
    if not tokenizer_name:
        return "Please select a tokenizer."

    tokenizer = _load_tokenizer(tokenizer_name)
    tokenized_text = tokenizer.tokenize(text or "")
    input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
    decoded_text = tokenizer.decode(input_ids)  # Round-trip the IDs back to text
    return (
        f"Tokenized Text: {tokenized_text}\n"
        f"Input IDs: {input_ids}\n"
        f"Decoded Text: {decoded_text}"
    )


# Model identifiers whose tokenizers can be chosen in the UI, grouped by family.
tokenizer_names = [
    # ArabianGPT
    "riotu-lab/ArabianGPT-01B",
    "riotu-lab/ArabianGPT-03B",
    "riotu-lab/ArabianGPT-08B",
    # AceGPT
    "FreedomIntelligence/AceGPT-13B",
    "FreedomIntelligence/AceGPT-7B",
    # Jais
    "inception-mbzuai/jais-13b",
    # AraGPT2
    "aubmindlab/aragpt2-base",
    "aubmindlab/aragpt2-medium",
    "aubmindlab/aragpt2-large",
    "aubmindlab/aragpt2-mega",
]

# Input widgets: a free-form text box and a picker over `tokenizer_names`.
text_box = gr.Textbox(label="Enter Text")
tokenizer_picker = gr.Dropdown(choices=tokenizer_names, label="Select Tokenizer")

# Wire the widgets to `tokenize_text`; its string result is shown as plain text.
iface = gr.Interface(
    fn=tokenize_text,
    inputs=[text_box, tokenizer_picker],
    outputs="text",
    title="Kalemat: Explore Arabic Tokenizers",
    description=(
        "This interactive tool allows you to experiment with different Arabic "
        "tokenizers and see how they break down text into individual units. "
        "Try out various tokenizers and observe the tokenized form, input IDs, "
        "and decoded text to gain insights into the tokenization process."
    ),
)

# Start serving the interface.
iface.launch()