Commit
76d1dbc
1 Parent(s): ca4f672

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -9
app.py CHANGED
@@ -5,16 +5,18 @@ chart_html = gr.HTML(label="Token Frequency Chart")
5
 
6
  # Define a function to tokenize text and create visualization
7
  def tokenize_text(text, tokenizer_name):
8
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
9
- tokenized_text = tokenizer.tokenize(text)
10
- input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
11
- decoded_text = tokenizer.decode(input_ids)
12
 
13
- # Create visualization HTML
14
- chart_html = create_token_frequency_chart(tokenized_text)
15
-
16
- return f"Tokenized Text: {tokenized_text}\nInput IDs: {input_ids}\nDecoded Text: {decoded_text}", chart_html
17
 
 
 
 
 
18
 
19
 
20
  # Define available tokenizers
@@ -38,7 +40,12 @@ iface = gr.Interface(
38
  gr.Textbox(label="Enter Text"),
39
  gr.Dropdown(choices=tokenizer_names, label="Select Tokenizer"),
40
  ],
41
- outputs="text",
 
 
 
 
 
42
  title="Kalemat: Explore Arabic Tokenizers",
43
  description="This interactive tool allows you to experiment with different Arabic tokenizers and see how they break down text into individual units. Try out various tokenizers and observe the tokenized form, input IDs, and decoded text to gain insights into the tokenization process",
44
  )
 
5
 
6
  # Define a function to tokenize text and create visualization
7
  def tokenize_text(text, tokenizer_name):
8
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
9
+ tokenized_text = tokenizer.tokenize(text)
10
+ input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
11
+ decoded_text = tokenizer.decode(input_ids)
12
 
13
+ # Create visualization HTML
14
+ chart_html = create_token_frequency_chart(tokenized_text)
 
 
15
 
16
+ return (
17
+ f"Tokenized Text: {tokenized_text}\nInput IDs: {input_ids}\nDecoded Text: {decoded_text}",
18
+ chart_html,
19
+ )
20
 
21
 
22
  # Define available tokenizers
 
40
  gr.Textbox(label="Enter Text"),
41
  gr.Dropdown(choices=tokenizer_names, label="Select Tokenizer"),
42
  ],
43
+ outputs=[
44
+ gr.Textbox(label="Tokenized Text"),
45
+ gr.Textbox(label="Input IDs"),
46
+ gr.Textbox(label="Decoded Text"),
47
+ gr.HTML(label="Token Frequency Chart"), # Include chart_html
48
+ ]
49
  title="Kalemat: Explore Arabic Tokenizers",
50
  description="This interactive tool allows you to experiment with different Arabic tokenizers and see how they break down text into individual units. Try out various tokenizers and observe the tokenized form, input IDs, and decoded text to gain insights into the tokenization process",
51
  )