lucyknada committed (verified)
Commit e2d65f6 · 1 parent: 441d7e2

Create app.py

Files changed (1):
  1. app.py +49 -0
app.py ADDED
@@ -0,0 +1,49 @@
import gradio as gr
from transformers import AutoTokenizer
import colorsys
import html


def get_distinct_colors(n):
    # Spread hues evenly around the colour wheel so adjacent tokens get visibly different colours.
    colors = []
    for i in range(n):
        h = i / n
        s = 0.6
        v = 0.7
        r, g, b = colorsys.hsv_to_rgb(h, s, v)
        color = "#{:02x}{:02x}{:02x}".format(int(r * 255), int(g * 255), int(b * 255))
        colors.append(color)
    return colors


def tokenize_text(hf_model_id, text):
    try:
        tokenizer = AutoTokenizer.from_pretrained(hf_model_id)
        tokens = tokenizer.tokenize(text)
        token_count = len(tokens)
        colors = get_distinct_colors(token_count)
        colored_tokens = []
        for i, token in enumerate(tokens):
            # Make BPE leading-space markers visible, then escape for safe HTML rendering.
            display_token = token.replace('Ġ', '<space>')
            display_token = html.escape(display_token)
            colored_tokens.append(f'<span style="background-color: {colors[i]}; color: white; padding: 2px 4px; border-radius: 3px; margin: 2px; display: inline-block;">{display_token}</span>')
        tokenized_text = "".join(colored_tokens)
        return token_count, tokenized_text
    except Exception as e:
        # Return 0 for the Number output and surface the error in the HTML output,
        # so both output components receive values of the expected type.
        return 0, f"Error: {html.escape(str(e))}"


demo = gr.Interface(
    fn=tokenize_text,
    inputs=[
        gr.Textbox(label="Hugging Face Model ID", placeholder="e.g., gpt2, bert-base-uncased", value="unsloth/gemma-3-27b-it"),
        gr.Textbox(label="Text to Tokenize", lines=5, placeholder="Enter your text here...")
    ],
    outputs=[
        gr.Number(label="Token Count"),
        gr.HTML(label="Tokenized Text", container=True, show_label=True)
    ],
    title="HuggingFace Tokenizer",
    description="Enter a HuggingFace model ID and text to see how it gets tokenized.",
    allow_flagging="never"
)

demo.launch()
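
A minimal smoke test, not part of the commit: the core function can be called directly, assuming transformers is installed and the gpt2 tokenizer can be downloaded. Paste it above demo.launch() so the Gradio server does not block it.

# Hypothetical check: call tokenize_text directly and inspect both outputs.
count, rendered = tokenize_text("gpt2", "Hello, tokenizers!")
print(count)           # number of tokens gpt2 produces for the input
print(rendered[:120])  # start of the colour-coded HTML span string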