Prgckwb commited on
Commit
e603ef9
1 Parent(s): 547d010

:tada: init

Browse files
Files changed (3) hide show
  1. README.md +5 -4
  2. app.py +123 -76
  3. requirements.txt +2 -1
README.md CHANGED
@@ -1,10 +1,11 @@
1
  ---
2
- title: Tokenvisor
3
- emoji: 👀
4
- colorFrom: green
 
5
  colorTo: green
6
  sdk: gradio
7
- sdk_version: 4.36.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
1
  ---
2
+ title: Tokenvisor SD
3
+ emoji: 🥽
4
+
5
+ colorFrom: red
6
  colorTo: green
7
  sdk: gradio
8
+ sdk_version: 4.44.0
9
  app_file: app.py
10
  pinned: false
11
  license: apache-2.0
app.py CHANGED
@@ -1,94 +1,141 @@
1
- import os
2
-
3
  import gradio as gr
4
- import pandas as pd
5
- from gradio.themes import colors
6
- from transformers import AutoTokenizer
 
 
 
 
 
7
 
8
- os.environ['TOKENIZERS_PARALLELISM'] = "false"
 
9
 
 
 
 
 
10
 
11
- # Function to map tokenized text to IDs
12
- def inference(
13
- text="",
14
- model_id="openai/clip-vit-large-patch14",
15
- progress=gr.Progress()
16
- ) -> (list[str, str], list[str, str], pd.DataFrame):
17
- if text == "":
18
- return [], [], pd.DataFrame()
19
 
20
- progress(0, desc='Loading tokenizer...')
21
- tokenizer = AutoTokenizer.from_pretrained(model_id)
22
 
23
- # Use tokenizer to tokenize the text
24
- progress(0.5, desc='Tokenizing text...')
25
- text_inputs = tokenizer(text, return_tensors='pt')
26
 
27
- input_ids = text_inputs['input_ids'].tolist()[0] # Convert tensor to list
 
 
28
 
29
- # Create pairs of tokens and IDs
30
- tokens = [tokenizer.decode([id_]) for id_ in input_ids]
31
- token_pairs = []
 
 
32
 
33
- for token, id_ in zip(tokens, input_ids):
34
- token_pairs.append((token, str(id_)))
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- # Count the number of characters and tokens
37
- pos_count = pd.DataFrame({
38
- "Char Count": [len(text)],
39
- "Token Count": [len(token_pairs)]
40
- })
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- # Create list of special tokens
43
- special_tokens = []
44
- for k, v in tokenizer.special_tokens_map.items():
45
- if k == 'additional_special_tokens':
46
- continue
47
- sp_token_map = [str(k), str(v)]
48
- special_tokens.append(sp_token_map)
49
 
50
- return token_pairs, special_tokens, pos_count
51
 
52
 
53
  if __name__ == '__main__':
54
- iface = gr.Interface(
55
- fn=inference,
56
- inputs=[
57
- gr.Textbox(label="Text"),
58
- gr.Dropdown(
59
- label="Model",
 
 
60
  choices=[
61
- "openai/clip-vit-large-patch14",
62
- "google/gemma-7b",
63
- "google-bert/bert-base-uncased",
64
- "google/flan-t5-base",
65
- "openai-community/gpt2",
66
- "rinna/japanese-gpt-1b",
67
- "cyberagent/open-calm-7b",
68
  ],
69
- value="openai/clip-vit-large-patch14"
70
- ),
71
- ],
72
- outputs=[
73
- gr.Highlightedtext(label="Highlighted Text"),
74
- gr.Highlightedtext(label="Special Tokens", combine_adjacent=True, adjacent_separator=' / '),
75
- gr.Dataframe(label="Position Count"),
76
- ],
77
- examples=[
78
- ["When I told my computer I needed a break, it froze.", "openai/clip-vit-large-patch14"],
79
- ["Yesterday, I thought my cat was studying for her degree in philosophy because she sat on my book, "
80
- "but turns out she was just trying to hatch a plot to steal my dinner.", "openai/clip-vit-large-patch14"],
81
- ["The square root of x is the cube root of y. What is y to the power of 2, if x = 4?",
82
- "google/flan-t5-base"],
83
- ["In my home country, it's a custom to say 'いただきマサチューセッツ' before we start eating a meal.",
84
- "google/gemma-7b"],
85
- ["日本で一番高い山は富士山ですが、二番目に高い山は何ですか?", "rinna/japanese-gpt-1b"],
86
- ],
87
- cache_examples=True,
88
- title="TokenVisor 👀",
89
- description="Visualize how the Tokenizer used in Hugging Face's Transformers library tokenizes text.",
90
- theme=gr.Theme(primary_hue=colors.green, secondary_hue=colors.yellow),
91
- allow_flagging="never",
92
 
93
- )
94
- iface.queue().launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import torch
3
+ from diffusers import DiffusionPipeline
4
+ from transformers import AutoTokenizer, CLIPTokenizerFast, T5TokenizerFast
5
+
6
+
7
def load_tokenizers(model_id: str) -> list[CLIPTokenizerFast | T5TokenizerFast | None]:
    """Load every tokenizer declared in a diffusers pipeline config.

    A diffusers pipeline may ship one to three tokenizers, stored under the
    subfolders ``tokenizer``, ``tokenizer_2`` and ``tokenizer_3``. The result
    is always padded with ``None`` to exactly three entries so callers can
    rely on a fixed-length list.

    Raises:
        gr.Error: if the config declares fewer than 1 or more than 3 tokenizers.
    """
    pipeline_config = DiffusionPipeline.load_config(model_id)
    count = sum('tokenizer' in key for key in pipeline_config)

    if count < 1 or count > 3:
        raise gr.Error(f'Invalid number of tokenizers: {count}')

    loaded: list[CLIPTokenizerFast | T5TokenizerFast | None] = []
    for index in range(count):
        # First tokenizer lives in 'tokenizer', later ones in 'tokenizer_2', 'tokenizer_3'.
        subfolder = 'tokenizer' if index == 0 else f'tokenizer_{index + 1}'
        loaded.append(AutoTokenizer.from_pretrained(model_id, subfolder=subfolder))

    # Pad the list with None if there are fewer than 3 tokenizers.
    loaded.extend([None] * (3 - count))

    return loaded
 
23
 
 
 
 
24
 
25
@torch.inference_mode()
def inference(model_id: str, input_text: str):
    """Tokenize ``input_text`` with each tokenizer of the given diffusers model.

    Returns six ``gr.HighlightedText`` updates: three token/ID views followed
    by three special-token views. Slots without a corresponding tokenizer are
    returned as hidden components.
    """
    tokenizers = load_tokenizers(model_id)

    token_views = []
    special_views = []
    for position, tokenizer in enumerate(tokenizers):
        if not tokenizer:
            # Empty slot: emit invisible placeholders for both tabs.
            token_views.append(gr.HighlightedText(visible=False))
            special_views.append(gr.HighlightedText(visible=False))
            continue

        label_text = f'Tokenizer {position + 1}: {tokenizer.__class__.__name__}'

        # Pair each decoded token string with its numeric ID.
        ids = tokenizer(
            text=input_text,
            truncation=True,
            return_length=False,
            return_overflowing_tokens=False
        ).input_ids
        pairs = [(str(tokenizer.decode(token_id)), str(token_id)) for token_id in ids]
        token_views.append(
            gr.HighlightedText(
                label=label_text,
                value=pairs,
                visible=True,
                show_legend=True,
            )
        )

        # Collect named special tokens, skipping the catch-all list entry.
        specials = [
            (str(name), str(token))
            for name, token in tokenizer.special_tokens_map.items()
            if name != 'additional_special_tokens'
        ]
        special_views.append(
            gr.HighlightedText(
                label=label_text,
                value=specials,
                visible=True,
                show_legend=True,
            )
        )

    return token_views + special_views
72
 
73
 
74
if __name__ == '__main__':
    # Emerald-on-emerald soft theme for the whole app.
    emerald = gr.themes.colors.emerald
    app_theme = gr.themes.Soft(primary_hue=emerald, secondary_hue=emerald)

    with gr.Blocks(theme=app_theme) as demo:
        # --- Inputs -------------------------------------------------------
        with gr.Column():
            input_model_id = gr.Dropdown(
                label='Model ID',
                choices=[
                    'black-forest-labs/FLUX.1-dev',
                    'black-forest-labs/FLUX.1-schnell',
                    'stabilityai/stable-diffusion-3-medium-diffusers',
                    'stabilityai/stable-diffusion-xl-base-1.0',
                    'stable-diffusion-v1-5/stable-diffusion-v1-5',
                    'stabilityai/japanese-stable-diffusion-xl',
                    'rinna/japanese-stable-diffusion',
                ],
                value='black-forest-labs/FLUX.1-dev',
            )
            input_text = gr.Textbox(
                label='Input Text',
                placeholder='Enter text here',
            )

        # --- Outputs: one slot per possible tokenizer (up to 3) -----------
        with gr.Tab(label='Tokenization Outputs'):
            with gr.Column():
                highlighted_outputs = [gr.HighlightedText() for _ in range(3)]
        with gr.Tab(label='Special Tokens'):
            with gr.Column():
                special_token_outputs = [gr.HighlightedText() for _ in range(3)]

        # --- Controls -----------------------------------------------------
        with gr.Row():
            clear_button = gr.ClearButton(components=[input_text])
            submit_button = gr.Button('Run', variant='primary')

        all_inputs = [input_model_id, input_text]
        all_output = highlighted_outputs + special_token_outputs

        submit_button.click(
            fn=inference,
            inputs=all_inputs,
            outputs=all_output,
        )

        gr.Examples(
            fn=inference,
            inputs=all_inputs,
            outputs=all_output,
            examples=[
                ['black-forest-labs/FLUX.1-dev', 'a photo of cat'],
                ['stabilityai/stable-diffusion-3-medium-diffusers', 'cat holding sign saying "I am a cat"'],
                ['rinna/japanese-stable-diffusion', '空を飛んでいるネコの写真 油絵']
            ],
            cache_examples=True,
        )

    demo.queue().launch()
requirements.txt CHANGED
@@ -3,4 +3,5 @@ transformers
3
  safetensors
4
  accelerate
5
  diffusers
6
- sentencepiece
 
 
3
  safetensors
4
  accelerate
5
  diffusers
6
+ sentencepiece
7
+ protobuf