Prgckwb committed
Commit: fc973c2
Parent: e5a75a4

Add gemma model

Files changed (1):
  app.py  +23 -4
app.py CHANGED
@@ -1,8 +1,12 @@
+import os
+
 import gradio as gr
 import pandas as pd
 from gradio.themes import colors
 from transformers import AutoTokenizer
 
+os.environ['TOKENIZERS_PARALLELISM'] = "false"
+
 
 # Function to map tokenized text to IDs
 def inference(
@@ -11,6 +15,7 @@ def inference(
 ) -> (list[str, str], pd.DataFrame):
     if text == "":
         return [], pd.DataFrame()
+
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 
     # Use tokenizer to tokenize the text
@@ -30,7 +35,16 @@ def inference(
         "Char Count": [len(text)],
         "Token Count": [len(token_pairs)]
     })
-    return token_pairs, pos_count
+
+    # Create list of special tokens
+    special_tokens = []
+    for k, v in tokenizer.special_tokens_map.items():
+        if k == 'additional_special_tokens':
+            continue
+        sp_token_map = [str(k), str(v)]
+        special_tokens.append(sp_token_map)
+
+    return token_pairs, special_tokens, pos_count
 
 
 if __name__ == '__main__':
@@ -42,16 +56,19 @@ if __name__ == '__main__':
                 label="Model",
                 choices=[
                     "openai/clip-vit-large-patch14",
+                    "google/gemma-7b",
                     "google-bert/bert-base-uncased",
                     "google/flan-t5-base",
                     "openai-community/gpt2",
-                    "rinna/japanese-gpt-1b"
+                    "rinna/japanese-gpt-1b",
+                    "cyberagent/open-calm-7b",
                 ],
                 value="openai/clip-vit-large-patch14"
             ),
         ],
         outputs=[
             gr.Highlightedtext(label="Highlighted Text"),
+            gr.Highlightedtext(label="Special Tokens", combine_adjacent=True, adjacent_separator=' / '),
             gr.Dataframe(label="Position Count"),
         ],
         examples=[
@@ -60,10 +77,12 @@ if __name__ == '__main__':
              "but turns out she was just trying to hatch a plot to steal my dinner.", "openai/clip-vit-large-patch14"],
             ["The square root of x is the cube root of y. What is y to the power of 2, if x = 4?",
              "google/flan-t5-base"],
-            ["日本で一番高い山は富士山ですが、二番目に高い山は何ですか?", "rinna/japanese-gpt-1b"]
+            ["In my home country, it's a custom to say 'いただきマサチューセッツ' before we start eating a meal.",
+             "google/gemma-7b"],
+            ["日本で一番高い山は富士山ですが、二番目に高い山は何ですか?", "rinna/japanese-gpt-1b"],
         ],
         cache_examples=True,
-        title="TokenVisor",
+        title="TokenVisor 👀",
         description="Visualize how the Tokenizer used in Hugging Face's Transformers library tokenizes text.",
         theme=gr.Theme(primary_hue=colors.green, secondary_hue=colors.yellow),
         allow_flagging="never",
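
For context, a minimal standalone sketch of what the new special-token loop collects, using a small non-gated checkpoint so the snippet runs without authentication; the printed pairs show the expected shape, not a guaranteed ordering.

import os

# Same setting as the commit: silence the tokenizers fork-parallelism warning.
os.environ['TOKENIZERS_PARALLELISM'] = "false"

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

# Mirror of the loop added in this commit: collect [name, token] pairs,
# skipping the list-valued 'additional_special_tokens' entry.
special_tokens = []
for k, v in tokenizer.special_tokens_map.items():
    if k == 'additional_special_tokens':
        continue
    special_tokens.append([str(k), str(v)])

print(special_tokens)
# Roughly: [['unk_token', '[UNK]'], ['sep_token', '[SEP]'], ['pad_token', '[PAD]'],
#           ['cls_token', '[CLS]'], ['mask_token', '[MASK]']]

Because inference now returns three values, the second Highlightedtext output (label="Special Tokens") keeps the Gradio outputs list in step with the return tuple; as I understand the Gradio parameters, combine_adjacent=True merges adjacent entries that share a label, joining their text with adjacent_separator=' / '.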
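To try the new google/gemma-7b entry outside the Space, note that the Gemma checkpoints on the Hub are gated, so downloading the tokenizer requires accepting the model terms and authenticating; the HF_TOKEN variable below is only an illustration of one way to pass credentials, not something this commit sets up.

import os

from transformers import AutoTokenizer

# Assumption: the account behind HF_TOKEN (a hypothetical environment variable,
# used here purely for illustration) has been granted access to google/gemma-7b.
tokenizer = AutoTokenizer.from_pretrained(
    "google/gemma-7b",
    token=os.environ.get("HF_TOKEN"),
)

# How Gemma's tokenizer splits the new example string, and which special tokens it declares.
print(tokenizer.tokenize("In my home country, it's a custom to say 'いただきマサチューセッツ' before we start eating a meal."))
print(tokenizer.special_tokens_map)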