tokenvisor-sd

Sleeping

App Files Files Community

Prgckwb committed on Feb 22

Commit

fc973c2

•

1 Parent(s): e5a75a4

Add gemma model

Browse files

Files changed (1) hide show

app.py +23 -4

app.py CHANGED Viewed

@@ -1,8 +1,12 @@
 import gradio as gr
 import pandas as pd
 from gradio.themes import colors
 from transformers import AutoTokenizer
 # Function to map tokenized text to IDs
 def inference(
@@ -11,6 +15,7 @@ def inference(
 ) -> (list[str, str], pd.DataFrame):
     if text == "":
         return [], pd.DataFrame()
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     # Use tokenizer to tokenize the text
@@ -30,7 +35,16 @@ def inference(
         "Char Count": [len(text)],
         "Token Count": [len(token_pairs)]
     })
-    return token_pairs, pos_count
 if __name__ == '__main__':
@@ -42,16 +56,19 @@ if __name__ == '__main__':
                 label="Model",
                 choices=[
                     "openai/clip-vit-large-patch14",
                     "google-bert/bert-base-uncased",
                     "google/flan-t5-base",
                     "openai-community/gpt2",
-                    "rinna/japanese-gpt-1b"
                 ],
                 value="openai/clip-vit-large-patch14"
             ),
         ],
         outputs=[
             gr.Highlightedtext(label="Highlighted Text"),
             gr.Dataframe(label="Position Count"),
         ],
         examples=[
@@ -60,10 +77,12 @@ if __name__ == '__main__':
              "but turns out she was just trying to hatch a plot to steal my dinner.", "openai/clip-vit-large-patch14"],
             ["The square root of x is the cube root of y. What is y to the power of 2, if x = 4?",
              "google/flan-t5-base"],
-            ["日本で一番高い山は富士山ですが、二番目に高い山は何ですか？", "rinna/japanese-gpt-1b"]
         ],
         cache_examples=True,
-        title="TokenVisor",
         description="Visualize how the Tokenizer used in Hugging Face's Transformers library tokenizes text.",
         theme=gr.Theme(primary_hue=colors.green, secondary_hue=colors.yellow),
         allow_flagging="never",

+import os
 import gradio as gr
 import pandas as pd
 from gradio.themes import colors
 from transformers import AutoTokenizer
+os.environ['TOKENIZERS_PARALLELISM'] = "false"
 # Function to map tokenized text to IDs
 def inference(
 ) -> (list[str, str], pd.DataFrame):
     if text == "":
         return [], pd.DataFrame()
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     # Use tokenizer to tokenize the text
         "Char Count": [len(text)],
         "Token Count": [len(token_pairs)]
     })
+    # Create list of special tokens
+    special_tokens = []
+    for k, v in tokenizer.special_tokens_map.items():
+        if k == 'additional_special_tokens':
+            continue
+        sp_token_map = [str(k), str(v)]
+        special_tokens.append(sp_token_map)
+    return token_pairs, special_tokens, pos_count
 if __name__ == '__main__':
                 label="Model",
                 choices=[
                     "openai/clip-vit-large-patch14",
+                    "google/gemma-7b",
                     "google-bert/bert-base-uncased",
                     "google/flan-t5-base",
                     "openai-community/gpt2",
+                    "rinna/japanese-gpt-1b",
+                    "cyberagent/open-calm-7b",
                 ],
                 value="openai/clip-vit-large-patch14"
             ),
         ],
         outputs=[
             gr.Highlightedtext(label="Highlighted Text"),
+            gr.Highlightedtext(label="Special Tokens", combine_adjacent=True, adjacent_separator=' / '),
             gr.Dataframe(label="Position Count"),
         ],
         examples=[
              "but turns out she was just trying to hatch a plot to steal my dinner.", "openai/clip-vit-large-patch14"],
             ["The square root of x is the cube root of y. What is y to the power of 2, if x = 4?",
              "google/flan-t5-base"],
+            ["In my home country, it's a custom to say 'いただきマサチューセッツ' before we start eating a meal.",
+             "google/gemma-7b"],
+            ["日本で一番高い山は富士山ですが、二番目に高い山は何ですか？", "rinna/japanese-gpt-1b"],
         ],
         cache_examples=True,
+        title="TokenVisor 👀",
         description="Visualize how the Tokenizer used in Hugging Face's Transformers library tokenizes text.",
         theme=gr.Theme(primary_hue=colors.green, secondary_hue=colors.yellow),
         allow_flagging="never",