import gradio as gr
import pandas as pd
from gradio.themes import colors
from transformers import AutoTokenizer


# Map the input text to (token, token ID) pairs using the selected model's tokenizer
def inference(
        text="",
        model_id="openai/clip-vit-large-patch14",
) -> tuple[list[tuple[str, str]], pd.DataFrame]:
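    """Tokenize `text` with the tokenizer for `model_id`.

    Returns (token, token-ID) string pairs for highlighting and a one-row
    DataFrame with the character and token counts.
    """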
    if text == "":
        return [], pd.DataFrame()
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Use tokenizer to tokenize the text
    text_inputs = tokenizer(text, return_tensors='pt')

    input_ids = text_inputs['input_ids'].tolist()[0]  # Convert tensor to list

    # Create pairs of tokens and IDs
    tokens = [tokenizer.decode([id_]) for id_ in input_ids]
    token_pairs = []

    for token, id_ in zip(tokens, input_ids):
        token_pairs.append((token, str(id_)))

    # Count the number of characters and tokens
    pos_count = pd.DataFrame({
        "Char Count": [len(text)],
        "Token Count": [len(token_pairs)]
    })
    return token_pairs, pos_count
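
# Example of calling inference() directly (illustrative; actual token IDs depend on the model):
#   pairs, counts = inference("hello world", "openai-community/gpt2")
#   pairs  -> [("hello", "..."), (" world", "...")]   # (token, ID) string pairs
#   counts -> one-row DataFrame with "Char Count" and "Token Count"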


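# Gradio UI: a textbox and a model dropdown feed inference(); the outputs are
# rendered as highlighted (token, ID) pairs and a small count table.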
if __name__ == '__main__':
    iface = gr.Interface(
        fn=inference,
        inputs=[
            gr.Textbox(label="Text"),
            gr.Dropdown(
                label="Model",
                choices=[
                    "openai/clip-vit-large-patch14",
                    "google-bert/bert-base-uncased",
                    "google/flan-t5-base",
                    "openai-community/gpt2",
                    "rinna/japanese-gpt-1b"
                ],
                value="openai/clip-vit-large-patch14"
            ),
        ],
        outputs=[
            gr.HighlightedText(label="Highlighted Text"),
            gr.Dataframe(label="Position Count"),
        ],
        examples=[
            ["When I told my computer I needed a break, it froze.", "openai/clip-vit-large-patch14"],
            ["Yesterday, I thought my cat was studying for her degree in philosophy because she sat on my book, "
             "but turns out she was just trying to hatch a plot to steal my dinner.", "openai/clip-vit-large-patch14"],
            ["The square root of x is the cube root of y. What is y to the power of 2, if x = 4?",
             "google/flan-t5-base"],
            ["日本で一番高い山は富士山ですが、二番目に高い山は何ですか?", "rinna/japanese-gpt-1b"]
        ],
        cache_examples=True,
        title="TokenVisor",
        description="Visualize how tokenizers from Hugging Face's Transformers library split text into tokens.",
        theme=gr.Theme(primary_hue=colors.green, secondary_hue=colors.yellow),
        allow_flagging="never",
    )
    iface.launch()