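"""Tokenizer Explorer: a Gradio app for comparing tokenizers.

Type some text, pick a tokenizer (BERT, Multilingual BERT, byte-level BPE,
FastText-style whitespace splitting, PolyLM, or ByT5), and view the resulting
tokens either as highlighted chips or as raw token strings, along with a
token count.
"""
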
import html

import gradio as gr
from transformers import AutoTokenizer, BertTokenizer
from tokenizers import ByteLevelBPETokenizer

# Only tokenizers are needed here: PolyLM and ByT5 are used for tokenization
# alone, and the FastText option splits on whitespace, so no model weights
# are loaded.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
mbert_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
polylm_tokenizer = AutoTokenizer.from_pretrained("DAMO-NLP-MT/polylm-1.7b")
byt5_tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")

def process_text(input_text, show_tokens, tokenizer_type, display_mode):
    """Tokenize the input with the selected tokenizer and format the result."""
    if not input_text:
        return "", 0

    tokens = []
    if tokenizer_type == "BERT":
        tokens = bert_tokenizer.tokenize(input_text)
    elif tokenizer_type == "Multilingual BERT":
        tokens = mbert_tokenizer.tokenize(input_text)
    elif tokenizer_type == "BPE":
        # Train a fresh byte-level BPE tokenizer on the input itself, so each
        # call is independent; this only illustrates how merges form on this text.
        bpe_tokenizer = ByteLevelBPETokenizer()
        bpe_tokenizer.train_from_iterator([input_text], vocab_size=1000, min_frequency=1)
        tokens = bpe_tokenizer.encode(input_text).tokens
    elif tokenizer_type == "FastText":
        # FastText builds subword vectors from whitespace-separated words,
        # so plain splitting shows its input units.
        tokens = input_text.split()
    elif tokenizer_type == "PolyLM":
        tokens = polylm_tokenizer.tokenize(input_text)
    elif tokenizer_type == "ByT5":
        tokens = byt5_tokenizer.tokenize(input_text)

    token_count = len(tokens)

    if display_mode == "Tokens":
        if show_tokens:
            # Render each token as a colored chip; escape the token text so it
            # cannot break the surrounding HTML.
            token_html = ""
            for idx, token in enumerate(tokens):
                color = f"hsl({(idx * 50) % 360}, 70%, 40%)"
                token_html += (
                    f'<span style="background-color:{color}; padding:2px; '
                    f'border-radius:5px; color: black;">{html.escape(token)}</span> '
                )
            return token_html, token_count
        return html.escape(" ".join(tokens)), token_count

    # "Token Values" mode: show the raw token strings as a Python list.
    return html.escape(str(tokens)), token_count

with gr.Blocks() as demo:
    gr.Markdown("# Tokenizer Explorer")
    gr.Markdown("Choose a tokenizer and see how your text is tokenized. Toggle 'Show Tokens' to view highlighted tokens.")

    with gr.Row():
        input_text = gr.Textbox(label="Input Text", placeholder="Type your text here...", lines=5)
        output_display = gr.HTML(label="Output Display")

    with gr.Row():
        token_count_display = gr.Number(label="Number of Tokens", value=0, interactive=False)

    tokenizer_type = gr.Radio(
        ["BERT", "Multilingual BERT", "BPE", "FastText", "PolyLM", "ByT5"],
        label="Choose Tokenizer",
        value="BERT",
    )
    display_mode = gr.Radio(
        ["Tokens", "Token Values"],
        label="Display Mode",
        value="Tokens",
    )
    show_tokens = gr.Checkbox(label="Show Tokens", value=True)

    # Re-tokenize whenever the text or any option changes.
    controls = [input_text, show_tokens, tokenizer_type, display_mode]
    for control in controls:
        control.change(
            fn=process_text,
            inputs=controls,
            outputs=[output_display, token_count_display],
        )

demo.launch()