File size: 3,473 Bytes
d798650
 
 
 
1d1780a
 
 
 
 
 
 
 
 
 
 
 
 
 
d798650
 
a0e97a1
d798650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0e97a1
d798650
 
a0e97a1
d798650
1d1780a
d798650
deb8701
d798650
a0e97a1
d798650
 
 
a0e97a1
d798650
a0e97a1
 
d798650
2adc116
8a4996e
85a1a43
2adc116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d798650
1d1780a
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import html
import random

import gradio as gr
import tiktoken

# License Information
# This application uses the following open-source libraries:
#
# 1. Gradio:
#    - License: Apache License 2.0
#    - Copyright: 2020-2023, Gradio contributors
#    - Full License: http://www.apache.org/licenses/LICENSE-2.0
#
# 2. tiktoken:
#    - License: MIT License
#    - Copyright: 2022, OpenAI, Shantanu Jain
#    - Full License: https://opensource.org/licenses/MIT


# Resolve each model's tokenizer once at import time; the module-level
# encoders are then reused by every tokenization request.
enc_gpt4o = tiktoken.encoding_for_model(model_name="gpt-4o")
enc_gpt4 = tiktoken.encoding_for_model(model_name="gpt-4")

def get_color_mapping(tokens):
    """Assign a random hex color to every distinct token.

    Args:
        tokens: Iterable of hashable token values; duplicates are allowed.

    Returns:
        A dict mapping each distinct token to a ``#RRGGBB`` string built from
        six randomly chosen uppercase hexadecimal digits.
    """
    hex_digits = '0123456789ABCDEF'
    palette = {}
    # De-duplicate first so that repeated tokens share a single color.
    for distinct_token in set(tokens):
        digits = [random.choice(hex_digits) for _ in range(6)]
        palette[distinct_token] = "#" + ''.join(digits)
    return palette

def process_model(text, encoder, model_name):
    """Tokenize ``text`` and render the result as a color-coded HTML fragment.

    Args:
        text: The raw input string to tokenize.
        encoder: Tokenizer object exposing ``encode(text)`` (returning a
            sequence of token ids) and ``decode(list_of_ids)`` (returning a
            string).
        model_name: Display name used in the section headings.

    Returns:
        An HTML string containing, in order: the model heading, the token
        count, the decoded tokens, and the token ids. A token and its id are
        rendered in the same color.
    """
    token_ids = encoder.encode(text)
    # Decode one id at a time so each token can be shown as a separate span.
    tokens = [encoder.decode([token_id]) for token_id in token_ids]
    num_tokens = len(tokens)

    color_mapping = get_color_mapping(tokens)

    # The decoded token text comes straight from user input and is placed
    # inside HTML, so it must be escaped; otherwise input such as "<b>" or
    # "&amp;" would be interpreted as markup instead of being displayed.
    safe_model_name = html.escape(str(model_name))
    modelname_html = f'<h2>{safe_model_name}</h2>'

    tokens_colored = [
        f'<span style="color:{color_mapping[token]}; font-weight: bold;">{html.escape(token)}</span>'
        for token in tokens
    ]
    # Colors are keyed by token text, so an id reuses the color of its token.
    token_ids_colored = [
        f'<span style="color:{color_mapping[token]}; font-weight: bold;">{token_id}</span>'
        for token, token_id in zip(tokens, token_ids)
    ]

    tokens_html = f'<h3>{safe_model_name} Tokens</h3>' + ' '.join(tokens_colored)
    num_tokens_html = f'<h3>Number of Tokens: <span style="font-size: 20px; font-weight: bold;">{num_tokens}</span></h3>'
    token_ids_html = f'<h3>{safe_model_name} Token IDs</h3>' + ' '.join(token_ids_colored)

    return modelname_html + num_tokens_html + tokens_html + token_ids_html

def tokenize_input(text):
    """Build the three HTML fragments displayed for the given input text.

    Args:
        text: The string to tokenize.

    Returns:
        A 3-tuple ``(num_chars_html, gpt4o_html, gpt4_html)``: the character
        count heading followed by the ``process_model`` output for the GPT-4o
        and the GPT-4 encoders, in that order.
    """
    # Run the models in a fixed order: GPT-4o first, then GPT-4.
    per_model_html = [
        process_model(text, model_encoder, display_name)
        for model_encoder, display_name in ((enc_gpt4o, "GPT-4o"), (enc_gpt4, "GPT-4"))
    ]
    char_count_html = (
        '<h2>Number of Characters: '
        f'<span style="font-size: 20px; font-weight: bold;">{len(text)}</span></h2>'
    )
    return (char_count_html, *per_model_html)


# Build the UI: a text input with a character-count panel beside it, and one
# HTML panel per tokenizer underneath.
with gr.Blocks() as demo:
    gr.Markdown("## GPT4o vs GPT4 Token Comparison")
    with gr.Row():
        input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter text to tokenize and compare results between GPT-4o and GPT-4 tokenizers.")
        num_chars_output = gr.HTML()
    with gr.Row():
        gpt4o_output = gr.HTML(label="GPT-4o")
        gpt4_output = gr.HTML(label="GPT-4")

    # Both the change and the submit events of the textbox run the same
    # handler and refresh all three output panels.
    input_text.change(tokenize_input, inputs=[input_text], outputs=[num_chars_output, gpt4o_output, gpt4_output])
    input_text.submit(tokenize_input, inputs=[input_text], outputs=[num_chars_output, gpt4o_output, gpt4_output])

    # Sub-bullets are indented three spaces past their numbered item: Markdown
    # only nests a sub-list when it is indented beyond the parent's marker.
    gr.Markdown("""
        <hr>

        ### License Information
        This application uses the following open-source libraries:

        1. **Gradio**:
           - License: Apache License 2.0
           - Copyright: 2020-2023, Gradio contributors
           - Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
           - Repository: [Gradio GitHub](https://github.com/gradio-app/gradio/)

        2. **tiktoken**:
           - License: MIT License
           - Copyright: 2022, OpenAI, Shantanu Jain
           - Full License: [MIT License](https://opensource.org/licenses/MIT)
           - Repository: [tiktoken GitHub](https://github.com/openai/tiktoken)
        """)


# Launch the app only when this file is executed as a script, so that
# importing the module (e.g. to reuse `demo` or the helper functions) does
# not start a server as a side effect.
if __name__ == "__main__":
    demo.launch()