|
import html
import random

import gradio as gr
import tiktoken
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Tokenizer encodings for the two models being compared. They are looked up
# once at import time and reused by every call to tokenize_input().
_COMPARED_MODELS = ("gpt-4o", "gpt-4")
enc_gpt4o, enc_gpt4 = map(tiktoken.encoding_for_model, _COMPARED_MODELS)
|
|
|
def get_color_mapping(tokens):
    """Assign a random hex colour to every distinct token.

    Args:
        tokens: Iterable of hashable tokens. Duplicates are collapsed, so each
            distinct token receives exactly one colour.

    Returns:
        A dict mapping each distinct token to a ``"#RRGGBB"`` string whose six
        uppercase hex digits are drawn independently with ``random.choice``.
        Two tokens may end up with the same colour.
    """
    hex_digits = '0123456789ABCDEF'
    palette = {}
    for token in set(tokens):
        digits = [random.choice(hex_digits) for _ in range(6)]
        palette[token] = "#" + ''.join(digits)
    return palette
|
|
|
def process_model(text, encoder, model_name):
    """Tokenize *text* with *encoder* and render the result as an HTML fragment.

    Args:
        text: String to tokenize. In this file it comes straight from the UI
            textbox, so it is treated as untrusted.
        encoder: Object exposing ``encode(text)`` returning a sequence of token
            ids and ``decode(ids)`` returning a string.
        model_name: Display name used in the headings. It is inserted as-is,
            so callers must pass trusted text.

    Returns:
        A single HTML string made of, in order: an ``<h2>`` with the model
        name, the token count, the coloured token texts, and the coloured
        token ids. A token's text and its id share the same colour.
    """
    token_ids = encoder.encode(text)
    # Decode one id at a time so every token keeps its own piece of text.
    tokens = [encoder.decode([token_id]) for token_id in token_ids]
    num_tokens = len(tokens)

    color_mapping = get_color_mapping(tokens)

    tokens_colored = []
    token_ids_colored = []
    for token, token_id in zip(tokens, token_ids):
        span_open = f'<span style="color:{color_mapping[token]}; font-weight: bold;">'
        # Token text is user input: escape it so markup such as "<b>" or
        # "<script>" is displayed literally instead of being interpreted.
        tokens_colored.append(f'{span_open}{html.escape(token)}</span>')
        token_ids_colored.append(f'{span_open}{token_id}</span>')

    modelname_html = f'<h2>{model_name}</h2>'
    num_tokens_html = f'<h3>Number of Tokens: <span style="font-size: 20px; font-weight: bold;">{num_tokens}</span></h3>'
    tokens_html = f'<h3>{model_name} Tokens</h3>' + ' '.join(tokens_colored)
    token_ids_html = f'<h3>{model_name} Token IDs</h3>' + ' '.join(token_ids_colored)

    return modelname_html + num_tokens_html + tokens_html + token_ids_html
|
|
|
def tokenize_input(text):
    """Build the HTML fragments shown for *text*.

    Args:
        text: The string to analyse.

    Returns:
        A 3-tuple ``(num_chars_html, gpt4o_html, gpt4_html)``: a heading with
        the character count of *text*, followed by the process_model() output
        for the GPT-4o encoding and for the GPT-4 encoding.
    """
    comparisons = ((enc_gpt4o, "GPT-4o"), (enc_gpt4, "GPT-4"))
    rendered = [process_model(text, encoder, label) for encoder, label in comparisons]

    char_count_html = (
        '<h2>Number of Characters: '
        f'<span style="font-size: 20px; font-weight: bold;">{len(text)}</span></h2>'
    )
    return (char_count_html, *rendered)
|
|
|
|
|
# Assemble the Gradio UI: a title, the input textbox with the character-count
# output, two HTML outputs (one per tokenizer), the event wiring, and a
# license footer.
# NOTE(review): the indentation of this section was reconstructed; confirm
# whether num_chars_output belongs inside the first gr.Row() or directly
# under gr.Blocks().
with gr.Blocks() as demo:
    gr.Markdown("## GPT4o vs GPT4 Token Comparison")
    with gr.Row():
        input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter text to tokenize and compare results between GPT-4o and GPT-4 tokenizers.")
        num_chars_output = gr.HTML()
    with gr.Row():
        gpt4o_output = gr.HTML(label="GPT-4o")
        gpt4_output = gr.HTML(label="GPT-4")

    # The same callback is registered for both the textbox's change and submit
    # events; its three return values fill the three HTML outputs in order.
    input_text.change(tokenize_input, inputs=[input_text], outputs=[num_chars_output, gpt4o_output, gpt4_output])
    input_text.submit(tokenize_input, inputs=[input_text], outputs=[num_chars_output, gpt4o_output, gpt4_output])

    # Footer listing the licenses of the libraries this app imports.
    gr.Markdown("""

<hr>



### License Information

This application uses the following open-source libraries:



1. **Gradio**:

- License: Apache License 2.0

- Copyright: 2020-2023, Gradio contributors

- Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)

- Repository: [Gradio GitHub](https://github.com/gradio-app/gradio/)



2. **tiktoken**:

- License: MIT License

- Copyright: 2022, OpenAI, Shantanu Jain

- Full License: [MIT License](https://opensource.org/licenses/MIT)

- Repository: [tiktoken GitHub](https://github.com/openai/tiktoken)

""")


# Start the app when this module is executed.
demo.launch()
|
|
|
|
|
|