Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import tiktoken | |
| import random | |
| # License Information | |
| # This application uses the following open-source libraries: | |
| # | |
| # 1. Gradio: | |
| # - License: Apache License 2.0 | |
| # - Copyright: 2020-2023, Gradio contributors | |
| # - Full License: http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # 2. tiktoken: | |
| # - License: MIT License | |
| # - Copyright: 2022, OpenAI, Shantanu Jain | |
| # - Full License: https://opensource.org/licenses/MIT | |
# Load the tokenizers once at module level; tokenize_input reuses them on
# every call instead of looking the encodings up again.
enc_gpt4o, enc_gpt3_5turbo = (
    tiktoken.encoding_for_model(model_id)
    for model_id in ("gpt-4o", "gpt-3.5-turbo")
)
def get_color_mapping(tokens):
    """Assign a random hex color to every distinct token.

    Args:
        tokens: Iterable of hashable tokens; duplicates share one color.

    Returns:
        dict mapping each distinct token to a ``#XXXXXX`` string made of six
        uppercase hexadecimal digits picked with ``random.choice``.
    """
    hex_digits = '0123456789ABCDEF'
    palette = {}
    # Deduplicate first so each distinct token draws exactly one color.
    for token in set(tokens):
        digits = [random.choice(hex_digits) for _ in range(6)]
        palette[token] = "#" + ''.join(digits)
    return palette
def process_model(text, encoder, model_name):
    """Tokenize *text* with *encoder* and render the result as an HTML fragment.

    Args:
        text: The user-supplied string to tokenize.
        encoder: A tiktoken encoding; must provide ``encode(text,
            disallowed_special=...)`` and ``decode(list_of_ids)``.
        model_name: Display label used in the headings. It is inserted into
            the HTML as-is, so callers should pass a trusted label.

    Returns:
        A single HTML string containing, in order: the model heading, the
        token count, the decoded tokens, and the token IDs. A token and its
        ID are rendered in the same color.
    """
    # Local import keeps this function self-contained.
    from html import escape

    # disallowed_special=() makes tiktoken encode special-token strings such
    # as "<|endoftext|>" as ordinary text; with the default setting, typing
    # one of them into the text box raises ValueError.
    token_ids = encoder.encode(text, disallowed_special=())
    # Decode each ID on its own so every token can be shown separately.
    tokens = [encoder.decode([token_id]) for token_id in token_ids]
    num_tokens = len(tokens)
    color_mapping = get_color_mapping(tokens)

    def colored(token, content):
        # Colors are keyed by the raw token text, so a token and its ID match.
        return f'<span style="color:{color_mapping[token]}; font-weight: bold;">{content}</span>'

    # Escape the token text: it comes straight from user input, and unescaped
    # "<", ">" or "&" would be interpreted as markup instead of displayed.
    tokens_colored = [colored(token, escape(token)) for token in tokens]
    token_ids_colored = [
        colored(token, token_id) for token, token_id in zip(tokens, token_ids)
    ]

    modelname_html = f'<h2>{model_name}</h2>'
    num_tokens_html = f'<h3>Number of Tokens: <span style="font-size: 20px; font-weight: bold;">{num_tokens}</span></h3>'
    tokens_html = f'<h3>{model_name} Tokens</h3>' + ' '.join(tokens_colored)
    token_ids_html = f'<h3>{model_name} Token IDs</h3>' + ' '.join(token_ids_colored)
    return modelname_html + num_tokens_html + tokens_html + token_ids_html
def tokenize_input(text):
    """Build the three HTML fragments shown for one input string.

    Args:
        text: The string to tokenize with both tokenizers.

    Returns:
        A 3-tuple ``(character_count_html, gpt4o_html, gpt35turbo_html)``.
    """
    # Tokenize first, in the same order as the output panes.
    html_for_gpt4o = process_model(text, enc_gpt4o, "GPT-4o")
    html_for_gpt35 = process_model(text, enc_gpt3_5turbo, "GPT-3.5-turbo")

    char_count_html = (
        '<h2>Number of Characters: <span style="font-size: 20px; font-weight: bold;">'
        f'{len(text)}'
        '</span></h2>'
    )
    return char_count_html, html_for_gpt4o, html_for_gpt35
# Build the UI: one text box whose content is tokenized by both tokenizers,
# with the two results placed in the same row.
with gr.Blocks() as demo:
    gr.Markdown("## GPT4o vs GPT3.5 Token Comparison")
    with gr.Row():
        input_text = gr.Textbox(
            lines=2,
            placeholder="Enter text here...",
            label="Enter text to tokenize and compare results between GPT-4o and GPT-3.5-turbo tokenizers.",
        )
    # NOTE(review): the original indentation was lost; confirm whether this
    # character-count pane belongs inside the row above or below it as here.
    num_chars_output = gr.HTML()
    with gr.Row():
        gpt4o_output = gr.HTML(label="GPT-4o")
        gpt35turbo_output = gr.HTML(label="GPT-3.5-turbo")

    # Both events refresh the same three panes, in the order tokenize_input
    # returns its values.
    result_panes = [num_chars_output, gpt4o_output, gpt35turbo_output]
    input_text.change(tokenize_input, inputs=[input_text], outputs=result_panes)
    input_text.submit(tokenize_input, inputs=[input_text], outputs=result_panes)

    gr.Markdown("""
<hr>

### License Information

This application uses the following open-source libraries:

1. **Gradio**:
   - License: Apache License 2.0
   - Copyright: 2020-2023, Gradio contributors
   - Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
   - Repository: [Gradio GitHub](https://github.com/gradio-app/gradio/)
2. **tiktoken**:
   - License: MIT License
   - Copyright: 2022, OpenAI, Shantanu Jain
   - Full License: [MIT License](https://opensource.org/licenses/MIT)
   - Repository: [tiktoken GitHub](https://github.com/openai/tiktoken)
""")

# Launch the app
demo.launch()