# Token Counter for JSON/JSONL datasets — Gradio app (Hugging Face Space).
# Standard library
import json
import os

# Third-party
import gradio as gr
import tiktoken
# Function to count tokens in the dataset based on the "messages" field
def count_tokens(json_file, encoding_name):
    """Count tokens per conversation in a JSON/JSONL dataset.

    Args:
        json_file: Uploaded file object (Gradio ``File``) exposing ``.name``,
            the path to a ``.json`` file (a list of entries) or a ``.jsonl``
            file (one JSON entry per line). Each entry may carry a
            ``"messages"`` list of dicts with a ``"content"`` string.
        encoding_name: Name of a tiktoken encoding, e.g. ``"cl100k_base"``.

    Returns:
        Tuple ``(token_counts, total_token_count)``: ``token_counts`` is a
        list of dicts with keys ``'conversation'`` (message contents joined
        with spaces) and ``'token_count'``; ``total_token_count`` is the sum
        over all conversations.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # Load the JSON or JSONL data. Open explicitly as UTF-8 so parsing does
    # not depend on the platform's default locale encoding.
    with open(json_file.name, 'r', encoding='utf-8') as f:
        if json_file.name.endswith('.json'):
            data = json.load(f)
        else:
            # JSONL: one object per line. Skip blank lines — a trailing
            # newline at EOF would otherwise raise JSONDecodeError.
            data = [json.loads(line) for line in f if line.strip()]
    total_token_count = 0
    token_counts = []
    for entry in data:
        conversation_token_count = 0
        conversation_texts = []
        # Entries without a "messages" key still yield a 0-token record,
        # keeping one output row per input entry.
        if "messages" in entry:
            for message in entry["messages"]:
                content = message.get("content", "")
                conversation_texts.append(content)
                conversation_token_count += len(encoding.encode(content))
        # Add conversation token count to the total
        total_token_count += conversation_token_count
        token_counts.append({
            'conversation': ' '.join(conversation_texts),
            'token_count': conversation_token_count
        })
    return token_counts, total_token_count
# Gradio interface function
def token_counter(json_file, encoding_with_model):
    """Gradio callback: tokenize the uploaded dataset and report counts.

    The dropdown value looks like ``"cl100k_base (gpt-4, ...)"`` — only the
    first whitespace-separated word is the actual encoding name, so split it
    off before delegating to :func:`count_tokens`.

    Returns:
        ``(per_conversation_counts, total_count)`` straight from
        :func:`count_tokens`.
    """
    # Split encoding name and model type from the dropdown input
    encoding_name = encoding_with_model.split()[0]
    # Get token counts and pass them through unchanged.
    return count_tokens(json_file, encoding_name)
# Dropdown choices: each tiktoken encoding labelled with the OpenAI models
# that use it (token_counter reads only the first word as the encoding name).
encoding_options = [
    "o200k_base (gpt-4o, gpt-4o-mini)",
    "cl100k_base (gpt-4-turbo, gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large)",
    "p50k_base (Codex models, text-davinci-002, text-davinci-003)",
    "r50k_base (GPT-3 models like davinci)",
]
# Gradio UI setup: build the Blocks layout, wire the submit button, launch.
with gr.Blocks() as app:
    gr.Markdown("# Token Counter for JSON/JSONL Datasets")

    with gr.Row():
        json_input = gr.File(label="Upload JSON/JSONL File")
        encoding_dropdown = gr.Dropdown(
            choices=encoding_options,
            label="Select Encoding",
            value="o200k_base (gpt-4o, gpt-4o-mini)",
        )

    # Example file (this will automatically upload when clicked)
    example_file_path = "keivalyaMedQuad-MedicalQnADataset_valid.jsonl"
    if os.path.exists(example_file_path):
        gr.Examples(
            examples=[example_file_path],
            inputs=json_input,
            label="Click here to load the example file",
        )

    # Display credits for the dataset author
    gr.Markdown("### Dataset Credits")
    gr.Markdown(
        """
This dataset is provided by the [MedQuad-MedicalQnADataset](https://huggingface.co/datasets/keivalya/MedQuad-MedicalQnADataset) on Hugging Face.
All credit goes to the original creator, [keivalya](https://huggingface.co/keivalya).
"""
    )

    # Output for individual conversation token counts
    conversation_output = gr.JSON(label="Token Counts per Conversation")
    # Output for total token count
    total_output = gr.Number(label="Total Token Count", interactive=False)

    # Add a submit button to trigger token counting
    submit_button = gr.Button("Submit")
    # Link the button click event to the token counting function
    submit_button.click(
        token_counter,
        [json_input, encoding_dropdown],
        [conversation_output, total_output],
    )

# Launch the app
app.launch()