# token_counter / app.py
# Hugging Face Space by luminoussg (commit 51a3da8)
import gradio as gr
import tiktoken
import json
import os
def count_tokens(json_file, encoding_name):
    """Count tiktoken tokens per conversation in a JSON/JSONL dataset.

    Args:
        json_file: Uploaded file object exposing a ``.name`` path attribute
            (e.g. a Gradio ``File``). The file is either a JSON array of
            entries (``.json``) or one JSON object per line (JSONL).
        encoding_name: tiktoken encoding name, e.g. ``"o200k_base"``.

    Returns:
        Tuple ``(token_counts, total_token_count)`` where ``token_counts`` is
        a list of ``{'conversation': str, 'token_count': int}`` dicts (one per
        entry) and ``total_token_count`` is the sum over all entries.
    """
    encoding = tiktoken.get_encoding(encoding_name)

    # Read explicitly as UTF-8 so results don't depend on the platform's
    # default codec (datasets are commonly UTF-8 with non-ASCII text).
    with open(json_file.name, 'r', encoding='utf-8') as f:
        if json_file.name.endswith('.json'):
            data = json.load(f)
        else:
            # JSONL: one object per line; skip blank lines so a trailing
            # newline doesn't crash json.loads.
            data = [json.loads(line) for line in f if line.strip()]

    total_token_count = 0
    token_counts = []
    for entry in data:
        conversation_token_count = 0
        conversation_texts = []
        # Entries without a "messages" field contribute an empty conversation
        # with a zero count, matching the original behavior.
        if "messages" in entry:
            for message in entry["messages"]:
                # `or ""` also guards an explicit "content": null, which
                # encoding.encode() would otherwise reject.
                content = message.get("content") or ""
                conversation_texts.append(content)
                conversation_token_count += len(encoding.encode(content))
        total_token_count += conversation_token_count
        token_counts.append({
            'conversation': ' '.join(conversation_texts),
            'token_count': conversation_token_count
        })
    return token_counts, total_token_count
def token_counter(json_file, encoding_with_model):
    """Gradio callback: tokenize the uploaded dataset with the chosen encoding.

    The dropdown value looks like ``"o200k_base (gpt-4o, gpt-4o-mini)"``;
    only the leading encoding name is forwarded to :func:`count_tokens`.
    """
    encoding_name, _, _model_info = encoding_with_model.partition(' ')
    return count_tokens(json_file, encoding_name)
# Dropdown choices: each tiktoken encoding paired with the models that use it,
# rendered as "<encoding> (<models>)".
_ENCODING_MODELS = [
    ("o200k_base", "gpt-4o, gpt-4o-mini"),
    ("cl100k_base", "gpt-4-turbo, gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large"),
    ("p50k_base", "Codex models, text-davinci-002, text-davinci-003"),
    ("r50k_base", "GPT-3 models like davinci"),
]
encoding_options = [f"{name} ({models})" for name, models in _ENCODING_MODELS]
# Build the Gradio UI: inputs on top, example loader, credits, then outputs.
with gr.Blocks() as app:
    gr.Markdown("# Token Counter for JSON/JSONL Datasets")

    with gr.Row():
        file_upload = gr.File(label="Upload JSON/JSONL File")
        encoding_choice = gr.Dropdown(
            choices=encoding_options,
            label="Select Encoding",
            value="o200k_base (gpt-4o, gpt-4o-mini)",
        )

    # Offer the bundled sample dataset only when it ships alongside the app.
    sample_path = "keivalyaMedQuad-MedicalQnADataset_valid.jsonl"
    if os.path.exists(sample_path):
        gr.Examples(
            examples=[sample_path],
            inputs=file_upload,
            label="Click here to load the example file",
        )

    # Credit the original dataset author.
    gr.Markdown("### Dataset Credits")
    gr.Markdown(
        """
    This dataset is provided by the [MedQuad-MedicalQnADataset](https://huggingface.co/datasets/keivalya/MedQuad-MedicalQnADataset) on Hugging Face.
    All credit goes to the original creator, [keivalya](https://huggingface.co/keivalya).
    """
    )

    # Per-conversation breakdown plus the grand total.
    per_conversation_view = gr.JSON(label="Token Counts per Conversation")
    total_count_view = gr.Number(label="Total Token Count", interactive=False)

    run_button = gr.Button("Submit")
    run_button.click(
        token_counter,
        [file_upload, encoding_choice],
        [per_conversation_view, total_count_view],
    )

# Launch the app
app.launch()