# Token Counter for JSON/JSONL datasets — Gradio app (Hugging Face Space).
# Standard library
import json
import os

# Third-party
import gradio as gr
import tiktoken
# Function to count tokens in the dataset based on the "messages" field
def count_tokens(json_file, encoding_name):
    """Count tokens per conversation in a JSON/JSONL dataset.

    Args:
        json_file: Uploaded file object (Gradio ``File``) exposing ``.name``,
            the path to a ``.json`` file (a list of entries) or a ``.jsonl``
            file (one JSON entry per line). Each entry may carry a
            ``"messages"`` list of dicts with a ``"content"`` string.
        encoding_name: Name of a tiktoken encoding, e.g. ``"cl100k_base"``.

    Returns:
        Tuple ``(token_counts, total_token_count)``: ``token_counts`` is a
        list of dicts with keys ``'conversation'`` (message contents joined
        with spaces) and ``'token_count'``; ``total_token_count`` is the sum
        over all conversations.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # Load the JSON or JSONL data. Open explicitly as UTF-8 so parsing does
    # not depend on the platform's default locale encoding.
    with open(json_file.name, 'r', encoding='utf-8') as f:
        if json_file.name.endswith('.json'):
            data = json.load(f)
        else:
            # JSONL: one object per line. Skip blank lines — a trailing
            # newline at EOF would otherwise raise JSONDecodeError.
            data = [json.loads(line) for line in f if line.strip()]
    total_token_count = 0
    token_counts = []
    for entry in data:
        conversation_token_count = 0
        conversation_texts = []
        # Entries without a "messages" key still yield a 0-token record,
        # keeping one output row per input entry.
        if "messages" in entry:
            for message in entry["messages"]:
                content = message.get("content", "")
                conversation_texts.append(content)
                conversation_token_count += len(encoding.encode(content))
        # Add conversation token count to the total
        total_token_count += conversation_token_count
        token_counts.append({
            'conversation': ' '.join(conversation_texts),
            'token_count': conversation_token_count
        })
    return token_counts, total_token_count
# Gradio interface function
def token_counter(json_file, encoding_with_model):
    """Gradio callback: tokenize the uploaded dataset and report counts.

    The dropdown value looks like ``"cl100k_base (gpt-4, ...)"`` — only the
    first whitespace-separated word is the actual encoding name, so split it
    off before delegating to :func:`count_tokens`.

    Returns:
        ``(per_conversation_counts, total_count)`` straight from
        :func:`count_tokens`.
    """
    # Split encoding name and model type from the dropdown input
    encoding_name = encoding_with_model.split()[0]
    # Get token counts and pass them through unchanged.
    return count_tokens(json_file, encoding_name)
# Dropdown choices: each tiktoken encoding labelled with the OpenAI models
# that use it (token_counter reads only the first word as the encoding name).
encoding_options = [
    "o200k_base (gpt-4o, gpt-4o-mini)",
    "cl100k_base (gpt-4-turbo, gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large)",
    "p50k_base (Codex models, text-davinci-002, text-davinci-003)",
    "r50k_base (GPT-3 models like davinci)",
]
# Gradio UI setup: build the Blocks layout, wire the submit button, launch.
with gr.Blocks() as app:
    gr.Markdown("# Token Counter for JSON/JSONL Datasets")

    with gr.Row():
        json_input = gr.File(label="Upload JSON/JSONL File")
        encoding_dropdown = gr.Dropdown(
            choices=encoding_options,
            label="Select Encoding",
            value="o200k_base (gpt-4o, gpt-4o-mini)",
        )

    # Example file (this will automatically upload when clicked)
    example_file_path = "keivalyaMedQuad-MedicalQnADataset_valid.jsonl"
    if os.path.exists(example_file_path):
        gr.Examples(
            examples=[example_file_path],
            inputs=json_input,
            label="Click here to load the example file",
        )

    # Display credits for the dataset author
    gr.Markdown("### Dataset Credits")
    gr.Markdown(
        """
This dataset is provided by the [MedQuad-MedicalQnADataset](https://huggingface.co/datasets/keivalya/MedQuad-MedicalQnADataset) on Hugging Face.
All credit goes to the original creator, [keivalya](https://huggingface.co/keivalya).
"""
    )

    # Output for individual conversation token counts
    conversation_output = gr.JSON(label="Token Counts per Conversation")
    # Output for total token count
    total_output = gr.Number(label="Total Token Count", interactive=False)

    # Add a submit button to trigger token counting
    submit_button = gr.Button("Submit")
    # Link the button click event to the token counting function
    submit_button.click(
        token_counter,
        [json_input, encoding_dropdown],
        [conversation_output, total_output],
    )

# Launch the app
app.launch()