import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer


def ReturnTokens(dataset_name, tokenizer_name="openai-community/gpt2", split="train"):
    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Load dataset
    dataset = load_dataset(dataset_name, split=split)

    # Count tokens in one column: the tokenizer returns a BatchEncoding
    # whose "input_ids" entry holds one list of token ids per example.
    def count_tokens_in_column(column):
        return sum(len(ids) for ids in tokenizer(column)["input_ids"])

    # Sum token counts over every text column in the dataset, using the
    # first example to detect which fields hold strings.
    total_tokens = 0
    for field, value in dataset[0].items():
        if isinstance(value, str):
            total_tokens += count_tokens_in_column(dataset[field])
    return total_tokens


with gr.Blocks(title="Dataset token counter") as app:
    gr.Markdown("# Token Counter")
    with gr.Row():
        prompt = gr.Textbox(label="Dataset", elem_id="dataset")
        tokenizer = gr.Textbox(
            label="Tokenizer",
            elem_id="tokenizer",
            placeholder="openai-community/gpt2",
            value="openai-community/gpt2",
        )
        split = gr.Textbox(
            label="Split (default: train)",
            elem_id="split",
            placeholder="train",
            value="train",
        )
    tokens = gr.Label(label="Tokens", elem_id="tokens")

    # Recompute the count whenever any of the three fields is submitted;
    # api_name exposes the handler as /run for programmatic access.
    gr.on(
        triggers=[prompt.submit, tokenizer.submit, split.submit],
        fn=ReturnTokens,
        inputs=[prompt, tokenizer, split],
        outputs=[tokens],
        api_name="run",
    )

app.launch()
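
# A minimal sketch of calling the /run endpoint programmatically, assuming
# the app is running locally on Gradio's default port and that the separate
# gradio_client package is installed ("imdb" is a hypothetical example
# dataset name, not something this app depends on):
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   total = client.predict(
#       "imdb", "openai-community/gpt2", "train", api_name="/run"
#   )
#   print(total)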