import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer


def ReturnTokens(dataset_name, tokenizer_name="openai-community/gpt2", split="train", progress=gr.Progress()):
    progress(0, desc="Starting")

    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Load the dataset
    dataset = load_dataset(dataset_name, split=split)
    progress(0, desc=f'Loaded "{dataset_name}"')

    total_tokens = 0
    # Tokenize every column of the dataset; skip non-string values
    # (e.g. integer labels), which tokenizer.tokenize() cannot handle.
    for field in dataset[0].keys():
        column = dataset[field]
        for value in progress.tqdm(column, desc=f'Tokenizing "{field}"'):
            if isinstance(value, str):
                total_tokens += len(tokenizer.tokenize(value))
    return total_tokens


with gr.Blocks(title="Dataset token counter") as app:
    gr.Markdown("# Token Counter")
    with gr.Row():
        dataset = gr.Textbox(label="Dataset", elem_id="dataset")
        tokenizer = gr.Textbox(
            label="Tokenizer",
            elem_id="tokenizer",
            placeholder="openai-community/gpt2",
            value="openai-community/gpt2",
        )
        split = gr.Textbox(label="Split (default: train)", elem_id="split", placeholder="train", value="train")
    tokens = gr.Label(label="Tokens", elem_id="tokens")

    # Count tokens when any of the three text boxes is submitted.
    gr.on(
        triggers=[dataset.submit, tokenizer.submit, split.submit],
        fn=ReturnTokens,
        inputs=[dataset, tokenizer, split],
        outputs=[tokens],
        api_name="run",
    )

app.launch()
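
# Because gr.on() registers the handler with api_name="run", the app can also be
# queried programmatically. A minimal sketch with the standard gradio_client
# library, assuming the app is running locally on Gradio's default port 7860
# and using "imdb" purely as an example dataset id:
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   total = client.predict("imdb", "openai-community/gpt2", "train", api_name="/run")
#   print(total)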