# Dataset-Tokens / app.py
import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer
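# Requires the gradio, datasets, and transformers packages
# (e.g. pip install gradio datasets transformers).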
def ReturnTokens(dataset_name, tokenizer_name="openai-community/gpt2", split="train", progress=gr.Progress()):
    progress(0, desc="Starting")
    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # Load the requested split of the dataset
    dataset = load_dataset(dataset_name, split=split)
    progress(0, desc='Loaded "{}"'.format(dataset_name))
    total_tokens = 0
    # Sum token counts over every column in the dataset
    for field in dataset.column_names:
        for value in progress.tqdm(dataset[field], desc=f'Tokenizing "{field}"'):
            if isinstance(value, str):  # skip non-text columns (labels, ids, ...)
                total_tokens += len(tokenizer.tokenize(value))
    return total_tokens
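
# A sketch of direct use (assumes the Hub is reachable and the dataset exists;
# "imdb" here is just an illustrative dataset name):
#
#   ReturnTokens("imdb", tokenizer_name="openai-community/gpt2", split="test")
#
# would download the IMDB test split and return its total GPT-2 token count.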
with gr.Blocks(title="Dataset token counter") as app:
    gr.Markdown("# Token Counter")
    with gr.Row():
        prompt = gr.Textbox(label="Dataset", elem_id="dataset")
        tokenizer = gr.Textbox(label="Tokenizer", elem_id="tokenizer", placeholder="openai-community/gpt2", value="openai-community/gpt2")
        split = gr.Textbox(label="Split (default: train)", elem_id="split", placeholder="train", value="train")
    tokens = gr.Label(label="Tokens", elem_id="tokens")
    # Recompute the count whenever any of the three inputs is submitted
    gr.on(
        triggers=[prompt.submit, tokenizer.submit, split.submit],
        fn=ReturnTokens,
        inputs=[prompt, tokenizer, split],
        outputs=[tokens],
        api_name="run",
    )

app.launch()
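
# Because gr.on() registers api_name="run", the counter is also reachable
# programmatically. A minimal sketch, assuming the app is running locally on
# Gradio's default port and gradio_client is installed (the dataset name is
# illustrative):
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860")
#   result = client.predict("imdb", "openai-community/gpt2", "train", api_name="/run")
#   print(result)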