import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer

def count_tokens(dataset_name, tokenizer_name="openai-community/gpt2", split="train", progress=gr.Progress()):
    """Count the tokens in every string column of a Hugging Face dataset split."""
    progress(0, desc="Starting")

    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Load the requested split of the dataset
    dataset = load_dataset(dataset_name, split=split)

    progress(0, desc=f'Loaded "{dataset_name}"')

    total_tokens = 0

    # Walk every column, tokenizing string values and accumulating the counts
    for field in dataset.column_names:
        for value in progress.tqdm(dataset[field], desc=f'Tokenizing "{field}"'):
            # Skip non-string columns (labels, ids, ...) instead of crashing on them
            if isinstance(value, str):
                total_tokens += len(tokenizer.tokenize(value))

    return total_tokens
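
# A minimal usage sketch (the dataset name "imdb" is illustrative; any Hub
# dataset with text columns works the same way):
#
#   n = count_tokens("imdb", tokenizer_name="openai-community/gpt2", split="train")
#   print(f"{n} tokens")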

with gr.Blocks(title="Dataset token counter") as app:
    gr.Markdown("# Token Counter")

    with gr.Row():
        dataset_name = gr.Textbox(label="Dataset", elem_id="dataset", placeholder="user/dataset")
        tokenizer_name = gr.Textbox(label="Tokenizer", elem_id="tokenizer", placeholder="openai-community/gpt2", value="openai-community/gpt2")
        split = gr.Textbox(label="Split (default: train)", elem_id="split", placeholder="train", value="train")
        tokens = gr.Label(label="Tokens", elem_id="tokens")

    # Re-run the token count whenever any of the three text inputs is submitted
    gr.on(
        triggers=[
            dataset_name.submit,
            tokenizer_name.submit,
            split.submit,
        ],
        fn=count_tokens,
        inputs=[
            dataset_name,
            tokenizer_name,
            split,
        ],
        outputs=[tokens],
        api_name="run",
    )

app.launch()
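
# Because gr.on() registers the handler with api_name="run", the endpoint can
# also be called programmatically. A hypothetical client-side call (the URL is
# a local placeholder; requires the gradio_client package):
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860")
#   client.predict("imdb", "openai-community/gpt2", "train", api_name="/run")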