Boubou78000 committed on
Commit
c221aa8
1 Parent(s): be12dbf

Various bug fixes

Browse files
Files changed (1) hide show
  1. app.py +9 -13
app.py CHANGED
@@ -4,19 +4,15 @@ import gradio as gr
4
  from datasets import load_dataset
5
  from transformers import AutoTokenizer
6
 
7
- def ReturnTokens(dataset, tokenizer="openai-community/gpt2", split="train"):
8
- global tokens_
9
- tokenizer=AutoTokenizer.from_pretrained(tokenizer)
10
- dataset=load_dataset(dataset, split=split)
11
- tokens_=0
12
- def CountTokens(Example):
13
- global tokens_
14
- for k,i in enumerate(Example):
15
- tokens_+=len(tokenizer.tokenize(i))
16
- categories=[i for i in dataset[0].keys()]
17
- for cat in categories:
18
- CountTokens(dataset[cat])
19
- return tokens_
20
 
21
  with gr.Blocks(title="Dataset token counter") as app:
22
  gr.Markdown("# Token Counter")
 
4
  from datasets import load_dataset
5
  from transformers import AutoTokenizer
6
 
7
def ReturnTokens(dataset_name, tokenizer_name="openai-community/gpt2", split="train"):
    """Count the total number of tokens across all text columns of a dataset.

    Loads the tokenizer and the requested dataset split from the Hugging Face
    Hub, then sums the token counts of every string value in every column.

    Args:
        dataset_name: Hub identifier of the dataset to load.
        tokenizer_name: Hub identifier of the tokenizer (default: GPT-2).
        split: Which dataset split to count (default: "train").

    Returns:
        Total number of tokens over all string cells in the split.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    dataset = load_dataset(dataset_name, split=split)

    def count_tokens(values):
        # Tokenize only string cells; non-text columns (ints, floats, lists, ...)
        # would make tokenizer.tokenize raise, so they are skipped.
        return sum(len(tokenizer.tokenize(value)) for value in values if isinstance(value, str))

    # column_names reads the schema directly instead of materializing the
    # first example (dataset[0]) just to enumerate its keys.
    total_tokens = 0
    for column in dataset.column_names:
        total_tokens += count_tokens(dataset[column])
    return total_tokens
 
 
 
 
16
 
17
  with gr.Blocks(title="Dataset token counter") as app:
18
  gr.Markdown("# Token Counter")