Boubou78000 committed on
Commit
2c3f7c0
1 Parent(s): c221aa8

BUG FIXES + FASTER

Browse files
Files changed (1) hide show
  1. app.py +17 -5
app.py CHANGED
@@ -5,14 +5,26 @@ from datasets import load_dataset
5
  from transformers import AutoTokenizer
6
 
7
def ReturnTokens(dataset_name, tokenizer_name="openai-community/gpt2", split="train"):
    """Return the total number of tokens across all columns of a dataset split.

    Args:
        dataset_name: Hub id of the dataset to load.
        tokenizer_name: Hub id of the tokenizer to use (defaults to GPT-2).
        split: Which split of the dataset to count (defaults to "train").

    Returns:
        Total token count summed over every value of every column in the split.
    """
    tok = AutoTokenizer.from_pretrained(tokenizer_name)
    ds = load_dataset(dataset_name, split=split)

    grand_total = 0
    # Columns are taken from the first record; each ds[column] yields the
    # full list of values for that column.
    for column in ds[0].keys():
        for value in ds[column]:
            grand_total += len(tok.tokenize(value))
    return grand_total
 
16
 
17
  with gr.Blocks(title="Dataset token counter") as app:
18
  gr.Markdown("# Token Counter")
 
5
  from transformers import AutoTokenizer
6
 
7
def ReturnTokens(dataset_name, tokenizer_name="openai-community/gpt2", split="train"):
    """Count the total number of tokens across all columns of a dataset split.

    Args:
        dataset_name: Hub id of the dataset to load.
        tokenizer_name: Hub id of the tokenizer to use (defaults to GPT-2).
        split: Which split of the dataset to count (defaults to "train").

    Returns:
        Total token count summed over every value of every column in the split.
    """
    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Load dataset
    dataset = load_dataset(dataset_name, split=split)

    # Count tokens for one whole column at once (batch encoding is the
    # "FASTER" path this commit introduced).
    def count_tokens_in_column(values):
        encoded = tokenizer.batch_encode_plus(values)
        # BUG FIX: iterating a BatchEncoding yields its *keys*
        # ("input_ids", "attention_mask", ...), so `len(key)` summed the
        # lengths of the key strings — a content-independent constant —
        # instead of the number of tokens. Sum the per-example
        # input_ids lengths instead.
        return sum(len(ids) for ids in encoded["input_ids"])

    tokens_ = 0
    for field in dataset[0].keys():
        tokens_ += count_tokens_in_column(dataset[field])

    return tokens_
28
 
29
  with gr.Blocks(title="Dataset token counter") as app:
30
  gr.Markdown("# Token Counter")