Boubou78000 committed
Commit 619fcb8 • 1 Parent(s): 0a88a11

Slides 😁

Files changed (1): app.py +8 -11
app.py CHANGED

@@ -4,7 +4,9 @@ import gradio as gr
 from datasets import load_dataset
 from transformers import AutoTokenizer
 
-def ReturnTokens(dataset_name, tokenizer_name="openai-community/gpt2", split="train"):
+def ReturnTokens(dataset_name, tokenizer_name="openai-community/gpt2", split="train", progress=gr.Progress()):
+
+    progress(0, desc="Starting")
 
     # Initialize tokenizer
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
@@ -12,19 +14,14 @@ def ReturnTokens(dataset_name, tokenizer_name="openai-community/gpt2", split="tr
     # Load dataset
     dataset = load_dataset(dataset_name, split=split)
 
-    # Function to count tokens in a single example
-    def count_tokens_in_example(example):
-        total_tokens = 0
-        tokenized = tokenizer.batch_encode_plus(example)
-        for i in tokenized:
-            total_tokens+=len(i)
-        return total_tokens
-
     tokens_=0
 
     for field in dataset[0].keys():
-        tokens_+=count_tokens_in_example(dataset[field])
-
+        _all=dataset[field]
+
+        for i in progress.tqdm(_all, desc=f"Tokenizing \"{field}\""):
+            tokens_+=len(tokenizer.tokenize(i))
+
     return tokens_
 
 with gr.Blocks(title="Dataset token counter") as app:
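
The diff cuts off at the gr.Blocks header, so the event wiring that actually calls ReturnTokens is not shown. As a minimal sketch of how the new progress=gr.Progress() parameter comes into play (component names below are assumptions, not taken from the commit): when a function whose signature defaults a parameter to gr.Progress() is registered as an event handler, Gradio injects a live tracker at call time, and the progress.tqdm(...) loop above then drives a per-field progress bar in the browser.

with gr.Blocks(title="Dataset token counter") as app:
    # Hypothetical inputs/outputs; the real layout is outside this diff
    dataset_box = gr.Textbox(label="Dataset name")
    tokenizer_box = gr.Textbox(label="Tokenizer", value="openai-community/gpt2")
    split_box = gr.Textbox(label="Split", value="train")
    total_out = gr.Number(label="Total tokens")
    count_btn = gr.Button("Count tokens")
    # Gradio swaps the gr.Progress() default for a live tracker on each call
    count_btn.click(ReturnTokens, inputs=[dataset_box, tokenizer_box, split_box], outputs=total_out)

app.launch()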
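A note on the counting change itself: the removed count_tokens_in_example iterated over the BatchEncoding returned by batch_encode_plus, and iterating a BatchEncoding yields its keys ("input_ids", "attention_mask", ...), so it summed key-name lengths rather than token counts; the per-string tokenizer.tokenize loop fixes that, at the cost of one tokenizer call per example. A hypothetical batched variant (not part of this commit) that should return the same totals, since add_special_tokens=False matches what tokenize produces:

def count_tokens_batched(texts, tokenizer, batch_size=256):
    # Tokenize in chunks via the batched __call__ and sum the id lengths
    total = 0
    for start in range(0, len(texts), batch_size):
        encoded = tokenizer(texts[start:start + batch_size], add_special_tokens=False)
        total += sum(len(ids) for ids in encoded["input_ids"])
    return total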