Boubou78000 committed on
Commit
42d7cb6
1 Parent(s): dd8b6dd

Created app

Browse files
Files changed (1) hide show
  1. app.py +26 -7
app.py CHANGED
@@ -2,18 +2,21 @@ import token
2
  import tokenize
3
  import gradio as gr
4
  from datasets import load_dataset
5
- from tokenizers import Tokenizer
6
 
7
  def ReturnTokens(dataset, tokenizer="openai-community/gpt2", split="train"):
8
  global tokens_
9
- tokenizer=Tokenizer.from_pretrained(tokenizer)
10
- dataset=load_dataset(dataset)
11
  tokens_=0
12
  def CountTokens(Example):
13
  global tokens_
14
- for i in Example.values():
15
- tokens_+=len(Tokenizer.encode(i))
16
- dataset.map(CountTokens)
 
 
 
17
  return tokens_
18
 
19
  with gr.Blocks(title="Dataset token counter") as app:
@@ -30,4 +33,20 @@ with gr.Blocks(title="Dataset token counter") as app:
30
  outputs=[tokens]
31
  )
32
 
33
- app.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import tokenize
3
  import gradio as gr
4
  from datasets import load_dataset
5
+ from transformers import AutoTokenizer
6
 
7
  def ReturnTokens(dataset, tokenizer="openai-community/gpt2", split="train"):
8
  global tokens_
9
+ tokenizer=AutoTokenizer.from_pretrained(tokenizer)
10
+ dataset=load_dataset(dataset, split=split)
11
  tokens_=0
12
  def CountTokens(Example):
13
  global tokens_
14
+ print(Example)
15
+ for k,i in enumerate(Example):
16
+ tokens_+=len(tokenizer.tokenize(i))
17
+ categories=[i for i in dataset[0].keys()]
18
+ for cat in categories:
19
+ CountTokens(dataset[cat])
20
  return tokens_
21
 
22
  with gr.Blocks(title="Dataset token counter") as app:
 
33
  outputs=[tokens]
34
  )
35
 
36
+ gr.on(
37
+ triggers=[
38
+ prompt.submit,
39
+ tokenizer.submit,
40
+ split.submit,
41
+ ],
42
+ fn=ReturnTokens,
43
+ inputs=[
44
+ prompt,
45
+ tokenizer,
46
+ split
47
+ ],
48
+ outputs=[tokens],
49
+ api_name="run",
50
+ )
51
+
52
+ app.launch()