helenai committed on
Commit
57b690d
1 Parent(s): 75cf623

Initial commit

Browse files
Files changed (3) hide show
  1. README.md +1 -3
  2. app.py +96 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Dataset Token Distribution
3
- emoji: 🏆
4
  colorFrom: red
5
  colorTo: yellow
6
  sdk: gradio
@@ -9,5 +9,3 @@ app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
  ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Dataset Token Distribution
3
+ emoji: 🏢
4
  colorFrom: red
5
  colorTo: yellow
6
  sdk: gradio
 
9
  pinned: false
10
  license: apache-2.0
11
  ---
 
 
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import json
3
+ import re
4
+
5
+ import gradio as gr
6
+ import matplotlib.pyplot as plt
7
+ import pandas as pd
8
+ from datasets import load_dataset
9
+ from PIL import Image
10
+ from transformers import AutoTokenizer
11
+
12
# Hub ids offered as presets in the "Tokenizer" dropdown; the dropdown also
# accepts custom values, so this list is only a convenience.
tokenizers = [
    "google/gemma-7b",
    "meta-llama/Llama-2-7b",
    "mistralai/Mistral-7B-v0.1",
    "facebook/opt-2.7b",
    "microsoft/phi-2",
    "THUDM/chatglm3-6b",
    "Qwen/Qwen1.5-7B-Chat",
    "bigscience/bloom-560m",
    "ise-uiuc/Magicoder-S-DS-6.7B",
    "google/flan-t5-base",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
]
25
+
26
+
27
def plot_histogram(data):
    """Render a histogram of *data* and return it as a PIL image.

    Args:
        data: Sequence of numbers to bin (here: token counts per dataset item).

    Returns:
        A ``PIL.Image.Image`` opened from an in-memory PNG of the plot.
    """
    # Draw on a dedicated figure. The implicit pyplot figure persists between
    # calls, so plotting on it would stack every new histogram on top of the
    # ones from earlier requests.
    fig, ax = plt.subplots()
    try:
        ax.hist(data)
        ax.set_title("Histogram of number of tokens per dataset item")
        buf = io.BytesIO()
        fig.savefig(buf, format="png")
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
35
+
36
+
37
def count(model_id, dataset_id, config, split, column, add_special_tokens=True):
    """Measure token, word and character counts for one dataset text column.

    Args:
        model_id: Hub id of the tokenizer to load with ``AutoTokenizer``.
        dataset_id: Hub id of the dataset to load with ``load_dataset``.
        config: Dataset configuration name. A blank value (``""`` or ``None``)
            means "no configuration".
        split: Name of the dataset split to load.
        column: Name of the column whose text is measured.
        add_special_tokens: Forwarded to the tokenizer call; controls whether
            special tokens are included in the token count.

    Returns:
        A tuple ``(histogram, stats)``. ``histogram`` is the result of
        ``plot_histogram`` for the per-item token counts. ``stats`` is a
        DataFrame with one row each for tokens, words and chars (in that
        order) and the columns ``type``, ``mean``, ``std``, ``min``, ``25%``,
        ``50%``, ``75%``, ``max``, rounded to one decimal place.
    """
    # NOTE(review): trust_remote_code=True lets the model/dataset repository
    # run its own code, and both ids come straight from user input. Confirm
    # this is acceptable for the environment the app runs in.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # A blank "Config" textbox arrives as "", but "no configuration" must be
    # passed to load_dataset as None.
    if not config:
        config = None
    dataset = load_dataset(dataset_id, config, split=split, trust_remote_code=True)

    # Compile once; the pattern is applied to every item in the loop below.
    word_pattern = re.compile(r"[a-zA-Z]+")

    tokencounter = []
    wordcounter = []
    charcounter = []
    for item in dataset:
        text = item[column]
        tokens = tokenizer(text, add_special_tokens=add_special_tokens)["input_ids"]
        tokencounter.append(len(tokens))
        charcounter.append(len(text))
        # Not 100% accurate (only runs of ASCII letters count as words) but
        # good enough for a rough comparison.
        wordcounter.append(len(word_pattern.findall(text)))

    # Describe all three measures in one frame so every statistic column
    # stays numeric and the rounding below applies to it.
    stats = pd.DataFrame(
        {"tokens": tokencounter, "words": wordcounter, "chars": charcounter}
    ).describe().T
    stats = stats.drop(columns="count").round(1)
    stats.insert(0, "type", stats.index)
    stats = stats.reset_index(drop=True)

    return plot_histogram(tokencounter), stats
67
+
68
+
69
# Gradio UI: six inputs feed count(); its two return values fill the image
# and the statistics table. Components are created in the same order as
# before (inputs first, then outputs).
demo = gr.Interface(
    fn=count,
    title="Dataset token counts and distribution",
    inputs=[
        gr.Dropdown(label="Tokenizer", choices=tokenizers, allow_custom_value=True),
        gr.Textbox(label="Dataset"),
        gr.Textbox(label="Config"),
        gr.Textbox(label="Split"),
        gr.Textbox(label="Column"),
        gr.Checkbox(label="Add special tokens", value=True),
    ],
    outputs=[
        gr.Image(),
        gr.Dataframe(label="Token, word and character counts per dataset item"),
    ],
    # Every example uses the same tokenizer; only the dataset coordinates
    # (dataset, config, split, column) differ.
    examples=[
        ["mistralai/Mistral-7B-v0.1", *dataset_args]
        for dataset_args in (
            ("gsarti/flores_101", "eng", "dev", "sentence"),
            ("Muennighoff/flores200", "eng_Latn", "dev", "sentence"),
            ("wikitext", "wikitext-2-v1", "validation", "text"),
            ("hails/mmlu_no_train", "elementary_mathematics", "test", "question"),
            ("imdb", "", "test", "text"),
            ("gsm8k", "main", "test", "question"),
            ("locuslab/TOFU", "world_facts", "train", "question"),
        )
    ],
    cache_examples=False,
)

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ datasets
2
+ matplotlib
3
+ pandas
4
+ pillow
5
+ transformers