Spaces:

helenai
/

dataset-token-distribution

Sleeping

helenai committed 20 days ago

Commit

5942a55

•

1 Parent(s): 45b4bd7

Add more percentiles and BAAI/bge preset tokenizer

Files changed (1) hide show

app.py CHANGED Viewed

@@ -12,6 +12,8 @@ from transformers import AutoTokenizer
 tokenizers = {
     "bert": "google-bert/bert-base-uncased",
     "blenderbot": "facebook/blenderbot-3B",
     "bloom": "bigscience/bloom-560m",
     "bloomz": "bigscience/bloomz-7b1",
@@ -65,12 +67,12 @@ def count(model_id, dataset_id, config, split, column, add_special_tokens=True):
         # not 100% accurate but good enough
         words = re.findall(pattern, item[column])
         wordcounter.append(len(words))
-    df = pd.DataFrame(tokencounter).describe().T
     df.insert(0, "type", "tokens")
-    dfc = pd.DataFrame(charcounter).describe().T
     dfc.insert(0, "type", "chars")
-    dfw = pd.DataFrame(wordcounter).describe().T
     dfw.insert(0, "type", "words")
     df.loc[-1] = dfw.values[0]
     df.index = df.index + 1  # shifting index
@@ -105,6 +107,7 @@ demo = gr.Interface(
         ["tiiuae/falcon-7b", "imdb", "", "test", "text"],
         ["tiiuae/falcon-7b", "wikitext", "wikitext-2-v1", "validation", "text"],
         ["tiiuae/falcon-7b", "zeroshot/twitter-financial-news-sentiment", "", "validation", "text"],
     ],
     cache_examples=True,
 )

 tokenizers = {
     "bert": "google-bert/bert-base-uncased",
+    "bge-en": "BAAI/bge-base-en-v1.5",
+    "bge-zh": "BAAI/bge-base-zh-v1.5",
     "blenderbot": "facebook/blenderbot-3B",
     "bloom": "bigscience/bloom-560m",
     "bloomz": "bigscience/bloomz-7b1",
         # not 100% accurate but good enough
         words = re.findall(pattern, item[column])
         wordcounter.append(len(words))
+    percentiles = [0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
+    df = pd.DataFrame(tokencounter).describe(percentiles=percentiles).T
     df.insert(0, "type", "tokens")
+    dfc = pd.DataFrame(charcounter).describe(percentiles=percentiles).T
     dfc.insert(0, "type", "chars")
+    dfw = pd.DataFrame(wordcounter).describe(percentiles=percentiles).T
     dfw.insert(0, "type", "words")
     df.loc[-1] = dfw.values[0]
     df.index = df.index + 1  # shifting index
         ["tiiuae/falcon-7b", "imdb", "", "test", "text"],
         ["tiiuae/falcon-7b", "wikitext", "wikitext-2-v1", "validation", "text"],
         ["tiiuae/falcon-7b", "zeroshot/twitter-financial-news-sentiment", "", "validation", "text"],
+        ["BAAI/bge-base-en-v1.5", "PolyAI/banking77", "", "test", "text"],
     ],
     cache_examples=True,
 )