helenai committed on
Commit
5942a55
1 Parent(s): 45b4bd7

Add more percentiles and BAAI/bge preset tokenizer

Browse files
Files changed (1) hide show
  1. app.py +7 -4
app.py CHANGED
@@ -12,6 +12,8 @@ from transformers import AutoTokenizer
12
 
13
  tokenizers = {
14
  "bert": "google-bert/bert-base-uncased",
 
 
15
  "blenderbot": "facebook/blenderbot-3B",
16
  "bloom": "bigscience/bloom-560m",
17
  "bloomz": "bigscience/bloomz-7b1",
@@ -65,12 +67,12 @@ def count(model_id, dataset_id, config, split, column, add_special_tokens=True):
65
  # not 100% accurate but good enough
66
  words = re.findall(pattern, item[column])
67
  wordcounter.append(len(words))
68
-
69
- df = pd.DataFrame(tokencounter).describe().T
70
  df.insert(0, "type", "tokens")
71
- dfc = pd.DataFrame(charcounter).describe().T
72
  dfc.insert(0, "type", "chars")
73
- dfw = pd.DataFrame(wordcounter).describe().T
74
  dfw.insert(0, "type", "words")
75
  df.loc[-1] = dfw.values[0]
76
  df.index = df.index + 1 # shifting index
@@ -105,6 +107,7 @@ demo = gr.Interface(
105
  ["tiiuae/falcon-7b", "imdb", "", "test", "text"],
106
  ["tiiuae/falcon-7b", "wikitext", "wikitext-2-v1", "validation", "text"],
107
  ["tiiuae/falcon-7b", "zeroshot/twitter-financial-news-sentiment", "", "validation", "text"],
 
108
  ],
109
  cache_examples=True,
110
  )
 
12
 
13
  tokenizers = {
14
  "bert": "google-bert/bert-base-uncased",
15
+ "bge-en": "BAAI/bge-base-en-v1.5",
16
+ "bge-zh": "BAAI/bge-base-zh-v1.5",
17
  "blenderbot": "facebook/blenderbot-3B",
18
  "bloom": "bigscience/bloom-560m",
19
  "bloomz": "bigscience/bloomz-7b1",
 
67
  # not 100% accurate but good enough
68
  words = re.findall(pattern, item[column])
69
  wordcounter.append(len(words))
70
+ percentiles = [0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
71
+ df = pd.DataFrame(tokencounter).describe(percentiles=percentiles).T
72
  df.insert(0, "type", "tokens")
73
+ dfc = pd.DataFrame(charcounter).describe(percentiles=percentiles).T
74
  dfc.insert(0, "type", "chars")
75
+ dfw = pd.DataFrame(wordcounter).describe(percentiles=percentiles).T
76
  dfw.insert(0, "type", "words")
77
  df.loc[-1] = dfw.values[0]
78
  df.index = df.index + 1 # shifting index
 
107
  ["tiiuae/falcon-7b", "imdb", "", "test", "text"],
108
  ["tiiuae/falcon-7b", "wikitext", "wikitext-2-v1", "validation", "text"],
109
  ["tiiuae/falcon-7b", "zeroshot/twitter-financial-news-sentiment", "", "validation", "text"],
110
+ ["BAAI/bge-base-en-v1.5", "PolyAI/banking77", "", "test", "text"],
111
  ],
112
  cache_examples=True,
113
  )