liujch1998 committed on
Commit
7f5f844
·
1 Parent(s): 9679701

sync: add olmo3 indexes; improve UI

Browse files
Files changed (3) hide show
  1. app.py +38 -21
  2. constants.py +15 -8
  3. requirements.txt +1 -1
app.py CHANGED
@@ -5,6 +5,19 @@ import random
5
  import requests
6
  from constants import *
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  def process(query_type, index_desc, **kwargs):
9
  timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
10
  index = INDEX_BY_DESC[index_desc]
@@ -19,7 +32,7 @@ def process(query_type, index_desc, **kwargs):
19
  if API_URL is None:
20
  raise ValueError(f'API_URL envvar is not set!')
21
  try:
22
- response = requests.post(API_URL, json=data, timeout=10)
23
  except requests.exceptions.Timeout:
24
  raise ValueError('Web request timed out. Please try again later.')
25
  except requests.exceptions.RequestException as e:
@@ -80,7 +93,7 @@ def prob(index_desc, query):
80
  if 'error' in result:
81
  prob = result['error']
82
  elif result['prompt_cnt'] == 0:
83
- prob = '(n-1)-gram is not found in the corpus'
84
  else:
85
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
86
  return latency, tokenization_info, prob
@@ -97,7 +110,7 @@ def ntd(index_desc, query, max_support):
97
  for token_id, r in result_by_token_id.items():
98
  ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
99
  if ntd == {}:
100
- ntd = '(n-1)-gram is not found in the corpus'
101
  return latency, tokenization_info, ntd
102
 
103
  def infgram_prob(index_desc, query):
@@ -173,11 +186,11 @@ def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_t
173
  ptrs_by_shard = find_result['ptrs_by_shard']
174
  cnt_retrievable = sum([len(ptrs) for ptrs in ptrs_by_shard])
175
  if find_result["approx"]:
176
- message = f'Approximately {find_result["cnt"]} occurrences found, of which {cnt_retrievable} are retrievable'
177
  else:
178
- message = f'{find_result["cnt"]} occurrences found'
179
  else: # simple query
180
- message = f'{find_result["cnt"]} occurrences found'
181
  cnt_retrievable = find_result['cnt']
182
  if cnt_retrievable == 0:
183
  idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
@@ -229,24 +242,28 @@ def get_another_doc(index_desc, idx, max_disp_len, state):
229
  with gr.Blocks() as demo:
230
  with gr.Column():
231
  gr.HTML(
232
- '''<h1 text-align="center">Infini-gram: An Efficient Search Engine over the Massive Pretraining Datasets of Language Models</h1>
233
 
234
- <p style='font-size: 16px;'>This engine does exact-match search over several open pretraining datasets of language models. Please first select the corpus and the type of query, then enter your query and submit.</p>
235
  <p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. Feel free to check out our <a href="https://infini-gram.io">Project Homepage</a>.</p>
236
  <p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
237
- <p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
 
 
 
 
238
  '''
239
  )
240
  with gr.Row():
241
  with gr.Column(scale=1, min_width=240):
242
- index_desc = gr.Radio(choices=INDEX_DESCS, label='Corpus', value=INDEX_DESCS[0])
243
 
244
  with gr.Column(scale=7):
245
  with gr.Tab('1. Count an n-gram'):
246
  with gr.Column():
247
  gr.HTML('<h2>1. Count an n-gram</h2>')
248
  with gr.Accordion(label='Click to view instructions', open=False):
249
- gr.HTML(f'''<p style="font-size: 16px;">This counts the number of times an n-gram appears in the corpus. If you submit an empty input, it will return the total number of tokens in the corpus. You can also make more complex queries by connecting multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>.</p>
250
  <br>
251
  <p style="font-size: 16px;">Example queries:</p>
252
  <ul style="font-size: 16px;">
@@ -291,7 +308,7 @@ with gr.Blocks() as demo:
291
  <br>
292
  <p style="font-size: 16px;">Notes:</p>
293
  <ul style="font-size: 16px;">
294
- <li>The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</li>
295
  </ul>
296
  ''')
297
  with gr.Row():
@@ -317,8 +334,8 @@ with gr.Blocks() as demo:
317
  <br>
318
  <p style="font-size: 16px;">Notes:</p>
319
  <ul style="font-size: 16px;">
320
- <li>The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</li>
321
- <li>If the (n-1)-gram appears more than {max_support} times in the corpus, the result will be approximate: we will estimate the distribution by examining a subset of {max_support} occurrences of the (n-1)-gram. This value can be adjusted within range [1, {MAX_SUPPORT}] in "Advanced options".</li>
322
  </ul>
323
  ''')
324
 
@@ -341,9 +358,9 @@ with gr.Blocks() as demo:
341
  with gr.Column():
342
  gr.HTML('<h2>4. Compute the ∞-gram probability of the last token</h2>')
343
  with gr.Accordion(label='Click to view instructions', open=False):
344
- gr.HTML(f'''<p style="font-size: 16px;">This computes the ∞-gram probability of the last token conditioned on the previous tokens. Compared to Query Type 2 (which uses your entire input for n-gram modeling), here we take the longest suffix that we can find in the corpus.</p>
345
  <br>
346
- <p style="font-size: 16px;">Example query: <b>I love natural language processing</b> (if "natural language" appears in the corpus but "love natural language" doesn't, the output is P(processing | natural language); in this case the effective n = 3)</p>
347
  <br>
348
  <p style="font-size: 16px;">Notes:</p>
349
  <ul style="font-size: 16px;">
@@ -370,7 +387,7 @@ with gr.Blocks() as demo:
370
  with gr.Accordion(label='Click to view instructions', open=False):
371
  gr.HTML(f'''<p style="font-size: 16px;">This is similar to Query Type 3, but with ∞-gram instead of n-gram.</p>
372
  <br>
373
- <p style="font-size: 16px;">Example query: <b>I love natural language</b> (if "natural language" appears in the corpus but "love natural language" doesn't, the output is P(* | natural language), for the top-10 tokens *)</p>
374
  ''')
375
  with gr.Row():
376
  with gr.Column(scale=1):
@@ -392,7 +409,7 @@ with gr.Blocks() as demo:
392
  with gr.Column():
393
  gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
394
  with gr.Accordion(label='Click to view instructions', open=False):
395
- gr.HTML(f'''<p style="font-size: 16px;">This displays a few random documents in the corpus that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
396
  <br>
397
  <p style="font-size: 16px;">Example queries:</p>
398
  <ul style="font-size: 16px;">
@@ -412,7 +429,7 @@ with gr.Blocks() as demo:
412
  <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
413
  </ul>
414
  <br>
415
- <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
416
  ''')
417
  with gr.Row():
418
  with gr.Column(scale=1):
@@ -443,7 +460,7 @@ with gr.Blocks() as demo:
443
  with gr.Column():
444
  gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
445
  with gr.Accordion(label='Click to view instructions', open=False):
446
- gr.HTML(f'''<p style="font-size: 16px;">This displays the documents in the corpus that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
447
  <br>
448
  <p style="font-size: 16px;">Example queries:</p>
449
  <ul style="font-size: 16px;">
@@ -461,7 +478,7 @@ with gr.Blocks() as demo:
461
  <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
462
  </ul>
463
  <br>
464
- <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
465
  ''')
466
  with gr.Row():
467
  with gr.Column(scale=1):
 
5
  import requests
6
  from constants import *
7
 
8
+ # def get_demo_indexes():
9
+ # try:
10
+ # response = requests.get(API_URL)
11
+ # print(response)
12
+ # return response.json()
13
+ # except:
14
+ # return []
15
+
16
+ # INDEXES = get_demo_indexes()
17
+ # print(INDEXES)
18
+ # INDEX_BY_DESC = {index['desc']: index['index'] for index in INDEXES}
19
+ # INDEX_DESCS = [index['desc'] for index in INDEXES]
20
+
21
  def process(query_type, index_desc, **kwargs):
22
  timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
23
  index = INDEX_BY_DESC[index_desc]
 
32
  if API_URL is None:
33
  raise ValueError(f'API_URL envvar is not set!')
34
  try:
35
+ response = requests.post(API_URL, json=data, timeout=30)
36
  except requests.exceptions.Timeout:
37
  raise ValueError('Web request timed out. Please try again later.')
38
  except requests.exceptions.RequestException as e:
 
93
  if 'error' in result:
94
  prob = result['error']
95
  elif result['prompt_cnt'] == 0:
96
+ prob = '(n-1)-gram is not found in the dataset'
97
  else:
98
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
99
  return latency, tokenization_info, prob
 
110
  for token_id, r in result_by_token_id.items():
111
  ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
112
  if ntd == {}:
113
+ ntd = '(n-1)-gram is not found in the dataset'
114
  return latency, tokenization_info, ntd
115
 
116
  def infgram_prob(index_desc, query):
 
186
  ptrs_by_shard = find_result['ptrs_by_shard']
187
  cnt_retrievable = sum([len(ptrs) for ptrs in ptrs_by_shard])
188
  if find_result["approx"]:
189
+ message = f'Approximately {find_result["cnt"]:,} occurrences found, of which {cnt_retrievable:,} are retrievable'
190
  else:
191
+ message = f'{find_result["cnt"]:,} occurrences found'
192
  else: # simple query
193
+ message = f'{find_result["cnt"]:,} occurrences found'
194
  cnt_retrievable = find_result['cnt']
195
  if cnt_retrievable == 0:
196
  idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
 
242
  with gr.Blocks() as demo:
243
  with gr.Column():
244
  gr.HTML(
245
+ '''<h1 text-align="center">Infini-gram: An Efficient Search Engine over the Massive Pretraining Datasets of LLMs</h1>
246
 
247
+ <p style='font-size: 16px;'>Infini-gram does exact-match search over several open pretraining datasets of language models. Please first select the dataset and the type of query, then enter your query and submit.</p>
248
  <p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. Feel free to check out our <a href="https://infini-gram.io">Project Homepage</a>.</p>
249
  <p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
250
+ <p style='font-size: 16px;'><b>Notes:</b></p>
251
+ <ul style="font-size: 16px;">
252
+ <li>The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified). The total number of tokens in each dataset is shown in parenthesis in the dataset selection panel.</li>
253
 + <li>Dolma 3 and the Olmo 3 training datasets use the Olmo 3 tokenizer. Also, these use a more cost-efficient technique to serve, meaning: (1) each query typically takes 12-15 seconds; (2) they only support n-gram counting and document search, and CNF queries are not supported.</li>
254
+ </ul>
255
  '''
256
  )
257
  with gr.Row():
258
  with gr.Column(scale=1, min_width=240):
259
+ index_desc = gr.Radio(choices=INDEX_DESCS, label='Dataset', value=INDEX_DESCS[0])
260
 
261
  with gr.Column(scale=7):
262
  with gr.Tab('1. Count an n-gram'):
263
  with gr.Column():
264
  gr.HTML('<h2>1. Count an n-gram</h2>')
265
  with gr.Accordion(label='Click to view instructions', open=False):
266
+ gr.HTML(f'''<p style="font-size: 16px;">This counts the number of times an n-gram appears in the dataset. If you submit an empty input, it will return the total number of tokens in the dataset. You can also make more complex queries by connecting multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>.</p>
267
  <br>
268
  <p style="font-size: 16px;">Example queries:</p>
269
  <ul style="font-size: 16px;">
 
308
  <br>
309
  <p style="font-size: 16px;">Notes:</p>
310
  <ul style="font-size: 16px;">
311
+ <li>The (n-1)-gram needs to exist in the dataset. If the (n-1)-gram is not found in the dataset, an error message will appear.</li>
312
  </ul>
313
  ''')
314
  with gr.Row():
 
334
  <br>
335
  <p style="font-size: 16px;">Notes:</p>
336
  <ul style="font-size: 16px;">
337
+ <li>The (n-1)-gram needs to exist in the dataset. If the (n-1)-gram is not found in the dataset, an error message will appear.</li>
338
+ <li>If the (n-1)-gram appears more than {max_support} times in the dataset, the result will be approximate: we will estimate the distribution by examining a subset of {max_support} occurrences of the (n-1)-gram. This value can be adjusted within range [1, {MAX_SUPPORT}] in "Advanced options".</li>
339
  </ul>
340
  ''')
341
 
 
358
  with gr.Column():
359
  gr.HTML('<h2>4. Compute the ∞-gram probability of the last token</h2>')
360
  with gr.Accordion(label='Click to view instructions', open=False):
361
+ gr.HTML(f'''<p style="font-size: 16px;">This computes the ∞-gram probability of the last token conditioned on the previous tokens. Compared to Query Type 2 (which uses your entire input for n-gram modeling), here we take the longest suffix that we can find in the dataset.</p>
362
  <br>
363
+ <p style="font-size: 16px;">Example query: <b>I love natural language processing</b> (if "natural language" appears in the dataset but "love natural language" doesn't, the output is P(processing | natural language); in this case the effective n = 3)</p>
364
  <br>
365
  <p style="font-size: 16px;">Notes:</p>
366
  <ul style="font-size: 16px;">
 
387
  with gr.Accordion(label='Click to view instructions', open=False):
388
  gr.HTML(f'''<p style="font-size: 16px;">This is similar to Query Type 3, but with ∞-gram instead of n-gram.</p>
389
  <br>
390
+ <p style="font-size: 16px;">Example query: <b>I love natural language</b> (if "natural language" appears in the dataset but "love natural language" doesn't, the output is P(* | natural language), for the top-10 tokens *)</p>
391
  ''')
392
  with gr.Row():
393
  with gr.Column(scale=1):
 
409
  with gr.Column():
410
  gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
411
  with gr.Accordion(label='Click to view instructions', open=False):
412
 + gr.HTML(f'''<p style="font-size: 16px;">This displays a few random documents in the dataset that satisfy your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
413
  <br>
414
  <p style="font-size: 16px;">Example queries:</p>
415
  <ul style="font-size: 16px;">
 
429
  <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
430
  </ul>
431
  <br>
432
+ <p style="font-size: 16px;">❗️WARNING: Dataset may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the dataset, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text datasets. Please use with caution. Don't be evil :)</p>
433
  ''')
434
  with gr.Row():
435
  with gr.Column(scale=1):
 
460
  with gr.Column():
461
  gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
462
  with gr.Accordion(label='Click to view instructions', open=False):
463
 + gr.HTML(f'''<p style="font-size: 16px;">This displays the documents in the dataset that satisfy your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
464
  <br>
465
  <p style="font-size: 16px;">Example queries:</p>
466
  <ul style="font-size: 16px;">
 
478
  <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
479
  </ul>
480
  <br>
481
+ <p style="font-size: 16px;">❗️WARNING: Dataset may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the dataset, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text datasets. Please use with caution. Don't be evil :)</p>
482
  ''')
483
  with gr.Row():
484
  with gr.Column(scale=1):
constants.py CHANGED
@@ -2,14 +2,21 @@ import os
2
 
3
  # options
4
  INDEX_BY_DESC = {
5
- 'OLMo 2 32B Instruct (4.6T tokens)': 'v4_olmo-2-0325-32b-instruct_llama',
6
- 'OLMo 2 13B Instruct (4.6T tokens)': 'v4_olmo-2-1124-13b-instruct_llama',
7
- 'OLMoE 1B 7B Instruct (4.6T tokens)': 'v4_olmoe-0125-1b-7b-instruct_llama',
8
- 'Dolma-v1.7 (2.6T tokens)': 'v4_dolma-v1_7_llama',
9
- 'RedPajama (1.4T tokens)': 'v4_rpj_llama_s4',
10
- 'Pile-train (380B tokens)': 'v4_piletrain_llama',
11
- 'C4-train (200B tokens)': 'v4_c4train_llama',
12
- 'Pile-val (390M tokens)': 'v4_pileval_llama',
 
 
 
 
 
 
 
13
  # 'Pile-val (GPT-2 tokenizer), 380M tokens': 'v4_pileval_gpt2',
14
  # 'Dolma-v1.6-sample (OLMo tokenizer), 8.0B tokens': 'v4_dolmasample_olmo',
15
  # 'Dolma-v1.6-sample (9.2B tokens)': 'v4_dolma-v1_6-sample_llama',
 
2
 
3
  # options
4
  INDEX_BY_DESC = {
5
 + "OLMo 3 32B Think (6.1T)": "olmo-3-32b-think",
6
+ "OLMo 3 7B Think (6.1T)": "olmo-3-7b-think",
7
+ "OLMo 3 7B Instruct (6.0T)": "olmo-3-7b-instruct",
8
+ "Dolma 3 (5.9T)": "dolma3",
9
+ 'OLMo 2 32B Instruct (4.6T)': 'v4_olmo-2-0325-32b-instruct_llama',
10
+ 'OLMo 2 13B Instruct (4.6T)': 'v4_olmo-2-1124-13b-instruct_llama',
11
+ 'OLMoE 1B 7B Instruct (4.6T)': 'v4_olmoe-0125-1b-7b-instruct_llama',
12
+ 'dolmino-mix-1124-minus-olmo-mix-1124 (34B)': 'v4_dolmino-mix-1124-minus-olmo-mix-1124_llama',
13
+ 'olmo-mix-1124 (4.6T)': 'v4_olmo-mix-1124_llama',
14
+ 'DCLM-baseline (4.3T)': 'v4_dclm-baseline_llama',
15
+ 'Dolma-v1.7 (2.6T)': 'v4_dolma-v1_7_llama',
16
+ 'RedPajama (1.4T)': 'v4_rpj_llama_s4',
17
+ 'Pile-train (380B)': 'v4_piletrain_llama',
18
+ 'C4-train (200B)': 'v4_c4train_llama',
19
+ 'Pile-val (390M)': 'v4_pileval_llama',
20
  # 'Pile-val (GPT-2 tokenizer), 380M tokens': 'v4_pileval_gpt2',
21
  # 'Dolma-v1.6-sample (OLMo tokenizer), 8.0B tokens': 'v4_dolmasample_olmo',
22
  # 'Dolma-v1.6-sample (9.2B tokens)': 'v4_dolma-v1_6-sample_llama',
requirements.txt CHANGED
@@ -2,5 +2,5 @@ torch==1.13.1
2
  transformers==4.31.0
3
  tokenizers==0.13.3
4
  sentencepiece==0.1.96
5
- huggingface_hub==0.14.1
6
  requests
 
2
  transformers==4.31.0
3
  tokenizers==0.13.3
4
  sentencepiece==0.1.96
5
+ huggingface_hub==0.34.3
6
  requests