liujch1998 committed on
Commit
2e63f1e
1 Parent(s): c4bc2a0

Add C4 index

Browse files
Files changed (1) hide show
  1. constants.py +1 -0
constants.py CHANGED
@@ -4,6 +4,7 @@ import os
4
  CORPUS_BY_DESC = {
5
  'RedPajama (LLaMA tokenizer), 1.4T tokens': 'v4_rpj_llama_s4',
6
  'Pile-train (LLaMA tokenizer), 380B tokens': 'v4_piletrain_llama',
 
7
  'Pile-val (LLaMA tokenizer), 390M tokens': 'v4_pileval_llama',
8
  'Pile-val (GPT-2 tokenizer), 380M tokens': 'v4_pileval_gpt2',
9
  'Dolma-sample (OLMo tokenizer), 8.0B tokens': 'v4_dolmasample_olmo',
 
4
  CORPUS_BY_DESC = {
5
  'RedPajama (LLaMA tokenizer), 1.4T tokens': 'v4_rpj_llama_s4',
6
  'Pile-train (LLaMA tokenizer), 380B tokens': 'v4_piletrain_llama',
7
+ 'C4-train (LLaMA tokenizer), 200B tokens': 'v4_c4train_llama',
8
  'Pile-val (LLaMA tokenizer), 390M tokens': 'v4_pileval_llama',
9
  'Pile-val (GPT-2 tokenizer), 380M tokens': 'v4_pileval_gpt2',
10
  'Dolma-sample (OLMo tokenizer), 8.0B tokens': 'v4_dolmasample_olmo',