JonasGeiping committed on
Commit
53759c1
1 Parent(s): b2ff355

Upload data_budget_hours_24.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. data_budget_hours_24.json +34 -0
data_budget_hours_24.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sources": {
3
+ "c4": {
4
+ "provider": "huggingface",
5
+ "partition": "en",
6
+ "split": "train",
7
+ "streaming": true,
8
+ "remove_columns": null,
9
+ "concatenate_successive_entries": 0
10
+ }
11
+ },
12
+ "name": "c4-subset",
13
+ "normalizer": {
14
+ "force_lowercase": true,
15
+ "strip_accents": true,
16
+ "force_english_keyboard": true,
17
+ "whitespace_escape": false
18
+ },
19
+ "tokenizer": "WordPiece",
20
+ "vocab_size": 32768,
21
+ "seq_length": 128,
22
+ "include_cls_token_in_corpus": false,
23
+ "include_sep_token_in_corpus": true,
24
+ "use_type_ids": false,
25
+ "max_entries_in_raw_dataset": 25000000.0,
26
+ "max_seq_in_tokenized_dataset": 85000000.0,
27
+ "named_entity_simplification": false,
28
+ "remove_whitespaces": false,
29
+ "remove_trash": true,
30
+ "trash_cutoff": 0.25,
31
+ "deduplicate_entries": true,
32
+ "deduplication_threshold": 75,
33
+ "ordering": "sentence-length-curriculum"
34
+ }