itay-nakash committed on
Commit
472921f
1 Parent(s): 4360ad9

Upload data_budget_hours_24.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. data_budget_hours_24.json +34 -0
data_budget_hours_24.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sources": {
3
+ "ag_news": {
4
+ "provider": "huggingface",
5
+ "name": "default",
6
+ "split": "train",
7
+ "streaming": false,
8
+ "remove_columns": "label",
9
+ "concatenate_successive_entries": 0
10
+ }
11
+ },
12
+ "name": "sanity-check-2",
13
+ "normalizer": {
14
+ "force_lowercase": true,
15
+ "strip_accents": true,
16
+ "force_english_keyboard": true,
17
+ "whitespace_escape": false
18
+ },
19
+ "tokenizer": "BPE",
20
+ "vocab_size": 32768,
21
+ "seq_length": 128,
22
+ "include_cls_token_in_corpus": false,
23
+ "include_sep_token_in_corpus": false,
24
+ "use_type_ids": false,
25
+ "max_entries_in_raw_dataset": 10000000000.0,
26
+ "max_seq_in_tokenized_dataset": 10000000000.0,
27
+ "named_entity_simplification": false,
28
+ "remove_whitespaces": false,
29
+ "remove_trash": false,
30
+ "trash_cutoff": 0.3,
31
+ "deduplicate_entries": false,
32
+ "deduplication_threshold": 100,
33
+ "ordering": "randomized"
34
+ }