versae committed on
Commit
fe7ff35
1 Parent(s): a1f93c9

Adding correct models 10k steps

Browse files
Files changed (3) hide show
  1. flax_model.msgpack +2 -2
  2. pytorch_model.bin +3 -0
  3. tokens.py +2 -2
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:50c50c05859f43aa6a08aa3106a1ca62d225f1ac927d57e0e86e422cff5ee7a7
3
- size 711588089
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ff31ebb2460dbc41a160cc755d0555bb8c84672563808b968a2a121c1b2414a
3
+ size 711587941
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4265b625a915f8a622926c9be27d6b1f3f2bc44481f81ab5d53eace54a0bc06
3
+ size 1421780139
tokens.py CHANGED
@@ -3,11 +3,11 @@ from datasets import load_dataset
3
  from tokenizers import ByteLevelBPETokenizer
4
 
5
  # Load dataset
6
- dataset = load_dataset("oscar", "unshuffled_deduplicated_es")
7
 
8
  # Instantiate tokenizer
9
  tokenizer = ByteLevelBPETokenizer()
10
- def batch_iterator(batch_size=100_000_000):
11
  for i in range(0, len(dataset), batch_size):
12
  yield dataset["text"][i: i + batch_size]
13
 
3
  from tokenizers import ByteLevelBPETokenizer
4
 
5
  # Load dataset
6
+ dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train")
7
 
8
  # Instantiate tokenizer
9
  tokenizer = ByteLevelBPETokenizer()
10
+ def batch_iterator(batch_size=1_000_000):
11
  for i in range(0, len(dataset), batch_size):
12
  yield dataset["text"][i: i + batch_size]
13