Pablo committed on
Commit 2a963f0
1 Parent(s): de633ab

:sparkles: Added test_script and a folder for scripts

Files changed (2)
  1. bertin/__init__.py +0 -0
  2. test_script.py +45 -0
bertin/__init__.py ADDED
File without changes
test_script.py ADDED
@@ -0,0 +1,45 @@
+ """CONFIG"""
+ #!/usr/bin/env python3
+ from transformers import RobertaConfig
+ config = RobertaConfig.from_pretrained("roberta-large")
+ config.save_pretrained("./")
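+ # save_pretrained() writes config.json into the working directory; the
+ # training command below picks it up via --config_name="./"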
+
+ """TOKENIZER"""
+ #!/usr/bin/env python3
+ from datasets import load_dataset
+ from tokenizers import ByteLevelBPETokenizer
+ # Load the train split so the dataset supports len() and index slicing
+ dataset = load_dataset("large_spanish_corpus", split="train")
+ # Instantiate tokenizer
+ tokenizer = ByteLevelBPETokenizer()
+ def batch_iterator(batch_size=1000):
+     for i in range(0, len(dataset), batch_size):
+         yield dataset[i : i + batch_size]["text"]
+ # Customized training
+ tokenizer.train_from_iterator(batch_iterator(), vocab_size=50265, min_frequency=2, special_tokens=[
+     "<s>",
+     "<pad>",
+     "</s>",
+     "<unk>",
+     "<mask>",
+ ])
+ # Save files to disk
+ tokenizer.save("./tokenizer.json")
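+ # tokenizer.json can be reloaded with PreTrainedTokenizerFast(tokenizer_file="./tokenizer.json")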
+
+ """TRAINING"""
+ #!/usr/bin/env bash
+ # NOTE: --dataset_config_name is left out; I think it would be empty for this dataset
+ ./run_mlm_flax.py \
+     --output_dir="./" \
+     --model_type="roberta" \
+     --config_name="./" \
+     --tokenizer_name="./" \
+     --dataset_name="large_spanish_corpus" \
+     --max_seq_length="128" \
+     --per_device_train_batch_size="4" \
+     --per_device_eval_batch_size="4" \
+     --learning_rate="3e-4" \
+     --warmup_steps="1000" \
+     --overwrite_output_dir \
+     --num_train_epochs="8" \
+     --push_to_hub
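
Before launching the training command, the saved artifacts can be checked quickly. A minimal sketch, assuming config.json and tokenizer.json sit in the current directory as saved above (the sample sentence is arbitrary):

#!/usr/bin/env python3
from transformers import PreTrainedTokenizerFast, RobertaConfig

# Reload the saved config and tokenizer.
config = RobertaConfig.from_pretrained("./")
tokenizer = PreTrainedTokenizerFast(tokenizer_file="./tokenizer.json")

# Both should report 50265, the vocab size used for tokenizer training above.
print("config vocab_size:   ", config.vocab_size)
print("tokenizer vocab_size:", tokenizer.vocab_size)

# Round-trip an arbitrary Spanish sentence through the tokenizer.
ids = tokenizer.encode("Hola, ¿qué tal estás?")
print(ids)
print(tokenizer.decode(ids))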