"""CONFIG""" #!/usr/bin/env python3 from transformers import RobertaConfig config = RobertaConfig.from_pretrained("roberta-large") config.save_pretrained("./") """TOKENIZER""" #!/usr/bin/env python3 from datasets import load_dataset from tokenizers import ByteLevelBPETokenizer # load dataset dataset = load_dataset("large_spanish_corpus") # Instantiate tokenizer tokenizer = ByteLevelBPETokenizer() def batch_iterator(batch_size=1000): for i in range(0, len(dataset), batch_size): yield dataset[i: i + batch_size]["text"] # Customized training tokenizer.train_from_iterator(batch_iterator(), vocab_size=50265, min_frequency=2, special_tokens=[ "", "", "", "", "", ]) # Save files to disk tokenizer.save("./tokenizer.json") """TOKENIZER""" #!/usr/bin/env bash ./run_mlm_flax.py \ --output_dir="./" \ --model_type="roberta" \ --config_name="./" \ --tokenizer_name="./" \ --dataset_name="large_spanish_corpus" \ --dataset_config_name \ # I think this would be empty --max_seq_length="128" \ --per_device_train_batch_size="4" \ --per_device_eval_batch_size="4" \ --learning_rate="3e-4" \ --warmup_steps="1000" \ --overwrite_output_dir \ --num_train_epochs="8" \ --push_to_hub