Fixes and eval configs

Browse files

Files changed (6) hide show

evaluation/paws.yaml +0 -2
evaluation/token.yaml +53 -0
evaluation/xnli.yaml +0 -2
images/bertin.png +0 -0
run_mlm_flax_stream.py +7 -6
utils/download_mc4es_sampled.py +32 -0

evaluation/paws.yaml CHANGED Viewed

@@ -36,8 +36,6 @@ parameters:
     value: ./outputs
   overwrite_output_dir:
     value: true
-  resume_from_checkpoint:
-    value: false
   max_seq_length:
     value: 512
   pad_to_max_length:

     value: ./outputs
   overwrite_output_dir:
     value: true
   max_seq_length:
     value: 512
   pad_to_max_length:

evaluation/token.yaml ADDED Viewed

	@@ -0,0 +1,53 @@

+name: BERTIN NER and POS es
+project: bertin-eval
+entity: versae
+program: run_ner.py
+command:
+  - ${env}
+  - ${interpreter}
+  - ${program}
+  - ${args}
+method: grid
+metric:
+  name: eval/accuracy
+  goal: maximize
+parameters:
+  model_name_or_path:
+    values:
+    - bertin-project/bertin-base-gaussian-exp-512seqlen
+    - bertin-project/bertin-base-random-exp-512seqlen
+    - bertin-project/bertin-base-gaussian
+    - bertin-project/bertin-base-stepwise
+    - bertin-project/bertin-base-random
+    - bertin-project/bertin-roberta-base-spanish
+    - flax-community/bertin-roberta-large-spanish
+    - BSC-TeMU/roberta-base-bne
+    - dccuchile/bert-base-spanish-wwm-cased
+    - bert-base-multilingual-cased
+  num_train_epochs:
+    values: [5]
+  task_name:
+    values:
+    - ner
+    - pos
+  dataset_name:
+    value: conll2002
+  dataset_config_name:
+    value: es
+  output_dir:
+    value: ./outputs
+  overwrite_output_dir:
+    value: true
+  pad_to_max_length:
+    value: true
+  per_device_train_batch_size:
+    value: 16
+  per_device_eval_batch_size:
+    value: 16
+  save_total_limit:
+    value: 1
+  do_train:
+    value: true
+  do_eval:
+    value: true

evaluation/xnli.yaml CHANGED Viewed

@@ -36,8 +36,6 @@ parameters:
     value: ./outputs
   overwrite_output_dir:
     value: true
-  resume_from_checkpoint:
-    value: false
   max_seq_length:
     value: 512
   pad_to_max_length:

     value: ./outputs
   overwrite_output_dir:
     value: true
   max_seq_length:
     value: 512
   pad_to_max_length:

images/bertin.png CHANGED Viewed

run_mlm_flax_stream.py CHANGED Viewed

@@ -384,8 +384,8 @@ def to_f32(t):
 def convert(output_dir, destination_dir="./"):
-    shutil.copyfile(Path(output_dir) / "flax_model.msgpack", destination_dir)
-    shutil.copyfile(Path(output_dir) / "config.json", destination_dir)
     # Saving extra files from config.json and tokenizer.json files
     tokenizer = AutoTokenizer.from_pretrained(destination_dir)
     tokenizer.save_pretrained(destination_dir)
@@ -611,8 +611,8 @@ if __name__ == "__main__":
     # Setup train state
     state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw)
-    saved_step = 0
-    if "checkpoint" in model_args.model_name_or_path:
         params, opt_state, saved_step, args, data_collator = restore_checkpoint(model_args.model_name_or_path, state)
         # Create learning rate schedule
         warmup_fn = optax.linear_schedule(
@@ -714,8 +714,9 @@ if __name__ == "__main__":
     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
     eval_samples = advance_iter_and_group_samples(training_iter, data_args.num_eval_samples, max_seq_length)
     steps = tqdm(range(num_train_steps), desc="Training...", position=0)
-    for step in range(saved_step, num_train_steps):
         if step < saved_step:
             steps.update(1)
             continue
@@ -827,5 +828,5 @@ if __name__ == "__main__":
             training_args.output_dir,
             params=params,
             push_to_hub=training_args.push_to_hub,
-            commit_message=last_desc,
         )

 def convert(output_dir, destination_dir="./"):
+    shutil.copyfile(Path(output_dir) / "flax_model.msgpack", Path(destination_dir) / "flax_model.msgpack")
+    shutil.copyfile(Path(output_dir) / "config.json", Path(destination_dir) / "config.json")
     # Saving extra files from config.json and tokenizer.json files
     tokenizer = AutoTokenizer.from_pretrained(destination_dir)
     tokenizer.save_pretrained(destination_dir)
     # Setup train state
     state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw)
+    saved_step = -1
+    if model_args.model_name_or_path and "checkpoint" in model_args.model_name_or_path:
         params, opt_state, saved_step, args, data_collator = restore_checkpoint(model_args.model_name_or_path, state)
         # Create learning rate schedule
         warmup_fn = optax.linear_schedule(
     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
     eval_samples = advance_iter_and_group_samples(training_iter, data_args.num_eval_samples, max_seq_length)
+    last_desc = ""
     steps = tqdm(range(num_train_steps), desc="Training...", position=0)
+    for step in range(num_train_steps):
         if step < saved_step:
             steps.update(1)
             continue
             training_args.output_dir,
             params=params,
             push_to_hub=training_args.push_to_hub,
+            commit_message=last_desc or "Saving model after training",
         )

utils/download_mc4es_sampled.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import io
+import gzip
+import json
+import sys
+import requests
+from tqdm import tqdm
+_DATA_URL_TRAIN = "https://huggingface.co/datasets/bertin-project/mc4-es-sampled/resolve/main/mc4-es-train-50M-{config}-shard-{index:04d}-of-{n_shards:04d}.json.gz"
+def main(config="stepwise"):
+    data_urls = [
+        _DATA_URL_TRAIN.format(
+            config=config,
+            index=index + 1,
+            n_shards=1024,
+        )
+        for index in range(1024)
+    ]
+    with open(f"mc4-es-train-50M-{config}.jsonl", "w") as f:
+        for dara_url in tqdm(data_urls):
+            response = requests.get(dara_url)
+            bio = io.BytesIO(response.content)
+            with gzip.open(bio, "rt", encoding="utf8") as g:
+                for line in g:
+                    json_line = json.loads(line.strip())
+                    f.write(json.dumps(json_line) + "\n")
+if __name__ == "__main__":
+    main(sys.argv[1])