flax-community
/

roberta-base-mr

@@ -0,0 +1,273 @@
  0%|          | 0/5 [00:00<?, ?ba/s]
 20%|██        | 1/5 [00:00<00:00,  8.80ba/s]
 40%|████      | 2/5 [00:00<00:00,  9.16ba/s]
 60%|██████    | 3/5 [00:00<00:00,  9.52ba/s]
 80%|████████  | 4/5 [00:00<00:00,  7.13ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]
  0%|          | 0/5 [00:00<?, ?ba/s]
 20%|██        | 1/5 [00:00<00:03,  1.32ba/s]
 40%|████      | 2/5 [00:01<00:02,  1.24ba/s]
 60%|██████    | 3/5 [00:02<00:01,  1.21ba/s]
 80%|████████  | 4/5 [00:03<00:00,  1.13ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]

+[16:04:24] - INFO - __main__ - Training/evaluation parameters TrainingArguments(
+_n_gpu=-1,
+adafactor=False,
+adam_beta1=0.9,
+adam_beta2=0.999,
+adam_epsilon=1e-08,
+dataloader_drop_last=False,
+dataloader_num_workers=0,
+dataloader_pin_memory=True,
+ddp_find_unused_parameters=None,
+debug=[],
+deepspeed=None,
+disable_tqdm=False,
+do_eval=False,
+do_predict=False,
+do_train=False,
+eval_accumulation_steps=None,
+eval_steps=500,
+evaluation_strategy=IntervalStrategy.NO,
+fp16=False,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+gradient_accumulation_steps=1,
+greater_is_better=None,
+group_by_length=False,
+ignore_data_skip=False,
+label_names=None,
+label_smoothing_factor=0.0,
+learning_rate=0.0003,
+length_column_name=length,
+load_best_model_at_end=False,
+local_rank=-1,
+log_level=-1,
+log_level_replica=-1,
+log_on_each_node=True,
+logging_dir=./runs/Jul08_16-04-24_t1v-n-112df4a9-w-0,
+logging_first_step=False,
+logging_steps=500,
+logging_strategy=IntervalStrategy.STEPS,
+lr_scheduler_type=SchedulerType.LINEAR,
+max_grad_norm=1.0,
+max_steps=-1,
+metric_for_best_model=None,
+mp_parameters=,
+no_cuda=False,
+num_train_epochs=8.0,
+output_dir=./,
+overwrite_output_dir=True,
+past_index=-1,
+per_device_eval_batch_size=4,
+per_device_train_batch_size=4,
+prediction_loss_only=False,
+push_to_hub=True,
+push_to_hub_model_id=flax-community/roberta-base-mr,
+push_to_hub_organization=None,
+push_to_hub_token=<redacted>,
+remove_unused_columns=True,
+report_to=[],
+resume_from_checkpoint=None,
+run_name=./,
+save_on_each_node=False,
+save_steps=500,
+save_strategy=IntervalStrategy.STEPS,
+save_total_limit=None,
+seed=42,
+sharded_ddp=[],
+skip_memory_metrics=True,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+use_legacy_prediction_loop=False,
+warmup_ratio=0.0,
+warmup_steps=1000,
+weight_decay=0.0,
+)
+[16:04:24] - DEBUG - urllib3.connectionpool - Starting new HTTPS connection (1): s3.amazonaws.com:443
+[16:04:24] - DEBUG - urllib3.connectionpool - https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/datasets/oscar/oscar.py HTTP/1.1" 404 0
+[16:04:24] - DEBUG - urllib3.connectionpool - Starting new HTTPS connection (1): raw.githubusercontent.com:443
+[16:04:24] - DEBUG - urllib3.connectionpool - https://raw.githubusercontent.com:443 "HEAD /huggingface/datasets/master/datasets/oscar/oscar.py HTTP/1.1" 200 0
+[16:04:24] - DEBUG - urllib3.connectionpool - Starting new HTTPS connection (1): raw.githubusercontent.com:443
+[16:04:24] - DEBUG - urllib3.connectionpool - https://raw.githubusercontent.com:443 "HEAD /huggingface/datasets/master/datasets/oscar/dataset_infos.json HTTP/1.1" 200 0
+[16:04:24] - WARNING - datasets.builder - Reusing dataset oscar (/home/nipunsadvilkar/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_als/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2)
+[16:04:24] - DEBUG - urllib3.connectionpool - Starting new HTTPS connection (1): s3.amazonaws.com:443
+[16:04:24] - DEBUG - urllib3.connectionpool - https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/datasets/oscar/oscar.py HTTP/1.1" 404 0
+[16:04:24] - DEBUG - urllib3.connectionpool - Starting new HTTPS connection (1): raw.githubusercontent.com:443
+[16:04:24] - DEBUG - urllib3.connectionpool - https://raw.githubusercontent.com:443 "HEAD /huggingface/datasets/master/datasets/oscar/oscar.py HTTP/1.1" 200 0
+[16:04:24] - DEBUG - urllib3.connectionpool - Starting new HTTPS connection (1): raw.githubusercontent.com:443
+[16:04:24] - DEBUG - urllib3.connectionpool - https://raw.githubusercontent.com:443 "HEAD /huggingface/datasets/master/datasets/oscar/dataset_infos.json HTTP/1.1" 200 0
+[16:04:24] - WARNING - datasets.builder - Reusing dataset oscar (/home/nipunsadvilkar/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_als/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2)
+[16:04:24] - DEBUG - urllib3.connectionpool - Starting new HTTPS connection (1): s3.amazonaws.com:443
+[16:04:25] - DEBUG - urllib3.connectionpool - https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/datasets/oscar/oscar.py HTTP/1.1" 404 0
+[16:04:25] - DEBUG - urllib3.connectionpool - Starting new HTTPS connection (1): raw.githubusercontent.com:443
+[16:04:25] - DEBUG - urllib3.connectionpool - https://raw.githubusercontent.com:443 "HEAD /huggingface/datasets/master/datasets/oscar/oscar.py HTTP/1.1" 200 0
+[16:04:25] - DEBUG - urllib3.connectionpool - Starting new HTTPS connection (1): raw.githubusercontent.com:443
+[16:04:25] - DEBUG - urllib3.connectionpool - https://raw.githubusercontent.com:443 "HEAD /huggingface/datasets/master/datasets/oscar/dataset_infos.json HTTP/1.1" 200 0
+[16:04:25] - WARNING - datasets.builder - Reusing dataset oscar (/home/nipunsadvilkar/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_als/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2)
  0%|          | 0/5 [00:00<?, ?ba/s]
 20%|██        | 1/5 [00:00<00:00,  8.80ba/s]
 40%|████      | 2/5 [00:00<00:00,  9.16ba/s]
 60%|██████    | 3/5 [00:00<00:00,  9.52ba/s]
 80%|████████  | 4/5 [00:00<00:00,  7.13ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]
  0%|          | 0/5 [00:00<?, ?ba/s]
 20%|██        | 1/5 [00:00<00:03,  1.32ba/s]
 40%|████      | 2/5 [00:01<00:02,  1.24ba/s]
 60%|██████    | 3/5 [00:02<00:01,  1.21ba/s]
 80%|████████  | 4/5 [00:03<00:00,  1.13ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]
+[16:04:29] - WARNING - __main__ - Unable to display metrics through TensorBoard because the package is not installed: Please run pip install tensorboard to enable.
+[16:04:29] - INFO - absl - Starting the local TPU driver.
+[16:04:29] - INFO - absl - Unable to initialize backend 'tpu_driver': Not found: Unable to find driver in registry given worker: local://
+[16:04:29] - INFO - absl - Unable to initialize backend 'gpu': Not found: Could not find registered platform with name: "cuda". Available platform names are: Interpreter TPU Host
+/home/nipunsadvilkar/roberta_mr_env/lib/python3.8/site-packages/jax/lib/xla_bridge.py:382: UserWarning: jax.host_count has been renamed to jax.process_count. This alias will eventually be removed; please update your code.
+  warnings.warn(
+/home/nipunsadvilkar/roberta_mr_env/lib/python3.8/site-packages/jax/lib/xla_bridge.py:369: UserWarning: jax.host_id has been renamed to jax.process_index. This alias will eventually be removed; please update your code.
+  warnings.warn(
+To disable this warning, you can either:
+	- Avoid using `tokenizers` before the fork if possible
+	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
+huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
+To disable this warning, you can either:
+	- Avoid using `tokenizers` before the fork if possible
+	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
+[16:06:12] - INFO - huggingface_hub.repository - git version 2.25.1
+git-lfs/2.9.2 (GitHub; linux amd64; go 1.13.5)
+huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
+To disable this warning, you can either:
+	- Avoid using `tokenizers` before the fork if possible
+	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
+[16:06:12] - DEBUG - huggingface_hub.repository - [Repository] is a valid git repo
+huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
+To disable this warning, you can either:
+	- Avoid using `tokenizers` before the fork if possible
+	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
+huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
+To disable this warning, you can either:
+	- Avoid using `tokenizers` before the fork if possible
+	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

run.sh ADDED Viewed

	@@ -0,0 +1,18 @@

#!/usr/bin/env bash
# Train the masked-language model with run_mlm_flax.py and push the result to
# the Hugging Face Hub. Combined stdout/stderr is shown live on the terminal
# and a copy is saved to run.log.

# Stop on the first error or unset variable. pipefail makes the script exit
# with the trainer's status instead of tee's, so a failed run is not reported
# as a success.
set -euo pipefail

# Hub token as stored in the local token file.
HUB_TOKEN="$(cat "$HOME/.huggingface/token")"
if [ -z "$HUB_TOKEN" ]; then
    echo "error: $HOME/.huggingface/token is empty" >&2
    exit 1
fi

# The token is passed as a command-line argument, so it can be echoed back in
# the program's output. Mask it in the copy written to run.log so the log can
# be shared or committed without leaking the credential.
# NOTE(review): the sed pattern assumes the token contains no regex
# metacharacters and no "/" - confirm for your token format.
python run_mlm_flax.py \
    --output_dir="./" \
    --model_type="roberta" \
    --config_name="./" \
    --tokenizer_name="./" \
    --dataset_name="oscar" \
    --dataset_config_name="unshuffled_deduplicated_als" \
    --max_seq_length="128" \
    --per_device_train_batch_size="4" \
    --per_device_eval_batch_size="4" \
    --learning_rate="3e-4" \
    --warmup_steps="1000" \
    --overwrite_output_dir \
    --num_train_epochs="8" \
    --push_to_hub_model_id="flax-community/roberta-base-mr" \
    --push_to_hub_token="$HUB_TOKEN" \
    --push_to_hub 2>&1 | tee >(sed "s/${HUB_TOKEN}/<redacted>/g" > run.log)

tokens.py ADDED Viewed

	@@ -0,0 +1,20 @@

#!/usr/bin/env python3
"""Train a byte-level BPE tokenizer on an OSCAR subset and save it to disk."""
from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer

# Load dataset
# NOTE(review): the config name ends in "als" while the Hub model id used
# alongside this script ends in "-mr" - confirm this is the intended subset.
dataset = load_dataset("oscar", "unshuffled_deduplicated_als", split="train")

# Instantiate tokenizer
tokenizer = ByteLevelBPETokenizer()


def batch_iterator(batch_size=100_000):
    """Yield the dataset's "text" column in consecutive chunks.

    Args:
        batch_size: Maximum number of rows per yielded chunk; the final
            chunk may be shorter.

    Yields:
        The "text" values of rows ``i`` to ``i + batch_size`` (exclusive).
    """
    for i in range(0, len(dataset), batch_size):
        # Slice the rows first, then select the column. Indexing the column
        # first (dataset["text"][...]) would load the entire column on every
        # iteration just to keep one slice of it.
        yield dataset[i: i + batch_size]["text"]


# Customized training
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50265,
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Save files to disk
tokenizer.save("./tokenizer.json")