Abinaya Mahendiran committed on
Commit 3d74ff6
1 Parent(s): 2cc2a38

Updated baseline

Files changed (42)
  1. .gitattributes +3 -2
  2. gpt-2-tamil/config.json +36 -0
  3. gpt-2-tamil/events.out.tfevents.1626064970.t1v-n-ebe36c53-w-0.400773.3.v2 +3 -0
  4. gpt-2-tamil/events.out.tfevents.1626108088.t1v-n-ebe36c53-w-0.483452.3.v2 +3 -0
  5. gpt-2-tamil/events.out.tfevents.1626108395.t1v-n-ebe36c53-w-0.486342.3.v2 +3 -0
  6. gpt-2-tamil/flax_model.msgpack +3 -0
  7. gpt-2-tamil/tokenizer.json +0 -0
  8. scripts/run.log +0 -0
  9. scripts/train_gpt2-oscar-tamil.sh +11 -3
  10. scripts/wandb/debug-internal.log +1 -0
  11. scripts/wandb/debug.log +1 -0
  12. scripts/wandb/latest-run +1 -0
  13. scripts/wandb/run-20210712_044248-12kjsz9i/files/config.yaml +301 -0
  14. scripts/wandb/run-20210712_044248-12kjsz9i/files/events.out.tfevents.1626064970.t1v-n-ebe36c53-w-0.400773.3.v2 +1 -0
  15. scripts/wandb/run-20210712_044248-12kjsz9i/files/output.log +3 -0
  16. scripts/wandb/run-20210712_044248-12kjsz9i/files/requirements.txt +123 -0
  17. scripts/wandb/run-20210712_044248-12kjsz9i/files/wandb-metadata.json +45 -0
  18. scripts/wandb/run-20210712_044248-12kjsz9i/files/wandb-summary.json +1 -0
  19. scripts/wandb/run-20210712_044248-12kjsz9i/logs/debug-internal.log +3 -0
  20. scripts/wandb/run-20210712_044248-12kjsz9i/logs/debug.log +3 -0
  21. scripts/wandb/run-20210712_044248-12kjsz9i/run-12kjsz9i.wandb +3 -0
  22. scripts/wandb/run-20210712_164126-1cgtoi5r/files/config.yaml +305 -0
  23. scripts/wandb/run-20210712_164126-1cgtoi5r/files/events.out.tfevents.1626108088.t1v-n-ebe36c53-w-0.483452.3.v2 +1 -0
  24. scripts/wandb/run-20210712_164126-1cgtoi5r/files/output.log +3 -0
  25. scripts/wandb/run-20210712_164126-1cgtoi5r/files/requirements.txt +123 -0
  26. scripts/wandb/run-20210712_164126-1cgtoi5r/files/wandb-metadata.json +49 -0
  27. scripts/wandb/run-20210712_164126-1cgtoi5r/files/wandb-summary.json +1 -0
  28. scripts/wandb/run-20210712_164126-1cgtoi5r/logs/debug-internal.log +3 -0
  29. scripts/wandb/run-20210712_164126-1cgtoi5r/logs/debug.log +3 -0
  30. scripts/wandb/run-20210712_164126-1cgtoi5r/run-1cgtoi5r.wandb +3 -0
  31. scripts/wandb/run-20210712_164633-1ddv4131/files/config.yaml +305 -0
  32. scripts/wandb/run-20210712_164633-1ddv4131/files/events.out.tfevents.1626108395.t1v-n-ebe36c53-w-0.486342.3.v2 +1 -0
  33. scripts/wandb/run-20210712_164633-1ddv4131/files/output.log +3 -0
  34. scripts/wandb/run-20210712_164633-1ddv4131/files/requirements.txt +123 -0
  35. scripts/wandb/run-20210712_164633-1ddv4131/files/wandb-metadata.json +49 -0
  36. scripts/wandb/run-20210712_164633-1ddv4131/files/wandb-summary.json +1 -0
  37. scripts/wandb/run-20210712_164633-1ddv4131/logs/debug-internal.log +3 -0
  38. scripts/wandb/run-20210712_164633-1ddv4131/logs/debug.log +3 -0
  39. scripts/wandb/run-20210712_164633-1ddv4131/run-1ddv4131.wandb +3 -0
  40. src/create_config.py +1 -1
  41. src/run_clm_flax.py +147 -232
  42. src/train_tokenizer.py +1 -1
.gitattributes CHANGED
@@ -12,6 +12,7 @@
  *.model filter=lfs diff=lfs merge=lfs -text
  *.msgpack filter=lfs diff=lfs merge=lfs -text
  *.pb filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
+ *.log filter=lfs diff=lfs merge=lfs -text
+ *.wandb filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
gpt-2-tamil/config.json ADDED
@@ -0,0 +1,36 @@
+ {
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2LMHeadModel"
+ ],
+ "attn_pdrop": 0.0,
+ "bos_token_id": 50256,
+ "embd_pdrop": 0.0,
+ "eos_token_id": 50256,
+ "gradient_checkpointing": false,
+ "initializer_range": 0.02,
+ "layer_norm_epsilon": 1e-05,
+ "model_type": "gpt2",
+ "n_ctx": 1024,
+ "n_embd": 768,
+ "n_head": 12,
+ "n_inner": null,
+ "n_layer": 12,
+ "n_positions": 1024,
+ "resid_pdrop": 0.0,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "task_specific_params": {
+ "text-generation": {
+ "do_sample": true,
+ "max_length": 50
+ }
+ },
+ "transformers_version": "4.9.0.dev0",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
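
The added config.json describes a standard GPT-2 small architecture (12 layers, 12 heads, 768-dimensional embeddings, 1024-token context, 50,257-token vocabulary) with every dropout probability set to 0.0. As a minimal sketch (not part of the commit), the directory can be loaded the same way run_clm_flax.py does when --config_name and --tokenizer_name point at it; the local path below is assumed:

# Minimal sketch, assuming the gpt-2-tamil/ directory above is available locally.
import jax.numpy as jnp
from transformers import AutoConfig, AutoTokenizer, FlaxAutoModelForCausalLM

config = AutoConfig.from_pretrained("gpt-2-tamil")        # reads the config.json shown above
tokenizer = AutoTokenizer.from_pretrained("gpt-2-tamil")  # reads tokenizer.json
model = FlaxAutoModelForCausalLM.from_config(config, seed=42, dtype=jnp.float32)
print(config.n_layer, config.n_head, config.n_embd)       # 12 12 768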
gpt-2-tamil/events.out.tfevents.1626064970.t1v-n-ebe36c53-w-0.400773.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4cc79262fd103f58e2c2bb461dc3db699613de0d444116b20f5644759ebfbe6e
+ size 40
gpt-2-tamil/events.out.tfevents.1626108088.t1v-n-ebe36c53-w-0.483452.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d27b640fe5e66ecd1bf4a3667a35e4243bc4afc19dd7a2247a6e3d0a56211f6
+ size 40
gpt-2-tamil/events.out.tfevents.1626108395.t1v-n-ebe36c53-w-0.486342.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f98c1e1d0d88519bc875d97549b8ceb6d03f7c5d0aca79c15a10749f91c28362
+ size 19735799
gpt-2-tamil/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f15aa88a1b0381444c39e9e70f17a82751f7c317d7be7e22cc9707527f9a8c27
+ size 497764120
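
The Flax weights are stored as a Git LFS pointer (version, oid, size) rather than as the binary itself. The 497,764,120-byte size is consistent with a GPT-2 small checkpoint serialised in float32; a rough check, where the parameter count is the commonly quoted figure for GPT-2 small and not something recorded in this commit:

# Illustrative size check only; the parameter count is an assumption.
n_params = 124_439_808   # approximate GPT-2 small parameter count
print(n_params * 4)      # 497,759,232 bytes as float32, close to the 497,764,120-byte pointer above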
gpt-2-tamil/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
scripts/run.log ADDED
File without changes
scripts/train_gpt2-oscar-tamil.sh CHANGED
@@ -1,5 +1,5 @@
  #!/usr/bin/env bash
- ./run_clm_flax.py \
+ python ../src/run_clm_flax.py \
  --output_dir="${MODEL_DIR}" \
  --model_type="gpt2" \
  --config_name="${MODEL_DIR}" \
@@ -10,8 +10,16 @@
  --block_size="512" \
  --per_device_train_batch_size="64" \
  --per_device_eval_batch_size="64" \
- --learning_rate="5e-3" --warmup_steps="1000" \
+ --learning_rate="3e-5" \
+ --warmup_steps="1000" \
  --adam_beta1="0.9" --adam_beta2="0.98" --weight_decay="0.01" \
  --overwrite_output_dir \
- --num_train_epochs="20" \
+ --num_train_epochs="25" \
+ --report_to wandb \
+ --run_name trial \
+ --logging_steps="500" \
+ --save_steps="2500" \
+ --eval_steps="2500" \
+ --preprocessing_num_workers="90" \
  #--push_to_hub
+ 2>&1 | tee run.log
scripts/wandb/debug-internal.log ADDED
@@ -0,0 +1 @@
+ run-20210712_164633-1ddv4131/logs/debug-internal.log
scripts/wandb/debug.log ADDED
@@ -0,0 +1 @@
+ run-20210712_164633-1ddv4131/logs/debug.log
scripts/wandb/latest-run ADDED
@@ -0,0 +1 @@
+ run-20210712_164633-1ddv4131
scripts/wandb/run-20210712_044248-12kjsz9i/files/config.yaml ADDED
@@ -0,0 +1,301 @@
1
+ wandb_version: 1
2
+
3
+ __cached__setup_devices:
4
+ desc: null
5
+ value: cpu
6
+ _n_gpu:
7
+ desc: null
8
+ value: 0
9
+ _wandb:
10
+ desc: null
11
+ value:
12
+ cli_version: 0.10.33
13
+ framework: huggingface
14
+ huggingface_version: 4.9.0.dev0
15
+ is_jupyter_run: false
16
+ is_kaggle_kernel: false
17
+ python_version: 3.8.10
18
+ t:
19
+ 1:
20
+ - 1
21
+ - 3
22
+ - 11
23
+ 4: 3.8.10
24
+ 5: 0.10.33
25
+ 6: 4.9.0.dev0
26
+ 8:
27
+ - 5
28
+ adafactor:
29
+ desc: null
30
+ value: false
31
+ adam_beta1:
32
+ desc: null
33
+ value: 0.9
34
+ adam_beta2:
35
+ desc: null
36
+ value: 0.98
37
+ adam_epsilon:
38
+ desc: null
39
+ value: 1.0e-08
40
+ block_size:
41
+ desc: null
42
+ value: 512
43
+ cache_dir:
44
+ desc: null
45
+ value: null
46
+ config_name:
47
+ desc: null
48
+ value: ../gpt-2-tamil/
49
+ dataloader_drop_last:
50
+ desc: null
51
+ value: false
52
+ dataloader_num_workers:
53
+ desc: null
54
+ value: 0
55
+ dataloader_pin_memory:
56
+ desc: null
57
+ value: true
58
+ dataset_config_name:
59
+ desc: null
60
+ value: unshuffled_deduplicated_ta
61
+ dataset_name:
62
+ desc: null
63
+ value: oscar
64
+ ddp_find_unused_parameters:
65
+ desc: null
66
+ value: null
67
+ debug:
68
+ desc: null
69
+ value: []
70
+ deepspeed:
71
+ desc: null
72
+ value: null
73
+ disable_tqdm:
74
+ desc: null
75
+ value: false
76
+ do_eval:
77
+ desc: null
78
+ value: true
79
+ do_predict:
80
+ desc: null
81
+ value: false
82
+ do_train:
83
+ desc: null
84
+ value: true
85
+ dtype:
86
+ desc: null
87
+ value: float32
88
+ eval_accumulation_steps:
89
+ desc: null
90
+ value: null
91
+ eval_steps:
92
+ desc: null
93
+ value: 500
94
+ evaluation_strategy:
95
+ desc: null
96
+ value: IntervalStrategy.NO
97
+ fp16:
98
+ desc: null
99
+ value: false
100
+ fp16_backend:
101
+ desc: null
102
+ value: auto
103
+ fp16_full_eval:
104
+ desc: null
105
+ value: false
106
+ fp16_opt_level:
107
+ desc: null
108
+ value: O1
109
+ gradient_accumulation_steps:
110
+ desc: null
111
+ value: 1
112
+ greater_is_better:
113
+ desc: null
114
+ value: null
115
+ group_by_length:
116
+ desc: null
117
+ value: false
118
+ ignore_data_skip:
119
+ desc: null
120
+ value: false
121
+ label_names:
122
+ desc: null
123
+ value: null
124
+ label_smoothing_factor:
125
+ desc: null
126
+ value: 0.0
127
+ learning_rate:
128
+ desc: null
129
+ value: 3.0e-05
130
+ length_column_name:
131
+ desc: null
132
+ value: length
133
+ load_best_model_at_end:
134
+ desc: null
135
+ value: false
136
+ local_rank:
137
+ desc: null
138
+ value: -1
139
+ log_level:
140
+ desc: null
141
+ value: -1
142
+ log_level_replica:
143
+ desc: null
144
+ value: -1
145
+ log_on_each_node:
146
+ desc: null
147
+ value: true
148
+ logging_dir:
149
+ desc: null
150
+ value: ../tmp/../gpt-2-tamil/runs/Jul11_17-18-14_t1v-n-ebe36c53-w-0
151
+ logging_first_step:
152
+ desc: null
153
+ value: false
154
+ logging_steps:
155
+ desc: null
156
+ value: 500
157
+ logging_strategy:
158
+ desc: null
159
+ value: IntervalStrategy.STEPS
160
+ lr_scheduler_type:
161
+ desc: null
162
+ value: SchedulerType.LINEAR
163
+ max_eval_samples:
164
+ desc: null
165
+ value: null
166
+ max_grad_norm:
167
+ desc: null
168
+ value: 1.0
169
+ max_steps:
170
+ desc: null
171
+ value: -1
172
+ max_train_samples:
173
+ desc: null
174
+ value: null
175
+ metric_for_best_model:
176
+ desc: null
177
+ value: null
178
+ model_name_or_path:
179
+ desc: null
180
+ value: null
181
+ model_type:
182
+ desc: null
183
+ value: gpt2
184
+ mp_parameters:
185
+ desc: null
186
+ value: ''
187
+ no_cuda:
188
+ desc: null
189
+ value: false
190
+ num_train_epochs:
191
+ desc: null
192
+ value: 1.0
193
+ output_dir:
194
+ desc: null
195
+ value: ../tmp/../gpt-2-tamil/
196
+ overwrite_cache:
197
+ desc: null
198
+ value: false
199
+ overwrite_output_dir:
200
+ desc: null
201
+ value: true
202
+ past_index:
203
+ desc: null
204
+ value: -1
205
+ per_device_eval_batch_size:
206
+ desc: null
207
+ value: 64
208
+ per_device_train_batch_size:
209
+ desc: null
210
+ value: 64
211
+ per_gpu_eval_batch_size:
212
+ desc: null
213
+ value: null
214
+ per_gpu_train_batch_size:
215
+ desc: null
216
+ value: null
217
+ prediction_loss_only:
218
+ desc: null
219
+ value: false
220
+ preprocessing_num_workers:
221
+ desc: null
222
+ value: null
223
+ push_to_hub:
224
+ desc: null
225
+ value: false
226
+ push_to_hub_model_id:
227
+ desc: null
228
+ value: gpt-2-tamil
229
+ push_to_hub_organization:
230
+ desc: null
231
+ value: null
232
+ push_to_hub_token:
233
+ desc: null
234
+ value: null
235
+ remove_unused_columns:
236
+ desc: null
237
+ value: true
238
+ report_to:
239
+ desc: null
240
+ value:
241
+ - wandb
242
+ resume_from_checkpoint:
243
+ desc: null
244
+ value: null
245
+ run_name:
246
+ desc: null
247
+ value: trial
248
+ save_on_each_node:
249
+ desc: null
250
+ value: false
251
+ save_steps:
252
+ desc: null
253
+ value: 500
254
+ save_strategy:
255
+ desc: null
256
+ value: IntervalStrategy.STEPS
257
+ save_total_limit:
258
+ desc: null
259
+ value: null
260
+ seed:
261
+ desc: null
262
+ value: 42
263
+ sharded_ddp:
264
+ desc: null
265
+ value: []
266
+ skip_memory_metrics:
267
+ desc: null
268
+ value: true
269
+ tokenizer_name:
270
+ desc: null
271
+ value: ../gpt-2-tamil/
272
+ tpu_metrics_debug:
273
+ desc: null
274
+ value: false
275
+ tpu_num_cores:
276
+ desc: null
277
+ value: null
278
+ train_file:
279
+ desc: null
280
+ value: null
281
+ use_fast_tokenizer:
282
+ desc: null
283
+ value: true
284
+ use_legacy_prediction_loop:
285
+ desc: null
286
+ value: false
287
+ validation_file:
288
+ desc: null
289
+ value: null
290
+ validation_split_percentage:
291
+ desc: null
292
+ value: 5
293
+ warmup_ratio:
294
+ desc: null
295
+ value: 0.0
296
+ warmup_steps:
297
+ desc: null
298
+ value: 1000
299
+ weight_decay:
300
+ desc: null
301
+ value: 0.01
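
The config.yaml above is the full set of HuggingFace TrainingArguments that wandb captured for the first one-epoch trial run. Its learning-rate settings (learning_rate: 3.0e-05, warmup_steps: 1000, lr_scheduler_type LINEAR) correspond to the linear warmup plus linear decay schedule that run_clm_flax.py assembles with optax (visible in the src/run_clm_flax.py diff further below). A sketch of that schedule, with an assumed total step count:

# Sketch of the implied schedule; the total step count is illustrative,
# the real value depends on dataset size and batch size.
import optax

learning_rate = 3e-5
num_warmup_steps = 1000
num_train_steps = 132_500  # assumed for illustration

warmup_fn = optax.linear_schedule(
    init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps
)
decay_fn = optax.linear_schedule(
    init_value=learning_rate,
    end_value=0.0,
    transition_steps=num_train_steps - num_warmup_steps,
)
schedule_fn = optax.join_schedules(
    schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps]
)
print(schedule_fn(0), schedule_fn(num_warmup_steps), schedule_fn(num_train_steps))
# 0.0, 3e-05, 0.0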
scripts/wandb/run-20210712_044248-12kjsz9i/files/events.out.tfevents.1626064970.t1v-n-ebe36c53-w-0.400773.3.v2 ADDED
@@ -0,0 +1 @@
+ /home/tweety_abi/GPT2-Tamil/gpt-2-tamil/events.out.tfevents.1626064970.t1v-n-ebe36c53-w-0.400773.3.v2
scripts/wandb/run-20210712_044248-12kjsz9i/files/output.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:83c77cb8fdf96d6479ff9c389029839beb48a924dec227d80f708b2d1f1dd66f
+ size 107953
scripts/wandb/run-20210712_044248-12kjsz9i/files/requirements.txt ADDED
@@ -0,0 +1,123 @@
1
+ absl-py==0.13.0
2
+ aiohttp==3.7.4.post0
3
+ appdirs==1.4.4
4
+ astunparse==1.6.3
5
+ async-timeout==3.0.1
6
+ attrs==21.2.0
7
+ backcall==0.2.0
8
+ black==21.6b0
9
+ cachetools==4.2.2
10
+ certifi==2021.5.30
11
+ cfgv==3.3.0
12
+ chardet==4.0.0
13
+ chex==0.0.7
14
+ click==8.0.1
15
+ configparser==5.0.2
16
+ cycler==0.10.0
17
+ datasets==1.8.1.dev0
18
+ decorator==5.0.9
19
+ dill==0.3.4
20
+ distlib==0.3.2
21
+ dm-tree==0.1.6
22
+ docker-pycreds==0.4.0
23
+ filelock==3.0.12
24
+ flake8==3.9.2
25
+ flatbuffers==1.12
26
+ flax==0.3.4
27
+ fsspec==2021.6.1
28
+ gast==0.4.0
29
+ gitdb==4.0.7
30
+ gitpython==3.1.18
31
+ google-auth-oauthlib==0.4.4
32
+ google-auth==1.32.1
33
+ google-pasta==0.2.0
34
+ grpcio==1.34.1
35
+ h5py==3.1.0
36
+ huggingface-hub==0.0.12
37
+ identify==2.2.10
38
+ idna==2.10
39
+ ipython-genutils==0.2.0
40
+ ipython==7.25.0
41
+ isort==5.9.1
42
+ jax==0.2.16
43
+ jaxlib==0.1.68
44
+ jedi==0.18.0
45
+ joblib==1.0.1
46
+ keras-nightly==2.5.0.dev2021032900
47
+ keras-preprocessing==1.1.2
48
+ kiwisolver==1.3.1
49
+ libtpu-nightly==0.1.dev20210615
50
+ markdown==3.3.4
51
+ matplotlib-inline==0.1.2
52
+ matplotlib==3.4.2
53
+ mccabe==0.6.1
54
+ msgpack==1.0.2
55
+ multidict==5.1.0
56
+ multiprocess==0.70.12.2
57
+ mypy-extensions==0.4.3
58
+ nodeenv==1.6.0
59
+ numpy==1.19.5
60
+ oauthlib==3.1.1
61
+ opt-einsum==3.3.0
62
+ optax==0.0.8
63
+ packaging==20.9
64
+ pandas==1.2.5
65
+ parso==0.8.2
66
+ pathspec==0.8.1
67
+ pathtools==0.1.2
68
+ pexpect==4.8.0
69
+ pickleshare==0.7.5
70
+ pillow==8.3.0
71
+ pip==20.0.2
72
+ pkg-resources==0.0.0
73
+ pre-commit==2.13.0
74
+ promise==2.3
75
+ prompt-toolkit==3.0.19
76
+ protobuf==3.17.3
77
+ psutil==5.8.0
78
+ ptyprocess==0.7.0
79
+ pyarrow==4.0.1
80
+ pyasn1-modules==0.2.8
81
+ pyasn1==0.4.8
82
+ pycodestyle==2.7.0
83
+ pyflakes==2.3.1
84
+ pygments==2.9.0
85
+ pyparsing==2.4.7
86
+ python-dateutil==2.8.1
87
+ pytz==2021.1
88
+ pyyaml==5.4.1
89
+ regex==2021.7.1
90
+ requests-oauthlib==1.3.0
91
+ requests==2.25.1
92
+ rsa==4.7.2
93
+ sacremoses==0.0.45
94
+ scipy==1.7.0
95
+ sentry-sdk==1.3.0
96
+ setuptools==44.0.0
97
+ shortuuid==1.0.1
98
+ six==1.15.0
99
+ smmap==4.0.0
100
+ subprocess32==3.5.4
101
+ tensorboard-data-server==0.6.1
102
+ tensorboard-plugin-wit==1.8.0
103
+ tensorboard==2.5.0
104
+ tensorflow-estimator==2.5.0
105
+ tensorflow==2.5.0
106
+ termcolor==1.1.0
107
+ tokenizers==0.10.3
108
+ toml==0.10.2
109
+ toolz==0.11.1
110
+ torch==1.9.0
111
+ tqdm==4.61.1
112
+ traitlets==5.0.5
113
+ transformers==4.9.0.dev0
114
+ typing-extensions==3.7.4.3
115
+ urllib3==1.26.6
116
+ virtualenv==20.4.7
117
+ wandb==0.10.33
118
+ wcwidth==0.2.5
119
+ werkzeug==2.0.1
120
+ wheel==0.36.2
121
+ wrapt==1.12.1
122
+ xxhash==2.0.2
123
+ yarl==1.6.3
scripts/wandb/run-20210712_044248-12kjsz9i/files/wandb-metadata.json ADDED
@@ -0,0 +1,45 @@
1
+ {
2
+ "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29",
3
+ "python": "3.8.10",
4
+ "heartbeatAt": "2021-07-12T04:42:50.208592",
5
+ "startedAt": "2021-07-12T04:42:48.264668",
6
+ "docker": null,
7
+ "cpu_count": 96,
8
+ "cuda": null,
9
+ "args": [
10
+ "--output_dir=../tmp/../gpt-2-tamil/",
11
+ "--model_type=gpt2",
12
+ "--config_name=../gpt-2-tamil/",
13
+ "--tokenizer_name=../gpt-2-tamil/",
14
+ "--dataset_name=oscar",
15
+ "--dataset_config_name=unshuffled_deduplicated_ta",
16
+ "--do_train",
17
+ "--do_eval",
18
+ "--block_size=512",
19
+ "--per_device_train_batch_size=64",
20
+ "--per_device_eval_batch_size=64",
21
+ "--learning_rate=3e-5",
22
+ "--warmup_steps=1000",
23
+ "--adam_beta1=0.9",
24
+ "--adam_beta2=0.98",
25
+ "--weight_decay=0.01",
26
+ "--overwrite_output_dir",
27
+ "--num_train_epochs=1",
28
+ "--report_to",
29
+ "wandb",
30
+ "--run_name",
31
+ "trial"
32
+ ],
33
+ "state": "running",
34
+ "program": "../src/run_clm_flax.py",
35
+ "codePath": "src/run_clm_flax.py",
36
+ "git": {
37
+ "remote": "https://github.com/AbinayaM02/GPT2-Tamil.git",
38
+ "commit": "a828229d00c071e9ced919095290b80e4781210e"
39
+ },
40
+ "email": "abinaya.m02@mphasis.com",
41
+ "root": "/home/tweety_abi/GPT2-Tamil",
42
+ "host": "t1v-n-ebe36c53-w-0",
43
+ "username": "tweety_abi",
44
+ "executable": "/home/tweety_abi/gpt2_env/bin/python"
45
+ }
scripts/wandb/run-20210712_044248-12kjsz9i/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {}
scripts/wandb/run-20210712_044248-12kjsz9i/logs/debug-internal.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5fcfc08cf0afaa70518eae77306ccb85256b1c75804bb875c43e69ead82eecee
+ size 351322
scripts/wandb/run-20210712_044248-12kjsz9i/logs/debug.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:629872cc9a122d0b701da9fb5b2d152a25a06d9c316ff22a307c2d5297a5f684
+ size 5672
scripts/wandb/run-20210712_044248-12kjsz9i/run-12kjsz9i.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:77335af984a7fc18e2e903ce57acfa4f5ee5a60613ffb55a60c3e1996007f5e9
+ size 327917
scripts/wandb/run-20210712_164126-1cgtoi5r/files/config.yaml ADDED
@@ -0,0 +1,305 @@
1
+ wandb_version: 1
2
+
3
+ __cached__setup_devices:
4
+ desc: null
5
+ value: cpu
6
+ _n_gpu:
7
+ desc: null
8
+ value: 0
9
+ _wandb:
10
+ desc: null
11
+ value:
12
+ cli_version: 0.10.33
13
+ framework: huggingface
14
+ huggingface_version: 4.9.0.dev0
15
+ is_jupyter_run: false
16
+ is_kaggle_kernel: false
17
+ python_version: 3.8.10
18
+ t:
19
+ 1:
20
+ - 1
21
+ - 3
22
+ - 11
23
+ 2:
24
+ - 1
25
+ - 3
26
+ - 11
27
+ 4: 3.8.10
28
+ 5: 0.10.33
29
+ 6: 4.9.0.dev0
30
+ 8:
31
+ - 5
32
+ adafactor:
33
+ desc: null
34
+ value: false
35
+ adam_beta1:
36
+ desc: null
37
+ value: 0.9
38
+ adam_beta2:
39
+ desc: null
40
+ value: 0.98
41
+ adam_epsilon:
42
+ desc: null
43
+ value: 1.0e-08
44
+ block_size:
45
+ desc: null
46
+ value: 512
47
+ cache_dir:
48
+ desc: null
49
+ value: null
50
+ config_name:
51
+ desc: null
52
+ value: ../gpt-2-tamil/
53
+ dataloader_drop_last:
54
+ desc: null
55
+ value: false
56
+ dataloader_num_workers:
57
+ desc: null
58
+ value: 0
59
+ dataloader_pin_memory:
60
+ desc: null
61
+ value: true
62
+ dataset_config_name:
63
+ desc: null
64
+ value: unshuffled_deduplicated_ta
65
+ dataset_name:
66
+ desc: null
67
+ value: oscar
68
+ ddp_find_unused_parameters:
69
+ desc: null
70
+ value: null
71
+ debug:
72
+ desc: null
73
+ value: []
74
+ deepspeed:
75
+ desc: null
76
+ value: null
77
+ disable_tqdm:
78
+ desc: null
79
+ value: false
80
+ do_eval:
81
+ desc: null
82
+ value: true
83
+ do_predict:
84
+ desc: null
85
+ value: false
86
+ do_train:
87
+ desc: null
88
+ value: true
89
+ dtype:
90
+ desc: null
91
+ value: float32
92
+ eval_accumulation_steps:
93
+ desc: null
94
+ value: null
95
+ eval_steps:
96
+ desc: null
97
+ value: 2500
98
+ evaluation_strategy:
99
+ desc: null
100
+ value: IntervalStrategy.NO
101
+ fp16:
102
+ desc: null
103
+ value: false
104
+ fp16_backend:
105
+ desc: null
106
+ value: auto
107
+ fp16_full_eval:
108
+ desc: null
109
+ value: false
110
+ fp16_opt_level:
111
+ desc: null
112
+ value: O1
113
+ gradient_accumulation_steps:
114
+ desc: null
115
+ value: 1
116
+ greater_is_better:
117
+ desc: null
118
+ value: null
119
+ group_by_length:
120
+ desc: null
121
+ value: false
122
+ ignore_data_skip:
123
+ desc: null
124
+ value: false
125
+ label_names:
126
+ desc: null
127
+ value: null
128
+ label_smoothing_factor:
129
+ desc: null
130
+ value: 0.0
131
+ learning_rate:
132
+ desc: null
133
+ value: 3.0e-05
134
+ length_column_name:
135
+ desc: null
136
+ value: length
137
+ load_best_model_at_end:
138
+ desc: null
139
+ value: false
140
+ local_rank:
141
+ desc: null
142
+ value: -1
143
+ log_level:
144
+ desc: null
145
+ value: -1
146
+ log_level_replica:
147
+ desc: null
148
+ value: -1
149
+ log_on_each_node:
150
+ desc: null
151
+ value: true
152
+ logging_dir:
153
+ desc: null
154
+ value: ../gpt-2-tamil/runs/Jul12_16-26-59_t1v-n-ebe36c53-w-0
155
+ logging_first_step:
156
+ desc: null
157
+ value: false
158
+ logging_steps:
159
+ desc: null
160
+ value: 500
161
+ logging_strategy:
162
+ desc: null
163
+ value: IntervalStrategy.STEPS
164
+ lr_scheduler_type:
165
+ desc: null
166
+ value: SchedulerType.LINEAR
167
+ max_eval_samples:
168
+ desc: null
169
+ value: null
170
+ max_grad_norm:
171
+ desc: null
172
+ value: 1.0
173
+ max_steps:
174
+ desc: null
175
+ value: -1
176
+ max_train_samples:
177
+ desc: null
178
+ value: null
179
+ metric_for_best_model:
180
+ desc: null
181
+ value: null
182
+ model_name_or_path:
183
+ desc: null
184
+ value: null
185
+ model_type:
186
+ desc: null
187
+ value: gpt2
188
+ mp_parameters:
189
+ desc: null
190
+ value: ''
191
+ no_cuda:
192
+ desc: null
193
+ value: false
194
+ num_train_epochs:
195
+ desc: null
196
+ value: 1.0
197
+ output_dir:
198
+ desc: null
199
+ value: ../gpt-2-tamil/
200
+ overwrite_cache:
201
+ desc: null
202
+ value: false
203
+ overwrite_output_dir:
204
+ desc: null
205
+ value: true
206
+ past_index:
207
+ desc: null
208
+ value: -1
209
+ per_device_eval_batch_size:
210
+ desc: null
211
+ value: 64
212
+ per_device_train_batch_size:
213
+ desc: null
214
+ value: 64
215
+ per_gpu_eval_batch_size:
216
+ desc: null
217
+ value: null
218
+ per_gpu_train_batch_size:
219
+ desc: null
220
+ value: null
221
+ prediction_loss_only:
222
+ desc: null
223
+ value: false
224
+ preprocessing_num_workers:
225
+ desc: null
226
+ value: 90
227
+ push_to_hub:
228
+ desc: null
229
+ value: false
230
+ push_to_hub_model_id:
231
+ desc: null
232
+ value: gpt-2-tamil
233
+ push_to_hub_organization:
234
+ desc: null
235
+ value: null
236
+ push_to_hub_token:
237
+ desc: null
238
+ value: null
239
+ remove_unused_columns:
240
+ desc: null
241
+ value: true
242
+ report_to:
243
+ desc: null
244
+ value:
245
+ - wandb
246
+ resume_from_checkpoint:
247
+ desc: null
248
+ value: null
249
+ run_name:
250
+ desc: null
251
+ value: trial
252
+ save_on_each_node:
253
+ desc: null
254
+ value: false
255
+ save_steps:
256
+ desc: null
257
+ value: 2500
258
+ save_strategy:
259
+ desc: null
260
+ value: IntervalStrategy.STEPS
261
+ save_total_limit:
262
+ desc: null
263
+ value: null
264
+ seed:
265
+ desc: null
266
+ value: 42
267
+ sharded_ddp:
268
+ desc: null
269
+ value: []
270
+ skip_memory_metrics:
271
+ desc: null
272
+ value: true
273
+ tokenizer_name:
274
+ desc: null
275
+ value: ../gpt-2-tamil/
276
+ tpu_metrics_debug:
277
+ desc: null
278
+ value: false
279
+ tpu_num_cores:
280
+ desc: null
281
+ value: null
282
+ train_file:
283
+ desc: null
284
+ value: null
285
+ use_fast_tokenizer:
286
+ desc: null
287
+ value: true
288
+ use_legacy_prediction_loop:
289
+ desc: null
290
+ value: false
291
+ validation_file:
292
+ desc: null
293
+ value: null
294
+ validation_split_percentage:
295
+ desc: null
296
+ value: 5
297
+ warmup_ratio:
298
+ desc: null
299
+ value: 0.0
300
+ warmup_steps:
301
+ desc: null
302
+ value: 1000
303
+ weight_decay:
304
+ desc: null
305
+ value: 0.01
scripts/wandb/run-20210712_164126-1cgtoi5r/files/events.out.tfevents.1626108088.t1v-n-ebe36c53-w-0.483452.3.v2 ADDED
@@ -0,0 +1 @@
+ /home/tweety_abi/GPT2-Tamil/gpt-2-tamil/events.out.tfevents.1626108088.t1v-n-ebe36c53-w-0.483452.3.v2
scripts/wandb/run-20210712_164126-1cgtoi5r/files/output.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e0919562e4d7c8bbdebb2074976c553f66727900bc770c5dd9d8041e3a008931
+ size 2408
scripts/wandb/run-20210712_164126-1cgtoi5r/files/requirements.txt ADDED
@@ -0,0 +1,123 @@
1
+ absl-py==0.13.0
2
+ aiohttp==3.7.4.post0
3
+ appdirs==1.4.4
4
+ astunparse==1.6.3
5
+ async-timeout==3.0.1
6
+ attrs==21.2.0
7
+ backcall==0.2.0
8
+ black==21.6b0
9
+ cachetools==4.2.2
10
+ certifi==2021.5.30
11
+ cfgv==3.3.0
12
+ chardet==4.0.0
13
+ chex==0.0.7
14
+ click==8.0.1
15
+ configparser==5.0.2
16
+ cycler==0.10.0
17
+ datasets==1.8.1.dev0
18
+ decorator==5.0.9
19
+ dill==0.3.4
20
+ distlib==0.3.2
21
+ dm-tree==0.1.6
22
+ docker-pycreds==0.4.0
23
+ filelock==3.0.12
24
+ flake8==3.9.2
25
+ flatbuffers==1.12
26
+ flax==0.3.4
27
+ fsspec==2021.6.1
28
+ gast==0.4.0
29
+ gitdb==4.0.7
30
+ gitpython==3.1.18
31
+ google-auth-oauthlib==0.4.4
32
+ google-auth==1.32.1
33
+ google-pasta==0.2.0
34
+ grpcio==1.34.1
35
+ h5py==3.1.0
36
+ huggingface-hub==0.0.12
37
+ identify==2.2.10
38
+ idna==2.10
39
+ ipython-genutils==0.2.0
40
+ ipython==7.25.0
41
+ isort==5.9.1
42
+ jax==0.2.16
43
+ jaxlib==0.1.68
44
+ jedi==0.18.0
45
+ joblib==1.0.1
46
+ keras-nightly==2.5.0.dev2021032900
47
+ keras-preprocessing==1.1.2
48
+ kiwisolver==1.3.1
49
+ libtpu-nightly==0.1.dev20210615
50
+ markdown==3.3.4
51
+ matplotlib-inline==0.1.2
52
+ matplotlib==3.4.2
53
+ mccabe==0.6.1
54
+ msgpack==1.0.2
55
+ multidict==5.1.0
56
+ multiprocess==0.70.12.2
57
+ mypy-extensions==0.4.3
58
+ nodeenv==1.6.0
59
+ numpy==1.19.5
60
+ oauthlib==3.1.1
61
+ opt-einsum==3.3.0
62
+ optax==0.0.8
63
+ packaging==20.9
64
+ pandas==1.2.5
65
+ parso==0.8.2
66
+ pathspec==0.8.1
67
+ pathtools==0.1.2
68
+ pexpect==4.8.0
69
+ pickleshare==0.7.5
70
+ pillow==8.3.0
71
+ pip==20.0.2
72
+ pkg-resources==0.0.0
73
+ pre-commit==2.13.0
74
+ promise==2.3
75
+ prompt-toolkit==3.0.19
76
+ protobuf==3.17.3
77
+ psutil==5.8.0
78
+ ptyprocess==0.7.0
79
+ pyarrow==4.0.1
80
+ pyasn1-modules==0.2.8
81
+ pyasn1==0.4.8
82
+ pycodestyle==2.7.0
83
+ pyflakes==2.3.1
84
+ pygments==2.9.0
85
+ pyparsing==2.4.7
86
+ python-dateutil==2.8.1
87
+ pytz==2021.1
88
+ pyyaml==5.4.1
89
+ regex==2021.7.1
90
+ requests-oauthlib==1.3.0
91
+ requests==2.25.1
92
+ rsa==4.7.2
93
+ sacremoses==0.0.45
94
+ scipy==1.7.0
95
+ sentry-sdk==1.3.0
96
+ setuptools==44.0.0
97
+ shortuuid==1.0.1
98
+ six==1.15.0
99
+ smmap==4.0.0
100
+ subprocess32==3.5.4
101
+ tensorboard-data-server==0.6.1
102
+ tensorboard-plugin-wit==1.8.0
103
+ tensorboard==2.5.0
104
+ tensorflow-estimator==2.5.0
105
+ tensorflow==2.5.0
106
+ termcolor==1.1.0
107
+ tokenizers==0.10.3
108
+ toml==0.10.2
109
+ toolz==0.11.1
110
+ torch==1.9.0
111
+ tqdm==4.61.1
112
+ traitlets==5.0.5
113
+ transformers==4.9.0.dev0
114
+ typing-extensions==3.7.4.3
115
+ urllib3==1.26.6
116
+ virtualenv==20.4.7
117
+ wandb==0.10.33
118
+ wcwidth==0.2.5
119
+ werkzeug==2.0.1
120
+ wheel==0.36.2
121
+ wrapt==1.12.1
122
+ xxhash==2.0.2
123
+ yarl==1.6.3
scripts/wandb/run-20210712_164126-1cgtoi5r/files/wandb-metadata.json ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29",
3
+ "python": "3.8.10",
4
+ "heartbeatAt": "2021-07-12T16:41:28.249908",
5
+ "startedAt": "2021-07-12T16:41:26.246514",
6
+ "docker": null,
7
+ "cpu_count": 96,
8
+ "cuda": null,
9
+ "args": [
10
+ "--output_dir=../gpt-2-tamil/",
11
+ "--model_type=gpt2",
12
+ "--config_name=../gpt-2-tamil/",
13
+ "--tokenizer_name=../gpt-2-tamil/",
14
+ "--dataset_name=oscar",
15
+ "--dataset_config_name=unshuffled_deduplicated_ta",
16
+ "--do_train",
17
+ "--do_eval",
18
+ "--block_size=512",
19
+ "--per_device_train_batch_size=64",
20
+ "--per_device_eval_batch_size=64",
21
+ "--learning_rate=3e-5",
22
+ "--warmup_steps=1000",
23
+ "--adam_beta1=0.9",
24
+ "--adam_beta2=0.98",
25
+ "--weight_decay=0.01",
26
+ "--overwrite_output_dir",
27
+ "--num_train_epochs=1",
28
+ "--report_to",
29
+ "wandb",
30
+ "--run_name",
31
+ "trial",
32
+ "--logging_steps=500",
33
+ "--save_steps=2500",
34
+ "--eval_steps=2500",
35
+ "--preprocessing_num_workers=90"
36
+ ],
37
+ "state": "running",
38
+ "program": "../src/run_clm_flax.py",
39
+ "codePath": "src/run_clm_flax.py",
40
+ "git": {
41
+ "remote": "https://github.com/AbinayaM02/GPT2-Tamil.git",
42
+ "commit": "a828229d00c071e9ced919095290b80e4781210e"
43
+ },
44
+ "email": "abinaya.m02@mphasis.com",
45
+ "root": "/home/tweety_abi/GPT2-Tamil",
46
+ "host": "t1v-n-ebe36c53-w-0",
47
+ "username": "tweety_abi",
48
+ "executable": "/home/tweety_abi/gpt2_env/bin/python"
49
+ }
scripts/wandb/run-20210712_164126-1cgtoi5r/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {}
scripts/wandb/run-20210712_164126-1cgtoi5r/logs/debug-internal.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f5042f20446f607f9de1da5328a848218a444a160203dbbe620844d138a5f041
+ size 28874
scripts/wandb/run-20210712_164126-1cgtoi5r/logs/debug.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b642e86c06ccde2fecec5b035f7154289468f3126e152b9170e2e8d35b655328
+ size 7649
scripts/wandb/run-20210712_164126-1cgtoi5r/run-1cgtoi5r.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1302a66f7718be000865df304f0b21d1471bf7bb1cc55e4aba7e50d6f051725d
+ size 15809
scripts/wandb/run-20210712_164633-1ddv4131/files/config.yaml ADDED
@@ -0,0 +1,305 @@
1
+ wandb_version: 1
2
+
3
+ __cached__setup_devices:
4
+ desc: null
5
+ value: cpu
6
+ _n_gpu:
7
+ desc: null
8
+ value: 0
9
+ _wandb:
10
+ desc: null
11
+ value:
12
+ cli_version: 0.10.33
13
+ framework: huggingface
14
+ huggingface_version: 4.9.0.dev0
15
+ is_jupyter_run: false
16
+ is_kaggle_kernel: false
17
+ python_version: 3.8.10
18
+ t:
19
+ 1:
20
+ - 1
21
+ - 3
22
+ - 11
23
+ 2:
24
+ - 1
25
+ - 3
26
+ - 11
27
+ 4: 3.8.10
28
+ 5: 0.10.33
29
+ 6: 4.9.0.dev0
30
+ 8:
31
+ - 5
32
+ adafactor:
33
+ desc: null
34
+ value: false
35
+ adam_beta1:
36
+ desc: null
37
+ value: 0.9
38
+ adam_beta2:
39
+ desc: null
40
+ value: 0.98
41
+ adam_epsilon:
42
+ desc: null
43
+ value: 1.0e-08
44
+ block_size:
45
+ desc: null
46
+ value: 512
47
+ cache_dir:
48
+ desc: null
49
+ value: null
50
+ config_name:
51
+ desc: null
52
+ value: ../gpt-2-tamil/
53
+ dataloader_drop_last:
54
+ desc: null
55
+ value: false
56
+ dataloader_num_workers:
57
+ desc: null
58
+ value: 0
59
+ dataloader_pin_memory:
60
+ desc: null
61
+ value: true
62
+ dataset_config_name:
63
+ desc: null
64
+ value: unshuffled_deduplicated_ta
65
+ dataset_name:
66
+ desc: null
67
+ value: oscar
68
+ ddp_find_unused_parameters:
69
+ desc: null
70
+ value: null
71
+ debug:
72
+ desc: null
73
+ value: []
74
+ deepspeed:
75
+ desc: null
76
+ value: null
77
+ disable_tqdm:
78
+ desc: null
79
+ value: false
80
+ do_eval:
81
+ desc: null
82
+ value: true
83
+ do_predict:
84
+ desc: null
85
+ value: false
86
+ do_train:
87
+ desc: null
88
+ value: true
89
+ dtype:
90
+ desc: null
91
+ value: float32
92
+ eval_accumulation_steps:
93
+ desc: null
94
+ value: null
95
+ eval_steps:
96
+ desc: null
97
+ value: 2500
98
+ evaluation_strategy:
99
+ desc: null
100
+ value: IntervalStrategy.NO
101
+ fp16:
102
+ desc: null
103
+ value: false
104
+ fp16_backend:
105
+ desc: null
106
+ value: auto
107
+ fp16_full_eval:
108
+ desc: null
109
+ value: false
110
+ fp16_opt_level:
111
+ desc: null
112
+ value: O1
113
+ gradient_accumulation_steps:
114
+ desc: null
115
+ value: 1
116
+ greater_is_better:
117
+ desc: null
118
+ value: null
119
+ group_by_length:
120
+ desc: null
121
+ value: false
122
+ ignore_data_skip:
123
+ desc: null
124
+ value: false
125
+ label_names:
126
+ desc: null
127
+ value: null
128
+ label_smoothing_factor:
129
+ desc: null
130
+ value: 0.0
131
+ learning_rate:
132
+ desc: null
133
+ value: 3.0e-05
134
+ length_column_name:
135
+ desc: null
136
+ value: length
137
+ load_best_model_at_end:
138
+ desc: null
139
+ value: false
140
+ local_rank:
141
+ desc: null
142
+ value: -1
143
+ log_level:
144
+ desc: null
145
+ value: -1
146
+ log_level_replica:
147
+ desc: null
148
+ value: -1
149
+ log_on_each_node:
150
+ desc: null
151
+ value: true
152
+ logging_dir:
153
+ desc: null
154
+ value: ../gpt-2-tamil/runs/Jul12_16-45-48_t1v-n-ebe36c53-w-0
155
+ logging_first_step:
156
+ desc: null
157
+ value: false
158
+ logging_steps:
159
+ desc: null
160
+ value: 500
161
+ logging_strategy:
162
+ desc: null
163
+ value: IntervalStrategy.STEPS
164
+ lr_scheduler_type:
165
+ desc: null
166
+ value: SchedulerType.LINEAR
167
+ max_eval_samples:
168
+ desc: null
169
+ value: null
170
+ max_grad_norm:
171
+ desc: null
172
+ value: 1.0
173
+ max_steps:
174
+ desc: null
175
+ value: -1
176
+ max_train_samples:
177
+ desc: null
178
+ value: null
179
+ metric_for_best_model:
180
+ desc: null
181
+ value: null
182
+ model_name_or_path:
183
+ desc: null
184
+ value: null
185
+ model_type:
186
+ desc: null
187
+ value: gpt2
188
+ mp_parameters:
189
+ desc: null
190
+ value: ''
191
+ no_cuda:
192
+ desc: null
193
+ value: false
194
+ num_train_epochs:
195
+ desc: null
196
+ value: 25.0
197
+ output_dir:
198
+ desc: null
199
+ value: ../gpt-2-tamil/
200
+ overwrite_cache:
201
+ desc: null
202
+ value: false
203
+ overwrite_output_dir:
204
+ desc: null
205
+ value: true
206
+ past_index:
207
+ desc: null
208
+ value: -1
209
+ per_device_eval_batch_size:
210
+ desc: null
211
+ value: 64
212
+ per_device_train_batch_size:
213
+ desc: null
214
+ value: 64
215
+ per_gpu_eval_batch_size:
216
+ desc: null
217
+ value: null
218
+ per_gpu_train_batch_size:
219
+ desc: null
220
+ value: null
221
+ prediction_loss_only:
222
+ desc: null
223
+ value: false
224
+ preprocessing_num_workers:
225
+ desc: null
226
+ value: 90
227
+ push_to_hub:
228
+ desc: null
229
+ value: false
230
+ push_to_hub_model_id:
231
+ desc: null
232
+ value: gpt-2-tamil
233
+ push_to_hub_organization:
234
+ desc: null
235
+ value: null
236
+ push_to_hub_token:
237
+ desc: null
238
+ value: null
239
+ remove_unused_columns:
240
+ desc: null
241
+ value: true
242
+ report_to:
243
+ desc: null
244
+ value:
245
+ - wandb
246
+ resume_from_checkpoint:
247
+ desc: null
248
+ value: null
249
+ run_name:
250
+ desc: null
251
+ value: trial
252
+ save_on_each_node:
253
+ desc: null
254
+ value: false
255
+ save_steps:
256
+ desc: null
257
+ value: 2500
258
+ save_strategy:
259
+ desc: null
260
+ value: IntervalStrategy.STEPS
261
+ save_total_limit:
262
+ desc: null
263
+ value: null
264
+ seed:
265
+ desc: null
266
+ value: 42
267
+ sharded_ddp:
268
+ desc: null
269
+ value: []
270
+ skip_memory_metrics:
271
+ desc: null
272
+ value: true
273
+ tokenizer_name:
274
+ desc: null
275
+ value: ../gpt-2-tamil/
276
+ tpu_metrics_debug:
277
+ desc: null
278
+ value: false
279
+ tpu_num_cores:
280
+ desc: null
281
+ value: null
282
+ train_file:
283
+ desc: null
284
+ value: null
285
+ use_fast_tokenizer:
286
+ desc: null
287
+ value: true
288
+ use_legacy_prediction_loop:
289
+ desc: null
290
+ value: false
291
+ validation_file:
292
+ desc: null
293
+ value: null
294
+ validation_split_percentage:
295
+ desc: null
296
+ value: 5
297
+ warmup_ratio:
298
+ desc: null
299
+ value: 0.0
300
+ warmup_steps:
301
+ desc: null
302
+ value: 1000
303
+ weight_decay:
304
+ desc: null
305
+ value: 0.01
scripts/wandb/run-20210712_164633-1ddv4131/files/events.out.tfevents.1626108395.t1v-n-ebe36c53-w-0.486342.3.v2 ADDED
@@ -0,0 +1 @@
+ /home/tweety_abi/GPT2-Tamil/gpt-2-tamil/events.out.tfevents.1626108395.t1v-n-ebe36c53-w-0.486342.3.v2
scripts/wandb/run-20210712_164633-1ddv4131/files/output.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7d9761e7442f5b6b99224ee68ee38f3b7e486ead79f4e390bf7e258dc16de973
+ size 4407657
scripts/wandb/run-20210712_164633-1ddv4131/files/requirements.txt ADDED
@@ -0,0 +1,123 @@
1
+ absl-py==0.13.0
2
+ aiohttp==3.7.4.post0
3
+ appdirs==1.4.4
4
+ astunparse==1.6.3
5
+ async-timeout==3.0.1
6
+ attrs==21.2.0
7
+ backcall==0.2.0
8
+ black==21.6b0
9
+ cachetools==4.2.2
10
+ certifi==2021.5.30
11
+ cfgv==3.3.0
12
+ chardet==4.0.0
13
+ chex==0.0.7
14
+ click==8.0.1
15
+ configparser==5.0.2
16
+ cycler==0.10.0
17
+ datasets==1.8.1.dev0
18
+ decorator==5.0.9
19
+ dill==0.3.4
20
+ distlib==0.3.2
21
+ dm-tree==0.1.6
22
+ docker-pycreds==0.4.0
23
+ filelock==3.0.12
24
+ flake8==3.9.2
25
+ flatbuffers==1.12
26
+ flax==0.3.4
27
+ fsspec==2021.6.1
28
+ gast==0.4.0
29
+ gitdb==4.0.7
30
+ gitpython==3.1.18
31
+ google-auth-oauthlib==0.4.4
32
+ google-auth==1.32.1
33
+ google-pasta==0.2.0
34
+ grpcio==1.34.1
35
+ h5py==3.1.0
36
+ huggingface-hub==0.0.12
37
+ identify==2.2.10
38
+ idna==2.10
39
+ ipython-genutils==0.2.0
40
+ ipython==7.25.0
41
+ isort==5.9.1
42
+ jax==0.2.16
43
+ jaxlib==0.1.68
44
+ jedi==0.18.0
45
+ joblib==1.0.1
46
+ keras-nightly==2.5.0.dev2021032900
47
+ keras-preprocessing==1.1.2
48
+ kiwisolver==1.3.1
49
+ libtpu-nightly==0.1.dev20210615
50
+ markdown==3.3.4
51
+ matplotlib-inline==0.1.2
52
+ matplotlib==3.4.2
53
+ mccabe==0.6.1
54
+ msgpack==1.0.2
55
+ multidict==5.1.0
56
+ multiprocess==0.70.12.2
57
+ mypy-extensions==0.4.3
58
+ nodeenv==1.6.0
59
+ numpy==1.19.5
60
+ oauthlib==3.1.1
61
+ opt-einsum==3.3.0
62
+ optax==0.0.8
63
+ packaging==20.9
64
+ pandas==1.2.5
65
+ parso==0.8.2
66
+ pathspec==0.8.1
67
+ pathtools==0.1.2
68
+ pexpect==4.8.0
69
+ pickleshare==0.7.5
70
+ pillow==8.3.0
71
+ pip==20.0.2
72
+ pkg-resources==0.0.0
73
+ pre-commit==2.13.0
74
+ promise==2.3
75
+ prompt-toolkit==3.0.19
76
+ protobuf==3.17.3
77
+ psutil==5.8.0
78
+ ptyprocess==0.7.0
79
+ pyarrow==4.0.1
80
+ pyasn1-modules==0.2.8
81
+ pyasn1==0.4.8
82
+ pycodestyle==2.7.0
83
+ pyflakes==2.3.1
84
+ pygments==2.9.0
85
+ pyparsing==2.4.7
86
+ python-dateutil==2.8.1
87
+ pytz==2021.1
88
+ pyyaml==5.4.1
89
+ regex==2021.7.1
90
+ requests-oauthlib==1.3.0
91
+ requests==2.25.1
92
+ rsa==4.7.2
93
+ sacremoses==0.0.45
94
+ scipy==1.7.0
95
+ sentry-sdk==1.3.0
96
+ setuptools==44.0.0
97
+ shortuuid==1.0.1
98
+ six==1.15.0
99
+ smmap==4.0.0
100
+ subprocess32==3.5.4
101
+ tensorboard-data-server==0.6.1
102
+ tensorboard-plugin-wit==1.8.0
103
+ tensorboard==2.5.0
104
+ tensorflow-estimator==2.5.0
105
+ tensorflow==2.5.0
106
+ termcolor==1.1.0
107
+ tokenizers==0.10.3
108
+ toml==0.10.2
109
+ toolz==0.11.1
110
+ torch==1.9.0
111
+ tqdm==4.61.1
112
+ traitlets==5.0.5
113
+ transformers==4.9.0.dev0
114
+ typing-extensions==3.7.4.3
115
+ urllib3==1.26.6
116
+ virtualenv==20.4.7
117
+ wandb==0.10.33
118
+ wcwidth==0.2.5
119
+ werkzeug==2.0.1
120
+ wheel==0.36.2
121
+ wrapt==1.12.1
122
+ xxhash==2.0.2
123
+ yarl==1.6.3
scripts/wandb/run-20210712_164633-1ddv4131/files/wandb-metadata.json ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "os": "Linux-5.4.0-1043-gcp-x86_64-with-glibc2.29",
3
+ "python": "3.8.10",
4
+ "heartbeatAt": "2021-07-12T16:46:35.350252",
5
+ "startedAt": "2021-07-12T16:46:33.416306",
6
+ "docker": null,
7
+ "cpu_count": 96,
8
+ "cuda": null,
9
+ "args": [
10
+ "--output_dir=../gpt-2-tamil/",
11
+ "--model_type=gpt2",
12
+ "--config_name=../gpt-2-tamil/",
13
+ "--tokenizer_name=../gpt-2-tamil/",
14
+ "--dataset_name=oscar",
15
+ "--dataset_config_name=unshuffled_deduplicated_ta",
16
+ "--do_train",
17
+ "--do_eval",
18
+ "--block_size=512",
19
+ "--per_device_train_batch_size=64",
20
+ "--per_device_eval_batch_size=64",
21
+ "--learning_rate=3e-5",
22
+ "--warmup_steps=1000",
23
+ "--adam_beta1=0.9",
24
+ "--adam_beta2=0.98",
25
+ "--weight_decay=0.01",
26
+ "--overwrite_output_dir",
27
+ "--num_train_epochs=25",
28
+ "--report_to",
29
+ "wandb",
30
+ "--run_name",
31
+ "trial",
32
+ "--logging_steps=500",
33
+ "--save_steps=2500",
34
+ "--eval_steps=2500",
35
+ "--preprocessing_num_workers=90"
36
+ ],
37
+ "state": "running",
38
+ "program": "../src/run_clm_flax.py",
39
+ "codePath": "src/run_clm_flax.py",
40
+ "git": {
41
+ "remote": "https://github.com/AbinayaM02/GPT2-Tamil.git",
42
+ "commit": "5d59c6a635e952a0f51ef33ed713960a04e9dcb6"
43
+ },
44
+ "email": "abinaya.m02@mphasis.com",
45
+ "root": "/home/tweety_abi/GPT2-Tamil",
46
+ "host": "t1v-n-ebe36c53-w-0",
47
+ "username": "tweety_abi",
48
+ "executable": "/home/tweety_abi/gpt2_env/bin/python"
49
+ }
scripts/wandb/run-20210712_164633-1ddv4131/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"global_step": 132500, "_timestamp": 1626248099.379086, "train_time": 743654.875, "train_learning_rate": 1.1402963906448349e-08, "_step": 264206, "train_loss": 1.1299134492874146, "eval_loss": 1.1545542478561401, "eval_perplexity": 3.1726088523864746}
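
This wandb-summary.json records the final state of the 25-epoch run: 132,500 optimizer steps, a training loss of about 1.13 and an eval loss of about 1.15. The logged eval_perplexity is simply the exponential of the eval loss, which a quick check reproduces:

# Sanity check: causal-LM perplexity = exp(cross-entropy loss).
import math

eval_loss = 1.1545542478561401
print(math.exp(eval_loss))  # ~3.1726, matching eval_perplexity above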
scripts/wandb/run-20210712_164633-1ddv4131/logs/debug-internal.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:748fffc8fe7bbd39d404a1bae61d5711a3e098491142dc28b16d5d75e32dc937
+ size 97283434
scripts/wandb/run-20210712_164633-1ddv4131/logs/debug.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:36e11410ff19af1db092231a1450397dffb80ef21248540b6d372dcf5606559c
+ size 8797
scripts/wandb/run-20210712_164633-1ddv4131/run-1ddv4131.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d8211487b4d0a0489ae4728120abad1be7ee4190520afc47fdae166087ae6068
+ size 60817322
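
The remaining changes touch the Python sources under src/. In src/create_config.py (diff below) only the model_dir path changes. For context, a hedged sketch of the flow that script presumably implements: build a GPT-2 config with all dropout disabled and save it into the model directory so the training script can pick it up via --config_name. Only the from_pretrained call and the model_dir assignment are confirmed by the diff; the save_pretrained step is an assumption.

# Sketch under assumptions; only the GPT2Config.from_pretrained call and the
# model_dir assignment appear in the diff below, save_pretrained is assumed.
from transformers import GPT2Config

model_dir = "../gpt-2-tamil"  # ${MODEL_DIR}

config = GPT2Config.from_pretrained(
    "gpt2", resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0
)
config.save_pretrained(model_dir)  # writes a config.json like the one added in this commit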
src/create_config.py CHANGED
@@ -1,6 +1,6 @@
  from transformers import GPT2Config
 
- model_dir = "./gpt2-tamil" # ${MODEL_DIR}
+ model_dir = "../gpt-2-tamil" # ${MODEL_DIR}
 
  config = GPT2Config.from_pretrained(
  "gpt2", resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0
src/run_clm_flax.py CHANGED
@@ -31,16 +31,18 @@ from pathlib import Path
31
  from typing import Callable, Optional
32
 
33
  import datasets
 
 
 
34
  import jax
35
  import jax.numpy as jnp
36
  import optax
37
  import transformers
38
- from datasets import Dataset, load_dataset
39
  from flax import jax_utils, traverse_util
40
  from flax.jax_utils import unreplicate
41
  from flax.training import train_state
42
  from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
43
- from tqdm import tqdm
44
  from transformers import (
45
  CONFIG_MAPPING,
46
  FLAX_MODEL_FOR_CAUSAL_LM_MAPPING,
@@ -53,25 +55,8 @@ from transformers import (
53
  )
54
  from transformers.testing_utils import CaptureLogger
55
 
56
- logger = logging.getLogger(__name__)
57
-
58
- # Cache the result
59
- has_tensorboard = is_tensorboard_available()
60
- if has_tensorboard:
61
- try:
62
- from flax.metrics.tensorboard import SummaryWriter
63
- except ImportError as ie:
64
- has_tensorboard = False
65
- print(
66
- f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
67
- )
68
-
69
- else:
70
- print(
71
- "Unable to display metrics through TensorBoard because the package is not installed: "
72
- "Please run pip install tensorboard to enable."
73
- )
74
 
 
75
 
76
  MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_CAUSAL_LM_MAPPING.keys())
77
  MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
@@ -92,34 +77,20 @@ class ModelArguments:
92
  )
93
  model_type: Optional[str] = field(
94
  default=None,
95
- metadata={
96
- "help": "If training from scratch, pass a model type from the list: "
97
- + ", ".join(MODEL_TYPES)
98
- },
99
  )
100
  config_name: Optional[str] = field(
101
- default=None,
102
- metadata={
103
- "help": "Pretrained config name or path if not the same as model_name"
104
- },
105
  )
106
  tokenizer_name: Optional[str] = field(
107
- default=None,
108
- metadata={
109
- "help": "Pretrained tokenizer name or path if not the same as model_name"
110
- },
111
  )
112
  cache_dir: Optional[str] = field(
113
- default=None,
114
- metadata={
115
- "help": "Where do you want to store the pretrained models downloaded from s3"
116
- },
117
  )
118
  use_fast_tokenizer: bool = field(
119
  default=True,
120
- metadata={
121
- "help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."
122
- },
123
  )
124
  dtype: Optional[str] = field(
125
  default="float32",
@@ -136,26 +107,15 @@ class DataTrainingArguments:
136
  """
137
 
138
  dataset_name: Optional[str] = field(
139
- default=None,
140
- metadata={
141
- "help": "The name of the dataset to use (via the datasets library)."
142
- },
143
  )
144
  dataset_config_name: Optional[str] = field(
145
- default=None,
146
- metadata={
147
- "help": "The configuration name of the dataset to use (via the datasets library)."
148
- },
149
- )
150
- train_file: Optional[str] = field(
151
- default=None,
152
- metadata={"help": "The input training data file (a text file)."},
153
  )
 
154
  validation_file: Optional[str] = field(
155
  default=None,
156
- metadata={
157
- "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."
158
- },
159
  )
160
  max_train_samples: Optional[int] = field(
161
  default=None,
@@ -172,8 +132,7 @@ class DataTrainingArguments:
172
  },
173
  )
174
  overwrite_cache: bool = field(
175
- default=False,
176
- metadata={"help": "Overwrite the cached training and evaluation sets"},
177
  )
178
  validation_split_percentage: Optional[int] = field(
179
  default=5,
@@ -190,8 +149,7 @@ class DataTrainingArguments:
190
  },
191
  )
192
  overwrite_cache: bool = field(
193
- default=False,
194
- metadata={"help": "Overwrite the cached training and evaluation sets"},
195
  )
196
  preprocessing_num_workers: Optional[int] = field(
197
  default=None,
@@ -199,43 +157,25 @@ class DataTrainingArguments:
199
  )
200
 
201
  def __post_init__(self):
202
- if (
203
- self.dataset_name is None
204
- and self.train_file is None
205
- and self.validation_file is None
206
- ):
207
- raise ValueError(
208
- "Need either a dataset name or a training/validation file."
209
- )
210
  else:
211
  if self.train_file is not None:
212
  extension = self.train_file.split(".")[-1]
213
- assert extension in [
214
- "csv",
215
- "json",
216
- "txt",
217
- ], "`train_file` should be a csv, a json or a txt file."
218
  if self.validation_file is not None:
219
  extension = self.validation_file.split(".")[-1]
220
- assert extension in [
221
- "csv",
222
- "json",
223
- "txt",
224
- ], "`validation_file` should be a csv, a json or a txt file."
225
 
226
 
227
  class TrainState(train_state.TrainState):
228
  dropout_rng: jnp.ndarray
229
 
230
  def replicate(self):
231
- return jax_utils.replicate(self).replace(
232
- dropout_rng=shard_prng_key(self.dropout_rng)
233
- )
234
 
235
 
236
- def data_loader(
237
- rng: jax.random.PRNGKey, dataset: Dataset, batch_size: int, shuffle: bool = False
238
- ):
239
  """
240
  Returns batches of size `batch_size` from truncated `dataset`, sharded over all local devices.
241
  Shuffle batches if `shuffle` is `True`.
@@ -259,7 +199,7 @@ def data_loader(
259
  yield batch
260
 
261
 
262
- def write_metric(summary_writer, train_metrics, eval_metrics, train_time, step):
263
  summary_writer.scalar("train_time", train_time, step)
264
 
265
  train_metrics = get_metrics(train_metrics)
@@ -268,31 +208,23 @@ def write_metric(summary_writer, train_metrics, eval_metrics, train_time, step):
268
  for i, val in enumerate(vals):
269
  summary_writer.scalar(tag, val, step - len(vals) + i + 1)
270
 
 
 
271
  for metric_name, value in eval_metrics.items():
272
  summary_writer.scalar(f"eval_{metric_name}", value, step)
273
 
274
 
275
  def create_learning_rate_fn(
276
- train_ds_size: int,
277
- train_batch_size: int,
278
- num_train_epochs: int,
279
- num_warmup_steps: int,
280
- learning_rate: float,
281
  ) -> Callable[[int], jnp.array]:
282
  """Returns a linear warmup, linear_decay learning rate function."""
283
  steps_per_epoch = train_ds_size // train_batch_size
284
  num_train_steps = steps_per_epoch * num_train_epochs
285
- warmup_fn = optax.linear_schedule(
286
- init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps
287
- )
288
  decay_fn = optax.linear_schedule(
289
- init_value=learning_rate,
290
- end_value=0,
291
- transition_steps=num_train_steps - num_warmup_steps,
292
- )
293
- schedule_fn = optax.join_schedules(
294
- schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps]
295
  )
 
296
  return schedule_fn
297
 
298
 
@@ -301,15 +233,11 @@ def main():
301
  # or by passing the --help flag to this script.
302
  # We now keep distinct sets of args, for a cleaner separation of concerns.
303
 
304
- parser = HfArgumentParser(
305
- (ModelArguments, DataTrainingArguments, TrainingArguments)
306
- )
307
  if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
308
  # If we pass only one argument to the script and it's the path to a json file,
309
  # let's parse it to get our arguments.
310
- model_args, data_args, training_args = parser.parse_json_file(
311
- json_file=os.path.abspath(sys.argv[1])
312
- )
313
  else:
314
  model_args, data_args, training_args = parser.parse_args_into_dataclasses()
315
 
@@ -351,14 +279,10 @@ def main():
351
  #
352
  # In distributed training, the load_dataset function guarantees that only one local process can concurrently
353
  # download the dataset.
354
- logger.info("Loading dataset....")
355
  if data_args.dataset_name is not None:
356
  # Downloading and loading a dataset from the hub.
357
  dataset = load_dataset(
358
- data_args.dataset_name,
359
- data_args.dataset_config_name,
360
- cache_dir=model_args.cache_dir,
361
- keep_in_memory=False,
362
  )
363
 
364
  if "validation" not in dataset.keys():
@@ -383,10 +307,7 @@ def main():
383
  extension = data_args.train_file.split(".")[-1]
384
  if extension == "txt":
385
  extension = "text"
386
- logger.info(f"Loading dataset....{data_args.train_file}")
387
- dataset = load_dataset(
388
- extension, data_files=data_files, cache_dir=model_args.cache_dir
389
- )
390
  # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
391
  # https://huggingface.co/docs/datasets/loading_datasets.html.
392
 
@@ -396,28 +317,20 @@ def main():
396
  # The .from_pretrained methods guarantee that only one local process can concurrently
397
  # download model & vocab.
398
  if model_args.config_name:
399
- config = AutoConfig.from_pretrained(
400
- model_args.config_name, cache_dir=model_args.cache_dir
401
- )
402
  elif model_args.model_name_or_path:
403
- config = AutoConfig.from_pretrained(
404
- model_args.model_name_or_path, cache_dir=model_args.cache_dir
405
- )
406
  else:
407
  config = CONFIG_MAPPING[model_args.model_type]()
408
  logger.warning("You are instantiating a new config instance from scratch.")
409
 
410
  if model_args.tokenizer_name:
411
  tokenizer = AutoTokenizer.from_pretrained(
412
- model_args.tokenizer_name,
413
- cache_dir=model_args.cache_dir,
414
- use_fast=model_args.use_fast_tokenizer,
415
  )
416
  elif model_args.model_name_or_path:
417
  tokenizer = AutoTokenizer.from_pretrained(
418
- model_args.model_name_or_path,
419
- cache_dir=model_args.cache_dir,
420
- use_fast=model_args.use_fast_tokenizer,
421
  )
422
  else:
423
  raise ValueError(
@@ -427,10 +340,7 @@ def main():
427
 
428
  if model_args.model_name_or_path:
429
  model = FlaxAutoModelForCausalLM.from_pretrained(
430
- model_args.model_name_or_path,
431
- config=config,
432
- seed=training_args.seed,
433
- dtype=getattr(jnp, model_args.dtype),
434
  )
435
  else:
436
  model = FlaxAutoModelForCausalLM.from_config(
@@ -446,9 +356,7 @@ def main():
446
  text_column_name = "text" if "text" in column_names else column_names[0]
447
 
448
  # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
449
- tok_logger = transformers.utils.logging.get_logger(
450
- "transformers.tokenization_utils_base"
451
- )
452
 
453
  def tokenize_function(examples):
454
  with CaptureLogger(tok_logger) as cl:
@@ -491,7 +399,8 @@ def main():
491
  total_length = len(concatenated_examples[list(examples.keys())[0]])
492
  # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
493
  # customize this part to your needs.
494
- total_length = (total_length // block_size) * block_size
 
495
  # Split by chunks of max_len.
496
  result = {
497
  k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
@@ -529,8 +438,32 @@ def main():
529
  eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
530
 
531
  # Enable tensorboard only on the master node
 
532
  if has_tensorboard and jax.process_index() == 0:
533
- summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
534
 
535
  # Initialize our training
536
  rng = jax.random.PRNGKey(training_args.seed)
@@ -538,12 +471,8 @@ def main():
538
 
539
  # Store some constant
540
  num_epochs = int(training_args.num_train_epochs)
541
- train_batch_size = (
542
- int(training_args.per_device_train_batch_size) * jax.device_count()
543
- )
544
- eval_batch_size = (
545
- int(training_args.per_device_eval_batch_size) * jax.device_count()
546
- )
547
  steps_per_epoch = len(train_dataset) // train_batch_size
548
  total_train_steps = steps_per_epoch * num_epochs
549
 
@@ -566,39 +495,35 @@ def main():
566
  def decay_mask_fn(params):
567
  flat_params = traverse_util.flatten_dict(params)
568
  flat_mask = {
569
- path: (
570
- path[-1] != "bias"
571
- and path[-2:]
572
- not in [("ln_1", "scale"), ("ln_2", "scale"), ("ln_f", "scale")]
573
- )
574
  for path in flat_params
575
  }
576
  return traverse_util.unflatten_dict(flat_mask)
577
 
578
  # create adam optimizer
579
- adamw = optax.adamw(
580
- learning_rate=linear_decay_lr_schedule_fn,
581
- b1=training_args.adam_beta1,
582
- b2=training_args.adam_beta2,
583
- eps=training_args.adam_epsilon,
584
- weight_decay=training_args.weight_decay,
585
- mask=decay_mask_fn,
586
- )
587
 
588
  # Setup train state
589
- state = TrainState.create(
590
- apply_fn=model.__call__,
591
- params=model.params,
592
- tx=adamw,
593
- dropout_rng=dropout_rng,
594
- )
595
 
596
  def loss_fn(logits, labels):
597
  shift_logits = logits[..., :-1, :]
598
  shift_labels = labels[..., 1:]
599
- loss = optax.softmax_cross_entropy(
600
- shift_logits, onehot(shift_labels, shift_logits.shape[-1])
601
- )
602
  return loss.mean()
603
 
604
  # Define gradient update step fn
@@ -607,9 +532,7 @@ def main():
607
 
608
  def compute_loss(params):
609
  labels = batch.pop("labels")
610
- logits = state.apply_fn(
611
- **batch, params=params, dropout_rng=dropout_rng, train=True
612
- )[0]
613
  loss = loss_fn(logits, labels)
614
  return loss
615
 
@@ -619,10 +542,7 @@ def main():
619
 
620
  new_state = state.apply_gradients(grads=grad, dropout_rng=new_dropout_rng)
621
 
622
- metrics = {
623
- "loss": loss,
624
- "learning_rate": linear_decay_lr_schedule_fn(state.step),
625
- }
626
  metrics = jax.lax.pmean(metrics, axis_name="batch")
627
 
628
  return new_state, metrics
@@ -648,15 +568,12 @@ def main():
648
  logger.info("***** Running training *****")
649
  logger.info(f" Num examples = {len(train_dataset)}")
650
  logger.info(f" Num Epochs = {num_epochs}")
651
- logger.info(
652
- f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}"
653
- )
654
- logger.info(
655
- f" Total train batch size (w. parallel & distributed) = {train_batch_size}"
656
- )
657
  logger.info(f" Total optimization steps = {total_train_steps}")
658
 
659
  train_time = 0
 
660
  epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
661
  for epoch in epochs:
662
  # ======================== Training ================================
@@ -664,72 +581,70 @@ def main():
664
 
665
  # Create sampling rng
666
  rng, input_rng = jax.random.split(rng)
667
- train_metrics = []
668
 
669
  # Generate an epoch by shuffling sampling indices from the train dataset
670
- train_loader = data_loader(
671
- input_rng, train_dataset, train_batch_size, shuffle=True
672
- )
673
  steps_per_epoch = len(train_dataset) // train_batch_size
674
  # train
675
- for _ in tqdm(
676
- range(steps_per_epoch), desc="Training...", position=1, leave=False
677
- ):
678
  batch = next(train_loader)
679
  state, train_metric = p_train_step(state, batch)
680
  train_metrics.append(train_metric)
681
 
682
- train_time += time.time() - train_start
683
-
684
- train_metric = unreplicate(train_metric)
685
-
686
- epochs.write(
687
- f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
688
- )
689
-
690
- # ======================== Evaluating ==============================
691
- eval_metrics = []
692
- eval_loader = data_loader(input_rng, eval_dataset, eval_batch_size)
693
- eval_steps = len(eval_dataset) // eval_batch_size
694
- for _ in tqdm(
695
- range(eval_steps), desc="Evaluating...", position=2, leave=False
696
- ):
697
- # Model forward
698
- batch = next(eval_loader)
699
- metrics = p_eval_step(state.params, batch)
700
- eval_metrics.append(metrics)
701
-
702
- # normalize eval metrics
703
- eval_metrics = get_metrics(eval_metrics)
704
-
705
- eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
706
-
707
- try:
708
- eval_metrics["perplexity"] = math.exp(eval_metrics["loss"])
709
- except OverflowError:
710
- eval_metrics["perplexity"] = float("inf")
711
-
712
- # Print metrics and update progress bar
713
- desc = f"Epoch... ({epoch + 1}/{num_epochs} | Eval Loss: {eval_metrics['loss']} | Eval Perplexity: {eval_metrics['perplexity']})"
714
- epochs.write(desc)
715
- epochs.desc = desc
716
-
717
- # Save metrics
718
- if has_tensorboard and jax.process_index() == 0:
719
- cur_step = epoch * (len(train_dataset) // train_batch_size)
720
- write_metric(
721
- summary_writer, train_metrics, eval_metrics, train_time, cur_step
722
- )
723
-
724
- # save checkpoint after each epoch and push checkpoint to the hub
725
- if jax.process_index() == 0:
726
- params = jax.device_get(unreplicate(state.params))
727
- model.save_pretrained(
728
- training_args.output_dir,
729
- params=params,
730
- push_to_hub=training_args.push_to_hub,
731
- commit_message=f"Saving weights and logs of epoch {epoch+1}",
732
- )
733
 
734
 
735
  if __name__ == "__main__":
31
  from typing import Callable, Optional
32
 
33
  import datasets
34
+ from datasets import Dataset, load_dataset
35
+ from tqdm import tqdm
36
+
37
  import jax
38
  import jax.numpy as jnp
39
  import optax
40
  import transformers
41
+ import wandb
42
  from flax import jax_utils, traverse_util
43
  from flax.jax_utils import unreplicate
44
  from flax.training import train_state
45
  from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
 
46
  from transformers import (
47
  CONFIG_MAPPING,
48
  FLAX_MODEL_FOR_CAUSAL_LM_MAPPING,
55
  )
56
  from transformers.testing_utils import CaptureLogger
57
 
 
58
 
59
+ logger = logging.getLogger(__name__)
60
 
61
  MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_CAUSAL_LM_MAPPING.keys())
62
  MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
77
  )
78
  model_type: Optional[str] = field(
79
  default=None,
80
+ metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
81
  )
82
  config_name: Optional[str] = field(
83
+ default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
84
  )
85
  tokenizer_name: Optional[str] = field(
86
+ default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
87
  )
88
  cache_dir: Optional[str] = field(
89
+ default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
90
  )
91
  use_fast_tokenizer: bool = field(
92
  default=True,
93
+ metadata={"help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."},
94
  )
95
  dtype: Optional[str] = field(
96
  default="float32",
107
  """
108
 
109
  dataset_name: Optional[str] = field(
110
+ default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
111
  )
112
  dataset_config_name: Optional[str] = field(
113
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
114
  )
115
+ train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
116
  validation_file: Optional[str] = field(
117
  default=None,
118
119
  )
120
  max_train_samples: Optional[int] = field(
121
  default=None,
132
  },
133
  )
134
  overwrite_cache: bool = field(
135
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
 
136
  )
137
  validation_split_percentage: Optional[int] = field(
138
  default=5,
149
  },
150
  )
151
  overwrite_cache: bool = field(
152
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
 
153
  )
154
  preprocessing_num_workers: Optional[int] = field(
155
  default=None,
157
  )
158
 
159
  def __post_init__(self):
160
+ if self.dataset_name is None and self.train_file is None and self.validation_file is None:
161
+ raise ValueError("Need either a dataset name or a training/validation file.")
162
  else:
163
  if self.train_file is not None:
164
  extension = self.train_file.split(".")[-1]
165
+ assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
166
  if self.validation_file is not None:
167
  extension = self.validation_file.split(".")[-1]
168
+ assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
169
 
170
 
171
  class TrainState(train_state.TrainState):
172
  dropout_rng: jnp.ndarray
173
 
174
  def replicate(self):
175
+ return jax_utils.replicate(self).replace(dropout_rng=shard_prng_key(self.dropout_rng))
176
 
177
 
178
+ def data_loader(rng: jax.random.PRNGKey, dataset: Dataset, batch_size: int, shuffle: bool = False):
179
  """
180
  Returns batches of size `batch_size` from truncated `dataset`, sharded over all local devices.
181
  Shuffle batches if `shuffle` is `True`.
199
  yield batch
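For reference, a minimal sketch of the batching-and-sharding idea behind data_loader, written against a toy in-memory dataset; the helper name toy_data_loader and all sizes below are made up for illustration, while shard and jax.random.permutation are the real utilities used here.

    # Toy illustration (assumed names/sizes); the real loader works on a datasets.Dataset.
    import numpy as np
    import jax
    from flax.training.common_utils import shard

    def toy_data_loader(rng, dataset, batch_size, shuffle=False):
        steps = len(dataset) // batch_size  # the last incomplete batch is dropped
        idx = jax.random.permutation(rng, len(dataset)) if shuffle else np.arange(len(dataset))
        for i in range(steps):
            batch_idx = idx[i * batch_size : (i + 1) * batch_size]
            batch = {k: np.stack([dataset[int(j)][k] for j in batch_idx]) for k in dataset[0]}
            # shard adds a leading device axis so pmap can consume the batch;
            # batch_size must be divisible by jax.local_device_count()
            yield shard(batch)

    dataset = [{"input_ids": np.arange(4)} for _ in range(8)]
    loader = toy_data_loader(jax.random.PRNGKey(0), dataset, batch_size=4, shuffle=True)
    print(next(loader)["input_ids"].shape)  # e.g. (1, 4, 4) on a single-device host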
200
 
201
 
202
+ def write_train_metric(summary_writer, train_metrics, train_time, step):
203
  summary_writer.scalar("train_time", train_time, step)
204
 
205
  train_metrics = get_metrics(train_metrics)
208
  for i, val in enumerate(vals):
209
  summary_writer.scalar(tag, val, step - len(vals) + i + 1)
210
 
211
+
212
+ def write_eval_metric(summary_writer, eval_metrics, step):
213
  for metric_name, value in eval_metrics.items():
214
  summary_writer.scalar(f"eval_{metric_name}", value, step)
215
 
216
 
217
  def create_learning_rate_fn(
218
+ train_ds_size: int, train_batch_size: int, num_train_epochs: int, num_warmup_steps: int, learning_rate: float
219
  ) -> Callable[[int], jnp.array]:
220
  """Returns a linear warmup, linear_decay learning rate function."""
221
  steps_per_epoch = train_ds_size // train_batch_size
222
  num_train_steps = steps_per_epoch * num_train_epochs
223
+ warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
224
  decay_fn = optax.linear_schedule(
225
+ init_value=learning_rate, end_value=0, transition_steps=num_train_steps - num_warmup_steps
226
  )
227
+ schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
228
  return schedule_fn
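A small, self-contained sketch of how the joined warmup/decay schedule behaves; the peak learning rate and step counts below are made-up example values.

    import optax

    # Assumed numbers: 1,000 warmup steps up to 3e-4, then linear decay over 9,000 steps.
    warmup_fn = optax.linear_schedule(init_value=0.0, end_value=3e-4, transition_steps=1_000)
    decay_fn = optax.linear_schedule(init_value=3e-4, end_value=0.0, transition_steps=9_000)
    schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[1_000])

    print(schedule_fn(0))       # 0.0 at the first step
    print(schedule_fn(1_000))   # ~3e-4 at the end of warmup
    print(schedule_fn(10_000))  # ~0.0 once fully decayed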
229
 
230
 
233
  # or by passing the --help flag to this script.
234
  # We now keep distinct sets of args, for a cleaner separation of concerns.
235
 
236
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
237
  if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
238
  # If we pass only one argument to the script and it's the path to a json file,
239
  # let's parse it to get our arguments.
240
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
241
  else:
242
  model_args, data_args, training_args = parser.parse_args_into_dataclasses()
243
 
279
  #
280
  # In distributed training, the load_dataset function guarantees that only one local process can concurrently
281
  # download the dataset.
 
282
  if data_args.dataset_name is not None:
283
  # Downloading and loading a dataset from the hub.
284
  dataset = load_dataset(
285
+ data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False
286
  )
287
 
288
  if "validation" not in dataset.keys():
307
  extension = data_args.train_file.split(".")[-1]
308
  if extension == "txt":
309
  extension = "text"
310
+ dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
311
  # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
312
  # https://huggingface.co/docs/datasets/loading_datasets.html.
313
 
317
  # The .from_pretrained methods guarantee that only one local process can concurrently
318
  # download model & vocab.
319
  if model_args.config_name:
320
+ config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
321
  elif model_args.model_name_or_path:
322
+ config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
323
  else:
324
  config = CONFIG_MAPPING[model_args.model_type]()
325
  logger.warning("You are instantiating a new config instance from scratch.")
326
 
327
  if model_args.tokenizer_name:
328
  tokenizer = AutoTokenizer.from_pretrained(
329
+ model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
330
  )
331
  elif model_args.model_name_or_path:
332
  tokenizer = AutoTokenizer.from_pretrained(
333
+ model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
334
  )
335
  else:
336
  raise ValueError(
340
 
341
  if model_args.model_name_or_path:
342
  model = FlaxAutoModelForCausalLM.from_pretrained(
343
+ model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
344
  )
345
  else:
346
  model = FlaxAutoModelForCausalLM.from_config(
356
  text_column_name = "text" if "text" in column_names else column_names[0]
357
 
358
  # since this will be pickled to avoid _LazyModule error in Hasher, force logger loading before tokenize_function
359
+ tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
360
 
361
  def tokenize_function(examples):
362
  with CaptureLogger(tok_logger) as cl:
399
  total_length = len(concatenated_examples[list(examples.keys())[0]])
400
  # We drop the small remainder; we could add padding if the model supported it instead of this drop. You can
401
  # customize this part to your needs.
402
+ if total_length >= block_size:
403
+ total_length = (total_length // block_size) * block_size
404
  # Split by chunks of max_len.
405
  result = {
406
  k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
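To make the concatenate-then-chunk step concrete, here is a tiny sketch with made-up token ids and a made-up block_size:

    # Assumed toy inputs; real examples hold tokenizer output for the text column.
    block_size = 4
    examples = {"input_ids": [[1, 2, 3], [4, 5, 6, 7], [8, 9]]}

    concatenated = {k: sum(examples[k], []) for k in examples}       # [1, 2, ..., 9]
    total_length = len(concatenated["input_ids"])                    # 9
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size     # 8, remainder dropped
    chunks = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    print(chunks["input_ids"])  # [[1, 2, 3, 4], [5, 6, 7, 8]]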
438
  eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
439
 
440
  # Enable tensorboard only on the master node
441
+ has_tensorboard = is_tensorboard_available()
442
  if has_tensorboard and jax.process_index() == 0:
443
+ wandb.init(
444
+ entity='abinayam',
445
+ project='hf-flax-gpt-2-tamil',
446
+ sync_tensorboard=True
447
+ )
448
+
449
+ wandb.config.update(training_args) # optional, log your configs
450
+ wandb.config.update(model_args) # optional, log your configs
451
+ wandb.config.update(data_args) # optional, log your configs
452
+
453
+ try:
454
+ from flax.metrics.tensorboard import SummaryWriter
455
+
456
+ summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
457
+ except ImportError as ie:
458
+ has_tensorboard = False
459
+ logger.warning(
460
+ f"Unable to display metrics through TensorBoard because some packages are not installed: {ie}"
461
+ )
462
+ else:
463
+ logger.warning(
464
+ "Unable to display metrics through TensorBoard because the package is not installed: "
465
+ "Please run pip install tensorboard to enable."
466
+ )
467
 
468
  # Initialize our training
469
  rng = jax.random.PRNGKey(training_args.seed)
471
 
472
  # Store some constant
473
  num_epochs = int(training_args.num_train_epochs)
474
+ train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
475
+ eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
476
  steps_per_epoch = len(train_dataset) // train_batch_size
477
  total_train_steps = steps_per_epoch * num_epochs
478
 
495
  def decay_mask_fn(params):
496
  flat_params = traverse_util.flatten_dict(params)
497
  flat_mask = {
498
+ path: (path[-1] != "bias" and path[-2:] not in [("ln_1", "scale"), ("ln_2", "scale"), ("ln_f", "scale")])
499
  for path in flat_params
500
  }
501
  return traverse_util.unflatten_dict(flat_mask)
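A quick sketch, on a made-up parameter tree, of what decay_mask_fn produces: kernels get weight decay (True), while biases and the GPT-2 layer-norm scales are excluded (False).

    from flax import traverse_util

    # Assumed toy parameter tree shaped like a GPT-2 block; values are placeholders.
    params = {
        "h_0": {"attn": {"kernel": 0.0, "bias": 0.0}, "ln_1": {"scale": 0.0, "bias": 0.0}},
        "ln_f": {"scale": 0.0, "bias": 0.0},
    }
    flat_params = traverse_util.flatten_dict(params)
    flat_mask = {
        path: (path[-1] != "bias" and path[-2:] not in [("ln_1", "scale"), ("ln_2", "scale"), ("ln_f", "scale")])
        for path in flat_params
    }
    print(traverse_util.unflatten_dict(flat_mask))
    # {'h_0': {'attn': {'kernel': True, 'bias': False}, 'ln_1': {'scale': False, 'bias': False}},
    #  'ln_f': {'scale': False, 'bias': False}}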
502
 
503
  # create adam optimizer
504
+ if training_args.adafactor:
505
+ # We use the default parameters here to initialize adafactor,
506
+ # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
507
+ optimizer = optax.adafactor(
508
+ learning_rate=linear_decay_lr_schedule_fn,
509
+ )
510
+ else:
511
+ optimizer = optax.adamw(
512
+ learning_rate=linear_decay_lr_schedule_fn,
513
+ b1=training_args.adam_beta1,
514
+ b2=training_args.adam_beta2,
515
+ eps=training_args.adam_epsilon,
516
+ weight_decay=training_args.weight_decay,
517
+ mask=decay_mask_fn,
518
+ )
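Either branch yields an optax GradientTransformation, so it drops into the same TrainState; a minimal sketch of the update cycle with made-up parameters and gradients:

    import jax.numpy as jnp
    import optax

    # Assumed toy params/grads; the mask mirrors decay_mask_fn (no decay on biases).
    params = {"kernel": jnp.ones((2, 2)), "bias": jnp.zeros((2,))}
    grads = {"kernel": jnp.full((2, 2), 0.1), "bias": jnp.full((2,), 0.1)}

    optimizer = optax.adamw(learning_rate=3e-4, weight_decay=0.01,
                            mask={"kernel": True, "bias": False})
    opt_state = optimizer.init(params)
    updates, opt_state = optimizer.update(grads, opt_state, params)  # params needed for weight decay
    params = optax.apply_updates(params, updates)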
519
 
520
  # Setup train state
521
+ state = TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer, dropout_rng=dropout_rng)
522
 
523
  def loss_fn(logits, labels):
524
  shift_logits = logits[..., :-1, :]
525
  shift_labels = labels[..., 1:]
526
+ loss = optax.softmax_cross_entropy(shift_logits, onehot(shift_labels, shift_logits.shape[-1]))
527
  return loss.mean()
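A short sketch of the causal-LM shift in loss_fn: the logit at position i is scored against the token at position i + 1, so the last logit and the first label are dropped. The dummy logits and labels below are made up.

    import jax.numpy as jnp
    import optax
    from flax.training.common_utils import onehot

    logits = jnp.zeros((1, 5, 8))          # (batch, seq_len, vocab), dummy uniform logits
    labels = jnp.array([[3, 1, 4, 1, 5]])  # (batch, seq_len), dummy token ids

    shift_logits = logits[..., :-1, :]     # predictions for positions 0..3
    shift_labels = labels[..., 1:]         # targets are the next tokens, positions 1..4
    loss = optax.softmax_cross_entropy(shift_logits, onehot(shift_labels, shift_logits.shape[-1]))
    print(loss.mean())  # log(8) ~ 2.079 for uniform dummy logits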
528
 
529
  # Define gradient update step fn
532
 
533
  def compute_loss(params):
534
  labels = batch.pop("labels")
535
+ logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
536
  loss = loss_fn(logits, labels)
537
  return loss
538
 
542
 
543
  new_state = state.apply_gradients(grads=grad, dropout_rng=new_dropout_rng)
544
 
545
+ metrics = {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}
546
  metrics = jax.lax.pmean(metrics, axis_name="batch")
547
 
548
  return new_state, metrics
568
  logger.info("***** Running training *****")
569
  logger.info(f" Num examples = {len(train_dataset)}")
570
  logger.info(f" Num Epochs = {num_epochs}")
571
+ logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
572
+ logger.info(f" Total train batch size (w. parallel & distributed) = {train_batch_size}")
573
  logger.info(f" Total optimization steps = {total_train_steps}")
574
 
575
  train_time = 0
576
+ train_metrics = []
577
  epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
578
  for epoch in epochs:
579
  # ======================== Training ================================
581
 
582
  # Create sampling rng
583
  rng, input_rng = jax.random.split(rng)
 
584
 
585
  # Generate an epoch by shuffling sampling indices from the train dataset
586
+ train_loader = data_loader(input_rng, train_dataset, train_batch_size, shuffle=True)
587
  steps_per_epoch = len(train_dataset) // train_batch_size
588
  # train
589
+ for step in tqdm(range(steps_per_epoch), desc="Training...", position=1, leave=False):
590
  batch = next(train_loader)
591
  state, train_metric = p_train_step(state, batch)
592
  train_metrics.append(train_metric)
593
 
594
+ cur_step = epoch * (len(train_dataset) // train_batch_size) + step
595
+
596
+ if cur_step % training_args.logging_steps == 0 and cur_step > 0:
597
+ # Save metrics
598
+ train_metric = unreplicate(train_metric)
599
+ train_time += time.time() - train_start
600
+ if has_tensorboard and jax.process_index() == 0:
601
+ write_train_metric(summary_writer, train_metrics, train_time, cur_step)
602
+
603
+ epochs.write(
604
+ f"Step... ({cur_step} | Loss: {train_metric['loss'].mean()}, Learning Rate: {train_metric['learning_rate'].mean()})"
605
+ )
606
+
607
+ train_metrics = []
608
+
609
+ if cur_step % training_args.eval_steps == 0 and cur_step > 0:
610
+ # ======================== Evaluating ==============================
611
+ eval_metrics = []
612
+ eval_loader = data_loader(input_rng, eval_dataset, eval_batch_size)
613
+ eval_steps = len(eval_dataset) // eval_batch_size
614
+ for _ in tqdm(range(eval_steps), desc="Evaluating...", position=2, leave=False):
615
+ # Model forward
616
+ batch = next(eval_loader)
617
+ metrics = p_eval_step(state.params, batch)
618
+ eval_metrics.append(metrics)
619
+
620
+ # normalize eval metrics
621
+ eval_metrics = get_metrics(eval_metrics)
622
+ eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
623
+
624
+ try:
625
+ eval_metrics["perplexity"] = math.exp(eval_metrics["loss"])
626
+ except OverflowError:
627
+ eval_metrics["perplexity"] = float("inf")
628
+
629
+ # Print metrics and update progress bar
630
+ desc = f"Step... ({cur_step} | Eval Loss: {eval_metrics['loss']} | Eval Perplexity: {eval_metrics['perplexity']})"
631
+ epochs.write(desc)
632
+ epochs.desc = desc
633
+
634
+ # Save metrics
635
+ if has_tensorboard and jax.process_index() == 0:
636
+ write_eval_metric(summary_writer, eval_metrics, cur_step)
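Perplexity here is just the exponential of the mean eval cross-entropy, guarded against overflow; a tiny sketch with a made-up loss value:

    import math

    eval_loss = 3.2  # assumed example value
    try:
        perplexity = math.exp(eval_loss)
    except OverflowError:
        perplexity = float("inf")
    print(f"Eval Loss: {eval_loss} | Eval Perplexity: {perplexity:.2f}")  # ~24.53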
637
+
638
+ if cur_step % training_args.save_steps == 0 and cur_step > 0:
639
+ # save checkpoint every save_steps steps and push it to the hub
640
+ if jax.process_index() == 0:
641
+ params = jax.device_get(unreplicate(state.params))
642
+ model.save_pretrained(
643
+ training_args.output_dir,
644
+ params=params,
645
+ push_to_hub=training_args.push_to_hub,
646
+ commit_message=f"Saving weights and logs of step {cur_step}",
647
+ )
648
 
649
 
650
  if __name__ == "__main__":
src/train_tokenizer.py CHANGED
@@ -1,7 +1,7 @@
1
  from datasets import load_dataset
2
  from tokenizers import ByteLevelBPETokenizer # Tokenizer, normalizers, trainers
3
 
4
- model_dir = "./gpt2-tamil" # ${MODEL_DIR}
5
 
6
  # load dataset
7
  dataset = load_dataset("oscar", "unshuffled_deduplicated_ta", split="train")
1
  from datasets import load_dataset
2
  from tokenizers import ByteLevelBPETokenizer # Tokenizer, normalizers, trainers
3
 
4
+ model_dir = "../gpt-2-tamil" # ${MODEL_DIR}
5
 
6
  # load dataset
7
  dataset = load_dataset("oscar", "unshuffled_deduplicated_ta", split="train")