amank committed on
Commit 7839b8e
1 Parent(s): 139e10d

Made changes to the cleaning code, modified the number of warmup steps, and switched to getting eval samples from the validation split

Files changed (5)
  1. .gitignore +2 -0
  2. .vscode/launch.json +3 -2
  3. run_mlm_flax_stream.py +33 -11
  4. run_stream.sh +2 -1
  5. utils.py +30 -8
.gitignore CHANGED
@@ -1 +1,3 @@
 __pycache__
+events.out.tfevents*
+*xplane.pb
.vscode/launch.json CHANGED
@@ -17,8 +17,8 @@
         "--dataset_name","mc4",
         "--dataset_config_name","hi",
         "--max_seq_length","256",
-        "--per_device_train_batch_size","128",
-        "--per_device_eval_batch_size","128",
+        "--per_device_train_batch_size","16",
+        "--per_device_eval_batch_size","16",
         "--learning_rate","3e-4",
         "--warmup_steps","1000",
         "--overwrite_output_dir",
@@ -26,6 +26,7 @@
         "--adam_beta2","0.98",
         "--num_train_steps","10000",
         "--num_eval_samples","5000",
+        "--preprocessing_num_workers", "90",
         "--logging_steps","250",
         "--eval_steps","1000"
     ],
run_mlm_flax_stream.py CHANGED
@@ -31,7 +31,7 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 
-from utils import keep_devnagri
+from utils import keep_devnagri_hf_doc
 
 import datasets
 import numpy as np
@@ -60,6 +60,7 @@ from transformers import (
 )
 
 
+
 # if datasets.__version__ <= "1.8.0":
 #     raise ValueError("Make sure to upgrade `datasets` to a version >= 1.9.0 to use dataset streaming")
 
@@ -320,7 +321,6 @@ if __name__ == "__main__":
     # See all possible arguments in src/transformers/training_args.py
     # or by passing the --help flag to this script.
     # We now keep distinct sets of args, for a cleaner separation of concerns.
-
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
@@ -375,6 +375,13 @@ if __name__ == "__main__":
         streaming=True,
         split="train",
     )
+    validation_dataset = load_dataset(
+        data_args.dataset_name,
+        data_args.dataset_config_name,
+        cache_dir=model_args.cache_dir,
+        streaming=True,
+        split="validation",
+    )
 
     if model_args.config_name:
         config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
@@ -404,17 +411,26 @@ if __name__ == "__main__":
     def tokenize_function(examples):
         return tokenizer(examples[data_args.text_column_name], return_special_tokens_mask=True)
 
-    cleaned_dataset = dataset.map(
-        keep_devnagri,
-        batched=False,
+    shuffle_seed = training_args.seed
+    shuffled_dataset = dataset.shuffle(buffer_size=data_args.shuffle_buffer_size, seed=shuffle_seed)
+
+    cleaned_dataset = shuffled_dataset.map(
+        keep_devnagri_hf_doc,
+        batched=True
     )
     tokenized_datasets = cleaned_dataset.map(
         tokenize_function,
-        batched=True,
+        batched=True
+    )
+
+    cleaned_validation_dataset = dataset.map(
+        keep_devnagri_hf_doc,
+        batched=True
+    )
+    tokenized_validation_datasets = cleaned_validation_dataset.map(
+        tokenize_function,
+        batched=True
     )
-
-    shuffle_seed = training_args.seed
-    tokenized_datasets = tokenized_datasets.shuffle(buffer_size=data_args.shuffle_buffer_size, seed=shuffle_seed)
 
     has_tensorboard = is_tensorboard_available()
     if has_tensorboard and jax.process_index() == 0:
@@ -428,6 +444,10 @@ if __name__ == "__main__":
 
     summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
 
+    # code for manual tpu profiling
+    import jax.profiler
+    server = jax.profiler.start_server(9999)
+
     # Data collator
     # This one will take care of randomly masking the tokens.
     data_collator = FlaxDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
@@ -446,6 +466,7 @@ if __name__ == "__main__":
     )
     if jax.device_count() < 8:
         print('Number of device as per jax device count is {}. Press Enter to continue'.format(jax.device_count()))
+        input()
 
     # Store some constant
     num_epochs = int(training_args.num_train_epochs)
@@ -556,9 +577,10 @@ if __name__ == "__main__":
     eval_metrics = []
 
     training_iter = iter(tokenized_datasets)
+    validation_iter = iter(tokenized_validation_datasets)
 
     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
-    doc_count, eval_samples = advance_iter_and_group_samples(training_iter, data_args.num_eval_samples, max_seq_length)
+    _, eval_samples = advance_iter_and_group_samples(validation_iter, data_args.num_eval_samples, max_seq_length)
 
     steps = tqdm(range(num_train_steps), desc="Training...", position=0)
     docs_progress_bar = tqdm(range(dataset_doc_count * num_epochs), desc="Docs Processed...", position=0)
@@ -575,7 +597,7 @@
 
         training_iter = iter(tokenized_datasets)
 
-        _, eval_dataset = advance_iter_and_group_samples(training_iter, data_args.num_eval_samples, max_seq_length)
+        _, eval_samples = advance_iter_and_group_samples(validation_iter, data_args.num_eval_samples, max_seq_length)
         doc_count, samples = advance_iter_and_group_samples(training_iter, train_batch_size, max_seq_length)
 
 
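Note: the sketch below is not part of the commit. It is a minimal, self-contained illustration of the streaming data flow the changes above aim at (shuffle, clean, and tokenize the train split lazily, with eval samples now drawn from a separate streaming validation split). The checkpoint name, seed, and buffer size are placeholder values, and the script's advance_iter_and_group_samples helper is not reproduced here.

    # Illustrative sketch only; assumes utils.keep_devnagri_hf_doc is importable.
    from datasets import load_dataset
    from transformers import AutoTokenizer
    from utils import keep_devnagri_hf_doc

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")  # placeholder checkpoint

    def tokenize_function(examples):
        # mc4 exposes the raw documents under the "text" column
        return tokenizer(examples["text"], return_special_tokens_mask=True)

    train_stream = load_dataset("mc4", "hi", streaming=True, split="train")
    validation_stream = load_dataset("mc4", "hi", streaming=True, split="validation")

    # Shuffle only the training stream, then clean and tokenize both streams lazily.
    train_stream = train_stream.shuffle(buffer_size=10000, seed=42)
    train_stream = train_stream.map(keep_devnagri_hf_doc, batched=True).map(tokenize_function, batched=True)
    validation_stream = validation_stream.map(keep_devnagri_hf_doc, batched=True).map(tokenize_function, batched=True)

    training_iter = iter(train_stream)
    validation_iter = iter(validation_stream)  # eval samples are drawn from here, not from the train iterator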
run_stream.sh CHANGED
@@ -10,11 +10,12 @@ python3 -c "import jax; print(jax.devices())"
     --per_device_train_batch_size="128" \
     --per_device_eval_batch_size="128" \
     --learning_rate="3e-4" \
-    --warmup_steps="1000" \
+    --warmup_steps="10000" \
     --overwrite_output_dir \
     --adam_beta1="0.9" \
     --adam_beta2="0.98" \
     --num_train_steps="10000" \
     --num_eval_samples="5000" \
+    --preprocessing_num_workers="90" \
     --logging_steps="250" \
     --eval_steps="1000"
utils.py CHANGED
@@ -1,7 +1,7 @@
 import regex as re
 import string
 
-def keep_devnagri(document:str):
+def keep_devnagri(text:str):
     """
     Remove all non Devnagri characters from the text.
     Code adapted from https://huggingface.co/flax-community/roberta-base-mr/blob/64d2c745f264f09c3d5b678a718746b2613887db/mr_clean_text.py
@@ -9,7 +9,6 @@ def keep_devnagri(document:str):
     @param text: str Text to be cleaned
     @return: Union[str, bool]
     """
-    text = document['text']
     pattern = r'[\p{Devanagari}0-9।\s\.\!]+'
 
     # regex pattern for all puntuation symbols
@@ -24,11 +23,34 @@ def keep_devnagri(document:str):
     # identify if the clean text only consists of punctuation
     is_just_punctuation = len(re.sub(punctuation_regex, "", cleaned)) == 0
 
-    # to handle the tokenizer as empty string may cause issues
-    # also this only happens for 5 out of 10000 docs, should not
-    # affect the results
-    if is_just_punctuation:
-        document['text'] = " "
+    return cleaned, is_just_punctuation
+
+def keep_devnagri_hf_doc(document):
+    if isinstance(document['text'], str):
+        batched = False
+    elif isinstance(document['text'], list):
+        batched = True
     else:
-        document['text'] = cleaned
+        raise TypeError("Document must be a dictionary or list.")
+
+    def get_clean_text(text):
+        cleaned_text, is_just_punctuation = keep_devnagri(text)
+        # to handle the tokenizer as empty string may cause issues
+        # also this only happens for 5 out of 10000 docs, should not
+        # affect the results
+        cleaned_text = cleaned_text if not is_just_punctuation else " "
+        return cleaned_text
+
+    if batched:
+        text_ls = document['text']
+        cleaned_text_ls = []
+        for text in text_ls:
+            cleaned_text = get_clean_text(text)
+            cleaned_text_ls.append(cleaned_text)
+        document['text'] = cleaned_text_ls
+    else:
+        text = document['text']
+        cleaned_text = get_clean_text(text)
+        document['text'] = cleaned_text
+
     return document
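For context, here is a hypothetical usage sketch (not included in the commit) of the reworked cleaner, exercising the non-batched and batched paths that keep_devnagri_hf_doc dispatches between; the input strings are made up.

    # Hypothetical usage sketch for the new cleaning helpers.
    from utils import keep_devnagri_hf_doc

    # Non-batched call: 'text' is a single string.
    doc = {"text": "नमस्ते दुनिया! hello 123"}
    print(keep_devnagri_hf_doc(doc)["text"])   # Latin text removed; Devanagari, digits and . ! । kept

    # Batched call: 'text' is a list of strings, as datasets passes with batched=True.
    batch = {"text": ["यह एक परीक्षण है।", "!!!"]}
    print(keep_devnagri_hf_doc(batch)["text"])  # a punctuation-only document becomes " ", never the empty string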