m3hrdadfi committed
Commit c92ce97
1 Parent(s): ca87603

Add dataset creation script

src/__pycache__/data_utils.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/data_utils.cpython-38.pyc and b/src/__pycache__/data_utils.cpython-38.pyc differ
 
src/__pycache__/dictionary.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/dictionary.cpython-38.pyc and b/src/__pycache__/dictionary.cpython-38.pyc differ
 
src/__pycache__/normalizer.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/normalizer.cpython-38.pyc and b/src/__pycache__/normalizer.cpython-38.pyc differ
 
src/create_dataset.py ADDED
@@ -0,0 +1,136 @@
+import ast
+import logging
+import os
+import sys
+from dataclasses import dataclass, field
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+from typing import Dict, List, Optional, Tuple
+from datasets import load_dataset
+from transformers import (
+    HfArgumentParser,
+)
+from data_utils import (
+    filter_by_lang_regex,
+    filter_by_num_tokens,
+    filter_by_num_sents,
+    filter_by_adv,
+    normalizer
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DataArguments:
+    """
+    Arguments to which dataset we are going to set up.
+    """
+    output_dir: str = field(
+        default=".",
+        metadata={"help": "The output directory where the config will be written."},
+    )
+    dataset_name: str = field(
+        default=None,
+        metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+def main():
+    parser = HfArgumentParser([DataArguments])
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        data_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))[0]
+    else:
+        data_args = parser.parse_args_into_dataclasses()[0]
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    logger.setLevel(logging.INFO)
+    logger.info(f"Preparing the dataset")
+    if data_args.dataset_name is not None:
+        dataset = load_dataset(
+            data_args.dataset_name,
+            data_args.dataset_config_name,
+            cache_dir=data_args.cache_dir,
+            split="train"
+        )
+    else:
+        data_files = {"train": data_args.train_file}
+        extension = data_args.train_file.split(".")[-1]
+        if extension == "txt":
+            extension = "text"
+
+        dataset = load_dataset(
+            extension,
+            data_files=data_files,
+            delimiter="\t",
+            cache_dir=data_args.cache_dir,
+        )
+
+    logger.info(f"dataset: {dataset}")
+
+    def data_preparation(item_dict):
+        if "text" not in item_dict:
+            return None
+
+        text = item_dict["text"]
+
+        status = filter_by_lang_regex(text, ratio=0.75)
+        if not status:
+            return None
+
+        status = filter_by_num_tokens(text, gt=64)
+        if not status:
+            return None
+
+        status = filter_by_num_sents(text, gt=2)
+        if not status:
+            return None
+
+        status = filter_by_adv(text, ratio=50)
+        if not status:
+            return None
+
+        text = normalizer(text)
+        return {"text": text}
+
+    data_dict = []
+    for item in tqdm(dataset, position=0, total=len(dataset)):
+        item = data_preparation(item)
+
+        if item:
+            data_dict.append(item)
+
+    data_df = pd.DataFrame(data_dict)
+
+    logger.info(f"Preparation - [before] consists of {len(dataset)} records!")
+    logger.info(f"Preparation - [after] consists of {len(data_df)} records!")
+
+    train, test = train_test_split(data_df, test_size=0.01, random_state=101)
+
+    train = train.reset_index(drop=True)
+    test = test.reset_index(drop=True)
+
+    logger.info(f"Preparation of [train] set consists of {len(train)} records!")
+    logger.info(f"Preparation of [test] set consists of {len(test)} records!")
+
+    os.makedirs(data_args.output_dir, exist_ok=True)
+    train.to_csv(os.path.join(data_args.output_dir, "train.csv"), sep="\t", encoding="utf-8", index=False)
+    test.to_csv(os.path.join(data_args.output_dir, "test.csv"), sep="\t", encoding="utf-8", index=False)
+    logger.info(f"Data saved here {data_args.output_dir}")
+
+if __name__ == '__main__':
+    main()
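Note: create_dataset.py writes tab-separated train.csv and test.csv files into --output_dir. As a quick sanity check, those files can be loaded back with the datasets csv builder, mirroring how the script itself reads csv inputs (delimiter="\t"). A minimal sketch, assuming the output paths used in run_dataset.sh below; this check is not part of the commit:

from datasets import load_dataset

# Reload the splits written by create_dataset.py (tab-separated, single "text" column).
data_files = {
    "train": "/home/m3hrdadfi/data/train.csv",
    "validation": "/home/m3hrdadfi/data/test.csv",
}
raw_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
print(raw_dataset)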
src/data_utils.py CHANGED
@@ -32,14 +32,11 @@ def filter_by_adv(text, ratio=50):
     return length_add < ratio
 
 
-# def normalizer(text, do_lowercase=False):
-#     text = normalize(text)
+def normalizer(text, do_lowercase=False):
+    text = normalize(text)
 
-#     if do_lowercase:
-#         text = text.lower()
+    if do_lowercase:
+        text = text.lower()
 
-#     return text
-def normalizer(example):
-    example["text"] = normalize(example["text"])
-    return example
+    return text
 
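Note: with this change, normalizer in data_utils.py takes a plain string (the form create_dataset.py calls) rather than a datasets example dict. A minimal sketch of the two call patterns, assuming normalize behaves as before; the sample text is made up:

from data_utils import normalizer

# New signature: operate on the raw text directly, as create_dataset.py does.
clean = normalizer("some raw text ...", do_lowercase=False)

# If an example-dict style is still needed (e.g. for dataset.map), wrap it:
# dataset = dataset.map(lambda example: {"text": normalizer(example["text"])})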
src/regexes/__pycache__/__init__.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/__init__.cpython-38.pyc and b/src/regexes/__pycache__/__init__.cpython-38.pyc differ
 
src/regexes/__pycache__/currency.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/currency.cpython-38.pyc and b/src/regexes/__pycache__/currency.cpython-38.pyc differ
 
src/regexes/__pycache__/email.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/email.cpython-38.pyc and b/src/regexes/__pycache__/email.cpython-38.pyc differ
 
src/regexes/__pycache__/latin.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/latin.cpython-38.pyc and b/src/regexes/__pycache__/latin.cpython-38.pyc differ
 
src/regexes/__pycache__/number.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/number.cpython-38.pyc and b/src/regexes/__pycache__/number.cpython-38.pyc differ
 
src/regexes/__pycache__/persian.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/persian.cpython-38.pyc and b/src/regexes/__pycache__/persian.cpython-38.pyc differ
 
src/regexes/__pycache__/phone.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/phone.cpython-38.pyc and b/src/regexes/__pycache__/phone.cpython-38.pyc differ
 
src/regexes/__pycache__/punk.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/punk.cpython-38.pyc and b/src/regexes/__pycache__/punk.cpython-38.pyc differ
 
src/regexes/__pycache__/quote.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/quote.cpython-38.pyc and b/src/regexes/__pycache__/quote.cpython-38.pyc differ
 
src/regexes/__pycache__/url.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/url.cpython-38.pyc and b/src/regexes/__pycache__/url.cpython-38.pyc differ
 
src/run.sh CHANGED
@@ -3,17 +3,17 @@
 export LC_ALL=C.UTF-8
 export LANG=C.UTF-8
 
-#export MODEL_NAME_OR_PATH=t5-base
-export OUTPUT_DIR=/home/saied/code/gpt2-medium-persian
-export MODEL_TYPE=gpt2
-export CONFIG_NAME=/home/saied/code/gpt2-medium-persian
-export TOKENIZER_NAME=/home/saied/code/gpt2-medium-persian
+export MODEL_NAME_OR_PATH=/home/m3hrdadfi/code/gpt2-medium-persian
+export OUTPUT_DIR=/home/m3hrdadfi/code/gpt2-medium-persian
+# export MODEL_TYPE=gpt2
+# export CONFIG_NAME=/home/m3hrdadfi/code/gpt2-medium-persian
+# export TOKENIZER_NAME=/home/m3hrdadfi/code/gpt2-medium-persian
 
-#export TRAIN_FILE=/home/saied/code/data/...csv
-#export VALIDATION_FILE=/home/saied/code/data/...csv
-#export TEST_FILE=/home/saied/code/data/...csv
-export DATASET_NAME=oscar
-export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
+export TRAIN_FILE=/home/m3hrdadfi/data/train.csv
+export VALIDATION_FILE=/home/m3hrdadfi/data/test.csv
+#export TEST_FILE=/home/m3hrdadfi/code/data/...csv
+# export DATASET_NAME=oscar
+# export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
 export MAX_SEQUENCE_LENGTH=512
 
 #export MAX_TRAIN_SAMPLE=5000
@@ -21,8 +21,8 @@ export MAX_SEQUENCE_LENGTH=512
 
 export PER_DEVICE_TRAIN_BATCH_SIZE=16
 export PER_DEVICE_EVAL_BATCH_SIZE=16
-export NUM_TRAIN_EPOCHS=10.0
-export LEARNING_RATE=1e-3
+export NUM_TRAIN_EPOCHS=9.0
+export LEARNING_RATE=8e-4
 export WARMUP_STEPS=5000
 export LOGGING_STEPS=500
 export EVAL_STEPS=2500
@@ -30,11 +30,9 @@ export SAVE_STEPS=2500
 
 python src/run_clm_flax.py \
     --output_dir="$OUTPUT_DIR" \
-    --model_type="$MODEL_TYPE" \
-    --config_name="$CONFIG_NAME" \
-    --tokenizer_name="$TOKENIZER_NAME" \
-    --dataset_name="$DATASET_NAME" \
-    --dataset_config_name="$DATASET_CONFIG_NAME" \
+    --model_name_or_path="$MODEL_NAME_OR_PATH" \
+    --train_file="$TRAIN_FILE" \
+    --validation_file="$VALIDATION_FILE" \
     --block_size=$MAX_SEQUENCE_LENGTH \
    --per_device_train_batch_size=$PER_DEVICE_TRAIN_BATCH_SIZE \
    --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \
@@ -47,4 +45,25 @@ python src/run_clm_flax.py \
     --do_train \
     --do_eval \
     --overwrite_output_dir \
-    --push_to_hub
+    --push_to_hub
+
+# python src/run_clm_flax.py \
+#     --output_dir="$OUTPUT_DIR" \
+#     --model_type="$MODEL_TYPE" \
+#     --config_name="$CONFIG_NAME" \
+#     --tokenizer_name="$TOKENIZER_NAME" \
+#     --dataset_name="$DATASET_NAME" \
+#     --dataset_config_name="$DATASET_CONFIG_NAME" \
+#     --block_size=$MAX_SEQUENCE_LENGTH \
+#     --per_device_train_batch_size=$PER_DEVICE_TRAIN_BATCH_SIZE \
+#     --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \
+#     --num_train_epochs=$NUM_TRAIN_EPOCHS \
+#     --learning_rate=$LEARNING_RATE \
+#     --warmup_steps=$WARMUP_STEPS \
+#     --logging_step=$LOGGING_STEPS \
+#     --eval_steps=$EVAL_STEPS \
+#     --save_steps=$SAVE_STEPS \
+#     --do_train \
+#     --do_eval \
+#     --overwrite_output_dir \
+#     --push_to_hub
src/run_clm_flax.py CHANGED
@@ -358,14 +358,15 @@ def main():
 
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
-    logger.info("Preprocessing the dataset")
-    dataset = raw_dataset.filter(lambda example: filter_by_lang_regex(example["text"], ratio=0.75))
-    dataset = dataset.filter(lambda example: filter_by_num_tokens(example["text"], gt=64))
-    dataset = dataset.filter(lambda example: filter_by_num_sents(example["text"], gt=2))
-    dataset = dataset.filter(lambda example: filter_by_adv(example["text"], ratio=50))
-    dataset = dataset.map(normalizer)
-    logger.info(f"Preprocessed dataset kept {len(dataset)} out of {len(raw_dataset)}")
-
+    # logger.info("Preprocessing the dataset")
+    # dataset = raw_dataset.filter(lambda example: filter_by_lang_regex(example["text"], ratio=0.75))
+    # dataset = dataset.filter(lambda example: filter_by_num_tokens(example["text"], gt=64))
+    # dataset = dataset.filter(lambda example: filter_by_num_sents(example["text"], gt=2))
+    # dataset = dataset.filter(lambda example: filter_by_adv(example["text"], ratio=50))
+    # dataset = dataset.map(normalizer)
+    # logger.info(f"Preprocessed dataset kept {len(dataset)} out of {len(raw_dataset)}")
+    dataset = raw_dataset
+
     # Load pretrained model and tokenizer
 
     # Distributed training:
src/run_dataset.sh ADDED
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+export LC_ALL=C.UTF-8
+export LANG=C.UTF-8
+
+export OUTPUT_DIR=/home/m3hrdadfi/data/
+export DATASET_NAME=oscar
+export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
+
+python src/create_dataset.py \
+    --output_dir="$OUTPUT_DIR" \
+    --dataset_name="$DATASET_NAME" \
+    --dataset_config_name="$DATASET_CONFIG_NAME"