m3hrdadfi committed
Commit c92ce97
1 Parent(s): ca87603

Add dataset creation script

src/__pycache__/data_utils.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/data_utils.cpython-38.pyc and b/src/__pycache__/data_utils.cpython-38.pyc differ
 
src/__pycache__/dictionary.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/dictionary.cpython-38.pyc and b/src/__pycache__/dictionary.cpython-38.pyc differ
 
src/__pycache__/normalizer.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/normalizer.cpython-38.pyc and b/src/__pycache__/normalizer.cpython-38.pyc differ
 
src/create_dataset.py ADDED
@@ -0,0 +1,136 @@
+import ast
+import logging
+import os
+import sys
+from dataclasses import dataclass, field
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+from typing import Dict, List, Optional, Tuple
+from datasets import load_dataset
+from transformers import (
+    HfArgumentParser,
+)
+from data_utils import (
+    filter_by_lang_regex,
+    filter_by_num_tokens,
+    filter_by_num_sents,
+    filter_by_adv,
+    normalizer
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DataArguments:
+    """
+    Arguments to which dataset we are going to set up.
+    """
+    output_dir: str = field(
+        default=".",
+        metadata={"help": "The output directory where the config will be written."},
+    )
+    dataset_name: str = field(
+        default=None,
+        metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+def main():
+    parser = HfArgumentParser([DataArguments])
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        data_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))[0]
+    else:
+        data_args = parser.parse_args_into_dataclasses()[0]
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    logger.setLevel(logging.INFO)
+    logger.info(f"Preparing the dataset")
+    if data_args.dataset_name is not None:
+        dataset = load_dataset(
+            data_args.dataset_name,
+            data_args.dataset_config_name,
+            cache_dir=data_args.cache_dir,
+            split="train"
+        )
+    else:
+        data_files = {"train": data_args.train_file}
+        extension = data_args.train_file.split(".")[-1]
+        if extension == "txt":
+            extension = "text"
+
+        dataset = load_dataset(
+            extension,
+            data_files=data_files,
+            delimiter="\t",
+            cache_dir=data_args.cache_dir,
+        )
+
+    logger.info(f"dataset: {dataset}")
+
+    def data_preparation(item_dict):
+        if "text" not in item_dict:
+            return None
+
+        text = item_dict["text"]
+
+        status = filter_by_lang_regex(text, ratio=0.75)
+        if not status:
+            return None
+
+        status = filter_by_num_tokens(text, gt=64)
+        if not status:
+            return None
+
+        status = filter_by_num_sents(text, gt=2)
+        if not status:
+            return None
+
+        status = filter_by_adv(text, ratio=50)
+        if not status:
+            return None
+
+        text = normalizer(text)
+        return {"text": text}
+
+    data_dict = []
+    for item in tqdm(dataset, position=0, total=len(dataset)):
+        item = data_preparation(item)
+
+        if item:
+            data_dict.append(item)
+
+    data_df = pd.DataFrame(data_dict)
+
+    logger.info(f"Preparation - [before] consists of {len(dataset)} records!")
+    logger.info(f"Preparation - [after] consists of {len(data_df)} records!")
+
+    train, test = train_test_split(data_df, test_size=0.01, random_state=101)
+
+    train = train.reset_index(drop=True)
+    test = test.reset_index(drop=True)
+
+    logger.info(f"Preparation of [train] set consists of {len(train)} records!")
+    logger.info(f"Preparation of [test] set consists of {len(test)} records!")
+
+    os.makedirs(data_args.output_dir, exist_ok=True)
+    train.to_csv(os.path.join(data_args.output_dir, "train.csv"), sep="\t", encoding="utf-8", index=False)
+    test.to_csv(os.path.join(data_args.output_dir, "test.csv"), sep="\t", encoding="utf-8", index=False)
+    logger.info(f"Data saved here {data_args.output_dir}")
+
+if __name__ == '__main__':
+    main()
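Note: create_dataset.py writes tab-separated train.csv and test.csv files into --output_dir. As a quick sanity check, those files can be loaded back with the datasets csv builder, mirroring how the script itself reads csv inputs (delimiter="\t"). A minimal sketch, assuming the output paths used in run_dataset.sh below; this check is not part of the commit:

from datasets import load_dataset

# Reload the splits written by create_dataset.py (tab-separated, single "text" column).
data_files = {
    "train": "/home/m3hrdadfi/data/train.csv",
    "validation": "/home/m3hrdadfi/data/test.csv",
}
raw_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
print(raw_dataset)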
src/data_utils.py CHANGED
@@ -32,14 +32,11 @@ def filter_by_adv(text, ratio=50):
     return length_add < ratio
 
 
-# def normalizer(text, do_lowercase=False):
-#     text = normalize(text)
+def normalizer(text, do_lowercase=False):
+    text = normalize(text)
 
-#     if do_lowercase:
-#         text = text.lower()
+    if do_lowercase:
+        text = text.lower()
 
-#     return text
-def normalizer(example):
-    example["text"] = normalize(example["text"])
-    return example
+    return text
 
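Note: with this change, normalizer in data_utils.py takes a plain string (the form create_dataset.py calls) rather than a datasets example dict. A minimal sketch of the two call patterns, assuming normalize behaves as before; the sample text is made up:

from data_utils import normalizer

# New signature: operate on the raw text directly, as create_dataset.py does.
clean = normalizer("some raw text ...", do_lowercase=False)

# If an example-dict style is still needed (e.g. for dataset.map), wrap it:
# dataset = dataset.map(lambda example: {"text": normalizer(example["text"])})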
src/regexes/__pycache__/__init__.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/__init__.cpython-38.pyc and b/src/regexes/__pycache__/__init__.cpython-38.pyc differ
 
src/regexes/__pycache__/currency.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/currency.cpython-38.pyc and b/src/regexes/__pycache__/currency.cpython-38.pyc differ
 
src/regexes/__pycache__/email.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/email.cpython-38.pyc and b/src/regexes/__pycache__/email.cpython-38.pyc differ
 
src/regexes/__pycache__/latin.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/latin.cpython-38.pyc and b/src/regexes/__pycache__/latin.cpython-38.pyc differ
 
src/regexes/__pycache__/number.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/number.cpython-38.pyc and b/src/regexes/__pycache__/number.cpython-38.pyc differ
 
src/regexes/__pycache__/persian.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/persian.cpython-38.pyc and b/src/regexes/__pycache__/persian.cpython-38.pyc differ
 
src/regexes/__pycache__/phone.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/phone.cpython-38.pyc and b/src/regexes/__pycache__/phone.cpython-38.pyc differ
 
src/regexes/__pycache__/punk.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/punk.cpython-38.pyc and b/src/regexes/__pycache__/punk.cpython-38.pyc differ
 
src/regexes/__pycache__/quote.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/quote.cpython-38.pyc and b/src/regexes/__pycache__/quote.cpython-38.pyc differ
 
src/regexes/__pycache__/url.cpython-38.pyc CHANGED
Binary files a/src/regexes/__pycache__/url.cpython-38.pyc and b/src/regexes/__pycache__/url.cpython-38.pyc differ
 
src/run.sh CHANGED
@@ -3,17 +3,17 @@
 export LC_ALL=C.UTF-8
 export LANG=C.UTF-8
 
-#export MODEL_NAME_OR_PATH=t5-base
-export OUTPUT_DIR=/home/saied/code/gpt2-medium-persian
-export MODEL_TYPE=gpt2
-export CONFIG_NAME=/home/saied/code/gpt2-medium-persian
-export TOKENIZER_NAME=/home/saied/code/gpt2-medium-persian
+export MODEL_NAME_OR_PATH=/home/m3hrdadfi/code/gpt2-medium-persian
+export OUTPUT_DIR=/home/m3hrdadfi/code/gpt2-medium-persian
+# export MODEL_TYPE=gpt2
+# export CONFIG_NAME=/home/m3hrdadfi/code/gpt2-medium-persian
+# export TOKENIZER_NAME=/home/m3hrdadfi/code/gpt2-medium-persian
 
-#export TRAIN_FILE=/home/saied/code/data/...csv
-#export VALIDATION_FILE=/home/saied/code/data/...csv
-#export TEST_FILE=/home/saied/code/data/...csv
-export DATASET_NAME=oscar
-export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
+export TRAIN_FILE=/home/m3hrdadfi/data/train.csv
+export VALIDATION_FILE=/home/m3hrdadfi/data/test.csv
+#export TEST_FILE=/home/m3hrdadfi/code/data/...csv
+# export DATASET_NAME=oscar
+# export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
 export MAX_SEQUENCE_LENGTH=512
 
 #export MAX_TRAIN_SAMPLE=5000
@@ -21,8 +21,8 @@ export MAX_SEQUENCE_LENGTH=512
 
 export PER_DEVICE_TRAIN_BATCH_SIZE=16
 export PER_DEVICE_EVAL_BATCH_SIZE=16
-export NUM_TRAIN_EPOCHS=10.0
-export LEARNING_RATE=1e-3
+export NUM_TRAIN_EPOCHS=9.0
+export LEARNING_RATE=8e-4
 export WARMUP_STEPS=5000
 export LOGGING_STEPS=500
 export EVAL_STEPS=2500
@@ -30,11 +30,9 @@ export SAVE_STEPS=2500
 
 python src/run_clm_flax.py \
     --output_dir="$OUTPUT_DIR" \
-    --model_type="$MODEL_TYPE" \
-    --config_name="$CONFIG_NAME" \
-    --tokenizer_name="$TOKENIZER_NAME" \
-    --dataset_name="$DATASET_NAME" \
-    --dataset_config_name="$DATASET_CONFIG_NAME" \
+    --model_name_or_path="$MODEL_NAME_OR_PATH" \
+    --train_file="$TRAIN_FILE" \
+    --validation_file="$VALIDATION_FILE" \
     --block_size=$MAX_SEQUENCE_LENGTH \
    --per_device_train_batch_size=$PER_DEVICE_TRAIN_BATCH_SIZE \
    --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \
@@ -47,4 +45,25 @@ python src/run_clm_flax.py \
     --do_train \
     --do_eval \
     --overwrite_output_dir \
-    --push_to_hub
+    --push_to_hub
+
+# python src/run_clm_flax.py \
+#     --output_dir="$OUTPUT_DIR" \
+#     --model_type="$MODEL_TYPE" \
+#     --config_name="$CONFIG_NAME" \
+#     --tokenizer_name="$TOKENIZER_NAME" \
+#     --dataset_name="$DATASET_NAME" \
+#     --dataset_config_name="$DATASET_CONFIG_NAME" \
+#     --block_size=$MAX_SEQUENCE_LENGTH \
+#     --per_device_train_batch_size=$PER_DEVICE_TRAIN_BATCH_SIZE \
+#     --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \
+#     --num_train_epochs=$NUM_TRAIN_EPOCHS \
+#     --learning_rate=$LEARNING_RATE \
+#     --warmup_steps=$WARMUP_STEPS \
+#     --logging_step=$LOGGING_STEPS \
+#     --eval_steps=$EVAL_STEPS \
+#     --save_steps=$SAVE_STEPS \
+#     --do_train \
+#     --do_eval \
+#     --overwrite_output_dir \
+#     --push_to_hub
src/run_clm_flax.py CHANGED
@@ -358,14 +358,15 @@ def main():
 
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
-    logger.info("Preprocessing the dataset")
-    dataset = raw_dataset.filter(lambda example: filter_by_lang_regex(example["text"], ratio=0.75))
-    dataset = dataset.filter(lambda example: filter_by_num_tokens(example["text"], gt=64))
-    dataset = dataset.filter(lambda example: filter_by_num_sents(example["text"], gt=2))
-    dataset = dataset.filter(lambda example: filter_by_adv(example["text"], ratio=50))
-    dataset = dataset.map(normalizer)
-    logger.info(f"Preprocessed dataset kept {len(dataset)} out of {len(raw_dataset)}")
-
+    # logger.info("Preprocessing the dataset")
+    # dataset = raw_dataset.filter(lambda example: filter_by_lang_regex(example["text"], ratio=0.75))
+    # dataset = dataset.filter(lambda example: filter_by_num_tokens(example["text"], gt=64))
+    # dataset = dataset.filter(lambda example: filter_by_num_sents(example["text"], gt=2))
+    # dataset = dataset.filter(lambda example: filter_by_adv(example["text"], ratio=50))
+    # dataset = dataset.map(normalizer)
+    # logger.info(f"Preprocessed dataset kept {len(dataset)} out of {len(raw_dataset)}")
+    dataset = raw_dataset
+
     # Load pretrained model and tokenizer
 
     # Distributed training:
src/run_dataset.sh ADDED
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+export LC_ALL=C.UTF-8
+export LANG=C.UTF-8
+
+export OUTPUT_DIR=/home/m3hrdadfi/data/
+export DATASET_NAME=oscar
+export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
+
+python src/create_dataset.py \
+    --output_dir="$OUTPUT_DIR" \
+    --dataset_name="$DATASET_NAME" \
+    --dataset_config_name="$DATASET_CONFIG_NAME"