saied committed
Commit c36ebf7 (1 parent: 31bf2aa)

pushing tokenizer

config.json ADDED
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 5,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 5,
+  "gradient_checkpointing": false,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 1024,
+  "n_head": 16,
+  "n_inner": null,
+  "n_layer": 24,
+  "n_positions": 1024,
+  "n_special": 0,
+  "predict_special_tokens": true,
+  "resid_pdrop": 0.1,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "transformers_version": "4.9.0.dev0",
+  "use_cache": true,
+  "vocab_size": 50000
+}
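
For reference, a minimal sketch of how this added config could be loaded with transformers to build a freshly initialized GPT-2 medium model with the 50k Persian vocabulary. The file location and the choice of the Flax model class are assumptions based on the Flax training script elsewhere in this repo; this is not code from the commit.

# Hypothetical usage sketch (not part of this commit).
from transformers import GPT2Config, FlaxGPT2LMHeadModel

config = GPT2Config.from_json_file("config.json")  # path assumed: repo root
model = FlaxGPT2LMHeadModel(config, seed=42)       # randomly initialized weights

print(config.n_layer, config.n_embd, config.vocab_size)  # 24 1024 50000
print(config.bos_token_id, config.eos_token_id)          # 5 5
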
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
src/__pycache__/data_utils.cpython-38.pyc ADDED
Binary file (1.42 kB)

src/__pycache__/dictionary.cpython-38.pyc ADDED
Binary file (2.06 kB)

src/__pycache__/normalizer.cpython-38.pyc ADDED
Binary file (3.46 kB)

src/data_utils.py CHANGED
@@ -32,10 +32,14 @@ def filter_by_adv(text, ratio=50):
     return length_add < ratio
 
 
-def normalizer(text, do_lowercase=False):
-    text = normalize(text)
+# def normalizer(text, do_lowercase=False):
+#     text = normalize(text)
 
-    if do_lowercase:
-        text = text.lower()
+#     if do_lowercase:
+#         text = text.lower()
+
+#     return text
+def normalizer(example):
+    example["text"] = normalize(example["text"])
+    return example
 
-    return text
 
src/normalizer.py CHANGED
@@ -127,6 +127,7 @@ def normalize(text, zwnj="\u200c", tokenized=False):
127
  return " ".join(tokens)
128
 
129
 
 
130
  if __name__ == '__main__':
131
  import textwrap
132
 
 
127
  return " ".join(tokens)
128
 
129
 
130
+
131
  if __name__ == '__main__':
132
  import textwrap
133
 
src/regexes/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (151 Bytes)

src/regexes/__pycache__/currency.cpython-38.pyc ADDED
Binary file (674 Bytes)

src/regexes/__pycache__/email.cpython-38.pyc ADDED
Binary file (465 Bytes)

src/regexes/__pycache__/latin.cpython-38.pyc ADDED
Binary file (365 Bytes)

src/regexes/__pycache__/number.cpython-38.pyc ADDED
Binary file (331 Bytes)

src/regexes/__pycache__/persian.cpython-38.pyc ADDED
Binary file (532 Bytes)

src/regexes/__pycache__/phone.cpython-38.pyc ADDED
Binary file (361 Bytes)

src/regexes/__pycache__/punk.cpython-38.pyc ADDED
Binary file (292 Bytes)

src/regexes/__pycache__/quote.cpython-38.pyc ADDED
Binary file (572 Bytes)

src/regexes/__pycache__/url.cpython-38.pyc ADDED
Binary file (760 Bytes)
 
src/run.sh CHANGED
@@ -4,23 +4,23 @@ export LC_ALL=C.UTF-8
 export LANG=C.UTF-8
 
 #export MODEL_NAME_OR_PATH=t5-base
-export OUTPUT_DIR=/home/username/code/gpt2-medium-persian
+export OUTPUT_DIR=/home/saied/code/gpt2-medium-persian
 export MODEL_TYPE=gpt2
-export CONFIG_NAME=/home/username/code/gpt2-medium-persian
-export TOKENIZER_NAME=/home/username/code/gpt2-medium-persian
+export CONFIG_NAME=/home/saied/code/gpt2-medium-persian
+export TOKENIZER_NAME=/home/saied/code/gpt2-medium-persian
 
-#export TRAIN_FILE=/home/username/code/data/...csv
-#export VALIDATION_FILE=/home/username/code/data/...csv
-#export TEST_FILE=/home/username/code/data/...csv
+#export TRAIN_FILE=/home/saied/code/data/...csv
+#export VALIDATION_FILE=/home/saied/code/data/...csv
+#export TEST_FILE=/home/saied/code/data/...csv
 export DATASET_NAME=oscar
 export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
-export MAX_SEQUENCE_LENGTH=1024
+export MAX_SEQUENCE_LENGTH=512
 
 #export MAX_TRAIN_SAMPLE=5000
 #export MAX_EVAL_SAMPLES=5000
 
-export PER_DEVICE_TRAIN_BATCH_SIZE=8
-export PER_DEVICE_EVAL_BATCH_SIZE=8
+export PER_DEVICE_TRAIN_BATCH_SIZE=16
+export PER_DEVICE_EVAL_BATCH_SIZE=16
 export NUM_TRAIN_EPOCHS=10.0
 export LEARNING_RATE=1e-3
 export WARMUP_STEPS=5000
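
A side note on the hyperparameter change above: halving MAX_SEQUENCE_LENGTH (1024 to 512) while doubling the per-device batch size (8 to 16) keeps the number of tokens processed per device per step constant. A purely illustrative check:

# Illustrative arithmetic only; not part of the training scripts.
old_tokens_per_step = 8 * 1024   # batch size 8, sequence length 1024
new_tokens_per_step = 16 * 512   # batch size 16, sequence length 512
assert old_tokens_per_step == new_tokens_per_step == 8192
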
src/run_clm_flax.py CHANGED
@@ -158,7 +158,7 @@ class DataTrainingArguments:
         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
     )
     validation_split_percentage: Optional[int] = field(
-        default=5,
+        default=1,
         metadata={
             "help": "The percentage of the train set used as validation set in case there's no validation split"
         },
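
For context, validation_split_percentage is used when the dataset ships without a validation split; with the new default of 1, roughly 1% of the train split is held out for evaluation. A hedged sketch of that percent-slicing pattern follows; the exact wiring inside run_clm_flax.py may differ.

# Hypothetical sketch of carving out a 1% validation split (not verbatim from run_clm_flax.py).
from datasets import load_dataset

validation_split_percentage = 1  # new default in this diff

eval_ds = load_dataset("oscar", "unshuffled_deduplicated_fa",
                       split=f"train[:{validation_split_percentage}%]")
train_ds = load_dataset("oscar", "unshuffled_deduplicated_fa",
                        split=f"train[{validation_split_percentage}%:]")
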
src/run_config.sh CHANGED
@@ -3,11 +3,11 @@
 export LC_ALL=C.UTF-8
 export LANG=C.UTF-8
 
-export OUTPUT_DIR=./
-#export OUTPUT_DIR=/home/username/code/gpt2-medium-persian
+# export OUTPUT_DIR=./
+export OUTPUT_DIR=/home/saied/code/gpt2-medium-persian
 export NAME_OR_PATH=gpt2-medium
 
 python src/create_config.py \
     --output_dir="$OUTPUT_DIR" \
     --name_or_path="$NAME_OR_PATH" \
-    --params='{"vocab_size": 50000}'
+    --params='{"vocab_size": 50000,"bos_token_id": 5,"eos_token_id": 5}'
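
create_config.py itself is not shown in this diff. Below is a hedged sketch of what a script with this CLI could do: load the gpt2-medium reference config, apply the JSON overrides passed via --params, and write config.json to --output_dir. Everything except the flags visible above is an assumption.

# Hypothetical sketch of a create_config.py-style script (the real file is not in this diff).
import argparse
import json

from transformers import GPT2Config

parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", required=True)
parser.add_argument("--name_or_path", required=True)
parser.add_argument("--params", default="{}")
args = parser.parse_args()

# Start from the reference config (e.g. gpt2-medium) and apply overrides such as
# {"vocab_size": 50000, "bos_token_id": 5, "eos_token_id": 5}.
config = GPT2Config.from_pretrained(args.name_or_path)
config.update(json.loads(args.params))
config.save_pretrained(args.output_dir)  # writes config.json
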
src/run_tokenizer.sh CHANGED
@@ -3,7 +3,7 @@
 export LC_ALL=C.UTF-8
 export LANG=C.UTF-8
 
-export OUTPUT_DIR=/home/username/code/gpt2-medium-persian
+export OUTPUT_DIR=/home/saied/code/gpt2-medium-persian
 export DATASET_NAME=oscar
 export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
 export VOCAB_SIZE=50000
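
run_tokenizer.sh presumably drives the tokenizer training that produced the tokenizer.json, vocab.json, and merges.txt added below. A hedged sketch of byte-level BPE training with the tokenizers library under those settings follows; the special-token list is illustrative only and not taken from this repo.

# Hypothetical sketch of byte-level BPE tokenizer training (not the repo's actual script).
from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer

dataset = load_dataset("oscar", "unshuffled_deduplicated_fa", split="train")

def batch_iterator(batch_size=1000):
    # Yield raw text in batches to avoid materializing the whole corpus.
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]

tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50000,  # matches VOCAB_SIZE above
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<|endoftext|>"],  # illustrative only
)

tokenizer.save("tokenizer.json")  # single-file fast-tokenizer format
tokenizer.save_model(".")         # writes vocab.json and merges.txt
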
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff