mrm8488 committed
Commit b20453f
1 Parent(s): 1c2e798

First commit

.gitattributes CHANGED
@@ -14,3 +14,4 @@
  *.pb filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.run.sh.swp ADDED
File without changes
config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.0,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.0,
+   "eos_token_id": 50256,
+   "gradient_checkpointing": false,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "resid_pdrop": 0.0,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 50
+     }
+   },
+   "transformers_version": "4.9.0.dev0",
+   "use_cache": true,
+   "vocab_size": 50257
+ }
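
This config describes a standard 12-layer, 12-head GPT-2 base model (768-dimensional embeddings, 50257-token vocabulary, 1024-token context). As a minimal sketch that is not part of the commit, assuming transformers is installed and the repository root is the working directory, the file can be read back and inspected like this:

from transformers import GPT2Config

# Read the config.json committed above ("./" assumes the repo root as cwd)
config = GPT2Config.from_pretrained("./")
print(config.n_layer, config.n_head, config.n_embd)     # 12 12 768
print(config.vocab_size)                                 # 50257
print(config.task_specific_params["text-generation"])    # {'do_sample': True, 'max_length': 50}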
events.out.tfevents.1626191863.t1v-n-cf89aecf-w-0.861739.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7f5a6cef1983c8dcbf34fc6d98bc192078bb1949efeea81672ee0edcf125fbd7
+ size 367772
events.out.tfevents.1626201019.t1v-n-cf89aecf-w-0.880320.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f0c3cf7a5e63d4a77d3e7d14b72a3f38a9e604efcb0e78faa3ece56e96707932
+ size 735617
events.out.tfevents.1626210976.t1v-n-cf89aecf-w-0.890224.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:116109fbd23b28c6a4dc9ea828729181901502ced8d7c20ce220b9438ddc4341
+ size 73496
events.out.tfevents.1626212077.t1v-n-cf89aecf-w-0.892752.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e8d395a87948d65cb9d66207f2321ec9b3cfa06afe2d800d2b246fb3aa6385eb
+ size 11003489
events.out.tfevents.1626343565.t1v-n-cf89aecf-w-0.1005830.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a13db211fb1e8fcab57645f6987bab32b307ade0e149d5954194e5e433200c12
+ size 1397738
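
The five events.out.tfevents.* files above are TensorBoard logs stored as Git LFS pointers; the payloads themselves are fetched with git lfs pull. A rough sketch of how their scalar summaries could be inspected, assuming the tensorboard package is installed (the file name comes from this commit, but the logged tag names are not known here, so they are listed rather than assumed):

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Load one of the event files committed above (requires `git lfs pull` first)
acc = EventAccumulator("events.out.tfevents.1626343565.t1v-n-cf89aecf-w-0.1005830.3.v2")
acc.Reload()

# List the scalar tags the run logged, then dump the first one step by step
tags = acc.Tags()["scalars"]
print(tags)
if tags:
    for event in acc.Scalars(tags[0]):
        print(event.step, event.value)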
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a17e423adc844f2aaf44323c6738776964849df6d2ae7d0c0165bdd2267fbae1
+ size 497764120
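
flax_model.msgpack holds the trained Flax weights (about 498 MB, also an LFS pointer). A minimal usage sketch, not part of the commit, assuming the LFS files have been pulled and transformers with Flax/JAX support is installed; the Spanish prompt is purely illustrative:

import jax.numpy as jnp
from transformers import FlaxGPT2LMHeadModel, PreTrainedTokenizerFast

# Load the weights from flax_model.msgpack and the tokenizer.json in this repo
model = FlaxGPT2LMHeadModel.from_pretrained("./")
tokenizer = PreTrainedTokenizerFast(tokenizer_file="./tokenizer.json")

prompt = "Érase una vez"   # illustrative prompt, not taken from the commit
input_ids = jnp.array([tokenizer.encode(prompt)])

# Sampling settings mirror the "text-generation" task_specific_params in config.json
output = model.generate(
    input_ids,
    do_sample=True,
    max_length=50,
    pad_token_id=model.config.eos_token_id,
)
print(tokenizer.decode(output.sequences[0].tolist()))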
run.sh ADDED
@@ -0,0 +1,22 @@
+ ./run_clm_flax.py \
+ --output_dir="./" \
+ --model_type="gpt2" \
+ --model_name_or_path="./" \
+ --config_name="./" \
+ --tokenizer_name="./" \
+ --dataset_name="oscar" \
+ --dataset_config_name="unshuffled_deduplicated_es" \
+ --do_train \
+ --do_eval \
+ --block_size="512" \
+ --per_device_train_batch_size="64" \
+ --per_device_eval_batch_size="64" \
+ --learning_rate="5e-3" --warmup_steps="1000" \
+ --adam_beta1="0.9" --adam_beta2="0.98" --weight_decay="0.01" \
+ --overwrite_output_dir \
+ --num_train_epochs="20" \
+ --logging_steps="500" \
+ --save_steps="2500" \
+ --eval_steps="1000000" \
+ --preprocessing_num_workers="64"
+
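
run.sh launches causal-LM training on the Spanish OSCAR split with a 512-token block size and a per-device batch of 64. The global batch depends on how many accelerator devices JAX sees; as a back-of-the-envelope sketch (the 8-core TPU v3-8 below is an assumption, the script does not state the device count):

# Rough arithmetic only; the device count is an assumption, not given in run.sh
devices = 8                    # e.g. a TPU v3-8
per_device_batch = 64          # --per_device_train_batch_size
block_size = 512               # --block_size

sequences_per_step = devices * per_device_batch      # 512 sequences per optimizer step
tokens_per_step = sequences_per_step * block_size    # 262,144 tokens per optimizer step
print(sequences_per_step, tokens_per_step)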
run_clm_flax.py ADDED
@@ -0,0 +1 @@
+ ../../examples/flax/language-modeling/run_clm_flax.py
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
train_tokenizer.py ADDED
@@ -0,0 +1,20 @@
+ #!/usr/bin/env python3
+ from datasets import load_dataset
+ from tokenizers import ByteLevelBPETokenizer
+ # load dataset
+ dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train")
+ # Instantiate tokenizer
+ tokenizer = ByteLevelBPETokenizer()
+ def batch_iterator(batch_size=10000):
+     for i in range(0, len(dataset), batch_size):
+         yield dataset[i: i + batch_size]["text"]
+ # Customized training
+ tokenizer.train_from_iterator(batch_iterator(), vocab_size=50265, min_frequency=2, special_tokens=[
+     "<s>",
+     "<pad>",
+     "</s>",
+     "<unk>",
+     "<mask>",
+ ])
+ # Save files to disk
+ tokenizer.save("./tokenizer.json")
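
train_tokenizer.py trains a byte-level BPE tokenizer on the same OSCAR Spanish split and writes it to tokenizer.json. A small sketch, not part of the commit, of how that file could be reloaded and sanity-checked (the sample sentence is only an illustration):

from tokenizers import Tokenizer

# Reload the tokenizer.json written by the script above
tok = Tokenizer.from_file("./tokenizer.json")
enc = tok.encode("Hola, ¿cómo estás?")
print(enc.tokens)   # byte-level BPE pieces
print(enc.ids)      # integer ids fed to the model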