vince62s committed on
Commit
57f6742
1 Parent(s): 6c3f7b6

Upload 4 files

Files changed (4)
  1. config.json +157 -0
  2. model.00.safetensors +3 -0
  3. sentencepiece.bpe.model +3 -0
  4. vocab.json +0 -0
config.json ADDED
@@ -0,0 +1,157 @@
+ {
+   "src_vocab_size": 250880,
+   "report_every": 50,
+   "save_data": "/media/vincent/Crucial X6/NMT_work/en-de/runs/48-0-32-4096-16384/",
+   "skip_empty_level": "silent",
+   "decoder_start_token": "<s>",
+   "seed": 1234,
+   "log_file": "/media/vincent/Crucial X6/NMT_work/en-de/runs/48-0-32-4096-16384-with-estim/48-0-32-4096-16384-with-estim.log",
+   "n_sample": 0,
+   "tgt_vocab_size": 250880,
+   "default_specials": [
+     "<unk>",
+     "<blank>",
+     "<s>",
+     "</s>"
+   ],
+   "model": {
+     "rotary_theta": 10000,
+     "hidden_size": 4096,
+     "transformer_ff": 16384,
+     "layers": 48,
+     "parallel_residual": false,
+     "mlp_activation_fn": "gelu",
+     "add_ffnbias": true,
+     "add_qkvbias": true,
+     "norm_eps": 1e-05,
+     "heads": 32,
+     "embeddings": {
+       "n_positions": 514,
+       "word_vec_size": 4096,
+       "src_word_vec_size": 4096,
+       "position_shift": 2,
+       "freeze_word_vecs_enc": true,
+       "position_encoding_type": "Learned",
+       "tgt_word_vec_size": 4096,
+       "position_encoding": true
+     },
+     "shared_layer_norm": false,
+     "num_experts_per_tok": 0,
+     "max_relative_positions": 0,
+     "heads_kv": 32,
+     "num_experts": 0,
+     "architecture": "transformer_encoder",
+     "sliding_window": 0,
+     "share_decoder_embeddings": true,
+     "left_pad": false,
+     "add_estimator": true,
+     "encoder": {
+       "encoder_type": "transformer",
+       "src_word_vec_size": 4096
+     },
+     "layer_norm": "standard",
+     "rotary_interleave": false,
+     "rotary_dim": 0
+   },
+   "src_vocab": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xl-eole/dict2.txt",
+   "vocab_size_multiple": 1,
+   "share_vocab": true,
+   "tgt_vocab": null,
+   "transforms": [
+     "sentencepiece"
+   ],
+   "transforms_configs": {
+     "onmt_tokenize": {},
+     "tokendrop": {},
+     "bpe": {},
+     "filtertoolong": {
+       "src_seq_length": 94,
+       "tgt_seq_length": 94
+     },
+     "inlinetags": {},
+     "clean": {},
+     "suffix": {},
+     "docify": {},
+     "switchout": {},
+     "uppercase": {},
+     "terminology": {},
+     "sentencepiece": {
+       "tgt_subword_model": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xxl-eole/sentencepiece.bpe.model",
+       "src_subword_model": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xxl-eole/sentencepiece.bpe.model"
+     },
+     "normalize": {},
+     "bart": {},
+     "insert_mask_before_placeholder": {},
+     "prefix": {},
+     "tokenmask": {}
+   },
+   "training": {
+     "world_size": 1,
+     "w_bit": 0,
+     "group_size": 0,
+     "batch_type": "sents",
+     "param_init_glorot": true,
+     "prefetch_factor": 400,
+     "learning_rate_decay": 1.0,
+     "decay_steps": 100000,
+     "param_init": 0.0,
+     "save_checkpoint_steps": 4000,
+     "accum_count": [
+       8
+     ],
+     "num_workers": 2,
+     "model_dtype": "fp16",
+     "start_decay_steps": 1000000,
+     "label_smoothing": 0.1,
+     "keep_checkpoint": 50,
+     "train_from": "/mnt/InternalCrucial4/LLM_work/xlm-roberta-xxl-eole/",
+     "valid_batch_size": 1,
+     "estim_loss_lambda_steps": [
+       0
+     ],
+     "quant_type": "bnb_NF4",
+     "batch_size_multiple": 1,
+     "attention_dropout": [
+       0.0
+     ],
+     "learning_rate": 1.5e-05,
+     "model_path": "/media/vincent/Crucial X6/NMT_work/en-de/runs/48-0-32-4096-16384-with-estim",
+     "batch_size": 8,
+     "dropout_steps": [
+       0
+     ],
+     "dropout": [
+       0.1
+     ],
+     "score_threshold": 0.0,
+     "gpu_ranks": [
+       0
+     ],
+     "optim": "fusedadam",
+     "normalization": "tokens",
+     "valid_steps": 1000,
+     "train_steps": 4000,
+     "adam_beta2": 0.998,
+     "decay_method": "none",
+     "estim_loss_lambda": [
+       1.0
+     ],
+     "average_decay": 0.0,
+     "accum_steps": [
+       0
+     ],
+     "quant_layers": [
+       "linear_values",
+       "linear_query",
+       "linear_keys",
+       "final_linear",
+       "gate_up_proj",
+       "down_proj"
+     ],
+     "max_grad_norm": 1.0,
+     "self_attn_backend": "pytorch",
+     "freeze_encoder": true,
+     "bucket_size": 262144
+   },
+   "data": {}
+ }
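Since config.json is plain JSON, the architecture and training hyperparameters added above can be inspected without Eole itself. A minimal sketch, assuming only that the file from this commit is saved locally as config.json (standard library only):

    import json

    # Load the config uploaded in this commit.
    with open("config.json", "r", encoding="utf-8") as f:
        cfg = json.load(f)

    model = cfg["model"]
    training = cfg["training"]

    # Encoder shape: 48 layers, hidden size 4096, 32 heads, FFN width 16384.
    print(model["layers"], model["hidden_size"], model["heads"], model["transformer_ff"])

    # Training setup: fp16, fusedadam, bnb_NF4 quantization on the listed linear
    # layers, frozen encoder with the estimator head enabled.
    print(training["model_dtype"], training["optim"], training["quant_type"])
    print(model["add_estimator"], training["freeze_encoder"])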
model.00.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:570c25be3f39defca5b96f768d5d1bfe5a3587460d1880873722d99d7e0064ef
+ size 21423469914
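The weights themselves are stored via Git LFS; what is committed here is only the pointer (spec version, sha256 oid, and size in bytes, roughly 21.4 GB). As a rough sketch, a locally downloaded copy could be checked against the oid above with standard-library hashing (the local path is an assumption):

    import hashlib

    # Assumed local path to the downloaded weights file.
    path = "model.00.safetensors"
    expected = "570c25be3f39defca5b96f768d5d1bfe5a3587460d1880873722d99d7e0064ef"

    h = hashlib.sha256()
    with open(path, "rb") as f:
        # Hash in 1 MiB chunks; the file is far too large to load whole.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)

    print("OK" if h.hexdigest() == expected else "hash mismatch")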
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+ size 5069051
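This is the SentencePiece model referenced by the sentencepiece transform in config.json. Assuming the LFS file has been pulled locally, it can be loaded with the sentencepiece Python package; a minimal sketch (the sample sentence is only illustrative):

    import sentencepiece as spm

    # Load the BPE model shipped in this commit (after `git lfs pull`).
    sp = spm.SentencePieceProcessor(model_file="sentencepiece.bpe.model")

    # Tokenize a sample sentence into subword pieces and ids.
    pieces = sp.encode("Hello world", out_type=str)
    ids = sp.encode("Hello world")
    print(pieces)
    print(ids)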
vocab.json ADDED
The diff for this file is too large to render. See raw diff