ecker committed
Commit e5e7575
1 Parent(s): 796c86d

Upload 2 files


Very, very early NAR-len model (it works, but not well enough yet)

.gitattributes CHANGED
@@ -18,3 +18,4 @@ loras/ckpt/lora-cyberpunk-takemura-r128-a128/lora.sft filter=lfs diff=lfs merge=
 loras/ckpt/lora-portal-glados-r128-a128/lora.sft filter=lfs diff=lfs merge=lfs -text
 loras/ckpt/lora-samandmax-sam-r128-a128/lora.sft filter=lfs diff=lfs merge=lfs -text
 models/ckpt/ar+nar-layerskip-llama-8/fp32.sft filter=lfs diff=lfs merge=lfs -text
+models/ckpt/nar-len-llama-8/fp32.sft filter=lfs diff=lfs merge=lfs -text
models/ckpt/nar-len-llama-8/fp32.sft ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02bd16380e556e4e4a41b3d45ee3a64bf439e6b5bd24f39a9e0c684155fd4aaf
+size 475711612
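
The file above is only the Git LFS pointer; the ~476 MB of fp32 weights live in LFS storage. As a minimal sketch (assuming the checkpoint has been fetched with git lfs pull and that the .sft extension is an ordinary safetensors archive, as weights_format: sft in the config below suggests), the tensors can be listed with the safetensors library:

# Minimal sketch: list the tensors in the fetched checkpoint.
# Assumes git lfs pull has replaced the pointer stub with the real file and
# that .sft is a plain safetensors archive (weights_format: sft below).
from safetensors.torch import load_file

state_dict = load_file("models/ckpt/nar-len-llama-8/fp32.sft")

for name, tensor in state_dict.items():
    print(name, tuple(tensor.shape), tensor.dtype)
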
models/config.llama[nar-len].yaml ADDED
@@ -0,0 +1,169 @@
+sample_rate: 24_000
+audio_backend: "vocos"
+weights_format: sft
+experimental: True
+
+models:
+- name: "nar-len"
+  size: "full"
+  resp_levels: 8
+  tasks: 9
+  langs: 4
+  tones: 1
+  arch_type: llama
+  training: True
+  version: 5
+  attention: sdpa
+  dropout: 0.1
+  #loss_factors:
+  #  text: 0.01
+  #  prom: 0.5
+  #  resp: 1.0
+  capabilities: ["ar", "nar", "len"]
+  experimental:
+    audio_embedding_sums: True
+    split_classifiers: True
+    unified_position_ids: False
+    rvq_levels_p: [
+      0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0,
+      1, 2, 3, 4, 5, 6, 7
+    ]
+
+    masking_train_p: 1.0
+    masking_ratio_fixed: True
+    ignore_inputs_for_loss: True
+
+    cfg_cond_dropout_p: 0.1
+    cfg_prom_dropout_p: 0.05
+
+    #token_dropout_error: 0.001
+    #token_dropout_rate: 0.001
+    #layerskip: True
+    #layerskip_r: 2
+    #layerskip_e_scale: 0.1
+
+#loras:
+#- name : "lora-shodan"
+#  rank: 128
+#  alpha: 128
+#  training: True
+#  rvq_levels: []
+
+hyperparameters:
+  batch_size: 32
+  gradient_accumulation_steps: 4 # 8
+  gradient_clipping: 1.0
+  warmup_steps: 10
+
+  optimizer: Prodigy
+  learning_rate: 1.0
+  torch_optimizer: True
+
+  scheduler: "" # ScheduleFree
+  torch_scheduler: True
+
+evaluation:
+  batch_size: 8
+  frequency: 500
+  size: 8
+
+  kwargs:
+    max_duration: 500
+    max_steps: 25
+    ar_temperature: 1.0
+    repetition_penalty: 1.0
+    cfg_strength: 1.0
+    nar_temperature: 0.0
+
+trainer:
+  iterations: 1_000_000
+  save_frequency: 250
+  keep_last_checkpoints: 4
+
+  resize_modules: True
+
+  check_for_oom: False
+  gradient_checkpointing: True
+
+  weight_dtype: float16
+  amp: True
+
+  backend: deepspeed
+  deepspeed:
+    inferencing: False
+    amp: False
+    loss_scale_window: 250
+    min_loss_scale: 32768
+
+  load_webui: False
+
+inference:
+  backend: local
+  normalize: False
+
+  weight_dtype: float16
+  amp: True
+
+optimizations:
+  injects: False
+  replace: True
+
+  linear: False
+  embedding: False
+  optimizers: True
+
+  bitsandbytes: False
+  dadaptation: False
+  bitnet: False
+  fp8: False
+
+dataset:
+  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
+  speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
+  speaker_languages:
+    ja: [
+      "housamo",
+      "JA-"
+    ]
+    de: [
+      "DE-"
+    ]
+    fr: [
+      "FR-"
+    ]
+
+  use_hdf5: True
+  hdf5_flag: r
+
+  use_metadata: True
+  validate: True
+
+  workers: 2
+  cache: True
+
+  duration_range: [1.0, 16.0]
+
+  prompt_max_samples: 1
+  prompt_duration_range: [1.0, 6.0]
+  prompt_similar_p: 0.825
+  prompt_similar_top_k: 6
+
+  resps_max_samples: 1
+  resps_append_p: 0.0
+
+  sample_type: path # path # speaker
+  sample_order: duration
+  sample_max_duration_batch: 120
+  sample_shuffle: True
+  retokenize_text: True
+
+  tasks_list: [
+    "tts", "tts", "tts", "tts", "tts", "tts", "tts",
+    "tts", "tts", "tts", "tts", "tts", "tts", "tts",
+    "len",
+  ] #, "stt", "tts-c", "ns", "sr" ]
+
+  training: []
+  validation: []
+  noise: []
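
For a quick sanity check, the added config parses cleanly with stock PyYAML. This is only a generic sketch for inspecting a few fields; it deliberately bypasses the project's own config loader, so everything comes back as plain dicts and lists:

# Minimal sketch: parse the new config with PyYAML and inspect a few fields.
# This does not go through the project's own config machinery.
import yaml

with open("models/config.llama[nar-len].yaml", "r") as f:
    cfg = yaml.safe_load(f)

model = cfg["models"][0]
print(model["name"], model["capabilities"])   # nar-len ['ar', 'nar', 'len']
print(cfg["hyperparameters"]["optimizer"])    # Prodigy
print(cfg["dataset"]["tasks_list"].count("tts"), "tts draws per 'len' draw")  # 14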