File size: 2,185 Bytes
3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 3fe76d3 db6b323 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
# Top-level audio settings.
# The sample rate is written without digit separators: `44_000` only resolves
# to an integer under YAML 1.1 rules; the YAML 1.2 core schema reads it as a
# string.
sample_rate: 44000
audio_backend: "dac"
# Model definitions: a list with one mapping per model.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been flattened — confirm against the loader's schema.
models:
  - name: "nar-len"
    size:
      audio_tokens: 1024
      text_tokens: 256
      dim: 1024
      heads: 16
      layers: 16

    resp_levels: 9
    prom_levels: 9
    tasks: 8
    langs: 2
    tones: 1

    arch_type: llama
    training: true
    version: 5
    attention: flash_attention_2
    dropout: 0.1

    # Disabled per-term loss weights; uncomment the whole group to enable.
    # loss_factors:
    #   text: 0.01
    #   prom: 0.5
    #   resp: 1.0
    #   len: 1.0

    capabilities: ["nar", "len"]

    # Experimental switches for this model.
    experimental:
      audio_embedding_sums: false
      interleave: false
      unified_position_ids: true
      rvq_level_range: []
      split_classifiers: true
      tie_classifier_to_embedding: false
# Disabled example of a `loras` list entry; uncomment the whole group to enable.
# loras:
#   - name: "lora-test"
#     rank: 128
#     alpha: 128
#     training: True
#     rvq_levels: []
# Optimisation hyperparameters.
hyperparameters:
  batch_size: 16
  gradient_accumulation_steps: 4
  gradient_clipping: 1.0
  warmup_steps: 10

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: true

  # Empty string: no scheduler name is set here.
  scheduler: ""  # ScheduleFree
  torch_scheduler: true
# Evaluation settings. `batch_size` and `size` here belong to this section;
# they are separate from the same-named keys in other sections.
evaluation:
  batch_size: 4
  frequency: 250
  size: 4

  steps: 500
  ar_temperature: 1.0
  nar_temperature: 0.0
# Training-run settings.
trainer:
  # One million, written without `_` separators so that YAML 1.2 parsers also
  # resolve it as an integer.
  iterations: 1000000
  save_frequency: 250
  keep_last_checkpoints: 4

  check_for_oom: false
  gradient_checkpointing: false

  weight_dtype: bfloat16
  amp: false

  backend: deepspeed
  # Options nested under the `deepspeed` key; it carries its own `amp` switch,
  # distinct from `trainer.amp` above.
  deepspeed:
    inferencing: false
    amp: false

  # NOTE(review): assumed to be a trainer-level key rather than a `deepspeed`
  # option (indentation was lost in the copy this was restored from) — confirm.
  load_webui: false
# Inference settings. `backend`, `weight_dtype` and `amp` are set here
# independently of the same-named keys under `trainer`.
inference:
  backend: local
  normalize: false

  weight_dtype: bfloat16
  amp: false
# Boolean toggles for optional optimizations.
optimizations:
  injects: false
  replace: true

  linear: false
  embedding: false
  optimizers: true

  bitsandbytes: false
  dadaptation: false
  bitnet: false
  fp8: false
# Dataset settings.
dataset:
  # NOTE(review): these values are Python lambda source held as strings;
  # presumably the loader evaluates them — confirm, and only load this file
  # from a trusted source.
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
  speaker_group_getter: "lambda p: f'{p.parts[-3]}'"

  use_hdf5: true
  hdf5_flag: r
  use_metadata: true
  validate: true

  workers: 1
  cache: false

  duration_range: [3.0, 24.0]

  random_utterance: 1.0
  max_prompts: 1
  prompt_duration_range: [3.0, 3.0]

  max_resps: 1
  p_resp_append: 0.25

  sample_type: path  # path # speaker
  sample_order: duration
  sample_max_duration_batch: 100

  tasks_list: ["tts"]  # commented out: "tts-c", "ns", "sr"

  # Empty lists: no entries are configured in this file.
  training: []
  validation: []
  noise: []
|