File size: 2,165 Bytes
67ce2fc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
# Global audio settings.
# Written without a digit separator: `24_000` is only an integer under
# YAML 1.1 loaders; YAML 1.2 core-schema loaders read it as a string.
sample_rate: 24000
audio_backend: "vocos"
# Model definitions (a list with one entry).
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been flattened; confirm against the consumer's schema.
models:
  - name: "ar+nar-tts+stt"
    size: "full"
    resp_levels: 8
    prom_levels: 8
    tasks: 9
    langs: 2
    tones: 1
    arch_type: llama
    training: false
    version: 5
    attention: auto
    dropout: 0.1
    # Disabled per-stream loss weighting, kept for reference.
    # loss_factors:
    #   text: 0.01
    #   prom: 0.5
    #   resp: 1.0
    capabilities: ["ar", "nar"]
    # `experimental` carries no inline value, so the keys after it are its
    # children. NOTE(review): presumably every key up to `loras` belongs
    # here - verify.
    experimental:
      p_rvq_levels: "auto"
      audio_embedding_sums: true
      unified_position_ids: false
      split_classifiers: true
      #
      causal_size: 1
      interleave: false
      rvq_level_range: []
      tie_classifier_to_embedding: false
# LoRA adapter definitions (a list with one entry).
# NOTE(review): reconstructed as a top-level key, sibling of `models`;
# confirm against the consumer's schema.
loras:
  - name: "lora-max"
    rank: 128
    alpha: 128
    training: true
    rvq_levels: []
# Optimizer / scheduler hyperparameters.
# These keys must be nested: left flat, `batch_size` would collide with the
# `batch_size` under `evaluation`.
hyperparameters:
  batch_size: 32
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  warmup_steps: 10
  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: true
  # Empty string = no scheduler named; "ScheduleFree" is the alternative the
  # author noted.
  scheduler: ""  # ScheduleFree
  torch_scheduler: true
# Evaluation settings.
# These keys must be nested: left flat, `batch_size` and `size` would
# collide with same-named keys in other sections.
evaluation:
  batch_size: 4
  frequency: 250
  size: 4
  steps: 500
  ar_temperature: 1.0
  nar_temperature: 0.0
# Training-loop settings.
trainer:
  iterations: 1000000
  save_frequency: 250
  keep_last_checkpoints: 4
  resize_modules: true
  check_for_oom: false
  gradient_checkpointing: true
  weight_dtype: bfloat16
  amp: true
  backend: deepspeed
  # Backend-specific options. The second `amp` key can only be valid if it
  # is nested here, since `trainer` already defines `amp` above.
  deepspeed:
    inferencing: false
    amp: false
  # NOTE(review): placed at `trainer` level; the flattened source does not
  # show whether it belongs under `deepspeed` instead - confirm.
  load_webui: false
# Inference settings.
# These keys must be nested: left flat, `backend`, `weight_dtype` and `amp`
# would collide with the same-named keys under `trainer`.
inference:
  backend: local
  normalize: false
  weight_dtype: bfloat16
  amp: true
# Optional optimization toggles; each key is a boolean switch.
optimizations:
  injects: false
  replace: true
  linear: false
  embedding: false
  optimizers: true
  bitsandbytes: false
  dadaptation: false
  bitnet: false
  fp8: false
# Dataset loading and sampling settings.
# These keys must be nested: left flat, `training` would collide with the
# `training` flags in the `models` / `loras` entries.
dataset:
  use_hdf5: true
  hdf5_flag: r
  use_metadata: true
  validate: true
  workers: 1
  cache: true
  duration_range: [3.0, 12.0]
  random_utterance: 1.0
  max_prompts: 1
  prompt_duration_range: [3.0, 3.0]
  max_resps: 1
  p_resp_append: 0.25
  sample_type: path  # alternatives noted by the author: path, speaker
  sample_order: duration
  sample_max_duration_batch: 300
  sample_shuffle: false
  tasks_list: ["tts", "stt"]
  # Explicit empty lists (not bare keys, which would load as null).
  training: []
  validation: []
  noise: []