ecker
/

vall-e

Model card Files and versions Community

ecker committed on Oct 26, 2024

Commit

6542f44

verified ·

1 Parent(s): 494a301

Update models/config.llama.yaml

Browse files

Reduced a lot of unneeded settings in the config YAML

Files changed (1) hide show

models/config.llama.yaml +5 -106

models/config.llama.yaml CHANGED Viewed

@@ -1,117 +1,16 @@
-sample_rate: 24_000
-audio_backend: "vocos"
 models:
-- name: "ar+nar-tts+stt"
   size: "full"
   resp_levels: 8
-  prom_levels: 8
   tasks: 9
   langs: 4
   tones: 1
   arch_type: llama
-  training: False
-  version: 5
   attention: auto
-  dropout: 0.1
-  #loss_factors:
-  #  text: 0.01
-  #  prom: 0.5
-  #  resp: 1.0
   capabilities: ["ar", "nar"]
   experimental:
-    # modifies model arch
-    audio_embedding_sums: True
-    unified_position_ids: False
     split_classifiers: True
-#loras:
-#- name : "lora"
-#  rank: 128
-#  alpha: 128
-#  training: True
-#  rvq_levels: []
-hyperparameters:
-  batch_size: 32
-  gradient_accumulation_steps: 8
-  gradient_clipping: 1.0
-  warmup_steps: 10
-  optimizer: Prodigy
-  learning_rate: 1.0
-  torch_optimizer: True
-  scheduler: "" # ScheduleFree
-  torch_scheduler: True
-evaluation:
-  batch_size: 4
-  frequency: 250
-  size: 4
-  steps: 500
-  ar_temperature: 1.0
-  nar_temperature: 0.0
-trainer:
-  iterations: 1_000_000
-  save_frequency: 250
-  keep_last_checkpoints: 4
-  resize_modules: True
-  gradient_checkpointing: True
-  weight_dtype: bfloat16
-  amp: True
-  backend: deepspeed
-  deepspeed:
-    inferencing: False
-    amp: False
-inference:
-  backend: local
-  weight_dtype: bfloat16
-  amp: True
-optimizations:
-  injects: False
-  replace: True
-  linear: False
-  embedding: False
-  optimizers: True
-  bitsandbytes: False
-  dadaptation: False
-  bitnet: False
-  fp8: False
-dataset:
-  use_hdf5: True
-  hdf5_flag: r
-  use_metadata: True
-  validate: True
-  workers: 1
-  cache: True
-  duration_range: [3.0, 12.0]
-  prompt_max_samples: 1
-  prompt_duration_range: [3.0, 3.0]
-  resps_max_samples: 1
-  sample_type: path # path # speaker
-  sample_order: duration
-  sample_max_duration_batch: 300
-  sample_shuffle: False
-  tasks_list: [ "tts", "stt" ]
-  training: []
-  validation: []
-  noise: []

 models:
+- name: "ar+nar"
   size: "full"
   resp_levels: 8
   tasks: 9
   langs: 4
   tones: 1
   arch_type: llama
   attention: auto
+  version: 5
   capabilities: ["ar", "nar"]
   experimental:
     split_classifiers: True
+    audio_embedding_sums: True
+    unified_position_ids: False