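# NOTE(review): the five lines below are page-viewer residue (path, author,
# commit), not configuration. They are commented out so the file parses;
# consider removing them so the `# @package` directive is the first line.
# MusicGen / config / conditioner / clapemb2music.yaml
# reach-vb's picture
# reach-vb HF staff
# Stereo demo update (#60)
# 5325fcc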
# @package __global__
# Classifier-free guidance settings.
# NOTE(review): key meanings are inferred from their names (a dropout rate
# used at training time and a guidance coefficient used at inference);
# confirm against the code that reads this section.
classifier_free_guidance:
  training_dropout: 0.3
  inference_coef: 3.0
# Attribute dropout, with one sub-mapping per group (`text`, `wav`).
# Both groups are empty mappings, so this file configures no per-attribute
# dropout entries.
attribute_dropout:
  text: {}
  wav: {}
# Fuser settings. `sum`, `prepend`, `cross` and `input_interpolate` each hold
# a list of condition names; only `cross` is populated, with `description`.
# NOTE(review): presumably each list selects how the named conditions are
# merged into the model input - confirm against the fuser implementation.
fuser:
  cross_attention_pos_emb: false
  cross_attention_pos_emb_scale: 1
  sum: []
  prepend: []
  cross: [description]
  input_interpolate: []
# Conditioner definitions, keyed by condition name. This file defines a
# single condition, `description`, whose `model` is `clap`.
# NOTE(review): nesting is reconstructed - every key from `checkpoint`
# through `cache_path` is assumed to belong to the `clap` section; confirm
# against the consumer of this config.
conditioners:
  description:
    # Presumably selects the sibling section holding that model's parameters.
    model: clap
    clap:
      # NOTE(review): the `//reference/` prefix looks like a project-specific
      # path alias rather than a filesystem path - confirm how it is resolved.
      checkpoint: //reference/clap/music_audioset_epoch_15_esc_90.14.pt
      model_arch: 'HTSAT-base'
      enable_fusion: false
      sample_rate: 48000
      max_audio_length: 10
      audio_stride: 1
      dim: 512
      attribute: description
      normalize: true
      quantize: true  # use RVQ quantization
      n_q: 12
      bins: 1024
      kmeans_iters: 50
      text_p: 0.0  # probability of using text embed at train time
      cache_path: null
# Dataset settings. `joint_embed_attributes` lists the single attribute
# `description`; `train` holds three probability-like values.
# NOTE(review): nesting is reconstructed - the three `*_p` keys are assumed
# to belong to `train`; their exact semantics are not visible here, confirm
# against the dataset code.
dataset:
  joint_embed_attributes: [description]
  train:
    merge_text_p: 0.25
    drop_desc_p: 0.5
    drop_other_p: 0.5