# @package __global__

defaults:
  - /solver/default
  - /conditioner: none
  - _self_
  - /solver/musicgen/evaluation: none
  - override /dset: audio/default
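
# Hydra composes this file with the defaults above: entries listed before
# `_self_` are loaded first (so values in this file override them), while
# entries after `_self_` can override values set here. A typical invocation
# (solver name taken from the released configs, shown as an illustration):
#   dora run solver=musicgen/musicgen_base_32khz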

autocast: true
autocast_dtype: float16
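# Mixed precision: with autocast enabled, forward computations run in
# float16, reducing GPU memory use and usually speeding up training.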

solver: musicgen
sample_rate: ???
channels: ???
compression_model_checkpoint: ???
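# `???` is OmegaConf's mandatory-missing marker: sample_rate, channels and
# the compression model checkpoint must be provided by a more specific
# solver config or on the command line, e.g. (illustrative value):
#   compression_model_checkpoint=//pretrained/facebook/encodec_32khz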
# The following sets the number of codebooks on the underlying
# compression model. This may differ from the actual n_q given to the
# transformer when the model output is postprocessed, for instance
# for stereo channels. If not provided, the compression model's
# default value is used.
compression_model_n_q: null
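# Illustrative (hypothetical numbers): a stereo model interleaving two
# 4-codebook channel streams would present n_q=8 to the transformer while
# compression_model_n_q stays at 4.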

tokens:
  padding_with_special_token: false

interleave_stereo_codebooks:
  use: false
  per_timestep: false
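# When `use` is true, left/right channel codebooks are interleaved into a
# single token stream so a mono-style LM can model stereo audio;
# `per_timestep` controls the granularity of that interleaving (see
# InterleaveStereoCompressionModel in the codebase).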

cache:
  path:
  write: false
  write_shard: 0
  write_num_shards: 1
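# Intended use (per the repo's training docs): when `path` is set, encoded
# tokens are read from that cache instead of being recomputed; with
# write: true they are written there first, and write_shard /
# write_num_shards let several jobs split the writing work.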

dataset:
  batch_size: 128
  num_workers: 10
  segment_duration: 30
  min_segment_ratio: 0.8  # lower values such as 0.5 result in generations with a lot of silence.
  return_info: true
  train:
    num_samples: 1000000 # arbitrarily large number needed here for AudioDataset
  valid:
    num_samples: 10000
  generate:
    num_samples: 50
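# Rough arithmetic: batch_size=128 with optim.updates_per_epoch=2000 means
# one "epoch" consumes 128 * 2000 = 256,000 segments; train.num_samples is
# a virtual dataset length for random sampling, not a real corpus size.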

metrics:
  fad:
    use_gt: false
    model: tf
    tf:
      bin: null  # path to local frechet_audio_distance code
      model_path: //reference/fad/vggish_model.ckpt
  kld:
    use_gt: false
    model: passt
    passt:
      pretrained_length: 20
  text_consistency:
    use_gt: false
    model: clap
    clap:
      model_path: //reference/clap/music_audioset_epoch_15_esc_90.14.pt
      model_arch: 'HTSAT-base'
      enable_fusion: false
  chroma_cosine:
    use_gt: false
    model: chroma_base
    chroma_base:
      sample_rate: ${sample_rate}
      n_chroma: 12
      radix2_exp: 14
      argmax: true
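# Note: paths of the form //reference/... appear to resolve under the dora
# output directory's reference folder; the VGGish (FAD) and CLAP (text
# consistency) checkpoints must be downloaded there separately, as covered
# by the repo's metrics documentation.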

generate:
  every: 25
  num_workers: 5
  path: samples
  audio:
    format: wav
    strategy: loudness
    sample_rate: ${sample_rate}
    loudness_headroom_db: 14
  lm:
    prompted_samples: true
    unprompted_samples: true
    gen_gt_samples: false
    prompt_duration: null   # if not set, will use dataset.generate.segment_duration / 4
    gen_duration: null      # if not set, will use dataset.generate.segment_duration
    remove_prompts: false
    # generation params
    use_sampling: false
    temp: 1.0
    top_k: 0
    top_p: 0.0
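# With use_sampling: false decoding is greedy (argmax over logits);
# top_k: 0 and top_p: 0.0 disable those filters. Illustrative override to
# sample instead (values are an example, not a recommendation):
#   generate.lm.use_sampling=true generate.lm.top_k=250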

evaluate:
  every: 25
  num_workers: 5
  metrics:
    base: false
    fad: false
    kld: false
    text_consistency: false
    chroma_cosine: false

checkpoint:
  save_last: true
  save_every: 50
  keep_last: 10
  keep_every_states: null

optim:
  epochs: 200
  updates_per_epoch: 2000
  lr: 1e-4
  optimizer: adamw
  max_norm: 1.0
  eager_sync: true
  adam:
    betas: [0.9, 0.95]
    weight_decay: 0.1
    eps: 1e-8
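# max_norm: 1.0 is the gradient-clipping threshold (global norm);
# betas [0.9, 0.95] lowers Adam's default beta2 (0.999), a common choice
# for large language-model training.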

schedule:
  lr_scheduler: null
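# null keeps the learning rate constant. Other schedulers defined in the
# base solver config can be selected by name; an illustrative (assumed)
# override for cosine decay with warmup:
#   schedule:
#     lr_scheduler: cosine
#     cosine:
#       warmup: 4000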