File size: 1,677 Bytes
5325fcc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# @package __global__

# Hydra defaults list: the configs below are composed, in order, into this one.
defaults:
  - /solver/default
  - /model: score/basic
  # `override` replaces a `dset` choice made earlier in the composition
  # (presumably by /solver/default — confirm).
  - override /dset: audio/default
  # `_self_` is last, so keys defined in this file are merged after the
  # entries above.
  - _self_

# Top-level run settings.
solver: diffusion

sample_rate: 16000
channels: 1
compression_model_checkpoint: '//sig/5091833e'
# number of codebooks to keep
n_q: 2

# Data-loading settings. Keys directly under `dataset` sit alongside the
# per-split sections; presumably they act as defaults for every split —
# confirm against the dataset builder.
dataset:
  batch_size: 8
  num_workers: 10
  segment_duration: 1  # units not stated here (presumably seconds) — confirm
  train:
    num_samples: 100
  valid:
    num_samples: 100
  evaluate:
    batch_size: 8
    num_samples: 10
  generate:
    batch_size: 8
    num_samples: 10
    segment_duration: 10  # ten times the shared segment_duration of 1

# Training loss selection.
loss:
  kind: mse
  norm_power: 0.0

# Validation cadence (the unit of `every` is presumably epochs — confirm).
valid:
  every: 1

# Evaluation stage: cadence, worker count, and which metrics are switched on.
evaluate:
  every: 5
  num_workers: 5
  metrics:
    visqol: false
    sisnr: false
    rvm: true  # the only metric enabled in this config

# Generation stage: cadence, worker count, and output audio settings.
generate:
  every: 5
  num_workers: 5
  audio:
    sample_rate: ${sample_rate}  # interpolates the top-level `sample_rate` key

# Checkpoint retention settings. The exact semantics live in the checkpointing
# code, which is not visible from this file — confirm there before changing.
checkpoint:
  save_last: true
  save_every: 25
  keep_last: 10
  keep_every_states: null  # explicitly unset


# Optimization settings: schedule length, optimizer choice, and EMA options.
optim:
  epochs: 50
  updates_per_epoch: 2000
  # Written with an explicit mantissa dot: YAML 1.1 loaders such as plain
  # PyYAML resolve a bare `2e-4` as a string rather than a float.
  lr: 2.0e-4
  max_norm: 0
  optimizer: adam
  adam:
    betas: [0.9, 0.999]
    weight_decay: 0.0
  ema:
    use: true         # whether to use EMA or not
    updates: 1        # update at every step
    device: ${device} # device for EMA, can be put on GPU if more frequent updates
    decay: 0.99       # EMA decay value, if null, no EMA is used

# Multi-band processor settings; disabled in this config (`use: false`).
processor:
  name: multi_band_processor
  use: false
  n_bands: 8
  # Plain digits: `10_000` is an integer only under YAML 1.1 rules; YAML 1.2
  # core-schema parsers read the underscored form as a string.
  num_samples: 10000
  power_std: 1.0

# Resampling settings; disabled in this config (`use: false`).
resampling:
  use: false
  target_sr: 16000  # same numeric value as the top-level sample_rate

# Band-filter settings; disabled in this config (`use: false`).
filter:
  use: false
  n_bands: 4
  idx_band: 0
  cutoffs: null  # explicitly unset

# Diffusion noise-schedule parameters.
schedule:
  repartition: 'power'
  variable_step_batch: true
  beta_t0: 1.0e-5
  beta_t1: 2.9e-2
  beta_exp: 7.5
  num_steps: 1000
  variance: 'beta'
  clip: 5.0
  rescale: 1.0
  n_bands: null
  noise_scale: 1.0

# Metrics settings (what `num_stage` counts is not visible from this file —
# confirm against the solver that reads it).
metrics:
  num_stage: 4