# ESPnet
# 102 languages
# audio
# self-supervised-learning
# speech-recognition
# William Chen
# init
# 921b02e
# Top-level run options for this training job (flat key/value scalars).
config: conf/tuning/train_ssl_torchaudiohubert_large_960h_pretrain_it2_wavlm.yaml
print_config: false
log_level: INFO
dry_run: false
iterator_type: sequence
output_dir: exp_li/hubert_iter2_train_ssl_torchaudiohubert_large_960h_pretrain_it2_wavlm_raw_layer_9
ngpu: 1
seed: 0
num_workers: 4
num_att_plot: 0
# Distributed settings: 32 ranks (dist_world_size), slurm launcher, nccl
# backend. dist_init_method is an absolute file:// path that ends in the
# output_dir above, so it is specific to the filesystem the job ran on.
dist_backend: nccl
dist_init_method: file:///scratch/bbjs/chen26/espnet_01_23/egs2/librispeech/ssl1/exp_li/hubert_iter2_train_ssl_torchaudiohubert_large_960h_pretrain_it2_wavlm_raw_layer_9/.dist_init_a1240dd2-c062-43da-80c3-13d77add8604
dist_world_size: 32
dist_rank: 0
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: slurm
multiprocessing_distributed: true
unused_parameters: true
sharded_ddp: false
# cuDNN flags: enabled, deterministic, benchmark mode off.
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: true
collect_stats: false
write_collected_feats: false
max_epoch: 22
patience: null
# The two criteria below are flat lists written as flush (unindented) block
# sequences: [valid, loss] and [valid, loss, min].
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
# Checkpoint-selection criteria: a list of triples, here the single triple
# [valid, acc_m, max]. The leading "- -" opens an inner sequence, so the two
# following items must be indented to stay inside it; at column 0 they would
# become separate entries of the outer list.
best_model_criterion:
- - valid
  - acc_m
  - max
# Checkpoint retention, gradient handling and numeric precision.
keep_nbest_models: 10
nbest_averaging_interval: 0
grad_clip: 5.0
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: true
drop_last: true
debug_grad: false
# Logging and experiment tracking: matplotlib and tensorboard are on,
# wandb is off (all wandb_* identifiers are null).
log_interval: null
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
# Parameter initialisation: no pretrained path, no init_param entries and
# no frozen parameters are listed.
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param: []
# Epoch length and batch sizing; the valid_* variants are left null.
num_iters_per_epoch: 2000
batch_size: 20
valid_batch_size: null
batch_bins: 35000000
valid_batch_bins: null
# Shape files: each list pairs a speech_shape file with a text_shape.word
# file, for the training split (splits16) and the validation split.
train_shape_file:
- exp_li/hubert_iter2_stats_raw/splits16/speech_shape
- exp_li/hubert_iter2_stats_raw/splits16/text_shape.word
valid_shape_file:
- exp_li/hubert_iter2_stats_raw/valid/speech_shape
- exp_li/hubert_iter2_stats_raw/valid/text_shape.word
batch_type: numel
valid_batch_type: null
# Two fold lengths are given; presumably one per shape file above, in the
# same order (speech, text) - TODO confirm.
fold_length:
- 80000
- 400
sort_in_batch: descending
sort_batch: descending
multiple_iterator: true
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
# Training inputs, one [path, name, type] triple per stream: the speech
# stream (kaldi_ark) and the text label stream. The items after each "- -"
# are indented so every entry stays a 3-element triple instead of spilling
# into the outer list.
train_data_path_and_name_and_type:
- - exp_li/hubert_iter2_stats_raw/splits16/wav.scp
  - speech
  - kaldi_ark
- - exp_li/hubert_iter2_stats_raw/splits16/text.km.kmeans_iter2_hubert_train_li110_lid_portion0.1
  - text
  - text
# Validation inputs, one [path, name, type] triple per stream: the speech
# stream (kaldi_ark) and the text label stream. The items after each "- -"
# are indented so every entry stays a 3-element triple instead of spilling
# into the outer list.
valid_data_path_and_name_and_type:
- - dump/raw/dev_all_li/wav.scp
  - speech
  - kaldi_ark
- - dump/raw/dev_all_li/text.km.kmeans_iter2_hubert_train_li110_lid_portion0.1
  - text
  - text
# Data-key strictness and cache limits; the validation cache size is left
# null.
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
valid_max_cache_size: null
# Optimizer and learning-rate scheduler. Each *_conf key is a mapping of
# keyword options for the component named on the line before it, so its
# entries must be indented; at column 0 the *_conf value would be null and
# `lr` / `warmup_steps` would become unrelated top-level keys.
optim: adam
optim_conf:
  lr: 0.0005
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 32000
# Token vocabulary as a flush block sequence: 500 quoted numeric labels
# (quoted so they load as strings, not integers), followed by the plain
# entries <unk> and <sos/eos>. The count matches `num_classes: 500` further
# down. Presumably these are k-means cluster IDs, given the text.km.* label
# files referenced in the data paths - TODO confirm.
token_list:
- '55'
- '477'
- '43'
- '405'
- '468'
- '486'
- '41'
- '34'
- '0'
- '415'
- '409'
- '367'
- '224'
- '436'
- '17'
- '462'
- '32'
- '481'
- '332'
- '291'
- '395'
- '336'
- '147'
- '364'
- '430'
- '454'
- '44'
- '196'
- '269'
- '230'
- '189'
- '274'
- '305'
- '419'
- '444'
- '352'
- '72'
- '158'
- '349'
- '212'
- '148'
- '13'
- '140'
- '164'
- '329'
- '288'
- '259'
- '9'
- '249'
- '30'
- '198'
- '299'
- '427'
- '301'
- '107'
- '344'
- '213'
- '36'
- '203'
- '120'
- '208'
- '285'
- '69'
- '181'
- '166'
- '130'
- '402'
- '194'
- '37'
- '106'
- '330'
- '407'
- '242'
- '412'
- '167'
- '133'
- '40'
- '211'
- '57'
- '389'
- '325'
- '197'
- '170'
- '190'
- '260'
- '286'
- '257'
- '374'
- '116'
- '38'
- '221'
- '81'
- '87'
- '4'
- '173'
- '94'
- '83'
- '200'
- '331'
- '143'
- '248'
- '22'
- '26'
- '388'
- '174'
- '488'
- '320'
- '397'
- '188'
- '401'
- '126'
- '28'
- '11'
- '304'
- '135'
- '33'
- '109'
- '202'
- '267'
- '86'
- '487'
- '490'
- '482'
- '426'
- '103'
- '151'
- '324'
- '492'
- '238'
- '483'
- '467'
- '1'
- '20'
- '268'
- '21'
- '47'
- '377'
- '351'
- '297'
- '398'
- '348'
- '157'
- '303'
- '100'
- '68'
- '254'
- '216'
- '177'
- '491'
- '171'
- '361'
- '24'
- '338'
- '129'
- '154'
- '192'
- '222'
- '8'
- '156'
- '7'
- '78'
- '64'
- '29'
- '146'
- '90'
- '263'
- '393'
- '95'
- '102'
- '433'
- '480'
- '225'
- '59'
- '66'
- '82'
- '85'
- '54'
- '310'
- '429'
- '176'
- '366'
- '42'
- '298'
- '144'
- '215'
- '318'
- '136'
- '122'
- '459'
- '205'
- '498'
- '112'
- '52'
- '396'
- '282'
- '428'
- '335'
- '339'
- '386'
- '289'
- '187'
- '333'
- '449'
- '458'
- '233'
- '35'
- '400'
- '223'
- '375'
- '70'
- '134'
- '127'
- '410'
- '71'
- '312'
- '73'
- '341'
- '326'
- '273'
- '472'
- '23'
- '113'
- '117'
- '387'
- '207'
- '342'
- '12'
- '49'
- '281'
- '65'
- '356'
- '99'
- '423'
- '141'
- '493'
- '61'
- '494'
- '277'
- '453'
- '362'
- '185'
- '460'
- '256'
- '159'
- '302'
- '88'
- '53'
- '76'
- '243'
- '235'
- '306'
- '278'
- '15'
- '56'
- '25'
- '115'
- '48'
- '264'
- '363'
- '110'
- '204'
- '414'
- '287'
- '184'
- '172'
- '383'
- '316'
- '424'
- '169'
- '358'
- '14'
- '206'
- '91'
- '245'
- '447'
- '60'
- '125'
- '283'
- '246'
- '255'
- '313'
- '97'
- '89'
- '321'
- '214'
- '314'
- '464'
- '27'
- '294'
- '497'
- '128'
- '451'
- '365'
- '478'
- '337'
- '226'
- '422'
- '471'
- '381'
- '63'
- '452'
- '290'
- '118'
- '51'
- '261'
- '432'
- '376'
- '31'
- '80'
- '142'
- '295'
- '275'
- '272'
- '123'
- '270'
- '236'
- '195'
- '469'
- '50'
- '218'
- '435'
- '479'
- '315'
- '182'
- '372'
- '446'
- '132'
- '327'
- '229'
- '217'
- '373'
- '340'
- '153'
- '2'
- '163'
- '199'
- '378'
- '101'
- '79'
- '96'
- '434'
- '489'
- '247'
- '440'
- '448'
- '139'
- '466'
- '150'
- '465'
- '62'
- '421'
- '252'
- '104'
- '180'
- '232'
- '108'
- '307'
- '219'
- '228'
- '322'
- '455'
- '370'
- '39'
- '280'
- '114'
- '240'
- '137'
- '179'
- '162'
- '406'
- '168'
- '368'
- '473'
- '75'
- '441'
- '266'
- '442'
- '119'
- '347'
- '92'
- '209'
- '470'
- '296'
- '476'
- '93'
- '191'
- '437'
- '293'
- '186'
- '111'
- '265'
- '183'
- '145'
- '394'
- '155'
- '420'
- '438'
- '5'
- '463'
- '431'
- '334'
- '138'
- '3'
- '369'
- '403'
- '84'
- '152'
- '392'
- '18'
- '231'
- '417'
- '160'
- '357'
- '323'
- '475'
- '131'
- '485'
- '350'
- '450'
- '439'
- '353'
- '443'
- '384'
- '16'
- '201'
- '346'
- '253'
- '404'
- '445'
- '250'
- '165'
- '98'
- '193'
- '300'
- '328'
- '234'
- '496'
- '67'
- '359'
- '46'
- '345'
- '317'
- '354'
- '385'
- '276'
- '309'
- '425'
- '311'
- '456'
- '220'
- '178'
- '124'
- '244'
- '416'
- '399'
- '161'
- '413'
- '308'
- '371'
- '258'
- '45'
- '360'
- '149'
- '284'
- '241'
- '319'
- '411'
- '461'
- '237'
- '408'
- '390'
- '227'
- '382'
- '10'
- '292'
- '355'
- '262'
- '418'
- '379'
- '6'
- '271'
- '380'
- '105'
- '251'
- '175'
- '239'
- '210'
- '74'
- '495'
- '279'
- '457'
- '343'
- '77'
- '19'
- '391'
- '121'
- '499'
- '474'
- '484'
- '58'
- <unk>
- <sos/eos>
init: null
# Options passed to the collate function, nested under collate_fn_conf.
# NOTE(review): the nesting was lost in this copy. The extent is inferred
# from the second `noise_apply_prob` that appears later at top level: a YAML
# mapping cannot hold the same key twice, so this one must belong to
# collate_fn_conf, and the keys between it and the header belong with it.
# Confirm against the original config.
collate_fn_conf:
  label_downsampling: 1
  pad: false
  rand_crop: true
  mix_speech: true
  noise_apply_prob: 0.2
# Task-level sizes; num_classes (500) equals the number of numeric entries
# in token_list.
input_size: 1
num_classes: 500
# Preprocessing and augmentation switches.
use_preprocessor: true
use_mixing: true
cs_aug: false
mixing_splits: 16
token_type: word
bpemodel: null
non_linguistic_symbols: null
cleaner: null
g2p: null
speech_volume_normalize: null
# No RIR list is configured (rir_scp is null); noise comes from
# data/noise/wav.scp.
rir_scp: null
rir_apply_prob: 1.0
noise_scp: data/noise/wav.scp
# NOTE(review): `noise_apply_prob` also appears right after
# `collate_fn_conf:` earlier in this file with the same value (0.2); confirm
# the two are meant to stay in sync.
noise_apply_prob: 0.2
# Quoted so the range stays a string.
noise_db_range: '13_15'
pred_masked_weight: 1.0
pred_nomask_weight: 0.0
loss_weights: 0.0
# Optional components are all disabled (null) with empty option maps.
frontend: null
frontend_conf: {}
specaug: null
specaug_conf: {}
normalize: null
normalize_conf: {}
preencoder: null
preencoder_conf: {}
# Encoder selection and its keyword options. Everything under encoder_conf
# is indented so it forms that mapping; at column 0 encoder_conf would be
# null and these parameters would become unrelated top-level keys.
# Values as written: 24 layers, 16 heads, embed dim 1024, feed-forward dim
# 4096, all dropout / layer-drop rates 0.0.
encoder: torchaudio_hubert
encoder_conf:
  encoder_projection_dropout: 0.0
  encoder_attention_dropout: 0.0
  encoder_ff_interm_dropout: 0.0
  encoder_dropout: 0.0
  encoder_layer_drop: 0.0
  extractor_mode: layer_norm
  encoder_embed_dim: 1024
  encoder_num_layers: 24
  encoder_num_heads: 16
  encoder_ff_interm_features: 4096
  encoder_layer_norm_first: true
  final_dim: 768
  feature_grad_mult: null
# Model selection (no extra model options) and trailing metadata.
model: torchaudio
model_conf: {}
# Keys listed as required: output_dir and token_list (flush sequence).
required:
- output_dir
- token_list
# Quoted so the version stays a string rather than an integer.
version: '202211'
distributed: true