|
--- |
|
tags: |
|
- espnet |
|
- audio |
|
- automatic-speech-recognition |
|
language: en |
|
datasets: |
|
- slurp_entity |
|
license: cc-by-4.0 |
|
--- |
|
|
|
## ESPnet2 ASR model |
|
|
|
### `pyf98/slurp_entity_e_branchformer` |
|
|
|
This model was trained by Yifan Peng using the slurp_entity recipe in [espnet](https://github.com/espnet/espnet/). |
|
|
|
References: |
|
- [E-Branchformer: Branchformer with Enhanced merging for speech recognition (SLT 2022)](https://arxiv.org/abs/2210.00077) |
|
- [Branchformer: Parallel MLP-Attention Architectures to Capture Local and Global Context for Speech Recognition and Understanding (ICML 2022)](https://proceedings.mlr.press/v162/peng22a.html) |
|
|
|
### Demo: How to use in ESPnet2 |
|
|
|
Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) |
|
if you haven't done that already. |
|
|
|
```bash |
|
cd espnet |
|
git checkout 4bbd29a40cc7e2259996d30c0c76d3d789c1153d |
|
pip install -e . |
|
cd egs2/slurp_entity/asr1 |
|
./run.sh --skip_data_prep false --skip_train true --download_model pyf98/slurp_entity_e_branchformer |
|
``` |
|
|
|
<!-- Generated by scripts/utils/show_asr_result.sh --> |
|
# RESULTS |
|
## Environments |
|
- date: `Mon Feb 27 19:14:30 CST 2023` |
|
- python version: `3.9.15 (main, Nov 24 2022, 14:31:59) [GCC 11.2.0]` |
|
- espnet version: `espnet 202301` |
|
- pytorch version: `pytorch 1.13.1` |
|
- Git hash: `4bbd29a40cc7e2259996d30c0c76d3d789c1153d` |
|
- Commit date: `Sat Feb 25 21:54:03 2023 -0600` |
|
|
|
## exp/asr_train_asr_e_branchformer_e12_mlp3072_linear1024_layerdrop_raw_en_word |
|
### WER |
|
|
|
|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |
|
|---|---|---|---|---|---|---|---|---| |
|
|decode_asr_asr_model_valid.acc.ave_10best/devel|8690|178058|84.6|7.6|7.8|3.2|18.6|51.2| |
|
|decode_asr_asr_model_valid.acc.ave_10best/test|13078|262176|83.7|7.7|8.6|3.0|19.3|49.7| |
|
|
|
### CER |
|
|
|
|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |
|
|---|---|---|---|---|---|---|---|---| |
|
|decode_asr_asr_model_valid.acc.ave_10best/devel|8690|847400|90.8|3.0|6.2|3.5|12.7|51.2| |
|
|decode_asr_asr_model_valid.acc.ave_10best/test|13078|1245475|89.7|3.1|7.2|3.4|13.6|49.7| |
|
|
|
|
|
### Intent Classification |
|
|
|
- Valid Intent Classification Result: |
|
0.8781357882623706 |
|
- Test Intent Classification Result: |
|
0.8743691695977979 |
|
|
|
### Entity |
|
|
|
|SLU F1|Precision|Recall|F-Measure| |
|
|:---:|:---:|:---:|:---:| |
|
| test | 0.7940 | 0.7582 | 0.7757 | |
|
|
|
|
|
|
|
## ASR config |
|
|
|
<details><summary>expand</summary> |
|
|
|
``` |
|
config: conf/tuning/train_asr_e_branchformer_e12_mlp3072_linear1024_layerdrop.yaml |
|
print_config: false |
|
log_level: INFO |
|
dry_run: false |
|
iterator_type: sequence |
|
output_dir: exp/asr_train_asr_e_branchformer_e12_mlp3072_linear1024_layerdrop_raw_en_word |
|
ngpu: 1 |
|
seed: 0 |
|
num_workers: 1 |
|
num_att_plot: 3 |
|
dist_backend: nccl |
|
dist_init_method: env:// |
|
dist_world_size: null |
|
dist_rank: null |
|
local_rank: 0 |
|
dist_master_addr: null |
|
dist_master_port: null |
|
dist_launcher: null |
|
multiprocessing_distributed: false |
|
unused_parameters: false |
|
sharded_ddp: false |
|
cudnn_enabled: true |
|
cudnn_benchmark: false |
|
cudnn_deterministic: true |
|
collect_stats: false |
|
write_collected_feats: false |
|
max_epoch: 60 |
|
patience: null |
|
val_scheduler_criterion: |
|
- valid |
|
- loss |
|
early_stopping_criterion: |
|
- valid |
|
- loss |
|
- min |
|
best_model_criterion: |
|
- - valid |
|
- acc |
|
- max |
|
keep_nbest_models: 10 |
|
nbest_averaging_interval: 0 |
|
grad_clip: 5.0 |
|
grad_clip_type: 2.0 |
|
grad_noise: false |
|
accum_grad: 1 |
|
no_forward_run: false |
|
resume: true |
|
train_dtype: float32 |
|
use_amp: false |
|
log_interval: null |
|
use_matplotlib: true |
|
use_tensorboard: true |
|
create_graph_in_tensorboard: false |
|
use_wandb: false |
|
wandb_project: null |
|
wandb_id: null |
|
wandb_entity: null |
|
wandb_name: null |
|
wandb_model_log_interval: -1 |
|
detect_anomaly: false |
|
pretrain_path: null |
|
init_param: [] |
|
ignore_init_mismatch: false |
|
freeze_param: [] |
|
num_iters_per_epoch: null |
|
batch_size: 64 |
|
valid_batch_size: null |
|
batch_bins: 1000000 |
|
valid_batch_bins: null |
|
train_shape_file: |
|
- exp/asr_stats_raw_en_word/train/speech_shape |
|
- exp/asr_stats_raw_en_word/train/text_shape.word |
|
valid_shape_file: |
|
- exp/asr_stats_raw_en_word/valid/speech_shape |
|
- exp/asr_stats_raw_en_word/valid/text_shape.word |
|
batch_type: folded |
|
valid_batch_type: null |
|
fold_length: |
|
- 80000 |
|
- 150 |
|
sort_in_batch: descending |
|
sort_batch: descending |
|
multiple_iterator: false |
|
chunk_length: 500 |
|
chunk_shift_ratio: 0.5 |
|
num_cache_chunks: 1024 |
|
train_data_path_and_name_and_type: |
|
- - dump/raw/train/wav.scp |
|
- speech |
|
- kaldi_ark |
|
- - dump/raw/train/text |
|
- text |
|
- text |
|
valid_data_path_and_name_and_type: |
|
- - dump/raw/devel/wav.scp |
|
- speech |
|
- kaldi_ark |
|
- - dump/raw/devel/text |
|
- text |
|
- text |
|
allow_variable_data_keys: false |
|
max_cache_size: 0.0 |
|
max_cache_fd: 32 |
|
valid_max_cache_size: null |
|
exclude_weight_decay: false |
|
exclude_weight_decay_conf: {} |
|
optim: adam |
|
optim_conf: |
|
lr: 0.001 |
|
weight_decay: 1.0e-06 |
|
scheduler: warmuplr |
|
scheduler_conf: |
|
warmup_steps: 35000 |
|
token_list: |
|
- <blank> |
|
- <unk> |
|
- ▁SEP |
|
- ▁FILL |
|
- s |
|
- ▁the |
|
- a |
|
- ▁to |
|
- ▁i |
|
- ▁me |
|
- e |
|
- ▁s |
|
- ▁a |
|
- i |
|
- ▁you |
|
- ▁what |
|
- er |
|
- ing |
|
- u |
|
- ▁is |
|
- '''' |
|
- o |
|
- p |
|
- ▁in |
|
- ▁p |
|
- y |
|
- ▁my |
|
- ▁please |
|
- d |
|
- c |
|
- m |
|
- ▁b |
|
- l |
|
- ▁m |
|
- ▁c |
|
- st |
|
- date |
|
- n |
|
- ▁d |
|
- le |
|
- b |
|
- ▁for |
|
- re |
|
- t |
|
- ▁on |
|
- en |
|
- h |
|
- 'on' |
|
- ar |
|
- person |
|
- ▁re |
|
- ▁f |
|
- ▁g |
|
- ▁of |
|
- an |
|
- ▁ |
|
- g |
|
- ▁today |
|
- ▁t |
|
- or |
|
- ▁it |
|
- ▁this |
|
- ▁h |
|
- r |
|
- f |
|
- at |
|
- ch |
|
- ce |
|
- place_name |
|
- ▁email |
|
- ▁do |
|
- es |
|
- ri |
|
- ▁e |
|
- ▁w |
|
- ic |
|
- in |
|
- ▁that |
|
- event_name |
|
- ▁play |
|
- ▁and |
|
- al |
|
- ▁n |
|
- ▁can |
|
- email_query |
|
- ve |
|
- ▁new |
|
- day |
|
- it |
|
- ate |
|
- ▁from |
|
- ▁have |
|
- k |
|
- time |
|
- ▁am |
|
- media_type |
|
- email_sendemail |
|
- ent |
|
- ▁olly |
|
- qa_factoid |
|
- se |
|
- v |
|
- et |
|
- ck |
|
- ▁any |
|
- calendar_set |
|
- ly |
|
- th |
|
- ▁how |
|
- ▁meeting |
|
- ed |
|
- ▁tell |
|
- ▁st |
|
- x |
|
- ur |
|
- ro |
|
- ▁at |
|
- nd |
|
- ▁list |
|
- w |
|
- ▁u |
|
- ou |
|
- ▁not |
|
- ▁about |
|
- ▁an |
|
- ▁o |
|
- general_negate |
|
- ut |
|
- ▁time |
|
- ▁be |
|
- ▁ch |
|
- ▁are |
|
- social_post |
|
- business_name |
|
- la |
|
- ty |
|
- play_music |
|
- ot |
|
- general_quirky |
|
- ▁l |
|
- ▁sh |
|
- ▁tweet |
|
- om |
|
- ▁week |
|
- um |
|
- ▁one |
|
- ter |
|
- ▁he |
|
- ▁up |
|
- ▁com |
|
- general_praise |
|
- weather_query |
|
- ▁next |
|
- ▁th |
|
- ▁check |
|
- calendar_query |
|
- ▁last |
|
- ▁ro |
|
- ad |
|
- is |
|
- ▁with |
|
- ay |
|
- ▁send |
|
- pe |
|
- ▁pm |
|
- ▁tomorrow |
|
- ▁j |
|
- un |
|
- ▁train |
|
- general_explain |
|
- ▁v |
|
- one |
|
- ▁r |
|
- ra |
|
- news_query |
|
- ation |
|
- ▁emails |
|
- us |
|
- if |
|
- ct |
|
- ▁co |
|
- ▁add |
|
- ▁will |
|
- ▁se |
|
- nt |
|
- ▁was |
|
- ine |
|
- ▁de |
|
- ▁set |
|
- ▁ex |
|
- ▁would |
|
- ir |
|
- ow |
|
- ber |
|
- general_repeat |
|
- ight |
|
- ook |
|
- ▁again |
|
- ▁song |
|
- currency_name |
|
- ll |
|
- ▁ha |
|
- ▁go |
|
- relation |
|
- te |
|
- ion |
|
- and |
|
- ▁y |
|
- ▁ye |
|
- general_affirm |
|
- general_confirm |
|
- ery |
|
- ▁po |
|
- ff |
|
- ▁we |
|
- ▁turn |
|
- ▁did |
|
- ▁mar |
|
- ▁alarm |
|
- ▁like |
|
- datetime_query |
|
- ers |
|
- ▁all |
|
- ▁remind |
|
- ▁so |
|
- qa_definition |
|
- ▁calendar |
|
- end |
|
- ▁said |
|
- ci |
|
- ▁off |
|
- ▁john |
|
- ▁day |
|
- ss |
|
- pla |
|
- ume |
|
- ▁get |
|
- ail |
|
- pp |
|
- z |
|
- ry |
|
- am |
|
- ▁need |
|
- as |
|
- ▁thank |
|
- ▁wh |
|
- ▁want |
|
- ▁right |
|
- ▁jo |
|
- ▁facebook |
|
- ▁k |
|
- ge |
|
- ld |
|
- ▁fri |
|
- ▁two |
|
- general_dontcare |
|
- ▁news |
|
- ol |
|
- oo |
|
- ant |
|
- ▁five |
|
- ▁event |
|
- ake |
|
- definition_word |
|
- transport_type |
|
- ▁your |
|
- vi |
|
- orn |
|
- op |
|
- ▁weather |
|
- ome |
|
- ▁app |
|
- ▁lo |
|
- de |
|
- ▁music |
|
- weather_descriptor |
|
- ak |
|
- ke |
|
- ▁there |
|
- ▁si |
|
- ▁lights |
|
- ▁now |
|
- ▁mo |
|
- calendar_remove |
|
- our |
|
- ▁dollar |
|
- food_type |
|
- me |
|
- ▁more |
|
- ▁no |
|
- ▁birthday |
|
- orrect |
|
- ▁rep |
|
- ▁show |
|
- play_radio |
|
- ▁mon |
|
- ▁does |
|
- ood |
|
- ag |
|
- li |
|
- ▁sto |
|
- ▁contact |
|
- cket |
|
- email_querycontact |
|
- ▁ev |
|
- ▁could |
|
- ange |
|
- ▁just |
|
- out |
|
- ame |
|
- . |
|
- ▁ja |
|
- ▁confirm |
|
- qa_currency |
|
- ▁man |
|
- ▁late |
|
- ▁think |
|
- ▁some |
|
- timeofday |
|
- ▁bo |
|
- qa_stock |
|
- ong |
|
- ▁start |
|
- ▁work |
|
- ▁ten |
|
- int |
|
- ▁command |
|
- all |
|
- ▁make |
|
- ▁la |
|
- j |
|
- ▁answ |
|
- ▁hour |
|
- ▁cle |
|
- ah |
|
- ▁find |
|
- ▁service |
|
- ▁fa |
|
- qu |
|
- general_commandstop |
|
- ai |
|
- ▁when |
|
- ▁te |
|
- ▁by |
|
- social_query |
|
- ard |
|
- ▁tw |
|
- ul |
|
- id |
|
- ▁seven |
|
- ▁where |
|
- ▁much |
|
- art |
|
- ▁appointment |
|
- ver |
|
- artist_name |
|
- el |
|
- device_type |
|
- ▁know |
|
- ▁three |
|
- ▁events |
|
- ▁tr |
|
- ▁li |
|
- ork |
|
- red |
|
- ect |
|
- ▁let |
|
- ▁respon |
|
- ▁par |
|
- zz |
|
- ▁give |
|
- ▁twenty |
|
- ▁ti |
|
- ▁curre |
|
- play_podcasts |
|
- ▁radio |
|
- cooking_recipe |
|
- transport_query |
|
- ▁con |
|
- gh |
|
- ▁le |
|
- lists_query |
|
- ▁rem |
|
- recommendation_events |
|
- house_place |
|
- alarm_set |
|
- play_audiobook |
|
- ist |
|
- ase |
|
- music_genre |
|
- ive |
|
- ast |
|
- player_setting |
|
- ort |
|
- lly |
|
- news_topic |
|
- list_name |
|
- ▁playlist |
|
- ▁ne |
|
- business_type |
|
- personal_info |
|
- ind |
|
- ust |
|
- di |
|
- ress |
|
- recommendation_locations |
|
- lists_createoradd |
|
- iot_hue_lightoff |
|
- lists_remove |
|
- ord |
|
- ▁light |
|
- ere |
|
- alarm_query |
|
- audio_volume_mute |
|
- music_query |
|
- ▁audio |
|
- rain |
|
- ▁date |
|
- ▁order |
|
- audio_volume_up |
|
- ▁ar |
|
- ▁podcast |
|
- transport_ticket |
|
- mail |
|
- iot_hue_lightchange |
|
- iot_coffee |
|
- radio_name |
|
- ill |
|
- ▁ri |
|
- '@' |
|
- takeaway_query |
|
- song_name |
|
- takeaway_order |
|
- ▁ra |
|
- email_addcontact |
|
- play_game |
|
- book |
|
- transport_traffic |
|
- ▁house |
|
- music_likeness |
|
- her |
|
- transport_taxi |
|
- iot_hue_lightdim |
|
- ment |
|
- ght |
|
- fo |
|
- order_type |
|
- color_type |
|
- '1' |
|
- ven |
|
- ould |
|
- general_joke |
|
- ess |
|
- ain |
|
- qa_maths |
|
- ▁place |
|
- ▁twe |
|
- cast |
|
- iot_cleaning |
|
- ▁che |
|
- ▁cont |
|
- ith |
|
- audiobook_name |
|
- email_address |
|
- game_name |
|
- ▁cal |
|
- general_frequency |
|
- ▁tom |
|
- ▁food |
|
- act |
|
- iot_hue_lightup |
|
- '2' |
|
- alarm_remove |
|
- podcast_descriptor |
|
- ▁definition |
|
- audio_volume_down |
|
- ▁media |
|
- email_folder |
|
- dia |
|
- meal_type |
|
- ▁mus |
|
- recommendation_movies |
|
- ▁ad |
|
- ree |
|
- pt |
|
- now |
|
- playlist_name |
|
- ▁person |
|
- change_amount |
|
- ▁pla |
|
- escri |
|
- datetime_convert |
|
- podcast_name |
|
- ▁ab |
|
- time_zone |
|
- ▁def |
|
- ting |
|
- iot_wemo_on |
|
- music_settings |
|
- iot_wemo_off |
|
- orre |
|
- cy |
|
- ank |
|
- music_descriptor |
|
- lar |
|
- app_name |
|
- row |
|
- joke_type |
|
- xt |
|
- of |
|
- ition |
|
- ▁meet |
|
- ink |
|
- ▁confir |
|
- transport_agency |
|
- general_greet |
|
- ▁business |
|
- ▁art |
|
- ▁ag |
|
- urn |
|
- escript |
|
- rom |
|
- ▁rel |
|
- ▁au |
|
- ▁currency |
|
- audio_volume_other |
|
- iot_hue_lighton |
|
- ▁artist |
|
- '?' |
|
- ▁bus |
|
- cooking_type |
|
- movie_name |
|
- coffee_type |
|
- ingredient |
|
- ather |
|
- music_dislikeness |
|
- sp |
|
- q |
|
- ▁ser |
|
- esc |
|
- ▁bir |
|
- ▁cur |
|
- name |
|
- ▁tran |
|
- ▁hou |
|
- ek |
|
- uch |
|
- ▁conf |
|
- ▁face |
|
- '9' |
|
- ▁birth |
|
- I |
|
- sw |
|
- transport_descriptor |
|
- ▁comm |
|
- lease |
|
- transport_name |
|
- aid |
|
- movie_type |
|
- ▁device |
|
- alarm_type |
|
- audiobook_author |
|
- '5' |
|
- drink_type |
|
- ▁joh |
|
- ▁defin |
|
- word |
|
- ▁curren |
|
- order |
|
- iness |
|
- W |
|
- cooking_query |
|
- sport_type |
|
- ▁relation |
|
- oint |
|
- H |
|
- '8' |
|
- A |
|
- '0' |
|
- ▁dol |
|
- vice |
|
- ▁pers |
|
- '&' |
|
- T |
|
- ▁appoint |
|
- _ |
|
- '7' |
|
- '3' |
|
- '-' |
|
- game_type |
|
- ▁pod |
|
- N |
|
- M |
|
- E |
|
- list |
|
- music_album |
|
- dio |
|
- ▁transport |
|
- qa_query |
|
- C |
|
- O |
|
- U |
|
- query_detail |
|
- ']' |
|
- '[' |
|
- descriptor |
|
- ':' |
|
- spon |
|
- <sos/eos> |
|
init: null |
|
input_size: null |
|
ctc_conf: |
|
dropout_rate: 0.0 |
|
ctc_type: builtin |
|
reduce: true |
|
ignore_nan_grad: null |
|
zero_infinity: true |
|
joint_net_conf: null |
|
use_preprocessor: true |
|
token_type: word |
|
bpemodel: null |
|
non_linguistic_symbols: null |
|
cleaner: null |
|
g2p: null |
|
speech_volume_normalize: null |
|
rir_scp: null |
|
rir_apply_prob: 1.0 |
|
noise_scp: null |
|
noise_apply_prob: 1.0 |
|
noise_db_range: '13_15' |
|
short_noise_thres: 0.5 |
|
aux_ctc_tasks: [] |
|
frontend: default |
|
frontend_conf: |
|
fs: 16k |
|
specaug: specaug |
|
specaug_conf: |
|
apply_time_warp: true |
|
time_warp_window: 5 |
|
time_warp_mode: bicubic |
|
apply_freq_mask: true |
|
freq_mask_width_range: |
|
- 0 |
|
- 30 |
|
num_freq_mask: 2 |
|
apply_time_mask: true |
|
time_mask_width_range: |
|
- 0 |
|
- 40 |
|
num_time_mask: 2 |
|
normalize: utterance_mvn |
|
normalize_conf: {} |
|
model: espnet |
|
model_conf: |
|
ctc_weight: 0.3 |
|
lsm_weight: 0.1 |
|
length_normalized_loss: false |
|
extract_feats_in_collect_stats: false |
|
preencoder: null |
|
preencoder_conf: {} |
|
encoder: e_branchformer |
|
encoder_conf: |
|
output_size: 512 |
|
attention_heads: 8 |
|
attention_layer_type: rel_selfattn |
|
pos_enc_layer_type: rel_pos |
|
rel_pos_type: latest |
|
cgmlp_linear_units: 3072 |
|
cgmlp_conv_kernel: 31 |
|
use_linear_after_conv: false |
|
gate_activation: identity |
|
num_blocks: 12 |
|
dropout_rate: 0.1 |
|
positional_dropout_rate: 0.1 |
|
attention_dropout_rate: 0.1 |
|
input_layer: conv2d |
|
layer_drop_rate: 0.1 |
|
linear_units: 1024 |
|
positionwise_layer_type: linear |
|
macaron_ffn: true |
|
use_ffn: true |
|
merge_conv_kernel: 31 |
|
postencoder: null |
|
postencoder_conf: {} |
|
decoder: transformer |
|
decoder_conf: |
|
attention_heads: 8 |
|
linear_units: 2048 |
|
num_blocks: 6 |
|
dropout_rate: 0.1 |
|
positional_dropout_rate: 0.1 |
|
self_attention_dropout_rate: 0.1 |
|
src_attention_dropout_rate: 0.1 |
|
layer_drop_rate: 0.2 |
|
preprocessor: default |
|
preprocessor_conf: {} |
|
required: |
|
- output_dir |
|
- token_list |
|
version: '202301' |
|
distributed: false |
|
``` |
|
|
|
</details> |
|
|
|
|
|
|
|
### Citing ESPnet |
|
|
|
```bibtex |
|
@inproceedings{watanabe2018espnet, |
|
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
|
title={{ESPnet}: End-to-End Speech Processing Toolkit}, |
|
year={2018}, |
|
booktitle={Proceedings of Interspeech}, |
|
pages={2207--2211}, |
|
doi={10.21437/Interspeech.2018-1456}, |
|
url={http://dx.doi.org/10.21437/Interspeech.2018-1456} |
|
} |
|
|
|
|
|
|
|
|
|
``` |
|
|
|
or arXiv: |
|
|
|
```bibtex |
|
@misc{watanabe2018espnet, |
|
title={ESPnet: End-to-End Speech Processing Toolkit}, |
|
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
|
year={2018}, |
|
eprint={1804.00015}, |
|
archivePrefix={arXiv}, |
|
primaryClass={cs.CL} |
|
} |
|
``` |
|
|