ms180 commited on
Commit
9caf213
1 Parent(s): a6647d4

Initial commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. 404.html +37 -0
  2. assets/404.html-DN7291h8.js +1 -0
  3. assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js +87 -0
  4. assets/NpmBadge-rh9tvaXX.js +1 -0
  5. assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js +315 -0
  6. assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js +115 -0
  7. assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js +220 -0
  8. assets/app-DTS6SjJz.js +0 -0
  9. assets/asr_cli.html-BA-xBrC-.js +113 -0
  10. assets/asr_library.html-rEQwKTMV.js +165 -0
  11. assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js +61 -0
  12. assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js +158 -0
  13. assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js +64 -0
  14. assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js +112 -0
  15. assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js +0 -0
  16. assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js +99 -0
  17. assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js +141 -0
  18. assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js +0 -0
  19. assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js +355 -0
  20. assets/finetune_owsm.html-ICOQYZj2.js +183 -0
  21. assets/finetune_with_lora.html-3NfoQDOl.js +47 -0
  22. assets/index.html-DGcx4T0I.js +7 -0
  23. assets/onnx_conversion_demo.html-D56NEMop.js +87 -0
  24. assets/pretrained.html-JpE__EKJ.js +40 -0
  25. assets/se_demo.html-DY-mv2y8.js +311 -0
  26. assets/st_demo.html-WLzB4ZGO.js +54 -0
  27. assets/style-SNWc1iKP.css +1 -0
  28. assets/tacotron2.html-Ds1AKES7.js +69 -0
  29. assets/train.html-BQ-t2Cs4.js +105 -0
  30. assets/tts_cli.html-BfB21gs4.js +113 -0
  31. assets/tts_realtime_demo.html-BKOGq7as.js +351 -0
  32. browserconfig.xml +9 -0
  33. espnet2/asr/asr_cli.html +149 -0
  34. espnet2/asr/asr_library.html +201 -0
  35. espnet2/asr/espnet2_asr_realtime_demo.html +194 -0
  36. espnet2/asr/espnet2_asr_transfer_learning_demo.html +100 -0
  37. espnet2/asr/espnet2_streaming_asr_demo.html +135 -0
  38. espnet2/others/onnx_conversion_demo.html +123 -0
  39. espnet2/others/pretrained.html +76 -0
  40. espnet2/se/espnet_se_demonstration_for_waspaa_2021.html +0 -0
  41. espnet2/se/se_demo.html +347 -0
  42. espnet2/slu/espnet2_2pass_slu_demo.html +97 -0
  43. espnet2/st/st_demo.html +90 -0
  44. espnet2/tts/espnet2_tts_realtime_demo.html +177 -0
  45. espnet2/tts/tts_cli.html +149 -0
  46. espnet2/tts/tts_realtime_demo.html +387 -0
  47. espnetez/asr/finetune_owsm.html +219 -0
  48. espnetez/asr/finetune_with_lora.html +83 -0
  49. espnetez/asr/train.html +141 -0
  50. espnetez/tts/tacotron2.html +105 -0
404.html ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title> </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/404.html-DN7291h8.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link 
rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" as="script"><link rel="prefetch" href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><main class="page"><div class="theme-default-content"><h1>404</h1><blockquote>There&#39;s nothing here.</blockquote><a class="route-link" href="/">Take me home</a></div></main></div><!--[--><!----><!--]--><!--]--></div>
35
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
36
+ </body>
37
+ </html>
assets/404.html-DN7291h8.js ADDED
@@ -0,0 +1 @@
 
 
1
+ import{_ as t,o as e,c as o,a}from"./app-DTS6SjJz.js";const n={},c=a("p",null,"404 Not Found",-1),l=[c];function s(_,r){return e(),o("div",null,l)}const d=t(n,[["render",s],["__file","404.html.vue"]]),h=JSON.parse('{"path":"/404.html","title":"","lang":"en-US","frontmatter":{"layout":"NotFound"},"headers":[],"git":{},"filePathRelative":null}');export{d as comp,h as data};
assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as l,r as o,o as t,c as i,a as s,d as a,b as n,e as p}from"./app-DTS6SjJz.js";const r={},c=s("h1",{id:"cmu-11492-11692-spring-2023-data-preparation",tabindex:"-1"},[s("a",{class:"header-anchor",href:"#cmu-11492-11692-spring-2023-data-preparation"},[s("span",null,"CMU 11492/11692 Spring 2023: Data preparation")])],-1),d=s("p",null,"In this demonstration, we will show you the procedure to prepare the data for speech processing (ASR as an example).",-1),u=s("p",null,"Main references:",-1),h={href:"https://github.com/espnet/espnet",target:"_blank",rel:"noopener noreferrer"},D={href:"https://espnet.github.io/espnet/",target:"_blank",rel:"noopener noreferrer"},v={href:"https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_tutorial_2021_CMU_11751_18781.ipynb",target:"_blank",rel:"noopener noreferrer"},m={href:"https://colab.research.google.com/drive/1tY6PxF_M5Nx5n488x0DrpujJOyqW-ATi?usp=sharing",target:"_blank",rel:"noopener noreferrer"},y={href:"https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.ipynb",target:"_blank",rel:"noopener noreferrer"},b=p(`<p>Author:</p><ul><li>Jiatong Shi (jiatongs@andrew.cmu.edu)</li></ul><h2 id="objectives" tabindex="-1"><a class="header-anchor" href="#objectives"><span>Objectives</span></a></h2><p>After this demonstration, you are expected to know:</p><ul><li>Understand the Kaldi(ESPnet) data format</li></ul><h2 id="useful-links" tabindex="-1"><a class="header-anchor" href="#useful-links"><span>Useful links</span></a></h2><ul><li>Installation https://espnet.github.io/espnet/installation.html</li><li>Kaldi Data format https://kaldi-asr.org/doc/data_prep.html</li><li>ESPnet data format https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE#about-kaldi-style-data-directory</li></ul><h2 id="download-espnet" tabindex="-1"><a class="header-anchor" href="#download-espnet"><span>Download ESPnet</span></a></h2><p>We use <code>git clone</code> to 
download the source code of ESPnet and then go to a specific commit.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># It takes a few seconds</span></span>
2
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git clone </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">depth </span><span style="color:#B5CEA8;">5</span><span style="color:#D4D4D4;"> https://github.com/espnet/espnet</span></span>
3
+ <span class="line"></span>
4
+ <span class="line"><span style="color:#6A9955;"># We use a specific commit just for reproducibility.</span></span>
5
+ <span class="line"><span style="color:#D4D4D4;">%cd /content/espnet</span></span>
6
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git checkout </span><span style="color:#F44747;">3970558fbbe38d7b7e9922b08a9aa249390d4fb7</span></span>
7
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="setup-python-environment-based-on-anaconda" tabindex="-1"><a class="header-anchor" href="#setup-python-environment-based-on-anaconda"><span>Setup Python environment based on anaconda</span></a></h2><p>There are several other installation methods, but <strong>we highly recommend the anaconda-based one</strong>. In this demonstration, we will only need to have the python environment (no need to install the full espnet). But installation of ESPnet main codebase will be necessary for for training/inference/scoring.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># It takes 30 seconds</span></span>
8
+ <span class="line"><span style="color:#D4D4D4;">%cd /content/espnet/tools</span></span>
9
+ <span class="line"><span style="color:#D4D4D4;">!./setup_anaconda.sh anaconda espnet </span><span style="color:#B5CEA8;">3.9</span></span>
10
+ <span class="line"></span>
11
+ <span class="line"><span style="color:#D4D4D4;">!./installers/install_sph2pipe.sh</span></span>
12
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>We will also install some essential python libraries (these will be auto-matically downloaded during espnet installation. However, today, we won&#39;t go through that part, so we need to mannually install the packages.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">pip install kaldiio soundfile tqdm librosa matplotlib IPython</span></span>
13
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>We will also need Kaldi for some essential scripts.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git clone https://github.com/kaldi-asr/kaldi.git</span></span>
14
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h1 id="data-preparation-in-espnet" tabindex="-1"><a class="header-anchor" href="#data-preparation-in-espnet"><span>Data preparation in ESPnet</span></a></h1><p>ESPnet has a number of recipes (146 recipes on Jan. 23, 2023). One of the most important steps for those recipes is the preparation of the data. Constructing in different scenarios, spoken corpora need to be converted into a unified format. In ESPnet, we follow and adapt the Kaldi data format for various tasks.</p><p>In this demonstration, we will focus on a specific recipe <code>an4</code> as an example.</p><p>Other materials:</p><ul><li>Kaldi format documentation can be found in https://kaldi-asr.org/doc/data_prep.html</li><li>ESPnet data format is in https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE#about-kaldi-style-data-directory</li><li>Please refer to https://github.com/espnet/espnet/blob/master/egs2/README.md for a complete list of recipes.</li><li>Please also check the general usage of the recipe in https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2</li></ul><h2 id="data-preparation-for-an4" tabindex="-1"><a class="header-anchor" href="#data-preparation-for-an4"><span>Data preparation for AN4</span></a></h2><p>All the data preparation in ESPnet2 happens in <code>egs2/recipe_name/task/local/data.sh</code> where the task can be either <code>asr1</code>, <code>enh1</code>, <code>tts1</code>, etc.</p><p><strong>CMU AN4 recipe</strong></p><p>In this demonstration, we will use the CMU <code>an4</code> recipe. 
This is a small-scale speech recognition task mainly used for testing.</p><p>First, let&#39;s go to the recipe directory.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">%cd /content/espnet/egs2/an4/asr1</span></span>
15
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">ls</span></span>
16
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>egs2/an4/asr1/</span></span>
17
+ <span class="line"><span> - conf/ # Configuration files for training, inference, etc.</span></span>
18
+ <span class="line"><span> - scripts/ # Bash utilities of espnet2</span></span>
19
+ <span class="line"><span> - pyscripts/ # Python utilities of espnet2</span></span>
20
+ <span class="line"><span> - steps/ # From Kaldi utilities</span></span>
21
+ <span class="line"><span> - utils/ # From Kaldi utilities</span></span>
22
+ <span class="line"><span> - local/ # Some local scripts for specific recipes (Data Preparation usually in \`local/data.sh\`)</span></span>
23
+ <span class="line"><span> - db.sh # The directory path of each corpora</span></span>
24
+ <span class="line"><span> - path.sh # Setup script for environment variables</span></span>
25
+ <span class="line"><span> - cmd.sh # Configuration for your backend of job scheduler</span></span>
26
+ <span class="line"><span> - run.sh # Entry point</span></span>
27
+ <span class="line"><span> - asr.sh # Invoked by run.sh</span></span>
28
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># a few seconds</span></span>
29
+ <span class="line"><span style="color:#D4D4D4;">!./local/data.sh</span></span>
30
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>The orginal data usually in various format. AN4 has a quite straightforward format. You may dig into the folder <code>an4</code> to see the raw format. After this preparation is finished, all the information will be in the <code>data</code> directory:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">ls data</span></span>
31
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>In this recipe, we use <code>train_nodev</code> as a training set, <code>train_dev</code> as a validation set (monitor the training progress by checking the validation score). We also use <code>test</code> and <code>train_dev</code> sets for the final speech recognition evaluation.</p><p>Let&#39;s check one of the training data directories:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">ls -</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;"> data/train_nodev/</span></span>
32
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>In short, the four files are:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>spk2utt # Speaker information</span></span>
33
+ <span class="line"><span>text # Transcription file</span></span>
34
+ <span class="line"><span>utt2spk # Speaker information</span></span>
35
+ <span class="line"><span>wav.scp # Audio file</span></span>
36
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>The <code>wav.scp</code> is the most important file that holds the speech data. For each line of the <code>wav.scp</code>, there are generally two components <code>WAV_ID</code> and <code>SPEECH_AUDIO</code> for each line of the file. The <code>WAV_ID</code> is an identifier for the utterance, while the <code>SPEECH_AUDIO</code> holds the speech audio data.</p><p>The audio data can be in various audio formats, such as <code>wav</code>, <code>flac</code>, <code>sph</code>, etc. We can also use pipe to normalize audio files with (e.g., <code>sox</code>, <code>ffmpeg</code>, <code>sph2pipe</code>). The following from an4 is an example using <code>sph2pipe</code>.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">head -n </span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;"> data/train_nodev/wav.scp</span></span>
37
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>The <code>text</code> is to hold the transription of the speech. Similar to <code>wav.scp</code>, for each line of <code>text</code>, there are <code>UTT_ID</code> and <code>TRANSCRIPTION</code>. Note that the <code>UTT_ID</code> in <code>text</code> and <code>WAV_ID</code> in <code>wav.scp</code> are not necessary the same. But for the simple case (e.g., the <code>AN4</code>), we regard them as the same. The example in <code>AN4</code> is as:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">head -n </span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;"> data/train_nodev/text</span></span>
38
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>The <code>spk2utt</code> and <code>utt2spk</code> are mapping between utterances and speakers. The information is widely used in conventional hidden Markov model (HMM)-based ASR systems, but not that popular in end-to-end ASR systems nowadays. However, they are still very important for tasks such as speaker diarization and multi-speaker text-to-speech. The examples of AN4 is as follows:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">head -n </span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;"> data/train_nodev/spk2utt</span></span>
39
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">echo </span><span style="color:#CE9178;">&quot;--------------------------&quot;</span></span>
40
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">head -n </span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;"> data/train_nodev/utt2spk</span></span>
41
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="how-to-read-file-in-pipe" tabindex="-1"><a class="header-anchor" href="#how-to-read-file-in-pipe"><span>How to read file in pipe</span></a></h2><p>We can use <code>kaldiio</code> package to read audio files from <code>wav.scp</code>. The example is as follows:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> soundfile</span></span>
42
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> kaldiio</span></span>
43
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> matplotlib.pyplot </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> plt</span></span>
44
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> io </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> BytesIO</span></span>
45
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> tqdm </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> tqdm</span></span>
46
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa.display</span></span>
47
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> numpy </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> np</span></span>
48
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> IPython.display </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> ipd</span></span>
49
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> os</span></span>
50
+ <span class="line"></span>
51
+ <span class="line"><span style="color:#D4D4D4;">os.environ[</span><span style="color:#CE9178;">&#39;PATH&#39;</span><span style="color:#D4D4D4;">] = os.environ[</span><span style="color:#CE9178;">&#39;PATH&#39;</span><span style="color:#D4D4D4;">] + </span><span style="color:#CE9178;">&quot;:/content/espnet/tools/sph2pipe&quot;</span></span>
52
+ <span class="line"></span>
53
+ <span class="line"><span style="color:#D4D4D4;">wavscp = </span><span style="color:#DCDCAA;">open</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;data/test/wav.scp&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">)</span></span>
54
+ <span class="line"></span>
55
+ <span class="line"><span style="color:#D4D4D4;">num_wav = </span><span style="color:#B5CEA8;">5</span></span>
56
+ <span class="line"><span style="color:#D4D4D4;">count = </span><span style="color:#B5CEA8;">1</span></span>
57
+ <span class="line"><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> line </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> tqdm(wavscp):</span></span>
58
+ <span class="line"><span style="color:#D4D4D4;"> utt_id, wavpath = line.strip().split(</span><span style="color:#569CD6;">None</span><span style="color:#D4D4D4;">, </span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">)</span></span>
59
+ <span class="line"><span style="color:#C586C0;"> with</span><span style="color:#D4D4D4;"> kaldiio.open_like_kaldi(wavpath, </span><span style="color:#CE9178;">&quot;rb&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
60
+ <span class="line"><span style="color:#C586C0;"> with</span><span style="color:#D4D4D4;"> BytesIO(f.read()) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> g:</span></span>
61
+ <span class="line"><span style="color:#D4D4D4;"> wave, rate = soundfile.read(g, </span><span style="color:#9CDCFE;">dtype</span><span style="color:#D4D4D4;">=np.float32)</span></span>
62
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;audio: </span><span style="color:#569CD6;">{}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">.format(utt_id))</span></span>
63
+ <span class="line"><span style="color:#D4D4D4;"> librosa.display.waveshow(wave, rate)</span></span>
64
+ <span class="line"><span style="color:#D4D4D4;"> plt.show()</span></span>
65
+ <span class="line"></span>
66
+ <span class="line"><span style="color:#D4D4D4;"> ipd.display(ipd.Audio(wave, </span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=rate)) </span><span style="color:#6A9955;"># load a NumPy array</span></span>
67
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#D4D4D4;"> count == num_wav:</span></span>
68
+ <span class="line"><span style="color:#C586C0;"> break</span></span>
69
+ <span class="line"><span style="color:#D4D4D4;"> count += </span><span style="color:#B5CEA8;">1</span></span>
70
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="data-preparation-for-totonac" tabindex="-1"><a class="header-anchor" href="#data-preparation-for-totonac"><span>Data preparation for TOTONAC</span></a></h2><p><strong>CMU TOTONAC recipe</strong></p><p>In the second part of the demonstration, we will use the CMU <code>totonac</code> recipe. This is a small-scale ASR recipe, which is an endangered language in central Mexico. We will follow mostly the similar procedure as the showcase of AN4. For the start, the recipe directory of <code>totonac</code> is almost the same as <code>an4</code>.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">%cd /content/espnet/egs2/totonac/asr1</span></span>
71
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">ls</span></span>
72
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>Then we execute <code>./local/data.sh</code> for the data preparation, which is the same as <code>an4</code>. The downloading takes a longer time (around 2-3 mins) for <code>totonac</code> as the speech is in higher-sampling rate and recorded in a conversational manner which include longer session rather than single utterances.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">!. ../../../tools/activate_python.sh </span><span style="color:#F44747;">&amp;&amp;</span><span style="color:#D4D4D4;"> pip install soundfile </span><span style="color:#6A9955;"># we need soundfile for necessary processing</span></span>
73
+ <span class="line"></span>
74
+ <span class="line"><span style="color:#D4D4D4;">!./local/data.sh</span></span>
75
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Let&#39;s first check the original data format of the <code>totonac</code>. To facilate the linguists working on the language, we use the ELAN format, which is special XML format. For preparation, we need to parse the format into the same Kaldi format as mentioned ahead. For more details, please check https://github.com/espnet/espnet/blob/master/egs2/totonac/asr1/local/data_prep.py</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">ls -l downloads/Conversaciones/Botany/Transcripciones/ELAN-para-traducir | head -n </span><span style="color:#B5CEA8;">5</span></span>
76
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">echo </span><span style="color:#CE9178;">&quot;-----------------------------------------------&quot;</span></span>
77
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cat downloads/Conversaciones/Botany/Transcripciones/ELAN-para-traducir/Zongo_Botan_ESP400-SLC388_Convolvulaceae-Cuscuta-sp_2019-</span><span style="color:#B5CEA8;">0</span><span style="color:#F44747;">9</span><span style="color:#D4D4D4;">-</span><span style="color:#B5CEA8;">25</span><span style="color:#D4D4D4;">-c_ed-</span><span style="color:#B5CEA8;">2020</span><span style="color:#D4D4D4;">-</span><span style="color:#B5CEA8;">12</span><span style="color:#D4D4D4;">-</span><span style="color:#B5CEA8;">30.</span><span style="color:#D4D4D4;">eaf</span></span>
78
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Similar to <code>AN4</code>, we will have three sets for the experiments for <code>totonac</code>, including train, test and dev. However, within the set, we also have a <code>segments</code> file apart from the files mentioned above.</p><p>For each line of <code>segments</code>, we will have four fields for each line, including <code>UTT_ID</code>, <code>WAV_ID</code>, &quot;start time&quot; and &quot;end time&quot;. Note that when <code>segments</code> files are presented, the <code>WAV_ID</code> in <code>wav.scp</code> and <code>UTT_ID</code> in <code>text</code>, <code>utt2spk</code> and <code>spk2utt</code> are not the same anymore. And the <code>segments</code> is the file that keeps the relationship between <code>WAV_ID</code> and <code>UTT_ID</code>.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">ls -l data</span></span>
79
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">echo </span><span style="color:#CE9178;">&quot;--------------------------&quot;</span></span>
80
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">ls -l data/train</span></span>
81
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">echo </span><span style="color:#CE9178;">&quot;------------- wav.scp file -------------&quot;</span></span>
82
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">head -n </span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;"> data/train/wav.scp</span></span>
83
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">echo </span><span style="color:#CE9178;">&quot;------------- Segment file -------------&quot;</span></span>
84
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">head -n </span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;"> data/train/segments</span></span>
85
+ <span class="line"></span>
86
+ <span class="line"></span>
87
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>#Questions:</p><p><strong>Q1: The format itself is very general. But it cannot fit to all the tasks in speech processing. Could you list three tasks where the current format cannot be sufficient?</strong></p><p><em>Your Answers here</em></p><p><strong>Q2: For the three tasks you listed above, can you think of some modification or addition to the format to make it also working for the tasks?</strong></p><p><em>Your Answers here</em></p><p><strong>Q3: Briefly discuss the difference within the <code>wav.scp</code> between <code>an4</code> and <code>totonac</code></strong></p><p><em>Your Answers here</em></p><p>(Note that for this assignment, you do not need to submit anything.)</p>`,66);function f(g,k){const e=o("ExternalLinkIcon");return t(),i("div",null,[c,d,u,s("ul",null,[s("li",null,[s("a",h,[a("ESPnet repository"),n(e)])]),s("li",null,[s("a",D,[a("ESPnet documentation"),n(e)])]),s("li",null,[s("a",v,[a("ESPnet tutorial in Speech Recognition and Understanding (Fall 2021)"),n(e)])]),s("li",null,[s("a",m,[a("Recitation in Multilingual NLP (Spring 2022)"),n(e)])]),s("li",null,[s("a",y,[a("ESPnet tutorail in Speech Recognition and Understanding (Fall 2022)"),n(e)])])]),b])}const w=l(r,[["render",f],["__file","DataPreparation_CMU_11492_692_Spring2023(Assignment0).html.vue"]]),C=JSON.parse('{"path":"/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html","title":"CMU 11492/11692 Spring 2023: Data preparation","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Objectives","slug":"objectives","link":"#objectives","children":[]},{"level":2,"title":"Useful 
links","slug":"useful-links","link":"#useful-links","children":[]},{"level":2,"title":"Download ESPnet","slug":"download-espnet","link":"#download-espnet","children":[]},{"level":2,"title":"Setup Python environment based on anaconda","slug":"setup-python-environment-based-on-anaconda","link":"#setup-python-environment-based-on-anaconda","children":[]},{"level":2,"title":"Data preparation for AN4","slug":"data-preparation-for-an4","link":"#data-preparation-for-an4","children":[]},{"level":2,"title":"How to read file in pipe","slug":"how-to-read-file-in-pipe","link":"#how-to-read-file-in-pipe","children":[]},{"level":2,"title":"Data preparation for TOTONAC","slug":"data-preparation-for-totonac","link":"#data-preparation-for-totonac","children":[]}],"git":{},"filePathRelative":"tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).md"}');export{w as comp,C as data};
assets/NpmBadge-rh9tvaXX.js ADDED
@@ -0,0 +1 @@
 
 
1
+ import{f as r,g as t,o,c as p,a as g,_ as d}from"./app-DTS6SjJz.js";const l=["href","title"],i=["src","alt"],m=r({__name:"NpmBadge",props:{package:{type:String,required:!0},distTag:{type:String,required:!1,default:"next"}},setup(a){const e=a,n=t(()=>`https://www.npmjs.com/package/${e.package}`),c=t(()=>e.distTag?`${e.package}@${e.distTag}`:e.package),s=t(()=>`https://badgen.net/npm/v/${e.package}/${e.distTag}?label=${encodeURIComponent(c.value)}`);return(u,_)=>(o(),p("a",{class:"npm-badge",href:n.value,title:a.package,target:"_blank",rel:"noopener noreferrer"},[g("img",{src:s.value,alt:a.package},null,8,i)],8,l))}}),k=d(m,[["__scopeId","data-v-c758b2a0"],["__file","NpmBadge.vue"]]);export{k as default};
assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as l,r as p,o as t,c,a as n,d as s,b as e,e as i}from"./app-DTS6SjJz.js";const d={},r=n("h1",{id:"cmu-11492-11692-spring-2023-speech-enhancement",tabindex:"-1"},[n("a",{class:"header-anchor",href:"#cmu-11492-11692-spring-2023-speech-enhancement"},[n("span",null,"CMU 11492/11692 Spring 2023: Speech Enhancement")])],-1),o=n("p",null,"In this demonstration, we will show you some demonstrations of speech enhancement systems in ESPnet.",-1),u=n("p",null,"Main references:",-1),v={href:"https://github.com/espnet/espnet",target:"_blank",rel:"noopener noreferrer"},m={href:"https://espnet.github.io/espnet/",target:"_blank",rel:"noopener noreferrer"},h={href:"https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/enh1",target:"_blank",rel:"noopener noreferrer"},b=n("p",null,"Author:",-1),_=n("ul",null,[n("li",null,"Siddhant Arora (siddhana@andrew.cmu.edu)")],-1),f={href:"https://colab.research.google.com/drive/1faFfqWNFe1QW3Q1PMwRXlNDwaBms__Ho?usp=sharing",target:"_blank",rel:"noopener noreferrer"},x=i(`<h2 id="❗important-notes❗" tabindex="-1"><a class="header-anchor" href="#❗important-notes❗"><span>❗Important Notes❗</span></a></h2><ul><li>We are using Colab to show the demo. However, Colab has some constraints on the total GPU runtime. If you use too much GPU time, you may not be able to use GPU for some time.</li><li>There are multiple in-class checkpoints ✅ throughout this tutorial. <strong>Your participation points are based on these tasks.</strong> Please try your best to follow all the steps! If you encounter issues, please notify the TAs as soon as possible so that we can make an adjustment for you.</li><li>Please submit PDF files of your completed notebooks to Gradescope. 
You can print the notebook using <code>File -&gt; Print</code> in the menu bar.You also need to submit the spectrogram and waveform of noisy and enhanced audio files to Gradescope.</li></ul><h1 id="contents" tabindex="-1"><a class="header-anchor" href="#contents"><span>Contents</span></a></h1><p>Tutorials on the Basic Usage</p><ol><li><p>Install</p></li><li><p>Speech Enhancement with Pretrained Models</p></li></ol><blockquote><p>We support various interfaces, e.g. Python API, HuggingFace API, portable speech enhancement scripts for other tasks, etc.</p></blockquote><p>2.1 Single-channel Enhancement (CHiME-4)</p><p>2.2 Enhance Your Own Recordings</p><p>2.3 Multi-channel Enhancement (CHiME-4)</p><ol start="3"><li>Speech Separation with Pretrained Models</li></ol><p>3.1 Model Selection</p><p>3.2 Separate Speech Mixture</p><ol start="4"><li>Evaluate Separated Speech with the Pretrained ASR Model</li></ol><p>Tutorials on the Basic Usage</p><h2 id="install" tabindex="-1"><a class="header-anchor" href="#install"><span>Install</span></a></h2><p>Different from previous assignment where we install the full version of ESPnet, we use a lightweight ESPnet package, which mainly designed for inference purpose. The installation with the light version can be much faster than a full installation.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import locale</span></span>
2
+ <span class="line"><span>locale.getpreferredencoding = lambda: &quot;UTF-8&quot;</span></span>
3
+ <span class="line"><span>%pip uninstall torch</span></span>
4
+ <span class="line"><span>%pip install torch==1.13.0+cu117 torchvision==0.14.0+cu117 torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cu117</span></span>
5
+ <span class="line"><span>%pip install -q git+https://github.com/espnet/espnet</span></span>
6
+ <span class="line"><span>%pip install -q espnet_model_zoo</span></span>
7
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="speech-enhancement-with-pretrained-models" tabindex="-1"><a class="header-anchor" href="#speech-enhancement-with-pretrained-models"><span>Speech Enhancement with Pretrained Models</span></a></h2><h3 id="single-channel-enhancement-the-chime-example" tabindex="-1"><a class="header-anchor" href="#single-channel-enhancement-the-chime-example"><span>Single-Channel Enhancement, the CHiME example</span></a></h3><h3 id="task1-✅-checkpoint-1-1-point" tabindex="-1"><a class="header-anchor" href="#task1-✅-checkpoint-1-1-point"><span>Task1 (✅ Checkpoint 1 (1 point))</span></a></h3><p>Run inference of pretrained single-channel enhancement model.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Download one utterance from real noisy speech of CHiME4</span></span>
8
+ <span class="line"><span>!gdown --id 1SmrN5NFSg6JuQSs2sfy3ehD8OIcqK6wS -O /content/M05_440C0213_PED_REAL.wav</span></span>
9
+ <span class="line"><span>import os</span></span>
10
+ <span class="line"><span></span></span>
11
+ <span class="line"><span>import soundfile</span></span>
12
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
13
+ <span class="line"><span>mixwav_mc, sr = soundfile.read(&quot;/content/M05_440C0213_PED_REAL.wav&quot;)</span></span>
14
+ <span class="line"><span># mixwav.shape: num_samples, num_channels</span></span>
15
+ <span class="line"><span>mixwav_sc = mixwav_mc[:,4]</span></span>
16
+ <span class="line"><span>display(Audio(mixwav_mc.T, rate=sr))</span></span>
17
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="download-and-load-the-pretrained-conv-tasnet" tabindex="-1"><a class="header-anchor" href="#download-and-load-the-pretrained-conv-tasnet"><span>Download and load the pretrained Conv-Tasnet</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!gdown --id 17DMWdw84wF3fz3t7ia1zssdzhkpVQGZm -O /content/chime_tasnet_singlechannel.zip</span></span>
18
+ <span class="line"><span>!unzip /content/chime_tasnet_singlechannel.zip -d /content/enh_model_sc</span></span>
19
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Load the model</span></span>
20
+ <span class="line"><span># If you encounter error &quot;No module named &#39;espnet2&#39;&quot;, please re-run the 1st Cell. This might be a colab bug.</span></span>
21
+ <span class="line"><span>import sys</span></span>
22
+ <span class="line"><span>import soundfile</span></span>
23
+ <span class="line"><span>from espnet2.bin.enh_inference import SeparateSpeech</span></span>
24
+ <span class="line"><span></span></span>
25
+ <span class="line"><span></span></span>
26
+ <span class="line"><span>separate_speech = {}</span></span>
27
+ <span class="line"><span># For models downloaded from GoogleDrive, you can use the following script:</span></span>
28
+ <span class="line"><span>enh_model_sc = SeparateSpeech(</span></span>
29
+ <span class="line"><span> train_config=&quot;/content/enh_model_sc/exp/enh_train_enh_conv_tasnet_raw/config.yaml&quot;,</span></span>
30
+ <span class="line"><span> model_file=&quot;/content/enh_model_sc/exp/enh_train_enh_conv_tasnet_raw/5epoch.pth&quot;,</span></span>
31
+ <span class="line"><span> # for segment-wise process on long speech</span></span>
32
+ <span class="line"><span> normalize_segment_scale=False,</span></span>
33
+ <span class="line"><span> show_progressbar=True,</span></span>
34
+ <span class="line"><span> ref_channel=4,</span></span>
35
+ <span class="line"><span> normalize_output_wav=True,</span></span>
36
+ <span class="line"><span> device=&quot;cuda:0&quot;,</span></span>
37
+ <span class="line"><span>)</span></span>
38
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="enhance-the-single-channel-real-noisy-speech-in-chime4" tabindex="-1"><a class="header-anchor" href="#enhance-the-single-channel-real-noisy-speech-in-chime4"><span>Enhance the single-channel real noisy speech in CHiME4</span></a></h4><p>Please submit the screenshot of output of current block and the spectogram and waveform of noisy and enhanced speech file to Gradescope for Task 1.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># play the enhanced single-channel speech</span></span>
39
+ <span class="line"><span>wave = enh_model_sc(mixwav_sc[None, ...], sr)</span></span>
40
+ <span class="line"><span></span></span>
41
+ <span class="line"><span>print(&quot;Input real noisy speech&quot;, flush=True)</span></span>
42
+ <span class="line"><span>display(Audio(mixwav_sc, rate=sr))</span></span>
43
+ <span class="line"><span>print(&quot;Enhanced speech&quot;, flush=True)</span></span>
44
+ <span class="line"><span>display(Audio(wave[0].squeeze(), rate=sr))</span></span>
45
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="multi-channel-enhancement" tabindex="-1"><a class="header-anchor" href="#multi-channel-enhancement"><span>Multi-Channel Enhancement</span></a></h3><h4 id="download-and-load-the-pretrained-mvdr-neural-beamformer" tabindex="-1"><a class="header-anchor" href="#download-and-load-the-pretrained-mvdr-neural-beamformer"><span>Download and load the pretrained mvdr neural beamformer.</span></a></h4><h3 id="task2-✅-checkpoint-2-1-point" tabindex="-1"><a class="header-anchor" href="#task2-✅-checkpoint-2-1-point"><span>Task2 (✅ Checkpoint 2 (1 point))</span></a></h3><p>Run inference of pretrained multi-channel enhancement model.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Download the pretained enhancement model</span></span>
46
+ <span class="line"><span></span></span>
47
+ <span class="line"><span>!gdown --id 1FohDfBlOa7ipc9v2luY-QIFQ_GJ1iW_i -O /content/mvdr_beamformer_16k_se_raw_valid.zip</span></span>
48
+ <span class="line"><span>!unzip /content/mvdr_beamformer_16k_se_raw_valid.zip -d /content/enh_model_mc </span></span>
49
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Load the model</span></span>
50
+ <span class="line"><span># If you encounter error &quot;No module named &#39;espnet2&#39;&quot;, please re-run the 1st Cell. This might be a colab bug.</span></span>
51
+ <span class="line"><span>import sys</span></span>
52
+ <span class="line"><span>import soundfile</span></span>
53
+ <span class="line"><span>from espnet2.bin.enh_inference import SeparateSpeech</span></span>
54
+ <span class="line"><span></span></span>
55
+ <span class="line"><span></span></span>
56
+ <span class="line"><span>separate_speech = {}</span></span>
57
+ <span class="line"><span># For models downloaded from GoogleDrive, you can use the following script:</span></span>
58
+ <span class="line"><span>enh_model_mc = SeparateSpeech(</span></span>
59
+ <span class="line"><span> train_config=&quot;/content/enh_model_mc/exp/enh_train_enh_beamformer_mvdr_raw/config.yaml&quot;,</span></span>
60
+ <span class="line"><span> model_file=&quot;/content/enh_model_mc/exp/enh_train_enh_beamformer_mvdr_raw/11epoch.pth&quot;,</span></span>
61
+ <span class="line"><span> # for segment-wise process on long speech</span></span>
62
+ <span class="line"><span> normalize_segment_scale=False,</span></span>
63
+ <span class="line"><span> show_progressbar=True,</span></span>
64
+ <span class="line"><span> ref_channel=4,</span></span>
65
+ <span class="line"><span> normalize_output_wav=True,</span></span>
66
+ <span class="line"><span> device=&quot;cuda:0&quot;,</span></span>
67
+ <span class="line"><span>)</span></span>
68
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="enhance-the-multi-channel-real-noisy-speech-in-chime4" tabindex="-1"><a class="header-anchor" href="#enhance-the-multi-channel-real-noisy-speech-in-chime4"><span>Enhance the multi-channel real noisy speech in CHiME4</span></a></h4><p>Please submit the screenshot of output of current block and the spectrogram and waveform of noisy and enhanced speech file to Gradescope for Task 2.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>wave = enh_model_mc(mixwav_mc[None, ...], sr)</span></span>
69
+ <span class="line"><span>print(&quot;Input real noisy speech&quot;, flush=True)</span></span>
70
+ <span class="line"><span>display(Audio(mixwav_mc.T, rate=sr))</span></span>
71
+ <span class="line"><span>print(&quot;Enhanced speech&quot;, flush=True)</span></span>
72
+ <span class="line"><span>display(Audio(wave[0].squeeze(), rate=sr))</span></span>
73
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="portable-speech-enhancement-scripts-for-other-tasks" tabindex="-1"><a class="header-anchor" href="#portable-speech-enhancement-scripts-for-other-tasks"><span>Portable speech enhancement scripts for other tasks</span></a></h4><p>For an ESPNet ASR or TTS dataset like below:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>data</span></span>
74
+ <span class="line"><span>\`-- et05_real_isolated_6ch_track</span></span>
75
+ <span class="line"><span> |-- spk2utt</span></span>
76
+ <span class="line"><span> |-- text</span></span>
77
+ <span class="line"><span> |-- utt2spk</span></span>
78
+ <span class="line"><span> |-- utt2uniq</span></span>
79
+ <span class="line"><span> \`-- wav.scp</span></span>
80
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Run the following scripts to create an enhanced dataset:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>scripts/utils/enhance_dataset.sh \\</span></span>
81
+ <span class="line"><span> --spk_num 1 \\</span></span>
82
+ <span class="line"><span> --gpu_inference true \\</span></span>
83
+ <span class="line"><span> --inference_nj 4 \\</span></span>
84
+ <span class="line"><span> --fs 16k \\</span></span>
85
+ <span class="line"><span> --id_prefix &quot;&quot; \\</span></span>
86
+ <span class="line"><span> dump/raw/et05_real_isolated_6ch_track \\</span></span>
87
+ <span class="line"><span> data/et05_real_isolated_6ch_track_enh \\</span></span>
88
+ <span class="line"><span> exp/enh_train_enh_beamformer_mvdr_raw/valid.loss.best.pth</span></span>
89
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>The above script will generate a new directory data/et05_real_isolated_6ch_track_enh:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>data</span></span>
90
+ <span class="line"><span>\`-- et05_real_isolated_6ch_track_enh</span></span>
91
+ <span class="line"><span> |-- spk2utt</span></span>
92
+ <span class="line"><span> |-- text</span></span>
93
+ <span class="line"><span> |-- utt2spk</span></span>
94
+ <span class="line"><span> |-- utt2uniq</span></span>
95
+ <span class="line"><span> |-- wav.scp</span></span>
96
+ <span class="line"><span> \`-- wavs/</span></span>
97
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>where wav.scp contains paths to the enhanced audios (stored in wavs/).</p><h2 id="speech-separation" tabindex="-1"><a class="header-anchor" href="#speech-separation"><span>Speech Separation</span></a></h2><h3 id="model-selection" tabindex="-1"><a class="header-anchor" href="#model-selection"><span>Model Selection</span></a></h3><p>In this demonstration, we will show different speech separation models on wsj0_2mix.</p>`,48),g={href:"https://zenodo.org/",target:"_blank",rel:"noopener noreferrer"},k={href:"https://huggingface.co/",target:"_blank",rel:"noopener noreferrer"},q=i(`<div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!gdown --id 1TasZxZSnbSPsk_Wf7ZDhBAigS6zN8G9G -O enh_train_enh_tfgridnet_tf_lr-patience3_patience5_raw_valid.loss.ave.zip</span></span>
98
+ <span class="line"><span>!unzip enh_train_enh_tfgridnet_tf_lr-patience3_patience5_raw_valid.loss.ave.zip -d /content/enh_model_ss</span></span>
99
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import sys</span></span>
100
+ <span class="line"><span>import soundfile</span></span>
101
+ <span class="line"><span>from espnet2.bin.enh_inference import SeparateSpeech</span></span>
102
+ <span class="line"><span></span></span>
103
+ <span class="line"><span># For models downloaded from GoogleDrive, you can use the following script:</span></span>
104
+ <span class="line"><span>separate_speech = SeparateSpeech(</span></span>
105
+ <span class="line"><span> train_config=&quot;/content/enh_model_ss/exp/enh_train_enh_tfgridnet_tf_lr-patience3_patience5_raw/config.yaml&quot;,</span></span>
106
+ <span class="line"><span> model_file=&quot;/content/enh_model_ss/exp/enh_train_enh_tfgridnet_tf_lr-patience3_patience5_raw/98epoch.pth&quot;,</span></span>
107
+ <span class="line"><span> # for segment-wise process on long speech</span></span>
108
+ <span class="line"><span> segment_size=2.4,</span></span>
109
+ <span class="line"><span> hop_size=0.8,</span></span>
110
+ <span class="line"><span> normalize_segment_scale=False,</span></span>
111
+ <span class="line"><span> show_progressbar=True,</span></span>
112
+ <span class="line"><span> ref_channel=None,</span></span>
113
+ <span class="line"><span> normalize_output_wav=True,</span></span>
114
+ <span class="line"><span> device=&quot;cuda:0&quot;,</span></span>
115
+ <span class="line"><span>)</span></span>
116
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="separate-speech-mixture" tabindex="-1"><a class="header-anchor" href="#separate-speech-mixture"><span>Separate Speech Mixture</span></a></h3><h4 id="separate-the-example-in-wsj0-2mix-testing-set" tabindex="-1"><a class="header-anchor" href="#separate-the-example-in-wsj0-2mix-testing-set"><span>Separate the example in wsj0_2mix testing set</span></a></h4><h3 id="task3-✅-checkpoint-3-1-point" tabindex="-1"><a class="header-anchor" href="#task3-✅-checkpoint-3-1-point"><span>Task3 (✅ Checkpoint 3 (1 point))</span></a></h3><p>Run inference of pretrained speech seperation model based on TF-GRIDNET.</p><p>Please submit the screenshot of output of current block and the spectrogram and waveform of mixed and seperated speech files to Gradescope for Task 3.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!gdown --id 1ZCUkd_Lb7pO2rpPr4FqYdtJBZ7JMiInx -O /content/447c020t_1.2106_422a0112_-1.2106.wav</span></span>
117
+ <span class="line"><span></span></span>
118
+ <span class="line"><span>import os</span></span>
119
+ <span class="line"><span>import soundfile</span></span>
120
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
121
+ <span class="line"><span></span></span>
122
+ <span class="line"><span>mixwav, sr = soundfile.read(&quot;447c020t_1.2106_422a0112_-1.2106.wav&quot;)</span></span>
123
+ <span class="line"><span>waves_wsj = separate_speech(mixwav[None, ...], fs=sr)</span></span>
124
+ <span class="line"><span></span></span>
125
+ <span class="line"><span>print(&quot;Input mixture&quot;, flush=True)</span></span>
126
+ <span class="line"><span>display(Audio(mixwav, rate=sr))</span></span>
127
+ <span class="line"><span>print(f&quot;========= Separated speech with model =========&quot;, flush=True)</span></span>
128
+ <span class="line"><span>print(&quot;Separated spk1&quot;, flush=True)</span></span>
129
+ <span class="line"><span>display(Audio(waves_wsj[0].squeeze(), rate=sr))</span></span>
130
+ <span class="line"><span>print(&quot;Separated spk2&quot;, flush=True)</span></span>
131
+ <span class="line"><span>display(Audio(waves_wsj[1].squeeze(), rate=sr))</span></span>
132
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="show-spectrums-of-separated-speech" tabindex="-1"><a class="header-anchor" href="#show-spectrums-of-separated-speech"><span>Show spectrums of separated speech</span></a></h4><p>Show wavform and spectrogram of mixed and seperated speech.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import matplotlib.pyplot as plt</span></span>
133
+ <span class="line"><span>import torch</span></span>
134
+ <span class="line"><span>from torch_complex.tensor import ComplexTensor</span></span>
135
+ <span class="line"><span></span></span>
136
+ <span class="line"><span>from espnet.asr.asr_utils import plot_spectrogram</span></span>
137
+ <span class="line"><span>from espnet2.layers.stft import Stft</span></span>
138
+ <span class="line"><span></span></span>
139
+ <span class="line"><span></span></span>
140
+ <span class="line"><span>stft = Stft(</span></span>
141
+ <span class="line"><span> n_fft=512,</span></span>
142
+ <span class="line"><span> win_length=None,</span></span>
143
+ <span class="line"><span> hop_length=128,</span></span>
144
+ <span class="line"><span> window=&quot;hann&quot;,</span></span>
145
+ <span class="line"><span>)</span></span>
146
+ <span class="line"><span>ilens = torch.LongTensor([len(mixwav)])</span></span>
147
+ <span class="line"><span># specs: (T, F)</span></span>
148
+ <span class="line"><span>spec_mix = ComplexTensor(</span></span>
149
+ <span class="line"><span> *torch.unbind(</span></span>
150
+ <span class="line"><span> stft(torch.as_tensor(mixwav).unsqueeze(0), ilens)[0].squeeze(),</span></span>
151
+ <span class="line"><span> dim=-1</span></span>
152
+ <span class="line"><span> )</span></span>
153
+ <span class="line"><span>)</span></span>
154
+ <span class="line"><span>spec_sep1 = ComplexTensor(</span></span>
155
+ <span class="line"><span> *torch.unbind(</span></span>
156
+ <span class="line"><span> stft(torch.as_tensor(waves_wsj[0]), ilens)[0].squeeze(),</span></span>
157
+ <span class="line"><span> dim=-1</span></span>
158
+ <span class="line"><span> )</span></span>
159
+ <span class="line"><span>)</span></span>
160
+ <span class="line"><span>spec_sep2 = ComplexTensor(</span></span>
161
+ <span class="line"><span> *torch.unbind(</span></span>
162
+ <span class="line"><span> stft(torch.as_tensor(waves_wsj[1]), ilens)[0].squeeze(),</span></span>
163
+ <span class="line"><span> dim=-1</span></span>
164
+ <span class="line"><span> )</span></span>
165
+ <span class="line"><span>)</span></span>
166
+ <span class="line"><span></span></span>
167
+ <span class="line"><span>samples = torch.linspace(0, len(mixwav) / sr, len(mixwav))</span></span>
168
+ <span class="line"><span>plt.figure(figsize=(24, 12))</span></span>
169
+ <span class="line"><span>plt.subplot(3, 2, 1)</span></span>
170
+ <span class="line"><span>plt.title(&#39;Mixture Spectrogram&#39;)</span></span>
171
+ <span class="line"><span>plot_spectrogram(</span></span>
172
+ <span class="line"><span> plt, abs(spec_mix).transpose(-1, -2).numpy(), fs=sr,</span></span>
173
+ <span class="line"><span> mode=&#39;db&#39;, frame_shift=None,</span></span>
174
+ <span class="line"><span> bottom=False, labelbottom=False</span></span>
175
+ <span class="line"><span>)</span></span>
176
+ <span class="line"><span>plt.subplot(3, 2, 2)</span></span>
177
+ <span class="line"><span>plt.title(&#39;Mixture Wavform&#39;)</span></span>
178
+ <span class="line"><span>plt.plot(samples, mixwav)</span></span>
179
+ <span class="line"><span>plt.xlim(0, len(mixwav) / sr)</span></span>
180
+ <span class="line"><span></span></span>
181
+ <span class="line"><span>plt.subplot(3, 2, 3)</span></span>
182
+ <span class="line"><span>plt.title(&#39;Separated Spectrogram (spk1)&#39;)</span></span>
183
+ <span class="line"><span>plot_spectrogram(</span></span>
184
+ <span class="line"><span> plt, abs(spec_sep1).transpose(-1, -2).numpy(), fs=sr,</span></span>
185
+ <span class="line"><span> mode=&#39;db&#39;, frame_shift=None,</span></span>
186
+ <span class="line"><span> bottom=False, labelbottom=False</span></span>
187
+ <span class="line"><span>)</span></span>
188
+ <span class="line"><span>plt.subplot(3, 2, 4)</span></span>
189
+ <span class="line"><span>plt.title(&#39;Separated Wavform (spk1)&#39;)</span></span>
190
+ <span class="line"><span>plt.plot(samples, waves_wsj[0].squeeze())</span></span>
191
+ <span class="line"><span>plt.xlim(0, len(mixwav) / sr)</span></span>
192
+ <span class="line"><span></span></span>
193
+ <span class="line"><span>plt.subplot(3, 2, 5)</span></span>
194
+ <span class="line"><span>plt.title(&#39;Separated Spectrogram (spk2)&#39;)</span></span>
195
+ <span class="line"><span>plot_spectrogram(</span></span>
196
+ <span class="line"><span> plt, abs(spec_sep2).transpose(-1, -2).numpy(), fs=sr,</span></span>
197
+ <span class="line"><span> mode=&#39;db&#39;, frame_shift=None,</span></span>
198
+ <span class="line"><span> bottom=False, labelbottom=False</span></span>
199
+ <span class="line"><span>)</span></span>
200
+ <span class="line"><span>plt.subplot(3, 2, 6)</span></span>
201
+ <span class="line"><span>plt.title(&#39;Separated Wavform (spk2)&#39;)</span></span>
202
+ <span class="line"><span>plt.plot(samples, waves_wsj[1].squeeze())</span></span>
203
+ <span class="line"><span>plt.xlim(0, len(mixwav) / sr)</span></span>
204
+ <span class="line"><span>plt.xlabel(&quot;Time (s)&quot;)</span></span>
205
+ <span class="line"><span>plt.show()</span></span>
206
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div 
class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="evaluate-separated-speech-with-pretrained-asr-model" tabindex="-1"><a class="header-anchor" href="#evaluate-separated-speech-with-pretrained-asr-model"><span>Evaluate separated speech with pretrained ASR model</span></a></h2><p>The ground truths are:</p><p><code>text_1: SOME CRITICS INCLUDING HIGH REAGAN ADMINISTRATION OFFICIALS ARE RAISING THE ALARM THAT THE FED&#39;S POLICY IS TOO TIGHT AND COULD CAUSE A RECESSION NEXT YEAR</code></p><p><code>text_2: THE UNITED STATES UNDERTOOK TO DEFEND WESTERN EUROPE AGAINST SOVIET ATTACK</code></p><p>(This may take a while for the speech recognition.)</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>%pip install -q https://github.com/kpu/kenlm/archive/master.zip # ASR needs kenlm</span></span>
207
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="task4-✅-checkpoint-4-1-point" tabindex="-1"><a class="header-anchor" href="#task4-✅-checkpoint-4-1-point"><span>Task4 (✅ Checkpoint 4 (1 point))</span></a></h3><p>Show inference of pre-trained ASR model on mixed and seperated speech.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!gdown --id 1H7--jXTTwmwxzfO8LT5kjZyBjng-HxED -O asr_train_asr_transformer_raw_char_1gpu_valid.acc.ave.zip</span></span>
208
+ <span class="line"><span>!unzip asr_train_asr_transformer_raw_char_1gpu_valid.acc.ave.zip -d /content/asr_model</span></span>
209
+ <span class="line"><span>!ln -sf /content/asr_model/exp .</span></span>
210
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Please submit the screenshot of ASR inference on Mix Speech and Separated Speech 1 and Separated Speech 2 files to Gradescope for Task 4.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import espnet_model_zoo</span></span>
211
+ <span class="line"><span>from espnet2.bin.asr_inference import Speech2Text</span></span>
212
+ <span class="line"><span></span></span>
213
+ <span class="line"><span></span></span>
214
+ <span class="line"><span># For models downloaded from GoogleDrive, you can use the following script:</span></span>
215
+ <span class="line"><span>speech2text = Speech2Text(</span></span>
216
+ <span class="line"><span> asr_train_config=&quot;/content/asr_model/exp/asr_train_asr_transformer_raw_char_1gpu/config.yaml&quot;,</span></span>
217
+ <span class="line"><span> asr_model_file=&quot;/content/asr_model/exp/asr_train_asr_transformer_raw_char_1gpu/valid.acc.ave_10best.pth&quot;,</span></span>
218
+ <span class="line"><span> device=&quot;cuda:0&quot;</span></span>
219
+ <span class="line"><span>)</span></span>
220
+ <span class="line"><span></span></span>
221
+ <span class="line"><span>text_est = [None, None]</span></span>
222
+ <span class="line"><span>text_est[0], *_ = speech2text(waves_wsj[0].squeeze())[0]</span></span>
223
+ <span class="line"><span>text_est[1], *_ = speech2text(waves_wsj[1].squeeze())[0]</span></span>
224
+ <span class="line"><span>text_m, *_ = speech2text(mixwav)[0]</span></span>
225
+ <span class="line"><span>print(&quot;Mix Speech to Text: &quot;, text_m)</span></span>
226
+ <span class="line"><span>print(&quot;Separated Speech 1 to Text: &quot;, text_est[0])</span></span>
227
+ <span class="line"><span>print(&quot;Separated Speech 2 to Text: &quot;, text_est[1])</span></span>
228
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import difflib</span></span>
229
+ <span class="line"><span>from itertools import permutations</span></span>
230
+ <span class="line"><span></span></span>
231
+ <span class="line"><span>import editdistance</span></span>
232
+ <span class="line"><span>import numpy as np</span></span>
233
+ <span class="line"><span></span></span>
234
+ <span class="line"><span>colors = dict(</span></span>
235
+ <span class="line"><span> red=lambda text: f&quot;\\033[38;2;255;0;0m{text}\\033[0m&quot; if text else &quot;&quot;,</span></span>
236
+ <span class="line"><span> green=lambda text: f&quot;\\033[38;2;0;255;0m{text}\\033[0m&quot; if text else &quot;&quot;,</span></span>
237
+ <span class="line"><span> yellow=lambda text: f&quot;\\033[38;2;225;225;0m{text}\\033[0m&quot; if text else &quot;&quot;,</span></span>
238
+ <span class="line"><span> white=lambda text: f&quot;\\033[38;2;255;255;255m{text}\\033[0m&quot; if text else &quot;&quot;,</span></span>
239
+ <span class="line"><span> black=lambda text: f&quot;\\033[38;2;0;0;0m{text}\\033[0m&quot; if text else &quot;&quot;,</span></span>
240
+ <span class="line"><span>)</span></span>
241
+ <span class="line"><span></span></span>
242
+ <span class="line"><span>def diff_strings(ref, est):</span></span>
243
+ <span class="line"><span> &quot;&quot;&quot;Reference: https://stackoverflow.com/a/64404008/7384873&quot;&quot;&quot;</span></span>
244
+ <span class="line"><span> ref_str, est_str, err_str = [], [], []</span></span>
245
+ <span class="line"><span> matcher = difflib.SequenceMatcher(None, ref, est)</span></span>
246
+ <span class="line"><span> for opcode, a0, a1, b0, b1 in matcher.get_opcodes():</span></span>
247
+ <span class="line"><span> if opcode == &quot;equal&quot;:</span></span>
248
+ <span class="line"><span> txt = ref[a0:a1]</span></span>
249
+ <span class="line"><span> ref_str.append(txt)</span></span>
250
+ <span class="line"><span> est_str.append(txt)</span></span>
251
+ <span class="line"><span> err_str.append(&quot; &quot; * (a1 - a0))</span></span>
252
+ <span class="line"><span> elif opcode == &quot;insert&quot;:</span></span>
253
+ <span class="line"><span> ref_str.append(&quot;*&quot; * (b1 - b0))</span></span>
254
+ <span class="line"><span> est_str.append(colors[&quot;green&quot;](est[b0:b1]))</span></span>
255
+ <span class="line"><span> err_str.append(colors[&quot;black&quot;](&quot;I&quot; * (b1 - b0)))</span></span>
256
+ <span class="line"><span> elif opcode == &quot;delete&quot;:</span></span>
257
+ <span class="line"><span> ref_str.append(ref[a0:a1])</span></span>
258
+ <span class="line"><span> est_str.append(colors[&quot;red&quot;](&quot;*&quot; * (a1 - a0)))</span></span>
259
+ <span class="line"><span> err_str.append(colors[&quot;black&quot;](&quot;D&quot; * (a1 - a0)))</span></span>
260
+ <span class="line"><span> elif opcode == &quot;replace&quot;:</span></span>
261
+ <span class="line"><span> diff = a1 - a0 - b1 + b0</span></span>
262
+ <span class="line"><span> if diff &gt;= 0:</span></span>
263
+ <span class="line"><span> txt_ref = ref[a0:a1]</span></span>
264
+ <span class="line"><span> txt_est = colors[&quot;yellow&quot;](est[b0:b1]) + colors[&quot;red&quot;](&quot;*&quot; * diff)</span></span>
265
+ <span class="line"><span> txt_err = &quot;S&quot; * (b1 - b0) + &quot;D&quot; * diff</span></span>
266
+ <span class="line"><span> elif diff &lt; 0:</span></span>
267
+ <span class="line"><span> txt_ref = ref[a0:a1] + &quot;*&quot; * -diff</span></span>
268
+ <span class="line"><span> txt_est = colors[&quot;yellow&quot;](est[b0:b1]) + colors[&quot;green&quot;](&quot;*&quot; * -diff)</span></span>
269
+ <span class="line"><span> txt_err = &quot;S&quot; * (b1 - b0) + &quot;I&quot; * -diff</span></span>
270
+ <span class="line"><span></span></span>
271
+ <span class="line"><span> ref_str.append(txt_ref)</span></span>
272
+ <span class="line"><span> est_str.append(txt_est)</span></span>
273
+ <span class="line"><span> err_str.append(colors[&quot;black&quot;](txt_err))</span></span>
274
+ <span class="line"><span> return &quot;&quot;.join(ref_str), &quot;&quot;.join(est_str), &quot;&quot;.join(err_str)</span></span>
275
+ <span class="line"><span></span></span>
276
+ <span class="line"><span></span></span>
277
+ <span class="line"><span>text_ref = [</span></span>
278
+ <span class="line"><span> &quot;SOME CRITICS INCLUDING HIGH REAGAN ADMINISTRATION OFFICIALS ARE RAISING THE ALARM THAT THE FED&#39;S POLICY IS TOO TIGHT AND COULD CAUSE A RECESSION NEXT YEAR&quot;,</span></span>
279
+ <span class="line"><span> &quot;THE UNITED STATES UNDERTOOK TO DEFEND WESTERN EUROPE AGAINST SOVIET ATTACK&quot;,</span></span>
280
+ <span class="line"><span>]</span></span>
281
+ <span class="line"><span></span></span>
282
+ <span class="line"><span>print(&quot;=====================&quot; , flush=True)</span></span>
283
+ <span class="line"><span>perms = list(permutations(range(2)))</span></span>
284
+ <span class="line"><span>string_edit = [</span></span>
285
+ <span class="line"><span> [</span></span>
286
+ <span class="line"><span> editdistance.eval(text_ref[m], text_est[n])</span></span>
287
+ <span class="line"><span> for m, n in enumerate(p)</span></span>
288
+ <span class="line"><span> ]</span></span>
289
+ <span class="line"><span> for p in perms</span></span>
290
+ <span class="line"><span>]</span></span>
291
+ <span class="line"><span></span></span>
292
+ <span class="line"><span>dist = [sum(edist) for edist in string_edit]</span></span>
293
+ <span class="line"><span>perm_idx = np.argmin(dist)</span></span>
294
+ <span class="line"><span>perm = perms[perm_idx]</span></span>
295
+ <span class="line"><span></span></span>
296
+ <span class="line"><span>for i, p in enumerate(perm):</span></span>
297
+ <span class="line"><span> print(&quot;\\n--------------- Text %d ---------------&quot; % (i + 1), flush=True)</span></span>
298
+ <span class="line"><span> ref, est, err = diff_strings(text_ref[i], text_est[p])</span></span>
299
+ <span class="line"><span> print(&quot;REF: &quot; + ref + &quot;\\n&quot; + &quot;HYP: &quot; + est + &quot;\\n&quot; + &quot;ERR: &quot; + err, flush=True)</span></span>
300
+ <span class="line"><span> print(&quot;Edit Distance = {}\\n&quot;.format(string_edit[perm_idx][i]), flush=True)</span></span>
301
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div 
class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="task5-✅-checkpoint-5-1-point" tabindex="-1"><a class="header-anchor" href="#task5-✅-checkpoint-5-1-point"><span>Task5 (✅ Checkpoint 5 (1 point))</span></a></h3><p>Enhance your own pre-recordings. Your input speech can be recorded by yourself or you can also find it from other sources (e.g., youtube).</p><p>Discuss whether input speech was clearly denoised, and if not, what would be a potential reason.</p><p>[YOUR ANSWER HERE]</p><p>Please submit the spectrogram and waveform of your input and enhanced speech to GradeScope for Task 5 along with the screenshot of your answer.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from google.colab import files</span></span>
302
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
303
+ <span class="line"><span>import soundfile</span></span>
304
+ <span class="line"><span>fs = 16000 </span></span>
305
+ <span class="line"><span>uploaded = files.upload()</span></span>
306
+ <span class="line"><span></span></span>
307
+ <span class="line"><span>for file_name in uploaded.keys():</span></span>
308
+ <span class="line"><span> speech, rate = soundfile.read(file_name)</span></span>
309
+ <span class="line"><span> assert rate == fs, &quot;mismatch in sampling rate&quot;</span></span>
310
+ <span class="line"><span> wave = enh_model_sc(speech[None, ...], fs)</span></span>
311
+ <span class="line"><span> print(f&quot;Your input speech {file_name}&quot;, flush=True)</span></span>
312
+ <span class="line"><span> display(Audio(speech, rate=fs))</span></span>
313
+ <span class="line"><span> print(f&quot;Enhanced speech for {file_name}&quot;, flush=True)</span></span>
314
+ <span class="line"><span> display(Audio(wave[0].squeeze(), rate=fs))</span></span>
315
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div>`,29);function w(E,S){const a=p("ExternalLinkIcon");return t(),c("div",null,[r,o,u,n("ul",null,[n("li",null,[n("a",v,[s("ESPnet repository"),e(a)])]),n("li",null,[n("a",m,[s("ESPnet documentation"),e(a)])]),n("li",null,[n("a",h,[s("ESPnet-SE repo"),e(a)])])]),b,_,n("p",null,[s("The notebook is adapted from this "),n("a",f,[s("Colab"),e(a)])]),x,n("p",null,[s("The pretrained models can be download from a direct URL, or from "),n("a",g,[s("zenodo"),e(a)]),s(" and "),n("a",k,[s("huggingface"),e(a)]),s(" with the corresponding model ID.")]),q])}const y=l(d,[["render",w],["__file","SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html.vue"]]),D=JSON.parse('{"path":"/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html","title":"CMU 11492/11692 Spring 2023: Speech Enhancement","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"❗Important Notes❗","slug":"❗important-notes❗","link":"#❗important-notes❗","children":[]},{"level":2,"title":"Install","slug":"install","link":"#install","children":[]},{"level":2,"title":"Speech Enhancement with Pretrained Models","slug":"speech-enhancement-with-pretrained-models","link":"#speech-enhancement-with-pretrained-models","children":[{"level":3,"title":"Single-Channel Enhancement, the CHiME example","slug":"single-channel-enhancement-the-chime-example","link":"#single-channel-enhancement-the-chime-example","children":[]},{"level":3,"title":"Task1 (✅ Checkpoint 1 (1 
point))","slug":"task1-✅-checkpoint-1-1-point","link":"#task1-✅-checkpoint-1-1-point","children":[]},{"level":3,"title":"Multi-Channel Enhancement","slug":"multi-channel-enhancement","link":"#multi-channel-enhancement","children":[]},{"level":3,"title":"Task2 (✅ Checkpoint 2 (1 point))","slug":"task2-✅-checkpoint-2-1-point","link":"#task2-✅-checkpoint-2-1-point","children":[]}]},{"level":2,"title":"Speech Separation","slug":"speech-separation","link":"#speech-separation","children":[{"level":3,"title":"Model Selection","slug":"model-selection","link":"#model-selection","children":[]},{"level":3,"title":"Separate Speech Mixture","slug":"separate-speech-mixture","link":"#separate-speech-mixture","children":[]},{"level":3,"title":"Task3 (��� Checkpoint 3 (1 point))","slug":"task3-✅-checkpoint-3-1-point","link":"#task3-✅-checkpoint-3-1-point","children":[]}]},{"level":2,"title":"Evaluate separated speech with pretrained ASR model","slug":"evaluate-separated-speech-with-pretrained-asr-model","link":"#evaluate-separated-speech-with-pretrained-asr-model","children":[{"level":3,"title":"Task4 (✅ Checkpoint 4 (1 point))","slug":"task4-✅-checkpoint-4-1-point","link":"#task4-✅-checkpoint-4-1-point","children":[]},{"level":3,"title":"Task5 (✅ Checkpoint 5 (1 point))","slug":"task5-✅-checkpoint-5-1-point","link":"#task5-✅-checkpoint-5-1-point","children":[]}]}],"git":{},"filePathRelative":"tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).md"}');export{y as comp,D as data};
assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as l,r as o,o as t,c as p,a as s,d as a,b as e,e as i}from"./app-DTS6SjJz.js";const r={},c=s("h1",{id:"cmu-11492-11692-spring-2023-spoken-language-understanding",tabindex:"-1"},[s("a",{class:"header-anchor",href:"#cmu-11492-11692-spring-2023-spoken-language-understanding"},[s("span",null,"CMU 11492/11692 Spring 2023: Spoken Language Understanding")])],-1),d=s("p",null,"In this demonstration, we will show you the procedure to conduct spoken language understanding in ESPnet.",-1),D=s("p",null,"Main references:",-1),u={href:"https://github.com/espnet/espnet",target:"_blank",rel:"noopener noreferrer"},y={href:"https://espnet.github.io/espnet/",target:"_blank",rel:"noopener noreferrer"},m=i(`<p>Author:</p><ul><li>Siddhant Arora (siddhana@andrew.cmu.edu)</li></ul><h2 id="objectives" tabindex="-1"><a class="header-anchor" href="#objectives"><span>Objectives</span></a></h2><p>After this demonstration, you are expected to understand some latest advancements in spoken language understanding.</p><h2 id="❗important-notes❗" tabindex="-1"><a class="header-anchor" href="#❗important-notes❗"><span>❗Important Notes❗</span></a></h2><ul><li>We are using Colab to show the demo. However, Colab has some constraints on the total GPU runtime. If you use too much GPU time, you may not be able to use GPU for some time.</li><li>There are multiple in-class checkpoints ✅ throughout this tutorial. <strong>Your participation points are based on these tasks.</strong> Please try your best to follow all the steps! If you encounter issues, please notify the TAs as soon as possible so that we can make an adjustment for you.</li><li>Please submit PDF files of your completed notebooks to Gradescope. 
You can print the notebook using <code>File -&gt; Print</code> in the menu bar.</li></ul><h2 id="espnet-installation" tabindex="-1"><a class="header-anchor" href="#espnet-installation"><span>ESPnet installation</span></a></h2><p>We follow the ESPnet installation as the previous tutorials (takes around 15 minutes).</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">! python -m pip install transformers</span></span>
2
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git clone https://github.com/espnet/espnet /espnet</span></span>
3
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">pip install /espnet</span></span>
4
+ <span class="line"><span style="color:#D4D4D4;">%pip install -q espnet_model_zoo</span></span>
5
+ <span class="line"><span style="color:#D4D4D4;">%pip install fairseq@git+https://github.com//pytorch/fairseq.git@f2146bdc7abf293186de9449bfa2272775e39e1d</span><span style="color:#6A9955;">#egg=fairseq</span></span>
6
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="spoken-language-understanding" tabindex="-1"><a class="header-anchor" href="#spoken-language-understanding"><span>Spoken Language Understanding</span></a></h2><p>Spoken Language Understanding (SLU) refers to the task of extracting semantic meaning or linguistic structure from spoken utterances. Some examples include recognizing the intent and their associated entities of a user’s command to take appropriate action, or even understanding the emotion behind a particular utterance, and engaging in conversations with a user by modeling the topic of a conversation. SLU is an essential component of many commercial applications like voice assistants, social bots, and intelligent home devices which have to map speech signals to executable commands every day.</p><p>Conventional SLU systems employ a cascaded approach for sequence labeling, where an automatic speech recognition (ASR) system first recognizes the spoken words from the input audio and a natural language understanding (NLU) system then extracts the intent from the predicted text. These cascaded approaches can effectively utilize pretrained ASR and NLU systems. However, they suffer from error propagation as errors in the ASR transcripts can adversely affect downstream SLU performance. Consequently, in this demo, we focus on end-to-end (E2E) SLU systems. E2E SLU systems aim to predict intent directly from speech. 
These E2E SLU systems can avoid the cascading of errors but cannot directly utilize strong acoustic and semantic representations from pretrained ASR systems and language models.</p><p>In this tutorial, we will show you some latest E2E SLU model architectures (in ESPnet-SLU) in the field of spoken language understanding, including</p><ul><li>E2E SLU (https://arxiv.org/abs/2111.14706)</li><li>Two Pass E2E SLU (https://arxiv.org/abs/2207.06670)</li></ul><h2 id="overview-of-the-espnet-slu" tabindex="-1"><a class="header-anchor" href="#overview-of-the-espnet-slu"><span>Overview of the ESPnet-SLU</span></a></h2><p>As ASR systems are getting better, there is an increasing interest in using the ASR output directly to do downstream Natural Language Processing (NLP) tasks. With the increase in SLU datasets and methodologies proposed, ESPnet-SLU is an open-source SLU toolkit built on an already existing open-source speech processing toolkit ESPnet. ESPnet-SLU standardize the pipelines involved in building an SLU model like data preparation, model training, and its evaluation. Having ESPnet-SLU would help users build systems for real world scenarios where many speech processing steps need to be applied before running the downstream task. ESPnet also provides an easy access to other speech technologies being developed like data augmentation, encoder sub-sampling, and speech-focused encoders like conformers. They also support many pretrained ASR and NLU systems that can be used as feature extractors in a SLU framework.</p><p>We have shown a sample architecure of our E2E SLU Model in the figure below:</p><p><img src="https://drive.google.com/uc?id=1qzWcOV3x5-cj9OHB-iVtCGfY1tQCWk76" alt="picture"></p><h2 id="_1-e2e-slu" tabindex="-1"><a class="header-anchor" href="#_1-e2e-slu"><span>1. 
E2E SLU</span></a></h2><h3 id="_1-1-download-sample-audio-file" tabindex="-1"><a class="header-anchor" href="#_1-1-download-sample-audio-file"><span>1.1 Download Sample Audio File</span></a></h3><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">gdown </span><span style="color:#F44747;">--</span><span style="color:#DCDCAA;">id</span><span style="color:#F44747;"> 18ANT62ittt7Ai2E8bQRlvT0ZVXXsf1eE</span><span style="color:#D4D4D4;"> -O /content/audio_file.wav</span></span>
7
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> os</span></span>
8
+ <span class="line"></span>
9
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> soundfile</span></span>
10
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> IPython.display </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> display, Audio</span></span>
11
+ <span class="line"><span style="color:#D4D4D4;">mixwav_mc, sr = soundfile.read(</span><span style="color:#CE9178;">&quot;/content/audio_file.wav&quot;</span><span style="color:#D4D4D4;">)</span></span>
12
+ <span class="line"><span style="color:#D4D4D4;">display(Audio(mixwav_mc.T, </span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=sr))</span></span>
13
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="question1-✅-checkpoint-1-1-points" tabindex="-1"><a class="header-anchor" href="#question1-✅-checkpoint-1-1-points"><span>Question1 (✅ Checkpoint 1 (1 points))</span></a></h3><p>Run inference on given audio using E2E SLU for intent classification</p><h3 id="_1-2-download-and-load-pretrained-e2e-slu-model" tabindex="-1"><a class="header-anchor" href="#_1-2-download-and-load-pretrained-e2e-slu-model"><span>1.2 Download and Load pretrained E2E SLU Model</span></a></h3><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git lfs clone https://huggingface.co/espnet/siddhana_slurp_new_asr_train_asr_conformer_raw_en_word_valid.acc.ave_10best /content/slurp_first_pass_model</span></span>
14
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.asr_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
15
+ <span class="line"><span style="color:#D4D4D4;">speech2text_slurp = Speech2Text.from_pretrained(</span></span>
16
+ <span class="line"><span style="color:#9CDCFE;"> asr_train_config</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;/content/slurp_first_pass_model/exp/asr_train_asr_conformer_raw_en_word/config.yaml&quot;</span><span style="color:#D4D4D4;">,</span></span>
17
+ <span class="line"><span style="color:#9CDCFE;"> asr_model_file</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;/content/slurp_first_pass_model/exp/asr_train_asr_conformer_raw_en_word/valid.acc.ave_10best.pth&quot;</span><span style="color:#D4D4D4;">,</span></span>
18
+ <span class="line"><span style="color:#9CDCFE;"> nbest</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
19
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
20
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">nbests_orig = speech2text_slurp(mixwav_mc)</span></span>
21
+ <span class="line"><span style="color:#D4D4D4;">text, *_ = nbests_orig[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
22
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> text_normalizer</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">sub_word_transcript</span><span style="color:#D4D4D4;">):</span></span>
23
+ <span class="line"><span style="color:#D4D4D4;"> transcript = sub_word_transcript[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].replace(</span><span style="color:#CE9178;">&quot;▁&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">)</span></span>
24
+ <span class="line"><span style="color:#C586C0;"> for</span><span style="color:#D4D4D4;"> sub_word </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> sub_word_transcript[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:]:</span></span>
25
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#CE9178;"> &quot;▁&quot;</span><span style="color:#569CD6;"> in</span><span style="color:#D4D4D4;"> sub_word:</span></span>
26
+ <span class="line"><span style="color:#D4D4D4;"> transcript = transcript + </span><span style="color:#CE9178;">&quot; &quot;</span><span style="color:#D4D4D4;"> + sub_word.replace(</span><span style="color:#CE9178;">&quot;▁&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">)</span></span>
27
+ <span class="line"><span style="color:#C586C0;"> else</span><span style="color:#D4D4D4;">:</span></span>
28
+ <span class="line"><span style="color:#D4D4D4;"> transcript = transcript + sub_word</span></span>
29
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> transcript</span></span>
30
+ <span class="line"><span style="color:#D4D4D4;">intent_text=</span><span style="color:#CE9178;">&quot;{scenario: &quot;</span><span style="color:#D4D4D4;">+text.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].split(</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]+</span><span style="color:#CE9178;">&quot;, action: &quot;</span><span style="color:#D4D4D4;">+</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">.join(text.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].split(</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:])+</span><span style="color:#CE9178;">&quot;}&quot;</span></span>
31
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;INTENT: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">intent_text</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
32
+ <span class="line"><span style="color:#D4D4D4;">transcript=text_normalizer(text.split()[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:])</span></span>
33
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;ASR hypothesis: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">transcript</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
34
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;E2E SLU model fails to predict the correct action.&quot;</span><span style="color:#D4D4D4;">)</span></span>
35
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="_2-two-pass-e2e-slu" tabindex="-1"><a class="header-anchor" href="#_2-two-pass-e2e-slu"><span>2. Two Pass E2E SLU</span></a></h2><p>However, recent work has shown that E2E-SLU systems struggle to generalize to unique phrasing for the same intent, suggesting an opportunity for enhancing semantic modeling of existing SLU systems. A number of approaches have been proposed to learn semantic content directly from audio. These approaches aim to incorporate pretrained language models to improve semantic processing of SLU architectures. 
In this demo, we use the Two Pass E2E SLU model where the second pass model improves on the initial prediction by combining acoustic information from the entire speech and semantic information from ASR-hypothesis using a deliberation network.</p><p><img src="https://drive.google.com/uc?id=1imEA98mIqcC6i-Cgdc84msHKliaVgtdf" alt="pitcture"></p><h3 id="question2-✅-checkpoint-2-1-points" tabindex="-1"><a class="header-anchor" href="#question2-✅-checkpoint-2-1-points"><span>Question2 (✅ Checkpoint 2 (1 points))</span></a></h3><p>Run inference on given audio using 2 pass SLU</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git lfs clone https://huggingface.co/espnet/slurp_slu_2pass /content/slurp_second_pass_model</span></span>
36
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.slu_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Understand</span></span>
37
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> transformers </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> AutoModel, AutoTokenizer</span></span>
38
+ <span class="line"><span style="color:#D4D4D4;">speech2text_second_pass_slurp = Speech2Understand.from_pretrained(</span></span>
39
+ <span class="line"><span style="color:#9CDCFE;"> slu_train_config</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;/content/slurp_second_pass_model/exp/slu_train_asr_bert_conformer_deliberation_raw_en_word/config.yaml&quot;</span><span style="color:#D4D4D4;">,</span></span>
40
+ <span class="line"><span style="color:#9CDCFE;"> slu_model_file</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;/content/slurp_second_pass_model/exp/slu_train_asr_bert_conformer_deliberation_raw_en_word/valid.acc.ave_10best.pth&quot;</span><span style="color:#D4D4D4;">,</span></span>
41
+ <span class="line"><span style="color:#9CDCFE;"> nbest</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
42
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
43
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.tasks.slu </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> SLUTask</span></span>
44
+ <span class="line"><span style="color:#D4D4D4;">preprocess_fn=SLUTask.build_preprocess_fn(</span></span>
45
+ <span class="line"><span style="color:#D4D4D4;"> speech2text_second_pass_slurp.asr_train_args, </span><span style="color:#569CD6;">False</span></span>
46
+ <span class="line"><span style="color:#D4D4D4;"> )</span></span>
47
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> numpy </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> np</span></span>
48
+ <span class="line"><span style="color:#D4D4D4;">transcript = preprocess_fn.text_cleaner(transcript)</span></span>
49
+ <span class="line"><span style="color:#D4D4D4;">tokens = preprocess_fn.transcript_tokenizer.text2tokens(transcript)</span></span>
50
+ <span class="line"><span style="color:#D4D4D4;">text_ints = np.array(preprocess_fn.transcript_token_id_converter.tokens2ids(tokens), </span><span style="color:#9CDCFE;">dtype</span><span style="color:#D4D4D4;">=np.int64)</span></span>
51
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> torch</span></span>
52
+ <span class="line"><span style="color:#D4D4D4;">nbests = speech2text_second_pass_slurp(mixwav_mc,torch.tensor(text_ints))</span></span>
53
+ <span class="line"><span style="color:#D4D4D4;">text1, *_ = nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
54
+ <span class="line"><span style="color:#D4D4D4;">intent_text=</span><span style="color:#CE9178;">&quot;{scenario: &quot;</span><span style="color:#D4D4D4;">+text1.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].split(</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]+</span><span style="color:#CE9178;">&quot;, action: &quot;</span><span style="color:#D4D4D4;">+</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">.join(text1.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].split(</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:])+</span><span style="color:#CE9178;">&quot;}&quot;</span></span>
55
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;INTENT: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">intent_text</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
56
+ <span class="line"><span style="color:#D4D4D4;">transcript=text_normalizer(text1.split()[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:])</span></span>
57
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;ASR hypothesis: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">transcript</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
58
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;Second pass SLU model successfully recognizes the correct action.&quot;</span><span style="color:#D4D4D4;">)</span></span>
59
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="_3-e2e-slu-for-slot-filling" tabindex="-1"><a class="header-anchor" href="#_3-e2e-slu-for-slot-filling"><span>3. E2E SLU for Slot Filling</span></a></h2><h3 id="question3-✅-checkpoint-3-1-point" tabindex="-1"><a class="header-anchor" href="#question3-✅-checkpoint-3-1-point"><span>Question3 (✅ Checkpoint 3 (1 point))</span></a></h3><p>Run inference on given audio using E2E SLU for slot filling</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">gdown </span><span style="color:#F44747;">--</span><span style="color:#DCDCAA;">id</span><span style="color:#F44747;"> 1ezs8IPutLr</span><span style="color:#D4D4D4;">-C0PXKb6pfOlb6XXFDXcPd -O /content/audio_slurp_entity_file.wav</span></span>
60
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> os</span></span>
61
+ <span class="line"></span>
62
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> soundfile</span></span>
63
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> IPython.display </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> display, Audio</span></span>
64
+ <span class="line"><span style="color:#D4D4D4;">mixwav_mc, sr = soundfile.read(</span><span style="color:#CE9178;">&quot;/content/audio_slurp_entity_file.wav&quot;</span><span style="color:#D4D4D4;">)</span></span>
65
+ <span class="line"><span style="color:#D4D4D4;">display(Audio(mixwav_mc.T, </span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=sr))</span></span>
66
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git lfs clone https://huggingface.co/espnet/siddhana_slurp_entity_asr_train_asr_conformer_raw_en_word_valid.acc.ave_10best /content/slurp_entity_model</span></span>
67
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.asr_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
68
+ <span class="line"><span style="color:#D4D4D4;">speech2text_slurp = Speech2Text.from_pretrained(</span></span>
69
+ <span class="line"><span style="color:#9CDCFE;"> asr_train_config</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;/content/slurp_entity_model/exp/asr_train_asr_conformer_raw_en_word/config.yaml&quot;</span><span style="color:#D4D4D4;">,</span></span>
70
+ <span class="line"><span style="color:#9CDCFE;"> asr_model_file</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;/content/slurp_entity_model/exp/asr_train_asr_conformer_raw_en_word/valid.acc.ave_10best.pth&quot;</span><span style="color:#D4D4D4;">,</span></span>
71
+ <span class="line"><span style="color:#9CDCFE;"> nbest</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
72
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
73
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">nbests_orig = speech2text_slurp(mixwav_mc)</span></span>
74
+ <span class="line"><span style="color:#D4D4D4;">text, *_ = nbests_orig[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
75
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> entity_text_normalizer</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">sub_word_transcript_list</span><span style="color:#D4D4D4;">):</span></span>
76
+ <span class="line"><span style="color:#D4D4D4;"> transcript_dict={}</span></span>
77
+ <span class="line"><span style="color:#C586C0;"> for</span><span style="color:#D4D4D4;"> sub_word_transcript_new </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> sub_word_transcript_list:</span></span>
78
+ <span class="line"><span style="color:#D4D4D4;"> sub_word_transcript=sub_word_transcript_new.split()</span></span>
79
+ <span class="line"><span style="color:#6A9955;"> # print(sub_word_transcript_list)</span></span>
80
+ <span class="line"><span style="color:#6A9955;"> # print(sub_word_transcript)</span></span>
81
+ <span class="line"><span style="color:#D4D4D4;"> transcript = sub_word_transcript[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].replace(</span><span style="color:#CE9178;">&quot;▁&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">)</span></span>
82
+ <span class="line"><span style="color:#C586C0;"> for</span><span style="color:#D4D4D4;"> sub_word </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> sub_word_transcript[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:]:</span></span>
83
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#CE9178;"> &quot;▁&quot;</span><span style="color:#569CD6;"> in</span><span style="color:#D4D4D4;"> sub_word:</span></span>
84
+ <span class="line"><span style="color:#D4D4D4;"> transcript = transcript + </span><span style="color:#CE9178;">&quot; &quot;</span><span style="color:#D4D4D4;"> + sub_word.replace(</span><span style="color:#CE9178;">&quot;▁&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">)</span></span>
85
+ <span class="line"><span style="color:#C586C0;"> else</span><span style="color:#D4D4D4;">:</span></span>
86
+ <span class="line"><span style="color:#D4D4D4;"> transcript = transcript + sub_word</span></span>
87
+ <span class="line"><span style="color:#D4D4D4;"> transcript_dict[transcript.split(</span><span style="color:#CE9178;">&quot; FILL &quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]]=transcript.split(</span><span style="color:#CE9178;">&quot; FILL &quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">]</span></span>
88
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> transcript_dict</span></span>
89
+ <span class="line"><span style="color:#D4D4D4;">intent_text=</span><span style="color:#CE9178;">&quot;{scenario: &quot;</span><span style="color:#D4D4D4;">+text.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].split(</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]+</span><span style="color:#CE9178;">&quot;, action: &quot;</span><span style="color:#D4D4D4;">+</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">.join(text.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].split(</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:])+</span><span style="color:#CE9178;">&quot;}&quot;</span></span>
90
+ <span class="line"><span style="color:#6A9955;"># print(text)</span></span>
91
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;INTENT: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">intent_text</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
92
+ <span class="line"><span style="color:#6A9955;"># print(&quot; &quot;.join(text.split()[1:]).split(&quot;▁SEP&quot;)[-1].split())</span></span>
93
+ <span class="line"><span style="color:#D4D4D4;">transcript=text_normalizer(</span><span style="color:#CE9178;">&quot; &quot;</span><span style="color:#D4D4D4;">.join(text.split()[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:]).split(</span><span style="color:#CE9178;">&quot;▁SEP&quot;</span><span style="color:#D4D4D4;">)[-</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">].split())</span></span>
94
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;ASR hypothesis: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">transcript</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
95
+ <span class="line"><span style="color:#D4D4D4;">entity_transcript=entity_text_normalizer(</span><span style="color:#CE9178;">&quot; &quot;</span><span style="color:#D4D4D4;">.join(text.split()[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:]).split(</span><span style="color:#CE9178;">&quot;▁SEP&quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:-</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">])</span></span>
96
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;Slot dictionary: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">entity_transcript</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
97
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="_4-e2e-slu-for-sentiment-analysis" tabindex="-1"><a class="header-anchor" href="#_4-e2e-slu-for-sentiment-analysis"><span>4. E2E SLU for Sentiment Analysis</span></a></h2><h3 id="question4-✅-checkpoint-4-1-point" tabindex="-1"><a class="header-anchor" href="#question4-✅-checkpoint-4-1-point"><span>Question4 (✅ Checkpoint 4 (1 point))</span></a></h3><p>Run inference on given audio using E2E SLU for sentiment analysis</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">gdown </span><span style="color:#F44747;">--</span><span style="color:#DCDCAA;">id</span><span style="color:#F44747;"> 1CZzmpMliwSzja9TdBV7wmidlGepZBEUi</span><span style="color:#D4D4D4;"> -O /content/audio_iemocap_file.wav</span></span>
98
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> os</span></span>
99
+ <span class="line"></span>
100
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> soundfile</span></span>
101
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> IPython.display </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> display, Audio</span></span>
102
+ <span class="line"><span style="color:#D4D4D4;">mixwav_mc, sr = soundfile.read(</span><span style="color:#CE9178;">&quot;/content/audio_iemocap_file.wav&quot;</span><span style="color:#D4D4D4;">)</span></span>
103
+ <span class="line"><span style="color:#D4D4D4;">display(Audio(mixwav_mc.T, </span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=sr))</span></span>
104
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git lfs clone https://huggingface.co/espnet/YushiUeda_iemocap_sentiment_asr_train_asr_conformer /content/iemocap_model</span></span>
105
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.asr_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
106
+ <span class="line"><span style="color:#D4D4D4;">speech2text_iemocap = Speech2Text.from_pretrained(</span></span>
107
+ <span class="line"><span style="color:#9CDCFE;"> asr_train_config</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;/content/iemocap_model/exp/asr_train_asr_conformer_raw_en_word/config.yaml&quot;</span><span style="color:#D4D4D4;">,</span></span>
108
+ <span class="line"><span style="color:#9CDCFE;"> asr_model_file</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;/content/iemocap_model/exp/asr_train_asr_conformer_raw_en_word/valid.acc.ave_10best.pth&quot;</span><span style="color:#D4D4D4;">,</span></span>
109
+ <span class="line"><span style="color:#9CDCFE;"> nbest</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
110
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
111
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">nbests_orig = speech2text_iemocap(mixwav_mc)</span></span>
112
+ <span class="line"><span style="color:#D4D4D4;">text, *_ = nbests_orig[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
113
+ <span class="line"><span style="color:#D4D4D4;">sentiment_text=text.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
114
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;SENTIMENT: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">sentiment_text</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
115
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="question5-✅-checkpoint-5-1-point" tabindex="-1"><a class="header-anchor" href="#question5-✅-checkpoint-5-1-point"><span>Question5 (✅ Checkpoint 5 (1 point))</span></a></h3><p>Discuss about potential advantages of integrating pre-trained LMs inside E2E SLU framework compared to using them in cascaded manner?</p><p>[ANSWER HERE]</p>`,51);function v(h,_){const n=o("ExternalLinkIcon");return t(),p("div",null,[c,d,D,s("ul",null,[s("li",null,[s("a",u,[a("ESPnet repository"),e(n)])]),s("li",null,[s("a",y,[a("ESPnet documentation"),e(n)])])]),m])}const b=l(r,[["render",v],["__file","SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html.vue"]]),g=JSON.parse('{"path":"/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html","title":"CMU 11492/11692 Spring 2023: Spoken Language Understanding","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Objectives","slug":"objectives","link":"#objectives","children":[]},{"level":2,"title":"❗Important Notes❗","slug":"❗important-notes❗","link":"#❗important-notes❗","children":[]},{"level":2,"title":"ESPnet installation","slug":"espnet-installation","link":"#espnet-installation","children":[]},{"level":2,"title":"Spoken Language Understanding","slug":"spoken-language-understanding","link":"#spoken-language-understanding","children":[]},{"level":2,"title":"Overview of the ESPnet-SLU","slug":"overview-of-the-espnet-slu","link":"#overview-of-the-espnet-slu","children":[]},{"level":2,"title":"1. 
E2E SLU","slug":"_1-e2e-slu","link":"#_1-e2e-slu","children":[{"level":3,"title":"1.1 Download Sample Audio File","slug":"_1-1-download-sample-audio-file","link":"#_1-1-download-sample-audio-file","children":[]},{"level":3,"title":"Question1 (✅ Checkpoint 1 (1 points))","slug":"question1-✅-checkpoint-1-1-points","link":"#question1-✅-checkpoint-1-1-points","children":[]},{"level":3,"title":"1.2 Download and Load pretrained E2E SLU Model","slug":"_1-2-download-and-load-pretrained-e2e-slu-model","link":"#_1-2-download-and-load-pretrained-e2e-slu-model","children":[]}]},{"level":2,"title":"2. Two Pass E2E SLU","slug":"_2-two-pass-e2e-slu","link":"#_2-two-pass-e2e-slu","children":[{"level":3,"title":"Question2 (✅ Checkpoint 2 (1 points))","slug":"question2-✅-checkpoint-2-1-points","link":"#question2-✅-checkpoint-2-1-points","children":[]}]},{"level":2,"title":"3. E2E SLU for Slot Filling","slug":"_3-e2e-slu-for-slot-filling","link":"#_3-e2e-slu-for-slot-filling","children":[{"level":3,"title":"Question3 (✅ Checkpoint 3 (1 point))","slug":"question3-✅-checkpoint-3-1-point","link":"#question3-✅-checkpoint-3-1-point","children":[]}]},{"level":2,"title":"4. E2E SLU for Sentiment Analysis","slug":"_4-e2e-slu-for-sentiment-analysis","link":"#_4-e2e-slu-for-sentiment-analysis","children":[{"level":3,"title":"Question4 (✅ Checkpoint 4 (1 point))","slug":"question4-✅-checkpoint-4-1-point","link":"#question4-✅-checkpoint-4-1-point","children":[]},{"level":3,"title":"Question5 (✅ Checkpoint 5 (1 point))","slug":"question5-✅-checkpoint-5-1-point","link":"#question5-✅-checkpoint-5-1-point","children":[]}]}],"git":{},"filePathRelative":"tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).md"}');export{b as comp,g as data};
assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as i,r as l,o as p,c as t,a as s,b as a,d as e,e as o}from"./app-DTS6SjJz.js";const c={},d={href:"https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_tts_realtime_demo.ipynb",target:"_blank",rel:"noopener noreferrer"},r=s("img",{src:"https://colab.research.google.com/assets/colab-badge.svg",alt:"Open In Colab"},null,-1),u=s("h1",{id:"cmu-11492-11692-spring-2023-text-to-speech",tabindex:"-1"},[s("a",{class:"header-anchor",href:"#cmu-11492-11692-spring-2023-text-to-speech"},[s("span",null,"CMU 11492/11692 Spring 2023: Text to Speech")])],-1),v=s("p",null,"In this demonstration, we will show you some demonstrations of text to speech systems in ESPnet.",-1),m=s("p",null,"Main references:",-1),h={href:"https://github.com/espnet/espnet",target:"_blank",rel:"noopener noreferrer"},b={href:"https://espnet.github.io/espnet/",target:"_blank",rel:"noopener noreferrer"},_=s("p",null,"Author:",-1),g=s("ul",null,[s("li",null,"Siddhant Arora (siddhana@andrew.cmu.edu)")],-1),k={href:"https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_tts_realtime_demo.ipynb",target:"_blank",rel:"noopener noreferrer"},f=o(`<h2 id="❗important-notes❗" tabindex="-1"><a class="header-anchor" href="#❗important-notes❗"><span>❗Important Notes❗</span></a></h2><ul><li>We are using Colab to show the demo. However, Colab has some constraints on the total GPU runtime. If you use too much GPU time, you may not be able to use GPU for some time.</li><li>There are multiple in-class checkpoints ✅ throughout this tutorial. <strong>Your participation points are based on these tasks.</strong> Please try your best to follow all the steps! If you encounter issues, please notify the TAs as soon as possible so that we can make an adjustment for you.</li><li>Please submit PDF files of your completed notebooks to Gradescope. 
You can print the notebook using <code>File -&gt; Print</code> in the menu bar.</li></ul><h2 id="installation" tabindex="-1"><a class="header-anchor" href="#installation"><span>Installation</span></a></h2><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># NOTE: pip shows imcompatible errors due to preinstalled libraries but you do not need to care</span></span>
2
+ <span class="line"><span>!pip install typeguard==2.13.3</span></span>
3
+ <span class="line"><span>!git clone --depth 5 -b spoken_dialog_demo https://github.com/siddhu001/espnet.git</span></span>
4
+ <span class="line"><span>!cd espnet &amp;&amp; pip install .</span></span>
5
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!pip install parallel_wavegan==0.5.4 </span></span>
6
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!pip install pyopenjtalk==0.2</span></span>
7
+ <span class="line"><span>!pip install pypinyin==0.44.0 </span></span>
8
+ <span class="line"><span>!pip install parallel_wavegan==0.5.4 </span></span>
9
+ <span class="line"><span>!pip install gdown==4.4.0</span></span>
10
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!pip install espnet_model_zoo</span></span>
11
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h2 id="single-speaker-tts-model-demo" tabindex="-1"><a class="header-anchor" href="#single-speaker-tts-model-demo"><span>Single speaker TTS model demo</span></a></h2><h3 id="tts-model" tabindex="-1"><a class="header-anchor" href="#tts-model"><span>TTS Model</span></a></h3><p>You can try end-to-end text2wav model &amp; combination of text2mel and vocoder.<br> If you use text2wav model, you do not need to use vocoder (automatically disabled).</p><p><strong>Text2wav models</strong>:</p><ul><li>VITS</li></ul><p><strong>Text2mel models</strong>:</p><ul><li>Tacotron2</li><li>Transformer-TTS</li><li>(Conformer) FastSpeech</li><li>(Conformer) FastSpeech2</li></ul><p><strong>Vocoders</strong>:</p><ul><li>Griffin Lim</li><li>Parallel WaveGAN</li><li>Multi-band MelGAN</li><li>HiFiGAN</li><li>Style MelGAN.</li></ul><p>In this demo, we will only experiment with the English TTS model, but ESPnet-TTS supports multiple languages like Japanese and Mandarin.</p><blockquote><p>The terms of use follow that of each corpus. 
ESPnet-TTS use the following corpora:</p></blockquote><ul><li><code>ljspeech_*</code>: LJSpeech dataset <ul><li>https://keithito.com/LJ-Speech-Dataset/</li></ul></li><li><code>jsut_*</code>: JSUT corpus <ul><li>https://sites.google.com/site/shinnosuketakamichi/publication/jsut</li></ul></li><li><code>jvs_*</code>: JVS corpus + JSUT corpus <ul><li>https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_corpus</li><li>https://sites.google.com/site/shinnosuketakamichi/publication/jsut</li></ul></li><li><code>tsukuyomi_*</code>: つくよみちゃんコーパス + JSUT corpus <ul><li>https://tyc.rei-yumesaki.net/material/corpus/</li><li>https://sites.google.com/site/shinnosuketakamichi/publication/jsut</li></ul></li><li><code>csmsc_*</code>: Chinese Standard Mandarin Speech Corpus <ul><li>https://www.data-baker.com/open_source.html</li></ul></li></ul><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>#@title Download English model { run: &quot;auto&quot; }</span></span>
12
+ <span class="line"><span>lang = &#39;English&#39;</span></span>
13
+ <span class="line"><span>tag = &quot;kan-bayashi/ljspeech_vits&quot; #@param [&quot;kan-bayashi/ljspeech_tacotron2&quot;, &quot;kan-bayashi/ljspeech_fastspeech&quot;, &quot;kan-bayashi/ljspeech_vits&quot;]</span></span>
14
+ <span class="line"><span>vocoder_tag = &quot;none&quot; #@param [&quot;none&quot;, &quot;parallel_wavegan/ljspeech_parallel_wavegan.v1&quot;]</span></span>
15
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!gdown --id &quot;1PjT9FX13d7Mv6loCs-wv5R_v3QrmLixf&amp;confirm=t&quot; -O /content/tts_model.zip</span></span>
16
+ <span class="line"><span>!unzip /content/tts_model.zip -d /content/tts_model</span></span>
17
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="model-setup" tabindex="-1"><a class="header-anchor" href="#model-setup"><span>Model Setup</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from espnet2.bin.tts_inference import Text2Speech</span></span>
18
+ <span class="line"><span>from espnet2.utils.types import str_or_none</span></span>
19
+ <span class="line"><span></span></span>
20
+ <span class="line"><span>text2speech = Text2Speech.from_pretrained(</span></span>
21
+ <span class="line"><span> train_config=&quot;/content/tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/config.yaml&quot;,</span></span>
22
+ <span class="line"><span> model_file=&quot;/content/tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_10best.pth&quot;,</span></span>
23
+ <span class="line"><span> vocoder_tag=str_or_none(vocoder_tag),</span></span>
24
+ <span class="line"><span> device=&quot;cuda&quot;,</span></span>
25
+ <span class="line"><span> # Only for Tacotron 2 &amp; Transformer</span></span>
26
+ <span class="line"><span> threshold=0.5,</span></span>
27
+ <span class="line"><span> # Only for Tacotron 2</span></span>
28
+ <span class="line"><span> minlenratio=0.0,</span></span>
29
+ <span class="line"><span> maxlenratio=10.0,</span></span>
30
+ <span class="line"><span> use_att_constraint=False,</span></span>
31
+ <span class="line"><span> backward_window=1,</span></span>
32
+ <span class="line"><span> forward_window=3,</span></span>
33
+ <span class="line"><span> # Only for FastSpeech &amp; FastSpeech2 &amp; VITS</span></span>
34
+ <span class="line"><span> speed_control_alpha=1.0,</span></span>
35
+ <span class="line"><span> # Only for VITS</span></span>
36
+ <span class="line"><span> noise_scale=0.333,</span></span>
37
+ <span class="line"><span> noise_scale_dur=0.333,</span></span>
38
+ <span class="line"><span>)</span></span>
39
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="synthesis-✅-checkpoint-1-2-point" tabindex="-1"><a class="header-anchor" href="#synthesis-✅-checkpoint-1-2-point"><span>Synthesis (✅ Checkpoint 1 (2 point))</span></a></h3><p>Run inference of pretrained single-speaker TTS model. Please experiment with running TTS model on different utterances. Provide some examples of failure cases and plot spectrogram and waveform of the utterances for both successful and failure cases. (1 point)</p><p>Please also discuss possible explanation of these failure cases. (1 point)</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import time</span></span>
40
+ <span class="line"><span>import torch</span></span>
41
+ <span class="line"><span></span></span>
42
+ <span class="line"><span># decide the input sentence by yourself</span></span>
43
+ <span class="line"><span>print(f&quot;Input your favorite sentence in {lang}.&quot;)</span></span>
44
+ <span class="line"><span>x = input()</span></span>
45
+ <span class="line"><span></span></span>
46
+ <span class="line"><span># synthesis</span></span>
47
+ <span class="line"><span>with torch.no_grad():</span></span>
48
+ <span class="line"><span> start = time.time()</span></span>
49
+ <span class="line"><span> wav = text2speech(x)[&quot;wav&quot;]</span></span>
50
+ <span class="line"><span>rtf = (time.time() - start) / (len(wav) / text2speech.fs)</span></span>
51
+ <span class="line"><span>print(f&quot;RTF = {rtf:5f}&quot;)</span></span>
52
+ <span class="line"><span></span></span>
53
+ <span class="line"><span># let us listen to generated samples</span></span>
54
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
55
+ <span class="line"><span>display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))</span></span>
56
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="tts-model-selection" tabindex="-1"><a class="header-anchor" href="#tts-model-selection"><span>TTS Model selection</span></a></h3><h3 id="question2-✅-checkpoint-2-1-point" tabindex="-1"><a class="header-anchor" href="#question2-✅-checkpoint-2-1-point"><span>Question2 (✅ Checkpoint 2 (1 point))</span></a></h3><p>Please experiment with running different TTS models like Tacotron or FastSpeech. Please also experiment both with Griffin Lim and Parallel WaveGAN vocoder. Please discuss which is better and why.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>#@title Download English model { run: &quot;auto&quot; }</span></span>
57
+ <span class="line"><span>lang = &#39;English&#39;</span></span>
58
+ <span class="line"><span>tag = &quot;kan-bayashi/ljspeech_tacotron2&quot; #@param [&quot;kan-bayashi/ljspeech_tacotron2&quot;, &quot;kan-bayashi/ljspeech_fastspeech&quot;, &quot;kan-bayashi/ljspeech_vits&quot;]</span></span>
59
+ <span class="line"><span>vocoder_tag = &quot;none&quot; #@param [&quot;none&quot;, &quot;parallel_wavegan/ljspeech_parallel_wavegan.v1&quot;]</span></span>
60
+ <span class="line"><span># when vocoder_tag is none, Griffin Lim algorithm is used</span></span>
61
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!gdown --id &quot;1PXsSaulipN31HnQ8YWwsi9Ndb3B2My-J&amp;confirm=t&quot; -O /content/tts_tacotron_model.zip</span></span>
62
+ <span class="line"><span>!unzip /content/tts_tacotron_model.zip -d /content/tts_tacotron_model</span></span>
63
+ <span class="line"><span>#For fastspeech model run the commented lines below</span></span>
64
+ <span class="line"><span>#!gdown --id &quot;13Jek_NbI8Qai42v4GKYxx3-jXOun5m2-&amp;confirm=t&quot; -O /content/tts_fastspeech_model.zip</span></span>
65
+ <span class="line"><span>#!unzip /content/tts_fastspeech_model.zip -d /content/tts_fastspeech_model</span></span>
66
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from espnet2.bin.tts_inference import Text2Speech</span></span>
67
+ <span class="line"><span>from espnet2.utils.types import str_or_none</span></span>
68
+ <span class="line"><span>!ln -sf /content/tts_tacotron_model/exp .</span></span>
69
+ <span class="line"><span>text2speech = Text2Speech.from_pretrained(</span></span>
70
+ <span class="line"><span> # model_tag=str_or_none(tag),</span></span>
71
+ <span class="line"><span> train_config=&quot;/content/tts_tacotron_model/exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/config.yaml&quot;,</span></span>
72
+ <span class="line"><span> model_file=&quot;/content/tts_tacotron_model/exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/199epoch.pth&quot;,</span></span>
73
+ <span class="line"><span> vocoder_tag=str_or_none(vocoder_tag),</span></span>
74
+ <span class="line"><span> device=&quot;cuda&quot;,</span></span>
75
+ <span class="line"><span> # Only for Tacotron 2 &amp; Transformer</span></span>
76
+ <span class="line"><span> threshold=0.5,</span></span>
77
+ <span class="line"><span> # Only for Tacotron 2</span></span>
78
+ <span class="line"><span> minlenratio=0.0,</span></span>
79
+ <span class="line"><span> maxlenratio=10.0,</span></span>
80
+ <span class="line"><span> use_att_constraint=False,</span></span>
81
+ <span class="line"><span> backward_window=1,</span></span>
82
+ <span class="line"><span> forward_window=3,</span></span>
83
+ <span class="line"><span> # Only for FastSpeech &amp; FastSpeech2 &amp; VITS</span></span>
84
+ <span class="line"><span> speed_control_alpha=1.0,</span></span>
85
+ <span class="line"><span> # Only for VITS</span></span>
86
+ <span class="line"><span> noise_scale=0.333,</span></span>
87
+ <span class="line"><span> noise_scale_dur=0.333,</span></span>
88
+ <span class="line"><span>)</span></span>
89
+ <span class="line"><span># For fastspeech model run the commented lines below</span></span>
90
+ <span class="line"><span># from espnet2.bin.tts_inference import Text2Speech</span></span>
91
+ <span class="line"><span># from espnet2.utils.types import str_or_none</span></span>
92
+ <span class="line"><span># !ln -sf /content/tts_fastspeech_model/exp .</span></span>
93
+ <span class="line"><span># text2speech = Text2Speech.from_pretrained(</span></span>
94
+ <span class="line"><span># # model_tag=str_or_none(tag),</span></span>
95
+ <span class="line"><span># train_config=&quot;/content/tts_fastspeech_model/exp/tts_train_fastspeech_raw_phn_tacotron_g2p_en_no_space/config.yaml&quot;,</span></span>
96
+ <span class="line"><span># model_file=&quot;/content/tts_fastspeech_model/exp/tts_train_fastspeech_raw_phn_tacotron_g2p_en_no_space/1000epoch.pth&quot;,</span></span>
97
+ <span class="line"><span># vocoder_tag=str_or_none(vocoder_tag),</span></span>
98
+ <span class="line"><span># device=&quot;cuda&quot;,</span></span>
99
+ <span class="line"><span># # Only for Tacotron 2 &amp; Transformer</span></span>
100
+ <span class="line"><span># threshold=0.5,</span></span>
101
+ <span class="line"><span># # Only for Tacotron 2</span></span>
102
+ <span class="line"><span># minlenratio=0.0,</span></span>
103
+ <span class="line"><span># maxlenratio=10.0,</span></span>
104
+ <span class="line"><span># use_att_constraint=False,</span></span>
105
+ <span class="line"><span># backward_window=1,</span></span>
106
+ <span class="line"><span># forward_window=3,</span></span>
107
+ <span class="line"><span># # Only for FastSpeech &amp; FastSpeech2 &amp; VITS</span></span>
108
+ <span class="line"><span># speed_control_alpha=1.0,</span></span>
109
+ <span class="line"><span># # Only for VITS</span></span>
110
+ <span class="line"><span># noise_scale=0.333,</span></span>
111
+ <span class="line"><span># noise_scale_dur=0.333,</span></span>
112
+ <span class="line"><span># )</span></span>
113
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import time</span></span>
114
+ <span class="line"><span>import torch</span></span>
115
+ <span class="line"><span></span></span>
116
+ <span class="line"><span># decide the input sentence by yourself</span></span>
117
+ <span class="line"><span>print(f&quot;Input your favorite sentence in {lang}.&quot;)</span></span>
118
+ <span class="line"><span>x = input()</span></span>
119
+ <span class="line"><span></span></span>
120
+ <span class="line"><span># synthesis</span></span>
121
+ <span class="line"><span>with torch.no_grad():</span></span>
122
+ <span class="line"><span> start = time.time()</span></span>
123
+ <span class="line"><span> wav = text2speech(x)[&quot;wav&quot;]</span></span>
124
+ <span class="line"><span>rtf = (time.time() - start) / (len(wav) / text2speech.fs)</span></span>
125
+ <span class="line"><span>print(f&quot;RTF = {rtf:5f}&quot;)</span></span>
126
+ <span class="line"><span></span></span>
127
+ <span class="line"><span># let us listen to generated samples</span></span>
128
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
129
+ <span class="line"><span>display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))</span></span>
130
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="multi-speaker-model-demo" tabindex="-1"><a class="header-anchor" href="#multi-speaker-model-demo"><span>Multi-speaker Model Demo</span></a></h2><h3 id="model-selection" tabindex="-1"><a class="header-anchor" href="#model-selection"><span>Model Selection</span></a></h3><p>Now we provide only English multi-speaker pretrained model.</p><blockquote><p>The terms of use follow that of each corpus. We use the following corpora:</p></blockquote><ul><li><code>libritts_*</code>: LibriTTS corpus <ul><li>http://www.openslr.org/60</li></ul></li><li><code>vctk_*</code>: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit <ul><li>http://www.udialogue.org/download/cstr-vctk-corpus.html</li></ul></li></ul><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>#@title English multi-speaker pretrained model { run: &quot;auto&quot; }</span></span>
131
+ <span class="line"><span>lang = &#39;English&#39;</span></span>
132
+ <span class="line"><span>tag = &#39;kan-bayashi/vctk_full_band_multi_spk_vits&#39; #@param [&quot;kan-bayashi/vctk_gst_tacotron2&quot;, &quot;kan-bayashi/vctk_gst_transformer&quot;, &quot;kan-bayashi/vctk_xvector_tacotron2&quot;, &quot;kan-bayashi/vctk_xvector_transformer&quot;, &quot;kan-bayashi/vctk_xvector_conformer_fastspeech2&quot;, &quot;kan-bayashi/vctk_gst+xvector_tacotron2&quot;, &quot;kan-bayashi/vctk_gst+xvector_transformer&quot;, &quot;kan-bayashi/vctk_gst+xvector_conformer_fastspeech2&quot;, &quot;kan-bayashi/vctk_multi_spk_vits&quot;, &quot;kan-bayashi/vctk_full_band_multi_spk_vits&quot;, &quot;kan-bayashi/libritts_xvector_transformer&quot;, &quot;kan-bayashi/libritts_xvector_conformer_fastspeech2&quot;, &quot;kan-bayashi/libritts_gst+xvector_transformer&quot;, &quot;kan-bayashi/libritts_gst+xvector_conformer_fastspeech2&quot;, &quot;kan-bayashi/libritts_xvector_vits&quot;] {type:&quot;string&quot;}</span></span>
133
+ <span class="line"><span>vocoder_tag = &quot;none&quot; #@param [&quot;none&quot;, &quot;parallel_wavegan/vctk_parallel_wavegan.v1.long&quot;, &quot;parallel_wavegan/vctk_multi_band_melgan.v2&quot;, &quot;parallel_wavegan/vctk_style_melgan.v1&quot;, &quot;parallel_wavegan/vctk_hifigan.v1&quot;, &quot;parallel_wavegan/libritts_parallel_wavegan.v1.long&quot;, &quot;parallel_wavegan/libritts_multi_band_melgan.v2&quot;, &quot;parallel_wavegan/libritts_hifigan.v1&quot;, &quot;parallel_wavegan/libritts_style_melgan.v1&quot;] {type:&quot;string&quot;}</span></span>
134
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!gdown --id &quot;1fzyyjLvrT_jldw4lfOD1P8FK2MGoIZO_&amp;confirm=t&quot; -O /content/tts_multi-speaker_model.zip</span></span>
135
+ <span class="line"><span>!unzip /content/tts_multi-speaker_model.zip -d /content/tts_multi-speaker_model</span></span>
136
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="model-setup-1" tabindex="-1"><a class="header-anchor" href="#model-setup-1"><span>Model Setup</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from espnet2.bin.tts_inference import Text2Speech</span></span>
137
+ <span class="line"><span>from espnet2.utils.types import str_or_none</span></span>
138
+ <span class="line"><span></span></span>
139
+ <span class="line"><span>text2speech = Text2Speech.from_pretrained(</span></span>
140
+ <span class="line"><span> train_config=&quot;/content/tts_multi-speaker_model/exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/config.yaml&quot;,</span></span>
141
+ <span class="line"><span> model_file=&quot;/content/tts_multi-speaker_model/exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_10best.pth&quot;,</span></span>
142
+ <span class="line"><span> vocoder_tag=str_or_none(vocoder_tag),</span></span>
143
+ <span class="line"><span> device=&quot;cuda&quot;,</span></span>
144
+ <span class="line"><span> # Only for Tacotron 2 &amp; Transformer</span></span>
145
+ <span class="line"><span> threshold=0.5,</span></span>
146
+ <span class="line"><span> # Only for Tacotron 2</span></span>
147
+ <span class="line"><span> minlenratio=0.0,</span></span>
148
+ <span class="line"><span> maxlenratio=10.0,</span></span>
149
+ <span class="line"><span> use_att_constraint=False,</span></span>
150
+ <span class="line"><span> backward_window=1,</span></span>
151
+ <span class="line"><span> forward_window=3,</span></span>
152
+ <span class="line"><span> # Only for FastSpeech &amp; FastSpeech2 &amp; VITS</span></span>
153
+ <span class="line"><span> speed_control_alpha=1.0,</span></span>
154
+ <span class="line"><span> # Only for VITS</span></span>
155
+ <span class="line"><span> noise_scale=0.333,</span></span>
156
+ <span class="line"><span> noise_scale_dur=0.333,</span></span>
157
+ <span class="line"><span>)</span></span>
158
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="speaker-selection" tabindex="-1"><a class="header-anchor" href="#speaker-selection"><span>Speaker selection</span></a></h3><p>For multi-speaker model, we need to provide X-vector and/or the reference speech to decide the speaker characteristics.<br> For X-vector, you can select the speaker from the dumped x-vectors.<br> For the reference speech, you can use any speech but please make sure the sampling rate is matched.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import glob</span></span>
159
+ <span class="line"><span>import os</span></span>
160
+ <span class="line"><span>import numpy as np</span></span>
161
+ <span class="line"><span>import kaldiio</span></span>
162
+ <span class="line"><span></span></span>
163
+ <span class="line"><span># Get model directory path</span></span>
164
+ <span class="line"><span>from espnet_model_zoo.downloader import ModelDownloader</span></span>
165
+ <span class="line"><span>d = ModelDownloader()</span></span>
166
+ <span class="line"><span># model_dir = os.path.dirname(d.download_and_unpack(tag)[&quot;train_config&quot;])</span></span>
167
+ <span class="line"><span></span></span>
168
+ <span class="line"><span># X-vector selection</span></span>
169
+ <span class="line"><span>spembs = None</span></span>
170
+ <span class="line"><span>if text2speech.use_spembs:</span></span>
171
+ <span class="line"><span> xvector_ark = [p for p in glob.glob(f&quot;/content/tts_multi-speaker_model/dump/**/spk_xvector.ark&quot;, recursive=True) if &quot;tr&quot; in p][0]</span></span>
172
+ <span class="line"><span> xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}</span></span>
173
+ <span class="line"><span> spks = list(xvectors.keys())</span></span>
174
+ <span class="line"><span></span></span>
175
+ <span class="line"><span> # randomly select speaker</span></span>
176
+ <span class="line"><span> random_spk_idx = np.random.randint(0, len(spks))</span></span>
177
+ <span class="line"><span> spk = spks[random_spk_idx]</span></span>
178
+ <span class="line"><span> spembs = xvectors[spk]</span></span>
179
+ <span class="line"><span> print(f&quot;selected spk: {spk}&quot;)</span></span>
180
+ <span class="line"><span></span></span>
181
+ <span class="line"><span># Speaker ID selection</span></span>
182
+ <span class="line"><span>sids = None</span></span>
183
+ <span class="line"><span>if text2speech.use_sids:</span></span>
184
+ <span class="line"><span> spk2sid = glob.glob(f&quot;/content/tts_multi-speaker_model/dump/**/spk2sid&quot;, recursive=True)[0]</span></span>
185
+ <span class="line"><span> with open(spk2sid) as f:</span></span>
186
+ <span class="line"><span> lines = [line.strip() for line in f.readlines()]</span></span>
187
+ <span class="line"><span> sid2spk = {int(line.split()[1]): line.split()[0] for line in lines}</span></span>
188
+ <span class="line"><span> </span></span>
189
+ <span class="line"><span> # randomly select speaker</span></span>
190
+ <span class="line"><span> sids = np.array(np.random.randint(1, len(sid2spk)))</span></span>
191
+ <span class="line"><span> spk = sid2spk[int(sids)]</span></span>
192
+ <span class="line"><span> print(f&quot;selected spk: {spk}&quot;)</span></span>
193
+ <span class="line"><span></span></span>
194
+ <span class="line"><span># Reference speech selection for GST</span></span>
195
+ <span class="line"><span>speech = None</span></span>
196
+ <span class="line"><span>if text2speech.use_speech:</span></span>
197
+ <span class="line"><span> # you can change here to load your own reference speech</span></span>
198
+ <span class="line"><span> # e.g.</span></span>
199
+ <span class="line"><span> # import soundfile as sf</span></span>
200
+ <span class="line"><span> # speech, fs = sf.read(&quot;/path/to/reference.wav&quot;)</span></span>
201
+ <span class="line"><span> # speech = torch.from_numpy(speech).float()</span></span>
202
+ <span class="line"><span> speech = torch.randn(50000,) * 0.01</span></span>
203
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="synthesis-✅-checkpoint3-2-point" tabindex="-1"><a class="header-anchor" href="#synthesis-✅-checkpoint3-2-point"><span>Synthesis(✅ Checkpoint3 (2 point))</span></a></h3><p>Run inference of pretrained multi-speaker TTS model on more than one speaker id. 
Plot spectrogram and waveform of the synthesized speech for these speaker ids.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import time</span></span>
204
+ <span class="line"><span>import torch</span></span>
205
+ <span class="line"><span></span></span>
206
+ <span class="line"><span># decide the input sentence by yourself</span></span>
207
+ <span class="line"><span>print(f&quot;Input your favorite sentence in {lang}.&quot;)</span></span>
208
+ <span class="line"><span>x = input()</span></span>
209
+ <span class="line"><span></span></span>
210
+ <span class="line"><span># synthesis</span></span>
211
+ <span class="line"><span>with torch.no_grad():</span></span>
212
+ <span class="line"><span> start = time.time()</span></span>
213
+ <span class="line"><span> wav = text2speech(x, speech=speech, spembs=spembs, sids=sids)[&quot;wav&quot;]</span></span>
214
+ <span class="line"><span>rtf = (time.time() - start) / (len(wav) / text2speech.fs)</span></span>
215
+ <span class="line"><span>print(f&quot;RTF = {rtf:5f}&quot;)</span></span>
216
+ <span class="line"><span></span></span>
217
+ <span class="line"><span># let us listen to generated samples</span></span>
218
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
219
+ <span class="line"><span>display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))</span></span>
220
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div>`,49);function x(q,y){const n=l("ExternalLinkIcon");return p(),t("div",null,[s("p",null,[s("a",d,[r,a(n)])]),u,v,m,s("ul",null,[s("li",null,[s("a",h,[e("ESPnet repository"),a(n)])]),s("li",null,[s("a",b,[e("ESPnet documentation"),a(n)])])]),_,g,s("p",null,[e("The notebook is adapted from this "),s("a",k,[e("Colab"),a(n)])]),f])}const T=i(c,[["render",x],["__file","TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html.vue"]]),S=JSON.parse('{"path":"/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html","title":"CMU 11492/11692 Spring 2023: Text to Speech","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"❗Important Notes❗","slug":"❗important-notes❗","link":"#❗important-notes❗","children":[]},{"level":2,"title":"Installation","slug":"installation","link":"#installation","children":[]},{"level":2,"title":"Single speaker TTS model demo","slug":"single-speaker-tts-model-demo","link":"#single-speaker-tts-model-demo","children":[{"level":3,"title":"TTS Model","slug":"tts-model","link":"#tts-model","children":[]},{"level":3,"title":"Model Setup","slug":"model-setup","link":"#model-setup","children":[]},{"level":3,"title":"Synthesis (✅ Checkpoint 1 (2 point))","slug":"synthesis-✅-checkpoint-1-2-point","link":"#synthesis-✅-checkpoint-1-2-point","children":[]},{"level":3,"title":"TTS Model 
selection","slug":"tts-model-selection","link":"#tts-model-selection","children":[]},{"level":3,"title":"Question2 (✅ Checkpoint 2 (1 point))","slug":"question2-✅-checkpoint-2-1-point","link":"#question2-✅-checkpoint-2-1-point","children":[]}]},{"level":2,"title":"Multi-speaker Model Demo","slug":"multi-speaker-model-demo","link":"#multi-speaker-model-demo","children":[{"level":3,"title":"Model Selection","slug":"model-selection","link":"#model-selection","children":[]},{"level":3,"title":"Model Setup","slug":"model-setup-1","link":"#model-setup-1","children":[]},{"level":3,"title":"Speaker selection","slug":"speaker-selection","link":"#speaker-selection","children":[]},{"level":3,"title":"Synthesis(✅ Checkpoint3 (2 point))","slug":"synthesis-✅-checkpoint3-2-point","link":"#synthesis-✅-checkpoint3-2-point","children":[]}]}],"git":{},"filePathRelative":"tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).md"}');export{T as comp,S as data};
assets/app-DTS6SjJz.js ADDED
The diff for this file is too large to render. See raw diff
 
assets/asr_cli.html-BA-xBrC-.js ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as o,r as p,o as t,c as i,a as s,d as a,b as e,e as l}from"./app-DTS6SjJz.js";const r={},c=s("h1",{id:"speech-recognition-recipe",tabindex:"-1"},[s("a",{class:"header-anchor",href:"#speech-recognition-recipe"},[s("span",null,"Speech Recognition (Recipe)")])],-1),d={href:"https://github.com/ShigekiKarita",target:"_blank",rel:"noopener noreferrer"},D=l(`<p>July 29 2019</p><p>ESPnet Hackathon 2019 @Tokyo</p><h2 id="abstract" tabindex="-1"><a class="header-anchor" href="#abstract"><span>Abstract</span></a></h2><p>This example shows you a practical ASR example using ESPnet as a command line interface, and also as a library.</p><p>See also</p><ul><li>documetation https://espnet.github.io/espnet/</li><li>github https://github.com/espnet</li></ul><h2 id="installation" tabindex="-1"><a class="header-anchor" href="#installation"><span>Installation</span></a></h2><p>ESPnet depends on Kaldi ASR toolkit and Warp-CTC. This will take a few minutes.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># OS setup</span></span>
2
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">sudo apt-get install bc tree</span></span>
3
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cat /etc/os-release</span></span>
4
+ <span class="line"></span>
5
+ <span class="line"><span style="color:#6A9955;"># espnet setup</span></span>
6
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git clone https://github.com/espnet/espnet</span></span>
7
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd espnet; pip install -e .</span></span>
8
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">mkdir -p espnet/tools/venv/</span><span style="color:#DCDCAA;">bin</span><span style="color:#D4D4D4;">; touch espnet/tools/venv/</span><span style="color:#DCDCAA;">bin</span><span style="color:#D4D4D4;">/activate</span></span>
9
+ <span class="line"></span>
10
+ <span class="line"><span style="color:#6A9955;"># warp ctc setup</span></span>
11
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git clone https://github.com/espnet/warp-ctc -b pytorch-</span><span style="color:#B5CEA8;">1.1</span></span>
12
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd warp-ctc </span><span style="color:#F44747;">&amp;&amp;</span><span style="color:#D4D4D4;"> mkdir build </span><span style="color:#F44747;">&amp;&amp;</span><span style="color:#D4D4D4;"> cd build </span><span style="color:#F44747;">&amp;&amp;</span><span style="color:#D4D4D4;"> cmake .. </span><span style="color:#F44747;">&amp;&amp;</span><span style="color:#D4D4D4;"> make -j4</span></span>
13
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd warp-ctc/pytorch_binding </span><span style="color:#F44747;">&amp;&amp;</span><span style="color:#D4D4D4;"> python setup.py install </span></span>
14
+ <span class="line"></span>
15
+ <span class="line"><span style="color:#6A9955;"># kaldi setup</span></span>
16
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd ./espnet/tools; git clone https://github.com/kaldi-asr/kaldi</span></span>
17
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">echo </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;"> &gt; ./espnet/tools/kaldi/tools/extras/check_dependencies.sh </span><span style="color:#6A9955;"># ignore check</span></span>
18
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">chmod +x ./espnet/tools/kaldi/tools/extras/check_dependencies.sh</span></span>
19
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd ./espnet/tools/kaldi/tools; make sph2pipe sclite</span></span>
20
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">rm -rf espnet/tools/kaldi/tools/python</span></span>
21
+ <span class="line"><span style="color:#D4D4D4;">![ ! -e ubuntu16-featbin.tar.gz ] </span><span style="color:#F44747;">&amp;&amp;</span><span style="color:#D4D4D4;"> wget https://</span><span style="color:#B5CEA8;">18</span><span style="color:#D4D4D4;">-</span><span style="color:#B5CEA8;">198329952</span><span style="color:#D4D4D4;">-gh.circle-artifacts.com/</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">/home/circleci/repo/ubuntu16-featbin.tar.gz</span></span>
22
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">tar -xf ./ubuntu16-featbin.tar.gz</span></span>
23
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cp featbin/* espnet/tools/kaldi/src/featbin/</span></span>
24
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="espnet-command-line-usage-espnet-egs-xxx" tabindex="-1"><a class="header-anchor" href="#espnet-command-line-usage-espnet-egs-xxx"><span>ESPnet command line usage (espnet/egs/xxx)</span></a></h2><p>You can use the end-to-end script <code>run.sh</code> for reproducing systems reported in <code>espnet/egs/*/asr1/RESULTS.md</code>. Typically, we organize <code>run.sh</code> with several stages:</p><ol start="0"><li>Data download (if available)</li><li>Kaldi-style data preparation</li><li>Save python-friendly data (e.g., JSON, HDF5, etc)</li><li>Language model training</li><li>ASR model training</li><li>Decoding and evaluation</li></ol><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">ls espnet/egs</span></span>
25
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="stage-0-2-data-preparation" tabindex="-1"><a class="header-anchor" href="#stage-0-2-data-preparation"><span>Stage 0 - 2 Data preparation</span></a></h3><p>For example, if you add <code>--stop-stage 2</code>, you can stop the script before neural network training.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd espnet/egs/an4/asr1; ./run.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">ngpu </span><span style="color:#B5CEA8;">1</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop-stage </span><span style="color:#B5CEA8;">2</span></span>
26
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h2 id="kaldi-style-directory-structure" tabindex="-1"><a class="header-anchor" href="#kaldi-style-directory-structure"><span>Kaldi-style directory structure</span></a></h2><p>Always we organize each recipe placed in <code>egs/xxx/asr1</code> in Kaldi way:</p>`,18),y=s("li",null,[s("code",null,"conf/"),a(": kaldi configurations, e.g., speech feature")],-1),u=s("code",null,"data/",-1),v={href:"https://kaldi-asr.org/doc/data_prep.html",target:"_blank",rel:"noopener noreferrer"},h=s("li",null,[s("code",null,"exp/"),a(": intermidiate files through experiments, e.g., log files, model parameters")],-1),m=s("code",null,"fbank/",-1),b={href:"https://kaldi-asr.org/doc/io.html",target:"_blank",rel:"noopener noreferrer"},g=s("li",null,[s("code",null,"dump/"),a(": ESPnet meta data for tranining, e.g., json, hdf5")],-1),C=s("li",null,[s("code",null,"local/"),a(": corpus specific data preparation scripts")],-1),E={href:"https://github.com/kaldi-asr/kaldi/tree/master/egs/wsj/s5/steps",target:"_blank",rel:"noopener noreferrer"},k={href:"https://github.com/kaldi-asr/kaldi/tree/master/egs/wsj/s5/utils",target:"_blank",rel:"noopener noreferrer"},f=l(`<div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">tree -L </span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;"> espnet/egs/an4/asr1</span></span>
27
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="tips-essential-files-in-data-preparation" tabindex="-1"><a class="header-anchor" href="#tips-essential-files-in-data-preparation"><span>TIPS: essential files in data preparation</span></a></h3><p>To create a new recipe, all you need is stage 1 that creates key-value pair files:</p><ul><li>speech<code>data/xxx/wav.scp</code></li><li>text<code>data/xxx/text</code></li></ul><h4 id="raw-speech-file-list" tabindex="-1"><a class="header-anchor" href="#raw-speech-file-list"><span>raw speech file list</span></a></h4><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">head espnet/egs/an4/asr1/data/train/wav.scp</span></span>
28
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="raw-text-list" tabindex="-1"><a class="header-anchor" href="#raw-text-list"><span>raw text list</span></a></h3><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">head espnet/egs/an4/asr1/data/train/text</span></span>
29
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="tips-explore-datasets-with-data-json" tabindex="-1"><a class="header-anchor" href="#tips-explore-datasets-with-data-json"><span>TIPS: explore datasets with data.json</span></a></h3><p>To explore datasets easily, ESPnet stores metadata <code>dump/xxx/data.json</code> in the stage 2.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> json</span></span>
30
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> matplotlib.pyplot </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> plt</span></span>
31
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> kaldiio</span></span>
32
+ <span class="line"></span>
33
+ <span class="line"><span style="color:#6A9955;"># load 10-th speech/text in data.json</span></span>
34
+ <span class="line"><span style="color:#D4D4D4;">root = </span><span style="color:#CE9178;">&quot;espnet/egs/an4/asr1&quot;</span></span>
35
+ <span class="line"><span style="color:#C586C0;">with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(root + </span><span style="color:#CE9178;">&quot;/dump/test/deltafalse/data.json&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
36
+ <span class="line"><span style="color:#D4D4D4;"> test_json = json.load(f)[</span><span style="color:#CE9178;">&quot;utts&quot;</span><span style="color:#D4D4D4;">]</span></span>
37
+ <span class="line"><span style="color:#D4D4D4;"> </span></span>
38
+ <span class="line"><span style="color:#D4D4D4;">key, info = </span><span style="color:#4EC9B0;">list</span><span style="color:#D4D4D4;">(test_json.items())[</span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;">]</span></span>
39
+ <span class="line"></span>
40
+ <span class="line"><span style="color:#6A9955;"># plot the speech feature</span></span>
41
+ <span class="line"><span style="color:#D4D4D4;">fbank = kaldiio.load_mat(info[</span><span style="color:#CE9178;">&quot;input&quot;</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#CE9178;">&quot;feat&quot;</span><span style="color:#D4D4D4;">])</span></span>
42
+ <span class="line"><span style="color:#D4D4D4;">plt.matshow(fbank.T[::-</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">])</span></span>
43
+ <span class="line"><span style="color:#D4D4D4;">plt.title(key + </span><span style="color:#CE9178;">&quot;: &quot;</span><span style="color:#D4D4D4;"> + info[</span><span style="color:#CE9178;">&quot;output&quot;</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">])</span></span>
44
+ <span class="line"></span>
45
+ <span class="line"><span style="color:#6A9955;"># print the key-value pair</span></span>
46
+ <span class="line"><span style="color:#D4D4D4;">key, info</span></span>
47
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="stage-3-4-nn-training" tabindex="-1"><a class="header-anchor" href="#stage-3-4-nn-training"><span>Stage 3 - 4 NN Training</span></a></h3><p>Let&#39;s go to the most interesting part...</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">tail espnet/egs/an4/asr1/conf/train_mtlalpha1.0.yaml</span></span>
48
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd espnet/egs/an4/asr1; ./run.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">ngpu </span><span style="color:#B5CEA8;">1</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">3</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop-stage </span><span style="color:#B5CEA8;">4</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train-config ./conf/train_mtlalpha1.0.yaml</span></span>
49
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="tips-change-yaml-py" tabindex="-1"><a class="header-anchor" href="#tips-change-yaml-py"><span>TIPS: change_yaml.py</span></a></h3><p>You can tweak YAML config by <strong>$(change_yaml.py xxx.yaml -a yyy=zzz)</strong></p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd espnet/egs/an4/asr1; source path.sh; \\</span></span>
50
+ <span class="line"><span style="color:#D4D4D4;"> ./run.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">ngpu </span><span style="color:#B5CEA8;">1</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">4</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop-stage </span><span style="color:#B5CEA8;">4</span><span style="color:#D4D4D4;"> \\</span></span>
51
+ <span class="line"><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train-config </span><span style="color:#F44747;">$</span><span style="color:#D4D4D4;">(change_yaml.py ./conf/train_mtlalpha1.0.yaml -a eunits=</span><span style="color:#B5CEA8;">100</span><span style="color:#D4D4D4;">)</span></span>
52
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="tips-tensorboard" tabindex="-1"><a class="header-anchor" href="#tips-tensorboard"><span>TIPS: tensorboard</span></a></h3><p>You can easily monitor effects of the config by tensorboard</p><h3 id="decoding-and-evaluation" tabindex="-1"><a class="header-anchor" href="#decoding-and-evaluation"><span>Decoding and evaluation</span></a></h3><p>decode config (<code>change_yaml.py</code> also works)</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cat espnet/egs/an4/asr1/conf/decode_ctcweight1.0.yaml</span></span>
53
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h4 id="command-line-usage" tabindex="-1"><a class="header-anchor" href="#command-line-usage"><span>Command line usage</span></a></h4><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd espnet/egs/an4/asr1; ./run.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">5</span></span>
54
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h4 id="asr-result-as-data-json" tabindex="-1"><a class="header-anchor" href="#asr-result-as-data-json"><span>ASR result as <code>data.json</code></span></a></h4><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">head -n20 espnet/egs/an4/asr1/exp/train_nodev_pytorch_train_mtlalpha1.0/decode_test_decode_ctcweight1.0_lm_word100/data.json</span></span>
55
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="recognize-speech-from-python" tabindex="-1"><a class="header-anchor" href="#recognize-speech-from-python"><span>Recognize speech from python</span></a></h3><p>Let&#39;s use ESPnet as a library and the trained model:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">ls espnet/egs/an4/asr1/exp/train_nodev_pytorch_train_mtlalpha1.0/results</span></span>
56
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h4 id="recap-load-speech-from-data-json" tabindex="-1"><a class="header-anchor" href="#recap-load-speech-from-data-json"><span>recap: load speech from data.json</span></a></h4><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> json</span></span>
57
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> matplotlib.pyplot </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> plt</span></span>
58
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> kaldiio</span></span>
59
+ <span class="line"></span>
60
+ <span class="line"><span style="color:#6A9955;"># load 10-th speech/text in data.json</span></span>
61
+ <span class="line"><span style="color:#D4D4D4;">root = </span><span style="color:#CE9178;">&quot;espnet/egs/an4/asr1&quot;</span></span>
62
+ <span class="line"><span style="color:#C586C0;">with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(root + </span><span style="color:#CE9178;">&quot;/dump/test/deltafalse/data.json&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
63
+ <span class="line"><span style="color:#D4D4D4;"> test_json = json.load(f)[</span><span style="color:#CE9178;">&quot;utts&quot;</span><span style="color:#D4D4D4;">]</span></span>
64
+ <span class="line"><span style="color:#D4D4D4;"> </span></span>
65
+ <span class="line"><span style="color:#D4D4D4;">key, info = </span><span style="color:#4EC9B0;">list</span><span style="color:#D4D4D4;">(test_json.items())[</span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;">]</span></span>
66
+ <span class="line"></span>
67
+ <span class="line"><span style="color:#6A9955;"># plot the speech feature</span></span>
68
+ <span class="line"><span style="color:#D4D4D4;">fbank = kaldiio.load_mat(info[</span><span style="color:#CE9178;">&quot;input&quot;</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#CE9178;">&quot;feat&quot;</span><span style="color:#D4D4D4;">])</span></span>
69
+ <span class="line"><span style="color:#D4D4D4;">plt.matshow(fbank.T[::-</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">])</span></span>
70
+ <span class="line"><span style="color:#D4D4D4;">plt.title(key + </span><span style="color:#CE9178;">&quot;: &quot;</span><span style="color:#D4D4D4;"> + info[</span><span style="color:#CE9178;">&quot;output&quot;</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">])</span></span>
71
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="load-model" tabindex="-1"><a class="header-anchor" href="#load-model"><span>load model</span></a></h4><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> json</span></span>
72
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> torch</span></span>
73
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> argparse</span></span>
74
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet.bin.asr_recog </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> get_parser</span></span>
75
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet.nets.pytorch_backend.e2e_asr </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> E2E</span></span>
76
+ <span class="line"></span>
77
+ <span class="line"><span style="color:#D4D4D4;">root = </span><span style="color:#CE9178;">&quot;espnet/egs/an4/asr1&quot;</span></span>
78
+ <span class="line"><span style="color:#D4D4D4;">model_dir = root + </span><span style="color:#CE9178;">&quot;/exp/train_nodev_pytorch_train_mtlalpha1.0/results&quot;</span></span>
79
+ <span class="line"></span>
80
+ <span class="line"><span style="color:#6A9955;"># load model</span></span>
81
+ <span class="line"><span style="color:#C586C0;">with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(model_dir + </span><span style="color:#CE9178;">&quot;/model.json&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
82
+ <span class="line"><span style="color:#D4D4D4;"> idim, odim, conf = json.load(f)</span></span>
83
+ <span class="line"><span style="color:#D4D4D4;">model = E2E(idim, odim, argparse.Namespace(**conf))</span></span>
84
+ <span class="line"><span style="color:#D4D4D4;">model.load_state_dict(torch.load(model_dir + </span><span style="color:#CE9178;">&quot;/model.loss.best&quot;</span><span style="color:#D4D4D4;">))</span></span>
85
+ <span class="line"><span style="color:#D4D4D4;">model.cpu().eval()</span></span>
86
+ <span class="line"></span>
87
+ <span class="line"><span style="color:#6A9955;"># load token dict</span></span>
88
+ <span class="line"><span style="color:#C586C0;">with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(root + </span><span style="color:#CE9178;">&quot;/data/lang_1char/train_nodev_units.txt&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
89
+ <span class="line"><span style="color:#D4D4D4;"> token_list = [entry.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">] </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> entry </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> f]</span></span>
90
+ <span class="line"><span style="color:#D4D4D4;">token_list.insert(</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&#39;&lt;blank&gt;&#39;</span><span style="color:#D4D4D4;">)</span></span>
91
+ <span class="line"><span style="color:#D4D4D4;">token_list.append(</span><span style="color:#CE9178;">&#39;&lt;eos&gt;&#39;</span><span style="color:#D4D4D4;">)</span></span>
92
+ <span class="line"></span>
93
+ <span class="line"><span style="color:#6A9955;"># recognize speech</span></span>
94
+ <span class="line"><span style="color:#D4D4D4;">parser = get_parser()</span></span>
95
+ <span class="line"><span style="color:#D4D4D4;">args = parser.parse_args([</span><span style="color:#CE9178;">&quot;--beam-size&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;2&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;--ctc-weight&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;1.0&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;--result-label&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;out.json&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;--model&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">])</span></span>
96
+ <span class="line"><span style="color:#D4D4D4;">result = model.recognize(fbank, args, token_list)</span></span>
97
+ <span class="line"><span style="color:#D4D4D4;">s = </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">.join(conf[</span><span style="color:#CE9178;">&quot;char_list&quot;</span><span style="color:#D4D4D4;">][y] </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> y </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> result[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#CE9178;">&quot;yseq&quot;</span><span style="color:#D4D4D4;">]).replace(</span><span style="color:#CE9178;">&quot;&lt;eos&gt;&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">).replace(</span><span style="color:#CE9178;">&quot;&lt;space&gt;&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot; &quot;</span><span style="color:#D4D4D4;">).replace(</span><span style="color:#CE9178;">&quot;&lt;blank&gt;&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">)</span></span>
98
+ <span class="line"></span>
99
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;groundtruth:&quot;</span><span style="color:#D4D4D4;">, info[</span><span style="color:#CE9178;">&quot;output&quot;</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">])</span></span>
100
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;prediction: &quot;</span><span style="color:#D4D4D4;">, s)</span></span>
101
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> os</span></span>
102
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> kaldiio</span></span>
103
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> IPython.display </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Audio</span></span>
104
+ <span class="line"></span>
105
+ <span class="line"></span>
106
+ <span class="line"><span style="color:#C586C0;">try</span><span style="color:#D4D4D4;">:</span></span>
107
+ <span class="line"><span style="color:#D4D4D4;"> d = os.getcwd()</span></span>
108
+ <span class="line"><span style="color:#D4D4D4;"> os.chdir(root)</span></span>
109
+ <span class="line"><span style="color:#D4D4D4;"> sr, wav = kaldiio.load_scp(</span><span style="color:#CE9178;">&quot;data/test/wav.scp&quot;</span><span style="color:#D4D4D4;">)[key]</span></span>
110
+ <span class="line"><span style="color:#C586C0;">finally</span><span style="color:#D4D4D4;">:</span></span>
111
+ <span class="line"><span style="color:#D4D4D4;"> os.chdir(d)</span></span>
112
+ <span class="line"><span style="color:#D4D4D4;">Audio(wav, </span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=sr)</span></span>
113
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div>`,35);function x(_,q){const n=p("ExternalLinkIcon");return t(),i("div",null,[c,s("p",null,[a("Author: "),s("a",d,[a("Shigeki Karita"),e(n)])]),D,s("ul",null,[y,s("li",null,[u,a(": almost raw "),s("a",v,[a("data prepared by Kaldi"),e(n)])]),h,s("li",null,[m,a(": speech feature binary files, e.g., "),s("a",b,[a("ark, scp"),e(n)])]),g,C,s("li",null,[s("a",E,[a("steps/"),e(n)]),a(", "),s("a",k,[a("utils/"),e(n)]),a(": Kaldi's helper scripts")])]),f])}const w=o(r,[["render",x],["__file","asr_cli.html.vue"]]),F=JSON.parse('{"path":"/espnet2/asr/asr_cli.html","title":"Speech Recognition (Recipe)","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Abstract","slug":"abstract","link":"#abstract","children":[]},{"level":2,"title":"Installation","slug":"installation","link":"#installation","children":[]},{"level":2,"title":"ESPnet command line usage (espnet/egs/xxx)","slug":"espnet-command-line-usage-espnet-egs-xxx","link":"#espnet-command-line-usage-espnet-egs-xxx","children":[{"level":3,"title":"Stage 0 - 2 Data preparation","slug":"stage-0-2-data-preparation","link":"#stage-0-2-data-preparation","children":[]}]},{"level":2,"title":"Kaldi-style directory structure","slug":"kaldi-style-directory-structure","link":"#kaldi-style-directory-structure","children":[{"level":3,"title":"TIPS: essential files in data preparation","slug":"tips-essential-files-in-data-preparation","link":"#tips-essential-files-in-data-preparation","children":[]},{"level":3,"title":"raw text 
list","slug":"raw-text-list","link":"#raw-text-list","children":[]},{"level":3,"title":"TIPS: explore datasets with data.json","slug":"tips-explore-datasets-with-data-json","link":"#tips-explore-datasets-with-data-json","children":[]},{"level":3,"title":"Stage 3 - 4 NN Training","slug":"stage-3-4-nn-training","link":"#stage-3-4-nn-training","children":[]},{"level":3,"title":"TIPS: change_yaml.py","slug":"tips-change-yaml-py","link":"#tips-change-yaml-py","children":[]},{"level":3,"title":"TIPS: tensorboard","slug":"tips-tensorboard","link":"#tips-tensorboard","children":[]},{"level":3,"title":"Decoding and evaluation","slug":"decoding-and-evaluation","link":"#decoding-and-evaluation","children":[]},{"level":3,"title":"Recognize speech from python","slug":"recognize-speech-from-python","link":"#recognize-speech-from-python","children":[]}]}],"git":{},"filePathRelative":"espnet2/asr/asr_cli.md"}');export{w as comp,F as data};
assets/asr_library.html-rEQwKTMV.js ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as l,r as p,o as t,c as r,a as n,d as s,b as e,e as i}from"./app-DTS6SjJz.js";const d={},c=n("h1",{id:"speech-recognition-library",tabindex:"-1"},[n("a",{class:"header-anchor",href:"#speech-recognition-library"},[n("span",null,"Speech Recognition (Library)")])],-1),o=n("p",null,"This example shows you a practical ASR example using ESPnet as a command line interface and library.",-1),u=n("p",null,"See also",-1),v={href:"https://colab.research.google.com/github/espnet/notebook/blob/master/asr_library.ipynb",target:"_blank",rel:"noopener noreferrer"},m=n("li",null,"documetation https://espnet.github.io/espnet/",-1),b=n("li",null,"github https://github.com/espnet",-1),h={href:"https://github.com/ShigekiKarita",target:"_blank",rel:"noopener noreferrer"},g=i(`<h2 id="installation" tabindex="-1"><a class="header-anchor" href="#installation"><span>Installation</span></a></h2><p>ESPnet depends on Kaldi ASR toolkit and Warp-CTC. This cell will take a few minutes.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># TODO(karita): put these lines in ./espnet/tools/setup_colab.sh</span></span>
2
+ <span class="line"><span># OS setup</span></span>
3
+ <span class="line"><span>!sudo apt-get install bc tree</span></span>
4
+ <span class="line"><span>!cat /etc/os-release</span></span>
5
+ <span class="line"><span></span></span>
6
+ <span class="line"><span># espnet setup</span></span>
7
+ <span class="line"><span>!git clone https://github.com/espnet/espnet</span></span>
8
+ <span class="line"><span>!cd espnet; pip install -e .</span></span>
9
+ <span class="line"><span>!mkdir espnet/tools/venv/bin; touch espnet/tools/venv/bin/activate</span></span>
10
+ <span class="line"><span></span></span>
11
+ <span class="line"><span># warp ctc setup</span></span>
12
+ <span class="line"><span>!git clone https://github.com/espnet/warp-ctc -b pytorch-1.1</span></span>
13
+ <span class="line"><span>!cd warp-ctc &amp;&amp; mkdir build &amp;&amp; cd build &amp;&amp; cmake .. &amp;&amp; make -j4</span></span>
14
+ <span class="line"><span>!cd warp-ctc/pytorch_binding &amp;&amp; python setup.py install </span></span>
15
+ <span class="line"><span></span></span>
16
+ <span class="line"><span># kaldi setup</span></span>
17
+ <span class="line"><span>!cd ./espnet/tools; git clone https://github.com/kaldi-asr/kaldi</span></span>
18
+ <span class="line"><span>!echo &quot;&quot; &gt; ./espnet/tools/kaldi/tools/extras/check_dependencies.sh # ignore check</span></span>
19
+ <span class="line"><span>!chmod +x ./espnet/tools/kaldi/tools/extras/check_dependencies.sh</span></span>
20
+ <span class="line"><span>!cd ./espnet/tools/kaldi/tools; make sph2pipe sclite</span></span>
21
+ <span class="line"><span>!rm -rf espnet/tools/kaldi/tools/python</span></span>
22
+ <span class="line"><span>![ ! -e ubuntu16-featbin.tar.gz ] &amp;&amp; wget https://18-198329952-gh.circle-artifacts.com/0/home/circleci/repo/ubuntu16-featbin.tar.gz</span></span>
23
+ <span class="line"><span>!tar -xf ./ubuntu16-featbin.tar.gz</span></span>
24
+ <span class="line"><span>!cp featbin/* espnet/tools/kaldi/src/featbin/</span></span>
25
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="espnet-data-preparation" tabindex="-1"><a class="header-anchor" href="#espnet-data-preparation"><span>ESPnet data preparation</span></a></h2><p>You can use the end-to-end script <code>run.sh</code> for reproducing systems reported in <code>espnet/egs/*/asr1/RESULTS.md</code>. Typically, we organize <code>run.sh</code> with several stages:</p><ol start="0"><li>Data download (if available)</li><li>Kaldi-style data preparation</li><li>Dump useful data for traning (e.g., JSON, HDF5, etc)</li><li>Lanuage model training</li><li>ASR model training</li><li>Decoding and evaluation</li></ol><p>For example, if you add <code>--stop-stage 2</code>, you can stop the script before neural network training.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cd espnet/egs/an4/asr1; ./run.sh --ngpu 1 --stop-stage 2</span></span>
26
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h2 id="kaldi-style-directories" tabindex="-1"><a class="header-anchor" href="#kaldi-style-directories"><span>Kaldi-style directories</span></a></h2><p>Always we organize each recipe placed in <code>egs/xxx/asr1</code> in Kaldi way. For example, the important directories are:</p>`,10),_=n("li",null,[n("code",null,"conf/"),s(": kaldi configurations, e.g., speech feature")],-1),k=n("code",null,"data/",-1),f={href:"https://kaldi-asr.org/doc/data_prep.html",target:"_blank",rel:"noopener noreferrer"},q=n("li",null,[n("code",null,"exp/"),s(": intermidiate files through experiments, e.g., log files, model parameters")],-1),x=n("code",null,"fbank/",-1),y={href:"https://kaldi-asr.org/doc/io.html",target:"_blank",rel:"noopener noreferrer"},E=n("li",null,[n("code",null,"dump/"),s(": ESPnet meta data for tranining, e.g., json, hdf5")],-1),w=n("li",null,[n("code",null,"local/"),s(": corpus specific data preparation scripts")],-1),D={href:"https://github.com/kaldi-asr/kaldi/tree/master/egs/wsj/s5/steps",target:"_blank",rel:"noopener noreferrer"},S={href:"https://github.com/kaldi-asr/kaldi/tree/master/egs/wsj/s5/utils",target:"_blank",rel:"noopener noreferrer"},j=i(`<div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!tree -L 1</span></span>
27
+ <span class="line"><span>!ls data/train</span></span>
28
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="espnet-as-a-library" tabindex="-1"><a class="header-anchor" href="#espnet-as-a-library"><span>ESPnet as a library</span></a></h2><p>Here we use ESPnet as a library to create a simple Python snippet for speech recognition. ESPnet &#39;s training script&#39;<code>asr_train.py</code> has three parts:</p><ol><li>Load train/dev dataset</li><li>Create minibatches</li><li>Build neural networks</li><li>Update neural networks by iterating datasets</li></ol><p>Let&#39;s implement these procedures from scratch!</p><h3 id="load-train-dev-dataset-1-4" tabindex="-1"><a class="header-anchor" href="#load-train-dev-dataset-1-4"><span>Load train/dev dataset (1/4)</span></a></h3><p>First, we will check how <code>run.sh</code> organized the JSON files and load the pair of the speech feature and its transcription.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import json</span></span>
29
+ <span class="line"><span>import matplotlib.pyplot as plt</span></span>
30
+ <span class="line"><span>import kaldiio</span></span>
31
+ <span class="line"><span></span></span>
32
+ <span class="line"><span>root = &quot;espnet/egs/an4/asr1&quot;</span></span>
33
+ <span class="line"><span>with open(root + &quot;/dump/train_nodev/deltafalse/data.json&quot;, &quot;r&quot;) as f:</span></span>
34
+ <span class="line"><span> train_json = json.load(f)[&quot;utts&quot;]</span></span>
35
+ <span class="line"><span>with open(root + &quot;/dump/train_dev/deltafalse/data.json&quot;, &quot;r&quot;) as f:</span></span>
36
+ <span class="line"><span> dev_json = json.load(f)[&quot;utts&quot;]</span></span>
37
+ <span class="line"><span> </span></span>
38
+ <span class="line"><span># the first training data for speech recognition</span></span>
39
+ <span class="line"><span>key, info = next(iter(train_json.items()))</span></span>
40
+ <span class="line"><span></span></span>
41
+ <span class="line"><span># plot the 80-dim fbank + 3-dim pitch speech feature</span></span>
42
+ <span class="line"><span>fbank = kaldiio.load_mat(info[&quot;input&quot;][0][&quot;feat&quot;])</span></span>
43
+ <span class="line"><span>plt.matshow(fbank.T[::-1])</span></span>
44
+ <span class="line"><span>plt.title(key + &quot;: &quot; + info[&quot;output&quot;][0][&quot;text&quot;])</span></span>
45
+ <span class="line"><span></span></span>
46
+ <span class="line"><span># print the key-value pair</span></span>
47
+ <span class="line"><span>key, info</span></span>
48
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="create-minibatches-2-4" tabindex="-1"><a class="header-anchor" href="#create-minibatches-2-4"><span>Create minibatches (2/4)</span></a></h3><p>To parallelize neural network training, we create minibatches that containes several sequence pairs by splitting datasets.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from espnet.utils.training.batchfy import make_batchset</span></span>
49
+ <span class="line"><span></span></span>
50
+ <span class="line"><span>batch_size = 32</span></span>
51
+ <span class="line"><span>trainset = make_batchset(train_json, batch_size)</span></span>
52
+ <span class="line"><span>devset = make_batchset(dev_json, batch_size)</span></span>
53
+ <span class="line"><span>assert len(devset[0]) == batch_size</span></span>
54
+ <span class="line"><span>devset[0][:3]</span></span>
55
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="build-neural-networks-3-4" tabindex="-1"><a class="header-anchor" href="#build-neural-networks-3-4"><span>Build neural networks (3/4)</span></a></h3>`,12),z={href:"https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf",target:"_blank",rel:"noopener noreferrer"},T=i(`<p>NOTE: You can also use your custom model in command line tools as <code>asr_train.py --model-module your_module:YourModel</code></p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import argparse</span></span>
56
+ <span class="line"><span>from espnet.bin.asr_train import get_parser</span></span>
57
+ <span class="line"><span>from espnet.nets.pytorch_backend.e2e_asr import E2E</span></span>
58
+ <span class="line"><span></span></span>
59
+ <span class="line"><span>parser = get_parser()</span></span>
60
+ <span class="line"><span>parser = E2E.add_arguments(parser)</span></span>
61
+ <span class="line"><span>config = parser.parse_args([</span></span>
62
+ <span class="line"><span> &quot;--mtlalpha&quot;, &quot;0.0&quot;, # weight for cross entropy and CTC loss</span></span>
63
+ <span class="line"><span> &quot;--outdir&quot;, &quot;out&quot;, &quot;--dict&quot;, &quot;&quot;]) # TODO: allow no arg</span></span>
64
+ <span class="line"><span></span></span>
65
+ <span class="line"><span>idim = info[&quot;input&quot;][0][&quot;shape&quot;][1]</span></span>
66
+ <span class="line"><span>odim = info[&quot;output&quot;][0][&quot;shape&quot;][1]</span></span>
67
+ <span class="line"><span>setattr(config, &quot;char_list&quot;, [])</span></span>
68
+ <span class="line"><span>model = E2E(idim, odim, config)</span></span>
69
+ <span class="line"><span>model</span></span>
70
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="update-neural-networks-by-iterating-datasets-4-4" tabindex="-1"><a class="header-anchor" href="#update-neural-networks-by-iterating-datasets-4-4"><span>Update neural networks by iterating datasets (4/4)</span></a></h3><p>Finaly, we got the training part.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import numpy</span></span>
71
+ <span class="line"><span>import torch</span></span>
72
+ <span class="line"><span>from torch.nn.utils.rnn import pad_sequence</span></span>
73
+ <span class="line"><span>from torch.nn.utils.clip_grad import clip_grad_norm_</span></span>
74
+ <span class="line"><span>from torch.utils.data import DataLoader</span></span>
75
+ <span class="line"><span></span></span>
76
+ <span class="line"><span>def collate(minibatch):</span></span>
77
+ <span class="line"><span> fbanks = []</span></span>
78
+ <span class="line"><span> tokens = []</span></span>
79
+ <span class="line"><span> for key, info in minibatch[0]:</span></span>
80
+ <span class="line"><span> fbanks.append(torch.tensor(kaldiio.load_mat(info[&quot;input&quot;][0][&quot;feat&quot;])))</span></span>
81
+ <span class="line"><span> tokens.append(torch.tensor([int(s) for s in info[&quot;output&quot;][0][&quot;tokenid&quot;].split()]))</span></span>
82
+ <span class="line"><span> ilens = torch.tensor([x.shape[0] for x in fbanks])</span></span>
83
+ <span class="line"><span> return pad_sequence(fbanks, batch_first=True), ilens, pad_sequence(tokens, batch_first=True)</span></span>
84
+ <span class="line"><span></span></span>
85
+ <span class="line"><span>train_loader = DataLoader(trainset, collate_fn=collate, shuffle=True, pin_memory=True)</span></span>
86
+ <span class="line"><span>dev_loader = DataLoader(devset, collate_fn=collate, pin_memory=True)</span></span>
87
+ <span class="line"><span>model.cuda()</span></span>
88
+ <span class="line"><span>optim = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.98))</span></span>
89
+ <span class="line"><span></span></span>
90
+ <span class="line"><span>n_iter = len(trainset)</span></span>
91
+ <span class="line"><span>n_epoch = 10</span></span>
92
+ <span class="line"><span>total_iter = n_iter * n_epoch</span></span>
93
+ <span class="line"><span>train_acc = []</span></span>
94
+ <span class="line"><span>valid_acc = []</span></span>
95
+ <span class="line"><span>for epoch in range(n_epoch):</span></span>
96
+ <span class="line"><span> # training</span></span>
97
+ <span class="line"><span> acc = []</span></span>
98
+ <span class="line"><span> model.train()</span></span>
99
+ <span class="line"><span> for data in train_loader:</span></span>
100
+ <span class="line"><span> loss = model(*[d.cuda() for d in data])</span></span>
101
+ <span class="line"><span> optim.zero_grad()</span></span>
102
+ <span class="line"><span> loss.backward()</span></span>
103
+ <span class="line"><span> acc.append(model.acc)</span></span>
104
+ <span class="line"><span> norm = clip_grad_norm_(model.parameters(), 10.0)</span></span>
105
+ <span class="line"><span> optim.step()</span></span>
106
+ <span class="line"><span> train_acc.append(numpy.mean(acc))</span></span>
107
+ <span class="line"><span></span></span>
108
+ <span class="line"><span> # validation</span></span>
109
+ <span class="line"><span> acc = []</span></span>
110
+ <span class="line"><span> model.eval()</span></span>
111
+ <span class="line"><span> for data in dev_loader:</span></span>
112
+ <span class="line"><span> model(*[d.cuda() for d in data])</span></span>
113
+ <span class="line"><span> acc.append(model.acc)</span></span>
114
+ <span class="line"><span> valid_acc.append(numpy.mean(acc))</span></span>
115
+ <span class="line"><span> print(f&quot;epoch: {epoch}, train acc: {train_acc[-1]:.3f}, dev acc: {valid_acc[-1]:.3f}&quot;)</span></span>
116
+ <span class="line"><span></span></span>
117
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import matplotlib.pyplot as plt</span></span>
118
+ <span class="line"><span></span></span>
119
+ <span class="line"><span>plt.plot(range(len(train_acc)), train_acc, label=&quot;train acc&quot;)</span></span>
120
+ <span class="line"><span>plt.plot(range(len(valid_acc)), valid_acc, label=&quot;dev acc&quot;)</span></span>
121
+ <span class="line"><span>plt.grid()</span></span>
122
+ <span class="line"><span>plt.legend()</span></span>
123
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>torch.save(model.state_dict(), &quot;best.pt&quot;)</span></span>
124
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="recognize-speech" tabindex="-1"><a class="header-anchor" href="#recognize-speech"><span>Recognize speech</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import json</span></span>
125
+ <span class="line"><span>import matplotlib.pyplot as plt</span></span>
126
+ <span class="line"><span>import kaldiio</span></span>
127
+ <span class="line"><span>from espnet.bin.asr_recog import get_parser</span></span>
128
+ <span class="line"><span></span></span>
129
+ <span class="line"><span># load data</span></span>
130
+ <span class="line"><span>root = &quot;espnet/egs/an4/asr1&quot;</span></span>
131
+ <span class="line"><span>with open(root + &quot;/dump/test/deltafalse/data.json&quot;, &quot;r&quot;) as f:</span></span>
132
+ <span class="line"><span> test_json = json.load(f)[&quot;utts&quot;]</span></span>
133
+ <span class="line"><span> </span></span>
134
+ <span class="line"><span>key, info = list(test_json.items())[10]</span></span>
135
+ <span class="line"><span></span></span>
136
+ <span class="line"><span># plot the 80-dim fbank + 3-dim pitch speech feature</span></span>
137
+ <span class="line"><span>fbank = kaldiio.load_mat(info[&quot;input&quot;][0][&quot;feat&quot;])</span></span>
138
+ <span class="line"><span>plt.matshow(fbank.T[::-1])</span></span>
139
+ <span class="line"><span>plt.title(key + &quot;: &quot; + info[&quot;output&quot;][0][&quot;text&quot;])</span></span>
140
+ <span class="line"><span></span></span>
141
+ <span class="line"><span># load token dict</span></span>
142
+ <span class="line"><span>with open(root + &quot;/data/lang_1char/train_nodev_units.txt&quot;, &quot;r&quot;) as f:</span></span>
143
+ <span class="line"><span> token_list = [entry.split()[0] for entry in f]</span></span>
144
+ <span class="line"><span>token_list.insert(0, &#39;&lt;blank&gt;&#39;)</span></span>
145
+ <span class="line"><span>token_list.append(&#39;&lt;eos&gt;&#39;)</span></span>
146
+ <span class="line"><span></span></span>
147
+ <span class="line"><span># recognize speech</span></span>
148
+ <span class="line"><span>parser = get_parser()</span></span>
149
+ <span class="line"><span>args = parser.parse_args([</span></span>
150
+ <span class="line"><span> &quot;--beam-size&quot;, &quot;1&quot;,</span></span>
151
+ <span class="line"><span> &quot;--ctc-weight&quot;, &quot;0&quot;,</span></span>
152
+ <span class="line"><span> &quot;--result-label&quot;, &quot;out.json&quot;,</span></span>
153
+ <span class="line"><span> &quot;--model&quot;, &quot;&quot;</span></span>
154
+ <span class="line"><span>])</span></span>
155
+ <span class="line"><span>model.cpu()</span></span>
156
+ <span class="line"><span>model.eval()</span></span>
157
+ <span class="line"><span></span></span>
158
+ <span class="line"><span>def to_str(result):</span></span>
159
+ <span class="line"><span> return &quot;&quot;.join(token_list[y] for y in result[0][&quot;yseq&quot;]) \\</span></span>
160
+ <span class="line"><span> .replace(&quot;&lt;eos&gt;&quot;, &quot;&quot;).replace(&quot;&lt;space&gt;&quot;, &quot; &quot;).replace(&quot;&lt;blank&gt;&quot;, &quot;&quot;)</span></span>
161
+ <span class="line"><span></span></span>
162
+ <span class="line"><span>print(&quot;groundtruth:&quot;, info[&quot;output&quot;][0][&quot;text&quot;])</span></span>
163
+ <span class="line"><span>print(&quot;prediction: &quot;, to_str(model.recognize(fbank, args, token_list)))</span></span>
164
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span></span></span>
165
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div>`,10);function L(P,K){const a=p("ExternalLinkIcon");return t(),r("div",null,[c,o,u,n("ul",null,[n("li",null,[s("run in "),n("a",v,[s("colab"),e(a)])]),m,b]),n("p",null,[s("Author: "),n("a",h,[s("Shigeki Karita"),e(a)])]),g,n("ul",null,[_,n("li",null,[k,s(": almost raw "),n("a",f,[s("data prepared by Kaldi"),e(a)])]),q,n("li",null,[x,s(": speech feature binary files, e.g., "),n("a",y,[s("ark, scp"),e(a)])]),E,w,n("li",null,[n("a",D,[s("steps/"),e(a)]),s(", "),n("a",S,[s("utils/"),e(a)]),s(": Kaldi's helper scripts")])]),j,n("p",null,[s("For simplicity, we use a predefined model: "),n("a",z,[s("Transformer"),e(a)]),s(".")]),T])}const R=l(d,[["render",L],["__file","asr_library.html.vue"]]),C=JSON.parse('{"path":"/espnet2/asr/asr_library.html","title":"Speech Recognition (Library)","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Installation","slug":"installation","link":"#installation","children":[]},{"level":2,"title":"ESPnet data preparation","slug":"espnet-data-preparation","link":"#espnet-data-preparation","children":[]},{"level":2,"title":"Kaldi-style directories","slug":"kaldi-style-directories","link":"#kaldi-style-directories","children":[]},{"level":2,"title":"ESPnet as a library","slug":"espnet-as-a-library","link":"#espnet-as-a-library","children":[{"level":3,"title":"Load train/dev dataset (1/4)","slug":"load-train-dev-dataset-1-4","link":"#load-train-dev-dataset-1-4","children":[]},{"level":3,"title":"Create minibatches (2/4)","slug":"create-minibatches-2-4","link":"#create-minibatches-2-4","children":[]},{"level":3,"title":"Build neural networks (3/4)","slug":"build-neural-networks-3-4","link":"#build-neural-networks-3-4","children":[]},{"level":3,"title":"Update neural networks by iterating datasets 
(4/4)","slug":"update-neural-networks-by-iterating-datasets-4-4","link":"#update-neural-networks-by-iterating-datasets-4-4","children":[]},{"level":3,"title":"Recognize speech","slug":"recognize-speech","link":"#recognize-speech","children":[]}]}],"git":{},"filePathRelative":"espnet2/asr/asr_library.md"}');export{R as comp,C as data};
assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as s,o as n,c as a,e as l}from"./app-DTS6SjJz.js";const e={},o=l(`<h1 id="espnet-2-pass-slu-demonstration" tabindex="-1"><a class="header-anchor" href="#espnet-2-pass-slu-demonstration"><span>ESPNET 2 pass SLU Demonstration</span></a></h1><p>This notebook provides a demonstration of the Two Pass End-to-End Spoken Language Understanding model</p><p>Paper Link: https://arxiv.org/abs/2207.06670</p><p>ESPnet2-SLU: https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/slu1</p><p>Author: Siddhant Arora</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">! python -m pip install transformers</span></span>
2
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git clone https://github.com/espnet/espnet /espnet</span></span>
3
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">pip install /espnet</span></span>
4
+ <span class="line"><span style="color:#D4D4D4;">%pip install -q espnet_model_zoo</span></span>
5
+ <span class="line"><span style="color:#D4D4D4;">%pip install fairseq@git+https://github.com//pytorch/fairseq.git@f2146bdc7abf293186de9449bfa2272775e39e1d</span><span style="color:#6A9955;">#egg=fairseq</span></span>
6
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="download-audio-file" tabindex="-1"><a class="header-anchor" href="#download-audio-file"><span>Download Audio File</span></a></h2><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># !gdown --id 1LxoxCoFgx3u8CvKb1loybGFtArKKPcAH -O /content/audio_file.wav</span></span>
7
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">gdown </span><span style="color:#F44747;">--</span><span style="color:#DCDCAA;">id</span><span style="color:#F44747;"> 18ANT62ittt7Ai2E8bQRlvT0ZVXXsf1eE</span><span style="color:#D4D4D4;"> -O /content/audio_file.wav</span></span>
8
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> os</span></span>
9
+ <span class="line"></span>
10
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> soundfile</span></span>
11
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> IPython.display </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> display, Audio</span></span>
12
+ <span class="line"><span style="color:#D4D4D4;">mixwav_mc, sr = soundfile.read(</span><span style="color:#CE9178;">&quot;/content/audio_file.wav&quot;</span><span style="color:#D4D4D4;">)</span></span>
13
+ <span class="line"><span style="color:#D4D4D4;">display(Audio(mixwav_mc.T, </span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=sr))</span></span>
14
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="download-and-load-pretrained-first-pass-model" tabindex="-1"><a class="header-anchor" href="#download-and-load-pretrained-first-pass-model"><span>Download and Load pretrained First Pass Model</span></a></h2><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git lfs clone https://huggingface.co/espnet/siddhana_slurp_new_asr_train_asr_conformer_raw_en_word_valid.acc.ave_10best /content/slurp_first_pass_model</span></span>
15
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.asr_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
16
+ <span class="line"><span style="color:#D4D4D4;">speech2text_slurp = Speech2Text.from_pretrained(</span></span>
17
+ <span class="line"><span style="color:#9CDCFE;"> asr_train_config</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;/content/slurp_first_pass_model/exp/asr_train_asr_conformer_raw_en_word/config.yaml&quot;</span><span style="color:#D4D4D4;">,</span></span>
18
+ <span class="line"><span style="color:#9CDCFE;"> asr_model_file</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;/content/slurp_first_pass_model/exp/asr_train_asr_conformer_raw_en_word/valid.acc.ave_10best.pth&quot;</span><span style="color:#D4D4D4;">,</span></span>
19
+ <span class="line"><span style="color:#9CDCFE;"> nbest</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
20
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
21
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">nbests_orig = speech2text_slurp(mixwav_mc)</span></span>
22
+ <span class="line"><span style="color:#D4D4D4;">text, *_ = nbests_orig[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
23
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> text_normalizer</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">sub_word_transcript</span><span style="color:#D4D4D4;">):</span></span>
24
+ <span class="line"><span style="color:#D4D4D4;"> transcript = sub_word_transcript[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].replace(</span><span style="color:#CE9178;">&quot;▁&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">)</span></span>
25
+ <span class="line"><span style="color:#C586C0;"> for</span><span style="color:#D4D4D4;"> sub_word </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> sub_word_transcript[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:]:</span></span>
26
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#CE9178;"> &quot;▁&quot;</span><span style="color:#569CD6;"> in</span><span style="color:#D4D4D4;"> sub_word:</span></span>
27
+ <span class="line"><span style="color:#D4D4D4;"> transcript = transcript + </span><span style="color:#CE9178;">&quot; &quot;</span><span style="color:#D4D4D4;"> + sub_word.replace(</span><span style="color:#CE9178;">&quot;▁&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">)</span></span>
28
+ <span class="line"><span style="color:#C586C0;"> else</span><span style="color:#D4D4D4;">:</span></span>
29
+ <span class="line"><span style="color:#D4D4D4;"> transcript = transcript + sub_word</span></span>
30
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> transcript</span></span>
31
+ <span class="line"><span style="color:#D4D4D4;">intent_text=</span><span style="color:#CE9178;">&quot;{scenario: &quot;</span><span style="color:#D4D4D4;">+text.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].split(</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]+</span><span style="color:#CE9178;">&quot;, action: &quot;</span><span style="color:#D4D4D4;">+</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">.join(text.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].split(</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:])+</span><span style="color:#CE9178;">&quot;}&quot;</span></span>
32
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;INTENT: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">intent_text</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
33
+ <span class="line"><span style="color:#D4D4D4;">transcript=text_normalizer(text.split()[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:])</span></span>
34
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;ASR hypothesis: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">transcript</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
35
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;First pass SLU model fails to predict the correct action.&quot;</span><span style="color:#D4D4D4;">)</span></span>
36
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="download-and-load-pretrained-second-pass-model" tabindex="-1"><a class="header-anchor" href="#download-and-load-pretrained-second-pass-model"><span>Download and Load pretrained Second Pass Model</span></a></h2><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git lfs clone https://huggingface.co/espnet/slurp_slu_2pass /content/slurp_second_pass_model</span></span>
37
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.slu_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Understand</span></span>
38
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> transformers </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> AutoModel, AutoTokenizer</span></span>
39
+ <span class="line"><span style="color:#D4D4D4;">speech2text_second_pass_slurp = Speech2Understand.from_pretrained(</span></span>
40
+ <span class="line"><span style="color:#9CDCFE;"> slu_train_config</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;/content/slurp_second_pass_model/exp/slu_train_asr_bert_conformer_deliberation_raw_en_word/config.yaml&quot;</span><span style="color:#D4D4D4;">,</span></span>
41
+ <span class="line"><span style="color:#9CDCFE;"> slu_model_file</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;/content/slurp_second_pass_model/exp/slu_train_asr_bert_conformer_deliberation_raw_en_word/valid.acc.ave_10best.pth&quot;</span><span style="color:#D4D4D4;">,</span></span>
42
+ <span class="line"><span style="color:#9CDCFE;"> nbest</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
43
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
44
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.tasks.slu </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> SLUTask</span></span>
45
+ <span class="line"><span style="color:#D4D4D4;">preprocess_fn=SLUTask.build_preprocess_fn(</span></span>
46
+ <span class="line"><span style="color:#D4D4D4;"> speech2text_second_pass_slurp.asr_train_args, </span><span style="color:#569CD6;">False</span></span>
47
+ <span class="line"><span style="color:#D4D4D4;"> )</span></span>
48
+ <span class="line"></span>
49
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> numpy </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> np</span></span>
50
+ <span class="line"><span style="color:#D4D4D4;">transcript = preprocess_fn.text_cleaner(transcript)</span></span>
51
+ <span class="line"><span style="color:#D4D4D4;">tokens = preprocess_fn.transcript_tokenizer.text2tokens(transcript)</span></span>
52
+ <span class="line"><span style="color:#D4D4D4;">text_ints = np.array(preprocess_fn.transcript_token_id_converter.tokens2ids(tokens), </span><span style="color:#9CDCFE;">dtype</span><span style="color:#D4D4D4;">=np.int64)</span></span>
53
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> torch</span></span>
54
+ <span class="line"><span style="color:#D4D4D4;">nbests = speech2text_second_pass_slurp(mixwav_mc,torch.tensor(text_ints))</span></span>
55
+ <span class="line"><span style="color:#D4D4D4;">text1, *_ = nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
56
+ <span class="line"><span style="color:#D4D4D4;">intent_text=</span><span style="color:#CE9178;">&quot;{scenario: &quot;</span><span style="color:#D4D4D4;">+text1.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].split(</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]+</span><span style="color:#CE9178;">&quot;, action: &quot;</span><span style="color:#D4D4D4;">+</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">.join(text1.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].split(</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:])+</span><span style="color:#CE9178;">&quot;}&quot;</span></span>
57
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;INTENT: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">intent_text</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
58
+ <span class="line"><span style="color:#D4D4D4;">transcript=text_normalizer(text1.split()[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:])</span></span>
59
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;ASR hypothesis: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">transcript</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
60
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;Second pass SLU model successfully recognizes the correct action.&quot;</span><span style="color:#D4D4D4;">)</span></span>
61
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div>`,19),p=[o];function t(r,i){return n(),a("div",null,p)}const d=s(e,[["render",t],["__file","espnet2_2pass_slu_demo.html.vue"]]),D=JSON.parse('{"path":"/espnet2/slu/espnet2_2pass_slu_demo.html","title":"ESPNET 2 pass SLU Demonstration","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Download Audio File","slug":"download-audio-file","link":"#download-audio-file","children":[]},{"level":2,"title":"Download and Load pretrained First Pass Model","slug":"download-and-load-pretrained-first-pass-model","link":"#download-and-load-pretrained-first-pass-model","children":[]},{"level":2,"title":"Download and Load pretrained Second Pass Model","slug":"download-and-load-pretrained-second-pass-model","link":"#download-and-load-pretrained-second-pass-model","children":[]}],"git":{},"filePathRelative":"espnet2/slu/espnet2_2pass_slu_demo.md"}');export{d as comp,D as data};
assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as p,r as o,o as i,c as r,a as s,d as n,b as l,e}from"./app-DTS6SjJz.js";const t={},c=s("h1",{id:"espnet2-asr-realtime-demonstration",tabindex:"-1"},[s("a",{class:"header-anchor",href:"#espnet2-asr-realtime-demonstration"},[s("span",null,"ESPnet2-ASR realtime demonstration")])],-1),d=s("p",null,"This notebook provides a demonstration of the realtime E2E-ASR using ESPnet2-ASR.",-1),D=s("ul",null,[s("li",null,"ESPnet2-ASR: https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/asr1")],-1),y={href:"https://github.com/ftshijt",target:"_blank",rel:"noopener noreferrer"},u=e(`<div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># </span><span style="color:#569CD6;">NOTE</span><span style="color:#6A9955;">: pip shows imcompatible errors due to preinstalled libraries but you do not need to care</span></span>
2
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">pip install -q espnet==</span><span style="color:#B5CEA8;">0.10</span><span style="color:#D4D4D4;">.0</span></span>
3
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">pip install -q espnet_model_zoo</span></span>
4
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="asr-model-demo" tabindex="-1"><a class="header-anchor" href="#asr-model-demo"><span>ASR model demo</span></a></h2><h3 id="model-selection" tabindex="-1"><a class="header-anchor" href="#model-selection"><span>Model Selection</span></a></h3>`,3),v={href:"https://github.com/espnet/espnet_model_zoo/blob/master/espnet_model_zoo/table.csv",target:"_blank",rel:"noopener noreferrer"},m=e(`<p>In this demonstration, we will show English, Japanese, Spanish, Mandrain, Multilingual ASR model, respectively</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;">#@title Choose English ASR model { run: &quot;auto&quot; }</span></span>
5
+ <span class="line"></span>
6
+ <span class="line"><span style="color:#D4D4D4;">lang = </span><span style="color:#CE9178;">&#39;en&#39;</span></span>
7
+ <span class="line"><span style="color:#D4D4D4;">fs = </span><span style="color:#B5CEA8;">16000</span><span style="color:#6A9955;"> #@param {type:&quot;integer&quot;}</span></span>
8
+ <span class="line"><span style="color:#D4D4D4;">tag = </span><span style="color:#CE9178;">&#39;Shinji Watanabe/spgispeech_asr_train_asr_conformer6_n_fft512_hop_length256_raw_en_unnorm_bpe5000_valid.acc.ave&#39;</span><span style="color:#6A9955;"> #@param [&quot;Shinji Watanabe/spgispeech_asr_train_asr_conformer6_n_fft512_hop_length256_raw_en_unnorm_bpe5000_valid.acc.ave&quot;, &quot;kamo-naoyuki/librispeech_asr_train_asr_conformer6_n_fft512_hop_length256_raw_en_bpe5000_scheduler_confwarmup_steps40000_optim_conflr0.0025_sp_valid.acc.ave&quot;] {type:&quot;string&quot;}</span></span>
9
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;">#@title Choose Japanese ASR model { run: &quot;auto&quot; }</span></span>
10
+ <span class="line"></span>
11
+ <span class="line"><span style="color:#D4D4D4;">lang = </span><span style="color:#CE9178;">&#39;ja&#39;</span></span>
12
+ <span class="line"><span style="color:#D4D4D4;">fs = </span><span style="color:#B5CEA8;">16000</span><span style="color:#6A9955;"> #@param {type:&quot;integer&quot;}</span></span>
13
+ <span class="line"><span style="color:#D4D4D4;">tag = </span><span style="color:#CE9178;">&#39;Shinji Watanabe/laborotv_asr_train_asr_conformer2_latest33_raw_char_sp_valid.acc.ave&#39;</span><span style="color:#6A9955;"> #@param [&quot;Shinji Watanabe/laborotv_asr_train_asr_conformer2_latest33_raw_char_sp_valid.acc.ave&quot;] {type:&quot;string&quot;}</span></span>
14
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;">#@title Choose Spanish ASR model { run: &quot;auto&quot; }</span></span>
15
+ <span class="line"></span>
16
+ <span class="line"><span style="color:#D4D4D4;">lang = </span><span style="color:#CE9178;">&#39;es&#39;</span></span>
17
+ <span class="line"><span style="color:#D4D4D4;">fs = </span><span style="color:#B5CEA8;">16000</span><span style="color:#6A9955;"> #@param {type:&quot;integer&quot;}</span></span>
18
+ <span class="line"><span style="color:#D4D4D4;">tag = </span><span style="color:#CE9178;">&#39;ftshijt/mls_asr_transformer_valid.acc.best&#39;</span><span style="color:#6A9955;"> #@param [&quot;ftshijt/mls_asr_transformer_valid.acc.best&quot;] {type:&quot;string&quot;}</span></span>
19
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;">#@title Choose Mandrain ASR model { run: &quot;auto&quot; }</span></span>
20
+ <span class="line"></span>
21
+ <span class="line"><span style="color:#D4D4D4;">lang = </span><span style="color:#CE9178;">&#39;zh&#39;</span></span>
22
+ <span class="line"><span style="color:#D4D4D4;">fs = </span><span style="color:#B5CEA8;">16000</span><span style="color:#6A9955;"> #@param {type:&quot;integer&quot;}</span></span>
23
+ <span class="line"><span style="color:#D4D4D4;">tag = </span><span style="color:#CE9178;">&#39;Emiru Tsunoo/aishell_asr_train_asr_streaming_transformer_raw_zh_char_sp_valid.acc.ave&#39;</span><span style="color:#6A9955;"> #@param [&quot; Emiru Tsunoo/aishell_asr_train_asr_streaming_transformer_raw_zh_char_sp_valid.acc.ave&quot;] {type:&quot;string&quot;}</span></span>
24
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;">#@title Choose Multilingual ASR model { run: &quot;auto&quot; }</span></span>
25
+ <span class="line"></span>
26
+ <span class="line"><span style="color:#D4D4D4;">lang = </span><span style="color:#CE9178;">&#39;multilingual&#39;</span></span>
27
+ <span class="line"><span style="color:#D4D4D4;">fs = </span><span style="color:#B5CEA8;">16000</span><span style="color:#6A9955;"> #@param {type:&quot;integer&quot;}</span></span>
28
+ <span class="line"><span style="color:#D4D4D4;">tag = </span><span style="color:#CE9178;">&#39;ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best&#39;</span><span style="color:#6A9955;"> #@param [&quot; ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best&quot;] {type:&quot;string&quot;}</span></span>
29
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="model-setup" tabindex="-1"><a class="header-anchor" href="#model-setup"><span>Model Setup</span></a></h3><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> time</span></span>
30
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> torch</span></span>
31
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> string</span></span>
32
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet_model_zoo.downloader </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> ModelDownloader</span></span>
33
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.asr_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
34
+ <span class="line"></span>
35
+ <span class="line"></span>
36
+ <span class="line"><span style="color:#D4D4D4;">d = ModelDownloader()</span></span>
37
+ <span class="line"><span style="color:#6A9955;"># It may takes a while to download and build models</span></span>
38
+ <span class="line"><span style="color:#D4D4D4;">speech2text = Speech2Text(</span></span>
39
+ <span class="line"><span style="color:#D4D4D4;"> **d.download_and_unpack(tag),</span></span>
40
+ <span class="line"><span style="color:#9CDCFE;"> device</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;cuda&quot;</span><span style="color:#D4D4D4;">,</span></span>
41
+ <span class="line"><span style="color:#9CDCFE;"> minlenratio</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0.0</span><span style="color:#D4D4D4;">,</span></span>
42
+ <span class="line"><span style="color:#9CDCFE;"> maxlenratio</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0.0</span><span style="color:#D4D4D4;">,</span></span>
43
+ <span class="line"><span style="color:#9CDCFE;"> ctc_weight</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0.3</span><span style="color:#D4D4D4;">,</span></span>
44
+ <span class="line"><span style="color:#9CDCFE;"> beam_size</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;">,</span></span>
45
+ <span class="line"><span style="color:#9CDCFE;"> batch_size</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">,</span></span>
46
+ <span class="line"><span style="color:#9CDCFE;"> nbest</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span></span>
47
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
48
+ <span class="line"></span>
49
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> text_normalizer</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">text</span><span style="color:#D4D4D4;">):</span></span>
50
+ <span class="line"><span style="color:#D4D4D4;"> text = text.upper()</span></span>
51
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> text.translate(</span><span style="color:#4EC9B0;">str</span><span style="color:#D4D4D4;">.maketrans(</span><span style="color:#CE9178;">&#39;&#39;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&#39;&#39;</span><span style="color:#D4D4D4;">, string.punctuation))</span></span>
52
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="recognize-our-example-recordings" tabindex="-1"><a class="header-anchor" href="#recognize-our-example-recordings"><span>Recognize our example recordings</span></a></h3><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git clone https://github.com/ftshijt/ESPNet_asr_egs.git</span></span>
53
+ <span class="line"></span>
54
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> pandas </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> pd</span></span>
55
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> soundfile</span></span>
56
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa.display</span></span>
57
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> IPython.display </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> display, Audio</span></span>
58
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> matplotlib.pyplot </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> plt</span></span>
59
+ <span class="line"></span>
60
+ <span class="line"></span>
61
+ <span class="line"><span style="color:#D4D4D4;">egs = pd.read_csv(</span><span style="color:#CE9178;">&quot;ESPNet_asr_egs/egs.csv&quot;</span><span style="color:#D4D4D4;">)</span></span>
62
+ <span class="line"><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> index, row </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> egs.iterrows():</span></span>
63
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#D4D4D4;"> row[</span><span style="color:#CE9178;">&quot;lang&quot;</span><span style="color:#D4D4D4;">] == lang </span><span style="color:#569CD6;">or</span><span style="color:#D4D4D4;"> lang == </span><span style="color:#CE9178;">&quot;multilingual&quot;</span><span style="color:#D4D4D4;">:</span></span>
64
+ <span class="line"><span style="color:#D4D4D4;"> speech, rate = soundfile.read(</span><span style="color:#CE9178;">&quot;ESPNet_asr_egs/&quot;</span><span style="color:#D4D4D4;"> + row[</span><span style="color:#CE9178;">&quot;path&quot;</span><span style="color:#D4D4D4;">])</span></span>
65
+ <span class="line"><span style="color:#C586C0;"> assert</span><span style="color:#D4D4D4;"> fs == </span><span style="color:#4EC9B0;">int</span><span style="color:#D4D4D4;">(row[</span><span style="color:#CE9178;">&quot;sr&quot;</span><span style="color:#D4D4D4;">])</span></span>
66
+ <span class="line"><span style="color:#D4D4D4;"> nbests = speech2text(speech)</span></span>
67
+ <span class="line"></span>
68
+ <span class="line"><span style="color:#D4D4D4;"> text, *_ = nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
69
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;Input Speech: ESPNet_asr_egs/</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">row[</span><span style="color:#CE9178;">&#39;path&#39;</span><span style="color:#D4D4D4;">]</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
70
+ <span class="line"><span style="color:#6A9955;"> # let us listen to samples</span></span>
71
+ <span class="line"><span style="color:#D4D4D4;"> display(Audio(speech, </span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=rate))</span></span>
72
+ <span class="line"><span style="color:#D4D4D4;"> librosa.display.waveplot(speech, </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=rate)</span></span>
73
+ <span class="line"><span style="color:#D4D4D4;"> plt.show()</span></span>
74
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;Reference text: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">text_normalizer(row[</span><span style="color:#CE9178;">&#39;text&#39;</span><span style="color:#D4D4D4;">])</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
75
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;ASR hypothesis: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">text_normalizer(text)</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
76
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;*&quot;</span><span style="color:#D4D4D4;"> * </span><span style="color:#B5CEA8;">50</span><span style="color:#D4D4D4;">)</span></span>
77
+ <span class="line"></span>
78
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="recognize-your-own-pre-recordings" tabindex="-1"><a class="header-anchor" href="#recognize-your-own-pre-recordings"><span>Recognize your own pre-recordings</span></a></h3><ol><li>Upload your own pre-recorded recordings</li><li>Recognize your voice with the ASR system</li></ol><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> google.colab </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> files</span></span>
79
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> IPython.display </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> display, Audio</span></span>
80
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> soundfile</span></span>
81
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa.display</span></span>
82
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> matplotlib.pyplot </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> plt</span></span>
83
+ <span class="line"></span>
84
+ <span class="line"><span style="color:#D4D4D4;">uploaded = files.upload()</span></span>
85
+ <span class="line"></span>
86
+ <span class="line"><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> file_name </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> uploaded.keys():</span></span>
87
+ <span class="line"><span style="color:#D4D4D4;"> speech, rate = soundfile.read(file_name)</span></span>
88
+ <span class="line"><span style="color:#C586C0;"> assert</span><span style="color:#D4D4D4;"> rate == fs, </span><span style="color:#CE9178;">&quot;mismatch in sampling rate&quot;</span></span>
89
+ <span class="line"><span style="color:#D4D4D4;"> nbests = speech2text(speech)</span></span>
90
+ <span class="line"><span style="color:#D4D4D4;"> text, *_ = nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
91
+ <span class="line"></span>
92
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;Input Speech: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">file_name</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
93
+ <span class="line"><span style="color:#D4D4D4;"> display(Audio(speech, </span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=rate))</span></span>
94
+ <span class="line"><span style="color:#D4D4D4;"> librosa.display.waveplot(speech, </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=rate)</span></span>
95
+ <span class="line"><span style="color:#D4D4D4;"> plt.show()</span></span>
96
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;ASR hypothesis: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">text_normalizer(text)</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
97
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;*&quot;</span><span style="color:#D4D4D4;"> * </span><span style="color:#B5CEA8;">50</span><span style="color:#D4D4D4;">)</span></span>
98
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="recognize-your-own-live-recordings" tabindex="-1"><a class="header-anchor" href="#recognize-your-own-live-recordings"><span>Recognize your own live-recordings</span></a></h3><ol><li>Record your own voice</li><li>Recognize your voice with the ASR system</li></ol><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># from https://gist.github.com/korakot/c21c3476c024ad6d56d5f48b0bca92be</span></span>
99
+ <span class="line"></span>
100
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> IPython.display </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Javascript</span></span>
101
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> google.colab </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> output</span></span>
102
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> base64 </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> b64decode</span></span>
103
+ <span class="line"></span>
104
+ <span class="line"><span style="color:#D4D4D4;">RECORD = </span><span style="color:#CE9178;">&quot;&quot;&quot;</span></span>
105
+ <span class="line"><span style="color:#CE9178;">const sleep = time =&gt; new Promise(resolve =&gt; setTimeout(resolve, time))</span></span>
106
+ <span class="line"><span style="color:#CE9178;">const b2text = blob =&gt; new Promise(resolve =&gt; {</span></span>
107
+ <span class="line"><span style="color:#CE9178;"> const reader = new FileReader()</span></span>
108
+ <span class="line"><span style="color:#CE9178;"> reader.onloadend = e =&gt; resolve(e.srcElement.result)</span></span>
109
+ <span class="line"><span style="color:#CE9178;"> reader.readAsDataURL(blob)</span></span>
110
+ <span class="line"><span style="color:#CE9178;">})</span></span>
111
+ <span class="line"><span style="color:#CE9178;">var record = time =&gt; new Promise(async resolve =&gt; {</span></span>
112
+ <span class="line"><span style="color:#CE9178;"> stream = await navigator.mediaDevices.getUserMedia({ audio: true })</span></span>
113
+ <span class="line"><span style="color:#CE9178;"> recorder = new MediaRecorder(stream)</span></span>
114
+ <span class="line"><span style="color:#CE9178;"> chunks = []</span></span>
115
+ <span class="line"><span style="color:#CE9178;"> recorder.ondataavailable = e =&gt; chunks.push(e.data)</span></span>
116
+ <span class="line"><span style="color:#CE9178;"> recorder.start()</span></span>
117
+ <span class="line"><span style="color:#CE9178;"> await sleep(time)</span></span>
118
+ <span class="line"><span style="color:#CE9178;"> recorder.onstop = async ()=&gt;{</span></span>
119
+ <span class="line"><span style="color:#CE9178;"> blob = new Blob(chunks)</span></span>
120
+ <span class="line"><span style="color:#CE9178;"> text = await b2text(blob)</span></span>
121
+ <span class="line"><span style="color:#CE9178;"> resolve(text)</span></span>
122
+ <span class="line"><span style="color:#CE9178;"> }</span></span>
123
+ <span class="line"><span style="color:#CE9178;"> recorder.stop()</span></span>
124
+ <span class="line"><span style="color:#CE9178;">})</span></span>
125
+ <span class="line"><span style="color:#CE9178;">&quot;&quot;&quot;</span></span>
126
+ <span class="line"></span>
127
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> record</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">sec</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">filename</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;audio.wav&#39;</span><span style="color:#D4D4D4;">):</span></span>
128
+ <span class="line"><span style="color:#D4D4D4;"> display(Javascript(RECORD))</span></span>
129
+ <span class="line"><span style="color:#D4D4D4;"> s = output.eval_js(</span><span style="color:#CE9178;">&#39;record(</span><span style="color:#569CD6;">%d</span><span style="color:#CE9178;">)&#39;</span><span style="color:#D4D4D4;"> % (sec * </span><span style="color:#B5CEA8;">1000</span><span style="color:#D4D4D4;">))</span></span>
130
+ <span class="line"><span style="color:#D4D4D4;"> b = b64decode(s.split(</span><span style="color:#CE9178;">&#39;,&#39;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">])</span></span>
131
+ <span class="line"><span style="color:#C586C0;"> with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(filename, </span><span style="color:#CE9178;">&#39;wb+&#39;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
132
+ <span class="line"><span style="color:#D4D4D4;"> f.write(b)</span></span>
133
+ <span class="line"></span>
134
+ <span class="line"><span style="color:#D4D4D4;">audio = </span><span style="color:#CE9178;">&#39;audio.wav&#39;</span></span>
135
+ <span class="line"><span style="color:#D4D4D4;">second = </span><span style="color:#B5CEA8;">5</span></span>
136
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;Speak to your microphone </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">second</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;"> sec...&quot;</span><span style="color:#D4D4D4;">)</span></span>
137
+ <span class="line"><span style="color:#D4D4D4;">record(second, audio)</span></span>
138
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;Done!&quot;</span><span style="color:#D4D4D4;">)</span></span>
139
+ <span class="line"></span>
140
+ <span class="line"></span>
141
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa</span></span>
142
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa.display</span></span>
143
+ <span class="line"><span style="color:#D4D4D4;">speech, rate = librosa.load(audio, </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">)</span></span>
144
+ <span class="line"><span style="color:#D4D4D4;">librosa.display.waveplot(speech, </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=rate)</span></span>
145
+ <span class="line"></span>
146
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> matplotlib.pyplot </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> plt</span></span>
147
+ <span class="line"><span style="color:#D4D4D4;">plt.show()</span></span>
148
+ <span class="line"></span>
149
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> pysndfile</span></span>
150
+ <span class="line"><span style="color:#D4D4D4;">pysndfile.sndio.write(</span><span style="color:#CE9178;">&#39;audio_ds.wav&#39;</span><span style="color:#D4D4D4;">, speech, </span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=rate, </span><span style="color:#9CDCFE;">format</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;wav&#39;</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">enc</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;pcm16&#39;</span><span style="color:#D4D4D4;">)</span></span>
151
+ <span class="line"></span>
152
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> IPython.display </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> display, Audio</span></span>
153
+ <span class="line"><span style="color:#D4D4D4;">display(Audio(speech, </span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=rate))</span></span>
154
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" 
tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">nbests = speech2text(speech)</span></span>
155
+ <span class="line"><span style="color:#D4D4D4;">text, *_ = nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
156
+ <span class="line"></span>
157
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;ASR hypothesis: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">text_normalizer(text)</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
158
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div>`,17);function C(b,h){const a=o("ExternalLinkIcon");return i(),r("div",null,[c,d,D,s("p",null,[n("Author: Jiatong Shi ("),s("a",y,[n("@ftshijt"),l(a)]),n(")")]),u,s("p",null,[n("Please select model shown in "),s("a",v,[n("espnet_model_zoo"),l(a)])]),m])}const E=p(t,[["render",C],["__file","espnet2_asr_realtime_demo.html.vue"]]),g=JSON.parse('{"path":"/espnet2/asr/espnet2_asr_realtime_demo.html","title":"ESPnet2-ASR realtime demonstration","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"ASR model demo","slug":"asr-model-demo","link":"#asr-model-demo","children":[{"level":3,"title":"Model Selection","slug":"model-selection","link":"#model-selection","children":[]},{"level":3,"title":"Model Setup","slug":"model-setup","link":"#model-setup","children":[]},{"level":3,"title":"Recognize our example recordings","slug":"recognize-our-example-recordings","link":"#recognize-our-example-recordings","children":[]},{"level":3,"title":"Recognize your own pre-recordings","slug":"recognize-your-own-pre-recordings","link":"#recognize-your-own-pre-recordings","children":[]},{"level":3,"title":"Recognize your own live-recordings","slug":"recognize-your-own-live-recordings","link":"#recognize-your-own-live-recordings","children":[]}]}],"git":{},"filePathRelative":"espnet2/asr/espnet2_asr_realtime_demo.md"}');export{E as comp,g as data};
assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as l,r as t,o as p,c as r,a as s,d as n,b as a,e as o}from"./app-DTS6SjJz.js";const i={},c=s("h1",{id:"use-transfer-learning-for-asr-in-espnet2",tabindex:"-1"},[s("a",{class:"header-anchor",href:"#use-transfer-learning-for-asr-in-espnet2"},[s("span",null,[s("strong",null,"Use transfer learning for ASR in ESPnet2")])])],-1),d=s("p",null,"Author : Dan Berrebbi (dberrebb@andrew.cmu.edu)",-1),y=s("p",null,"Date : April 11th, 2022",-1),D=s("h1",{id:"abstract",tabindex:"-1"},[s("a",{class:"header-anchor",href:"#abstract"},[s("span",null,"Abstract")])],-1),u=s("p",null,"In that tutorial, we will introduce several options to use pre-trained models/parameters for Automatic Speech Recognition (ASR) in ESPnet2. Available options are :",-1),m=s("li",null,"use a local model you (or a collegue) have already trained,",-1),h={href:"https://huggingface.co/espnet",target:"_blank",rel:"noopener noreferrer"},g=o(`<p>We note that this is done for ASR training, so at <strong>stage 11</strong> of ESPnet2 models&#39; recipe.</p><h3 id="why-using-such-pre-trained-models" tabindex="-1"><a class="header-anchor" href="#why-using-such-pre-trained-models"><span>Why using such (pre-)trained models ?</span></a></h3><p>Several projects may involve making use of previously trained models, this is the reason why we developed ESPnet repository on HuggingFace for instance. Example of use cases are listed below (non-exhaustive):</p><ul><li>target a low resource language, a model trained from scratch may perform badly if trained with only few hours of data,</li><li>study robustness to shifts (domain, language ... 
shifts) of a model,</li><li>make use of massively trained multilingual models.</li><li>...</li></ul><h1 id="espnet-installation-about-10-minutes-in-total" tabindex="-1"><a class="header-anchor" href="#espnet-installation-about-10-minutes-in-total"><span>ESPnet installation (about 10 minutes in total)</span></a></h1><p>Please use the gpu environnement provided by google colab for runing this notebook.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git clone </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">depth </span><span style="color:#B5CEA8;">5</span><span style="color:#D4D4D4;"> https://github.com/espnet/espnet</span></span>
2
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># It takes 30 seconds</span></span>
3
+ <span class="line"><span style="color:#D4D4D4;">%cd /content/espnet/tools</span></span>
4
+ <span class="line"><span style="color:#D4D4D4;">!./setup_anaconda.sh anaconda espnet </span><span style="color:#B5CEA8;">3.9</span></span>
5
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># It may take ~8 minutes</span></span>
6
+ <span class="line"><span style="color:#D4D4D4;">%cd /content/espnet/tools</span></span>
7
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">make CUDA_VERSION=</span><span style="color:#B5CEA8;">10.2</span></span>
8
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h1 id="mini-an4-recipe-as-a-transfer-learning-example" tabindex="-1"><a class="header-anchor" href="#mini-an4-recipe-as-a-transfer-learning-example"><span>mini_an4 recipe as a transfer learning example</span></a></h1><p>In this example, we use the <strong>mini_an4</strong> data, which has only 4 utterances for training. This is of course too small to train an ASR model, but it enables to run all the decribed transfer learning models on a colab environnement. After having run and understood those models/instructions, you can apply it to any other recipe of ESPnet2 or a new recipe that you build. First, move to the recipe directory</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">%cd /content/espnet/egs2/mini_an4/asr1</span></span>
9
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p><strong>Add a configuration file</strong></p><p>As the mini_an4 does not contain any configuration file for ASR model, we add one here.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">config = {</span><span style="color:#CE9178;">&#39;accum_grad&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
10
+ <span class="line"><span style="color:#CE9178;"> &#39;batch_size&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
11
+ <span class="line"><span style="color:#CE9178;"> &#39;batch_type&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&#39;folded&#39;</span><span style="color:#D4D4D4;">,</span></span>
12
+ <span class="line"><span style="color:#CE9178;"> &#39;best_model_criterion&#39;</span><span style="color:#D4D4D4;">: [[</span><span style="color:#CE9178;">&#39;valid&#39;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&#39;acc&#39;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&#39;max&#39;</span><span style="color:#D4D4D4;">]],</span></span>
13
+ <span class="line"><span style="color:#CE9178;"> &#39;decoder&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&#39;transformer&#39;</span><span style="color:#D4D4D4;">,</span></span>
14
+ <span class="line"><span style="color:#CE9178;"> &#39;decoder_conf&#39;</span><span style="color:#D4D4D4;">: {</span><span style="color:#CE9178;">&#39;dropout_rate&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">0.1</span><span style="color:#D4D4D4;">,</span></span>
15
+ <span class="line"><span style="color:#CE9178;"> &#39;input_layer&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&#39;embed&#39;</span><span style="color:#D4D4D4;">,</span></span>
16
+ <span class="line"><span style="color:#CE9178;"> &#39;linear_units&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">2048</span><span style="color:#D4D4D4;">,</span></span>
17
+ <span class="line"><span style="color:#CE9178;"> &#39;num_blocks&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">6</span><span style="color:#D4D4D4;">},</span></span>
18
+ <span class="line"><span style="color:#CE9178;"> &#39;encoder&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&#39;transformer&#39;</span><span style="color:#D4D4D4;">,</span></span>
19
+ <span class="line"><span style="color:#CE9178;"> &#39;encoder_conf&#39;</span><span style="color:#D4D4D4;">: {</span><span style="color:#CE9178;">&#39;attention_dropout_rate&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">0.0</span><span style="color:#D4D4D4;">,</span></span>
20
+ <span class="line"><span style="color:#CE9178;"> &#39;attention_heads&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">4</span><span style="color:#D4D4D4;">,</span></span>
21
+ <span class="line"><span style="color:#CE9178;"> &#39;dropout_rate&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">0.1</span><span style="color:#D4D4D4;">,</span></span>
22
+ <span class="line"><span style="color:#CE9178;"> &#39;input_layer&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&#39;conv2d&#39;</span><span style="color:#D4D4D4;">,</span></span>
23
+ <span class="line"><span style="color:#CE9178;"> &#39;linear_units&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">2048</span><span style="color:#D4D4D4;">,</span></span>
24
+ <span class="line"><span style="color:#CE9178;"> &#39;num_blocks&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">12</span><span style="color:#D4D4D4;">,</span></span>
25
+ <span class="line"><span style="color:#CE9178;"> &#39;output_size&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">256</span><span style="color:#D4D4D4;">},</span></span>
26
+ <span class="line"><span style="color:#CE9178;"> &#39;grad_clip&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">5</span><span style="color:#D4D4D4;">,</span></span>
27
+ <span class="line"><span style="color:#CE9178;"> &#39;init&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&#39;xavier_uniform&#39;</span><span style="color:#D4D4D4;">,</span></span>
28
+ <span class="line"><span style="color:#CE9178;"> &#39;keep_nbest_models&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
29
+ <span class="line"><span style="color:#CE9178;"> &#39;max_epoch&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">5</span><span style="color:#D4D4D4;">,</span></span>
30
+ <span class="line"><span style="color:#CE9178;"> &#39;model_conf&#39;</span><span style="color:#D4D4D4;">: {</span><span style="color:#CE9178;">&#39;ctc_weight&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">0.3</span><span style="color:#D4D4D4;">,</span></span>
31
+ <span class="line"><span style="color:#CE9178;"> &#39;length_normalized_loss&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">False</span><span style="color:#D4D4D4;">,</span></span>
32
+ <span class="line"><span style="color:#CE9178;"> &#39;lsm_weight&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">0.1</span><span style="color:#D4D4D4;">},</span></span>
33
+ <span class="line"><span style="color:#CE9178;"> &#39;optim&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&#39;adam&#39;</span><span style="color:#D4D4D4;">,</span></span>
34
+ <span class="line"><span style="color:#CE9178;"> &#39;optim_conf&#39;</span><span style="color:#D4D4D4;">: {</span><span style="color:#CE9178;">&#39;lr&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">1.0</span><span style="color:#D4D4D4;">},</span></span>
35
+ <span class="line"><span style="color:#CE9178;"> &#39;patience&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">,</span></span>
36
+ <span class="line"><span style="color:#CE9178;"> &#39;scheduler&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&#39;noamlr&#39;</span><span style="color:#D4D4D4;">,</span></span>
37
+ <span class="line"><span style="color:#CE9178;"> &#39;scheduler_conf&#39;</span><span style="color:#D4D4D4;">: {</span><span style="color:#CE9178;">&#39;warmup_steps&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">1000</span><span style="color:#D4D4D4;">}}</span></span>
38
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> yaml</span></span>
39
+ <span class="line"><span style="color:#C586C0;">with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;conf/train_asr.yaml&quot;</span><span style="color:#D4D4D4;">,</span><span style="color:#CE9178;">&quot;w&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
40
+ <span class="line"><span style="color:#D4D4D4;"> yaml.dump(config, f)</span></span>
41
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p><strong>Data preparation (stage 1 - stage 5)</strong></p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">!./asr.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">1</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">5</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_nodev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">valid-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_dev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;test&quot;</span></span>
42
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p><strong>Stage 10: ASR collect stats</strong>:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># takes about 10 seconds</span></span>
43
+ <span class="line"><span style="color:#D4D4D4;">!./asr.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">10</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">10</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_nodev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">valid-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_dev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;test&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_config </span><span style="color:#CE9178;">&quot;conf/train_asr.yaml&quot;</span></span>
44
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p><strong>Stage 11: ASR training (from scratch)</strong></p><p>We train our model for only 5 epochs, just to have a pre-trained model.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># takes about 1-2 minutes</span></span>
45
+ <span class="line"><span style="color:#D4D4D4;">!./asr.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">11</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">11</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_nodev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">valid-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_dev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;test&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_config </span><span style="color:#CE9178;">&quot;conf/train_asr.yaml&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_tag </span><span style="color:#CE9178;">&quot;pre_trained_model&quot;</span></span>
46
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p><strong>Stage 11.2 : ASR training over a pre-trained model</strong></p><p>We train our new model over the previously trained model. (here as we use the same training data, this is not very useful, but again this is a toy example that is reproducible with any model.)</p><p><strong>Step 1</strong> : make sure your ASR model file has the proper ESPnet format (should be ok if trained with ESPnet). It just needs to be a &quot;.pth&quot; (or &quot;.pt&quot; or other extension) type pytorch model.</p><p><strong>Step 2</strong> : add the parameter <code>--pretrained_model path/to/your/pretrained/model/file.pth</code> to run.sh.</p><p><strong>Step 3</strong> : step 2 will initialize your new model with the parameters of the pre-trained model. Thus your new model will be trained with a strong initialization. However, if your new model have different parameter sizes for some parts of the model (e.g. last projection layer could be modified ...). This will lead to an error because of mismatches in size. 
To prevent this to happen, you can add the parameter <code>--ignore_init_mismatch true</code> in run.sh.</p>`,28),v=s("strong",null,"Step 4 (Optional)",-1),_=s("code",null,"--pretrained_model",-1),b=s("code",null,"--pretrained_model <file_path>:<src_key>:<dst_key>:<exclude_Keys>",-1),E=s("code",null,"src_key",-1),f=s("code",null,"dst_key",-1),C=s("code",null,"src_key",-1),k=s("code",null,"exclude_Keys",-1),w=s("code",null,"src_key",-1),q=s("code",null,"dst_key",-1),x=s("code",null,"exclude_Keys",-1),A=s("code",null,"--pretrained_model <file_path>:::decoder",-1),F={href:"https://github.com/espnet/espnet/blob/e76c78c0c661ab37cc081d46d9b059dcb31292fe/espnet2/torch_utils/load_pretrained_model.py#L43-L53",target:"_blank",rel:"noopener noreferrer"},B=o(`<div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># takes about 1-2 minutes</span></span>
47
+ <span class="line"><span style="color:#D4D4D4;">!./asr.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">11</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">11</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_nodev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">valid-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_dev&quot;</span><span style="color:#D4D4D4;"> \\</span></span>
48
+ <span class="line"><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;test&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_config </span><span style="color:#CE9178;">&quot;conf/train_asr.yaml&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_tag </span><span style="color:#CE9178;">&quot;transfer_learning_with_pre_trained_model&quot;</span><span style="color:#D4D4D4;">\\</span></span>
49
+ <span class="line"><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">pretrained_model </span><span style="color:#CE9178;">&quot;/content/espnet/egs2/mini_an4/asr1/exp/asr_train_asr_raw_bpe30/valid.acc.ave.pth&quot;</span></span>
50
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p><strong>Stage 11.3 : ASR training over a HuggingFace pre-trained model</strong></p><p>We train our new model over the previously trained model from HuggingFace. Any model can be used, here we take a model trained on Bengali as an example. It can be found at https://huggingface.co/espnet/bn_openslr53.</p><h3 id="use-a-trained-model-from-espnet-repository-on-huggingface" tabindex="-1"><a class="header-anchor" href="#use-a-trained-model-from-espnet-repository-on-huggingface"><span>Use a trained model from ESPnet repository on HuggingFace.</span></a></h3>`,4),S={href:"https://huggingface.co/espnet",target:"_blank",rel:"noopener noreferrer"},P={href:"https://huggingface.co/espnet",target:"_blank",rel:"noopener noreferrer"},R=s("code",null,"wget https://huggingface.co/espnet/bn_openslr53/blob/main/exp/asr_train_asr_raw_bpe1000/41epoch.pth",-1),I=s("code",null,"git clone https://huggingface.co/espnet/bn_openslr53",-1),T=o(`<div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>cd espnet</span></span>
51
+ <span class="line"><span>git checkout fa1b865352475b744c37f70440de1cc6b257ba70</span></span>
52
+ <span class="line"><span>pip install -e .</span></span>
53
+ <span class="line"><span>cd egs2/bn_openslr53/asr1</span></span>
54
+ <span class="line"><span>./run.sh --skip_data_prep false --skip_train true --download_model espnet/bn_openslr53</span></span>
55
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Then, as you have the &quot;.pth&quot; model file, you can follow the steps 1 to 4 from the previous section in order to use this pre-train model.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">wget https://huggingface.co/espnet/bn_openslr53/resolve/main/exp/asr_train_asr_raw_bpe1000/</span><span style="color:#F44747;">41epoch</span><span style="color:#D4D4D4;">.pth</span></span>
56
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>The next command line will raise an error because of the size mismatch of some parameters, as mentionned before (step3).</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># will fail in about 5 seconds</span></span>
57
+ <span class="line"><span style="color:#D4D4D4;">!./asr.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">11</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">11</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_nodev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">valid-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_dev&quot;</span><span style="color:#D4D4D4;"> \\</span></span>
58
+ <span class="line"><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;test&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_config </span><span style="color:#CE9178;">&quot;conf/train_asr.yaml&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_tag </span><span style="color:#CE9178;">&quot;transfer_learning_with_pre_trained_model&quot;</span><span style="color:#D4D4D4;">\\</span></span>
59
+ <span class="line"><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">pretrained_model </span><span style="color:#CE9178;">&quot;/content/espnet/egs2/mini_an4/asr1/41epoch.pth&quot;</span></span>
60
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>To solve this issue, as mentionned, we can use the <code>--ignore_init_mismatch &quot;true&quot;</code> parameter.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># takes about 1-2 minutes</span></span>
61
+ <span class="line"><span style="color:#D4D4D4;">!./asr.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">11</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">11</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_nodev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">valid-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_dev&quot;</span><span style="color:#D4D4D4;"> \\</span></span>
62
+ <span class="line"><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;test&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_config </span><span style="color:#CE9178;">&quot;conf/train_asr.yaml&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_tag </span><span style="color:#CE9178;">&quot;transfer_learning_with_pre_trained_model_from_HF&quot;</span><span style="color:#D4D4D4;">\\</span></span>
63
+ <span class="line"><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">pretrained_model </span><span style="color:#CE9178;">&quot;/content/espnet/egs2/mini_an4/asr1/41epoch.pth&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">ignore_init_mismatch </span><span style="color:#CE9178;">&quot;true&quot;</span><span style="color:#D4D4D4;"> </span></span>
64
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p><strong>Additional note about the <code>--ignore_init_mismatch true</code> option :</strong> This option is very convenient because in lots of transfer learning use cases, you will aim to use a model trained on a language X (e.g. X=English) for another language Y. Language Y may have a vocabulary (set of tokens) different from language X, for instance if you target Y=Totonac, a Mexican low resource language, your model may be stronger if you use a different set of bpes/tokens thatn the one used to train the English model. In that situation, the last layer (projection to vocabulary space) of your ASR model needs to be initialized from scratch and may be different in shape than the one of the English model. For that reason, you should use the <code>--ignore_init_mismatch true</code> option. It also enables to handle the case where the scripts are differents from languages X to Y.</p>`,8);function H(z,j){const e=t("ExternalLinkIcon");return p(),r("div",null,[c,d,y,D,u,s("ul",null,[m,s("li",null,[n("use a trained model from "),s("a",h,[n("ESPnet repository on HuggingFace"),a(e)]),n(".")])]),g,s("p",null,[v,n(" : if you only want to use some specific parts of the pre-trained model, or exclude specific parts, you can specify it in the "),_,n(" argument by passing the component names with the following syntax : "),b,n(". "),E,n(" are the parameters you want to keep from the pre-trained model. "),f,n(" are the parameters you want to initialize in the new model with the "),C,n("parameters. And "),k,n(" are the parameters from the pre-trained model that you do not want to use. You can leave "),w,n(" and "),q,n(" fields empty and just fill "),x,n(" with the parameters that you ant to drop. 
For instance, if you want to re-use encoder parameters but not decoder ones, syntax will be "),A,n(". You can see the argument expected format in more details "),s("a",F,[n("here"),a(e)]),n(".")]),B,s("p",null,[s("a",S,[n("ESPnet repository on HuggingFace"),a(e)]),n(' contains more than 200 pre-trained models, for a wide variety of languages and dataset, and we are actively expanding this repositories with new models every week! This enable any user to perform transfer learning with a wide variety of models without having to re-train them. In order to use our pre-trained models, the first step is to download the ".pth" model file from the '),s("a",P,[n("HugginFace page"),a(e)]),n(". There are several easy way to do it, either by manually downloading them (e.g. "),R,n("), cloning it ("),I,n(") or downloading it through an ESPnet recipe (described in the models' pages on HuggingFace):")]),T])}const U=l(i,[["render",H],["__file","espnet2_asr_transfer_learning_demo.html.vue"]]),W=JSON.parse('{"path":"/espnet2/asr/espnet2_asr_transfer_learning_demo.html","title":"Use transfer learning for ASR in ESPnet2","lang":"en-US","frontmatter":{},"headers":[{"level":3,"title":"Why using such (pre-)trained models ?","slug":"why-using-such-pre-trained-models","link":"#why-using-such-pre-trained-models","children":[]},{"level":3,"title":"Use a trained model from ESPnet repository on HuggingFace.","slug":"use-a-trained-model-from-espnet-repository-on-huggingface","link":"#use-a-trained-model-from-espnet-repository-on-huggingface","children":[]}],"git":{},"filePathRelative":"espnet2/asr/espnet2_asr_transfer_learning_demo.md"}');export{U as comp,W as data};
assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as o,r as l,o as i,c as p,a as s,d as e,b as a,e as t}from"./app-DTS6SjJz.js";const r={},c=s("h1",{id:"cmu-11751-18781-fall-2022-espnet-tutorial2-new-task",tabindex:"-1"},[s("a",{class:"header-anchor",href:"#cmu-11751-18781-fall-2022-espnet-tutorial2-new-task"},[s("span",null,"CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)")])],-1),d={href:"https://github.com/espnet/espnet",target:"_blank",rel:"noopener noreferrer"},h=s("p",null,"Main references:",-1),u={href:"https://github.com/espnet/espnet",target:"_blank",rel:"noopener noreferrer"},v={href:"https://espnet.github.io/espnet/",target:"_blank",rel:"noopener noreferrer"},m={href:"https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_tutorial_2021_CMU_11751_18781.ipynb",target:"_blank",rel:"noopener noreferrer"},y={href:"https://colab.research.google.com/drive/1tY6PxF_M5Nx5n488x0DrpujJOyqW-ATi?usp=sharing",target:"_blank",rel:"noopener noreferrer"},b={href:"https://colab.research.google.com/drive/1d3yMY3xv_hpr2mznskZpF2Cj-bEooDv-?usp=sharing",target:"_blank",rel:"noopener noreferrer"},D=t(`<p>Author: Jiatong Shi (jiatongs@andrew.cmu.edu)</p><p>We would like to thank You (Neil) Zhang for kindly helping the hands-on tutorial and sharing his knowledge on the task.</p><h2 id="❗important-notes❗" tabindex="-1"><a class="header-anchor" href="#❗important-notes❗"><span>❗Important Notes❗</span></a></h2><ul><li>We are using Colab to show the demo. However, Colab has some constraints on the total GPU runtime. If you use too much GPU time, you may not be able to use GPU for some time.</li><li>There are multiple in-class checkpoints ✅ throughout this tutorial. There will also be some after-class excersices 📗 after the tutorial. <strong>Your participation points are based on these tasks.</strong> Please try your best to follow all the steps! 
If you encounter issues, please notify the TAs as soon as possible so that we can make an adjustment for you.</li><li>Please submit PDF files of your completed notebooks to Gradescope. You can print the notebook using <code>File -&gt; Print</code> in the menu bar.</li><li>This tutorial covers some advanced usage of ESPnet, which is the extension of the first tutorial.</li></ul><h2 id="useful-links" tabindex="-1"><a class="header-anchor" href="#useful-links"><span>Useful links</span></a></h2><ul><li>Installation https://espnet.github.io/espnet/installation.html</li><li>Usage https://espnet.github.io/espnet/espnet2_tutorial.html</li><li>Reference of task class in ESPnet https://espnet.github.io/espnet/espnet2_task.html</li></ul><h2 id="objectives" tabindex="-1"><a class="header-anchor" href="#objectives"><span>Objectives</span></a></h2><p>After this tutorial, you are expected to know:</p><ul><li>How to add new task in ESPnet2</li><li>How to add new models in ESPnet2</li><li>How to create a new recipe (and template) of a new task from scratch</li></ul><h2 id="function-to-print-date-and-time" tabindex="-1"><a class="header-anchor" href="#function-to-print-date-and-time"><span>Function to print date and time</span></a></h2><p>We first define a function to print the current date and time, which will be used in multiple places below.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> print_date_and_time</span><span style="color:#D4D4D4;">():</span></span>
2
+ <span class="line"><span style="color:#C586C0;"> from</span><span style="color:#D4D4D4;"> datetime </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> datetime</span></span>
3
+ <span class="line"><span style="color:#C586C0;"> import</span><span style="color:#D4D4D4;"> pytz</span></span>
4
+ <span class="line"></span>
5
+ <span class="line"><span style="color:#D4D4D4;"> now = datetime.now(pytz.timezone(</span><span style="color:#CE9178;">&quot;America/New_York&quot;</span><span style="color:#D4D4D4;">))</span></span>
6
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;=&quot;</span><span style="color:#D4D4D4;"> * </span><span style="color:#B5CEA8;">60</span><span style="color:#D4D4D4;">)</span></span>
7
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&#39; Current date and time: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">now.strftime(</span><span style="color:#CE9178;">&quot;%m/</span><span style="color:#569CD6;">%d</span><span style="color:#CE9178;">/%Y %H:%M:%S&quot;</span><span style="color:#D4D4D4;">)</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&#39;</span><span style="color:#D4D4D4;">)</span></span>
8
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;=&quot;</span><span style="color:#D4D4D4;"> * </span><span style="color:#B5CEA8;">60</span><span style="color:#D4D4D4;">)</span></span>
9
+ <span class="line"></span>
10
+ <span class="line"><span style="color:#6A9955;"># example output</span></span>
11
+ <span class="line"><span style="color:#D4D4D4;">print_date_and_time()</span></span>
12
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h1 id="install-espnet-almost-same-procedure-as-your-first-tutorial" tabindex="-1"><a class="header-anchor" href="#install-espnet-almost-same-procedure-as-your-first-tutorial"><span>Install ESPnet (Almost same procedure as your first tutorial)</span></a></h1><h2 id="download-espnet" tabindex="-1"><a class="header-anchor" href="#download-espnet"><span>Download ESPnet</span></a></h2><p>We use <code>git clone</code> to download the source code of ESPnet and then go to a specific commit.</p><p><strong>Important:</strong> In other versions of ESPnet, you may encounter errors related to imcompatible package versions (<code>numba</code>). Please use the same commit to avoid such issues.</p><p>Note that we are using another branch <code>espnet_tutorial_asvspoof</code> instead of &quot;master&quot;. You can also use your own fork to proceed the following sections if you want to use Github to save your code.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># It takes a few seconds</span></span>
13
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git clone </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">depth </span><span style="color:#B5CEA8;">5</span><span style="color:#D4D4D4;"> -b </span><span style="color:#F44747;">2022fall_new_task_tutorial</span><span style="color:#D4D4D4;"> https://github.com/espnet/espnet</span></span>
14
+ <span class="line"></span>
15
+ <span class="line"><span style="color:#6A9955;"># We use a specific commit just for reproducibility.</span></span>
16
+ <span class="line"><span style="color:#D4D4D4;">%cd /content/espnet</span></span>
17
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git checkout </span><span style="color:#F44747;">9cff98a78ceaa4d85843be0a50b369ec826b27f6</span></span>
18
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="setup-python-environment-based-on-anaconda-install-espnet" tabindex="-1"><a class="header-anchor" href="#setup-python-environment-based-on-anaconda-install-espnet"><span>Setup Python environment based on anaconda + Install ESPnet</span></a></h2><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># It takes 30 seconds</span></span>
19
+ <span class="line"><span style="color:#D4D4D4;">%cd /content/espnet/tools</span></span>
20
+ <span class="line"><span style="color:#D4D4D4;">!./setup_anaconda.sh anaconda espnet </span><span style="color:#B5CEA8;">3.9</span></span>
21
+ <span class="line"></span>
22
+ <span class="line"><span style="color:#6A9955;"># It may take 12 minutes</span></span>
23
+ <span class="line"><span style="color:#D4D4D4;">%cd /content/espnet/tools</span></span>
24
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">make TH_VERSION=</span><span style="color:#B5CEA8;">1.12</span><span style="color:#D4D4D4;">.1 CUDA_VERSION=</span><span style="color:#B5CEA8;">11.6</span></span>
25
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h1 id="what-we-provide-you-and-what-you-need-to-proceed" tabindex="-1"><a class="header-anchor" href="#what-we-provide-you-and-what-you-need-to-proceed"><span>What we provide you and what you need to proceed</span></a></h1><p>We have provide you most of the files needed for ASVSpoof recipe. So you do not need to add any additional files. However, noted that some of the files are not complete and need your completion to proceed. For a quick overview of the whole layout of the new task, please refer to https://github.com/espnet/espnet/compare/master...2022fall_new_task_tutorial</p><p>As elaborated in the warming-up, we have shown that there are two core components for a new task in ESPnet: a <strong>task library</strong> and correponding <strong>recipe setups</strong>. For the following of the section, we will briefly show the overall layout of adding the ASVSpoof task in ESPnet. The listed files are almost the minimum requirements to add a new task in ESPnet.</p><p><strong>Task library for ASVSpoof</strong></p><p>Followings are a list of files adding to ESPnet for ASVSpoof (files in &quot;&quot; are ones that need modifications)</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>- espnet2</span></span>
26
+ <span class="line"><span> - bin</span></span>
27
+ <span class="line"><span> - asvspoof_train.py # Major entry point for asvspoof</span></span>
28
+ <span class="line"><span> - &quot;asvspoof_inference.py&quot; (Checkpoint 4) # Inference scripts for asvspoof</span></span>
29
+ <span class="line"><span> - asvspoof</span></span>
30
+ <span class="line"><span> - decoder</span></span>
31
+ <span class="line"><span> - __init__.py</span></span>
32
+ <span class="line"><span> - abs_decoder.py # abstract class for decoder in ASVSpoof</span></span>
33
+ <span class="line"><span> - &quot;linear_decoder.py&quot; (Checkpoint 3) # simple linear decoder for ASVSpoof</span></span>
34
+ <span class="line"><span> - loss</span></span>
35
+ <span class="line"><span> - __init__.py</span></span>
36
+ <span class="line"><span> - abs_loss.py # abstract class for loss in ASVSpoof</span></span>
37
+ <span class="line"><span> - binary_loss.py # naive binary class loss for ASVSpoof</span></span>
38
+ <span class="line"><span> - am_softmax.py</span></span>
39
+ <span class="line"><span> - &quot;oc_softmax.py&quot; (Bouns)</span></span>
40
+ <span class="line"><span> - __init__.py</span></span>
41
+ <span class="line"><span> - &quot;espnet_model.py&quot; (Bouns)</span></span>
42
+ <span class="line"><span> - tasks</span></span>
43
+ <span class="line"><span> - &quot;asvspoof.py&quot; (Checkpoint 2)</span></span>
44
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>To help you understand more, we would recommend you to check the layout of other tasks (e.g., ASR, TTS, ST, etc.) to understand how the codebase is functioning.</p><p><strong>Recipe for ASVSpoof</strong></p><p>Followings are a list of files adding to ESPnet for ASVSpoof (files in boldface are ones that need modifications)</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>- egs2</span></span>
45
+ <span class="line"><span> - TEMPLATE</span></span>
46
+ <span class="line"><span> - asvspoof1</span></span>
47
+ <span class="line"><span> - &quot;asvspoof.sh&quot; (Checkpoint 1)</span></span>
48
+ <span class="line"><span> - others</span></span>
49
+ <span class="line"><span> - espnet_tutorial</span></span>
50
+ <span class="line"><span> - asvspoof11</span></span>
51
+ <span class="line"><span> - conf</span></span>
52
+ <span class="line"><span> - &quot;asvspoof.sh” (Checkpoint 1)</span></span>
53
+ <span class="line"><span> - local</span></span>
54
+ <span class="line"><span> - &quot;data_prep.py&quot; (Bouns)</span></span>
55
+ <span class="line"><span> - &quot;data.sh&quot; (Bouns)</span></span>
56
+ <span class="line"><span> - &quot;run.sh&quot; (Checkpoint 5)</span></span>
57
+ <span class="line"><span> - scripts</span></span>
58
+ <span class="line"><span> - pyscripts</span></span>
59
+ <span class="line"><span> - utils</span></span>
60
+ <span class="line"><span> - steps</span></span>
61
+ <span class="line"><span> - path.sh</span></span>
62
+ <span class="line"><span> - db.sh</span></span>
63
+ <span class="line"><span> - cmd.sh</span></span>
64
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div>`,30),f={href:"https://www.freecodecamp.org/news/symlink-tutorial-in-linux-how-to-create-and-remove-a-symbolic-link/",target:"_blank",rel:"noopener noreferrer"},g=s("code",null,"asvspoof.sh",-1),_=t(`<h2 id="asvspoof-data-preparation" tabindex="-1"><a class="header-anchor" href="#asvspoof-data-preparation"><span>ASVSpoof data preparation</span></a></h2><p>As discussed in the warm-up session, ASVSpoof aims to conduct a binary classfication. As the task layout is a bit different from the ASR task we touched on the first tutorial, so we need to use a different format to formulate the data. For here, to keep the simplicity, we stil use the exact same file as the first tutorial:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>wav.scp text utt2spk spk2utt</span></span>
65
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>But on the other hand, we change the format of text into</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>utt_id1 0</span></span>
66
+ <span class="line"><span>utt_id2 1</span></span>
67
+ <span class="line"><span>utt_id3 0</span></span>
68
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>where 0 represents real speech and 1 stands for fake speech.</p><h3 id="download-dataset" tabindex="-1"><a class="header-anchor" href="#download-dataset"><span>Download dataset</span></a></h3><p>We first download the data from google drive. Noted that the data is a subset of the ASVSpoof2019 Challenge.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># a few seconds</span></span>
69
+ <span class="line"><span style="color:#D4D4D4;">%cd /content/espnet/egs2/espnet_tutorial/asvspoof1/</span></span>
70
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">gdown </span><span style="color:#F44747;">1HRdjjmGXBTXOqOq9iijuXPCA4y_46OzP</span></span>
71
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">unzip espnet_tutorial_asvspoof.zip</span></span>
72
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="prepare-data-stage1-stage2" tabindex="-1"><a class="header-anchor" href="#prepare-data-stage1-stage2"><span>Prepare data (Stage1 &amp; Stage2)</span></a></h3><p>This time, we make the task template to be as simple as possible. The data preparation will be only two stages, including basic data preparation and wave format.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># It may take around 6 minutes</span></span>
73
+ <span class="line"><span style="color:#D4D4D4;">!./asvspoof.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">1</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">2</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train_set train </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">valid_set dev </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;eval&quot;</span></span>
74
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="asvspoof-collect-stats-✅-checkpint-1-1-point" tabindex="-1"><a class="header-anchor" href="#asvspoof-collect-stats-✅-checkpint-1-1-point"><span>ASVSpoof collect stats (✅ Checkpint 1 (1 point))</span></a></h3><p>Similar to the previous tutorial, we collect the statisitcs for the data.</p><p>In the process, the data will be passed into a iterable loader. However, remember that the <code>text</code> file is no longer the format as the ASR recipe. Therefore, we will need to use another data loader to load the corresponding information.</p>`,15),k={href:"https://github.com/espnet/espnet/blob/a7bd6522b32ec6472c13f6a2289dcdff4a846c12/espnet2/train/dataset.py#L130-L249",target:"_blank",rel:"noopener noreferrer"},w=s("code",null,"[REPLACE_ME]",-1),E=s("code",null,"asvspoof.sh",-1),x=t(`<p>After the replacement, you should be able to run the following blocks</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># It takes less than 2 minutes</span></span>
75
+ <span class="line"><span style="color:#D4D4D4;">!./asvspoof.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">3</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">3</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train_set train </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">valid_set dev </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;dev eval&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asvspoof_config conf/checkpoint1_dummy.yaml</span></span>
76
+ <span class="line"></span>
77
+ <span class="line"><span style="color:#6A9955;"># </span><span style="color:#569CD6;">NOTE</span><span style="color:#6A9955;">: Checkpoint 1</span></span>
78
+ <span class="line"><span style="color:#D4D4D4;">print_date_and_time()</span></span>
79
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="asvspoof-model" tabindex="-1"><a class="header-anchor" href="#asvspoof-model"><span>ASVSpoof Model</span></a></h2><p>In this section, we will define the ASVSpoof model and use the model to conduct the training of ASVSpoof task. For easier understanding, we first use an <code>encoder</code> to convert speech features into hidden representations and then use a <code>decoder</code> to conduct the classification.</p><h3 id="encoder-✅-checkpint-2-1-point" tabindex="-1"><a class="header-anchor" href="#encoder-✅-checkpint-2-1-point"><span>Encoder (✅ Checkpint 2 (1 point))</span></a></h3>`,5),A={href:"https://arxiv.org/abs/2010.13956",target:"_blank",rel:"noopener noreferrer"},C=t(`<p>Code-reusibility is one of the major benefits of using ESPnet as a toolkit for speech tasks. As ESPnet already support conformer block in ASR, it is easy to import into this new task.</p><p>In ESPnet, adding modules that we already have can be as simple as two-line codes. Please add lines into <code>/content/espnet/espnet2/tasks/asvspoof.py</code>. We have marked <code>TODO</code> in the scripts for your convenience.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># It takes less than 2 minutes</span></span>
80
+ <span class="line"><span style="color:#D4D4D4;">!./asvspoof.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">3</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">3</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train_set train </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">valid_set dev </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;dev eval&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asvspoof_config conf/checkpoint2.yaml</span></span>
81
+ <span class="line"></span>
82
+ <span class="line"><span style="color:#6A9955;"># </span><span style="color:#569CD6;">NOTE</span><span style="color:#6A9955;">: Checkpoint 2</span></span>
83
+ <span class="line"><span style="color:#D4D4D4;">print_date_and_time()</span></span>
84
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="decoder-✅-checkpint-3-1-point" tabindex="-1"><a class="header-anchor" href="#decoder-✅-checkpint-3-1-point"><span>Decoder (✅ Checkpint 3 (1 point))</span></a></h3><p>In this stage, we will finally start the training. As the previous tutorial, we can use the Tensorboard to monitor the process.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># Load the TensorBoard notebook extension</span></span>
85
+ <span class="line"><span style="color:#D4D4D4;">%reload_ext tensorboard</span></span>
86
+ <span class="line"></span>
87
+ <span class="line"><span style="color:#6A9955;"># Launch tensorboard before training</span></span>
88
+ <span class="line"><span style="color:#D4D4D4;">%tensorboard </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">logdir /content/espnet/egs2/espnet_tutorial/asvspoof1/exp</span></span>
89
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>After we finished the encoder, we also need to create a decoder to conduct the prediciton. As the encoder will generate hidden representations, we want to have a simple decoder to conduct mean-pooling to all the hidden representation at the time-axis. There should be another linear layer to conclude the models into binary classification. Please fill the missing part in <code>/conent/espnet/espnet2/asvspoof/decoder/linear_decoder.py</code> to finally start the training. For people who are not familiar with Pytorch, please refer the related resources for details.</p><p>Related resources that could be helpful for this checkpoint:</p><ul><li>https://pytorch.org/docs/stable/generated/torch.mean.html</li><li>https://pytorch.org/docs/stable/generated/torch.nn.Linear.html</li><li>https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html</li></ul><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">nvidia-smi</span></span>
90
+ <span class="line"></span>
91
+ <span class="line"><span style="color:#6A9955;"># Training takes around 2 minutes</span></span>
92
+ <span class="line"><span style="color:#D4D4D4;">!./asvspoof.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">4</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">4</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train_set train </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">valid_set dev </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;dev eval&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asvspoof_config conf/checkpoint2.yaml </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">inference_config conf/decode_asvspoof.yaml</span></span>
93
+ <span class="line"></span>
94
+ <span class="line"><span style="color:#6A9955;"># </span><span style="color:#569CD6;">NOTE</span><span style="color:#6A9955;">: Checkpoint 3</span></span>
95
+ <span class="line"><span style="color:#D4D4D4;">print_date_and_time()</span></span>
96
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="model-inference" tabindex="-1"><a class="header-anchor" href="#model-inference"><span>Model Inference</span></a></h2><h3 id="✅-checkpint-4-1-point" tabindex="-1"><a class="header-anchor" href="#✅-checkpint-4-1-point"><span>(✅ Checkpint 4 (1 point))</span></a></h3>`,12),S={href:"https://www.igi-global.com/dictionary/equal-error-rate-eer/35389",target:"_blank",rel:"noopener noreferrer"},F=t(`<p>Please fill the missing parts with <code>TODO</code>s in <code>/content/espnet/espnet2/bin/asvspoof_inference.py</code></p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">!./asvspoof.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">5</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">5</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train_set train </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">valid_set dev </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;eval&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asvspoof_config conf/checkpoint2.yaml </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">inference_nj </span><span style="color:#B5CEA8;">1</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">gpu_inference true</span></span>
97
+ <span class="line"></span>
98
+ <span class="line"><span style="color:#6A9955;"># </span><span style="color:#569CD6;">NOTE</span><span style="color:#6A9955;">: Checkpoint 4</span></span>
99
+ <span class="line"><span style="color:#D4D4D4;">print_date_and_time()</span></span>
100
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="scoring" tabindex="-1"><a class="header-anchor" href="#scoring"><span>Scoring</span></a></h2><h3 id="✅-checkpint-5-1-point" tabindex="-1"><a class="header-anchor" href="#✅-checkpint-5-1-point"><span>(✅ Checkpint 5 (1 point))</span></a></h3><p>We have prepred the scoring script for you. We can get the EER by the following code-block</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">!./asvspoof.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">6</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">6</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train_set train </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">valid_set dev </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;eval&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asvspoof_config conf/checkpoint2.yaml </span></span>
101
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">chmod +x scripts/utils/show_asvspoof_result.sh</span></span>
102
+ <span class="line"><span style="color:#6A9955;"># </span><span style="color:#569CD6;">NOTE</span><span style="color:#6A9955;">: Checkpoint 5</span></span>
103
+ <span class="line"><span style="color:#D4D4D4;">print_date_and_time()</span></span>
104
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="📗-exercise-1-1-point-bonus" tabindex="-1"><a class="header-anchor" href="#📗-exercise-1-1-point-bonus"><span>📗 Exercise 1 (1 point bonus)</span></a></h2><p>In the data you just downloaded, we have some extra data for training (<code>/content/espnet/egs2/espnet_tutorial/asvspoof1/espnet_asvspoof_tutorial/extend_train</code>). Please try to combine them with the training set and then conduct experiments the augmented set. You are also encouraged to change the model configuration. <strong>If you achieve a better equal error rate (EER) than the previous experiments, you can get a bonus point.</strong></p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># </span><span style="color:#569CD6;">TODO</span></span>
105
+ <span class="line"></span>
106
+ <span class="line"><span style="color:#6A9955;"># </span><span style="color:#569CD6;">NOTE</span><span style="color:#6A9955;">: Exercise 1</span></span>
107
+ <span class="line"><span style="color:#D4D4D4;">print_date_and_time()</span></span>
108
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="📗-exercise-2-1-point-bonus" tabindex="-1"><a class="header-anchor" href="#📗-exercise-2-1-point-bonus"><span>📗 Exercise 2 (1 point bonus)</span></a></h2>`,10),P={href:"https://arxiv.org/pdf/2010.13995.pdf",target:"_blank",rel:"noopener noreferrer"},q=t(`<p>We have implemented the AM softmax method located in <code>/content/espnet/espnet2/asvspoof/loss/am_softmax_loss.py</code> and also prepared the template <code>/content/espnet/espnet2/asvspoof/loss/oc_softmax_loss.py</code> for your implementation. You can follow the TODOs to implement the methods (note that the inference/train_config should change accordingly).</p><p><strong>If you successfully implement the OC-softmax and get similar/better EER, you can get a bouns point</strong></p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># </span><span style="color:#569CD6;">TODO</span></span>
109
+ <span class="line"></span>
110
+ <span class="line"><span style="color:#6A9955;"># </span><span style="color:#569CD6;">NOTE</span><span style="color:#6A9955;">: Exercise 2</span></span>
111
+ <span class="line"><span style="color:#D4D4D4;">print_date_and_time()</span></span>
112
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div>`,3);function T(I,B){const n=l("ExternalLinkIcon");return i(),p("div",null,[c,s("p",null,[s("a",d,[e("ESPnet"),a(n)]),e(" is a widely-used end-to-end speech processing toolkit. It has supported various speech processing tasks. ESPnet uses PyTorch as a main deep learning engine, and also follows Kaldi style data processing, feature extraction/format, and recipes to provide a complete setup for speech recognition and other speech processing experiments.")]),h,s("ul",null,[s("li",null,[s("a",u,[e("ESPnet repository"),a(n)])]),s("li",null,[s("a",v,[e("ESPnet documentation"),a(n)])]),s("li",null,[s("a",m,[e("ESPnet tutorial in Speech Recognition and Understanding (Fall 2021)"),a(n)])]),s("li",null,[s("a",y,[e("Recitation in Multilingual NLP (Spring 2022)"),a(n)])]),s("li",null,[s("a",b,[e("ESPnet tutorial1 in Speech Recognition and Understanding (Fall 2022)"),a(n)])])]),D,s("p",null,[e("Noted that because of the "),s("a",f,[e("symlink"),a(n)]),e(", the "),g,e(" is essentially the same for checkpoint 1.")]),_,s("p",null,[e("Fortunately, we have a wide range of data loaders for choices, which is listing in "),s("a",k,[e("here"),a(n)]),e(". Please choose the correct file format and replace the "),w,e(" token in "),E]),x,s("p",null,[e("First, we are going to focus on the encoder part. There has been a long history over the discussion of the speech encoder in our community. Given the sequential perspective, people firstly investigated recurrent neural networks. More recently, we are focusing on "),s("a",A,[e("conformer block"),a(n)]),e(", which is an extension to the transformer block. In the previous settings, we used a transformer block to collect stats. 
However, we would want to switch to conformer.")]),C,s("p",null,[e("As the training is finished, we expect to conduct ASVSpoof on the test set. To approach that, we first have to finish the inference codebase. For our task specifically, we need the log-probability of the prediction to compute "),s("a",S,[e("equal error rate (EER)"),a(n)]),e(". Therefore the output should be a float number for each utterance.")]),F,s("p",null,[e("One main issue of speech anti-spoofing research is the generalization to unseen attacks, i.e., synthesis methods not seen in training the anti-spoofing models. In fact, the test set in our scenario is exact in the same case. Recently, there is a "),s("a",P,[e("one-class learning method"),a(n)]),e(" that compacts the natural speech representations and separate them from the fake speech with a certain margin in the embedding space.")]),q])}const V=o(r,[["render",T],["__file","espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html.vue"]]),N=JSON.parse('{"path":"/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html","title":"CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"❗Important Notes❗","slug":"❗important-notes❗","link":"#❗important-notes❗","children":[]},{"level":2,"title":"Useful links","slug":"useful-links","link":"#useful-links","children":[]},{"level":2,"title":"Objectives","slug":"objectives","link":"#objectives","children":[]},{"level":2,"title":"Function to print date and time","slug":"function-to-print-date-and-time","link":"#function-to-print-date-and-time","children":[]},{"level":2,"title":"Download ESPnet","slug":"download-espnet","link":"#download-espnet","children":[]},{"level":2,"title":"Setup Python environment based on anaconda + Install ESPnet","slug":"setup-python-environment-based-on-anaconda-install-espnet","link":"#setup-python-environment-based-on-anaconda-install-espnet","children":[]},{"level":2,"title":"ASVSpoof data 
preparation","slug":"asvspoof-data-preparation","link":"#asvspoof-data-preparation","children":[{"level":3,"title":"Download dataset","slug":"download-dataset","link":"#download-dataset","children":[]},{"level":3,"title":"Prepare data (Stage1 & Stage2)","slug":"prepare-data-stage1-stage2","link":"#prepare-data-stage1-stage2","children":[]},{"level":3,"title":"ASVSpoof collect stats (✅ Checkpint 1 (1 point))","slug":"asvspoof-collect-stats-✅-checkpint-1-1-point","link":"#asvspoof-collect-stats-✅-checkpint-1-1-point","children":[]}]},{"level":2,"title":"ASVSpoof Model","slug":"asvspoof-model","link":"#asvspoof-model","children":[{"level":3,"title":"Encoder (✅ Checkpint 2 (1 point))","slug":"encoder-✅-checkpint-2-1-point","link":"#encoder-✅-checkpint-2-1-point","children":[]},{"level":3,"title":"Decoder (✅ Checkpint 3 (1 point))","slug":"decoder-✅-checkpint-3-1-point","link":"#decoder-✅-checkpint-3-1-point","children":[]}]},{"level":2,"title":"Model Inference","slug":"model-inference","link":"#model-inference","children":[{"level":3,"title":"(✅ Checkpint 4 (1 point))","slug":"✅-checkpint-4-1-point","link":"#✅-checkpint-4-1-point","children":[]}]},{"level":2,"title":"Scoring","slug":"scoring","link":"#scoring","children":[{"level":3,"title":"(✅ Checkpint 5 (1 point))","slug":"✅-checkpint-5-1-point","link":"#✅-checkpint-5-1-point","children":[]}]},{"level":2,"title":"📗 Exercise 1 (1 point bonus)","slug":"📗-exercise-1-1-point-bonus","link":"#📗-exercise-1-1-point-bonus","children":[]},{"level":2,"title":"📗 Exercise 2 (1 point bonus)","slug":"📗-exercise-2-1-point-bonus","link":"#📗-exercise-2-1-point-bonus","children":[]}],"git":{},"filePathRelative":"tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.md"}');export{V as comp,N as data};
assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js ADDED
The diff for this file is too large to render. See raw diff
 
assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as s,o as n,c as a,e}from"./app-DTS6SjJz.js";const l={},p=e(`<h1 id="espnet2-real-streaming-transformer-demonstration" tabindex="-1"><a class="header-anchor" href="#espnet2-real-streaming-transformer-demonstration"><span>ESPnet2 real streaming Transformer demonstration</span></a></h1><p>Details in &quot;Streaming Transformer ASR with Blockwise Synchronous Beam Search&quot; (https://arxiv.org/abs/2006.14941)</p><p>This local notebook provides a demonstration of streaming ASR based on Transformer using ESPnet2.</p><p>You can recognize a recorded audio file or a speech online.</p><p>Author: Keqi Deng (UCAS)</p><h2 id="train-a-streaming-transformer-model" tabindex="-1"><a class="header-anchor" href="#train-a-streaming-transformer-model"><span>Train a streaming Transformer model</span></a></h2><p>You can train a streaming Transformer model on your own corpus following the example of https://github.com/espnet/espnet/blob/master/egs2/aishell/asr1/run_streaming.sh</p><h2 id="download-pre-trained-model-and-audio-file-for-demo" tabindex="-1"><a class="header-anchor" href="#download-pre-trained-model-and-audio-file-for-demo"><span>Download pre-trained model and audio file for demo</span></a></h2><p>You can download the pre-trained model from the ESPnet_model_zoo or directly from Huggingface.</p><h3 id="for-mandarin-task-pretrained-using-aishell-1" tabindex="-1"><a class="header-anchor" href="#for-mandarin-task-pretrained-using-aishell-1"><span>For Mandarin Task (Pretrained using AISHELL-1)</span></a></h3><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">tag=</span><span style="color:#CE9178;">&#39;Emiru Tsunoo/aishell_asr_train_asr_streaming_transformer_raw_zh_char_sp_valid.acc.ave&#39;</span></span>
2
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="for-english-task-pretrained-using-tedlium2" tabindex="-1"><a class="header-anchor" href="#for-english-task-pretrained-using-tedlium2"><span>For English Task (Pretrained using Tedlium2)</span></a></h3><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">tag=</span><span style="color:#CE9178;">&#39;D-Keqi/espnet_asr_train_asr_streaming_transformer_raw_en_bpe500_sp_valid.acc.ave&#39;</span></span>
3
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h2 id="import-packages" tabindex="-1"><a class="header-anchor" href="#import-packages"><span>Import packages</span></a></h2><p>Make sure that you have installed the latest ESPnet</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> sys</span></span>
4
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> espnet</span></span>
5
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.asr_inference_streaming </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2TextStreaming</span></span>
6
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet_model_zoo.downloader </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> ModelDownloader</span></span>
7
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> argparse</span></span>
8
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> numpy </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> np</span></span>
9
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> wave</span></span>
10
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="prepare-for-inference" tabindex="-1"><a class="header-anchor" href="#prepare-for-inference"><span>Prepare for inference</span></a></h2><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">d=ModelDownloader()</span></span>
11
+ <span class="line"><span style="color:#D4D4D4;">speech2text = Speech2TextStreaming(</span></span>
12
+ <span class="line"><span style="color:#D4D4D4;"> **d.download_and_unpack(tag),</span></span>
13
+ <span class="line"><span style="color:#9CDCFE;"> token_type</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">None</span><span style="color:#D4D4D4;">,</span></span>
14
+ <span class="line"><span style="color:#9CDCFE;"> bpemodel</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">None</span><span style="color:#D4D4D4;">,</span></span>
15
+ <span class="line"><span style="color:#9CDCFE;"> maxlenratio</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0.0</span><span style="color:#D4D4D4;">,</span></span>
16
+ <span class="line"><span style="color:#9CDCFE;"> minlenratio</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0.0</span><span style="color:#D4D4D4;">,</span></span>
17
+ <span class="line"><span style="color:#9CDCFE;"> beam_size</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">20</span><span style="color:#D4D4D4;">,</span></span>
18
+ <span class="line"><span style="color:#9CDCFE;"> ctc_weight</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0.5</span><span style="color:#D4D4D4;">,</span></span>
19
+ <span class="line"><span style="color:#9CDCFE;"> lm_weight</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0.0</span><span style="color:#D4D4D4;">,</span></span>
20
+ <span class="line"><span style="color:#9CDCFE;"> penalty</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0.0</span><span style="color:#D4D4D4;">,</span></span>
21
+ <span class="line"><span style="color:#9CDCFE;"> nbest</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
22
+ <span class="line"><span style="color:#9CDCFE;"> device</span><span style="color:#D4D4D4;"> = </span><span style="color:#CE9178;">&quot;cpu&quot;</span><span style="color:#D4D4D4;">,</span></span>
23
+ <span class="line"><span style="color:#9CDCFE;"> disable_repetition_detection</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">True</span><span style="color:#D4D4D4;">,</span></span>
24
+ <span class="line"><span style="color:#9CDCFE;"> decoder_text_length_limit</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">,</span></span>
25
+ <span class="line"><span style="color:#9CDCFE;"> encoded_feat_length_limit</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0</span></span>
26
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
27
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">prev_lines = </span><span style="color:#B5CEA8;">0</span></span>
28
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> progress_output</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">text</span><span style="color:#D4D4D4;">):</span></span>
29
+ <span class="line"><span style="color:#569CD6;"> global</span><span style="color:#D4D4D4;"> prev_lines</span></span>
30
+ <span class="line"><span style="color:#D4D4D4;"> lines=[</span><span style="color:#CE9178;">&#39;&#39;</span><span style="color:#D4D4D4;">]</span></span>
31
+ <span class="line"><span style="color:#C586C0;"> for</span><span style="color:#D4D4D4;"> i </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> text:</span></span>
32
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#DCDCAA;"> len</span><span style="color:#D4D4D4;">(lines[-</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">]) &gt; </span><span style="color:#B5CEA8;">100</span><span style="color:#D4D4D4;">:</span></span>
33
+ <span class="line"><span style="color:#D4D4D4;"> lines.append(</span><span style="color:#CE9178;">&#39;&#39;</span><span style="color:#D4D4D4;">)</span></span>
34
+ <span class="line"><span style="color:#D4D4D4;"> lines[-</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">] += i</span></span>
35
+ <span class="line"><span style="color:#C586C0;"> for</span><span style="color:#D4D4D4;"> i,line </span><span style="color:#C586C0;">in</span><span style="color:#DCDCAA;"> enumerate</span><span style="color:#D4D4D4;">(lines):</span></span>
36
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#D4D4D4;"> i == prev_lines:</span></span>
37
+ <span class="line"><span style="color:#D4D4D4;"> sys.stderr.write(</span><span style="color:#CE9178;">&#39;</span><span style="color:#D7BA7D;">\\n\\r</span><span style="color:#CE9178;">&#39;</span><span style="color:#D4D4D4;">)</span></span>
38
+ <span class="line"><span style="color:#C586C0;"> else</span><span style="color:#D4D4D4;">:</span></span>
39
+ <span class="line"><span style="color:#D4D4D4;"> sys.stderr.write(</span><span style="color:#CE9178;">&#39;</span><span style="color:#D7BA7D;">\\r\\033</span><span style="color:#CE9178;">[B</span><span style="color:#D7BA7D;">\\033</span><span style="color:#CE9178;">[K&#39;</span><span style="color:#D4D4D4;">)</span></span>
40
+ <span class="line"><span style="color:#D4D4D4;"> sys.stderr.write(line)</span></span>
41
+ <span class="line"></span>
42
+ <span class="line"><span style="color:#D4D4D4;"> prev_lines = </span><span style="color:#DCDCAA;">len</span><span style="color:#D4D4D4;">(lines)</span></span>
43
+ <span class="line"><span style="color:#D4D4D4;"> sys.stderr.flush()</span></span>
44
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> recognize</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">wavfile</span><span style="color:#D4D4D4;">):</span></span>
45
+ <span class="line"><span style="color:#C586C0;"> with</span><span style="color:#D4D4D4;"> wave.open(wavfile, </span><span style="color:#CE9178;">&#39;rb&#39;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> wavfile:</span></span>
46
+ <span class="line"><span style="color:#D4D4D4;"> ch=wavfile.getnchannels()</span></span>
47
+ <span class="line"><span style="color:#D4D4D4;"> bits=wavfile.getsampwidth()</span></span>
48
+ <span class="line"><span style="color:#D4D4D4;"> rate=wavfile.getframerate()</span></span>
49
+ <span class="line"><span style="color:#D4D4D4;"> nframes=wavfile.getnframes()</span></span>
50
+ <span class="line"><span style="color:#D4D4D4;"> buf = wavfile.readframes(-</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">)</span></span>
51
+ <span class="line"><span style="color:#D4D4D4;"> data=np.frombuffer(buf, </span><span style="color:#9CDCFE;">dtype</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;int16&#39;</span><span style="color:#D4D4D4;">)</span></span>
52
+ <span class="line"><span style="color:#D4D4D4;"> speech = data.astype(np.float16)/</span><span style="color:#B5CEA8;">32767.0</span><span style="color:#6A9955;"> #32767 is the upper limit of 16-bit binary numbers and is used for the normalization of int to float.</span></span>
53
+ <span class="line"><span style="color:#D4D4D4;"> sim_chunk_length = </span><span style="color:#B5CEA8;">640</span></span>
54
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#D4D4D4;"> sim_chunk_length &gt; </span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">:</span></span>
55
+ <span class="line"><span style="color:#C586C0;"> for</span><span style="color:#D4D4D4;"> i </span><span style="color:#C586C0;">in</span><span style="color:#DCDCAA;"> range</span><span style="color:#D4D4D4;">(</span><span style="color:#DCDCAA;">len</span><span style="color:#D4D4D4;">(speech)//sim_chunk_length):</span></span>
56
+ <span class="line"><span style="color:#D4D4D4;"> results = speech2text(</span><span style="color:#9CDCFE;">speech</span><span style="color:#D4D4D4;">=speech[i*sim_chunk_length:(i+</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">)*sim_chunk_length], </span><span style="color:#9CDCFE;">is_final</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">False</span><span style="color:#D4D4D4;">)</span></span>
57
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#D4D4D4;"> results </span><span style="color:#569CD6;">is</span><span style="color:#569CD6;"> not</span><span style="color:#569CD6;"> None</span><span style="color:#569CD6;"> and</span><span style="color:#DCDCAA;"> len</span><span style="color:#D4D4D4;">(results) &gt; </span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">:</span></span>
58
+ <span class="line"><span style="color:#D4D4D4;"> nbests = [text </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> text, token, token_int, hyp </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> results]</span></span>
59
+ <span class="line"><span style="color:#D4D4D4;"> text = nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">] </span><span style="color:#C586C0;">if</span><span style="color:#D4D4D4;"> nbests </span><span style="color:#569CD6;">is</span><span style="color:#569CD6;"> not</span><span style="color:#569CD6;"> None</span><span style="color:#569CD6;"> and</span><span style="color:#DCDCAA;"> len</span><span style="color:#D4D4D4;">(nbests) &gt; </span><span style="color:#B5CEA8;">0</span><span style="color:#C586C0;"> else</span><span style="color:#CE9178;"> &quot;&quot;</span></span>
60
+ <span class="line"><span style="color:#D4D4D4;"> progress_output(nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">])</span></span>
61
+ <span class="line"><span style="color:#C586C0;"> else</span><span style="color:#D4D4D4;">:</span></span>
62
+ <span class="line"><span style="color:#D4D4D4;"> progress_output(</span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">)</span></span>
63
+ <span class="line"><span style="color:#D4D4D4;"> </span></span>
64
+ <span class="line"><span style="color:#D4D4D4;"> results = speech2text(speech[(i+</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">)*sim_chunk_length:</span><span style="color:#DCDCAA;">len</span><span style="color:#D4D4D4;">(speech)], </span><span style="color:#9CDCFE;">is_final</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">True</span><span style="color:#D4D4D4;">)</span></span>
65
+ <span class="line"><span style="color:#C586C0;"> else</span><span style="color:#D4D4D4;">:</span></span>
66
+ <span class="line"><span style="color:#D4D4D4;"> results = speech2text(speech, </span><span style="color:#9CDCFE;">is_final</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">True</span><span style="color:#D4D4D4;">)</span></span>
67
+ <span class="line"><span style="color:#D4D4D4;"> nbests = [text </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> text, token, token_int, hyp </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> results]</span></span>
68
+ <span class="line"><span style="color:#D4D4D4;"> progress_output(nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">])</span></span>
69
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="recognize-the-audio-file" tabindex="-1"><a class="header-anchor" href="#recognize-the-audio-file"><span>Recognize the audio file</span></a></h2><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;">#You can upload your own audio file for recognition, and also we provide some demo audio files that you can download from Google drive. </span></span>
70
+ <span class="line"><span style="color:#6A9955;">#For Mandarin task, the demo file comes from the AISSHELL-1: https://drive.google.com/file/d/1l8w93r8Bs5FtC3A-1ydEqFQdP4k6FiUL/view?usp=sharing</span></span>
71
+ <span class="line"><span style="color:#6A9955;">#wavfile=&#39;./BAC009S0724W0121.wav&#39;</span></span>
72
+ <span class="line"><span style="color:#6A9955;">#For English task, the demo file comes from the Librispeech: https://drive.google.com/file/d/1l71ZUNQ6qQk95T54H0tH_OEwZvWnEL4u/view?usp=sharing</span></span>
73
+ <span class="line"><span style="color:#6A9955;">#wavfile=&#39;./61-70968-0000.wav&#39;</span></span>
74
+ <span class="line"><span style="color:#D4D4D4;">recognize(wavfile)</span></span>
75
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="recognize-the-speech-from-speaker" tabindex="-1"><a class="header-anchor" href="#recognize-the-speech-from-speaker"><span>Recognize the speech from speaker</span></a></h2><h3 id="install-pyaudio" tabindex="-1"><a class="header-anchor" href="#install-pyaudio"><span>Install pyaudio</span></a></h3><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> pyaudio</span></span>
76
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="streamingly-recognize-with-pyaudio" tabindex="-1"><a class="header-anchor" href="#streamingly-recognize-with-pyaudio"><span>Streamingly recognize with pyaudio</span></a></h3><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">CHUNK=</span><span style="color:#B5CEA8;">2048</span></span>
77
+ <span class="line"><span style="color:#D4D4D4;">FORMAT=pyaudio.paInt16</span></span>
78
+ <span class="line"><span style="color:#D4D4D4;">CHANNELS=</span><span style="color:#B5CEA8;">1</span></span>
79
+ <span class="line"><span style="color:#D4D4D4;">RATE=</span><span style="color:#B5CEA8;">16000</span></span>
80
+ <span class="line"><span style="color:#D4D4D4;">RECORD_SECONDS=</span><span style="color:#B5CEA8;">5</span></span>
81
+ <span class="line"><span style="color:#D4D4D4;">p=pyaudio.PyAudio()</span></span>
82
+ <span class="line"><span style="color:#D4D4D4;">stream = p.open(</span><span style="color:#9CDCFE;">format</span><span style="color:#D4D4D4;">=FORMAT,</span><span style="color:#9CDCFE;">channels</span><span style="color:#D4D4D4;">=CHANNELS,</span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=RATE,</span><span style="color:#9CDCFE;">input</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">True</span><span style="color:#D4D4D4;">,</span><span style="color:#9CDCFE;">frames_per_buffer</span><span style="color:#D4D4D4;">=CHUNK)</span></span>
83
+ <span class="line"><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> i </span><span style="color:#C586C0;">in</span><span style="color:#DCDCAA;"> range</span><span style="color:#D4D4D4;">(</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">,</span><span style="color:#4EC9B0;">int</span><span style="color:#D4D4D4;">(RATE/CHUNK*RECORD_SECONDS)+</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">):</span></span>
84
+ <span class="line"><span style="color:#D4D4D4;"> data=stream.read(CHUNK)</span></span>
85
+ <span class="line"><span style="color:#D4D4D4;"> data=np.frombuffer(data, </span><span style="color:#9CDCFE;">dtype</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;int16&#39;</span><span style="color:#D4D4D4;">)</span></span>
86
+ <span class="line"><span style="color:#D4D4D4;"> data=data.astype(np.float16)/</span><span style="color:#B5CEA8;">32767.0</span><span style="color:#6A9955;"> #32767 is the upper limit of 16-bit binary numbers and is used for the normalization of int to float.</span></span>
87
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#D4D4D4;"> i==</span><span style="color:#4EC9B0;">int</span><span style="color:#D4D4D4;">(RATE/CHUNK*RECORD_SECONDS):</span></span>
88
+ <span class="line"><span style="color:#D4D4D4;"> results = speech2text(</span><span style="color:#9CDCFE;">speech</span><span style="color:#D4D4D4;">=data, </span><span style="color:#9CDCFE;">is_final</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">True</span><span style="color:#D4D4D4;">)</span></span>
89
+ <span class="line"><span style="color:#C586C0;"> break</span></span>
90
+ <span class="line"><span style="color:#D4D4D4;"> results = speech2text(</span><span style="color:#9CDCFE;">speech</span><span style="color:#D4D4D4;">=data, </span><span style="color:#9CDCFE;">is_final</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">False</span><span style="color:#D4D4D4;">)</span></span>
91
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#D4D4D4;"> results </span><span style="color:#569CD6;">is</span><span style="color:#569CD6;"> not</span><span style="color:#569CD6;"> None</span><span style="color:#569CD6;"> and</span><span style="color:#DCDCAA;"> len</span><span style="color:#D4D4D4;">(results) &gt; </span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">:</span></span>
92
+ <span class="line"><span style="color:#D4D4D4;"> nbests = [text </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> text, token, token_int, hyp </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> results]</span></span>
93
+ <span class="line"><span style="color:#D4D4D4;"> text = nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">] </span><span style="color:#C586C0;">if</span><span style="color:#D4D4D4;"> nbests </span><span style="color:#569CD6;">is</span><span style="color:#569CD6;"> not</span><span style="color:#569CD6;"> None</span><span style="color:#569CD6;"> and</span><span style="color:#DCDCAA;"> len</span><span style="color:#D4D4D4;">(nbests) &gt; </span><span style="color:#B5CEA8;">0</span><span style="color:#C586C0;"> else</span><span style="color:#CE9178;"> &quot;&quot;</span></span>
94
+ <span class="line"><span style="color:#D4D4D4;"> progress_output(nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">])</span></span>
95
+ <span class="line"><span style="color:#C586C0;"> else</span><span style="color:#D4D4D4;">:</span></span>
96
+ <span class="line"><span style="color:#D4D4D4;"> progress_output(</span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">)</span></span>
97
+ <span class="line"><span style="color:#D4D4D4;">nbests = [text </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> text, token, token_int, hyp </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> results]</span></span>
98
+ <span class="line"><span style="color:#D4D4D4;">progress_output(nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">])</span></span>
99
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div>`,27),o=[p];function r(i,t){return n(),a("div",null,o)}const D=s(l,[["render",r],["__file","espnet2_streaming_asr_demo.html.vue"]]),d=JSON.parse('{"path":"/espnet2/asr/espnet2_streaming_asr_demo.html","title":"ESPnet2 real streaming Transformer demonstration","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Train a streaming Transformer model","slug":"train-a-streaming-transformer-model","link":"#train-a-streaming-transformer-model","children":[]},{"level":2,"title":"Download pre-trained model and audio file for demo","slug":"download-pre-trained-model-and-audio-file-for-demo","link":"#download-pre-trained-model-and-audio-file-for-demo","children":[{"level":3,"title":"For Mandarin Task (Pretrained using AISHELL-1)","slug":"for-mandarin-task-pretrained-using-aishell-1","link":"#for-mandarin-task-pretrained-using-aishell-1","children":[]},{"level":3,"title":"For English Task (Pretrained using Tedlium2)","slug":"for-english-task-pretrained-using-tedlium2","link":"#for-english-task-pretrained-using-tedlium2","children":[]}]},{"level":2,"title":"Import packages","slug":"import-packages","link":"#import-packages","children":[]},{"level":2,"title":"Prepare for 
inference","slug":"prepare-for-inference","link":"#prepare-for-inference","children":[]},{"level":2,"title":"Recognize the audio file","slug":"recognize-the-audio-file","link":"#recognize-the-audio-file","children":[]},{"level":2,"title":"Recognize the speech from speaker","slug":"recognize-the-speech-from-speaker","link":"#recognize-the-speech-from-speaker","children":[{"level":3,"title":"Install pyaudio","slug":"install-pyaudio","link":"#install-pyaudio","children":[]},{"level":3,"title":"Streamingly recognize with pyaudio","slug":"streamingly-recognize-with-pyaudio","link":"#streamingly-recognize-with-pyaudio","children":[]}]}],"git":{},"filePathRelative":"espnet2/asr/espnet2_streaming_asr_demo.md"}');export{D as comp,d as data};
assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as l,r as i,o as t,c as p,a as s,b as e,d as n,e as o}from"./app-DTS6SjJz.js";const r={},c={href:"https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_tts_realtime_demo.ipynb",target:"_blank",rel:"noopener noreferrer"},d=s("img",{src:"https://colab.research.google.com/assets/colab-badge.svg",alt:"Open In Colab"},null,-1),u=s("h1",{id:"espnet2-tts-realtime-demonstration",tabindex:"-1"},[s("a",{class:"header-anchor",href:"#espnet2-tts-realtime-demonstration"},[s("span",null,"ESPnet2-TTS realtime demonstration")])],-1),v=s("p",null,"This notebook provides a demonstration of the realtime E2E-TTS using ESPnet2-TTS and ParallelWaveGAN repo.",-1),m=s("ul",null,[s("li",null,"ESPnet2-TTS: https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1"),s("li",null,"ParallelWaveGAN: https://github.com/kan-bayashi/ParallelWaveGAN")],-1),h={href:"https://github.com/kan-bayashi",target:"_blank",rel:"noopener noreferrer"},b=o(`<h2 id="installation" tabindex="-1"><a class="header-anchor" href="#installation"><span>Installation</span></a></h2><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># NOTE: pip shows imcompatible errors due to preinstalled libraries but you do not need to care</span></span>
2
+ <span class="line"><span>!pip install -q espnet==202308 pypinyin==0.44.0 parallel_wavegan==0.5.4 gdown==4.4.0 espnet_model_zoo</span></span>
3
+ <span class="line"><span></span></span>
4
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="single-speaker-model-demo" tabindex="-1"><a class="header-anchor" href="#single-speaker-model-demo"><span>Single speaker model demo</span></a></h2><h3 id="model-selection" tabindex="-1"><a class="header-anchor" href="#model-selection"><span>Model Selection</span></a></h3><p>Please select model: English, Japanese, and Mandarin are supported.</p><p>You can try end-to-end text2wav model &amp; combination of text2mel and vocoder.<br> If you use text2wav model, you do not need to use vocoder (automatically disabled).</p><p><strong>Text2wav models</strong>:</p><ul><li>VITS</li></ul><p><strong>Text2mel models</strong>:</p><ul><li>Tacotron2</li><li>Transformer-TTS</li><li>(Conformer) FastSpeech</li><li>(Conformer) FastSpeech2</li></ul><p><strong>Vocoders</strong>:</p><ul><li>Parallel WaveGAN</li><li>Multi-band MelGAN</li><li>HiFiGAN</li><li>Style MelGAN.</li></ul><blockquote><p>The terms of use follow that of each corpus. 
We use the following corpora:</p></blockquote><ul><li><code>ljspeech_*</code>: LJSpeech dataset <ul><li>https://keithito.com/LJ-Speech-Dataset/</li></ul></li><li><code>jsut_*</code>: JSUT corpus <ul><li>https://sites.google.com/site/shinnosuketakamichi/publication/jsut</li></ul></li><li><code>jvs_*</code>: JVS corpus + JSUT corpus <ul><li>https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_corpus</li><li>https://sites.google.com/site/shinnosuketakamichi/publication/jsut</li></ul></li><li><code>tsukuyomi_*</code>: つくよみちゃんコーパス + JSUT corpus <ul><li>https://tyc.rei-yumesaki.net/material/corpus/</li><li>https://sites.google.com/site/shinnosuketakamichi/publication/jsut</li></ul></li><li><code>csmsc_*</code>: Chinese Standard Mandarin Speech Corpus <ul><li>https://www.data-baker.com/open_source.html</li></ul></li></ul><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>#@title Choose English model { run: &quot;auto&quot; }</span></span>
5
+ <span class="line"><span>lang = &#39;English&#39;</span></span>
6
+ <span class="line"><span>tag = &#39;kan-bayashi/ljspeech_vits&#39; #@param [&quot;kan-bayashi/ljspeech_tacotron2&quot;, &quot;kan-bayashi/ljspeech_fastspeech&quot;, &quot;kan-bayashi/ljspeech_fastspeech2&quot;, &quot;kan-bayashi/ljspeech_conformer_fastspeech2&quot;, &quot;kan-bayashi/ljspeech_joint_finetune_conformer_fastspeech2_hifigan&quot;, &quot;kan-bayashi/ljspeech_joint_train_conformer_fastspeech2_hifigan&quot;, &quot;kan-bayashi/ljspeech_vits&quot;] {type:&quot;string&quot;}</span></span>
7
+ <span class="line"><span>vocoder_tag = &quot;none&quot; #@param [&quot;none&quot;, &quot;parallel_wavegan/ljspeech_parallel_wavegan.v1&quot;, &quot;parallel_wavegan/ljspeech_full_band_melgan.v2&quot;, &quot;parallel_wavegan/ljspeech_multi_band_melgan.v2&quot;, &quot;parallel_wavegan/ljspeech_hifigan.v1&quot;, &quot;parallel_wavegan/ljspeech_style_melgan.v1&quot;] {type:&quot;string&quot;}</span></span>
8
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>#@title Choose Japanese model { run: &quot;auto&quot; }</span></span>
9
+ <span class="line"><span>lang = &#39;Japanese&#39;</span></span>
10
+ <span class="line"><span>tag = &#39;kan-bayashi/jsut_full_band_vits_prosody&#39; #@param [&quot;kan-bayashi/jsut_tacotron2&quot;, &quot;kan-bayashi/jsut_transformer&quot;, &quot;kan-bayashi/jsut_fastspeech&quot;, &quot;kan-bayashi/jsut_fastspeech2&quot;, &quot;kan-bayashi/jsut_conformer_fastspeech2&quot;, &quot;kan-bayashi/jsut_conformer_fastspeech2_accent&quot;, &quot;kan-bayashi/jsut_conformer_fastspeech2_accent_with_pause&quot;, &quot;kan-bayashi/jsut_vits_accent_with_pause&quot;, &quot;kan-bayashi/jsut_full_band_vits_accent_with_pause&quot;, &quot;kan-bayashi/jsut_tacotron2_prosody&quot;, &quot;kan-bayashi/jsut_transformer_prosody&quot;, &quot;kan-bayashi/jsut_conformer_fastspeech2_tacotron2_prosody&quot;, &quot;kan-bayashi/jsut_vits_prosody&quot;, &quot;kan-bayashi/jsut_full_band_vits_prosody&quot;, &quot;kan-bayashi/jvs_jvs010_vits_prosody&quot;, &quot;kan-bayashi/tsukuyomi_full_band_vits_prosody&quot;] {type:&quot;string&quot;}</span></span>
11
+ <span class="line"><span>vocoder_tag = &#39;none&#39; #@param [&quot;none&quot;, &quot;parallel_wavegan/jsut_parallel_wavegan.v1&quot;, &quot;parallel_wavegan/jsut_multi_band_melgan.v2&quot;, &quot;parallel_wavegan/jsut_style_melgan.v1&quot;, &quot;parallel_wavegan/jsut_hifigan.v1&quot;] {type:&quot;string&quot;}</span></span>
12
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>#@title Choose Mandarin model { run: &quot;auto&quot; }</span></span>
13
+ <span class="line"><span>lang = &#39;Mandarin&#39;</span></span>
14
+ <span class="line"><span>tag = &#39;kan-bayashi/csmsc_full_band_vits&#39; #@param [&quot;kan-bayashi/csmsc_tacotron2&quot;, &quot;kan-bayashi/csmsc_transformer&quot;, &quot;kan-bayashi/csmsc_fastspeech&quot;, &quot;kan-bayashi/csmsc_fastspeech2&quot;, &quot;kan-bayashi/csmsc_conformer_fastspeech2&quot;, &quot;kan-bayashi/csmsc_vits&quot;, &quot;kan-bayashi/csmsc_full_band_vits&quot;] {type: &quot;string&quot;}</span></span>
15
+ <span class="line"><span>vocoder_tag = &quot;none&quot; #@param [&quot;none&quot;, &quot;parallel_wavegan/csmsc_parallel_wavegan.v1&quot;, &quot;parallel_wavegan/csmsc_multi_band_melgan.v2&quot;, &quot;parallel_wavegan/csmsc_hifigan.v1&quot;, &quot;parallel_wavegan/csmsc_style_melgan.v1&quot;] {type:&quot;string&quot;}</span></span>
16
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="model-setup" tabindex="-1"><a class="header-anchor" href="#model-setup"><span>Model Setup</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from espnet2.bin.tts_inference import Text2Speech</span></span>
17
+ <span class="line"><span>from espnet2.utils.types import str_or_none</span></span>
18
+ <span class="line"><span></span></span>
19
+ <span class="line"><span>text2speech = Text2Speech.from_pretrained(</span></span>
20
+ <span class="line"><span> model_tag=str_or_none(tag),</span></span>
21
+ <span class="line"><span> vocoder_tag=str_or_none(vocoder_tag),</span></span>
22
+ <span class="line"><span> device=&quot;cuda&quot;,</span></span>
23
+ <span class="line"><span> # Only for Tacotron 2 &amp; Transformer</span></span>
24
+ <span class="line"><span> threshold=0.5,</span></span>
25
+ <span class="line"><span> # Only for Tacotron 2</span></span>
26
+ <span class="line"><span> minlenratio=0.0,</span></span>
27
+ <span class="line"><span> maxlenratio=10.0,</span></span>
28
+ <span class="line"><span> use_att_constraint=False,</span></span>
29
+ <span class="line"><span> backward_window=1,</span></span>
30
+ <span class="line"><span> forward_window=3,</span></span>
31
+ <span class="line"><span> # Only for FastSpeech &amp; FastSpeech2 &amp; VITS</span></span>
32
+ <span class="line"><span> speed_control_alpha=1.0,</span></span>
33
+ <span class="line"><span> # Only for VITS</span></span>
34
+ <span class="line"><span> noise_scale=0.333,</span></span>
35
+ <span class="line"><span> noise_scale_dur=0.333,</span></span>
36
+ <span class="line"><span>)</span></span>
37
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="synthesis" tabindex="-1"><a class="header-anchor" href="#synthesis"><span>Synthesis</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import time</span></span>
38
+ <span class="line"><span>import torch</span></span>
39
+ <span class="line"><span></span></span>
40
+ <span class="line"><span># decide the input sentence by yourself</span></span>
41
+ <span class="line"><span>print(f&quot;Input your favorite sentence in {lang}.&quot;)</span></span>
42
+ <span class="line"><span>x = input()</span></span>
43
+ <span class="line"><span></span></span>
44
+ <span class="line"><span># synthesis</span></span>
45
+ <span class="line"><span>with torch.no_grad():</span></span>
46
+ <span class="line"><span> start = time.time()</span></span>
47
+ <span class="line"><span> wav = text2speech(x)[&quot;wav&quot;]</span></span>
48
+ <span class="line"><span>rtf = (time.time() - start) / (len(wav) / text2speech.fs)</span></span>
49
+ <span class="line"><span>print(f&quot;RTF = {rtf:5f}&quot;)</span></span>
50
+ <span class="line"><span></span></span>
51
+ <span class="line"><span># let us listen to generated samples</span></span>
52
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
53
+ <span class="line"><span>display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))</span></span>
54
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="multi-speaker-model-demo" tabindex="-1"><a class="header-anchor" href="#multi-speaker-model-demo"><span>Multi-speaker Model Demo</span></a></h2><h3 id="model-selection-1" tabindex="-1"><a class="header-anchor" href="#model-selection-1"><span>Model Selection</span></a></h3><p>Now we provide only English multi-speaker pretrained model.</p><blockquote><p>The terms of use follow that of each corpus. We use the following corpora:</p></blockquote><ul><li><code>libritts_*</code>: LibriTTS corpus <ul><li>http://www.openslr.org/60</li></ul></li><li><code>vctk_*</code>: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit <ul><li>http://www.udialogue.org/download/cstr-vctk-corpus.html</li></ul></li></ul><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>#@title English multi-speaker pretrained model { run: &quot;auto&quot; }</span></span>
55
+ <span class="line"><span>lang = &#39;English&#39;</span></span>
56
+ <span class="line"><span>tag = &#39;kan-bayashi/vctk_full_band_multi_spk_vits&#39; #@param [&quot;kan-bayashi/vctk_gst_tacotron2&quot;, &quot;kan-bayashi/vctk_gst_transformer&quot;, &quot;kan-bayashi/vctk_xvector_tacotron2&quot;, &quot;kan-bayashi/vctk_xvector_transformer&quot;, &quot;kan-bayashi/vctk_xvector_conformer_fastspeech2&quot;, &quot;kan-bayashi/vctk_gst+xvector_tacotron2&quot;, &quot;kan-bayashi/vctk_gst+xvector_transformer&quot;, &quot;kan-bayashi/vctk_gst+xvector_conformer_fastspeech2&quot;, &quot;kan-bayashi/vctk_multi_spk_vits&quot;, &quot;kan-bayashi/vctk_full_band_multi_spk_vits&quot;, &quot;kan-bayashi/libritts_xvector_transformer&quot;, &quot;kan-bayashi/libritts_xvector_conformer_fastspeech2&quot;, &quot;kan-bayashi/libritts_gst+xvector_transformer&quot;, &quot;kan-bayashi/libritts_gst+xvector_conformer_fastspeech2&quot;, &quot;kan-bayashi/libritts_xvector_vits&quot;] {type:&quot;string&quot;}</span></span>
57
+ <span class="line"><span>vocoder_tag = &quot;none&quot; #@param [&quot;none&quot;, &quot;parallel_wavegan/vctk_parallel_wavegan.v1.long&quot;, &quot;parallel_wavegan/vctk_multi_band_melgan.v2&quot;, &quot;parallel_wavegan/vctk_style_melgan.v1&quot;, &quot;parallel_wavegan/vctk_hifigan.v1&quot;, &quot;parallel_wavegan/libritts_parallel_wavegan.v1.long&quot;, &quot;parallel_wavegan/libritts_multi_band_melgan.v2&quot;, &quot;parallel_wavegan/libritts_hifigan.v1&quot;, &quot;parallel_wavegan/libritts_style_melgan.v1&quot;] {type:&quot;string&quot;}</span></span>
58
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="model-setup-1" tabindex="-1"><a class="header-anchor" href="#model-setup-1"><span>Model Setup</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from espnet2.bin.tts_inference import Text2Speech</span></span>
59
+ <span class="line"><span>from espnet2.utils.types import str_or_none</span></span>
60
+ <span class="line"><span></span></span>
61
+ <span class="line"><span>text2speech = Text2Speech.from_pretrained(</span></span>
62
+ <span class="line"><span> model_tag=str_or_none(tag),</span></span>
63
+ <span class="line"><span> vocoder_tag=str_or_none(vocoder_tag),</span></span>
64
+ <span class="line"><span> device=&quot;cuda&quot;,</span></span>
65
+ <span class="line"><span> # Only for Tacotron 2 &amp; Transformer</span></span>
66
+ <span class="line"><span> threshold=0.5,</span></span>
67
+ <span class="line"><span> # Only for Tacotron 2</span></span>
68
+ <span class="line"><span> minlenratio=0.0,</span></span>
69
+ <span class="line"><span> maxlenratio=10.0,</span></span>
70
+ <span class="line"><span> use_att_constraint=False,</span></span>
71
+ <span class="line"><span> backward_window=1,</span></span>
72
+ <span class="line"><span> forward_window=3,</span></span>
73
+ <span class="line"><span> # Only for FastSpeech &amp; FastSpeech2 &amp; VITS</span></span>
74
+ <span class="line"><span> speed_control_alpha=1.0,</span></span>
75
+ <span class="line"><span> # Only for VITS</span></span>
76
+ <span class="line"><span> noise_scale=0.333,</span></span>
77
+ <span class="line"><span> noise_scale_dur=0.333,</span></span>
78
+ <span class="line"><span>)</span></span>
79
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="speaker-selection" tabindex="-1"><a class="header-anchor" href="#speaker-selection"><span>Speaker selection</span></a></h3><p>For multi-speaker model, we need to provide X-vector and/or the reference speech to decide the speaker characteristics.<br> For X-vector, you can select the speaker from the dumped x-vectors.<br> For the reference speech, you can use any speech but please make sure the sampling rate is matched.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import glob</span></span>
80
+ <span class="line"><span>import os</span></span>
81
+ <span class="line"><span>import numpy as np</span></span>
82
+ <span class="line"><span>import kaldiio</span></span>
83
+ <span class="line"><span></span></span>
84
+ <span class="line"><span># Get model directory path</span></span>
85
+ <span class="line"><span>from espnet_model_zoo.downloader import ModelDownloader</span></span>
86
+ <span class="line"><span>d = ModelDownloader()</span></span>
87
+ <span class="line"><span>model_dir = os.path.dirname(d.download_and_unpack(tag)[&quot;train_config&quot;])</span></span>
88
+ <span class="line"><span></span></span>
89
+ <span class="line"><span># X-vector selection</span></span>
90
+ <span class="line"><span>spembs = None</span></span>
91
+ <span class="line"><span>if text2speech.use_spembs:</span></span>
92
+ <span class="line"><span> xvector_ark = [p for p in glob.glob(f&quot;{model_dir}/../../dump/**/spk_xvector.ark&quot;, recursive=True) if &quot;tr&quot; in p][0]</span></span>
93
+ <span class="line"><span> xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}</span></span>
94
+ <span class="line"><span> spks = list(xvectors.keys())</span></span>
95
+ <span class="line"><span></span></span>
96
+ <span class="line"><span> # randomly select speaker</span></span>
97
+ <span class="line"><span> random_spk_idx = np.random.randint(0, len(spks))</span></span>
98
+ <span class="line"><span> spk = spks[random_spk_idx]</span></span>
99
+ <span class="line"><span> spembs = xvectors[spk]</span></span>
100
+ <span class="line"><span> print(f&quot;selected spk: {spk}&quot;)</span></span>
101
+ <span class="line"><span></span></span>
102
+ <span class="line"><span># Speaker ID selection</span></span>
103
+ <span class="line"><span>sids = None</span></span>
104
+ <span class="line"><span>if text2speech.use_sids:</span></span>
105
+ <span class="line"><span> spk2sid = glob.glob(f&quot;{model_dir}/../../dump/**/spk2sid&quot;, recursive=True)[0]</span></span>
106
+ <span class="line"><span> with open(spk2sid) as f:</span></span>
107
+ <span class="line"><span> lines = [line.strip() for line in f.readlines()]</span></span>
108
+ <span class="line"><span> sid2spk = {int(line.split()[1]): line.split()[0] for line in lines}</span></span>
109
+ <span class="line"><span> </span></span>
110
+ <span class="line"><span> # randomly select speaker</span></span>
111
+ <span class="line"><span> sids = np.array(np.random.randint(1, len(sid2spk)))</span></span>
112
+ <span class="line"><span> spk = sid2spk[int(sids)]</span></span>
113
+ <span class="line"><span> print(f&quot;selected spk: {spk}&quot;)</span></span>
114
+ <span class="line"><span></span></span>
115
+ <span class="line"><span># Reference speech selection for GST</span></span>
116
+ <span class="line"><span>speech = None</span></span>
117
+ <span class="line"><span>if text2speech.use_speech:</span></span>
118
+ <span class="line"><span> # you can change here to load your own reference speech</span></span>
119
+ <span class="line"><span> # e.g.</span></span>
120
+ <span class="line"><span> # import soundfile as sf</span></span>
121
+ <span class="line"><span> # speech, fs = sf.read(&quot;/path/to/reference.wav&quot;)</span></span>
122
+ <span class="line"><span> # speech = torch.from_numpy(speech).float()</span></span>
123
+ <span class="line"><span> speech = torch.randn(50000,) * 0.01</span></span>
124
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="synthesis-1" tabindex="-1"><a class="header-anchor" href="#synthesis-1"><span>Synthesis</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import time</span></span>
125
+ <span class="line"><span>import torch</span></span>
126
+ <span class="line"><span></span></span>
127
+ <span class="line"><span># decide the input sentence by yourself</span></span>
128
+ <span class="line"><span>print(f&quot;Input your favorite sentence in {lang}.&quot;)</span></span>
129
+ <span class="line"><span>x = input()</span></span>
130
+ <span class="line"><span></span></span>
131
+ <span class="line"><span># synthesis</span></span>
132
+ <span class="line"><span>with torch.no_grad():</span></span>
133
+ <span class="line"><span> start = time.time()</span></span>
134
+ <span class="line"><span> wav = text2speech(x, speech=speech, spembs=spembs, sids=sids)[&quot;wav&quot;]</span></span>
135
+ <span class="line"><span>rtf = (time.time() - start) / (len(wav) / text2speech.fs)</span></span>
136
+ <span class="line"><span>print(f&quot;RTF = {rtf:5f}&quot;)</span></span>
137
+ <span class="line"><span></span></span>
138
+ <span class="line"><span># let us listen to generated samples</span></span>
139
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
140
+ <span class="line"><span>display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))</span></span>
141
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div>`,34);function _(q,k){const a=i("ExternalLinkIcon");return t(),p("div",null,[s("p",null,[s("a",c,[d,e(a)])]),u,v,m,s("p",null,[n("Author: Tomoki Hayashi ("),s("a",h,[n("@kan-bayashi"),e(a)]),n(")")]),b])}const f=l(r,[["render",_],["__file","espnet2_tts_realtime_demo.html.vue"]]),y=JSON.parse('{"path":"/espnet2/tts/espnet2_tts_realtime_demo.html","title":"ESPnet2-TTS realtime demonstration","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Installation","slug":"installation","link":"#installation","children":[]},{"level":2,"title":"Single speaker model demo","slug":"single-speaker-model-demo","link":"#single-speaker-model-demo","children":[{"level":3,"title":"Model Selection","slug":"model-selection","link":"#model-selection","children":[]},{"level":3,"title":"Model Setup","slug":"model-setup","link":"#model-setup","children":[]},{"level":3,"title":"Synthesis","slug":"synthesis","link":"#synthesis","children":[]}]},{"level":2,"title":"Multi-speaker Model Demo","slug":"multi-speaker-model-demo","link":"#multi-speaker-model-demo","children":[{"level":3,"title":"Model Selection","slug":"model-selection-1","link":"#model-selection-1","children":[]},{"level":3,"title":"Model Setup","slug":"model-setup-1","link":"#model-setup-1","children":[]},{"level":3,"title":"Speaker 
selection","slug":"speaker-selection","link":"#speaker-selection","children":[]},{"level":3,"title":"Synthesis","slug":"synthesis-1","link":"#synthesis-1","children":[]}]}],"git":{},"filePathRelative":"espnet2/tts/espnet2_tts_realtime_demo.md"}');export{f as comp,y as data};
assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js ADDED
The diff for this file is too large to render. See raw diff
 
assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as l,r as p,o as t,c as r,a as n,b as a,d as s,e as i}from"./app-DTS6SjJz.js";const d={},o=n("h1",{id:"espnet-speech-enhancement-demonstration",tabindex:"-1"},[n("a",{class:"header-anchor",href:"#espnet-speech-enhancement-demonstration"},[n("span",null,"ESPnet Speech Enhancement Demonstration")])],-1),c={href:"https://colab.research.google.com/drive/1fjRJCh96SoYLZPRxsjF9VDv4Q2VoIckI?usp=sharing",target:"_blank",rel:"noopener noreferrer"},u=n("img",{src:"https://colab.research.google.com/assets/colab-badge.svg",alt:"Open In Colab"},null,-1),m=n("p",null,"This notebook provides a demonstration of the speech enhancement and separation using ESPnet2-SE.",-1),v=n("ul",null,[n("li",null,"ESPnet2-SE: https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/enh1")],-1),h=n("p",null,"Presenters:",-1),b=n("ul",null,[n("li",null,"Shinji Watanabe (shinjiw@cmu.edu)"),n("li",null,"Chenda Li (lichenda1996@sjtu.edu.cn)"),n("li",null,"Jing Shi (shijing2014@ia.ac.cn)"),n("li",null,"Wangyou Zhang (wyz-97@sjtu.edu.cn)"),n("li",null,"Yen-Ju Lu (neil.lu@citi.sinica.edu.tw)")],-1),_={href:"https://github.com/LiChenda",target:"_blank",rel:"noopener noreferrer"},f={href:"https://github.com/Emrys365",target:"_blank",rel:"noopener noreferrer"},g=i(`<h1 id="contents" tabindex="-1"><a class="header-anchor" href="#contents"><span>Contents</span></a></h1><p>(1) Tutorials on the Basic Usage</p><ol><li><p>Install</p></li><li><p>Speech Enhancement with Pretrained Models</p></li></ol><blockquote><p>We support various interfaces, e.g. 
Python API, HuggingFace API, portable speech enhancement scripts for other tasks, etc.</p></blockquote><p>2.1 Single-channel Enhancement (CHiME-4)</p><p>2.2 Enhance Your Own Recordings</p><p>2.3 Multi-channel Enhancement (CHiME-4)</p><ol start="3"><li>Speech Separation with Pretrained Models</li></ol><p>3.1 Model Selection</p><p>3.2 Separate Speech Mixture</p><ol start="4"><li>Evaluate Separated Speech with the Pretrained ASR Model</li></ol><p>(2) Tutorials for Adding New Recipe and Contributing to ESPnet-SE Project</p><ol><li><p>Creating a New Recipe</p></li><li><p>Implementing a New Speech Enhancement/Separation Model</p></li></ol><h1 id="_1-tutorials-on-the-basic-usage" tabindex="-1"><a class="header-anchor" href="#_1-tutorials-on-the-basic-usage"><span>(1) Tutorials on the Basic Usage</span></a></h1><h2 id="install" tabindex="-1"><a class="header-anchor" href="#install"><span>Install</span></a></h2><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>%pip install -q espnet==0.10.1</span></span>
2
+ <span class="line"><span>%pip install -q espnet_model_zoo</span></span>
3
+ <span class="line"><span></span></span>
4
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="speech-enhancement-with-pretrained-models" tabindex="-1"><a class="header-anchor" href="#speech-enhancement-with-pretrained-models"><span>Speech Enhancement with Pretrained Models</span></a></h2><h3 id="single-channel-enhancement-the-chime-example" tabindex="-1"><a class="header-anchor" href="#single-channel-enhancement-the-chime-example"><span>Single-Channel Enhancement, the CHiME example</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Download one utterance from real noisy speech of CHiME4</span></span>
5
+ <span class="line"><span>!gdown --id 1SmrN5NFSg6JuQSs2sfy3ehD8OIcqK6wS -O /content/M05_440C0213_PED_REAL.wav</span></span>
6
+ <span class="line"><span>import os</span></span>
7
+ <span class="line"><span></span></span>
8
+ <span class="line"><span>import soundfile</span></span>
9
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
10
+ <span class="line"><span>mixwav_mc, sr = soundfile.read(&quot;/content/M05_440C0213_PED_REAL.wav&quot;)</span></span>
11
+ <span class="line"><span># mixwav.shape: num_samples, num_channels</span></span>
12
+ <span class="line"><span>mixwav_sc = mixwav_mc[:,4]</span></span>
13
+ <span class="line"><span>display(Audio(mixwav_mc.T, rate=sr))</span></span>
14
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="download-and-load-the-pretrained-conv-tasnet" tabindex="-1"><a class="header-anchor" href="#download-and-load-the-pretrained-conv-tasnet"><span>Download and load the pretrained Conv-Tasnet</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!gdown --id 17DMWdw84wF3fz3t7ia1zssdzhkpVQGZm -O /content/chime_tasnet_singlechannel.zip</span></span>
15
+ <span class="line"><span>!unzip /content/chime_tasnet_singlechannel.zip -d /content/enh_model_sc</span></span>
16
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Load the model</span></span>
17
+ <span class="line"><span># If you encounter error &quot;No module named &#39;espnet2&#39;&quot;, please re-run the 1st Cell. This might be a colab bug.</span></span>
18
+ <span class="line"><span>import sys</span></span>
19
+ <span class="line"><span>import soundfile</span></span>
20
+ <span class="line"><span>from espnet2.bin.enh_inference import SeparateSpeech</span></span>
21
+ <span class="line"><span></span></span>
22
+ <span class="line"><span></span></span>
23
+ <span class="line"><span>separate_speech = {}</span></span>
24
+ <span class="line"><span># For models downloaded from GoogleDrive, you can use the following script:</span></span>
25
+ <span class="line"><span>enh_model_sc = SeparateSpeech(</span></span>
26
+ <span class="line"><span> train_config=&quot;/content/enh_model_sc/exp/enh_train_enh_conv_tasnet_raw/config.yaml&quot;,</span></span>
27
+ <span class="line"><span> model_file=&quot;/content/enh_model_sc/exp/enh_train_enh_conv_tasnet_raw/5epoch.pth&quot;,</span></span>
28
+ <span class="line"><span> # for segment-wise process on long speech</span></span>
29
+ <span class="line"><span> normalize_segment_scale=False,</span></span>
30
+ <span class="line"><span> show_progressbar=True,</span></span>
31
+ <span class="line"><span> ref_channel=4,</span></span>
32
+ <span class="line"><span> normalize_output_wav=True,</span></span>
33
+ <span class="line"><span> device=&quot;cuda:0&quot;,</span></span>
34
+ <span class="line"><span>)</span></span>
35
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="enhance-the-single-channel-real-noisy-speech-in-chime4" tabindex="-1"><a class="header-anchor" href="#enhance-the-single-channel-real-noisy-speech-in-chime4"><span>Enhance the single-channel real noisy speech in CHiME4</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># play the enhanced single-channel speech</span></span>
36
+ <span class="line"><span>wave = enh_model_sc(mixwav_sc[None, ...], sr)</span></span>
37
+ <span class="line"><span></span></span>
38
+ <span class="line"><span>print(&quot;Input real noisy speech&quot;, flush=True)</span></span>
39
+ <span class="line"><span>display(Audio(mixwav_sc, rate=sr))</span></span>
40
+ <span class="line"><span>print(&quot;Enhanced speech&quot;, flush=True)</span></span>
41
+ <span class="line"><span>display(Audio(wave[0].squeeze(), rate=sr))</span></span>
42
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="enhance-your-own-pre-recordings" tabindex="-1"><a class="header-anchor" href="#enhance-your-own-pre-recordings"><span>Enhance your own pre-recordings</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from google.colab import files</span></span>
43
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
44
+ <span class="line"><span>import soundfile</span></span>
45
+ <span class="line"><span></span></span>
46
+ <span class="line"><span>uploaded = files.upload()</span></span>
47
+ <span class="line"><span></span></span>
48
+ <span class="line"><span>for file_name in uploaded.keys():</span></span>
49
+ <span class="line"><span> speech, rate = soundfile.read(file_name)</span></span>
50
+ <span class="line"><span> assert rate == sr, &quot;mismatch in sampling rate&quot;</span></span>
51
+ <span class="line"><span> wave = enh_model_sc(speech[None, ...], sr)</span></span>
52
+ <span class="line"><span> print(f&quot;Your input speech {file_name}&quot;, flush=True)</span></span>
53
+ <span class="line"><span> display(Audio(speech, rate=sr))</span></span>
54
+ <span class="line"><span> print(f&quot;Enhanced speech for {file_name}&quot;, flush=True)</span></span>
55
+ <span class="line"><span> display(Audio(wave[0].squeeze(), rate=sr))</span></span>
56
+ <span class="line"><span></span></span>
57
+ <span class="line"><span></span></span>
58
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="multi-channel-enhancement" tabindex="-1"><a class="header-anchor" href="#multi-channel-enhancement"><span>Multi-Channel Enhancement</span></a></h3><h4 id="download-and-load-the-pretrained-mvdr-neural-beamformer" tabindex="-1"><a class="header-anchor" href="#download-and-load-the-pretrained-mvdr-neural-beamformer"><span>Download and load the pretrained mvdr neural beamformer.</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Download the pretained enhancement model</span></span>
59
+ <span class="line"><span></span></span>
60
+ <span class="line"><span>!gdown --id 1FohDfBlOa7ipc9v2luY-QIFQ_GJ1iW_i -O /content/mvdr_beamformer_16k_se_raw_valid.zip</span></span>
61
+ <span class="line"><span>!unzip /content/mvdr_beamformer_16k_se_raw_valid.zip -d /content/enh_model_mc </span></span>
62
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Load the model</span></span>
63
+ <span class="line"><span># If you encounter error &quot;No module named &#39;espnet2&#39;&quot;, please re-run the 1st Cell. This might be a colab bug.</span></span>
64
+ <span class="line"><span>import sys</span></span>
65
+ <span class="line"><span>import soundfile</span></span>
66
+ <span class="line"><span>from espnet2.bin.enh_inference import SeparateSpeech</span></span>
67
+ <span class="line"><span></span></span>
68
+ <span class="line"><span></span></span>
69
+ <span class="line"><span>separate_speech = {}</span></span>
70
+ <span class="line"><span># For models downloaded from GoogleDrive, you can use the following script:</span></span>
71
+ <span class="line"><span>enh_model_mc = SeparateSpeech(</span></span>
72
+ <span class="line"><span> train_config=&quot;/content/enh_model_mc/exp/enh_train_enh_beamformer_mvdr_raw/config.yaml&quot;,</span></span>
73
+ <span class="line"><span> model_file=&quot;/content/enh_model_mc/exp/enh_train_enh_beamformer_mvdr_raw/11epoch.pth&quot;,</span></span>
74
+ <span class="line"><span> # for segment-wise process on long speech</span></span>
75
+ <span class="line"><span> normalize_segment_scale=False,</span></span>
76
+ <span class="line"><span> show_progressbar=True,</span></span>
77
+ <span class="line"><span> ref_channel=4,</span></span>
78
+ <span class="line"><span> normalize_output_wav=True,</span></span>
79
+ <span class="line"><span> device=&quot;cuda:0&quot;,</span></span>
80
+ <span class="line"><span>)</span></span>
81
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="enhance-the-multi-channel-real-noisy-speech-in-chime4" tabindex="-1"><a class="header-anchor" href="#enhance-the-multi-channel-real-noisy-speech-in-chime4"><span>Enhance the multi-channel real noisy speech in CHiME4</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>wave = enh_model_mc(mixwav_mc[None, ...], sr)</span></span>
82
+ <span class="line"><span>print(&quot;Input real noisy speech&quot;, flush=True)</span></span>
83
+ <span class="line"><span>display(Audio(mixwav_mc.T, rate=sr))</span></span>
84
+ <span class="line"><span>print(&quot;Enhanced speech&quot;, flush=True)</span></span>
85
+ <span class="line"><span>display(Audio(wave[0].squeeze(), rate=sr))</span></span>
86
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="portable-speech-enhancement-scripts-for-other-tasks" tabindex="-1"><a class="header-anchor" href="#portable-speech-enhancement-scripts-for-other-tasks"><span>Portable speech enhancement scripts for other tasks</span></a></h4><p>For an ESPNet ASR or TTS dataset like below:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>data</span></span>
87
+ <span class="line"><span>\`-- et05_real_isolated_6ch_track</span></span>
88
+ <span class="line"><span> |-- spk2utt</span></span>
89
+ <span class="line"><span> |-- text</span></span>
90
+ <span class="line"><span> |-- utt2spk</span></span>
91
+ <span class="line"><span> |-- utt2uniq</span></span>
92
+ <span class="line"><span> \`-- wav.scp</span></span>
93
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Run the following scripts to create an enhanced dataset:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>scripts/utils/enhance_dataset.sh \\</span></span>
94
+ <span class="line"><span> --spk_num 1 \\</span></span>
95
+ <span class="line"><span> --gpu_inference true \\</span></span>
96
+ <span class="line"><span> --inference_nj 4 \\</span></span>
97
+ <span class="line"><span> --fs 16k \\</span></span>
98
+ <span class="line"><span> --id_prefix &quot;&quot; \\</span></span>
99
+ <span class="line"><span> dump/raw/et05_real_isolated_6ch_track \\</span></span>
100
+ <span class="line"><span> data/et05_real_isolated_6ch_track_enh \\</span></span>
101
+ <span class="line"><span> exp/enh_train_enh_beamformer_mvdr_raw/valid.loss.best.pth</span></span>
102
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>The above script will generate a new directory data/et05_real_isolated_6ch_track_enh:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>data</span></span>
103
+ <span class="line"><span>\`-- et05_real_isolated_6ch_track_enh</span></span>
104
+ <span class="line"><span> |-- spk2utt</span></span>
105
+ <span class="line"><span> |-- text</span></span>
106
+ <span class="line"><span> |-- utt2spk</span></span>
107
+ <span class="line"><span> |-- utt2uniq</span></span>
108
+ <span class="line"><span> |-- wav.scp</span></span>
109
+ <span class="line"><span> \`-- wavs/</span></span>
110
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>where wav.scp contains paths to the enhanced audios (stored in wavs/).</p><h2 id="speech-separation" tabindex="-1"><a class="header-anchor" href="#speech-separation"><span>Speech Separation</span></a></h2><h3 id="model-selection" tabindex="-1"><a class="header-anchor" href="#model-selection"><span>Model Selection</span></a></h3><p>In this demonstration, we will show different speech separation models on wsj0_2mix.</p>`,43),x={href:"https://zenodo.org/",target:"_blank",rel:"noopener noreferrer"},w={href:"https://huggingface.co/",target:"_blank",rel:"noopener noreferrer"},q=i(`<div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>#@title Choose Speech Separation model { run: &quot;auto&quot; }</span></span>
111
+ <span class="line"><span></span></span>
112
+ <span class="line"><span>fs = 8000 #@param {type:&quot;integer&quot;}</span></span>
113
+ <span class="line"><span>tag = &quot;espnet/Chenda_Li_wsj0_2mix_enh_train_enh_conv_tasnet_raw_valid.si_snr.ave&quot; #@param [&quot;Chenda Li/wsj0_2mix_enh_train_enh_conv_tasnet_raw_valid.si_snr.ave&quot;, &quot;Chenda Li/wsj0_2mix_enh_train_enh_rnn_tf_raw_valid.si_snr.ave&quot;, &quot;https://zenodo.org/record/4688000/files/enh_train_enh_dprnn_tasnet_raw_valid.si_snr.ave.zip&quot;, &quot;espnet/Chenda_Li_wsj0_2mix_enh_train_enh_conv_tasnet_raw_valid.si_snr.ave&quot;]</span></span>
114
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># For models uploaded to Zenodo, you can use the following python script instead:</span></span>
115
+ <span class="line"><span>import sys</span></span>
116
+ <span class="line"><span>import soundfile</span></span>
117
+ <span class="line"><span>from espnet_model_zoo.downloader import ModelDownloader</span></span>
118
+ <span class="line"><span>from espnet2.bin.enh_inference import SeparateSpeech</span></span>
119
+ <span class="line"><span></span></span>
120
+ <span class="line"><span>d = ModelDownloader()</span></span>
121
+ <span class="line"><span></span></span>
122
+ <span class="line"><span>cfg = d.download_and_unpack(tag)</span></span>
123
+ <span class="line"><span>separate_speech = SeparateSpeech(</span></span>
124
+ <span class="line"><span> enh_train_config=cfg[&quot;train_config&quot;],</span></span>
125
+ <span class="line"><span> enh_model_file=cfg[&quot;model_file&quot;],</span></span>
126
+ <span class="line"><span> # for segment-wise process on long speech</span></span>
127
+ <span class="line"><span> segment_size=2.4,</span></span>
128
+ <span class="line"><span> hop_size=0.8,</span></span>
129
+ <span class="line"><span> normalize_segment_scale=False,</span></span>
130
+ <span class="line"><span> show_progressbar=True,</span></span>
131
+ <span class="line"><span> ref_channel=None,</span></span>
132
+ <span class="line"><span> normalize_output_wav=True,</span></span>
133
+ <span class="line"><span> device=&quot;cuda:0&quot;,</span></span>
134
+ <span class="line"><span>)</span></span>
135
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="separate-speech-mixture" tabindex="-1"><a class="header-anchor" href="#separate-speech-mixture"><span>Separate Speech Mixture</span></a></h3><h4 id="separate-the-example-in-wsj0-2mix-testing-set" tabindex="-1"><a class="header-anchor" href="#separate-the-example-in-wsj0-2mix-testing-set"><span>Separate the example in wsj0_2mix testing set</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!gdown --id 1ZCUkd_Lb7pO2rpPr4FqYdtJBZ7JMiInx -O /content/447c020t_1.2106_422a0112_-1.2106.wav</span></span>
136
+ <span class="line"><span></span></span>
137
+ <span class="line"><span>import os</span></span>
138
+ <span class="line"><span>import soundfile</span></span>
139
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
140
+ <span class="line"><span></span></span>
141
+ <span class="line"><span>mixwav, sr = soundfile.read(&quot;447c020t_1.2106_422a0112_-1.2106.wav&quot;)</span></span>
142
+ <span class="line"><span>waves_wsj = separate_speech(mixwav[None, ...], fs=sr)</span></span>
143
+ <span class="line"><span></span></span>
144
+ <span class="line"><span>print(&quot;Input mixture&quot;, flush=True)</span></span>
145
+ <span class="line"><span>display(Audio(mixwav, rate=sr))</span></span>
146
+ <span class="line"><span>print(f&quot;========= Separated speech with model {tag} =========&quot;, flush=True)</span></span>
147
+ <span class="line"><span>print(&quot;Separated spk1&quot;, flush=True)</span></span>
148
+ <span class="line"><span>display(Audio(waves_wsj[0].squeeze(), rate=sr))</span></span>
149
+ <span class="line"><span>print(&quot;Separated spk2&quot;, flush=True)</span></span>
150
+ <span class="line"><span>display(Audio(waves_wsj[1].squeeze(), rate=sr))</span></span>
151
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="separate-your-own-recordings" tabindex="-1"><a class="header-anchor" href="#separate-your-own-recordings"><span>Separate your own recordings</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from google.colab import files</span></span>
152
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
153
+ <span class="line"><span>import soundfile</span></span>
154
+ <span class="line"><span></span></span>
155
+ <span class="line"><span>uploaded = files.upload()</span></span>
156
+ <span class="line"><span></span></span>
157
+ <span class="line"><span>for file_name in uploaded.keys():</span></span>
158
+ <span class="line"><span> mixwav_yours, rate = soundfile.read(file_name)</span></span>
159
+ <span class="line"><span> assert rate == sr, &quot;mismatch in sampling rate&quot;</span></span>
160
+ <span class="line"><span> waves_yours = separate_speech(mixwav_yours[None, ...], fs=sr)</span></span>
161
+ <span class="line"><span> print(&quot;Input mixture&quot;, flush=True)</span></span>
162
+ <span class="line"><span> display(Audio(mixwav_yours, rate=sr))</span></span>
163
+ <span class="line"><span> print(f&quot;========= Separated speech with model {tag} =========&quot;, flush=True)</span></span>
164
+ <span class="line"><span> print(&quot;Separated spk1&quot;, flush=True)</span></span>
165
+ <span class="line"><span> display(Audio(waves_yours[0].squeeze(), rate=sr))</span></span>
166
+ <span class="line"><span> print(&quot;Separated spk2&quot;, flush=True)</span></span>
167
+ <span class="line"><span> display(Audio(waves_yours[1].squeeze(), rate=sr))</span></span>
168
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="show-spectrums-of-separated-speech" tabindex="-1"><a class="header-anchor" href="#show-spectrums-of-separated-speech"><span>Show spectrums of separated speech</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import matplotlib.pyplot as plt</span></span>
169
+ <span class="line"><span>import torch</span></span>
170
+ <span class="line"><span>from torch_complex.tensor import ComplexTensor</span></span>
171
+ <span class="line"><span></span></span>
172
+ <span class="line"><span>from espnet.asr.asr_utils import plot_spectrogram</span></span>
173
+ <span class="line"><span>from espnet2.layers.stft import Stft</span></span>
174
+ <span class="line"><span></span></span>
175
+ <span class="line"><span></span></span>
176
+ <span class="line"><span>stft = Stft(</span></span>
177
+ <span class="line"><span> n_fft=512,</span></span>
178
+ <span class="line"><span> win_length=None,</span></span>
179
+ <span class="line"><span> hop_length=128,</span></span>
180
+ <span class="line"><span> window=&quot;hann&quot;,</span></span>
181
+ <span class="line"><span>)</span></span>
182
+ <span class="line"><span>ilens = torch.LongTensor([len(mixwav)])</span></span>
183
+ <span class="line"><span># specs: (T, F)</span></span>
184
+ <span class="line"><span>spec_mix = ComplexTensor(</span></span>
185
+ <span class="line"><span> *torch.unbind(</span></span>
186
+ <span class="line"><span> stft(torch.as_tensor(mixwav).unsqueeze(0), ilens)[0].squeeze(),</span></span>
187
+ <span class="line"><span> dim=-1</span></span>
188
+ <span class="line"><span> )</span></span>
189
+ <span class="line"><span>)</span></span>
190
+ <span class="line"><span>spec_sep1 = ComplexTensor(</span></span>
191
+ <span class="line"><span> *torch.unbind(</span></span>
192
+ <span class="line"><span> stft(torch.as_tensor(waves_wsj[0]), ilens)[0].squeeze(),</span></span>
193
+ <span class="line"><span> dim=-1</span></span>
194
+ <span class="line"><span> )</span></span>
195
+ <span class="line"><span>)</span></span>
196
+ <span class="line"><span>spec_sep2 = ComplexTensor(</span></span>
197
+ <span class="line"><span> *torch.unbind(</span></span>
198
+ <span class="line"><span> stft(torch.as_tensor(waves_wsj[1]), ilens)[0].squeeze(),</span></span>
199
+ <span class="line"><span> dim=-1</span></span>
200
+ <span class="line"><span> )</span></span>
201
+ <span class="line"><span>)</span></span>
202
+ <span class="line"><span></span></span>
203
+ <span class="line"><span># freqs = torch.linspace(0, sr / 2, spec_mix.shape[1])</span></span>
204
+ <span class="line"><span># frames = torch.linspace(0, len(mixwav) / sr, spec_mix.shape[0])</span></span>
205
+ <span class="line"><span>samples = torch.linspace(0, len(mixwav) / sr, len(mixwav))</span></span>
206
+ <span class="line"><span>plt.figure(figsize=(24, 12))</span></span>
207
+ <span class="line"><span>plt.subplot(3, 2, 1)</span></span>
208
+ <span class="line"><span>plt.title(&#39;Mixture Spectrogram&#39;)</span></span>
209
+ <span class="line"><span>plot_spectrogram(</span></span>
210
+ <span class="line"><span> plt, abs(spec_mix).transpose(-1, -2).numpy(), fs=sr,</span></span>
211
+ <span class="line"><span> mode=&#39;db&#39;, frame_shift=None,</span></span>
212
+ <span class="line"><span> bottom=False, labelbottom=False</span></span>
213
+ <span class="line"><span>)</span></span>
214
+ <span class="line"><span>plt.subplot(3, 2, 2)</span></span>
215
+ <span class="line"><span>plt.title(&#39;Mixture Wavform&#39;)</span></span>
216
+ <span class="line"><span>plt.plot(samples, mixwav)</span></span>
217
+ <span class="line"><span>plt.xlim(0, len(mixwav) / sr)</span></span>
218
+ <span class="line"><span></span></span>
219
+ <span class="line"><span>plt.subplot(3, 2, 3)</span></span>
220
+ <span class="line"><span>plt.title(&#39;Separated Spectrogram (spk1)&#39;)</span></span>
221
+ <span class="line"><span>plot_spectrogram(</span></span>
222
+ <span class="line"><span> plt, abs(spec_sep1).transpose(-1, -2).numpy(), fs=sr,</span></span>
223
+ <span class="line"><span> mode=&#39;db&#39;, frame_shift=None,</span></span>
224
+ <span class="line"><span> bottom=False, labelbottom=False</span></span>
225
+ <span class="line"><span>)</span></span>
226
+ <span class="line"><span>plt.subplot(3, 2, 4)</span></span>
227
+ <span class="line"><span>plt.title(&#39;Separated Wavform (spk1)&#39;)</span></span>
228
+ <span class="line"><span>plt.plot(samples, waves_wsj[0].squeeze())</span></span>
229
+ <span class="line"><span>plt.xlim(0, len(mixwav) / sr)</span></span>
230
+ <span class="line"><span></span></span>
231
+ <span class="line"><span>plt.subplot(3, 2, 5)</span></span>
232
+ <span class="line"><span>plt.title(&#39;Separated Spectrogram (spk2)&#39;)</span></span>
233
+ <span class="line"><span>plot_spectrogram(</span></span>
234
+ <span class="line"><span> plt, abs(spec_sep2).transpose(-1, -2).numpy(), fs=sr,</span></span>
235
+ <span class="line"><span> mode=&#39;db&#39;, frame_shift=None,</span></span>
236
+ <span class="line"><span> bottom=False, labelbottom=False</span></span>
237
+ <span class="line"><span>)</span></span>
238
+ <span class="line"><span>plt.subplot(3, 2, 6)</span></span>
239
+ <span class="line"><span>plt.title(&#39;Separated Wavform (spk2)&#39;)</span></span>
240
+ <span class="line"><span>plt.plot(samples, waves_wsj[1].squeeze())</span></span>
241
+ <span class="line"><span>plt.xlim(0, len(mixwav) / sr)</span></span>
242
+ <span class="line"><span>plt.xlabel(&quot;Time (s)&quot;)</span></span>
243
+ <span class="line"><span>plt.show()</span></span>
244
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div 
class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="evluate-separated-speech-with-pretrained-asr-model" tabindex="-1"><a class="header-anchor" href="#evluate-separated-speech-with-pretrained-asr-model"><span>Evluate separated speech with pretrained ASR model</span></a></h2><p>The ground truths are:</p><p><code>text_1: SOME CRITICS INCLUDING HIGH REAGAN ADMINISTRATION OFFICIALS ARE RAISING THE ALARM THAT THE FED&#39;S POLICY IS TOO TIGHT AND COULD CAUSE A RECESSION NEXT YEAR</code></p><p><code>text_2: THE UNITED STATES UNDERTOOK TO DEFEND WESTERN EUROPE AGAINST SOVIET ATTACK</code></p><p>(This may take a while for the speech recognition.)</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>%pip install -q https://github.com/kpu/kenlm/archive/master.zip # ASR need kenlm</span></span>
245
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import espnet_model_zoo</span></span>
246
+ <span class="line"><span>from espnet_model_zoo.downloader import ModelDownloader</span></span>
247
+ <span class="line"><span>from espnet2.bin.asr_inference import Speech2Text</span></span>
248
+ <span class="line"><span></span></span>
249
+ <span class="line"><span>wsj_8k_model_url=&quot;https://zenodo.org/record/4012264/files/asr_train_asr_transformer_raw_char_1gpu_valid.acc.ave.zip?download=1&quot;</span></span>
250
+ <span class="line"><span></span></span>
251
+ <span class="line"><span>d = ModelDownloader()</span></span>
252
+ <span class="line"><span>speech2text = Speech2Text(</span></span>
253
+ <span class="line"><span> **d.download_and_unpack(wsj_8k_model_url),</span></span>
254
+ <span class="line"><span> device=&quot;cuda:0&quot;,</span></span>
255
+ <span class="line"><span>)</span></span>
256
+ <span class="line"><span></span></span>
257
+ <span class="line"><span>text_est = [None, None]</span></span>
258
+ <span class="line"><span>text_est[0], *_ = speech2text(waves_wsj[0].squeeze())[0]</span></span>
259
+ <span class="line"><span>text_est[1], *_ = speech2text(waves_wsj[1].squeeze())[0]</span></span>
260
+ <span class="line"><span>text_m, *_ = speech2text(mixwav)[0]</span></span>
261
+ <span class="line"><span>print(&quot;Mix Speech to Text: &quot;, text_m)</span></span>
262
+ <span class="line"><span>print(&quot;Separated Speech 1 to Text: &quot;, text_est[0])</span></span>
263
+ <span class="line"><span>print(&quot;Separated Speech 2 to Text: &quot;, text_est[1])</span></span>
264
+ <span class="line"><span></span></span>
265
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import difflib</span></span>
266
+ <span class="line"><span>from itertools import permutations</span></span>
267
+ <span class="line"><span></span></span>
268
+ <span class="line"><span>import editdistance</span></span>
269
+ <span class="line"><span>import numpy as np</span></span>
270
+ <span class="line"><span></span></span>
271
+ <span class="line"><span>colors = dict(</span></span>
272
+ <span class="line"><span> red=lambda text: f&quot;\\033[38;2;255;0;0m{text}\\033[0m&quot; if text else &quot;&quot;,</span></span>
273
+ <span class="line"><span> green=lambda text: f&quot;\\033[38;2;0;255;0m{text}\\033[0m&quot; if text else &quot;&quot;,</span></span>
274
+ <span class="line"><span> yellow=lambda text: f&quot;\\033[38;2;225;225;0m{text}\\033[0m&quot; if text else &quot;&quot;,</span></span>
275
+ <span class="line"><span> white=lambda text: f&quot;\\033[38;2;255;255;255m{text}\\033[0m&quot; if text else &quot;&quot;,</span></span>
276
+ <span class="line"><span> black=lambda text: f&quot;\\033[38;2;0;0;0m{text}\\033[0m&quot; if text else &quot;&quot;,</span></span>
277
+ <span class="line"><span>)</span></span>
278
+ <span class="line"><span></span></span>
279
+ <span class="line"><span>def diff_strings(ref, est):</span></span>
280
+ <span class="line"><span> &quot;&quot;&quot;Reference: https://stackoverflow.com/a/64404008/7384873&quot;&quot;&quot;</span></span>
281
+ <span class="line"><span> ref_str, est_str, err_str = [], [], []</span></span>
282
+ <span class="line"><span> matcher = difflib.SequenceMatcher(None, ref, est)</span></span>
283
+ <span class="line"><span> for opcode, a0, a1, b0, b1 in matcher.get_opcodes():</span></span>
284
+ <span class="line"><span> if opcode == &quot;equal&quot;:</span></span>
285
+ <span class="line"><span> txt = ref[a0:a1]</span></span>
286
+ <span class="line"><span> ref_str.append(txt)</span></span>
287
+ <span class="line"><span> est_str.append(txt)</span></span>
288
+ <span class="line"><span> err_str.append(&quot; &quot; * (a1 - a0))</span></span>
289
+ <span class="line"><span> elif opcode == &quot;insert&quot;:</span></span>
290
+ <span class="line"><span> ref_str.append(&quot;*&quot; * (b1 - b0))</span></span>
291
+ <span class="line"><span> est_str.append(colors[&quot;green&quot;](est[b0:b1]))</span></span>
292
+ <span class="line"><span> err_str.append(colors[&quot;black&quot;](&quot;I&quot; * (b1 - b0)))</span></span>
293
+ <span class="line"><span> elif opcode == &quot;delete&quot;:</span></span>
294
+ <span class="line"><span> ref_str.append(ref[a0:a1])</span></span>
295
+ <span class="line"><span> est_str.append(colors[&quot;red&quot;](&quot;*&quot; * (a1 - a0)))</span></span>
296
+ <span class="line"><span> err_str.append(colors[&quot;black&quot;](&quot;D&quot; * (a1 - a0)))</span></span>
297
+ <span class="line"><span> elif opcode == &quot;replace&quot;:</span></span>
298
+ <span class="line"><span> diff = a1 - a0 - b1 + b0</span></span>
299
+ <span class="line"><span> if diff &gt;= 0:</span></span>
300
+ <span class="line"><span> txt_ref = ref[a0:a1]</span></span>
301
+ <span class="line"><span> txt_est = colors[&quot;yellow&quot;](est[b0:b1]) + colors[&quot;red&quot;](&quot;*&quot; * diff)</span></span>
302
+ <span class="line"><span> txt_err = &quot;S&quot; * (b1 - b0) + &quot;D&quot; * diff</span></span>
303
+ <span class="line"><span> elif diff &lt; 0:</span></span>
304
+ <span class="line"><span> txt_ref = ref[a0:a1] + &quot;*&quot; * -diff</span></span>
305
+ <span class="line"><span> txt_est = colors[&quot;yellow&quot;](est[b0:b1]) + colors[&quot;green&quot;](&quot;*&quot; * -diff)</span></span>
306
+ <span class="line"><span> txt_err = &quot;S&quot; * (b1 - b0) + &quot;I&quot; * -diff</span></span>
307
+ <span class="line"><span></span></span>
308
+ <span class="line"><span> ref_str.append(txt_ref)</span></span>
309
+ <span class="line"><span> est_str.append(txt_est)</span></span>
310
+ <span class="line"><span> err_str.append(colors[&quot;black&quot;](txt_err))</span></span>
311
+ <span class="line"><span> return &quot;&quot;.join(ref_str), &quot;&quot;.join(est_str), &quot;&quot;.join(err_str)</span></span>
312
+ <span class="line"><span></span></span>
313
+ <span class="line"><span></span></span>
314
+ <span class="line"><span>text_ref = [</span></span>
315
+ <span class="line"><span> &quot;SOME CRITICS INCLUDING HIGH REAGAN ADMINISTRATION OFFICIALS ARE RAISING THE ALARM THAT THE FED&#39;S POLICY IS TOO TIGHT AND COULD CAUSE A RECESSION NEXT YEAR&quot;,</span></span>
316
+ <span class="line"><span> &quot;THE UNITED STATES UNDERTOOK TO DEFEND WESTERN EUROPE AGAINST SOVIET ATTACK&quot;,</span></span>
317
+ <span class="line"><span>]</span></span>
318
+ <span class="line"><span></span></span>
319
+ <span class="line"><span>print(&quot;=====================&quot; , flush=True)</span></span>
320
+ <span class="line"><span>perms = list(permutations(range(2)))</span></span>
321
+ <span class="line"><span>string_edit = [</span></span>
322
+ <span class="line"><span> [</span></span>
323
+ <span class="line"><span> editdistance.eval(text_ref[m], text_est[n])</span></span>
324
+ <span class="line"><span> for m, n in enumerate(p)</span></span>
325
+ <span class="line"><span> ]</span></span>
326
+ <span class="line"><span> for p in perms</span></span>
327
+ <span class="line"><span>]</span></span>
328
+ <span class="line"><span></span></span>
329
+ <span class="line"><span>dist = [sum(edist) for edist in string_edit]</span></span>
330
+ <span class="line"><span>perm_idx = np.argmin(dist)</span></span>
331
+ <span class="line"><span>perm = perms[perm_idx]</span></span>
332
+ <span class="line"><span></span></span>
333
+ <span class="line"><span>for i, p in enumerate(perm):</span></span>
334
+ <span class="line"><span> print(&quot;\\n--------------- Text %d ---------------&quot; % (i + 1), flush=True)</span></span>
335
+ <span class="line"><span> ref, est, err = diff_strings(text_ref[i], text_est[p])</span></span>
336
+ <span class="line"><span> print(&quot;REF: &quot; + ref + &quot;\\n&quot; + &quot;HYP: &quot; + est + &quot;\\n&quot; + &quot;ERR: &quot; + err, flush=True)</span></span>
337
+ <span class="line"><span> print(&quot;Edit Distance = {}\\n&quot;.format(string_edit[perm_idx][i]), flush=True)</span></span>
338
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div 
class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h1 id="_2-tutorials-on-contributing-to-espnet-se-project" tabindex="-1"><a class="header-anchor" href="#_2-tutorials-on-contributing-to-espnet-se-project"><span>(2) Tutorials on Contributing to ESPNet-SE Project</span></a></h1><p>If you would like to contribute to the ESPnet-SE project, or if you would like to make modifications based on the current speech enhancement/separation functionality, the following tutorials will provide you detailed information about how to creating new recipes or new models in ESPnet-SE.</p><h2 id="creating-a-new-recipe" tabindex="-1"><a class="header-anchor" href="#creating-a-new-recipe"><span>Creating a New Recipe</span></a></h2><h3 id="step-1-create-recipe-directory" tabindex="-1"><a class="header-anchor" href="#step-1-create-recipe-directory"><span>Step 1 Create recipe directory</span></a></h3><p>First, run the following command to create the directory for the new recipe from our template:</p><div class="language-bash line-numbers-mode" data-ext="sh" data-title="sh"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#DCDCAA;">egs2/TEMPLATE/enh1/setup.sh</span><span style="color:#CE9178;"> egs2/</span><span style="color:#D4D4D4;">&lt;</span><span style="color:#CE9178;">your-recipe-nam</span><span style="color:#D4D4D4;">e&gt;</span><span style="color:#CE9178;">/enh1</span></span>
339
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><blockquote><p>For the following steps, we assume the operations are done under the directory <code>egs2/&lt;your-recipe-name&gt;/enh1/</code>.</p></blockquote><h3 id="step-2-write-scripts-for-data-preparation" tabindex="-1"><a class="header-anchor" href="#step-2-write-scripts-for-data-preparation"><span>Step 2 Write scripts for data preparation</span></a></h3>`,25),y=n("code",null,"local/data.sh",-1),k=n("code",null,"enh.sh",-1),E={href:"https://github.com/espnet/espnet/blob/master/egs2/wsj0_2mix/enh1/local/data.sh",target:"_blank",rel:"noopener noreferrer"},S=i(`<p>The script <code>local/data.sh</code> should finally generate Kaldi-style data directories under <code>&lt;recipe-dir&gt;/data/</code>. Each subset directory should contains at least 4 files:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>&lt;recipe-dir&gt;/data/&lt;subset-name&gt;/</span></span>
340
+ <span class="line"><span>├── spk{1,2,3...}.scp (clean speech references)</span></span>
341
+ <span class="line"><span>├── spk2utt</span></span>
342
+ <span class="line"><span>├── utt2spk</span></span>
343
+ <span class="line"><span>└── wav.scp (noisy speech)</span></span>
344
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Optionally, it can also contain <code>noise{}.scp</code> and <code>dereverb{}.scp</code>, which point to the corresponding noise and dereverberated references respectively. {} can be 1, 2, ..., depending on the number of noise types (dereverberated signals) in the input signal in <code>wav.scp</code>.</p><p>Make sure to sort the scp and other related files as in Kaldi. Also, remember to run <code>. ./path.sh</code> in <code>local/data.sh</code> before sorting, because it will force sorting to be byte-wise, i.e. <code>export LC_ALL=C</code>.</p>`,4),D={href:"https://www.shellcheck.net/",target:"_blank",rel:"noopener noreferrer"},T={href:"https://github.com/espnet/espnet/blob/master/ci/test_shell.sh",target:"_blank",rel:"noopener noreferrer"},C=n("h3",{id:"step-3-prepare-training-configuration",tabindex:"-1"},[n("a",{class:"header-anchor",href:"#step-3-prepare-training-configuration"},[n("span",null,"Step 3 Prepare training configuration")])],-1),A={href:"https://github.com/espnet/espnet/blob/master/egs2/wsj0_2mix/enh1/conf/tuning/train_enh_rnn_tf.yaml",target:"_blank",rel:"noopener noreferrer"},I=n("code",null,"conf/",-1),N=n("blockquote",null,[n("p",null,[s("If you have multiple configuration files, it is recommended to put them under "),n("code",null,"conf/tuning/"),s(", and create a symbolic link "),n("code",null,"conf/tuning/train.yaml"),s(" pointing to the config file with the best performance.")])],-1),R=n("h3",{id:"step-4-prepare-run-sh",tabindex:"-1"},[n("a",{class:"header-anchor",href:"#step-4-prepare-run-sh"},[n("span",null,"Step 4 Prepare 
run.sh")])],-1),z=n("code",null,"run.sh",-1),M=n("code",null,"./run.sh",-1),P={href:"https://github.com/espnet/espnet/blob/master/egs2/wsj0_2mix/enh1/run.sh",target:"_blank",rel:"noopener noreferrer"},F=n("blockquote",null,[n("p",null,[s("If your recipes provide references for noise and/or dereverberation, you can add the argument "),n("code",null,"--use_noise_ref true"),s(" and/or "),n("code",null,"--use_dereverb_ref true"),s(" in "),n("code",null,"run.sh"),s(".")])],-1),L=n("h2",{id:"implementing-a-new-speech-enhancement-separation-model",tabindex:"-1"},[n("a",{class:"header-anchor",href:"#implementing-a-new-speech-enhancement-separation-model"},[n("span",null,"Implementing a New Speech Enhancement/Separation Model")])],-1),j=n("p",null,"The current ESPnet-SE tool adopts an encoder-separator-decoder architecture for all models, e.g.",-1),O={href:"https://github.com/espnet/espnet/blob/master/espnet2/enh/encoder/stft_encoder.py",target:"_blank",rel:"noopener noreferrer"},H={href:"https://github.com/espnet/espnet/blob/master/espnet2/enh/decoder/stft_decoder.py",target:"_blank",rel:"noopener noreferrer"},U={href:"https://github.com/espnet/espnet/blob/master/espnet2/enh/separator/dprnn_separator.py",target:"_blank",rel:"noopener noreferrer"},G={href:"https://github.com/espnet/espnet/blob/master/espnet2/enh/separator/rnn_separator.py",target:"_blank",rel:"noopener noreferrer"},W={href:"https://github.com/espnet/espnet/blob/master/espnet2/enh/separator/tcn_separator.py",target:"_blank",rel:"noopener noreferrer"},Y={href:"https://github.com/espnet/espnet/blob/master/espnet2/enh/separator/transformer_separator.py",target:"_blank",rel:"noopener noreferrer"},B={href:"https://github.com/espnet/espnet/blob/master/espnet2/enh/encoder/conv_encoder.py",target:"_blank",rel:"noopener noreferrer"},V={href:"https://github.com/espnet/espnet/blob/master/espnet2/enh/decoder/conv_decoder.py",target:"_blank",rel:"noopener 
noreferrer"},J={href:"https://github.com/espnet/espnet/blob/master/espnet2/enh/separator/tcn_separator.py",target:"_blank",rel:"noopener noreferrer"},K=n("h3",{id:"step-1-create-model-scripts",tabindex:"-1"},[n("a",{class:"header-anchor",href:"#step-1-create-model-scripts"},[n("span",null,"Step 1 Create model scripts")])],-1),Z={href:"https://github.com/espnet/espnet/tree/master/espnet2/enh/encoder",target:"_blank",rel:"noopener noreferrer"},Q={href:"https://github.com/espnet/espnet/tree/master/espnet2/enh/separator",target:"_blank",rel:"noopener noreferrer"},X={href:"https://github.com/espnet/espnet/tree/master/espnet2/enh/decoder",target:"_blank",rel:"noopener noreferrer"},$=n("code",null,"num_spk",-1),nn={href:"https://github.com/espnet/espnet/blob/master/espnet2/enh/separator/rnn_separator.py",target:"_blank",rel:"noopener noreferrer"},sn=n("code",null,"black",-1),en=n("code",null,"flake8",-1),an={href:"https://github.com/espnet/espnet/blob/master/ci/test_python.sh",target:"_blank",rel:"noopener noreferrer"},ln=n("h3",{id:"step-2-add-the-new-model-to-related-scripts",tabindex:"-1"},[n("a",{class:"header-anchor",href:"#step-2-add-the-new-model-to-related-scripts"},[n("span",null,"Step 2 Add the new model to related scripts")])],-1),pn={href:"https://github.com/espnet/espnet/blob/master/espnet2/tasks/enh.py#L37-L62",target:"_blank",rel:"noopener noreferrer"},tn=n("code",null,"ClassChoices",-1),rn=i('<ul><li>For encoders, add <code>&lt;key&gt;=&lt;your-model&gt;</code> to <code>encoder_choices</code>.</li><li>For decoders, add <code>&lt;key&gt;=&lt;your-model&gt;</code> to <code>decoder_choices</code>.</li><li>For separators, add <code>&lt;key&gt;=&lt;your-model&gt;</code> to <code>separator_choices</code>.</li></ul><h3 id="step-3-optional-create-new-loss-functions" tabindex="-1"><a class="header-anchor" href="#step-3-optional-create-new-loss-functions"><span>Step 3 [Optional] Create new loss 
functions</span></a></h3>',2),dn={href:"https://github.com/espnet/espnet/blob/master/espnet2/enh/espnet_model.py",target:"_blank",rel:"noopener noreferrer"},on=i(`<div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#DCDCAA;"> @</span><span style="color:#4EC9B0;">staticmethod</span></span>
345
+ <span class="line"><span style="color:#569CD6;"> def</span><span style="color:#DCDCAA;"> new_loss</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">ref</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">inf</span><span style="color:#D4D4D4;">):</span></span>
346
+ <span class="line"><span style="color:#CE9178;"> &quot;&quot;&quot;Your new loss</span></span>
347
+ <span class="line"><span style="color:#CE9178;"> Args:</span></span>
348
+ <span class="line"><span style="color:#CE9178;"> ref: (Batch, samples)</span></span>
349
+ <span class="line"><span style="color:#CE9178;"> inf: (Batch, samples)</span></span>
350
+ <span class="line"><span style="color:#CE9178;"> Returns:</span></span>
351
+ <span class="line"><span style="color:#CE9178;"> loss: (Batch,)</span></span>
352
+ <span class="line"><span style="color:#CE9178;"> &quot;&quot;&quot;</span></span>
353
+ <span class="line"><span style="color:#D4D4D4;"> ...</span></span>
354
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> loss</span></span>
355
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div>`,1),cn={href:"https://github.com/espnet/espnet/blob/master/espnet2/enh/espnet_model.py#L21",target:"_blank",rel:"noopener noreferrer"},un={href:"https://github.com/espnet/espnet/blob/master/espnet2/enh/espnet_model.py#L246",target:"_blank",rel:"noopener noreferrer"},mn=n("h3",{id:"step-4-create-unit-tests-for-the-new-model",tabindex:"-1"},[n("a",{class:"header-anchor",href:"#step-4-create-unit-tests-for-the-new-model"},[n("span",null,"Step 4 Create unit tests for the new model")])],-1),vn={href:"https://github.com/espnet/espnet/tree/master/test/espnet2/enh/encoder",target:"_blank",rel:"noopener noreferrer"},hn={href:"https://github.com/espnet/espnet/tree/master/test/espnet2/enh/decoder",target:"_blank",rel:"noopener noreferrer"},bn={href:"https://github.com/espnet/espnet/tree/master/test/espnet2/enh/separator",target:"_blank",rel:"noopener noreferrer"};function _n(fn,gn){const e=p("ExternalLinkIcon");return t(),r("div",null,[o,n("p",null,[n("a",c,[u,a(e)])]),m,v,h,b,n("p",null,[s("This notebook is created by: Chenda Li ("),n("a",_,[s("@LiChenda"),a(e)]),s(") and Wangyou Zhang ("),n("a",f,[s("@Emrys365"),a(e)]),s(")")]),g,n("p",null,[s("The pretrained models can be download from direct URL, or from "),n("a",x,[s("zenodo"),a(e)]),s(" and "),n("a",w,[s("huggingface"),a(e)]),s(" with model ID.")]),q,n("p",null,[s("Prepare "),y,s(", which will be used in stage 1 in "),k,s(". 
It can take some arguments as input, see "),n("a",E,[s("egs2/wsj0_2mix/enh1/local/data.sh"),a(e)]),s(" for reference.")]),S,n("blockquote",null,[n("p",null,[s("Remember to check your new scripts with "),n("a",D,[s("shellcheck"),a(e)]),s(", otherwise they may fail the tests in "),n("a",T,[s("ci/test_shell.sh"),a(e)]),s(".")])]),C,n("p",null,[s("Prepare training configuration files (e.g. "),n("a",A,[s("train.yaml"),a(e)]),s(") under "),I,s(".")]),N,R,n("p",null,[s("Write "),z,s(" to provide a template entry script, so that users can easily run your recipe by "),M,s(". See "),n("a",P,[s("egs2/wsj0_2mix/enh1/run.sh"),a(e)]),s(" for reference.")]),F,L,j,n("blockquote",null,[n("p",null,[s("For Time-Frequency masking models, the encoder and decoder would be "),n("a",O,[s("stft_encoder.py"),a(e)]),s(" and "),n("a",H,[s("stft_decoder.py"),a(e)]),s(" respectively, and the separator can be any of "),n("a",U,[s("dprnn_separator.py"),a(e)]),s(", "),n("a",G,[s("rnn_separator.py"),a(e)]),s(", "),n("a",W,[s("tcn_separator.py"),a(e)]),s(", and "),n("a",Y,[s("transformer_separator.py"),a(e)]),s(". For TasNet, the encoder and decoder are "),n("a",B,[s("conv_encoder.py"),a(e)]),s(" and "),n("a",V,[s("conv_decoder.py"),a(e)]),s(" respectively. The separator is "),n("a",J,[s("tcn_separator.py"),a(e)]),s(".")])]),K,n("p",null,[s("For encoder, separator, and decoder models, create new scripts under "),n("a",Z,[s("espnet2/enh/encoder/"),a(e)]),s(", "),n("a",Q,[s("espnet2/enh/separator/"),a(e)]),s(", and "),n("a",X,[s("espnet2/enh/decoder/"),a(e)]),s(", respectively.")]),n("p",null,[s("For a separator model, please make sure it implements the "),$,s(" property. 
See "),n("a",nn,[s("espnet2/enh/separator/rnn_separator.py"),a(e)]),s(" for reference.")]),n("blockquote",null,[n("p",null,[s("Remember to format your new scripts to match the styles in "),sn,s(" and "),en,s(", otherwise they may fail the tests in "),n("a",an,[s("ci/test_python.sh"),a(e)]),s(".")])]),ln,n("p",null,[s("In "),n("a",pn,[s("espnet2/tasks/enh.py"),a(e)]),s(", add your new model to the corresponding "),tn,s(", e.g.")]),rn,n("p",null,[s("If you want to use a new loss function for your model, you can add it to "),n("a",dn,[s("espnet2/enh/espnet_model.py"),a(e)]),s(", such as:")]),on,n("p",null,[s("Then add your loss name to "),n("a",cn,[s("ALL_LOSS_TYPES"),a(e)]),s(", and handle the loss calculation in "),n("a",un,[s("_compute_loss"),a(e)]),s(".")]),mn,n("p",null,[s("Finally, it would be nice to make some unit tests for your new model under "),n("a",vn,[s("test/espnet2/enh/encoder"),a(e)]),s(", "),n("a",hn,[s("test/espnet2/enh/decoder"),a(e)]),s(", or "),n("a",bn,[s("test/espnet2/enh/separator"),a(e)]),s(".")])])}const wn=l(d,[["render",_n],["__file","espnet_se_demonstration_for_waspaa_2021.html.vue"]]),qn=JSON.parse('{"path":"/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html","title":"ESPnet Speech Enhancement Demonstration","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Install","slug":"install","link":"#install","children":[]},{"level":2,"title":"Speech Enhancement with Pretrained Models","slug":"speech-enhancement-with-pretrained-models","link":"#speech-enhancement-with-pretrained-models","children":[{"level":3,"title":"Single-Channel Enhancement, the CHiME example","slug":"single-channel-enhancement-the-chime-example","link":"#single-channel-enhancement-the-chime-example","children":[]},{"level":3,"title":"Enhance your own pre-recordings","slug":"enhance-your-own-pre-recordings","link":"#enhance-your-own-pre-recordings","children":[]},{"level":3,"title":"Multi-Channel 
Enhancement","slug":"multi-channel-enhancement","link":"#multi-channel-enhancement","children":[]}]},{"level":2,"title":"Speech Separation","slug":"speech-separation","link":"#speech-separation","children":[{"level":3,"title":"Model Selection","slug":"model-selection","link":"#model-selection","children":[]},{"level":3,"title":"Separate Speech Mixture","slug":"separate-speech-mixture","link":"#separate-speech-mixture","children":[]}]},{"level":2,"title":"Evluate separated speech with pretrained ASR model","slug":"evluate-separated-speech-with-pretrained-asr-model","link":"#evluate-separated-speech-with-pretrained-asr-model","children":[]},{"level":2,"title":"Creating a New Recipe","slug":"creating-a-new-recipe","link":"#creating-a-new-recipe","children":[{"level":3,"title":"Step 1 Create recipe directory","slug":"step-1-create-recipe-directory","link":"#step-1-create-recipe-directory","children":[]},{"level":3,"title":"Step 2 Write scripts for data preparation","slug":"step-2-write-scripts-for-data-preparation","link":"#step-2-write-scripts-for-data-preparation","children":[]},{"level":3,"title":"Step 3 Prepare training configuration","slug":"step-3-prepare-training-configuration","link":"#step-3-prepare-training-configuration","children":[]},{"level":3,"title":"Step 4 Prepare run.sh","slug":"step-4-prepare-run-sh","link":"#step-4-prepare-run-sh","children":[]}]},{"level":2,"title":"Implementing a New Speech Enhancement/Separation Model","slug":"implementing-a-new-speech-enhancement-separation-model","link":"#implementing-a-new-speech-enhancement-separation-model","children":[{"level":3,"title":"Step 1 Create model scripts","slug":"step-1-create-model-scripts","link":"#step-1-create-model-scripts","children":[]},{"level":3,"title":"Step 2 Add the new model to related scripts","slug":"step-2-add-the-new-model-to-related-scripts","link":"#step-2-add-the-new-model-to-related-scripts","children":[]},{"level":3,"title":"Step 3 [Optional] Create new loss 
functions","slug":"step-3-optional-create-new-loss-functions","link":"#step-3-optional-create-new-loss-functions","children":[]},{"level":3,"title":"Step 4 Create unit tests for the new model","slug":"step-4-create-unit-tests-for-the-new-model","link":"#step-4-create-unit-tests-for-the-new-model","children":[]}]}],"git":{},"filePathRelative":"espnet2/se/espnet_se_demonstration_for_waspaa_2021.md"}');export{wn as comp,qn as data};
assets/finetune_owsm.html-ICOQYZj2.js ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as s,o as n,c as a,e as l}from"./app-DTS6SjJz.js";const e={},p=l(`<h1 id="owsm-finetuning-with-custom-dataset" tabindex="-1"><a class="header-anchor" href="#owsm-finetuning-with-custom-dataset"><span>OWSM finetuning with custom dataset</span></a></h1><p>This Jupyter notebook provides a step-by-step guide on using the ESPnetEasy module to finetune owsm model. In this demonstration, we will leverage the custom dataset to finetune an OWSM model for ASR task.</p><h2 id="data-preparation" tabindex="-1"><a class="header-anchor" href="#data-preparation"><span>Data Preparation</span></a></h2><p>For this tutorial, we assume that we have the custom dataset with 654 audio with the following directory structure:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>audio</span></span>
2
+ <span class="line"><span>├── 001 [420 entries exceeds filelimit, not opening dir]</span></span>
3
+ <span class="line"><span>└── 002 [234 entries exceeds filelimit, not opening dir]</span></span>
4
+ <span class="line"><span>transcription</span></span>
5
+ <span class="line"><span>└── owsm_v3.1</span></span>
6
+ <span class="line"><span> ├── 001.csv</span></span>
7
+ <span class="line"><span> └── 002.csv</span></span>
8
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>The csv files contains the audio path, text, and text_ctc data in Japanese. For example, the csv constains the following data:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>audio/001/00014.wav,しゃべるたびに追いかけてくるんですけど,なんかしゃべるたびにおいかけてくるんですけど</span></span>
9
+ <span class="line"><span>audio/001/00015.wav,え、どうしよう,えどうしよう</span></span>
10
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> os</span></span>
11
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> glob </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> glob</span></span>
12
+ <span class="line"></span>
13
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> numpy </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> np</span></span>
14
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa</span></span>
15
+ <span class="line"></span>
16
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> torch</span></span>
17
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.s2t_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
18
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.layers.create_lora_adapter </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> create_lora_adapter</span></span>
19
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> espnetez </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> ez</span></span>
20
+ <span class="line"></span>
21
+ <span class="line"><span style="color:#6A9955;"># Define hyper parameters</span></span>
22
+ <span class="line"><span style="color:#D4D4D4;">DUMP_DIR = </span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;./dump&quot;</span></span>
23
+ <span class="line"><span style="color:#D4D4D4;">CSV_DIR = </span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;./transcription&quot;</span></span>
24
+ <span class="line"><span style="color:#D4D4D4;">EXP_DIR = </span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;./exp/finetune&quot;</span></span>
25
+ <span class="line"><span style="color:#D4D4D4;">STATS_DIR = </span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;./exp/stats_finetune&quot;</span></span>
26
+ <span class="line"></span>
27
+ <span class="line"><span style="color:#D4D4D4;">FINETUNE_MODEL = </span><span style="color:#CE9178;">&quot;espnet/owsm_v3.1_ebf&quot;</span></span>
28
+ <span class="line"><span style="color:#D4D4D4;">LORA_TARGET = [</span></span>
29
+ <span class="line"><span style="color:#CE9178;"> &quot;w_1&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;w_2&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;merge_proj&quot;</span></span>
30
+ <span class="line"><span style="color:#D4D4D4;">]</span></span>
31
+ <span class="line"><span style="color:#D4D4D4;">LANGUAGE = </span><span style="color:#CE9178;">&quot;jpn&quot;</span></span>
32
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="setup-training-configs-and-model" tabindex="-1"><a class="header-anchor" href="#setup-training-configs-and-model"><span>Setup training configs and model</span></a></h2><p>Since we are going to finetune an OWSM model for ASR task, we will use the tokenizer and TokenIDConverter of the OWSM model. We will also use the training config as the default parameter sets, and update them with the finetuning configuration.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">pretrained_model = Speech2Text.from_pretrained(</span></span>
33
+ <span class="line"><span style="color:#D4D4D4;"> FINETUNE_MODEL,</span></span>
34
+ <span class="line"><span style="color:#9CDCFE;"> category_sym</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;&lt;</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">LANGUAGE</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&gt;&quot;</span><span style="color:#D4D4D4;">,</span></span>
35
+ <span class="line"><span style="color:#9CDCFE;"> beam_size</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;">,</span></span>
36
+ <span class="line"><span style="color:#D4D4D4;">) </span><span style="color:#6A9955;"># Load model to extract configs.</span></span>
37
+ <span class="line"><span style="color:#D4D4D4;">pretrain_config = </span><span style="color:#DCDCAA;">vars</span><span style="color:#D4D4D4;">(pretrained_model.s2t_train_args)</span></span>
38
+ <span class="line"><span style="color:#D4D4D4;">tokenizer = pretrained_model.tokenizer</span></span>
39
+ <span class="line"><span style="color:#D4D4D4;">converter = pretrained_model.converter</span></span>
40
+ <span class="line"><span style="color:#C586C0;">del</span><span style="color:#D4D4D4;"> pretrained_model</span></span>
41
+ <span class="line"></span>
42
+ <span class="line"><span style="color:#D4D4D4;">finetune_config = ez.config.update_finetune_config(</span></span>
43
+ <span class="line"><span style="color:#CE9178;"> &#39;s2t&#39;</span><span style="color:#D4D4D4;">,</span></span>
44
+ <span class="line"><span style="color:#D4D4D4;"> pretrain_config,</span></span>
45
+ <span class="line"><span style="color:#569CD6;"> f</span><span style="color:#CE9178;">&quot;./config/finetune_with_lora.yaml&quot;</span></span>
46
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
47
+ <span class="line"></span>
48
+ <span class="line"><span style="color:#6A9955;"># define model loading function</span></span>
49
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> count_parameters</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">model</span><span style="color:#D4D4D4;">):</span></span>
50
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#DCDCAA;"> sum</span><span style="color:#D4D4D4;">(p.numel() </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> p </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> model.parameters() </span><span style="color:#C586C0;">if</span><span style="color:#D4D4D4;"> p.requires_grad)</span></span>
51
+ <span class="line"></span>
52
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> build_model_fn</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">args</span><span style="color:#D4D4D4;">):</span></span>
53
+ <span class="line"><span style="color:#D4D4D4;"> pretrained_model = Speech2Text.from_pretrained(</span></span>
54
+ <span class="line"><span style="color:#D4D4D4;"> FINETUNE_MODEL,</span></span>
55
+ <span class="line"><span style="color:#9CDCFE;"> category_sym</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;&lt;</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">LANGUAGE</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&gt;&quot;</span><span style="color:#D4D4D4;">,</span></span>
56
+ <span class="line"><span style="color:#9CDCFE;"> beam_size</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;">,</span></span>
57
+ <span class="line"><span style="color:#D4D4D4;"> )</span></span>
58
+ <span class="line"><span style="color:#D4D4D4;"> model = pretrained_model.s2t_model</span></span>
59
+ <span class="line"><span style="color:#D4D4D4;"> model.train()</span></span>
60
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&#39;Trainable parameters: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">count_parameters(model)</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&#39;</span><span style="color:#D4D4D4;">)</span></span>
61
+ <span class="line"><span style="color:#6A9955;"> # apply lora</span></span>
62
+ <span class="line"><span style="color:#D4D4D4;"> create_lora_adapter(model, </span><span style="color:#9CDCFE;">target_modules</span><span style="color:#D4D4D4;">=LORA_TARGET)</span></span>
63
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&#39;Trainable parameters after LORA: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">count_parameters(model)</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&#39;</span><span style="color:#D4D4D4;">)</span></span>
64
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> model</span></span>
65
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="wrap-with-espneteasydataset" tabindex="-1"><a class="header-anchor" href="#wrap-with-espneteasydataset"><span>Wrap with ESPnetEasyDataset</span></a></h2><p>Before initiating the training process, it is crucial to adapt the dataset to the ESPnet format. The dataset class should output tokenized text and audio files in <code>np.array</code> format.</p><p>Then let&#39;s define the custom dataset class. The owsm finetuning requires <code>audio</code>, <code>text</code>, <code>text_prev</code> and <code>text_ctc</code> data. You can use your custom-defined dataset, huggingface <code>datasets</code> library, or <code>lhotse</code> library, or any other dataloader that you want to use.</p><p>When you try to use custom-defined dataset, you should define the <code>data_info</code> dictionary. 
It defines the mapping between the output of your model and the input of ESPnet models.</p><p><strong>Note</strong>:</p><ul><li>Currently we do not support the custom dataloader that feeds processed feature.</li></ul><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">LANGUAGE = </span></span>
66
+ <span class="line"><span style="color:#6A9955;"># custom dataset class</span></span>
67
+ <span class="line"><span style="color:#569CD6;">class</span><span style="color:#4EC9B0;"> CustomDataset</span><span style="color:#D4D4D4;">(</span><span style="color:#4EC9B0;">torch</span><span style="color:#D4D4D4;">.</span><span style="color:#4EC9B0;">utils</span><span style="color:#D4D4D4;">.</span><span style="color:#4EC9B0;">data</span><span style="color:#D4D4D4;">.</span><span style="color:#4EC9B0;">Dataset</span><span style="color:#D4D4D4;">):</span></span>
68
+ <span class="line"><span style="color:#569CD6;"> def</span><span style="color:#DCDCAA;"> __init__</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">self</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">data_list</span><span style="color:#D4D4D4;">):</span></span>
69
+ <span class="line"><span style="color:#6A9955;"> # data_list is a list of tuples (audio_path, text, text_ctc)</span></span>
70
+ <span class="line"><span style="color:#569CD6;"> self</span><span style="color:#D4D4D4;">.data = data_list</span></span>
71
+ <span class="line"></span>
72
+ <span class="line"><span style="color:#569CD6;"> def</span><span style="color:#DCDCAA;"> __len__</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">self</span><span style="color:#D4D4D4;">):</span></span>
73
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#DCDCAA;"> len</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">self</span><span style="color:#D4D4D4;">.data)</span></span>
74
+ <span class="line"></span>
75
+ <span class="line"><span style="color:#569CD6;"> def</span><span style="color:#DCDCAA;"> __getitem__</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">self</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">idx</span><span style="color:#D4D4D4;">):</span></span>
76
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#569CD6;"> self</span><span style="color:#D4D4D4;">._parse_single_data(</span><span style="color:#569CD6;">self</span><span style="color:#D4D4D4;">.data[idx])</span></span>
77
+ <span class="line"></span>
78
+ <span class="line"><span style="color:#569CD6;"> def</span><span style="color:#DCDCAA;"> _parse_single_data</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">self</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">d</span><span style="color:#D4D4D4;">):</span></span>
79
+ <span class="line"><span style="color:#D4D4D4;"> text = </span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;&lt;</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">LANGUAGE</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&gt;&lt;asr&gt;&lt;notimestamps&gt; </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">d[</span><span style="color:#CE9178;">&#39;transcript&#39;</span><span style="color:#D4D4D4;">]</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span></span>
80
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> {</span></span>
81
+ <span class="line"><span style="color:#CE9178;"> &quot;audio_path&quot;</span><span style="color:#D4D4D4;">: d[</span><span style="color:#CE9178;">&quot;audio_path&quot;</span><span style="color:#D4D4D4;">],</span></span>
82
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: text,</span></span>
83
+ <span class="line"><span style="color:#CE9178;"> &quot;text_prev&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&quot;&lt;na&gt;&quot;</span><span style="color:#D4D4D4;">,</span></span>
84
+ <span class="line"><span style="color:#CE9178;"> &quot;text_ctc&quot;</span><span style="color:#D4D4D4;">: d[</span><span style="color:#CE9178;">&#39;text_ctc&#39;</span><span style="color:#D4D4D4;">],</span></span>
85
+ <span class="line"><span style="color:#D4D4D4;"> }</span></span>
86
+ <span class="line"></span>
87
+ <span class="line"></span>
88
+ <span class="line"><span style="color:#D4D4D4;">data_list = []</span></span>
89
+ <span class="line"><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> csv_file </span><span style="color:#C586C0;">in</span><span style="color:#DCDCAA;"> sorted</span><span style="color:#D4D4D4;">(glob(os.path.join(CSV_DIR, </span><span style="color:#CE9178;">&quot;*.csv&quot;</span><span style="color:#D4D4D4;">))):</span></span>
90
+ <span class="line"><span style="color:#C586C0;"> with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(csv_file, </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">encoding</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;utf-8&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
91
+ <span class="line"><span style="color:#D4D4D4;"> data_list += f.readlines()[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:] </span><span style="color:#6A9955;"># skip header</span></span>
92
+ <span class="line"></span>
93
+ <span class="line"><span style="color:#D4D4D4;">validation_examples = </span><span style="color:#B5CEA8;">20</span></span>
94
+ <span class="line"><span style="color:#D4D4D4;">train_dataset = CustomDataset(data_list[:-validation_examples])</span></span>
95
+ <span class="line"><span style="color:#D4D4D4;">valid_dataset = CustomDataset(data_list[-validation_examples:])</span></span>
96
+ <span class="line"></span>
97
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> tokenize</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">text</span><span style="color:#D4D4D4;">):</span></span>
98
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> np.array(converter.tokens2ids(tokenizer.text2tokens(text)))</span></span>
99
+ <span class="line"></span>
100
+ <span class="line"><span style="color:#6A9955;"># The output of CustomDatasetInstance[idx] will converted to np.array</span></span>
101
+ <span class="line"><span style="color:#6A9955;"># with the functions defined in the data_info dictionary.</span></span>
102
+ <span class="line"><span style="color:#D4D4D4;">data_info = {</span></span>
103
+ <span class="line"><span style="color:#CE9178;"> &quot;speech&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: librosa.load(d[</span><span style="color:#CE9178;">&quot;audio_path&quot;</span><span style="color:#D4D4D4;">], </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">],</span></span>
104
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(d[</span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">]),</span></span>
105
+ <span class="line"><span style="color:#CE9178;"> &quot;text_prev&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(d[</span><span style="color:#CE9178;">&quot;text_prev&quot;</span><span style="color:#D4D4D4;">]),</span></span>
106
+ <span class="line"><span style="color:#CE9178;"> &quot;text_ctc&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(d[</span><span style="color:#CE9178;">&quot;text_ctc&quot;</span><span style="color:#D4D4D4;">]),</span></span>
107
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
108
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Or if you want to use <code>datasets</code> library or <code>lhotse</code> library:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># Datasets library</span></span>
109
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> datasets </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> load_dataset, Audio</span></span>
110
+ <span class="line"></span>
111
+ <span class="line"><span style="color:#D4D4D4;">train_dataset = load_dataset(</span><span style="color:#CE9178;">&quot;audiofolder&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">data_dir</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;/path/to/huggingface_dataset&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">split</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&#39;train[:-</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">validation_examples</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">]&#39;</span><span style="color:#D4D4D4;">)</span></span>
112
+ <span class="line"><span style="color:#D4D4D4;">valid_dataset = load_dataset(</span><span style="color:#CE9178;">&quot;audiofolder&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">data_dir</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;/path/to/huggingface_dataset&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">split</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&#39;train[-</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">validation_examples</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">:]&#39;</span><span style="color:#D4D4D4;">)</span></span>
113
+ <span class="line"><span style="color:#D4D4D4;">train_dataset = train_dataset.cast_column(</span><span style="color:#CE9178;">&quot;audio&quot;</span><span style="color:#D4D4D4;">, Audio(</span><span style="color:#9CDCFE;">sampling_rate</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">))</span></span>
114
+ <span class="line"><span style="color:#D4D4D4;">valid_dataset = valid_dataset.cast_column(</span><span style="color:#CE9178;">&quot;audio&quot;</span><span style="color:#D4D4D4;">, Audio(</span><span style="color:#9CDCFE;">sampling_rate</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">))</span></span>
115
+ <span class="line"><span style="color:#D4D4D4;">data_info = {</span></span>
116
+ <span class="line"><span style="color:#CE9178;"> &quot;speech&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: d[</span><span style="color:#CE9178;">&#39;audio&#39;</span><span style="color:#D4D4D4;">][</span><span style="color:#CE9178;">&#39;array&#39;</span><span style="color:#D4D4D4;">],</span></span>
117
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;&lt;</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">LANGUAGE</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&gt;&lt;asr&gt;&lt;notimestamps&gt; </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">d[</span><span style="color:#CE9178;">&#39;transcript&#39;</span><span style="color:#D4D4D4;">]</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">),</span></span>
118
+ <span class="line"><span style="color:#CE9178;"> &quot;text_prev&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(</span><span style="color:#CE9178;">&quot;&lt;na&gt;&quot;</span><span style="color:#D4D4D4;">),</span></span>
119
+ <span class="line"><span style="color:#CE9178;"> &quot;text_ctc&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(d[</span><span style="color:#CE9178;">&quot;text_ctc&quot;</span><span style="color:#D4D4D4;">]),</span></span>
120
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
121
+ <span class="line"></span>
122
+ <span class="line"><span style="color:#6A9955;"># Or lhotse library. The following code is from the official document.</span></span>
123
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> pathlib </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Path</span></span>
124
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> lhotse </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> CutSet</span></span>
125
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> lhotse.recipes </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> download_librispeech, prepare_librispeech</span></span>
126
+ <span class="line"></span>
127
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> load_audio</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">audio_path</span><span style="color:#D4D4D4;">):</span></span>
128
+ <span class="line"><span style="color:#D4D4D4;"> y, _ = librosa.load(audio_path, </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">)</span></span>
129
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> y</span></span>
130
+ <span class="line"></span>
131
+ <span class="line"><span style="color:#D4D4D4;">root_dir = Path(</span><span style="color:#CE9178;">&quot;data&quot;</span><span style="color:#D4D4D4;">)</span></span>
132
+ <span class="line"><span style="color:#D4D4D4;">tmp_dir = Path(</span><span style="color:#CE9178;">&quot;tmp&quot;</span><span style="color:#D4D4D4;">)</span></span>
133
+ <span class="line"><span style="color:#D4D4D4;">tmp_dir.mkdir(</span><span style="color:#9CDCFE;">exist_ok</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">True</span><span style="color:#D4D4D4;">)</span></span>
134
+ <span class="line"><span style="color:#D4D4D4;">num_jobs = os.cpu_count() - </span><span style="color:#B5CEA8;">1</span></span>
135
+ <span class="line"></span>
136
+ <span class="line"><span style="color:#D4D4D4;">libri_variant = </span><span style="color:#CE9178;">&quot;mini_librispeech&quot;</span></span>
137
+ <span class="line"><span style="color:#D4D4D4;">libri_root = download_librispeech(root_dir, </span><span style="color:#9CDCFE;">dataset_parts</span><span style="color:#D4D4D4;">=libri_variant)</span></span>
138
+ <span class="line"><span style="color:#D4D4D4;">libri = prepare_librispeech(</span></span>
139
+ <span class="line"><span style="color:#D4D4D4;"> libri_root, </span><span style="color:#9CDCFE;">dataset_parts</span><span style="color:#D4D4D4;">=libri_variant, </span><span style="color:#9CDCFE;">output_dir</span><span style="color:#D4D4D4;">=root_dir, </span><span style="color:#9CDCFE;">num_jobs</span><span style="color:#D4D4D4;">=num_jobs</span></span>
140
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
141
+ <span class="line"><span style="color:#D4D4D4;">train_dataset = CutSet.from_manifests(**libri[</span><span style="color:#CE9178;">&quot;train-clean-5&quot;</span><span style="color:#D4D4D4;">])</span></span>
142
+ <span class="line"><span style="color:#D4D4D4;">valid_dataset = CutSet.from_manifests(**libri[</span><span style="color:#CE9178;">&quot;dev-clean-2&quot;</span><span style="color:#D4D4D4;">])</span></span>
143
+ <span class="line"><span style="color:#D4D4D4;">data_info = {</span></span>
144
+ <span class="line"><span style="color:#CE9178;"> &quot;speech&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: load_audio(d.recording.sources[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].source),</span></span>
145
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;&lt;</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">LANGUAGE</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&gt;&lt;asr&gt;&lt;notimestamps&gt; </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">d.supervisions[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].text</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">),</span></span>
146
+ <span class="line"><span style="color:#CE9178;"> &quot;text_prev&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(</span><span style="color:#CE9178;">&quot;&lt;na&gt;&quot;</span><span style="color:#D4D4D4;">),</span></span>
147
+ <span class="line"><span style="color:#CE9178;"> &quot;text_ctc&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(d.supervisions[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].text),</span></span>
148
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
149
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>And finally you need to wrap your custom dataset with ESPnetEasyDataset.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># Convert into ESPnet-Easy dataset format</span></span>
150
+ <span class="line"><span style="color:#D4D4D4;">train_dataset = ez.dataset.ESPnetEasyDataset(train_dataset, </span><span style="color:#9CDCFE;">data_info</span><span style="color:#D4D4D4;">=data_info)</span></span>
151
+ <span class="line"><span style="color:#D4D4D4;">valid_dataset = ez.dataset.ESPnetEasyDataset(valid_dataset, </span><span style="color:#9CDCFE;">data_info</span><span style="color:#D4D4D4;">=data_info)</span></span>
152
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="training" tabindex="-1"><a class="header-anchor" href="#training"><span>Training</span></a></h2><p>While the configuration remains consistent with other notebooks, the instantiation arguments for the Trainer class differ in this case. As we have not generated dump files, we can disregard arguments related to dump files and directly provide the train/valid dataset classes.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>trainer = Trainer(</span></span>
153
+ <span class="line"><span> ...</span></span>
154
+ <span class="line"><span> train_dataset=your_train_dataset_instance,</span></span>
155
+ <span class="line"><span> train_dataset=your_valid_dataset_instance,</span></span>
156
+ <span class="line"><span> ...</span></span>
157
+ <span class="line"><span>)</span></span>
158
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">trainer = ez.Trainer(</span></span>
159
+ <span class="line"><span style="color:#9CDCFE;"> task</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;s2t&#39;</span><span style="color:#D4D4D4;">,</span></span>
160
+ <span class="line"><span style="color:#9CDCFE;"> train_config</span><span style="color:#D4D4D4;">=finetune_config,</span></span>
161
+ <span class="line"><span style="color:#9CDCFE;"> train_dataset</span><span style="color:#D4D4D4;">=train_dataset,</span></span>
162
+ <span class="line"><span style="color:#9CDCFE;"> valid_dataset</span><span style="color:#D4D4D4;">=valid_dataset,</span></span>
163
+ <span class="line"><span style="color:#9CDCFE;"> build_model_fn</span><span style="color:#D4D4D4;">=build_model_fn, </span><span style="color:#6A9955;"># provide the pre-trained model</span></span>
164
+ <span class="line"><span style="color:#9CDCFE;"> data_info</span><span style="color:#D4D4D4;">=data_info,</span></span>
165
+ <span class="line"><span style="color:#9CDCFE;"> output_dir</span><span style="color:#D4D4D4;">=EXP_DIR,</span></span>
166
+ <span class="line"><span style="color:#9CDCFE;"> stats_dir</span><span style="color:#D4D4D4;">=STATS_DIR,</span></span>
167
+ <span class="line"><span style="color:#9CDCFE;"> ngpu</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span></span>
168
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
169
+ <span class="line"><span style="color:#D4D4D4;">trainer.collect_stats()</span></span>
170
+ <span class="line"><span style="color:#D4D4D4;">trainer.train()</span></span>
171
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="inference" tabindex="-1"><a class="header-anchor" href="#inference"><span>Inference</span></a></h2><p>When training is done, we can use the inference API to generate the transcription, but don&#39;t forget to apply lora before loading the model!</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">DEVICE = </span><span style="color:#CE9178;">&quot;cuda&quot;</span></span>
172
+ <span class="line"></span>
173
+ <span class="line"><span style="color:#D4D4D4;">model = Speech2Text.from_pretrained(</span></span>
174
+ <span class="line"><span style="color:#CE9178;"> &quot;espnet/owsm_v3.1_ebf&quot;</span><span style="color:#D4D4D4;">,</span></span>
175
+ <span class="line"><span style="color:#9CDCFE;"> category_sym</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;&lt;jpn&gt;&quot;</span><span style="color:#D4D4D4;">,</span></span>
176
+ <span class="line"><span style="color:#9CDCFE;"> beam_size</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;">,</span></span>
177
+ <span class="line"><span style="color:#9CDCFE;"> device</span><span style="color:#D4D4D4;">=DEVICE</span></span>
178
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
179
+ <span class="line"><span style="color:#D4D4D4;">create_lora_adapter(model.s2t_model, </span><span style="color:#9CDCFE;">target_modules</span><span style="color:#D4D4D4;">=LORA_TARGET)</span></span>
180
+ <span class="line"><span style="color:#D4D4D4;">model.s2t_model.eval()</span></span>
181
+ <span class="line"><span style="color:#D4D4D4;">d = torch.load(</span><span style="color:#CE9178;">&quot;./exp/finetune/1epoch.pth&quot;</span><span style="color:#D4D4D4;">)</span></span>
182
+ <span class="line"><span style="color:#D4D4D4;">model.s2t_model.load_state_dict(d)</span></span>
183
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="results" tabindex="-1"><a class="header-anchor" href="#results"><span>Results</span></a></h2><p>As a result, the finetuned owsm-v3.1 could successfully transcribe the audio files.</p><p><strong>Example</strong></p><ul><li>before finetune: 出してこの時間二のどりを。</li><li>after finetune: ダンスでこの世界に彩りを。</li></ul>`,33),o=[p];function t(i,r){return n(),a("div",null,o)}const D=s(e,[["render",t],["__file","finetune_owsm.html.vue"]]),d=JSON.parse('{"path":"/espnetez/asr/finetune_owsm.html","title":"OWSM finetuning with custom dataset","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Data Preparation","slug":"data-preparation","link":"#data-preparation","children":[]},{"level":2,"title":"Setup training configs and model","slug":"setup-training-configs-and-model","link":"#setup-training-configs-and-model","children":[]},{"level":2,"title":"Wrap with ESPnetEasyDataset","slug":"wrap-with-espneteasydataset","link":"#wrap-with-espneteasydataset","children":[]},{"level":2,"title":"Training","slug":"training","link":"#training","children":[]},{"level":2,"title":"Inference","slug":"inference","link":"#inference","children":[]},{"level":2,"title":"Results","slug":"results","link":"#results","children":[]}],"git":{},"filePathRelative":"espnetez/asr/finetune_owsm.md"}');export{D as comp,d as data};
assets/finetune_with_lora.html-3NfoQDOl.js ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as s,o as n,c as a,e}from"./app-DTS6SjJz.js";const l={},i=e(`<h1 id="finetune-model-with-espnet-easy" tabindex="-1"><a class="header-anchor" href="#finetune-model-with-espnet-easy"><span>Finetune Model with ESPnet-Easy</span></a></h1><p>In this notebook, we will explore the process of finetuning a pretrained model using the Librispeech-100 dataset. We&#39;ll start by downloading a pretrained model from the Hugging Face model hub and apply Low-Rank Adaptation (LoRA) techniques to reduce the number of training parameters.</p><p>In this notebook, we assume that the dump files have been already created. If you need guidance on creating the dump files, you can refer to the <code>training.ipynb</code> notebook.</p><p>First, we need to install the <code>loralib</code> package.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">%pip install loralib</span></span>
2
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>As with the <code>training.ipynb</code> notebook, we need to provide a dictionary to specify the file path and type for each data.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">DUMP_DIR = </span><span style="color:#CE9178;">&quot;./dump/libri100&quot;</span></span>
3
+ <span class="line"><span style="color:#D4D4D4;">data_info = {</span></span>
4
+ <span class="line"><span style="color:#CE9178;"> &quot;speech&quot;</span><span style="color:#D4D4D4;">: [</span><span style="color:#CE9178;">&quot;wav.scp&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;sound&quot;</span><span style="color:#D4D4D4;">],</span></span>
5
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: [</span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">],</span></span>
6
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
7
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="load-a-pretrained-model" tabindex="-1"><a class="header-anchor" href="#load-a-pretrained-model"><span>Load a pretrained model</span></a></h2><p>In ESPnet-Easy, you have the flexibility to define a custom model using the <code>build_model_fn</code> method. Additionally, you can load a pretrained model when needed.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.asr_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
8
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.layers.create_lora_adapter </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> create_lora_adapter</span></span>
9
+ <span class="line"></span>
10
+ <span class="line"></span>
11
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> build_model_fn</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">args</span><span style="color:#D4D4D4;">):</span></span>
12
+ <span class="line"><span style="color:#D4D4D4;"> pretrained_model = Speech2Text.from_pretrained(</span><span style="color:#CE9178;">&#39;pyf98/librispeech_conformer_hop_length160&#39;</span><span style="color:#D4D4D4;">)</span></span>
13
+ <span class="line"><span style="color:#D4D4D4;"> model = pretrained_model.asr_model</span></span>
14
+ <span class="line"><span style="color:#D4D4D4;"> model.train()</span></span>
15
+ <span class="line"></span>
16
+ <span class="line"><span style="color:#6A9955;"> # apply lora</span></span>
17
+ <span class="line"><span style="color:#D4D4D4;"> create_lora_adapter(model, </span><span style="color:#9CDCFE;">target_modules</span><span style="color:#D4D4D4;">=[</span><span style="color:#CE9178;">&#39;linear_q&#39;</span><span style="color:#D4D4D4;">])</span></span>
18
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> model</span></span>
19
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>When working with a pretrained model, the configuration is inherited from the model by default. To activate the LoRA model, it&#39;s essential to set the <code>use_lora</code> parameter to <code>True</code>. This configuration update can be easily achieved using the <code>update_finetune_config</code> method.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> espnetez </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> ez</span></span>
20
+ <span class="line"></span>
21
+ <span class="line"></span>
22
+ <span class="line"><span style="color:#D4D4D4;">pretrained_model = Speech2Text.from_pretrained(</span><span style="color:#CE9178;">&#39;pyf98/librispeech_conformer_hop_length160&#39;</span><span style="color:#D4D4D4;">)</span></span>
23
+ <span class="line"><span style="color:#D4D4D4;">pretrain_config = </span><span style="color:#DCDCAA;">vars</span><span style="color:#D4D4D4;">(pretrained_model.asr_train_args)</span></span>
24
+ <span class="line"><span style="color:#C586C0;">del</span><span style="color:#D4D4D4;"> pretrained_model</span></span>
25
+ <span class="line"></span>
26
+ <span class="line"><span style="color:#D4D4D4;">finetune_config = ez.config.update_finetune_config(</span></span>
27
+ <span class="line"><span style="color:#CE9178;"> &#39;asr&#39;</span><span style="color:#D4D4D4;">,</span></span>
28
+ <span class="line"><span style="color:#D4D4D4;"> pretrain_config,</span></span>
29
+ <span class="line"><span style="color:#CE9178;"> &#39;config/finetune_with_lora.yaml&#39;</span></span>
30
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
31
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="training" tabindex="-1"><a class="header-anchor" href="#training"><span>Training</span></a></h2><p>Finally, let&#39;s start training.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">EXP_DIR = </span><span style="color:#CE9178;">&quot;exp/finetune&quot;</span></span>
32
+ <span class="line"><span style="color:#D4D4D4;">STATS_DIR = </span><span style="color:#CE9178;">&quot;exp/stats_finetune&quot;</span></span>
33
+ <span class="line"></span>
34
+ <span class="line"><span style="color:#D4D4D4;">trainer = ez.Trainer(</span></span>
35
+ <span class="line"><span style="color:#9CDCFE;"> task</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;asr&#39;</span><span style="color:#D4D4D4;">,</span></span>
36
+ <span class="line"><span style="color:#9CDCFE;"> train_config</span><span style="color:#D4D4D4;">=finetune_config,</span></span>
37
+ <span class="line"><span style="color:#9CDCFE;"> train_dump_dir</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;dump/libri100/train&quot;</span><span style="color:#D4D4D4;">,</span></span>
38
+ <span class="line"><span style="color:#9CDCFE;"> valid_dump_dir</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;dump/libri100/dev&quot;</span><span style="color:#D4D4D4;">,</span></span>
39
+ <span class="line"><span style="color:#9CDCFE;"> build_model_fn</span><span style="color:#D4D4D4;">=build_model_fn, </span><span style="color:#6A9955;"># provide the pre-trained model</span></span>
40
+ <span class="line"><span style="color:#9CDCFE;"> data_info</span><span style="color:#D4D4D4;">=data_info,</span></span>
41
+ <span class="line"><span style="color:#9CDCFE;"> output_dir</span><span style="color:#D4D4D4;">=EXP_DIR,</span></span>
42
+ <span class="line"><span style="color:#9CDCFE;"> stats_dir</span><span style="color:#D4D4D4;">=STATS_DIR,</span></span>
43
+ <span class="line"><span style="color:#9CDCFE;"> ngpu</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span></span>
44
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
45
+ <span class="line"><span style="color:#D4D4D4;">trainer.collect_stats()</span></span>
46
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">trainer.train()</span></span>
47
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div>`,16),p=[i];function o(t,r){return n(),a("div",null,p)}const d=s(l,[["render",o],["__file","finetune_with_lora.html.vue"]]),D=JSON.parse('{"path":"/espnetez/asr/finetune_with_lora.html","title":"Finetune Model with ESPnet-Easy","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Load a pretrained model","slug":"load-a-pretrained-model","link":"#load-a-pretrained-model","children":[]},{"level":2,"title":"Training","slug":"training","link":"#training","children":[]}],"git":{},"filePathRelative":"espnetez/asr/finetune_with_lora.md"}');export{d as comp,D as data};
assets/index.html-DGcx4T0I.js ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import{_ as i,r as s,o,c as r,a as e,b as t,w as c,d as n,e as p}from"./app-DTS6SjJz.js";const d="/images/espnet_logo1.png",h={},u={align:"center"},m=e("a",{href:"https://vuepress.vuejs.org/",target:"_blank"},[e("img",{width:"600",src:d,alt:"logo"})],-1),_=p(`<h3 id="install" tabindex="-1"><a class="header-anchor" href="#install"><span>Install</span></a></h3><p>If you intend to do full experiments, including DNN training, then see Installation.</p><p>If you just need the Python module only:</p><div class="language-bash line-numbers-mode" data-ext="sh" data-title="sh"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># We recommend you install PyTorch before installing espnet following https://pytorch.org/get-started/locally/</span></span>
2
+ <span class="line"><span style="color:#DCDCAA;">pip</span><span style="color:#CE9178;"> install</span><span style="color:#CE9178;"> espnet</span></span>
3
+ <span class="line"><span style="color:#6A9955;"># To install the latest</span></span>
4
+ <span class="line"><span style="color:#6A9955;"># pip install git+https://github.com/espnet/espnet</span></span>
5
+ <span class="line"><span style="color:#6A9955;"># To install additional packages</span></span>
6
+ <span class="line"><span style="color:#6A9955;"># pip install &quot;espnet[all]&quot;</span></span>
7
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="citation" tabindex="-1"><a class="header-anchor" href="#citation"><span>Citation</span></a></h3>`,5),g={href:"https://github.com/espnet/espnet?tab=readme-ov-file#citations",target:"_blank",rel:"noopener noreferrer"};function v(f,b){const a=s("font"),l=s("ExternalLinkIcon");return o(),r("div",null,[e("p",u,[m,e("h3",null,[t(a,{color:"gray"},{default:c(()=>[n("ESPnet: end-to-end speech processing toolkit")]),_:1})])]),_,e("p",null,[n("You can find the citation "),e("a",g,[n("here"),t(l)])])])}const x=i(h,[["render",v],["__file","index.html.vue"]]),k=JSON.parse('{"path":"/","title":"","lang":"en-US","frontmatter":{"home":true,"footer":"MIT Licensed | Copyright © 2024-present ESPnet Community","hero":"ESPnet","search":true},"headers":[{"level":3,"title":"Install","slug":"install","link":"#install","children":[]},{"level":3,"title":"Citation","slug":"citation","link":"#citation","children":[]}],"git":{},"filePathRelative":"README.md"}');export{x as comp,k as data};
assets/onnx_conversion_demo.html-D56NEMop.js ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as a,r as l,o,c as p,a as s,d as n,b as i,e as t}from"./app-DTS6SjJz.js";const r={},c=s("h1",{id:"espnet-onnx-demonstration",tabindex:"-1"},[s("a",{class:"header-anchor",href:"#espnet-onnx-demonstration"},[s("span",null,"espnet_onnx demonstration")])],-1),d=s("p",null,"This notebook provides a demonstration of how to export your trained model into onnx format. Currently only ASR is supported.",-1),m=s("p",null,"see also:",-1),D=s("ul",null,[s("li",null,"ESPnet: https://github.com/espnet/espnet"),s("li",null,"espnet_onnx: https://github.com/Masao-Someki/espnet_onnx")],-1),v={href:"https://github.com/Masao-Someki",target:"_blank",rel:"noopener noreferrer"},y=t(`<h2 id="table-of-contents" tabindex="-1"><a class="header-anchor" href="#table-of-contents"><span>Table of Contents</span></a></h2><ul><li>Install Dependency</li><li>Export your model</li><li>Inference with onnx</li><li>Using streaming model</li></ul><h1 id="install-dependency" tabindex="-1"><a class="header-anchor" href="#install-dependency"><span>Install Dependency</span></a></h1><p>To run this demo, you need to install the following packages.</p><ul><li>espnet_onnx</li><li>torch &gt;= 1.11.0 (already installed in Colab)</li><li>espnet</li><li>espnet_model_zoo</li><li>onnx</li></ul><p><code>torch</code>, <code>espnet</code>, <code>espnet_model_zoo</code>, <code>onnx</code> is required to run the exportation demo.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">pip install -U espnet_onnx espnet espnet_model_zoo onnx</span></span>
2
+ <span class="line"></span>
3
+ <span class="line"><span style="color:#6A9955;"># in this demo, we need to update scipy to avoid an error</span></span>
4
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">pip install -U scipy</span></span>
5
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h1 id="export-your-model" tabindex="-1"><a class="header-anchor" href="#export-your-model"><span>Export your model</span></a></h1><h2 id="export-model-from-espnet-model-zoo" tabindex="-1"><a class="header-anchor" href="#export-model-from-espnet-model-zoo"><span>Export model from espnet_model_zoo</span></a></h2><p>The easiest way to export a model is to use <code>espnet_model_zoo</code>. You can download, unpack, and export the pretrained models with <code>export_from_pretrained</code> method. <code>espnet_onnx</code> will save the onnx models into cache directory, which is <code>\${HOME}/.cache/espnet_onnx</code> in default.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># export the model.</span></span>
6
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet_onnx.export </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> ModelExport</span></span>
7
+ <span class="line"></span>
8
+ <span class="line"><span style="color:#D4D4D4;">tag_name = </span><span style="color:#CE9178;">&#39;kamo-naoyuki/timit_asr_train_asr_raw_word_valid.acc.ave&#39;</span></span>
9
+ <span class="line"></span>
10
+ <span class="line"><span style="color:#D4D4D4;">m = ModelExport()</span></span>
11
+ <span class="line"><span style="color:#D4D4D4;">m.export_from_pretrained(tag_name)</span></span>
12
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="export-from-custom-model" tabindex="-1"><a class="header-anchor" href="#export-from-custom-model"><span>Export from custom model</span></a></h2><p><code>espnet_onnx</code> can also export your own trained model with <code>export</code> method.</p><p>The following script shows how to export from <code>espnet2.bin.asr_inference.Speech2Text</code> instance. You can also export from a zipped file, by using the <code>export_from_zip</code> function.<br> For this demonstration, I&#39;m using the <code>from_pretrained</code> method to load parameters, but you can load your own model.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># prepare the espnet2.bin.asr_inference.Speech2Text instance.</span></span>
13
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.asr_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
14
+ <span class="line"></span>
15
+ <span class="line"><span style="color:#D4D4D4;">tag_name = </span><span style="color:#CE9178;">&#39;kamo-naoyuki/timit_asr_train_asr_raw_word_valid.acc.ave&#39;</span></span>
16
+ <span class="line"><span style="color:#D4D4D4;">speech2text = Speech2Text.from_pretrained(tag_name)</span></span>
17
+ <span class="line"></span>
18
+ <span class="line"></span>
19
+ <span class="line"><span style="color:#6A9955;"># export model</span></span>
20
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet_onnx.export </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> ModelExport</span></span>
21
+ <span class="line"></span>
22
+ <span class="line"><span style="color:#D4D4D4;">sample_model_tag = </span><span style="color:#CE9178;">&#39;demo/sample_model_1&#39;</span></span>
23
+ <span class="line"><span style="color:#D4D4D4;">m = ModelExport()</span></span>
24
+ <span class="line"><span style="color:#D4D4D4;">m.export(</span></span>
25
+ <span class="line"><span style="color:#D4D4D4;"> speech2text,</span></span>
26
+ <span class="line"><span style="color:#D4D4D4;"> sample_model_tag,</span></span>
27
+ <span class="line"><span style="color:#9CDCFE;"> quantize</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">False</span></span>
28
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
29
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h1 id="inference-with-onnx" tabindex="-1"><a class="header-anchor" href="#inference-with-onnx"><span>Inference with onnx</span></a></h1><p>Now, let&#39;s use the exported models for inference.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># please provide the tag_name to specify exported model.</span></span>
30
+ <span class="line"><span style="color:#D4D4D4;">tag_name = </span><span style="color:#CE9178;">&#39;kamo-naoyuki/timit_asr_train_asr_raw_word_valid.acc.ave&#39;</span></span>
31
+ <span class="line"></span>
32
+ <span class="line"></span>
33
+ <span class="line"><span style="color:#6A9955;"># upload wav file and let&#39;s inference!</span></span>
34
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa</span></span>
35
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> google.colab </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> files</span></span>
36
+ <span class="line"></span>
37
+ <span class="line"><span style="color:#D4D4D4;">wav_file = files.upload()</span></span>
38
+ <span class="line"><span style="color:#D4D4D4;">y, sr = librosa.load(</span><span style="color:#4EC9B0;">list</span><span style="color:#D4D4D4;">(wav_file.keys())[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">], </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">)</span></span>
39
+ <span class="line"></span>
40
+ <span class="line"></span>
41
+ <span class="line"><span style="color:#6A9955;"># Use the exported onnx file to inference.</span></span>
42
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet_onnx </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
43
+ <span class="line"></span>
44
+ <span class="line"><span style="color:#D4D4D4;">speech2text = Speech2Text(tag_name)</span></span>
45
+ <span class="line"><span style="color:#D4D4D4;">nbest = speech2text(y)</span></span>
46
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(nbest[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">])</span></span>
47
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h1 id="using-streaming-model" tabindex="-1"><a class="header-anchor" href="#using-streaming-model"><span>Using streaming model</span></a></h1><p>Model exportation is exactly the same as non-streaming model. You can follow the <code>#Export your model</code> chapter.</p><p>As for streaming, you can specify the following configuration additionaly. Usually, these values should be the same as the training configuration.</p><ul><li>block_size</li><li>hop_size</li><li>look_ahead</li></ul><p>The length of the speech should be the same as <code>streaming_model.hop_size</code>. 
This value is calculated as follows</p><p>$$ \\begin{align} h &amp;= \\text{hop_size} * \\text{encoder.subsample} * \\text{stft.hop_length}\\ \\text{padding} &amp;= (\\text{stft.n_fft} // \\text{stft.hop_length}) * \\text{stft.hop_length} \\ \\text{len(wav)} &amp;= h + \\text{padding} \\end{align} $$</p><p>For example, the length of the speech is 8704 with the following configuration.</p><ul><li>block_size = 40</li><li>hop_size = 16</li><li>look_ahead = 16</li><li>encoder.subsample = 4</li><li>stft.n_fft = 512</li><li>stft.hop_length = 128</li></ul><p>Now, let&#39;s demonstrate the streaming inference.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># Export the streaming model.</span></span>
48
+ <span class="line"><span style="color:#6A9955;"># Note that the following model is very large</span></span>
49
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet_onnx.export </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> ModelExport</span></span>
50
+ <span class="line"></span>
51
+ <span class="line"><span style="color:#D4D4D4;">tag_name = </span><span style="color:#CE9178;">&#39;D-Keqi/espnet_asr_train_asr_streaming_transformer_raw_en_bpe500_sp_valid.acc.ave&#39;</span></span>
52
+ <span class="line"></span>
53
+ <span class="line"><span style="color:#D4D4D4;">m = ModelExport()</span></span>
54
+ <span class="line"><span style="color:#D4D4D4;">m.export_from_pretrained(tag_name)</span></span>
55
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># In this tutorial, we will use the recorded wav file to simulate streaming.</span></span>
56
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa</span></span>
57
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet_onnx </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> StreamingSpeech2Text</span></span>
58
+ <span class="line"></span>
59
+ <span class="line"><span style="color:#D4D4D4;">tag_name = </span><span style="color:#CE9178;">&#39;D-Keqi/espnet_asr_train_asr_streaming_transformer_raw_en_bpe500_sp_valid.acc.ave&#39;</span></span>
60
+ <span class="line"><span style="color:#D4D4D4;">streaming_model = StreamingSpeech2Text(tag_name)</span></span>
61
+ <span class="line"></span>
62
+ <span class="line"><span style="color:#6A9955;"># upload wav file</span></span>
63
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> google.colab </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> files</span></span>
64
+ <span class="line"><span style="color:#D4D4D4;">wav_file = files.upload()</span></span>
65
+ <span class="line"><span style="color:#D4D4D4;">y, sr = librosa.load(</span><span style="color:#4EC9B0;">list</span><span style="color:#D4D4D4;">(wav_file.keys())[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">], </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">)</span></span>
66
+ <span class="line"></span>
67
+ <span class="line"><span style="color:#D4D4D4;">num_process = </span><span style="color:#DCDCAA;">len</span><span style="color:#D4D4D4;">(y) // streaming_model.hop_size + </span><span style="color:#B5CEA8;">1</span></span>
68
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;I will split your audio file into </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">num_process</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;"> blocks.&quot;</span><span style="color:#D4D4D4;">)</span></span>
69
+ <span class="line"></span>
70
+ <span class="line"><span style="color:#6A9955;"># simulate streaming.</span></span>
71
+ <span class="line"><span style="color:#D4D4D4;">streaming_model.start()</span></span>
72
+ <span class="line"><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> i </span><span style="color:#C586C0;">in</span><span style="color:#DCDCAA;"> range</span><span style="color:#D4D4D4;">(num_process):</span></span>
73
+ <span class="line"><span style="color:#6A9955;"> # prepare wav file</span></span>
74
+ <span class="line"><span style="color:#D4D4D4;"> start = i * streaming_model.hop_size</span></span>
75
+ <span class="line"><span style="color:#D4D4D4;"> end = (i + </span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">) * streaming_model.hop_size</span></span>
76
+ <span class="line"><span style="color:#D4D4D4;"> wav_streaming = y[start : end]</span></span>
77
+ <span class="line"></span>
78
+ <span class="line"><span style="color:#6A9955;"> # apply padding if len(wav_streaming) &lt; streaming_model.hop_size</span></span>
79
+ <span class="line"><span style="color:#D4D4D4;"> wav_streaming = streaming_model.pad(wav_streaming)</span></span>
80
+ <span class="line"><span style="color:#D4D4D4;"> </span></span>
81
+ <span class="line"><span style="color:#6A9955;"> # compute asr</span></span>
82
+ <span class="line"><span style="color:#D4D4D4;"> nbest = streaming_model(wav_streaming)</span></span>
83
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&#39;Result at position </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">i</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;"> : </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">nbest[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&#39;</span><span style="color:#D4D4D4;">)</span></span>
84
+ <span class="line"></span>
85
+ <span class="line"><span style="color:#D4D4D4;">final_nbest = streaming_model.end()</span></span>
86
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&#39;Final result : </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">final_nbest[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&#39;</span><span style="color:#D4D4D4;">)</span></span>
87
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div>`,29);function u(h,_){const e=l("ExternalLinkIcon");return o(),p("div",null,[c,d,m,D,s("p",null,[n("Author: "),s("a",v,[n("Masao Someki"),i(e)])]),y])}const x=a(r,[["render",u],["__file","onnx_conversion_demo.html.vue"]]),f=JSON.parse('{"path":"/espnet2/others/onnx_conversion_demo.html","title":"espnet_onnx demonstration","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Table of Contents","slug":"table-of-contents","link":"#table-of-contents","children":[]},{"level":2,"title":"Export model from espnet_model_zoo","slug":"export-model-from-espnet-model-zoo","link":"#export-model-from-espnet-model-zoo","children":[]},{"level":2,"title":"Export from custom model","slug":"export-from-custom-model","link":"#export-from-custom-model","children":[]}],"git":{},"filePathRelative":"espnet2/others/onnx_conversion_demo.md"}');export{x as comp,f as data};
assets/pretrained.html-JpE__EKJ.js ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as t,r as l,o as p,c as d,a as e,d as s,b as a,e as i}from"./app-DTS6SjJz.js";const c={},r=e("h1",{id:"pretrained-model",tabindex:"-1"},[e("a",{class:"header-anchor",href:"#pretrained-model"},[e("span",null,"Pretrained Model")])],-1),o=e("p",null,"This is the example notebook of how-to-recognize and -synthesize speech using the ESPnet models.",-1),u=e("p",null,"See also:",-1),m=e("ul",null,[e("li",null,"Tutorial: https://github.com/espnet/espnet/blob/master/doc/tutorial.md"),e("li",null,"Github: https://github.com/espnet")],-1),h={href:"https://github.com/takenori-y",target:"_blank",rel:"noopener noreferrer"},v=i(`<p>Last update: 2019/07/28</p><h2 id="setup-envrionment" tabindex="-1"><a class="header-anchor" href="#setup-envrionment"><span>Setup envrionment</span></a></h2><p>Let&#39;s setup the environmet for the demonstration. It takes around 10 minues. Please keep waiting for a while.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># OS setup</span></span>
2
+ <span class="line"><span>!sudo apt-get install bc tree sox</span></span>
3
+ <span class="line"><span>!cat /etc/os-release</span></span>
4
+ <span class="line"><span></span></span>
5
+ <span class="line"><span># espnet setup</span></span>
6
+ <span class="line"><span>!git clone https://github.com/espnet/espnet</span></span>
7
+ <span class="line"><span>!cd espnet; pip install -e .</span></span>
8
+ <span class="line"><span></span></span>
9
+ <span class="line"><span># warp ctc setup</span></span>
10
+ <span class="line"><span>!git clone https://github.com/espnet/warp-ctc -b pytorch-1.1</span></span>
11
+ <span class="line"><span>!cd warp-ctc &amp;&amp; mkdir build &amp;&amp; cd build &amp;&amp; cmake .. &amp;&amp; make -j</span></span>
12
+ <span class="line"><span>!cd warp-ctc/pytorch_binding &amp;&amp; python setup.py install </span></span>
13
+ <span class="line"><span></span></span>
14
+ <span class="line"><span># kaldi setup</span></span>
15
+ <span class="line"><span>!cd /content/espnet/tools; git clone https://github.com/kaldi-asr/kaldi</span></span>
16
+ <span class="line"><span>!echo &quot;&quot; &gt; ./espnet/tools/kaldi/tools/extras/check_dependencies.sh</span></span>
17
+ <span class="line"><span>!chmod +x ./espnet/tools/kaldi/tools/extras/check_dependencies.sh</span></span>
18
+ <span class="line"><span>!cd ./espnet/tools/kaldi/tools; make sph2pipe sclite</span></span>
19
+ <span class="line"><span>!rm -rf espnet/tools/kaldi/tools/python</span></span>
20
+ <span class="line"><span>!wget https://18-198329952-gh.circle-artifacts.com/0/home/circleci/repo/ubuntu16-featbin.tar.gz</span></span>
21
+ <span class="line"><span>!tar -xf ./ubuntu16-featbin.tar.gz</span></span>
22
+ <span class="line"><span>!cp featbin/* espnet/tools/kaldi/src/featbin/</span></span>
23
+ <span class="line"><span></span></span>
24
+ <span class="line"><span># sentencepiece setup</span></span>
25
+ <span class="line"><span>!cd espnet/tools; make sentencepiece.done</span></span>
26
+ <span class="line"><span></span></span>
27
+ <span class="line"><span># make dummy activate</span></span>
28
+ <span class="line"><span>!mkdir -p espnet/tools/venv/bin</span></span>
29
+ <span class="line"><span>!touch espnet/tools/venv/bin/activate</span></span>
30
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="recognize-speech-using-pretrained-models" tabindex="-1"><a class="header-anchor" href="#recognize-speech-using-pretrained-models"><span>Recognize speech using pretrained models</span></a></h2><p>Let&#39;s recognize 7-minutes long audio speech as an example. Go to a recipe directory and run <code>recog_wav.sh</code> at the directory.</p>`,6),b={href:"https://github.com/espnet/espnet#asr-demo",target:"_blank",rel:"noopener noreferrer"},g=i(`<div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cd espnet/egs/tedlium2/asr1; bash ../../../utils/recog_wav.sh --models tedlium2.tacotron2.v1</span></span>
31
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>You can see the progress of the recognition.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cat espnet/egs/tedlium2/asr1/decode/TomWujec_2010U/log/decode.log</span></span>
32
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>You can change E2E model, language model, decoding parameters, etc. For the detail, see <code>recog_wav.sh</code>.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cat espnet/utils/recog_wav.sh</span></span>
33
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h2 id="synthesize-speech-using-pretrained-models" tabindex="-1"><a class="header-anchor" href="#synthesize-speech-using-pretrained-models"><span>Synthesize speech using pretrained models</span></a></h2><p>Let&#39;s synthesize speech using an E2E model. Go to a recipe directory and run <code>synth_wav.sh</code> at the directory.</p>`,7),x={href:"https://github.com/espnet/espnet#tts-demo",target:"_blank",rel:"noopener noreferrer"},k=i(`<div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cd espnet/egs/ljspeech/tts1; \\</span></span>
34
+ <span class="line"><span>echo &quot;THIS IS A DEMONSTRATION OF TEXT TO SPEECH.&quot; &gt; example.txt; \\</span></span>
35
+ <span class="line"><span>bash ../../../utils/synth_wav.sh --models ljspeech.tacotron2.v1 example.txt</span></span>
36
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Let&#39;s listen the synthesized speech!</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from google.colab import files</span></span>
37
+ <span class="line"><span></span></span>
38
+ <span class="line"><span>files.download(&#39;espnet/egs/ljspeech/tts1/decode/example/wav/example.wav&#39;)</span></span>
39
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>You can change E2E model, decoding parameters, etc. For the detail, see <code>synth_wav.sh</code>.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cat espnet/utils/synth_wav.sh</span></span>
40
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>We have a web storage to put your good trained models. If you want, please contact Shinji Watanabe <a href="mailto:shinjiw@ieee.org">shinjiw@ieee.org</a>.</p>`,6);function _(E,f){const n=l("ExternalLinkIcon");return p(),d("div",null,[r,o,u,m,e("p",null,[s("Author: "),e("a",h,[s("Takenori Yoshimura"),a(n)])]),v,e("p",null,[s("Available models are summarized "),e("a",b,[s("here"),a(n)]),s(".")]),g,e("p",null,[s("Available models are summarized "),e("a",x,[s("here"),a(n)]),s(".")]),k])}const w=t(c,[["render",_],["__file","pretrained.html.vue"]]),D=JSON.parse('{"path":"/espnet2/others/pretrained.html","title":"Pretrained Model","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Setup envrionment","slug":"setup-envrionment","link":"#setup-envrionment","children":[]},{"level":2,"title":"Recognize speech using pretrained models","slug":"recognize-speech-using-pretrained-models","link":"#recognize-speech-using-pretrained-models","children":[]},{"level":2,"title":"Synthesize speech using pretrained models","slug":"synthesize-speech-using-pretrained-models","link":"#synthesize-speech-using-pretrained-models","children":[]}],"git":{},"filePathRelative":"espnet2/others/pretrained.md"}');export{w as comp,D as data};
assets/se_demo.html-DY-mv2y8.js ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as l,r as p,o as d,c as t,a as s,b as e,d as n,e as i}from"./app-DTS6SjJz.js";const r={},c=s("h1",{id:"espnet-speech-enhancement-demonstration",tabindex:"-1"},[s("a",{class:"header-anchor",href:"#espnet-speech-enhancement-demonstration"},[s("span",null,"ESPnet Speech Enhancement Demonstration")])],-1),o={href:"https://colab.research.google.com/drive/1fjRJCh96SoYLZPRxsjF9VDv4Q2VoIckI?usp=sharing",target:"_blank",rel:"noopener noreferrer"},u=s("img",{src:"https://colab.research.google.com/assets/colab-badge.svg",alt:"Open In Colab"},null,-1),v=s("p",null,"This notebook provides a demonstration of the speech enhancement and separation using ESPnet2-SE.",-1),m=s("ul",null,[s("li",null,"ESPnet2-SE: https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/enh1")],-1),b={href:"https://github.com/LiChenda",target:"_blank",rel:"noopener noreferrer"},h={href:"https://github.com/Emrys365",target:"_blank",rel:"noopener noreferrer"},_=i(`<h2 id="install" tabindex="-1"><a class="header-anchor" href="#install"><span>Install</span></a></h2><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>%pip install -q espnet==0.10.1</span></span>
2
+ <span class="line"><span>%pip install -q espnet_model_zoo</span></span>
3
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="speech-enhancement" tabindex="-1"><a class="header-anchor" href="#speech-enhancement"><span>Speech Enhancement</span></a></h2><h3 id="single-channel-enhancement-the-chime-example" tabindex="-1"><a class="header-anchor" href="#single-channel-enhancement-the-chime-example"><span>Single-Channel Enhancement, the CHiME example</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Download one utterance from real noisy speech of CHiME4</span></span>
4
+ <span class="line"><span>!gdown --id 1SmrN5NFSg6JuQSs2sfy3ehD8OIcqK6wS -O /content/M05_440C0213_PED_REAL.wav</span></span>
5
+ <span class="line"><span>import os</span></span>
6
+ <span class="line"><span></span></span>
7
+ <span class="line"><span>import soundfile</span></span>
8
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
9
+ <span class="line"><span>mixwav_mc, sr = soundfile.read(&quot;/content/M05_440C0213_PED_REAL.wav&quot;)</span></span>
10
+ <span class="line"><span># mixwav.shape: num_samples, num_channels</span></span>
11
+ <span class="line"><span>mixwav_sc = mixwav_mc[:,4]</span></span>
12
+ <span class="line"><span>display(Audio(mixwav_mc.T, rate=sr))</span></span>
13
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="download-and-load-the-pretrained-conv-tasnet" tabindex="-1"><a class="header-anchor" href="#download-and-load-the-pretrained-conv-tasnet"><span>Download and load the pretrained Conv-Tasnet</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!gdown --id 17DMWdw84wF3fz3t7ia1zssdzhkpVQGZm -O /content/chime_tasnet_singlechannel.zip</span></span>
14
+ <span class="line"><span>!unzip /content/chime_tasnet_singlechannel.zip -d /content/enh_model_sc</span></span>
15
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Load the model</span></span>
16
+ <span class="line"><span># If you encounter error &quot;No module named &#39;espnet2&#39;&quot;, please re-run the 1st Cell. This might be a colab bug.</span></span>
17
+ <span class="line"><span>import sys</span></span>
18
+ <span class="line"><span>import soundfile</span></span>
19
+ <span class="line"><span>from espnet2.bin.enh_inference import SeparateSpeech</span></span>
20
+ <span class="line"><span></span></span>
21
+ <span class="line"><span></span></span>
22
+ <span class="line"><span>separate_speech = {}</span></span>
23
+ <span class="line"><span># For models downloaded from GoogleDrive, you can use the following script:</span></span>
24
+ <span class="line"><span>enh_model_sc = SeparateSpeech(</span></span>
25
+ <span class="line"><span> enh_train_config=&quot;/content/enh_model_sc/exp/enh_train_enh_conv_tasnet_raw/config.yaml&quot;,</span></span>
26
+ <span class="line"><span> enh_model_file=&quot;/content/enh_model_sc/exp/enh_train_enh_conv_tasnet_raw/5epoch.pth&quot;,</span></span>
27
+ <span class="line"><span> # for segment-wise process on long speech</span></span>
28
+ <span class="line"><span> normalize_segment_scale=False,</span></span>
29
+ <span class="line"><span> show_progressbar=True,</span></span>
30
+ <span class="line"><span> ref_channel=4,</span></span>
31
+ <span class="line"><span> normalize_output_wav=True,</span></span>
32
+ <span class="line"><span> device=&quot;cuda:0&quot;,</span></span>
33
+ <span class="line"><span>)</span></span>
34
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="enhance-the-single-channel-real-noisy-speech-in-chime4" tabindex="-1"><a class="header-anchor" href="#enhance-the-single-channel-real-noisy-speech-in-chime4"><span>Enhance the single-channel real noisy speech in CHiME4</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># play the enhanced single-channel speech</span></span>
35
+ <span class="line"><span>wave = enh_model_sc(mixwav_sc[None, ...], sr)</span></span>
36
+ <span class="line"><span>print(&quot;Input real noisy speech&quot;, flush=True)</span></span>
37
+ <span class="line"><span>display(Audio(mixwav_sc, rate=sr))</span></span>
38
+ <span class="line"><span>print(&quot;Enhanced speech&quot;, flush=True)</span></span>
39
+ <span class="line"><span>display(Audio(wave[0].squeeze(), rate=sr))</span></span>
40
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="enhance-your-own-pre-recordings" tabindex="-1"><a class="header-anchor" href="#enhance-your-own-pre-recordings"><span>Enhance your own pre-recordings</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from google.colab import files</span></span>
41
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
42
+ <span class="line"><span>import soundfile</span></span>
43
+ <span class="line"><span></span></span>
44
+ <span class="line"><span>uploaded = files.upload()</span></span>
45
+ <span class="line"><span></span></span>
46
+ <span class="line"><span>for file_name in uploaded.keys():</span></span>
47
+ <span class="line"><span> speech, rate = soundfile.read(file_name)</span></span>
48
+ <span class="line"><span> assert rate == sr, &quot;mismatch in sampling rate&quot;</span></span>
49
+ <span class="line"><span> wave = enh_model_sc(speech[None, ...], sr)</span></span>
50
+ <span class="line"><span> print(f&quot;Your input speech {file_name}&quot;, flush=True)</span></span>
51
+ <span class="line"><span> display(Audio(speech, rate=sr))</span></span>
52
+ <span class="line"><span> print(f&quot;Enhanced speech for {file_name}&quot;, flush=True)</span></span>
53
+ <span class="line"><span> display(Audio(wave[0].squeeze(), rate=sr))</span></span>
54
+ <span class="line"><span></span></span>
55
+ <span class="line"><span></span></span>
56
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="multi-channel-enhancement" tabindex="-1"><a class="header-anchor" href="#multi-channel-enhancement"><span>Multi-Channel Enhancement</span></a></h3><h4 id="download-and-load-the-pretrained-mvdr-neural-beamformer" tabindex="-1"><a class="header-anchor" href="#download-and-load-the-pretrained-mvdr-neural-beamformer"><span>Download and load the pretrained mvdr neural beamformer.</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Download the pretained enhancement model</span></span>
57
+ <span class="line"><span></span></span>
58
+ <span class="line"><span>!gdown --id 1FohDfBlOa7ipc9v2luY-QIFQ_GJ1iW_i -O /content/mvdr_beamformer_16k_se_raw_valid.zip</span></span>
59
+ <span class="line"><span>!unzip /content/mvdr_beamformer_16k_se_raw_valid.zip -d /content/enh_model_mc </span></span>
60
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Load the model</span></span>
61
+ <span class="line"><span># If you encounter error &quot;No module named &#39;espnet2&#39;&quot;, please re-run the 1st Cell. This might be a colab bug.</span></span>
62
+ <span class="line"><span>import sys</span></span>
63
+ <span class="line"><span>import soundfile</span></span>
64
+ <span class="line"><span>from espnet2.bin.enh_inference import SeparateSpeech</span></span>
65
+ <span class="line"><span></span></span>
66
+ <span class="line"><span></span></span>
67
+ <span class="line"><span>separate_speech = {}</span></span>
68
+ <span class="line"><span># For models downloaded from GoogleDrive, you can use the following script:</span></span>
69
+ <span class="line"><span>enh_model_mc = SeparateSpeech(</span></span>
70
+ <span class="line"><span> enh_train_config=&quot;/content/enh_model_mc/exp/enh_train_enh_beamformer_mvdr_raw/config.yaml&quot;,</span></span>
71
+ <span class="line"><span> enh_model_file=&quot;/content/enh_model_mc/exp/enh_train_enh_beamformer_mvdr_raw/11epoch.pth&quot;,</span></span>
72
+ <span class="line"><span> # for segment-wise process on long speech</span></span>
73
+ <span class="line"><span> normalize_segment_scale=False,</span></span>
74
+ <span class="line"><span> show_progressbar=True,</span></span>
75
+ <span class="line"><span> ref_channel=4,</span></span>
76
+ <span class="line"><span> normalize_output_wav=True,</span></span>
77
+ <span class="line"><span> device=&quot;cuda:0&quot;,</span></span>
78
+ <span class="line"><span>)</span></span>
79
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="enhance-the-multi-channel-real-noisy-speech-in-chime4" tabindex="-1"><a class="header-anchor" href="#enhance-the-multi-channel-real-noisy-speech-in-chime4"><span>Enhance the multi-channel real noisy speech in CHiME4</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>wave = enh_model_mc(mixwav_mc[None, ...], sr)</span></span>
80
+ <span class="line"><span>print(&quot;Input real noisy speech&quot;, flush=True)</span></span>
81
+ <span class="line"><span>display(Audio(mixwav_mc.T, rate=sr))</span></span>
82
+ <span class="line"><span>print(&quot;Enhanced speech&quot;, flush=True)</span></span>
83
+ <span class="line"><span>display(Audio(wave[0].squeeze(), rate=sr))</span></span>
84
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="speech-separation" tabindex="-1"><a class="header-anchor" href="#speech-separation"><span>Speech Separation</span></a></h2><h3 id="model-selection" tabindex="-1"><a class="header-anchor" href="#model-selection"><span>Model Selection</span></a></h3>`,20),f={href:"https://github.com/espnet/espnet_model_zoo/blob/master/espnet_model_zoo/table.csv",target:"_blank",rel:"noopener noreferrer"},x=i(`<p>In this demonstration, we will show different speech separation models on wsj0_2mix.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>#@title Choose Speech Separation model { run: &quot;auto&quot; }</span></span>
85
+ <span class="line"><span></span></span>
86
+ <span class="line"><span>fs = 8000 #@param {type:&quot;integer&quot;}</span></span>
87
+ <span class="line"><span>tag = &quot;Chenda Li/wsj0_2mix_enh_train_enh_conv_tasnet_raw_valid.si_snr.ave&quot; #@param [&quot;Chenda Li/wsj0_2mix_enh_train_enh_conv_tasnet_raw_valid.si_snr.ave&quot;, &quot;Chenda Li/wsj0_2mix_enh_train_enh_rnn_tf_raw_valid.si_snr.ave&quot;, &quot;https://zenodo.org/record/4688000/files/enh_train_enh_dprnn_tasnet_raw_valid.si_snr.ave.zip&quot;]</span></span>
88
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># For models uploaded to Zenodo, you can use the following python script instead:</span></span>
89
+ <span class="line"><span>import sys</span></span>
90
+ <span class="line"><span>import soundfile</span></span>
91
+ <span class="line"><span>from espnet_model_zoo.downloader import ModelDownloader</span></span>
92
+ <span class="line"><span>from espnet2.bin.enh_inference import SeparateSpeech</span></span>
93
+ <span class="line"><span></span></span>
94
+ <span class="line"><span>d = ModelDownloader()</span></span>
95
+ <span class="line"><span></span></span>
96
+ <span class="line"><span>cfg = d.download_and_unpack(tag)</span></span>
97
+ <span class="line"><span>separate_speech = SeparateSpeech(</span></span>
98
+ <span class="line"><span> enh_train_config=cfg[&quot;train_config&quot;],</span></span>
99
+ <span class="line"><span> enh_model_file=cfg[&quot;model_file&quot;],</span></span>
100
+ <span class="line"><span> # for segment-wise process on long speech</span></span>
101
+ <span class="line"><span> segment_size=2.4,</span></span>
102
+ <span class="line"><span> hop_size=0.8,</span></span>
103
+ <span class="line"><span> normalize_segment_scale=False,</span></span>
104
+ <span class="line"><span> show_progressbar=True,</span></span>
105
+ <span class="line"><span> ref_channel=None,</span></span>
106
+ <span class="line"><span> normalize_output_wav=True,</span></span>
107
+ <span class="line"><span> device=&quot;cuda:0&quot;,</span></span>
108
+ <span class="line"><span>)</span></span>
109
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="separate-speech-mixture" tabindex="-1"><a class="header-anchor" href="#separate-speech-mixture"><span>Separate Speech Mixture</span></a></h3><h4 id="separate-the-example-in-wsj0-2mix-testing-set" tabindex="-1"><a class="header-anchor" href="#separate-the-example-in-wsj0-2mix-testing-set"><span>Separate the example in wsj0_2mix testing set</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!gdown --id 1ZCUkd_Lb7pO2rpPr4FqYdtJBZ7JMiInx -O /content/447c020t_1.2106_422a0112_-1.2106.wav</span></span>
110
+ <span class="line"><span></span></span>
111
+ <span class="line"><span>import os</span></span>
112
+ <span class="line"><span>import soundfile</span></span>
113
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
114
+ <span class="line"><span></span></span>
115
+ <span class="line"><span>mixwav, sr = soundfile.read(&quot;447c020t_1.2106_422a0112_-1.2106.wav&quot;)</span></span>
116
+ <span class="line"><span>waves_wsj = separate_speech(mixwav[None, ...], fs=sr)</span></span>
117
+ <span class="line"><span></span></span>
118
+ <span class="line"><span>print(&quot;Input mixture&quot;, flush=True)</span></span>
119
+ <span class="line"><span>display(Audio(mixwav, rate=sr))</span></span>
120
+ <span class="line"><span>print(f&quot;========= Separated speech with model {tag} =========&quot;, flush=True)</span></span>
121
+ <span class="line"><span>print(&quot;Separated spk1&quot;, flush=True)</span></span>
122
+ <span class="line"><span>display(Audio(waves_wsj[0].squeeze(), rate=sr))</span></span>
123
+ <span class="line"><span>print(&quot;Separated spk2&quot;, flush=True)</span></span>
124
+ <span class="line"><span>display(Audio(waves_wsj[1].squeeze(), rate=sr))</span></span>
125
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="separate-your-own-recordings" tabindex="-1"><a class="header-anchor" href="#separate-your-own-recordings"><span>Separate your own recordings</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from google.colab import files</span></span>
126
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
127
+ <span class="line"><span>import soundfile</span></span>
128
+ <span class="line"><span></span></span>
129
+ <span class="line"><span>uploaded = files.upload()</span></span>
130
+ <span class="line"><span></span></span>
131
+ <span class="line"><span>for file_name in uploaded.keys():</span></span>
132
+ <span class="line"><span> mixwav_yours, rate = soundfile.read(file_name)</span></span>
133
+ <span class="line"><span> assert rate == sr, &quot;mismatch in sampling rate&quot;</span></span>
134
+ <span class="line"><span> waves_yours = separate_speech(mixwav_yours[None, ...], fs=sr)</span></span>
135
+ <span class="line"><span> print(&quot;Input mixture&quot;, flush=True)</span></span>
136
+ <span class="line"><span> display(Audio(mixwav_yours, rate=sr))</span></span>
137
+ <span class="line"><span> print(f&quot;========= Separated speech with model {tag} =========&quot;, flush=True)</span></span>
138
+ <span class="line"><span> print(&quot;Separated spk1&quot;, flush=True)</span></span>
139
+ <span class="line"><span> display(Audio(waves_yours[0].squeeze(), rate=sr))</span></span>
140
+ <span class="line"><span> print(&quot;Separated spk2&quot;, flush=True)</span></span>
141
+ <span class="line"><span> display(Audio(waves_yours[1].squeeze(), rate=sr))</span></span>
142
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="show-spectrums-of-separated-speech" tabindex="-1"><a class="header-anchor" href="#show-spectrums-of-separated-speech"><span>Show spectrums of separated speech</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import matplotlib.pyplot as plt</span></span>
143
+ <span class="line"><span>import torch</span></span>
144
+ <span class="line"><span>from torch_complex.tensor import ComplexTensor</span></span>
145
+ <span class="line"><span></span></span>
146
+ <span class="line"><span>from espnet.asr.asr_utils import plot_spectrogram</span></span>
147
+ <span class="line"><span>from espnet2.layers.stft import Stft</span></span>
148
+ <span class="line"><span></span></span>
149
+ <span class="line"><span></span></span>
150
+ <span class="line"><span>stft = Stft(</span></span>
151
+ <span class="line"><span> n_fft=512,</span></span>
152
+ <span class="line"><span> win_length=None,</span></span>
153
+ <span class="line"><span> hop_length=128,</span></span>
154
+ <span class="line"><span> window=&quot;hann&quot;,</span></span>
155
+ <span class="line"><span>)</span></span>
156
+ <span class="line"><span>ilens = torch.LongTensor([len(mixwav)])</span></span>
157
+ <span class="line"><span># specs: (T, F)</span></span>
158
+ <span class="line"><span>spec_mix = ComplexTensor(</span></span>
159
+ <span class="line"><span> *torch.unbind(</span></span>
160
+ <span class="line"><span> stft(torch.as_tensor(mixwav).unsqueeze(0), ilens)[0].squeeze(),</span></span>
161
+ <span class="line"><span> dim=-1</span></span>
162
+ <span class="line"><span> )</span></span>
163
+ <span class="line"><span>)</span></span>
164
+ <span class="line"><span>spec_sep1 = ComplexTensor(</span></span>
165
+ <span class="line"><span> *torch.unbind(</span></span>
166
+ <span class="line"><span> stft(torch.as_tensor(waves_wsj[0]), ilens)[0].squeeze(),</span></span>
167
+ <span class="line"><span> dim=-1</span></span>
168
+ <span class="line"><span> )</span></span>
169
+ <span class="line"><span>)</span></span>
170
+ <span class="line"><span>spec_sep2 = ComplexTensor(</span></span>
171
+ <span class="line"><span> *torch.unbind(</span></span>
172
+ <span class="line"><span> stft(torch.as_tensor(waves_wsj[1]), ilens)[0].squeeze(),</span></span>
173
+ <span class="line"><span> dim=-1</span></span>
174
+ <span class="line"><span> )</span></span>
175
+ <span class="line"><span>)</span></span>
176
+ <span class="line"><span></span></span>
177
+ <span class="line"><span># freqs = torch.linspace(0, sr / 2, spec_mix.shape[1])</span></span>
178
+ <span class="line"><span># frames = torch.linspace(0, len(mixwav) / sr, spec_mix.shape[0])</span></span>
179
+ <span class="line"><span>samples = torch.linspace(0, len(mixwav) / sr, len(mixwav))</span></span>
180
+ <span class="line"><span>plt.figure(figsize=(24, 12))</span></span>
181
+ <span class="line"><span>plt.subplot(3, 2, 1)</span></span>
182
+ <span class="line"><span>plt.title(&#39;Mixture Spectrogram&#39;)</span></span>
183
+ <span class="line"><span>plot_spectrogram(</span></span>
184
+ <span class="line"><span> plt, abs(spec_mix).transpose(-1, -2).numpy(), fs=sr,</span></span>
185
+ <span class="line"><span> mode=&#39;db&#39;, frame_shift=None,</span></span>
186
+ <span class="line"><span> bottom=False, labelbottom=False</span></span>
187
+ <span class="line"><span>)</span></span>
188
+ <span class="line"><span>plt.subplot(3, 2, 2)</span></span>
189
+ <span class="line"><span>plt.title(&#39;Mixture Wavform&#39;)</span></span>
190
+ <span class="line"><span>plt.plot(samples, mixwav)</span></span>
191
+ <span class="line"><span>plt.xlim(0, len(mixwav) / sr)</span></span>
192
+ <span class="line"><span></span></span>
193
+ <span class="line"><span>plt.subplot(3, 2, 3)</span></span>
194
+ <span class="line"><span>plt.title(&#39;Separated Spectrogram (spk1)&#39;)</span></span>
195
+ <span class="line"><span>plot_spectrogram(</span></span>
196
+ <span class="line"><span> plt, abs(spec_sep1).transpose(-1, -2).numpy(), fs=sr,</span></span>
197
+ <span class="line"><span> mode=&#39;db&#39;, frame_shift=None,</span></span>
198
+ <span class="line"><span> bottom=False, labelbottom=False</span></span>
199
+ <span class="line"><span>)</span></span>
200
+ <span class="line"><span>plt.subplot(3, 2, 4)</span></span>
201
+ <span class="line"><span>plt.title(&#39;Separated Wavform (spk1)&#39;)</span></span>
202
+ <span class="line"><span>plt.plot(samples, waves_wsj[0].squeeze())</span></span>
203
+ <span class="line"><span>plt.xlim(0, len(mixwav) / sr)</span></span>
204
+ <span class="line"><span></span></span>
205
+ <span class="line"><span>plt.subplot(3, 2, 5)</span></span>
206
+ <span class="line"><span>plt.title(&#39;Separated Spectrogram (spk2)&#39;)</span></span>
207
+ <span class="line"><span>plot_spectrogram(</span></span>
208
+ <span class="line"><span> plt, abs(spec_sep2).transpose(-1, -2).numpy(), fs=sr,</span></span>
209
+ <span class="line"><span> mode=&#39;db&#39;, frame_shift=None,</span></span>
210
+ <span class="line"><span> bottom=False, labelbottom=False</span></span>
211
+ <span class="line"><span>)</span></span>
212
+ <span class="line"><span>plt.subplot(3, 2, 6)</span></span>
213
+ <span class="line"><span>plt.title(&#39;Separated Wavform (spk2)&#39;)</span></span>
214
+ <span class="line"><span>plt.plot(samples, waves_wsj[1].squeeze())</span></span>
215
+ <span class="line"><span>plt.xlim(0, len(mixwav) / sr)</span></span>
216
+ <span class="line"><span>plt.xlabel(&quot;Time (s)&quot;)</span></span>
217
+ <span class="line"><span>plt.show()</span></span>
218
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div 
class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="evluate-separated-speech-with-pretrained-asr-model" tabindex="-1"><a class="header-anchor" href="#evluate-separated-speech-with-pretrained-asr-model"><span>Evluate separated speech with pretrained ASR model</span></a></h2><p>The ground truths are:</p><p><code>text_1: SOME CRITICS INCLUDING HIGH REAGAN ADMINISTRATION OFFICIALS ARE RAISING THE ALARM THAT THE FED&#39;S POLICY IS TOO TIGHT AND COULD CAUSE A RECESSION NEXT YEAR</code></p><p><code>text_2: THE UNITED STATES UNDERTOOK TO DEFEND WESTERN EUROPE AGAINST SOVIET ATTACK</code></p><p>(This may take a while for the speech recognition.)</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import espnet_model_zoo</span></span>
219
+ <span class="line"><span>from espnet_model_zoo.downloader import ModelDownloader</span></span>
220
+ <span class="line"><span>from espnet2.bin.asr_inference import Speech2Text</span></span>
221
+ <span class="line"><span></span></span>
222
+ <span class="line"><span>wsj_8k_model_url=&quot;https://zenodo.org/record/4012264/files/asr_train_asr_transformer_raw_char_1gpu_valid.acc.ave.zip?download=1&quot;</span></span>
223
+ <span class="line"><span></span></span>
224
+ <span class="line"><span>d = ModelDownloader()</span></span>
225
+ <span class="line"><span>speech2text = Speech2Text(</span></span>
226
+ <span class="line"><span> **d.download_and_unpack(wsj_8k_model_url),</span></span>
227
+ <span class="line"><span> device=&quot;cuda:0&quot;,</span></span>
228
+ <span class="line"><span>)</span></span>
229
+ <span class="line"><span></span></span>
230
+ <span class="line"><span>text_est = [None, None]</span></span>
231
+ <span class="line"><span>text_est[0], *_ = speech2text(waves_wsj[0].squeeze())[0]</span></span>
232
+ <span class="line"><span>text_est[1], *_ = speech2text(waves_wsj[1].squeeze())[0]</span></span>
233
+ <span class="line"><span>text_m, *_ = speech2text(mixwav)[0]</span></span>
234
+ <span class="line"><span>print(&quot;Mix Speech to Text: &quot;, text_m)</span></span>
235
+ <span class="line"><span>print(&quot;Separated Speech 1 to Text: &quot;, text_est[0])</span></span>
236
+ <span class="line"><span>print(&quot;Separated Speech 2 to Text: &quot;, text_est[1])</span></span>
237
+ <span class="line"><span></span></span>
238
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import difflib</span></span>
239
+ <span class="line"><span>from itertools import permutations</span></span>
240
+ <span class="line"><span></span></span>
241
+ <span class="line"><span>import editdistance</span></span>
242
+ <span class="line"><span>import numpy as np</span></span>
243
+ <span class="line"><span></span></span>
244
+ <span class="line"><span>colors = dict(</span></span>
245
+ <span class="line"><span> red=lambda text: f&quot;\\033[38;2;255;0;0m{text}\\033[0m&quot; if text else &quot;&quot;,</span></span>
246
+ <span class="line"><span> green=lambda text: f&quot;\\033[38;2;0;255;0m{text}\\033[0m&quot; if text else &quot;&quot;,</span></span>
247
+ <span class="line"><span> yellow=lambda text: f&quot;\\033[38;2;225;225;0m{text}\\033[0m&quot; if text else &quot;&quot;,</span></span>
248
+ <span class="line"><span> white=lambda text: f&quot;\\033[38;2;255;255;255m{text}\\033[0m&quot; if text else &quot;&quot;,</span></span>
249
+ <span class="line"><span> black=lambda text: f&quot;\\033[38;2;0;0;0m{text}\\033[0m&quot; if text else &quot;&quot;,</span></span>
250
+ <span class="line"><span>)</span></span>
251
+ <span class="line"><span></span></span>
252
+ <span class="line"><span>def diff_strings(ref, est):</span></span>
253
+ <span class="line"><span> &quot;&quot;&quot;Reference: https://stackoverflow.com/a/64404008/7384873&quot;&quot;&quot;</span></span>
254
+ <span class="line"><span> ref_str, est_str, err_str = [], [], []</span></span>
255
+ <span class="line"><span> matcher = difflib.SequenceMatcher(None, ref, est)</span></span>
256
+ <span class="line"><span> for opcode, a0, a1, b0, b1 in matcher.get_opcodes():</span></span>
257
+ <span class="line"><span> if opcode == &quot;equal&quot;:</span></span>
258
+ <span class="line"><span> txt = ref[a0:a1]</span></span>
259
+ <span class="line"><span> ref_str.append(txt)</span></span>
260
+ <span class="line"><span> est_str.append(txt)</span></span>
261
+ <span class="line"><span> err_str.append(&quot; &quot; * (a1 - a0))</span></span>
262
+ <span class="line"><span> elif opcode == &quot;insert&quot;:</span></span>
263
+ <span class="line"><span> ref_str.append(&quot;*&quot; * (b1 - b0))</span></span>
264
+ <span class="line"><span> est_str.append(colors[&quot;green&quot;](est[b0:b1]))</span></span>
265
+ <span class="line"><span> err_str.append(colors[&quot;black&quot;](&quot;I&quot; * (b1 - b0)))</span></span>
266
+ <span class="line"><span> elif opcode == &quot;delete&quot;:</span></span>
267
+ <span class="line"><span> ref_str.append(ref[a0:a1])</span></span>
268
+ <span class="line"><span> est_str.append(colors[&quot;red&quot;](&quot;*&quot; * (a1 - a0)))</span></span>
269
+ <span class="line"><span> err_str.append(colors[&quot;black&quot;](&quot;D&quot; * (a1 - a0)))</span></span>
270
+ <span class="line"><span> elif opcode == &quot;replace&quot;:</span></span>
271
+ <span class="line"><span> diff = a1 - a0 - b1 + b0</span></span>
272
+ <span class="line"><span> if diff &gt;= 0:</span></span>
273
+ <span class="line"><span> txt_ref = ref[a0:a1]</span></span>
274
+ <span class="line"><span> txt_est = colors[&quot;yellow&quot;](est[b0:b1]) + colors[&quot;red&quot;](&quot;*&quot; * diff)</span></span>
275
+ <span class="line"><span> txt_err = &quot;S&quot; * (b1 - b0) + &quot;D&quot; * diff</span></span>
276
+ <span class="line"><span> elif diff &lt; 0:</span></span>
277
+ <span class="line"><span> txt_ref = ref[a0:a1] + &quot;*&quot; * -diff</span></span>
278
+ <span class="line"><span> txt_est = colors[&quot;yellow&quot;](est[b0:b1]) + colors[&quot;green&quot;](&quot;*&quot; * -diff)</span></span>
279
+ <span class="line"><span> txt_err = &quot;S&quot; * (b1 - b0) + &quot;I&quot; * -diff</span></span>
280
+ <span class="line"><span></span></span>
281
+ <span class="line"><span> ref_str.append(txt_ref)</span></span>
282
+ <span class="line"><span> est_str.append(txt_est)</span></span>
283
+ <span class="line"><span> err_str.append(colors[&quot;black&quot;](txt_err))</span></span>
284
+ <span class="line"><span> return &quot;&quot;.join(ref_str), &quot;&quot;.join(est_str), &quot;&quot;.join(err_str)</span></span>
285
+ <span class="line"><span></span></span>
286
+ <span class="line"><span></span></span>
287
+ <span class="line"><span>text_ref = [</span></span>
288
+ <span class="line"><span> &quot;SOME CRITICS INCLUDING HIGH REAGAN ADMINISTRATION OFFICIALS ARE RAISING THE ALARM THAT THE FED&#39;S POLICY IS TOO TIGHT AND COULD CAUSE A RECESSION NEXT YEAR&quot;,</span></span>
289
+ <span class="line"><span> &quot;THE UNITED STATES UNDERTOOK TO DEFEND WESTERN EUROPE AGAINST SOVIET ATTACK&quot;,</span></span>
290
+ <span class="line"><span>]</span></span>
291
+ <span class="line"><span></span></span>
292
+ <span class="line"><span>print(&quot;=====================&quot; , flush=True)</span></span>
293
+ <span class="line"><span>perms = list(permutations(range(2)))</span></span>
294
+ <span class="line"><span>string_edit = [</span></span>
295
+ <span class="line"><span> [</span></span>
296
+ <span class="line"><span> editdistance.eval(text_ref[m], text_est[n])</span></span>
297
+ <span class="line"><span> for m, n in enumerate(p)</span></span>
298
+ <span class="line"><span> ]</span></span>
299
+ <span class="line"><span> for p in perms</span></span>
300
+ <span class="line"><span>]</span></span>
301
+ <span class="line"><span></span></span>
302
+ <span class="line"><span>dist = [sum(edist) for edist in string_edit]</span></span>
303
+ <span class="line"><span>perm_idx = np.argmin(dist)</span></span>
304
+ <span class="line"><span>perm = perms[perm_idx]</span></span>
305
+ <span class="line"><span></span></span>
306
+ <span class="line"><span>for i, p in enumerate(perm):</span></span>
307
+ <span class="line"><span> print(&quot;\\n--------------- Text %d ---------------&quot; % (i + 1), flush=True)</span></span>
308
+ <span class="line"><span> ref, est, err = diff_strings(text_ref[i], text_est[p])</span></span>
309
+ <span class="line"><span> print(&quot;REF: &quot; + ref + &quot;\\n&quot; + &quot;HYP: &quot; + est + &quot;\\n&quot; + &quot;ERR: &quot; + err, flush=True)</span></span>
310
+ <span class="line"><span> print(&quot;Edit Distance = {}\\n&quot;.format(string_edit[perm_idx][i]), flush=True)</span></span>
311
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div 
class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div>`,17);function q(g,w){const a=p("ExternalLinkIcon");return d(),t("div",null,[c,s("p",null,[s("a",o,[u,e(a)])]),v,m,s("p",null,[n("Author: Chenda Li ("),s("a",b,[n("@LiChenda"),e(a)]),n("), Wangyou Zhang ("),s("a",h,[n("@Emrys365"),e(a)]),n(")")]),_,s("p",null,[n("Please select model shown in "),s("a",f,[n("espnet_model_zoo"),e(a)])]),x])}const S=l(r,[["render",q],["__file","se_demo.html.vue"]]),y=JSON.parse('{"path":"/espnet2/se/se_demo.html","title":"ESPnet Speech Enhancement Demonstration","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Install","slug":"install","link":"#install","children":[]},{"level":2,"title":"Speech Enhancement","slug":"speech-enhancement","link":"#speech-enhancement","children":[{"level":3,"title":"Single-Channel Enhancement, the CHiME example","slug":"single-channel-enhancement-the-chime-example","link":"#single-channel-enhancement-the-chime-example","children":[]},{"level":3,"title":"Enhance your own pre-recordings","slug":"enhance-your-own-pre-recordings","link":"#enhance-your-own-pre-recordings","children":[]},{"level":3,"title":"Multi-Channel Enhancement","slug":"multi-channel-enhancement","link":"#multi-channel-enhancement","children":[]}]},{"level":2,"title":"Speech Separation","slug":"speech-separation","link":"#speech-separation","children":[{"level":3,"title":"Model Selection","slug":"model-selection","link":"#model-selection","children":[]},{"level":3,"title":"Separate Speech Mixture","slug":"separate-speech-mixture","link":"#separate-speech-mixture","children":[]}]},{"level":2,"title":"Evluate separated speech with pretrained ASR 
model","slug":"evluate-separated-speech-with-pretrained-asr-model","link":"#evluate-separated-speech-with-pretrained-asr-model","children":[]}],"git":{},"filePathRelative":"espnet2/se/se_demo.md"}');export{S as comp,y as data};
assets/st_demo.html-WLzB4ZGO.js ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as l,r as i,o,c as d,a as s,b as a,d as e,e as t}from"./app-DTS6SjJz.js";const p={},r=s("h1",{id:"espnet-speech-translation-demonstration",tabindex:"-1"},[s("a",{class:"header-anchor",href:"#espnet-speech-translation-demonstration"},[s("span",null,"ESPnet Speech Translation Demonstration")])],-1),c={href:"https://colab.research.google.com/github/espnet/notebook/blob/master/st_demo.ipynb",target:"_blank",rel:"noopener noreferrer"},u=s("img",{src:"https://colab.research.google.com/assets/colab-badge.svg",alt:"Open In Colab"},null,-1),h=s("p",null,"See also",-1),m=s("ul",null,[s("li",null,"ESPnet: https://github.com/espnet/espnet"),s("li",null,"ESPnet documentation: https://espnet.github.io/espnet/"),s("li",null,"TTS demo: https://colab.research.google.com/github/espnet/notebook/blob/master/tts_realtime_demo.ipynb")],-1),v={href:"https://github.com/ShigekiKarita",target:"_blank",rel:"noopener noreferrer"},b=t(`<h2 id="install" tabindex="-1"><a class="header-anchor" href="#install"><span>Install</span></a></h2><p>It takes around 3 minutes. Please keep waiting for a while.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># OS setup</span></span>
2
+ <span class="line"><span>!cat /etc/os-release</span></span>
3
+ <span class="line"><span>!apt-get install -qq bc tree sox</span></span>
4
+ <span class="line"><span></span></span>
5
+ <span class="line"><span># espnet and moses setup</span></span>
6
+ <span class="line"><span>!git clone -q https://github.com/ShigekiKarita/espnet.git</span></span>
7
+ <span class="line"><span>!pip install -q torch==1.1</span></span>
8
+ <span class="line"><span>!cd espnet; git checkout c0466d9a356c1a33f671a546426d7bc33b5b17e8; pip install -q -e .</span></span>
9
+ <span class="line"><span>!cd espnet/tools/; make moses.done</span></span>
10
+ <span class="line"><span></span></span>
11
+ <span class="line"><span># download pre-compiled warp-ctc and kaldi tools</span></span>
12
+ <span class="line"><span>!espnet/utils/download_from_google_drive.sh \\</span></span>
13
+ <span class="line"><span> &quot;https://drive.google.com/open?id=13Y4tSygc8WtqzvAVGK_vRV9GlV7TRC0w&quot; espnet/tools tar.gz &gt; /dev/null</span></span>
14
+ <span class="line"><span></span></span>
15
+ <span class="line"><span># make dummy activate</span></span>
16
+ <span class="line"><span>!mkdir -p espnet/tools/venv/bin &amp;&amp; touch espnet/tools/venv/bin/activate</span></span>
17
+ <span class="line"><span>!echo &quot;setup done.&quot;</span></span>
18
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><hr><h2 id="spanish-speech-english-text-translation" tabindex="-1"><a class="header-anchor" href="#spanish-speech-english-text-translation"><span>Spanish speech -&gt; English text translation</span></a></h2><p>This audio says &quot;yo soy José.&quot;</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from IPython.display import display, Audio</span></span>
19
+ <span class="line"><span>display(Audio(&quot;/content/espnet/test_utils/st_test.wav&quot;, rate=16000))</span></span>
20
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>Let&#39;s translate this into English text by our pretrained Transformer ST model trained on the Fisher-CALLHOME Spanish dataset.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># move on the recipe directory</span></span>
21
+ <span class="line"><span>import os</span></span>
22
+ <span class="line"><span>os.chdir(&quot;/content/espnet/egs/fisher_callhome_spanish/st1&quot;)</span></span>
23
+ <span class="line"><span></span></span>
24
+ <span class="line"><span>!../../../utils/translate_wav.sh --models fisher_callhome_spanish.transformer.v1.es-en ../../../test_utils/st_test.wav | tee /content/translated.txt</span></span>
25
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>As seen above, we successfully obtained the result: <strong>&quot;Translated text: yes i&#39;m jose&quot;</strong>!</p><h2 id="english-translated-text-to-speech-synthesis" tabindex="-1"><a class="header-anchor" href="#english-translated-text-to-speech-synthesis"><span>English translated text-to-speech synthesis</span></a></h2><p>Now let&#39;s generate an <strong>English speech</strong> from the translated text using a pretrained ESPnet-TTS model.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!sed -n &#39;s/Translated text://p&#39; /content/translated.txt | tr &#39;[:lower:]&#39; &#39;[:upper:]&#39; | tee /content/translated_sed.txt</span></span>
26
+ <span class="line"><span>!../../../utils/synth_wav.sh /content/translated_sed.txt</span></span>
27
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import matplotlib.pyplot as plt</span></span>
28
+ <span class="line"><span>import kaldiio</span></span>
29
+ <span class="line"><span>fbank = next(iter(kaldiio.load_scp(&quot;decode/translated_sed/outputs/feats.scp&quot;).values()))</span></span>
30
+ <span class="line"><span>plt.matshow(fbank.T)</span></span>
31
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from IPython.display import display, Audio</span></span>
32
+ <span class="line"><span>display(Audio(&quot;decode/translated_sed/wav_wnv/translated_sed_gen.wav&quot;))</span></span>
33
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>Successfully, it says &quot;Yes I&#39;m Jose&quot;! For more TTS demo, visit https://colab.research.google.com/github/espnet/notebook/blob/master/tts_realtime_demo.ipynb</p><h2 id="check-decoding-log" tabindex="-1"><a class="header-anchor" href="#check-decoding-log"><span>Check decoding log</span></a></h2><p>After the translation, you will find <code>&lt;decode_dir&gt;/&lt;wav name&gt;/result.json</code> for details;</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cat decode/st_test/result.json</span></span>
34
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>and <code>&lt;decode_dir&gt;/&lt;wav name&gt;/log/decode.log</code> for runtime log;</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cat decode/st_test/log/decode.log</span></span>
35
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Let&#39;s calculate real-time factor (RTF) of the ST decoding from the <code>decode.log</code></p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from dateutil import parser</span></span>
36
+ <span class="line"><span>from subprocess import PIPE, run</span></span>
37
+ <span class="line"><span></span></span>
38
+ <span class="line"><span># calc input duration (seconds)</span></span>
39
+ <span class="line"><span>input_sec = float(run([&quot;soxi&quot;, &quot;-D&quot;, &quot;/content/espnet/test_utils/st_test.wav&quot;], stdout=PIPE).stdout)</span></span>
40
+ <span class="line"><span></span></span>
41
+ <span class="line"><span># calc NN decoding time</span></span>
42
+ <span class="line"><span>with open(&quot;decode/st_test/log/decode.log&quot;, &quot;r&quot;) as f:</span></span>
43
+ <span class="line"><span> times = [parser.parse(x.split(&quot;(&quot;)[0]) for x in f if &quot;e2e_st_transformer&quot; in x]</span></span>
44
+ <span class="line"><span>decode_sec = (times[-1] - times[0]).total_seconds()</span></span>
45
+ <span class="line"><span></span></span>
46
+ <span class="line"><span># get real-time factor (RTF)</span></span>
47
+ <span class="line"><span>print(&quot;Input duration:\\t&quot;, input_sec, &quot;sec&quot;)</span></span>
48
+ <span class="line"><span>print(&quot;NN decoding:\\t&quot;, decode_sec, &quot;sec&quot;)</span></span>
49
+ <span class="line"><span>print(&quot;Real-time factor:\\t&quot;, decode_sec / input_sec)</span></span>
50
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>As you can see above, ESPnet-ST can <strong>translate speech faster than the input</strong> (it should be RTF &lt; 1.0).</p><h2 id="training-st-models-from-scratch" tabindex="-1"><a class="header-anchor" href="#training-st-models-from-scratch"><span>Training ST models from scratch</span></a></h2>`,25),g={href:"https://kaldi-asr.org/doc/kaldi_for_dummies.html",target:"_blank",rel:"noopener noreferrer"},x={href:"https://colab.research.google.com/github/espnet/notebook/blob/master/asr_cli.ipynb",target:"_blank",rel:"noopener noreferrer"},_={href:"https://colab.research.google.com/github/espnet/notebook/blob/master/tts_cli.ipynb",target:"_blank",rel:"noopener noreferrer"},k=s("code",null,"run.sh",-1),f=t(`<div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cd /content/espnet/egs/must_c/st1/ &amp;&amp; ./run.sh --must-c /content</span></span>
51
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>However, it takes too much time to finish downloading the dataset. So we cancel the cell above.</p><h2 id="details-of-espnet-tools" tabindex="-1"><a class="header-anchor" href="#details-of-espnet-tools"><span>Details of ESPnet tools</span></a></h2><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!../../../utils/translate_wav.sh --help</span></span>
52
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!../../../utils/synth_wav.sh --help</span></span>
53
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span></span></span>
54
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div>`,6);function E(y,q){const n=i("ExternalLinkIcon");return o(),d("div",null,[r,s("p",null,[s("a",c,[u,a(n)])]),h,m,s("p",null,[e("Author: "),s("a",v,[e("Shigeki Karita"),a(n)])]),b,s("p",null,[e("We provide "),s("a",g,[e("Kaldi-style recipes"),a(n)]),e(" for ST as well as "),s("a",x,[e("ASR"),a(n)]),e(" and "),s("a",_,[e("TTS"),a(n)]),e(" as all-in-one bash script "),k,e(":")]),f])}const S=l(p,[["render",E],["__file","st_demo.html.vue"]]),w=JSON.parse('{"path":"/espnet2/st/st_demo.html","title":"ESPnet Speech Translation Demonstration","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Install","slug":"install","link":"#install","children":[]},{"level":2,"title":"Spanish speech -> English text translation","slug":"spanish-speech-english-text-translation","link":"#spanish-speech-english-text-translation","children":[]},{"level":2,"title":"English translated text-to-speech synthesis","slug":"english-translated-text-to-speech-synthesis","link":"#english-translated-text-to-speech-synthesis","children":[]},{"level":2,"title":"Check decoding log","slug":"check-decoding-log","link":"#check-decoding-log","children":[]},{"level":2,"title":"Training ST models from scratch","slug":"training-st-models-from-scratch","link":"#training-st-models-from-scratch","children":[]},{"level":2,"title":"Details of ESPnet tools","slug":"details-of-espnet-tools","link":"#details-of-espnet-tools","children":[]}],"git":{},"filePathRelative":"espnet2/st/st_demo.md"}');export{S as comp,w as data};
assets/style-SNWc1iKP.css ADDED
@@ -0,0 +1 @@
 
 
1
+ .vp-back-to-top-button{position:fixed!important;bottom:4rem;inset-inline-end:1rem;z-index:100;width:48px;height:48px;padding:8px;border-width:0;border-radius:50%;background:var(--back-to-top-bg-color);color:var(--back-to-top-color);box-shadow:2px 2px 10px 4px var(--back-to-top-shadow);cursor:pointer}@media (max-width: 959px){.vp-back-to-top-button{transform:scale(.8);transform-origin:100% 100%}}@media print{.vp-back-to-top-button{display:none}}.vp-back-to-top-button:hover{color:var(--back-to-top-color-hover)}.vp-back-to-top-button .back-to-top-icon{overflow:hidden;width:100%;height:100%;background:currentcolor;border-radius:50%;-webkit-mask-image:var(--back-to-top-icon);mask-image:var(--back-to-top-icon);-webkit-mask-position:50%;mask-position:50%;-webkit-mask-size:cover;mask-size:cover}.vp-scroll-progress{position:absolute;right:-2px;bottom:-2px;width:52px;height:52px}.vp-scroll-progress svg{width:100%;height:100%}.vp-scroll-progress circle{opacity:.9;transform:rotate(-90deg);transform-origin:50% 50%}.back-to-top-enter-active,.back-to-top-leave-active{transition:opacity .3s}.back-to-top-enter-from,.back-to-top-leave-to{opacity:0}:root{--back-to-top-z-index: 5;--back-to-top-icon: 
url("data:image/svg+xml,%3csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%201024%201024'%3e%3cpath%20d='M512%20843.2c-36.2%200-66.4-13.6-85.8-21.8-10.8-4.6-22.6%203.6-21.8%2015.2l7%20102c.4%206.2%207.6%209.4%2012.6%205.6l29-22c3.6-2.8%209-1.8%2011.4%202l41%2064.2c3%204.8%2010.2%204.8%2013.2%200l41-64.2c2.4-3.8%207.8-4.8%2011.4-2l29%2022c5%203.8%2012.2.6%2012.6-5.6l7-102c.8-11.6-11-20-21.8-15.2-19.6%208.2-49.6%2021.8-85.8%2021.8'/%3e%3cpath%20d='m795.4%20586.2-96-98.2C699.4%20172%20513%2032%20513%2032S324.8%20172%20324.8%20488l-96%2098.2c-3.6%203.6-5.2%209-4.4%2014.2L261.2%20824c1.8%2011.4%2014.2%2017%2023.6%2010.8L419%20744s41.4%2040%2094.2%2040%2092.2-40%2092.2-40l134.2%2090.8c9.2%206.2%2021.6.6%2023.6-10.8l37-223.8c.4-5.2-1.2-10.4-4.8-14M513%20384c-34%200-61.4-28.6-61.4-64s27.6-64%2061.4-64c34%200%2061.4%2028.6%2061.4%2064S547%20384%20513%20384'/%3e%3c/svg%3e");--back-to-top-bg-color: #fff;--back-to-top-color: #3eaf7c;--back-to-top-color-hover: #71cda3;--back-to-top-shadow: rgb(0 0 0 / 20%)}div[class*=language-]:hover:before{display:none}div[class*=language-]:hover .vp-copy-code-button{opacity:1}.vp-copy-code-button{position:absolute;top:.5em;right:.5em;z-index:5;width:2.5rem;height:2.5rem;padding:0;border-width:0;border-radius:.5rem;background:transparent;outline:none;opacity:0;cursor:pointer;transition:opacity .4s}@media print{.vp-copy-code-button{display:none}}.vp-copy-code-button:focus,.vp-copy-code-button.copied{opacity:1}.vp-copy-code-button:hover,.vp-copy-code-button.copied{background:var(--copy-code-hover)}.vp-copy-code-button.copied .vp-copy-icon{-webkit-mask-image:var(--code-copied-icon);mask-image:var(--code-copied-icon)}.vp-copy-code-button.copied:after{content:attr(data-copied);position:absolute;top:0;right:calc(100% + 
.25rem);display:block;height:1.25rem;padding:.625rem;border-radius:.5rem;background:var(--copy-code-hover);color:var(--copy-code-color);font-weight:500;line-height:1.25rem;white-space:nowrap}.vp-copy-icon{width:1.25rem;height:1.25rem;padding:.625rem;background:currentcolor;color:var(--copy-code-color);font-size:1.25rem;-webkit-mask-image:var(--code-copy-icon);mask-image:var(--code-copy-icon);-webkit-mask-position:50%;mask-position:50%;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:1em;mask-size:1em}:root{--code-copy-icon: url("data:image/svg+xml,%3csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%2024%2024'%20fill='none'%20height='20'%20width='20'%20stroke='rgba(128,128,128,1)'%20stroke-width='2'%3e%3cpath%20stroke-linecap='round'%20stroke-linejoin='round'%20d='M9%205H7a2%202%200%200%200-2%202v12a2%202%200%200%200%202%202h10a2%202%200%200%200%202-2V7a2%202%200%200%200-2-2h-2M9%205a2%202%200%200%200%202%202h2a2%202%200%200%200%202-2M9%205a2%202%200%200%201%202-2h2a2%202%200%200%201%202%202'%20/%3e%3c/svg%3e");--code-copied-icon: url("data:image/svg+xml,%3csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%2024%2024'%20fill='none'%20height='20'%20width='20'%20stroke='rgba(128,128,128,1)'%20stroke-width='2'%3e%3cpath%20stroke-linecap='round'%20stroke-linejoin='round'%20d='M9%205H7a2%202%200%200%200-2%202v12a2%202%200%200%200%202%202h10a2%202%200%200%200%202-2V7a2%202%200%200%200-2-2h-2M9%205a2%202%200%200%200%202%202h2a2%202%200%200%200%202-2M9%205a2%202%200%200%201%202-2h2a2%202%200%200%201%202%202m-6%209%202%202%204-4'%20/%3e%3c/svg%3e");--copy-code-color: #9e9e9e;--copy-code-hover: rgb(0 0 0 / 50%)}:root{--external-link-icon-color: #aaa}.external-link-icon{position:relative;display:inline-block;color:var(--external-link-icon-color);vertical-align:middle;top:-1px}@media 
print{.external-link-icon{display:none}}.external-link-icon-sr-only{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;clip:rect(0,0,0,0);white-space:nowrap;border-width:0;-webkit-user-select:none;-moz-user-select:none;user-select:none}:root{--medium-zoom-z-index: 100;--medium-zoom-bg-color: #ffffff;--medium-zoom-opacity: 1}.medium-zoom-overlay{background-color:var(--medium-zoom-bg-color)!important;z-index:var(--medium-zoom-z-index)}.medium-zoom-overlay~img{z-index:calc(var(--medium-zoom-z-index) + 1)}.medium-zoom--opened .medium-zoom-overlay{opacity:var(--medium-zoom-opacity)}:root{--nprogress-color: #29d;--nprogress-z-index: 1031}#nprogress{pointer-events:none}#nprogress .bar{background:var(--nprogress-color);position:fixed;z-index:var(--nprogress-z-index);top:0;left:0;width:100%;height:2px}.vp-page-meta{max-width:var(--content-width);margin:0 auto;padding:.75rem 2.5rem;display:flex;flex-wrap:wrap;justify-content:space-between;overflow:auto}@media (max-width: 959px){.vp-page-meta{padding:2rem}}@media (max-width: 419px){.vp-page-meta{padding:1.5rem}}@media print{.vp-page-meta{margin:0!important;padding-inline:0!important}}@media (max-width: 719px){.vp-page-meta{display:block}}.vp-page-meta .vp-meta-item{flex-grow:1}.vp-page-meta .vp-meta-item .vp-meta-label{font-weight:500}.vp-page-meta .vp-meta-item .vp-meta-label:not(a){color:var(--c-text-lighter)}.vp-page-meta .vp-meta-item .vp-meta-info{color:var(--c-text-quote);font-weight:400}.vp-page-meta .git-info{text-align:end}.vp-page-meta .edit-link{margin-top:.25rem;margin-bottom:.25rem;margin-inline-end:.5rem;font-size:14px}@media print{.vp-page-meta .edit-link{display:none}}.vp-page-meta .edit-link .icon{position:relative;bottom:-.125em;width:1em;height:1em;margin-inline-end:.25em}.vp-page-meta .last-updated,.vp-page-meta .contributors{margin-top:.25rem;margin-bottom:.25rem;font-size:14px}@media (max-width: 719px){.vp-page-meta .last-updated,.vp-page-meta 
.contributors{font-size:13px;text-align:start}}.vp-page-nav{display:flex;flex-wrap:wrap;max-width:var(--content-width, 740px);min-height:2rem;margin-inline:auto;margin-top:0;padding-block:.5rem;padding-inline:2rem;border-top:1px solid var(--c-border);transition:border-top var(--t-color);padding-top:1rem;padding-bottom:0}@media (max-width: 959px){.vp-page-nav{padding-inline:1rem}}@media print{.vp-page-nav{display:none}}.vp-page-nav .route-link{display:inline-block;flex-grow:1;margin:.25rem;padding:.25rem .5rem;border:1px solid var(--c-border);border-radius:.25rem}.vp-page-nav .route-link:hover{background:var(--c-bg-light)}.vp-page-nav .route-link .hint{color:var(--c-text-quote);font-size:.875rem;line-height:2}.vp-page-nav .prev{text-align:start}.vp-page-nav .next{text-align:end}:root{--c-brand: #3eaf7c;--c-brand-light: #4abf8a;--c-bg: #ffffff;--c-bg-light: #f3f4f5;--c-bg-lighter: #eeeeee;--c-bg-dark: #ebebec;--c-bg-darker: #e6e6e6;--c-bg-navbar: var(--c-bg);--c-bg-sidebar: var(--c-bg);--c-bg-arrow: #cccccc;--c-text: #2c3e50;--c-text-accent: var(--c-brand);--c-text-light: #3a5169;--c-text-lighter: #4e6e8e;--c-text-lightest: #6a8bad;--c-text-quote: #999999;--c-border: #eaecef;--c-border-dark: #dfe2e5;--c-tip: #42b983;--c-tip-bg: var(--c-bg-light);--c-tip-title: var(--c-text);--c-tip-text: var(--c-text);--c-tip-text-accent: var(--c-text-accent);--c-warning: #ffc310;--c-warning-bg: #fffae3;--c-warning-bg-light: #fff3ba;--c-warning-bg-lighter: #fff0b0;--c-warning-border-dark: #f7dc91;--c-warning-details-bg: #fff5ca;--c-warning-title: #f1b300;--c-warning-text: #746000;--c-warning-text-accent: #edb100;--c-warning-text-light: #c1971c;--c-warning-text-quote: #ccab49;--c-danger: #f11e37;--c-danger-bg: #ffe0e0;--c-danger-bg-light: #ffcfde;--c-danger-bg-lighter: #ffc9c9;--c-danger-border-dark: #f1abab;--c-danger-details-bg: #ffd4d4;--c-danger-title: #ed1e2c;--c-danger-text: #660000;--c-danger-text-accent: #bd1a1a;--c-danger-text-light: #b5474d;--c-danger-text-quote: 
#c15b5b;--c-details-bg: #eeeeee;--c-badge-tip: var(--c-tip);--c-badge-warning: #ecc808;--c-badge-warning-text: var(--c-bg);--c-badge-danger: #dc2626;--c-badge-danger-text: var(--c-bg);--c-code-group-tab-title: rgba(255, 255, 255, .9);--c-code-group-tab-bg: var(--code-bg-color);--c-code-group-tab-outline: var(var(--c-code-group-tab-title));--c-code-group-tab-active-border: var(--c-brand);--t-color: .3s ease;--t-transform: .3s ease;--code-bg-color: #282c34;--code-hl-bg-color: rgba(0, 0, 0, .66);--code-ln-color: #9e9e9e;--code-ln-wrapper-width: 3.5rem;--font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen, Ubuntu, Cantarell, "Fira Sans", "Droid Sans", "Helvetica Neue", sans-serif;--font-family-code: Consolas, Monaco, "Andale Mono", "Ubuntu Mono", monospace;--navbar-height: 3.6rem;--navbar-padding-v: .7rem;--navbar-padding-h: 1.5rem;--sidebar-width: 20rem;--sidebar-width-mobile: calc(var(--sidebar-width) * .82);--content-width: 740px;--homepage-width: 960px}.vp-back-to-top-button{--back-to-top-color: var(--c-brand);--back-to-top-color-hover: var(--c-brand-light);--back-to-top-bg-color: var(--c-bg)}.vp-catalog-wrapper{--catalog-bg-color: var(--c-bg);--catalog-bg-secondary-color: var(--c-bg-dark);--catalog-border-color: var(--c-border);--catalog-active-color: var(--c-brand);--catalog-hover-color: var(--c-brand-light)}.waline-wrapper{--waline-bg-color: var(--c-bg);--waline-bg-color-light: var(--c-bg-light);--waline-text-color: var(--c-color);--waline-border: 1px solid var(--c-border);--waline-border-color: var(--c-border);--waline-theme-color: var(--c-brand);--waline-active-color: var(--c-brand-light)}.DocSearch{--docsearch-primary-color: var(--c-brand);--docsearch-text-color: var(--c-text);--docsearch-highlight-color: var(--c-brand);--docsearch-muted-color: var(--c-text-quote);--docsearch-container-background: rgba(9, 10, 17, .8);--docsearch-modal-background: var(--c-bg-light);--docsearch-searchbox-background: 
var(--c-bg-lighter);--docsearch-searchbox-focus-background: var(--c-bg);--docsearch-searchbox-shadow: inset 0 0 0 2px var(--c-brand);--docsearch-hit-color: var(--c-text-light);--docsearch-hit-active-color: var(--c-bg);--docsearch-hit-background: var(--c-bg);--docsearch-hit-shadow: 0 1px 3px 0 var(--c-border-dark);--docsearch-footer-background: var(--c-bg)}.external-link-icon{--external-link-icon-color: var(--c-text-quote)}.medium-zoom-overlay{--medium-zoom-bg-color: var(--c-bg)}#nprogress{--nprogress-color: var(--c-brand)}body{--photo-swipe-bullet: var(--c-bg);--photo-swipe-bullet-active: var(--c-brand)}body{--pwa-text-color: var(--c-text);--pwa-bg-color: var(--c-bg);--pwa-border-color: var(--c-brand);--pwa-btn-text-color: var(--c-bg);--pwa-btn-bg-color: var(--c-brand);--pwa-btn-hover-bg-color: var(--c-brand-light)}.language-modal-mask{--redirect-bg-color: var(--c-bg);--redirect-bg-color-light: var(--c-bg-light);--redirect-bg-color-lighter: var(--c-bg-lighter);--redirect-text-color: var(--c-text);--redirect-primary-color: var(--c-brand);--redirect-primary-hover-color: var(--c-brand-light);--redirect-primary-text-color: var(--c-bg)}.search-box{--search-bg-color: var(--c-bg);--search-accent-color: var(--c-brand);--search-text-color: var(--c-text);--search-border-color: var(--c-border);--search-item-text-color: var(--c-text-lighter);--search-item-focus-bg-color: var(--c-bg-light)}html.dark{--c-brand: #3aa675;--c-brand-light: #349469;--c-bg: #22272e;--c-bg-light: #2b313a;--c-bg-lighter: #262c34;--c-bg-dark: #343b44;--c-bg-darker: #37404c;--c-text: #adbac7;--c-text-light: #96a7b7;--c-text-lighter: #8b9eb0;--c-text-lightest: #8094a8;--c-border: #3e4c5a;--c-border-dark: #34404c;--c-tip: #318a62;--c-warning: #e0ad15;--c-warning-bg: #2d2f2d;--c-warning-bg-light: #423e2a;--c-warning-bg-lighter: #44442f;--c-warning-border-dark: #957c35;--c-warning-details-bg: #39392d;--c-warning-title: #fdca31;--c-warning-text: #d8d96d;--c-warning-text-accent: #ffbf00;--c-warning-text-light: 
#ddb84b;--c-warning-text-quote: #ccab49;--c-danger: #fc1e38;--c-danger-bg: #39232c;--c-danger-bg-light: #4b2b35;--c-danger-bg-lighter: #553040;--c-danger-border-dark: #a25151;--c-danger-details-bg: #482936;--c-danger-title: #fc2d3b;--c-danger-text: #ea9ca0;--c-danger-text-accent: #fd3636;--c-danger-text-light: #d9777c;--c-danger-text-quote: #d56b6b;--c-details-bg: #323843;--c-badge-warning: var(--c-warning);--c-badge-warning-text: #3c2e05;--c-badge-danger: var(--c-danger);--c-badge-danger-text: #401416;--code-hl-bg-color: #363b46}html.dark .DocSearch{--docsearch-logo-color: var(--c-text);--docsearch-modal-shadow: inset 1px 1px 0 0 #2c2e40, 0 3px 8px 0 #000309;--docsearch-key-shadow: inset 0 -2px 0 0 #282d55, inset 0 0 1px 1px #51577d, 0 2px 2px 0 rgba(3, 4, 9, .3);--docsearch-key-gradient: linear-gradient(-225deg, #444950, #1c1e21);--docsearch-footer-shadow: inset 0 1px 0 0 rgba(73, 76, 106, .5), 0 -4px 8px 0 rgba(0, 0, 0, .2)}html.dark body{--pwa-shadow-color: rgb(0 0 0 / 30%);--pwa-content-color: #ccc;--pwa-content-light-color: #999}html,body{padding:0;margin:0;background-color:var(--c-bg);transition:background-color var(--t-color)}html.dark{color-scheme:dark}html{font-size:16px}body{font-family:var(--font-family);-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;font-size:1rem;color:var(--c-text)}a{font-weight:500;color:var(--c-text-accent);text-decoration:none;overflow-wrap:break-word}p a code{font-weight:400;color:var(--c-text-accent)}kbd{font-family:var(--font-family-code);color:var(--c-text);background:var(--c-bg-lighter);border:solid .15rem var(--c-border-dark);border-bottom:solid .25rem var(--c-border-dark);border-radius:.15rem;padding:0 .15em}code{font-family:var(--font-family-code);color:var(--c-text-lighter);padding:.25rem .5rem;margin:0;font-size:.85em;background-color:var(--c-bg-light);border-radius:3px;overflow-wrap:break-word;transition:background-color 
var(--t-color)}blockquote{font-size:1rem;color:var(--c-text-quote);border-left:.2rem solid var(--c-border-dark);margin:1rem 0;padding:.25rem 0 .25rem 1rem;overflow-wrap:break-word}blockquote>p{margin:0}ul,ol{padding-left:1.2em}strong{font-weight:600}h1,h2,h3,h4,h5,h6{font-weight:600;line-height:1.25;overflow-wrap:break-word}h1:focus-visible,h2:focus-visible,h3:focus-visible,h4:focus-visible,h5:focus-visible,h6:focus-visible{outline:none}h1 .header-anchor,h2 .header-anchor,h3 .header-anchor,h4 .header-anchor,h5 .header-anchor,h6 .header-anchor{color:inherit;text-decoration:none;position:relative}h1 .header-anchor:hover:before,h2 .header-anchor:hover:before,h3 .header-anchor:hover:before,h4 .header-anchor:hover:before,h5 .header-anchor:hover:before,h6 .header-anchor:hover:before{font-size:.8em;content:"¶";position:absolute;left:-.75em;color:var(--c-brand)}h1 .header-anchor:focus-visible,h2 .header-anchor:focus-visible,h3 .header-anchor:focus-visible,h4 .header-anchor:focus-visible,h5 .header-anchor:focus-visible,h6 .header-anchor:focus-visible{outline:none}h1 .header-anchor:focus-visible:before,h2 .header-anchor:focus-visible:before,h3 .header-anchor:focus-visible:before,h4 .header-anchor:focus-visible:before,h5 .header-anchor:focus-visible:before,h6 .header-anchor:focus-visible:before{content:"¶";position:absolute;left:-.75em;color:var(--c-brand);outline:auto}h1{font-size:2.2rem}h2{font-size:1.65rem;padding-bottom:.3rem;border-bottom:1px solid var(--c-border);transition:border-color var(--t-color)}h3{font-size:1.35rem}h4{font-size:1.15rem}h5{font-size:1.05rem}h6{font-size:1rem}@media print{a[href^="http://"]:after,a[href^="https://"]:after{content:" (" attr(href) ") "}}p,ul,ol{line-height:1.7;overflow-wrap:break-word}hr{border:0;border-top:1px solid var(--c-border)}table{border-collapse:collapse;margin:1rem 0;display:block;overflow-x:auto;transition:border-color var(--t-color)}tr{border-top:1px solid var(--c-border-dark);transition:border-color 
var(--t-color)}tr:nth-child(2n){background-color:var(--c-bg-light);transition:background-color var(--t-color)}tr:nth-child(2n) code{background-color:var(--c-bg-dark)}th,td{padding:.6em 1em;border:1px solid var(--c-border-dark);transition:border-color var(--t-color)}.arrow{display:inline-block;vertical-align:middle;width:1em;height:1em;background-image:url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'%3E%3Cpath fill='rgba(0,0,0,0.5)' d='M7.41 15.41L12 10.83l4.59 4.58L18 14l-6-6-6 6z'/%3E%3C/svg%3E");background-position:center;background-repeat:no-repeat;line-height:normal;transition:all .3s}html.dark .arrow{background-image:url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'%3E%3Cpath fill='rgba(255,255,255,0.5)' d='M7.41 15.41L12 10.83l4.59 4.58L18 14l-6-6-6 6z'/%3E%3C/svg%3E")}.arrow.down{transform:rotate(180deg)}.arrow.right{transform:rotate(90deg)}.arrow.left{transform:rotate(-90deg)}.badge{display:inline-block;font-size:14px;font-weight:600;height:18px;line-height:18px;border-radius:3px;padding:0 6px;color:var(--c-bg);vertical-align:top;transition:color var(--t-color),background-color var(--t-color)}.badge.tip{background-color:var(--c-badge-tip)}.badge.warning{background-color:var(--c-badge-warning);color:var(--c-badge-warning-text)}.badge.danger{background-color:var(--c-badge-danger);color:var(--c-badge-danger-text)}.badge+.badge{margin-left:5px}code[class*=language-],pre[class*=language-]{color:#ccc;background:none;font-family:var(--font-family-code);font-size:1em;text-align:left;white-space:pre;word-spacing:normal;word-break:normal;word-wrap:normal;line-height:1.5;-moz-tab-size:4;-o-tab-size:4;tab-size:4;-webkit-hyphens:none;hyphens:none}pre[class*=language-]{padding:1em;margin:.5em 
0;overflow:auto}:not(pre)>code[class*=language-],pre[class*=language-]{background:#2d2d2d}:not(pre)>code[class*=language-]{padding:.1em;border-radius:.3em;white-space:normal}.token.comment,.token.block-comment,.token.prolog,.token.doctype,.token.cdata{color:#999}.token.punctuation{color:#ccc}.token.tag,.token.attr-name,.token.namespace,.token.deleted{color:#ec5975}.token.function-name{color:#6196cc}.token.boolean,.token.number,.token.function{color:#f08d49}.token.property,.token.class-name,.token.constant,.token.symbol{color:#f8c555}.token.selector,.token.important,.token.atrule,.token.keyword,.token.builtin{color:#cc99cd}.token.string,.token.char,.token.attr-value,.token.regex,.token.variable{color:#7ec699}.token.operator,.token.entity,.token.url{color:#67cdcc}.token.important,.token.bold{font-weight:700}.token.italic{font-style:italic}.token.entity{cursor:help}.token.inserted{color:#3eaf7c}.theme-default-content pre,.theme-default-content pre[class*=language-]{line-height:1.375;padding:1.3rem 1.5rem;margin:.85rem 0;border-radius:6px;overflow:auto}.theme-default-content pre code,.theme-default-content pre[class*=language-] code{color:#fff;padding:0;background-color:transparent!important;border-radius:0;overflow-wrap:unset;-webkit-font-smoothing:auto;-moz-osx-font-smoothing:auto}.theme-default-content .line-number{font-family:var(--font-family-code)}div[class*=language-]{position:relative;background-color:var(--code-bg-color);border-radius:6px}div[class*=language-]:before{content:attr(data-title);position:absolute;z-index:3;top:.8em;right:1em;font-size:.75rem;color:var(--code-ln-color)}div[class*=language-] pre,div[class*=language-] pre[class*=language-]{background:transparent!important;position:relative;z-index:1}div[class*=language-] .highlight-lines{-webkit-user-select:none;-moz-user-select:none;user-select:none;padding-top:1.3rem;position:absolute;top:0;left:0;width:100%;line-height:1.375}div[class*=language-] .highlight-lines 
.highlight-line{background-color:var(--code-hl-bg-color)}div[class*=language-]:not(.line-numbers-mode) .line-numbers{display:none}div[class*=language-].line-numbers-mode .highlight-lines .highlight-line{position:relative}div[class*=language-].line-numbers-mode .highlight-lines .highlight-line:before{content:" ";position:absolute;z-index:2;left:0;top:0;display:block;width:var(--code-ln-wrapper-width);height:100%}div[class*=language-].line-numbers-mode pre{margin-left:var(--code-ln-wrapper-width);padding-left:1rem;vertical-align:middle}div[class*=language-].line-numbers-mode .line-numbers{position:absolute;top:0;width:var(--code-ln-wrapper-width);text-align:center;color:var(--code-ln-color);padding-top:1.25rem;line-height:1.375;counter-reset:line-number}div[class*=language-].line-numbers-mode .line-numbers .line-number{position:relative;z-index:3;-webkit-user-select:none;-moz-user-select:none;user-select:none;height:1.375em}div[class*=language-].line-numbers-mode .line-numbers .line-number:before{counter-increment:line-number;content:counter(line-number);font-size:.85em}div[class*=language-].line-numbers-mode:after{content:"";position:absolute;top:0;left:0;width:var(--code-ln-wrapper-width);height:100%;border-radius:6px 0 0 6px;border-right:1px solid var(--code-hl-bg-color)}@media (max-width: 419px){.theme-default-content div[class*=language-]{margin:.85rem -1.5rem;border-radius:0}}.code-group__nav{margin-top:.85rem;margin-bottom:calc(-1.7rem - 6px);padding-bottom:calc(1.7rem - 6px);padding-left:10px;padding-top:10px;border-top-left-radius:6px;border-top-right-radius:6px;background-color:var(--c-code-group-tab-bg)}.code-group__nav-tab{border:0;padding:5px;cursor:pointer;background-color:transparent;font-size:.85em;line-height:1.4;color:var(--c-code-group-tab-title);font-weight:600}.code-group__nav-tab:focus{outline:none}.code-group__nav-tab:focus-visible{outline:1px solid 
var(--c-code-group-tab-outline)}.code-group__nav-tab-active{border-bottom:var(--c-code-group-tab-active-border) 1px solid}@media (max-width: 419px){.code-group__nav{margin-left:-1.5rem;margin-right:-1.5rem;border-radius:0}}.code-group-item{display:none}.code-group-item__active{display:block}.code-group-item>pre{background-color:orange}.custom-container{transition:color var(--t-color),border-color var(--t-color),background-color var(--t-color)}.custom-container .custom-container-title{font-weight:600}.custom-container .custom-container-title:not(:only-child){margin-bottom:-.4rem}.custom-container.tip,.custom-container.warning,.custom-container.danger{padding:.1rem 1.5rem;border-left-width:.5rem;border-left-style:solid;margin:1rem 0}.custom-container.tip{border-color:var(--c-tip);background-color:var(--c-tip-bg);color:var(--c-tip-text)}.custom-container.tip .custom-container-title{color:var(--c-tip-title)}.custom-container.tip a{color:var(--c-tip-text-accent)}.custom-container.tip code{background-color:var(--c-bg-dark)}.custom-container.warning{border-color:var(--c-warning);background-color:var(--c-warning-bg);color:var(--c-warning-text)}.custom-container.warning .custom-container-title{color:var(--c-warning-title)}.custom-container.warning a{color:var(--c-warning-text-accent)}.custom-container.warning blockquote{border-left-color:var(--c-warning-border-dark);color:var(--c-warning-text-quote)}.custom-container.warning code{color:var(--c-warning-text-light);background-color:var(--c-warning-bg-light)}.custom-container.warning details{background-color:var(--c-warning-details-bg)}.custom-container.warning details code{background-color:var(--c-warning-bg-lighter)}.custom-container.warning .external-link-icon{--external-link-icon-color: var(--c-warning-text-quote)}.custom-container.danger{border-color:var(--c-danger);background-color:var(--c-danger-bg);color:var(--c-danger-text)}.custom-container.danger 
.custom-container-title{color:var(--c-danger-title)}.custom-container.danger a{color:var(--c-danger-text-accent)}.custom-container.danger blockquote{border-left-color:var(--c-danger-border-dark);color:var(--c-danger-text-quote)}.custom-container.danger code{color:var(--c-danger-text-light);background-color:var(--c-danger-bg-light)}.custom-container.danger details{background-color:var(--c-danger-details-bg)}.custom-container.danger details code{background-color:var(--c-danger-bg-lighter)}.custom-container.danger .external-link-icon{--external-link-icon-color: var(--c-danger-text-quote)}.custom-container.details{display:block;position:relative;border-radius:2px;margin:1.6em 0;padding:1.6em;background-color:var(--c-details-bg)}.custom-container.details code{background-color:var(--c-bg-darker)}.custom-container.details h4{margin-top:0}.custom-container.details figure:last-child,.custom-container.details p:last-child{margin-bottom:0;padding-bottom:0}.custom-container.details summary{outline:none;cursor:pointer}.home{padding:var(--navbar-height) 2rem 0;max-width:var(--homepage-width);margin:0 auto;display:block}.home .hero{text-align:center}.home .hero img{max-width:100%;max-height:280px;display:block;margin:3rem auto 1.5rem}.home .hero h1{font-size:3rem}.home .hero h1,.home .hero .description,.home .hero .actions{margin:1.8rem auto}.home .hero .actions{display:flex;flex-wrap:wrap;gap:1rem;justify-content:center}.home .hero .description{max-width:35rem;font-size:1.6rem;line-height:1.3;color:var(--c-text-lightest)}.home .hero .action-button{display:inline-block;font-size:1.2rem;padding:.8rem 1.6rem;border-width:2px;border-style:solid;border-radius:4px;transition:background-color var(--t-color);box-sizing:border-box}.home .hero .action-button.primary{color:var(--c-bg);background-color:var(--c-brand);border-color:var(--c-brand)}.home .hero .action-button.primary:hover{background-color:var(--c-brand-light)}.home .hero 
.action-button.secondary{color:var(--c-brand);background-color:var(--c-bg);border-color:var(--c-brand)}.home .hero .action-button.secondary:hover{color:var(--c-bg);background-color:var(--c-brand-light)}.home .features{border-top:1px solid var(--c-border);transition:border-color var(--t-color);padding:1.2rem 0;margin-top:2.5rem;display:flex;flex-wrap:wrap;align-items:flex-start;align-content:stretch;justify-content:space-between}.home .feature{flex-grow:1;flex-basis:30%;max-width:30%}.home .feature h2{font-size:1.4rem;font-weight:500;border-bottom:none;padding-bottom:0;color:var(--c-text-light)}.home .feature p{color:var(--c-text-lighter)}.home .theme-default-content{padding:0;margin:0}.home .footer{padding:2.5rem;border-top:1px solid var(--c-border);text-align:center;color:var(--c-text-lighter);transition:border-color var(--t-color)}@media (max-width: 719px){.home .features{flex-direction:column}.home .feature{max-width:100%;padding:0 2.5rem}}@media (max-width: 419px){.home{padding-left:1.5rem;padding-right:1.5rem}.home .hero img{max-height:210px;margin:2rem auto 1.2rem}.home .hero h1{font-size:2rem}.home .hero h1,.home .hero .description,.home .hero .actions{margin:1.2rem auto}.home .hero .description{font-size:1.2rem}.home .hero .action-button{font-size:1rem;padding:.6rem 1.2rem}.home .feature h2{font-size:1.25rem}}.page{padding-top:var(--navbar-height);padding-left:var(--sidebar-width)}.navbar{position:fixed;z-index:20;top:0;left:0;right:0;height:var(--navbar-height);box-sizing:border-box;border-bottom:1px solid var(--c-border);background-color:var(--c-bg-navbar);transition:background-color var(--t-color),border-color var(--t-color)}.sidebar{font-size:16px;width:var(--sidebar-width);position:fixed;z-index:10;margin:0;top:var(--navbar-height);left:0;bottom:0;box-sizing:border-box;border-right:1px solid var(--c-border);overflow-y:auto;scrollbar-width:thin;scrollbar-color:var(--c-brand) var(--c-border);background-color:var(--c-bg-sidebar);transition:transform 
var(--t-transform),background-color var(--t-color),border-color var(--t-color)}.sidebar::-webkit-scrollbar{width:7px}.sidebar::-webkit-scrollbar-track{background-color:var(--c-border)}.sidebar::-webkit-scrollbar-thumb{background-color:var(--c-brand)}.sidebar-mask{position:fixed;z-index:9;top:0;left:0;width:100vw;height:100vh;display:none}.theme-container.sidebar-open .sidebar-mask{display:block}.theme-container.sidebar-open .navbar>.toggle-sidebar-button .icon span:nth-child(1){transform:rotate(45deg) translate3d(5.5px,5.5px,0)}.theme-container.sidebar-open .navbar>.toggle-sidebar-button .icon span:nth-child(2){transform:scale3d(0,1,1)}.theme-container.sidebar-open .navbar>.toggle-sidebar-button .icon span:nth-child(3){transform:rotate(-45deg) translate3d(6px,-6px,0)}.theme-container.sidebar-open .navbar>.toggle-sidebar-button .icon span:nth-child(1),.theme-container.sidebar-open .navbar>.toggle-sidebar-button .icon span:nth-child(3){transform-origin:center}.theme-container.no-navbar .theme-default-content h1,.theme-container.no-navbar .theme-default-content h2,.theme-container.no-navbar .theme-default-content h3,.theme-container.no-navbar .theme-default-content h4,.theme-container.no-navbar .theme-default-content h5,.theme-container.no-navbar .theme-default-content h6{margin-top:1.5rem;padding-top:0}.theme-container.no-navbar .page{padding-top:0}.theme-container.no-navbar .sidebar{top:0}.theme-container.no-sidebar .sidebar{display:none}@media (max-width: 719px){.theme-container.no-sidebar .sidebar{display:block}}.theme-container.no-sidebar .page{padding-left:0}.theme-default-content a:not(.header-anchor):hover{text-decoration:underline}.theme-default-content img{max-width:100%}.theme-default-content h1,.theme-default-content h2,.theme-default-content h3,.theme-default-content h4,.theme-default-content h5,.theme-default-content h6{margin-top:calc(.5rem - var(--navbar-height));padding-top:calc(1rem + var(--navbar-height));margin-bottom:0}.theme-default-content 
h1:first-child,.theme-default-content h2:first-child,.theme-default-content h3:first-child,.theme-default-content h4:first-child,.theme-default-content h5:first-child,.theme-default-content h6:first-child{margin-bottom:1rem}.theme-default-content h1:first-child+p,.theme-default-content h1:first-child+pre,.theme-default-content h1:first-child+.custom-container,.theme-default-content h2:first-child+p,.theme-default-content h2:first-child+pre,.theme-default-content h2:first-child+.custom-container,.theme-default-content h3:first-child+p,.theme-default-content h3:first-child+pre,.theme-default-content h3:first-child+.custom-container,.theme-default-content h4:first-child+p,.theme-default-content h4:first-child+pre,.theme-default-content h4:first-child+.custom-container,.theme-default-content h5:first-child+p,.theme-default-content h5:first-child+pre,.theme-default-content h5:first-child+.custom-container,.theme-default-content h6:first-child+p,.theme-default-content h6:first-child+pre,.theme-default-content h6:first-child+.custom-container{margin-top:2rem}@media (max-width: 959px){.sidebar{font-size:15px;width:var(--sidebar-width-mobile)}.page{padding-left:var(--sidebar-width-mobile)}}@media (max-width: 719px){.sidebar{top:0;padding-top:var(--navbar-height);transform:translate(-100%)}.page{padding-left:0}.theme-container.sidebar-open .sidebar{transform:translate(0)}.theme-container.no-navbar .sidebar{padding-top:0}}@media (max-width: 419px){h1{font-size:1.9rem}}#vp-comment{max-width:var(--content-width);margin:0 auto;padding:2rem 2.5rem}@media (max-width: 959px){#vp-comment{padding:2rem}}@media (max-width: 419px){#vp-comment{padding:1.5rem}}.navbar{--navbar-line-height: calc( var(--navbar-height) - 2 * var(--navbar-padding-v) );padding:var(--navbar-padding-v) var(--navbar-padding-h);line-height:var(--navbar-line-height)}.navbar .logo{height:var(--navbar-line-height);margin-right:var(--navbar-padding-v);vertical-align:top}.navbar 
.site-name{font-size:1.3rem;font-weight:600;color:var(--c-text);position:relative}.navbar .navbar-items-wrapper{display:flex;position:absolute;box-sizing:border-box;top:var(--navbar-padding-v);right:var(--navbar-padding-h);height:var(--navbar-line-height);padding-left:var(--navbar-padding-h);white-space:nowrap;font-size:.9rem}.navbar .navbar-items-wrapper .search-box{flex:0 0 auto;vertical-align:top}@media screen and (max-width: 719px){.navbar{padding-left:4rem}.navbar .site-name{display:block;width:calc(100vw - 11rem);overflow:hidden;white-space:nowrap;text-overflow:ellipsis}.navbar .can-hide{display:none}}.navbar-items{display:inline-block}@media print{.navbar-items{display:none}}.navbar-items a{display:inline-block;line-height:1.4rem;color:inherit}.navbar-items a:hover,.navbar-items a.route-link-active{color:var(--c-text)}.navbar-items .navbar-item{position:relative;display:inline-block;margin-left:1.5rem;line-height:var(--navbar-line-height)}.navbar-items .navbar-item:first-child{margin-left:0}.navbar-items .navbar-item>a:hover,.navbar-items .navbar-item>a.route-link-active{margin-bottom:-2px;border-bottom:2px solid var(--c-text-accent)}@media (max-width: 719px){.navbar-items .navbar-item{margin-left:0}.navbar-items .navbar-item>a:hover,.navbar-items .navbar-item>a.route-link-active{margin-bottom:0;border-bottom:none}.navbar-items a:hover,.navbar-items a.route-link-active{color:var(--c-text-accent)}}.toggle-sidebar-button{position:absolute;top:.6rem;left:1rem;display:none;padding:.6rem;cursor:pointer}.toggle-sidebar-button .icon{display:flex;flex-direction:column;justify-content:center;align-items:center;width:1.25rem;height:1.25rem;cursor:inherit}.toggle-sidebar-button .icon span{display:inline-block;width:100%;height:2px;border-radius:2px;background-color:var(--c-text);transition:transform var(--t-transform)}.toggle-sidebar-button .icon span:nth-child(2){margin:6px 0}@media screen and (max-width: 
719px){.toggle-sidebar-button{display:block}}.toggle-color-mode-button{display:flex;margin:auto;margin-left:1rem;border:0;background:none;color:var(--c-text);opacity:.8;cursor:pointer}@media print{.toggle-color-mode-button{display:none}}.toggle-color-mode-button:hover{opacity:1}.toggle-color-mode-button .icon{width:1.25rem;height:1.25rem}.DocSearch{transition:background-color var(--t-color)}.navbar-dropdown-wrapper{cursor:pointer}.navbar-dropdown-wrapper .navbar-dropdown-title,.navbar-dropdown-wrapper .navbar-dropdown-title-mobile{display:block;font-size:.9rem;font-family:inherit;cursor:inherit;padding:inherit;line-height:1.4rem;background:transparent;border:none;font-weight:500;color:var(--c-text)}.navbar-dropdown-wrapper .navbar-dropdown-title:hover,.navbar-dropdown-wrapper .navbar-dropdown-title-mobile:hover{border-color:transparent}.navbar-dropdown-wrapper .navbar-dropdown-title .arrow,.navbar-dropdown-wrapper .navbar-dropdown-title-mobile .arrow{vertical-align:middle;margin-top:-1px;margin-left:.4rem}.navbar-dropdown-wrapper .navbar-dropdown-title-mobile{display:none;font-weight:600;font-size:inherit}.navbar-dropdown-wrapper .navbar-dropdown-title-mobile:hover{color:var(--c-text-accent)}.navbar-dropdown-wrapper .navbar-dropdown .navbar-dropdown-item{color:inherit;line-height:1.7rem}.navbar-dropdown-wrapper .navbar-dropdown .navbar-dropdown-item .navbar-dropdown-subtitle{margin:.45rem 0 0;border-top:1px solid var(--c-border);padding:1rem 0 .45rem;font-size:.9rem}.navbar-dropdown-wrapper .navbar-dropdown .navbar-dropdown-item .navbar-dropdown-subtitle>span{padding:0 1.5rem 0 1.25rem}.navbar-dropdown-wrapper .navbar-dropdown .navbar-dropdown-item .navbar-dropdown-subtitle>a{font-weight:inherit}.navbar-dropdown-wrapper .navbar-dropdown .navbar-dropdown-item .navbar-dropdown-subtitle>a.route-link-active:after{display:none}.navbar-dropdown-wrapper .navbar-dropdown .navbar-dropdown-item 
.navbar-dropdown-subitem-wrapper{padding:0;list-style:none}.navbar-dropdown-wrapper .navbar-dropdown .navbar-dropdown-item .navbar-dropdown-subitem-wrapper .navbar-dropdown-subitem{font-size:.9em}.navbar-dropdown-wrapper .navbar-dropdown .navbar-dropdown-item a{display:block;line-height:1.7rem;position:relative;border-bottom:none;font-weight:400;margin-bottom:0;padding:0 1.5rem 0 1.25rem}.navbar-dropdown-wrapper .navbar-dropdown .navbar-dropdown-item a:hover,.navbar-dropdown-wrapper .navbar-dropdown .navbar-dropdown-item a.route-link-active{color:var(--c-text-accent)}.navbar-dropdown-wrapper .navbar-dropdown .navbar-dropdown-item a.route-link-active:after{content:"";width:0;height:0;border-left:5px solid var(--c-text-accent);border-top:3px solid transparent;border-bottom:3px solid transparent;position:absolute;top:calc(50% - 2px);left:9px}.navbar-dropdown-wrapper .navbar-dropdown .navbar-dropdown-item:first-child .navbar-dropdown-subtitle{margin-top:0;padding-top:0;border-top:0}.navbar-dropdown-wrapper.mobile.open .navbar-dropdown-title,.navbar-dropdown-wrapper.mobile.open .navbar-dropdown-title-mobile{margin-bottom:.5rem}.navbar-dropdown-wrapper.mobile .navbar-dropdown-title,.navbar-dropdown-wrapper.mobile .navbar-dropdown-title-mobile{display:none}.navbar-dropdown-wrapper.mobile .navbar-dropdown-title-mobile{display:block}.navbar-dropdown-wrapper.mobile .navbar-dropdown{transition:height .1s ease-out;overflow:hidden}.navbar-dropdown-wrapper.mobile .navbar-dropdown .navbar-dropdown-item .navbar-dropdown-subtitle{border-top:0;margin-top:0;padding-top:0;padding-bottom:0}.navbar-dropdown-wrapper.mobile .navbar-dropdown .navbar-dropdown-item .navbar-dropdown-subtitle,.navbar-dropdown-wrapper.mobile .navbar-dropdown .navbar-dropdown-item>a{font-size:15px;line-height:2rem}.navbar-dropdown-wrapper.mobile .navbar-dropdown .navbar-dropdown-item 
.navbar-dropdown-subitem{font-size:14px;padding-left:1rem}.navbar-dropdown-wrapper:not(.mobile){height:1.8rem}.navbar-dropdown-wrapper:not(.mobile):hover .navbar-dropdown,.navbar-dropdown-wrapper:not(.mobile).open .navbar-dropdown{display:block!important}.navbar-dropdown-wrapper:not(.mobile).open:blur{display:none}.navbar-dropdown-wrapper:not(.mobile) .navbar-dropdown{display:none;height:auto!important;box-sizing:border-box;max-height:calc(100vh - 2.7rem);overflow-y:auto;position:absolute;top:100%;right:0;background-color:var(--c-bg-navbar);padding:.6rem 0;border:1px solid var(--c-border);border-bottom-color:var(--c-border-dark);text-align:left;border-radius:.25rem;white-space:nowrap;margin:0}.page{padding-bottom:2rem;display:block}.page .theme-default-content{max-width:var(--content-width);margin:0 auto;padding:2rem 2.5rem;padding-top:0}@media (max-width: 959px){.page .theme-default-content{padding:2rem}}@media (max-width: 419px){.page .theme-default-content{padding:1.5rem}}.sidebar ul{padding:0;margin:0;list-style-type:none}.sidebar a{display:inline-block}.sidebar .navbar-items{display:none;border-bottom:1px solid var(--c-border);transition:border-color var(--t-color);padding:.5rem 0 .75rem}.sidebar .navbar-items a{font-weight:600}.sidebar .navbar-items .navbar-item{display:block;line-height:1.25rem;font-size:1.1em;padding:.5rem 0 .5rem 1.5rem}.sidebar .sidebar-items{padding:1.5rem 0}@media (max-width: 719px){.sidebar .navbar-items{display:block}.sidebar .navbar-items .navbar-dropdown-wrapper .navbar-dropdown .navbar-dropdown-item a.route-link-active:after{top:calc(1rem - 2px)}.sidebar .sidebar-items{padding:1rem 0}}.sidebar-item{cursor:default;border-left:.25rem solid transparent;color:var(--c-text)}.sidebar-item:focus-visible{outline-width:1px;outline-offset:-1px}.sidebar-item.active:not(p.sidebar-heading){font-weight:600;color:var(--c-text-accent);border-left-color:var(--c-text-accent)}.sidebar-item.sidebar-heading{transition:color .15s 
ease;font-size:1.1em;font-weight:700;padding:.35rem 1.5rem .35rem 1.25rem;width:100%;box-sizing:border-box;margin:0}.sidebar-item.sidebar-heading+.sidebar-item-children{transition:height .1s ease-out;overflow:hidden;margin-bottom:.75rem}.sidebar-item.collapsible{cursor:pointer}.sidebar-item.collapsible .arrow{position:relative;top:-.12em;left:.5em}.sidebar-item:not(.sidebar-heading){font-size:1em;font-weight:400;display:inline-block;margin:0;padding:.35rem 1rem .35rem 2rem;line-height:1.4;width:100%;box-sizing:border-box}.sidebar-item:not(.sidebar-heading)+.sidebar-item-children{padding-left:1rem;font-size:.95em}.sidebar-item-children .sidebar-item-children .sidebar-item:not(.sidebar-heading){padding:.25rem 1rem .25rem 1.75rem}.sidebar-item-children .sidebar-item-children .sidebar-item:not(.sidebar-heading).active{font-weight:500;border-left-color:transparent}a.sidebar-heading+.sidebar-item-children .sidebar-item:not(.sidebar-heading).active{border-left-color:transparent}a.sidebar-item{cursor:pointer}a.sidebar-item:hover{color:var(--c-text-accent)}.table-of-contents .badge{vertical-align:middle}.dropdown-enter-from,.dropdown-leave-to{height:0!important}.fade-slide-y-enter-active{transition:all .2s ease}.fade-slide-y-leave-active{transition:all .2s cubic-bezier(1,.5,.8,1)}.fade-slide-y-enter-from,.fade-slide-y-leave-to{transform:translateY(10px);opacity:0}:root{scroll-behavior:smooth}:root{--search-bg-color: #ffffff;--search-accent-color: #3eaf7c;--search-text-color: #2c3e50;--search-border-color: #eaecef;--search-item-text-color: #5d81a5;--search-item-focus-bg-color: #f3f4f5;--search-input-width: 8rem;--search-result-width: 20rem}.search-box{display:inline-block;position:relative;margin-left:1rem}@media print{.search-box{display:none}}.search-box input{-webkit-appearance:none;-moz-appearance:none;appearance:none;cursor:text;width:var(--search-input-width);height:2rem;color:var(--search-text-color);display:inline-block;border:1px solid 
var(--search-border-color);border-radius:2rem;font-size:.9rem;line-height:2rem;padding:0 .5rem 0 2rem;outline:none;transition:all ease .3s;background:var(--search-bg-color) url("data:image/svg+xml,%3c?xml%20version='1.0'%20encoding='UTF-8'?%3e%3csvg%20xmlns='http://www.w3.org/2000/svg'%20width='12'%20height='13'%3e%3cg%20stroke-width='2'%20stroke='%23aaa'%20fill='none'%3e%3cpath%20d='M11.29%2011.71l-4-4'/%3e%3ccircle%20cx='5'%20cy='5'%20r='4'/%3e%3c/g%3e%3c/svg%3e") .6rem .5rem no-repeat;background-size:1rem}@media (max-width: 719px){.search-box input{cursor:pointer;width:0;border-color:transparent;position:relative}}.search-box input:focus{cursor:auto;border-color:var(--search-accent-color)}@media (max-width: 719px){.search-box input:focus{cursor:text;left:0;width:10rem}}@media (max-width: 419px){.search-box input:focus{width:8rem}}.search-box .suggestions{background:var(--search-bg-color);width:var(--search-result-width);position:absolute;top:2rem;right:0;border:1px solid var(--search-border-color);border-radius:6px;padding:.4rem;list-style-type:none}@media (max-width: 419px){.search-box .suggestions{width:calc(100vw - 4rem);right:-.5rem}}.search-box .suggestion{line-height:1.4;padding:.4rem .6rem;border-radius:4px;cursor:pointer}.search-box .suggestion a{white-space:normal;color:var(--search-item-text-color)}.search-box .suggestion.focus{background-color:var(--search-item-focus-bg-color)}.search-box .suggestion.focus a{color:var(--search-accent-color)}.search-box .suggestion .page-title{font-weight:600}.search-box .suggestion .page-header{font-size:.9em;margin-left:.25em}.npm-badge[data-v-c758b2a0]{margin-right:.5rem}
assets/tacotron2.html-Ds1AKES7.js ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as e,r as l,o as t,c as o,a as s,d as n,b as p,e as i}from"./app-DTS6SjJz.js";const r={},c=s("h1",{id:"tts-demo-for-espnet-easy",tabindex:"-1"},[s("a",{class:"header-anchor",href:"#tts-demo-for-espnet-easy"},[s("span",null,"TTS demo for ESPnet-Easy!")])],-1),d=s("p",null,"In this notebook, we will demonstrate how to train an Text to Speech (TTS) model using the LJSpeech dataset. Basic flow of data preparation and training is the same with ASR.",-1),D={href:"https://keithito.com/LJ-Speech-Dataset/",target:"_blank",rel:"noopener noreferrer"},u=s("code",null,"/hdd/dataset/",-1),y=s("code",null,"/hdd/dataset/",-1),v=i(`<h2 id="data-preparation" tabindex="-1"><a class="header-anchor" href="#data-preparation"><span>Data preparation</span></a></h2><p>First, let&#39;s create dump files!<br> The format of the dump files is the same as the ASR dump files.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">{</span></span>
2
+ <span class="line"><span style="color:#CE9178;"> &quot;data_name&quot;</span><span style="color:#D4D4D4;">: [</span><span style="color:#CE9178;">&quot;dump_file_name&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;dump_format&quot;</span><span style="color:#D4D4D4;">]</span></span>
3
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
4
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> os</span></span>
5
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> espnetez </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> ez</span></span>
6
+ <span class="line"></span>
7
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> local.data_prep </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> get_dataset</span></span>
8
+ <span class="line"></span>
9
+ <span class="line"></span>
10
+ <span class="line"><span style="color:#D4D4D4;">DUMP_DIR = </span><span style="color:#CE9178;">&quot;./dump/ljspeech&quot;</span></span>
11
+ <span class="line"><span style="color:#D4D4D4;">LJS_DIRS = </span><span style="color:#CE9178;">&quot;/hdd/database/LJSpeech-1.1&quot;</span></span>
12
+ <span class="line"><span style="color:#D4D4D4;">data_info = {</span></span>
13
+ <span class="line"><span style="color:#CE9178;"> &quot;speech&quot;</span><span style="color:#D4D4D4;">: [</span><span style="color:#CE9178;">&quot;wav.scp&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;sound&quot;</span><span style="color:#D4D4D4;">],</span></span>
14
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: [</span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">],</span></span>
15
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
16
+ <span class="line"></span>
17
+ <span class="line"><span style="color:#D4D4D4;">train_dataset, test_dataset = get_dataset(LJS_DIRS)</span></span>
18
+ <span class="line"></span>
19
+ <span class="line"><span style="color:#D4D4D4;">train_dir = os.path.join(DUMP_DIR, </span><span style="color:#CE9178;">&quot;train&quot;</span><span style="color:#D4D4D4;">)</span></span>
20
+ <span class="line"><span style="color:#D4D4D4;">test_dir = os.path.join(DUMP_DIR, </span><span style="color:#CE9178;">&quot;test&quot;</span><span style="color:#D4D4D4;">)</span></span>
21
+ <span class="line"></span>
22
+ <span class="line"><span style="color:#D4D4D4;">ez.data.create_dump_file(train_dir, train_dataset, data_info)</span></span>
23
+ <span class="line"><span style="color:#D4D4D4;">ez.data.create_dump_file(test_dir, test_dataset, data_info)</span></span>
24
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="generate-token-list" tabindex="-1"><a class="header-anchor" href="#generate-token-list"><span>Generate token list</span></a></h2><p>To generate a token list, we need to run <code>espnet2.bin.tokenize_text</code> script. ESPnet-Easy has a wrapper function for this script.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># generate training texts from the training data</span></span>
25
+ <span class="line"><span style="color:#6A9955;"># you can select several datasets to train sentencepiece.</span></span>
26
+ <span class="line"><span style="color:#D4D4D4;">ez.preprocess.prepare_sentences([</span><span style="color:#CE9178;">&quot;dump/ljspeech/train/text&quot;</span><span style="color:#D4D4D4;">], </span><span style="color:#CE9178;">&quot;data/&quot;</span><span style="color:#D4D4D4;">)</span></span>
27
+ <span class="line"><span style="color:#D4D4D4;">ez.preprocess.tokenize(</span></span>
28
+ <span class="line"><span style="color:#9CDCFE;"> input</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;data/train.txt&quot;</span><span style="color:#D4D4D4;">,</span></span>
29
+ <span class="line"><span style="color:#9CDCFE;"> output</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;data/tokenized.txt&quot;</span><span style="color:#D4D4D4;">,</span></span>
30
+ <span class="line"><span style="color:#9CDCFE;"> token_type</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;phn&quot;</span><span style="color:#D4D4D4;">,</span></span>
31
+ <span class="line"><span style="color:#9CDCFE;"> cleaner</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;tacotron&quot;</span><span style="color:#D4D4D4;">,</span></span>
32
+ <span class="line"><span style="color:#9CDCFE;"> g2p</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;g2p_en&quot;</span></span>
33
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
34
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="training" tabindex="-1"><a class="header-anchor" href="#training"><span>Training</span></a></h2><p>To prepare the stats file before training, you can execute the <code>collect_stats</code> method. This step is required before the training process and ensuring accurate statistics for the model.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">EXP_DIR = </span><span style="color:#CE9178;">&quot;exp/train_tts&quot;</span></span>
35
+ <span class="line"><span style="color:#D4D4D4;">STATS_DIR = </span><span style="color:#CE9178;">&quot;exp/stats&quot;</span></span>
36
+ <span class="line"></span>
37
+ <span class="line"><span style="color:#6A9955;"># load config</span></span>
38
+ <span class="line"><span style="color:#D4D4D4;">training_config = ez.config.from_yaml(</span></span>
39
+ <span class="line"><span style="color:#CE9178;"> &quot;tts&quot;</span><span style="color:#D4D4D4;">,</span></span>
40
+ <span class="line"><span style="color:#CE9178;"> &quot;tacotron2.yaml&quot;</span><span style="color:#D4D4D4;">,</span></span>
41
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
42
+ <span class="line"><span style="color:#C586C0;">with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;data/tokenized.txt&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
43
+ <span class="line"><span style="color:#D4D4D4;"> training_config[</span><span style="color:#CE9178;">&quot;token_list&quot;</span><span style="color:#D4D4D4;">] = [t.replace(</span><span style="color:#CE9178;">&quot;</span><span style="color:#D7BA7D;">\\n</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> t </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> f.readlines()]</span></span>
44
+ <span class="line"></span>
45
+ <span class="line"><span style="color:#6A9955;"># Define the Trainer class</span></span>
46
+ <span class="line"><span style="color:#D4D4D4;">trainer = ez.Trainer(</span></span>
47
+ <span class="line"><span style="color:#9CDCFE;"> task</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;tts&#39;</span><span style="color:#D4D4D4;">,</span></span>
48
+ <span class="line"><span style="color:#9CDCFE;"> train_config</span><span style="color:#D4D4D4;">=training_config,</span></span>
49
+ <span class="line"><span style="color:#9CDCFE;"> train_dump_dir</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;dump/ljspeech/train&quot;</span><span style="color:#D4D4D4;">,</span></span>
50
+ <span class="line"><span style="color:#9CDCFE;"> valid_dump_dir</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;dump/ljspeech/test&quot;</span><span style="color:#D4D4D4;">,</span></span>
51
+ <span class="line"><span style="color:#9CDCFE;"> data_info</span><span style="color:#D4D4D4;">=data_info,</span></span>
52
+ <span class="line"><span style="color:#9CDCFE;"> output_dir</span><span style="color:#D4D4D4;">=EXP_DIR,</span></span>
53
+ <span class="line"><span style="color:#9CDCFE;"> stats_dir</span><span style="color:#D4D4D4;">=STATS_DIR,</span></span>
54
+ <span class="line"><span style="color:#9CDCFE;"> ngpu</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
55
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
56
+ <span class="line"><span style="color:#D4D4D4;">trainer.collect_stats()</span></span>
57
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Finally, we are ready to begin the training process!</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">trainer.train()</span></span>
58
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h2 id="inference" tabindex="-1"><a class="header-anchor" href="#inference"><span>Inference</span></a></h2><p>You can just use the inference API of the ESPnet.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> soundfile </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> sf</span></span>
59
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.tts_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Text2Speech</span></span>
60
+ <span class="line"></span>
61
+ <span class="line"><span style="color:#D4D4D4;">m = Text2Speech(</span></span>
62
+ <span class="line"><span style="color:#CE9178;"> &quot;./exp/finetune/config.yaml&quot;</span><span style="color:#D4D4D4;">,</span></span>
63
+ <span class="line"><span style="color:#CE9178;"> &quot;./exp/finetune/valid.loss.ave.pth&quot;</span><span style="color:#D4D4D4;">,</span></span>
64
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
65
+ <span class="line"></span>
66
+ <span class="line"><span style="color:#D4D4D4;">text = </span><span style="color:#CE9178;">&quot;hello world&quot;</span></span>
67
+ <span class="line"><span style="color:#D4D4D4;">output = m(text)[</span><span style="color:#CE9178;">&#39;wav&#39;</span><span style="color:#D4D4D4;">]</span></span>
68
+ <span class="line"><span style="color:#D4D4D4;">sf.write(</span><span style="color:#CE9178;">&quot;output.wav&quot;</span><span style="color:#D4D4D4;">, output, </span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">)</span></span>
69
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div>`,15);function m(h,b){const a=l("ExternalLinkIcon");return t(),o("div",null,[c,d,s("p",null,[n("Before proceeding, please ensure that you have already downloaded the LJSpeech dataset from "),s("a",D,[n("here"),p(a)]),n(" and have placed the data in a directory of your choice. In this notebook, we assume that you have stored the dataset in the "),u,n(" directory. If your dataset is located in a different directory, please make sure to replace "),y,n(" with the actual path to your dataset.")]),v])}const E=e(r,[["render",m],["__file","tacotron2.html.vue"]]),_=JSON.parse('{"path":"/espnetez/tts/tacotron2.html","title":"TTS demo for ESPnet-Easy!","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Data preparation","slug":"data-preparation","link":"#data-preparation","children":[]},{"level":2,"title":"Generate token list","slug":"generate-token-list","link":"#generate-token-list","children":[]},{"level":2,"title":"Training","slug":"training","link":"#training","children":[]},{"level":2,"title":"Inference","slug":"inference","link":"#inference","children":[]}],"git":{},"filePathRelative":"espnetez/tts/tacotron2.md"}');export{E as comp,_ as data};
assets/train.html-BQ-t2Cs4.js ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as e,r as l,o as p,c as o,a as s,d as n,b as i,e as t}from"./app-DTS6SjJz.js";const r={},c=s("h1",{id:"sample-demo-for-espnet-easy",tabindex:"-1"},[s("a",{class:"header-anchor",href:"#sample-demo-for-espnet-easy"},[s("span",null,"Sample demo for ESPnet-Easy!")])],-1),d=s("p",null,"In this notebook, we will demonstrate how to train an Automatic Speech Recognition (ASR) model using the Librispeech-100 dataset. The process in this notebook follows the same dataset preparation approach as the kaldi-style dataset. If you are interested in fine-tuning pretrained models, please refer to the libri100_finetune.ipynb file.",-1),D={href:"https://www.openslr.org/12",target:"_blank",rel:"noopener noreferrer"},u=s("code",null,"/hdd/dataset/",-1),y=s("code",null,"/hdd/dataset/",-1),v=t(`<h2 id="data-preparation" tabindex="-1"><a class="header-anchor" href="#data-preparation"><span>Data Preparation</span></a></h2><p>This notebook follows the data preparation steps outlined in <code>asr.sh</code>. Initially, we will create a dump file to store information about the data, including the data ID, audio path, and transcriptions.</p><p>ESPnet-Easy supports various types of datasets, including:</p><ol><li><p>Dictionary-based dataset with the following structure:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">{</span></span>
2
+ <span class="line"><span style="color:#CE9178;"> &quot;data_id&quot;</span><span style="color:#D4D4D4;">: {</span></span>
3
+ <span class="line"><span style="color:#CE9178;"> &quot;speech&quot;</span><span style="color:#D4D4D4;">: path_to_speech_file,</span></span>
4
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: transcription</span></span>
5
+ <span class="line"><span style="color:#D4D4D4;"> }</span></span>
6
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
7
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div></li><li><p>List of datasets with the following structure:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">[</span></span>
8
+ <span class="line"><span style="color:#D4D4D4;"> {</span></span>
9
+ <span class="line"><span style="color:#CE9178;"> &quot;speech&quot;</span><span style="color:#D4D4D4;">: path_to_speech_file,</span></span>
10
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: transcription</span></span>
11
+ <span class="line"><span style="color:#D4D4D4;"> }</span></span>
12
+ <span class="line"><span style="color:#D4D4D4;">]</span></span>
13
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div></li></ol><p>If you choose to use a dictionary-based dataset, it&#39;s essential to ensure that each <code>data_id</code> is unique. ESPnet-Easy also accepts a dump file that may have already been created by <code>asr.sh</code>. However, in this notebook, we will create the dump file from scratch.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># Need to install espnet if you don&#39;t have it</span></span>
14
+ <span class="line"><span style="color:#D4D4D4;">%pip install -U espnet</span></span>
15
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>Now, let&#39;s create dump files!<br> Please note that you will need to provide a dictionary to specify the file path and type for each data. This dictionary should have the following format:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">{</span></span>
16
+ <span class="line"><span style="color:#CE9178;"> &quot;data_name&quot;</span><span style="color:#D4D4D4;">: [</span><span style="color:#CE9178;">&quot;dump_file_name&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;dump_format&quot;</span><span style="color:#D4D4D4;">]</span></span>
17
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
18
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> os</span></span>
19
+ <span class="line"></span>
20
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> espnetez </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> ez</span></span>
21
+ <span class="line"></span>
22
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> local.data_prep </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> create_dataset</span></span>
23
+ <span class="line"></span>
24
+ <span class="line"></span>
25
+ <span class="line"><span style="color:#D4D4D4;">DUMP_DIR = </span><span style="color:#CE9178;">&quot;./dump/libri100&quot;</span></span>
26
+ <span class="line"><span style="color:#D4D4D4;">LIBRI_100_DIRS = [</span></span>
27
+ <span class="line"><span style="color:#D4D4D4;"> [</span><span style="color:#CE9178;">&quot;/hdd/database/librispeech-100/LibriSpeech/train-clean-100&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;train&quot;</span><span style="color:#D4D4D4;">],</span></span>
28
+ <span class="line"><span style="color:#D4D4D4;"> [</span><span style="color:#CE9178;">&quot;/hdd/database/librispeech-100/LibriSpeech/dev-clean&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;dev-clean&quot;</span><span style="color:#D4D4D4;">],</span></span>
29
+ <span class="line"><span style="color:#D4D4D4;"> [</span><span style="color:#CE9178;">&quot;/hdd/database/librispeech-100/LibriSpeech/dev-other&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;dev-other&quot;</span><span style="color:#D4D4D4;">],</span></span>
30
+ <span class="line"><span style="color:#D4D4D4;">]</span></span>
31
+ <span class="line"><span style="color:#D4D4D4;">data_info = {</span></span>
32
+ <span class="line"><span style="color:#CE9178;"> &quot;speech&quot;</span><span style="color:#D4D4D4;">: [</span><span style="color:#CE9178;">&quot;wav.scp&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;sound&quot;</span><span style="color:#D4D4D4;">],</span></span>
33
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: [</span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">],</span></span>
34
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
35
+ <span class="line"></span>
36
+ <span class="line"></span>
37
+ <span class="line"><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> d, n </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> LIBRI_100_DIRS:</span></span>
38
+ <span class="line"><span style="color:#D4D4D4;"> dump_dir = os.path.join(DUMP_DIR, n)</span></span>
39
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#569CD6;"> not</span><span style="color:#D4D4D4;"> os.path.exists(dump_dir):</span></span>
40
+ <span class="line"><span style="color:#D4D4D4;"> os.makedirs(dump_dir)</span></span>
41
+ <span class="line"></span>
42
+ <span class="line"><span style="color:#D4D4D4;"> dataset = create_dataset(d)</span></span>
43
+ <span class="line"><span style="color:#D4D4D4;"> ez.data.create_dump_file(dump_dir, dataset, data_info)</span></span>
44
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>For the validation files, you have two directories: <code>dev-clean</code> and <code>dev-other</code>. To create a unified dev dataset, you can use the <code>ez.data.join_dumps</code> function.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">ez.data.join_dumps(</span></span>
45
+ <span class="line"><span style="color:#D4D4D4;"> [</span><span style="color:#CE9178;">&quot;./dump/libri100/dev-clean&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;./dump/libri100/dev-other&quot;</span><span style="color:#D4D4D4;">], </span><span style="color:#CE9178;">&quot;./dump/libri100/dev&quot;</span></span>
46
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
47
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Now you have dataset files in the <code>dump</code> directory. It looks like this:</p><p>wav.scp</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>1255-138279-0008 /hdd/database/librispeech-100/LibriSpeech/dev-other/1255/138279/1255-138279-0008.flac</span></span>
48
+ <span class="line"><span>1255-138279-0022 /hdd/database/librispeech-100/LibriSpeech/dev-other/1255/138279/1255-138279-0022.flac</span></span>
49
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>text</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>1255-138279-0008 TWO THREE</span></span>
50
+ <span class="line"><span>1255-138279-0022 IF I SAID SO OF COURSE I WILL</span></span>
51
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="train-sentencepiece-model" tabindex="-1"><a class="header-anchor" href="#train-sentencepiece-model"><span>Train sentencepiece model</span></a></h2><p>To train a SentencePiece model, we require a text file for training. Let&#39;s begin by creating the training file.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># generate training texts from the training data</span></span>
52
+ <span class="line"><span style="color:#6A9955;"># you can select several datasets to train sentencepiece.</span></span>
53
+ <span class="line"><span style="color:#D4D4D4;">ez.preprocess.prepare_sentences([</span><span style="color:#CE9178;">&quot;dump/libri100/train/text&quot;</span><span style="color:#D4D4D4;">], </span><span style="color:#CE9178;">&quot;dump/spm&quot;</span><span style="color:#D4D4D4;">)</span></span>
54
+ <span class="line"></span>
55
+ <span class="line"><span style="color:#D4D4D4;">ez.preprocess.train_sentencepiece(</span></span>
56
+ <span class="line"><span style="color:#CE9178;"> &quot;dump/spm/train.txt&quot;</span><span style="color:#D4D4D4;">,</span></span>
57
+ <span class="line"><span style="color:#CE9178;"> &quot;data/bpemodel&quot;</span><span style="color:#D4D4D4;">,</span></span>
58
+ <span class="line"><span style="color:#9CDCFE;"> vocab_size</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">5000</span><span style="color:#D4D4D4;">,</span></span>
59
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
60
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="configure-training-process" tabindex="-1"><a class="header-anchor" href="#configure-training-process"><span>Configure Training Process</span></a></h2><p>For configuring the training process, you can utilize the configuration files already provided by ESPnet contributors. To use a configuration file, you&#39;ll need to create a YAML file on your local machine. For instance, you can use the <a href="train_asr_e-branchformer_size256_mlp1024_linear1024_e12_mactrue_edrop0.0_ddrop0.0.yaml">e-branchformer config</a>.</p><p>In my case, I&#39;ve made a modification to the <code>batch_bins</code> parameter, changing it from <code>16000000</code> to <code>1600000</code> to run training on my GPU (RTX2080ti).</p><h2 id="training" tabindex="-1"><a class="header-anchor" href="#training"><span>Training</span></a></h2><p>To prepare the stats file before training, you can execute the <code>collect_stats</code> method. This step is required before the training process and ensuring accurate statistics for the model.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> espnetez </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> ez</span></span>
61
+ <span class="line"></span>
62
+ <span class="line"><span style="color:#D4D4D4;">EXP_DIR = </span><span style="color:#CE9178;">&quot;exp/train_asr_branchformer_e24_amp&quot;</span></span>
63
+ <span class="line"><span style="color:#D4D4D4;">STATS_DIR = </span><span style="color:#CE9178;">&quot;exp/stats&quot;</span></span>
64
+ <span class="line"></span>
65
+ <span class="line"><span style="color:#6A9955;"># load config</span></span>
66
+ <span class="line"><span style="color:#D4D4D4;">training_config = ez.config.from_yaml(</span></span>
67
+ <span class="line"><span style="color:#CE9178;"> &quot;asr&quot;</span><span style="color:#D4D4D4;">,</span></span>
68
+ <span class="line"><span style="color:#CE9178;"> &quot;config/train_asr_e_branchformer_size256_mlp1024_linear1024_e12_mactrue_edrop0.0_ddrop0.0.yaml&quot;</span><span style="color:#D4D4D4;">,</span></span>
69
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
70
+ <span class="line"><span style="color:#D4D4D4;">preprocessor_config = ez.utils.load_yaml(</span><span style="color:#CE9178;">&quot;config/preprocess.yaml&quot;</span><span style="color:#D4D4D4;">)</span></span>
71
+ <span class="line"><span style="color:#D4D4D4;">training_config.update(preprocessor_config)</span></span>
72
+ <span class="line"></span>
73
+ <span class="line"><span style="color:#C586C0;">with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(preprocessor_config[</span><span style="color:#CE9178;">&quot;token_list&quot;</span><span style="color:#D4D4D4;">], </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
74
+ <span class="line"><span style="color:#D4D4D4;"> training_config[</span><span style="color:#CE9178;">&quot;token_list&quot;</span><span style="color:#D4D4D4;">] = [t.replace(</span><span style="color:#CE9178;">&quot;</span><span style="color:#D7BA7D;">\\n</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> t </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> f.readlines()]</span></span>
75
+ <span class="line"></span>
76
+ <span class="line"><span style="color:#6A9955;"># Define the Trainer class</span></span>
77
+ <span class="line"><span style="color:#D4D4D4;">trainer = ez.Trainer(</span></span>
78
+ <span class="line"><span style="color:#9CDCFE;"> task</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;asr&#39;</span><span style="color:#D4D4D4;">,</span></span>
79
+ <span class="line"><span style="color:#9CDCFE;"> train_config</span><span style="color:#D4D4D4;">=training_config,</span></span>
80
+ <span class="line"><span style="color:#9CDCFE;"> train_dump_dir</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;dump/libri100/train&quot;</span><span style="color:#D4D4D4;">,</span></span>
81
+ <span class="line"><span style="color:#9CDCFE;"> valid_dump_dir</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;dump/libri100/dev&quot;</span><span style="color:#D4D4D4;">,</span></span>
82
+ <span class="line"><span style="color:#9CDCFE;"> data_info</span><span style="color:#D4D4D4;">=data_info,</span></span>
83
+ <span class="line"><span style="color:#9CDCFE;"> output_dir</span><span style="color:#D4D4D4;">=EXP_DIR,</span></span>
84
+ <span class="line"><span style="color:#9CDCFE;"> stats_dir</span><span style="color:#D4D4D4;">=STATS_DIR,</span></span>
85
+ <span class="line"><span style="color:#9CDCFE;"> ngpu</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
86
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
87
+ <span class="line"><span style="color:#D4D4D4;">trainer.collect_stats()</span></span>
88
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Finally, we are ready to begin the training process!</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">trainer.train()</span></span>
89
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h2 id="inference" tabindex="-1"><a class="header-anchor" href="#inference"><span>Inference</span></a></h2><p>You can just use the inference API of the ESPnet.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa</span></span>
90
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.asr_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
91
+ <span class="line"></span>
92
+ <span class="line"><span style="color:#D4D4D4;">m = Speech2Text(</span></span>
93
+ <span class="line"><span style="color:#CE9178;"> &quot;./exp/train_asr_branchformer_e24_amp/config.yaml&quot;</span><span style="color:#D4D4D4;">,</span></span>
94
+ <span class="line"><span style="color:#CE9178;"> &quot;./exp/train_asr_branchformer_e24_amp/valid.acc.best.pth&quot;</span><span style="color:#D4D4D4;">,</span></span>
95
+ <span class="line"><span style="color:#9CDCFE;"> beam_size</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">10</span></span>
96
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
97
+ <span class="line"></span>
98
+ <span class="line"><span style="color:#C586C0;">with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;./dump/libri100/dev/wav.scp&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
99
+ <span class="line"><span style="color:#D4D4D4;"> sample_path = f.readlines()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
100
+ <span class="line"><span style="color:#D4D4D4;"> </span></span>
101
+ <span class="line"><span style="color:#D4D4D4;">y, sr = librosa.load(sample_path.split()[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">], </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">mono</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">True</span><span style="color:#D4D4D4;">)</span></span>
102
+ <span class="line"><span style="color:#D4D4D4;">output = m(y)</span></span>
103
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(output[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">])</span></span>
104
+ <span class="line"></span>
105
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div>`,30);function m(b,h){const a=l("ExternalLinkIcon");return p(),o("div",null,[c,d,s("p",null,[n("Before proceeding, please ensure that you have already downloaded the Librispeech-100 dataset from "),s("a",D,[n("OpenSLR"),i(a)]),n(" and have placed the data in a directory of your choice. In this notebook, we assume that you have stored the dataset in the "),u,n(" directory. If your dataset is located in a different directory, please make sure to replace "),y,n(" with the actual path to your dataset.")]),v])}const C=e(r,[["render",m],["__file","train.html.vue"]]),_=JSON.parse('{"path":"/espnetez/asr/train.html","title":"Sample demo for ESPnet-Easy!","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Data Preparation","slug":"data-preparation","link":"#data-preparation","children":[]},{"level":2,"title":"Train sentencepiece model","slug":"train-sentencepiece-model","link":"#train-sentencepiece-model","children":[]},{"level":2,"title":"Configure Training Process","slug":"configure-training-process","link":"#configure-training-process","children":[]},{"level":2,"title":"Training","slug":"training","link":"#training","children":[]},{"level":2,"title":"Inference","slug":"inference","link":"#inference","children":[]}],"git":{},"filePathRelative":"espnetez/asr/train.md"}');export{C as comp,_ as data};
assets/tts_cli.html-BfB21gs4.js ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as i,r as l,o as r,c as o,a,d as e,b as n,e as t}from"./app-DTS6SjJz.js";const p={},d=a("h1",{id:"text-to-speech-recipe",tabindex:"-1"},[a("a",{class:"header-anchor",href:"#text-to-speech-recipe"},[a("span",null,"Text-to-Speech (Recipe)")])],-1),c=a("p",null,[e("This is the example notebook of how-to-run the ESPnet TTS recipe using an4 dataset."),a("br"),e(" You can understand the overview of TTS recipe through this notebook within an hour!")],-1),u=a("p",null,"See also:",-1),v={href:"https://espnet.github.io/espnet",target:"_blank",rel:"noopener noreferrer"},g={href:"https://github.com/espnet",target:"_blank",rel:"noopener noreferrer"},m={href:"https://github.com/kan-bayashi",target:"_blank",rel:"noopener noreferrer"},h=t(`<p>Last update: 2019/07/25</p><h2 id="setup-envrionment" tabindex="-1"><a class="header-anchor" href="#setup-envrionment"><span>Setup envrionment</span></a></h2><p>First, let&#39;s setup the environmet to run the recipe.<br> It take around 10 minues. Please keep waiting for a while.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># OS setup</span></span>
2
+ <span class="line"><span>!sudo apt-get install bc tree</span></span>
3
+ <span class="line"><span>!cat /etc/os-release</span></span>
4
+ <span class="line"><span></span></span>
5
+ <span class="line"><span># espnet setup</span></span>
6
+ <span class="line"><span>!git clone https://github.com/espnet/espnet</span></span>
7
+ <span class="line"><span>!cd espnet; pip install -e .</span></span>
8
+ <span class="line"><span></span></span>
9
+ <span class="line"><span># warp ctc setup</span></span>
10
+ <span class="line"><span>!git clone https://github.com/espnet/warp-ctc -b pytorch-1.1</span></span>
11
+ <span class="line"><span>!cd warp-ctc &amp;&amp; mkdir build &amp;&amp; cd build &amp;&amp; cmake .. &amp;&amp; make -j</span></span>
12
+ <span class="line"><span>!cd warp-ctc/pytorch_binding &amp;&amp; python setup.py install </span></span>
13
+ <span class="line"><span></span></span>
14
+ <span class="line"><span># kaldi setup</span></span>
15
+ <span class="line"><span>!cd /content/espnet/tools; git clone https://github.com/kaldi-asr/kaldi</span></span>
16
+ <span class="line"><span>!echo &quot;&quot; &gt; ./espnet/tools/kaldi/tools/extras/check_dependencies.sh # ignore check</span></span>
17
+ <span class="line"><span>!chmod +x ./espnet/tools/kaldi/tools/extras/check_dependencies.sh</span></span>
18
+ <span class="line"><span>!cd ./espnet/tools/kaldi/tools; make sph2pipe sclite</span></span>
19
+ <span class="line"><span>!rm -rf espnet/tools/kaldi/tools/python</span></span>
20
+ <span class="line"><span>!wget https://18-198329952-gh.circle-artifacts.com/0/home/circleci/repo/ubuntu16-featbin.tar.gz</span></span>
21
+ <span class="line"><span>!tar -xf ./ubuntu16-featbin.tar.gz # take a few minutes</span></span>
22
+ <span class="line"><span>!cp featbin/* espnet/tools/kaldi/src/featbin/</span></span>
23
+ <span class="line"><span></span></span>
24
+ <span class="line"><span># make dummy activate</span></span>
25
+ <span class="line"><span>!mkdir -p espnet/tools/venv/bin</span></span>
26
+ <span class="line"><span>!touch espnet/tools/venv/bin/activate</span></span>
27
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="run-the-recipe" tabindex="-1"><a class="header-anchor" href="#run-the-recipe"><span>Run the recipe</span></a></h2><p>Now ready to run the recipe!<br> We use the most simplest recipe <code>egs/an4/tts1</code> as an example.</p><blockquote><p>Unfortunately, <code>egs/an4/tts1</code> is too small to generate reasonable speech.<br> But you can understand the flow or TTS recipe through this recipe since all of the TTS recipes has the exactly same flow.</p></blockquote><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Let&#39;s go to an4 recipe!</span></span>
28
+ <span class="line"><span>import os</span></span>
29
+ <span class="line"><span>os.chdir(&quot;/content/espnet/egs/an4/tts1&quot;)</span></span>
30
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Before running the recipe, let us check the recipe structure.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!tree -L 1</span></span>
31
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Each recipe has the same structure and files.</p><ul><li><strong>run.sh</strong>: Main script of the recipe. Once you run this script, all of the processing will be conducted from data download, preparation, feature extraction, training, and decoding.</li><li><strong>cmd.sh</strong>: Command configuration source file about how-to-run each processing. You can modify this script if you want to run the script through job control system e.g. Slurm or Torque.</li><li><strong>path.sh</strong>: Path configuration source file. Basically, we do not have to touch.</li><li><strong>conf/</strong>: Directory containing configuration files.</li><li><strong>local/</strong>: Directory containing the recipe-specific scripts e.g. data preparation.</li><li><strong>steps/</strong> and <strong>utils/</strong>: Directory containing kaldi tools.</li></ul><p>Main script <strong>run.sh</strong> consists of several stages:</p><ul><li><strong>stage -1</strong>: Download data if the data is available online.</li><li><strong>stage 0</strong>: Prepare data to make kaldi-stype data directory.</li><li><strong>stage 1</strong>: Extract feature vector, calculate statistics, and perform normalization.</li><li><strong>stage 2</strong>: Prepare a dictionary and make json files for training.</li><li><strong>stage 3</strong>: Train the E2E-TTS network.</li><li><strong>stage 4</strong>: Decode mel-spectrogram using the trained network.</li><li><strong>stage 5</strong>: Generate a waveform from a generated mel-spectrogram using Griffin-Lim.</li></ul><p>Currently, we support the following networks:</p>`,15),b={href:"https://arxiv.org/abs/1712.05884",target:"_blank",rel:"noopener noreferrer"},x={href:"https://arxiv.org/pdf/1809.08895.pdf",target:"_blank",rel:"noopener noreferrer"},_={href:"https://arxiv.org/pdf/1905.09263.pdf",target:"_blank",rel:"noopener 
noreferrer"},k=t(`<p>Let us check each stage step-by-step via <strong>--stage</strong> and <strong>--stop_stage</strong> options!</p><h3 id="stage-1-data-download" tabindex="-1"><a class="header-anchor" href="#stage-1-data-download"><span>Stage -1: Data download</span></a></h3><p>This stage downloads dataset if the dataset is available online.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!./run.sh --stage -1 --stop_stage -1</span></span>
32
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!tree -L 1</span></span>
33
+ <span class="line"><span>!ls downloads/</span></span>
34
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>You can see <strong>downloads</strong> directory is cretead, which containing donwloaded an4 dataset.</p><h3 id="stage-0-data-preparation" tabindex="-1"><a class="header-anchor" href="#stage-0-data-preparation"><span>Stage 0: Data preparation</span></a></h3><p>This stage creates kaldi-style data directories.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!./run.sh --stage 0 --stop_stage 0</span></span>
35
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!tree -L 1 data</span></span>
36
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Through the data preparation stage, kaldi-style data directories will be created.<br> Here, <strong>data/train/</strong> is corresponding to training set, and <strong>data/test</strong> is corresponding to evaluation set.<br> Each directory has the same following files:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls data/*</span></span>
37
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>The above four files are all we have to prepare to create new recipes.<br> Let&#39;s check each file.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!head -n 3 data/train/{wav.scp,text,utt2spk,spk2utt}</span></span>
38
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Each file contains the following information:</p><ul><li><strong>wav.scp</strong>: List of audio path. Each line has <code>&lt;utt_id&gt; &lt;wavfile_path or command pipe&gt;</code>. <code>&lt;utt_id&gt;</code> must be unique.</li><li><strong>text</strong>: List of transcriptions. Each line has <code>&lt;utt_id&gt; &lt;transcription&gt;</code>. In the case of TTS, we assume that <code>&lt;transcription&gt;</code> is cleaned.</li><li><strong>utt2spk</strong>: List of correspondence table between utterances and speakers. Each line has <code>&lt;utt_id&gt; &lt;speaker_id&gt;</code>.</li><li><strong>spk2utt</strong>: List of correspondence table between speakers and utterances. Each lien has <code>&lt;speaker_id&gt; &lt;utt_id&gt; ... &lt;utt_id&gt; </code>. This file can be automatically created from <strong>utt2spk</strong>.</li></ul><p>In the ESPnet, speaker information is not used for any processing.<br> Therefore, <strong>utt2spk</strong> and <strong>spk2utt</strong> can be a dummy.</p><h3 id="stage-1-feature-extration" tabindex="-1"><a class="header-anchor" href="#stage-1-feature-extration"><span>Stage 1: Feature extration</span></a></h3><p>This stage performs the following processing:</p><ol><li>Mel-spectrogram extraction</li><li>Data split into training and validation set</li><li>Statistics (mean and variance) calculation</li><li>Normalization</li></ol><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!./run.sh --stage 1 --stop_stage 1 --nj 4</span></span>
39
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Raw filterbanks are saved in <strong>fbank/</strong> directory with ark/scp format.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls fbank</span></span>
40
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p><strong>.ark</strong> is binary file and <strong>.scp</strong> contain the correspondence between <code>&lt;utt_id&gt;</code> and <code>&lt;path_in_ark&gt;</code>.<br> Since feature extraction can be performed for split small sets in parallel, raw_fbank is split into <code>raw_fbank_*.{1..N}.{scp,ark}.</code></p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!head -n 3 fbank/raw_fbank_train.1.scp</span></span>
41
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>These files can be loaded in python via <strong>kaldiio</strong> as follows:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import kaldiio</span></span>
42
+ <span class="line"><span>import matplotlib.pyplot as plt</span></span>
43
+ <span class="line"><span></span></span>
44
+ <span class="line"><span># load scp file</span></span>
45
+ <span class="line"><span>scp_dict = kaldiio.load_scp(&quot;fbank/raw_fbank_train.1.scp&quot;)</span></span>
46
+ <span class="line"><span>for key in scp_dict:</span></span>
47
+ <span class="line"><span> plt.imshow(scp_dict[key].T[::-1])</span></span>
48
+ <span class="line"><span> plt.title(key)</span></span>
49
+ <span class="line"><span> plt.colorbar()</span></span>
50
+ <span class="line"><span> plt.show()</span></span>
51
+ <span class="line"><span> break</span></span>
52
+ <span class="line"><span> </span></span>
53
+ <span class="line"><span># load ark file</span></span>
54
+ <span class="line"><span>ark_generator = kaldiio.load_ark(&quot;fbank/raw_fbank_train.1.ark&quot;)</span></span>
55
+ <span class="line"><span>for key, array in ark_generator:</span></span>
56
+ <span class="line"><span> plt.imshow(array.T[::-1])</span></span>
57
+ <span class="line"><span> plt.title(key)</span></span>
58
+ <span class="line"><span> plt.colorbar()</span></span>
59
+ <span class="line"><span> plt.show()</span></span>
60
+ <span class="line"><span> break</span></span>
61
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>After raw mel-spectrogram extraction, some files are added in <strong>data/train/</strong>.<br><strong>feats.scp</strong> is concatenated scp file of <strong>fbank/raw_fbank_train.{1..N}.scp</strong>.<br><strong>utt2num_frames</strong> has the number of feature frames of each <code>&lt;utt_id&gt;</code>.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls data/train</span></span>
62
+ <span class="line"><span>!head -n 3 data/train/{feats.scp,utt2num_frames}</span></span>
63
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>And <strong>data/train/</strong> directory is split into two directory:</p><ul><li><strong>data/train_nodev/</strong>: data directory for training</li><li><strong>data/train_dev/</strong>: data directory for validation</li></ul><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls data</span></span>
64
+ <span class="line"><span>!ls data/train_*</span></span>
65
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>You can find <strong>cmvn.ark</strong> in <strong>data/train_nodev</strong>, which is the calculated statistics file.<br> This file also can be loaded in python via kaldiio.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># load cmvn.ark file (Be careful not load_ark, but load_mat)</span></span>
66
+ <span class="line"><span>cmvn = kaldiio.load_mat(&quot;data/train_nodev/cmvn.ark&quot;)</span></span>
67
+ <span class="line"><span></span></span>
68
+ <span class="line"><span># cmvn consists of mean and variance, the last dimension of mean represents the number of frames.</span></span>
69
+ <span class="line"><span>print(&quot;cmvn shape = &quot;+ str(cmvn.shape))</span></span>
70
+ <span class="line"><span></span></span>
71
+ <span class="line"><span># calculate mean and variance</span></span>
72
+ <span class="line"><span>mu = cmvn[0, :-1] / cmvn[0, -1]</span></span>
73
+ <span class="line"><span>var = cmvn[1, :-1] / cmvn[0, -1]</span></span>
74
+ <span class="line"><span></span></span>
75
+ <span class="line"><span># show mean</span></span>
76
+ <span class="line"><span>print(&quot;mean = &quot; + str(mu))</span></span>
77
+ <span class="line"><span>print(&quot;variance = &quot; + str(var))</span></span>
78
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Normalzed features for training, validation and evaluation set are dumped in <strong>dump/{train_nodev,train_dev,test}/</strong>.<br> There ark and scp can be loaded as the same as the above procedure.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls dump/*</span></span>
79
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="stage-2-dictionary-and-json-preparation" tabindex="-1"><a class="header-anchor" href="#stage-2-dictionary-and-json-preparation"><span>Stage 2: Dictionary and json preparation</span></a></h3><p>This stage creates dictrionary from <strong>data/train_nodev/text</strong> and makes json file for training.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!./run.sh --stage 2 --stop_stage 2</span></span>
80
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Dictrionary file will be created in <strong>data/lang_1char/</strong>.<br> Dictionary file consists of <code>&lt;token&gt;</code> <code>&lt;token index&gt;</code>.<br> Here, <code>&lt;token index&gt;</code> starts from 1 because 0 is used as padding index.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls data</span></span>
81
+ <span class="line"><span>!cat data/lang_1char/train_nodev_units.txt</span></span>
82
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>Json file will be created for training / validation /evaludation sets and they are saved as <strong>dump/{train_nodev,train_dev,test}/data.json</strong>.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls dump/*/*.json</span></span>
83
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Each json file contains all of the information in the data directory.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!head -n 27 dump/train_nodev/data.json</span></span>
84
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><ul><li>&quot;shape&quot;: Shape of the input or output sequence. Here input shape [63, 80] represents the number of frames = 63 and the dimension of mel-spectrogram = 80.</li><li>&quot;text&quot;: Original transcription.</li><li>&quot;token&quot;: Token sequence of original transcription.</li><li>&quot;tokenid&quot; Token id sequence of original transcription, which is converted using the dictionary.</li></ul><p>Now ready to start training!</p><h3 id="stage-3-network-training" tabindex="-1"><a class="header-anchor" href="#stage-3-network-training"><span>Stage 3: Network training</span></a></h3><p>This stage performs training of the network.<br> Network training configurations are written as <strong>.yaml</strong> format file.<br> Let us check the default cofiguration <strong>conf/train_pytroch_tacotron2.yaml</strong>.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cat conf/train_pytorch_tacotron2.yaml</span></span>
85
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>You can modify this configuration file to change the hyperparameters.<br> Here, let&#39;s change the number of epochs for this demonstration.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># TODO(kan-bayashi): Change here to use change_yaml.py</span></span>
86
+ <span class="line"><span>!cat conf/train_pytorch_tacotron2.yaml | sed -e &quot;s/epochs: 50/epochs: 3/g&quot; &gt; conf/train_pytorch_tacotron2_sample.yaml</span></span>
87
+ <span class="line"><span>!cat conf/train_pytorch_tacotron2_sample.yaml</span></span>
88
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Let&#39;s train the network.<br> You can specify the config file via <strong>--train_config</strong> option. It takes several minutes.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!./run.sh --stage 3 --stop_stage 3 --train_config conf/train_pytorch_tacotron2_sample.yaml --verbose 1</span></span>
89
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>You can see the training log in <code>exp/train_*/train.log</code>.</p><p>The models are saved in <code>exp/train_*/results/</code> directory.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/{results,results/att_ws}</span></span>
90
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p><code>exp/train_*/results/*.png</code> are the figures of training curve.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from IPython.display import Image, display_png</span></span>
91
+ <span class="line"><span>print(&quot;all loss curve&quot;)</span></span>
92
+ <span class="line"><span>display_png(Image(&quot;exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/results/all_loss.png&quot;))</span></span>
93
+ <span class="line"><span>print(&quot;l1 loss curve&quot;)</span></span>
94
+ <span class="line"><span>display_png(Image(&quot;exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/results/l1_loss.png&quot;))</span></span>
95
+ <span class="line"><span>print(&quot;mse loss curve&quot;)</span></span>
96
+ <span class="line"><span>display_png(Image(&quot;exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/results/mse_loss.png&quot;))</span></span>
97
+ <span class="line"><span>print(&quot;bce loss curve&quot;)</span></span>
98
+ <span class="line"><span>display_png(Image(&quot;exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/results/bce_loss.png&quot;))</span></span>
99
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p><code>exp/train_*/results/att_ws/.png</code> are the figures of attention weights in each epoch.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>print(&quot;Attention weights of initial epoch&quot;)</span></span>
100
+ <span class="line"><span>display_png(Image(&quot;exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/results/att_ws/fash-cen1-b.ep.1.png&quot;))</span></span>
101
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p><code>exp/train_*/results/model.loss.best</code> contains only the model parameters.<br> On the other hand, <code>exp/train_*/results/snapshot</code> contains the model parameters, optimizer states, and iterator states.<br> So you can restart from the training by specifying the snapshot file with <strong>--resume</strong> option.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># resume training from snapshot.ep.2</span></span>
102
+ <span class="line"><span>!./run.sh --stage 3 --stop_stage 3 --train_config conf/train_pytorch_tacotron2_sample.yaml --resume exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/results/snapshot.ep.2 --verbose 1</span></span>
103
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cat exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/train.log</span></span>
104
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Also, we support tensorboard.<br> You can see the training log through tensorboard.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>%load_ext tensorboard</span></span>
105
+ <span class="line"><span>%tensorboard --logdir tensorboard/train_nodev_pytorch_train_pytorch_tacotron2_sample/</span></span>
106
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="stage-4-network-decoding" tabindex="-1"><a class="header-anchor" href="#stage-4-network-decoding"><span>Stage 4: Network decoding</span></a></h3><p>This stage performs decoding using the trained model to generate mel-spectrogram from a given text.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!./run.sh --stage 4 --stop_stage 4 --nj 8 --train_config conf/train_pytorch_tacotron2_sample.yaml </span></span>
107
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Generated features are saved as ark/scp format.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/outputs_model.loss.best_decode/*</span></span>
108
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>We can specify the model or snapshot to be used for decoding via <strong>--model</strong>.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!./run.sh --stage 4 --stop_stage 4 --nj 8 --train_config conf/train_pytorch_tacotron2_sample.yaml --model snapshot.ep.2</span></span>
109
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/outputs_snapshot.ep.2_decode/*</span></span>
110
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="stage-5-waveform-synthesis" tabindex="-1"><a class="header-anchor" href="#stage-5-waveform-synthesis"><span>Stage 5: Waveform synthesis</span></a></h3><p>Finally, in this stage, we generate waveform using Grrifin-Lim algorithm.<br> First, we perform de-normalization to convert the generated mel-spectrogram into the original scale.<br> Then we apply Grrifin-Lim algorithm to restore phase components and apply inverse STFT to generate waveforms.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!./run.sh --stage 5 --stop_stage 5 --nj 8 --train_config conf/train_pytorch_tacotron2_sample.yaml --griffin_lim_iters 50</span></span>
111
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Generated wav files are saved in <code>exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/outputs_model.loss.best_decode_denorm/*/wav</code></p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/outputs_model.loss.best_decode_denorm/*/wav</span></span>
112
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!tree -L 3</span></span>
113
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h2 id="next-step" tabindex="-1"><a class="header-anchor" href="#next-step"><span>NEXT step</span></a></h2><ul><li>Try pretrained model to generate speech.</li><li>Try a large single speaker dataset recipe <strong>egs/ljspeech/tts1</strong>.</li><li>Try a large multi-speaker recipe <strong>egs/libritts/tts1</strong>.</li><li>Make the original recipe using your own dataset.</li></ul>`,82);function f(y,E){const s=l("ExternalLinkIcon");return r(),o("div",null,[d,c,u,a("ul",null,[a("li",null,[e("Documentaion: "),a("a",v,[e("https://espnet.github.io/espnet"),n(s)])]),a("li",null,[e("Github: "),a("a",g,[e("https://github.com/espnet"),n(s)])])]),a("p",null,[e("Author: "),a("a",m,[e("Tomoki Hayashi"),n(s)])]),h,a("ul",null,[a("li",null,[e("Tacotron2: "),a("a",b,[e("Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions"),n(s)])]),a("li",null,[e("Transformer: "),a("a",x,[e("Neural Speech Synthesis with Transformer Network"),n(s)])]),a("li",null,[e("FastSpeech: "),a("a",_,[e("FastSpeech: Fast, Robust and Controllable Text to Speech"),n(s)])])]),k])}const w=i(p,[["render",f],["__file","tts_cli.html.vue"]]),q=JSON.parse('{"path":"/espnet2/tts/tts_cli.html","title":"Text-to-Speech (Recipe)","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Setup envrionment","slug":"setup-envrionment","link":"#setup-envrionment","children":[]},{"level":2,"title":"Run the recipe","slug":"run-the-recipe","link":"#run-the-recipe","children":[{"level":3,"title":"Stage -1: Data download","slug":"stage-1-data-download","link":"#stage-1-data-download","children":[]},{"level":3,"title":"Stage 0: Data preparation","slug":"stage-0-data-preparation","link":"#stage-0-data-preparation","children":[]},{"level":3,"title":"Stage 1: Feature 
extration","slug":"stage-1-feature-extration","link":"#stage-1-feature-extration","children":[]},{"level":3,"title":"Stage 2: Dictionary and json preparation","slug":"stage-2-dictionary-and-json-preparation","link":"#stage-2-dictionary-and-json-preparation","children":[]},{"level":3,"title":"Stage 3: Network training","slug":"stage-3-network-training","link":"#stage-3-network-training","children":[]},{"level":3,"title":"Stage 4: Network decoding","slug":"stage-4-network-decoding","link":"#stage-4-network-decoding","children":[]},{"level":3,"title":"Stage 5: Waveform synthesis","slug":"stage-5-waveform-synthesis","link":"#stage-5-waveform-synthesis","children":[]}]},{"level":2,"title":"NEXT step","slug":"next-step","link":"#next-step","children":[]}],"git":{},"filePathRelative":"espnet2/tts/tts_cli.md"}');export{w as comp,q as data};
assets/tts_realtime_demo.html-BKOGq7as.js ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import{_ as i,r as l,o as p,c as d,a as n,b as e,d as s,e as t}from"./app-DTS6SjJz.js";const o={},r={href:"https://colab.research.google.com/github/espnet/notebook/blob/master/tts_realtime_demo.ipynb",target:"_blank",rel:"noopener noreferrer"},c=n("img",{src:"https://colab.research.google.com/assets/colab-badge.svg",alt:"Open In Colab"},null,-1),v=n("h1",{id:"espnet-real-time-e2e-tts-demonstration",tabindex:"-1"},[n("a",{class:"header-anchor",href:"#espnet-real-time-e2e-tts-demonstration"},[n("span",null,"ESPnet real time E2E-TTS demonstration")])],-1),u=n("p",null,"This notebook provides a demonstration of the realtime E2E-TTS using ESPnet-TTS and ParallelWaveGAN (+ MelGAN).",-1),m=n("ul",null,[n("li",null,"ESPnet: https://github.com/espnet/espnet"),n("li",null,"ParallelWaveGAN: https://github.com/kan-bayashi/ParallelWaveGAN")],-1),h={href:"https://github.com/kan-bayashi",target:"_blank",rel:"noopener noreferrer"},b=t(`<h2 id="install" tabindex="-1"><a class="header-anchor" href="#install"><span>Install</span></a></h2><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># install minimal components</span></span>
2
+ <span class="line"><span>!pip install -q parallel_wavegan PyYaml unidecode ConfigArgparse g2p_en espnet_tts_frontend</span></span>
3
+ <span class="line"><span>!pip install --upgrade --no-cache-dir gdown</span></span>
4
+ <span class="line"><span>!git clone -q https://github.com/espnet/espnet.git</span></span>
5
+ <span class="line"><span>!cd espnet &amp;&amp; git fetch &amp;&amp; git checkout -b v.0.9.1 refs/tags/v.0.9.1</span></span>
6
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><hr><h2 id="english-demo" tabindex="-1"><a class="header-anchor" href="#english-demo"><span>English demo</span></a></h2><h3 id="download-pretrained-feature-generation-model" tabindex="-1"><a class="header-anchor" href="#download-pretrained-feature-generation-model"><span>Download pretrained feature generation model</span></a></h3><p>You can select one from three models. Please only run the seletected model cells.</p><h4 id="a-tacotron2" tabindex="-1"><a class="header-anchor" href="#a-tacotron2"><span>(a) Tacotron2</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained model</span></span>
7
+ <span class="line"><span>import os</span></span>
8
+ <span class="line"><span>if not os.path.exists(&quot;downloads/en/tacotron2&quot;):</span></span>
9
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \\</span></span>
10
+ <span class="line"><span> https://drive.google.com/open?id=1lFfeyewyOsxaNO-DEWy9iSz6qB9ZS1UR downloads/en/tacotron2 tar.gz</span></span>
11
+ <span class="line"><span></span></span>
12
+ <span class="line"><span># set path</span></span>
13
+ <span class="line"><span>trans_type = &quot;phn&quot;</span></span>
14
+ <span class="line"><span>dict_path = &quot;downloads/en/tacotron2/data/lang_1phn/phn_train_no_dev_units.txt&quot;</span></span>
15
+ <span class="line"><span>model_path = &quot;downloads/en/tacotron2/exp/phn_train_no_dev_pytorch_train_pytorch_tacotron2.v3/results/model.last1.avg.best&quot;</span></span>
16
+ <span class="line"><span></span></span>
17
+ <span class="line"><span>print(&quot;sucessfully finished download.&quot;)</span></span>
18
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="b-transformer" tabindex="-1"><a class="header-anchor" href="#b-transformer"><span>(b) Transformer</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained model</span></span>
19
+ <span class="line"><span>import os</span></span>
20
+ <span class="line"><span>if not os.path.exists(&quot;downloads/en/transformer&quot;):</span></span>
21
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \\</span></span>
22
+ <span class="line"><span> https://drive.google.com/open?id=1z8KSOWVBjK-_Ws4RxVN4NTx-Buy03-7c downloads/en/transformer tar.gz</span></span>
23
+ <span class="line"><span></span></span>
24
+ <span class="line"><span># set path</span></span>
25
+ <span class="line"><span>trans_type = &quot;phn&quot;</span></span>
26
+ <span class="line"><span>dict_path = &quot;downloads/en/transformer/data/lang_1phn/phn_train_no_dev_units.txt&quot;</span></span>
27
+ <span class="line"><span>model_path = &quot;downloads/en/transformer/exp/phn_train_no_dev_pytorch_train_pytorch_transformer.v3.single/results/model.last1.avg.best&quot;</span></span>
28
+ <span class="line"><span></span></span>
29
+ <span class="line"><span>print(&quot;sucessfully finished download.&quot;)</span></span>
30
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="c-fastspeech" tabindex="-1"><a class="header-anchor" href="#c-fastspeech"><span>(c) FastSpeech</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained model</span></span>
31
+ <span class="line"><span>import os</span></span>
32
+ <span class="line"><span>if not os.path.exists(&quot;downloads/en/fastspeech&quot;):</span></span>
33
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \\</span></span>
34
+ <span class="line"><span> https://drive.google.com/open?id=1P9I4qag8wAcJiTCPawt6WCKBqUfJFtFp downloads/en/fastspeech tar.gz</span></span>
35
+ <span class="line"><span></span></span>
36
+ <span class="line"><span># set path</span></span>
37
+ <span class="line"><span>trans_type = &quot;phn&quot;</span></span>
38
+ <span class="line"><span>dict_path = &quot;downloads/en/fastspeech/data/lang_1phn/phn_train_no_dev_units.txt&quot;</span></span>
39
+ <span class="line"><span>model_path = &quot;downloads/en/fastspeech/exp/phn_train_no_dev_pytorch_train_tacotron2.v3_fastspeech.v4.single/results/model.last1.avg.best&quot;</span></span>
40
+ <span class="line"><span></span></span>
41
+ <span class="line"><span>print(&quot;Sucessfully finished download.&quot;)</span></span>
42
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="download-pretrained-vocoder-model" tabindex="-1"><a class="header-anchor" href="#download-pretrained-vocoder-model"><span>Download pretrained vocoder model</span></a></h3><p>You can select one from two models. Please only run the seletected model cells.</p><h4 id="a-parallel-wavegan" tabindex="-1"><a class="header-anchor" href="#a-parallel-wavegan"><span>(a) Parallel WaveGAN</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained model</span></span>
43
+ <span class="line"><span>import os</span></span>
44
+ <span class="line"><span>if not os.path.exists(&quot;downloads/en/parallel_wavegan&quot;):</span></span>
45
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \\</span></span>
46
+ <span class="line"><span> https://drive.google.com/open?id=1Grn7X9wD35UcDJ5F7chwdTqTa4U7DeVB downloads/en/parallel_wavegan tar.gz</span></span>
47
+ <span class="line"><span></span></span>
48
+ <span class="line"><span># set path</span></span>
49
+ <span class="line"><span>vocoder_path = &quot;downloads/en/parallel_wavegan/ljspeech.parallel_wavegan.v2/checkpoint-400000steps.pkl&quot;</span></span>
50
+ <span class="line"><span></span></span>
51
+ <span class="line"><span>print(&quot;Sucessfully finished download.&quot;)</span></span>
52
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="b-melgan" tabindex="-1"><a class="header-anchor" href="#b-melgan"><span>(b) MelGAN</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained model</span></span>
53
+ <span class="line"><span>import os</span></span>
54
+ <span class="line"><span>if not os.path.exists(&quot;downloads/en/melgan&quot;):</span></span>
55
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \\</span></span>
56
+ <span class="line"><span> https://drive.google.com/open?id=1_a8faVA5OGCzIcJNw4blQYjfG4oA9VEt downloads/en/melgan tar.gz</span></span>
57
+ <span class="line"><span></span></span>
58
+ <span class="line"><span># set path</span></span>
59
+ <span class="line"><span>vocoder_path = &quot;downloads/en/melgan/train_nodev_ljspeech_melgan.v3.long/checkpoint-4000000steps.pkl&quot;</span></span>
60
+ <span class="line"><span></span></span>
61
+ <span class="line"><span>print(&quot;Sucessfully finished download.&quot;)</span></span>
62
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="c-multi-band-melgan" tabindex="-1"><a class="header-anchor" href="#c-multi-band-melgan"><span>(c) Multi-band MelGAN</span></a></h4><p>This is an <strong>EXPERIMENTAL</strong> model.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained model</span></span>
63
+ <span class="line"><span>import os</span></span>
64
+ <span class="line"><span>if not os.path.exists(&quot;downloads/en/mb-melgan&quot;):</span></span>
65
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \\</span></span>
66
+ <span class="line"><span> https://drive.google.com/open?id=1rGG5y15uy4WZ-lJy8NPVTkmB_6VhC20V downloads/en/mb-melgan tar.gz</span></span>
67
+ <span class="line"><span></span></span>
68
+ <span class="line"><span># set path</span></span>
69
+ <span class="line"><span>vocoder_path = &quot;downloads/en/mb-melgan/train_nodev_ljspeech_multi_band_melgan.v1/checkpoint-1000000steps.pkl&quot;</span></span>
70
+ <span class="line"><span></span></span>
71
+ <span class="line"><span>print(&quot;Sucessfully finished download.&quot;)</span></span>
72
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="setup" tabindex="-1"><a class="header-anchor" href="#setup"><span>Setup</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># add path</span></span>
73
+ <span class="line"><span>import sys</span></span>
74
+ <span class="line"><span>sys.path.append(&quot;espnet&quot;)</span></span>
75
+ <span class="line"><span></span></span>
76
+ <span class="line"><span># define device</span></span>
77
+ <span class="line"><span>import torch</span></span>
78
+ <span class="line"><span>device = torch.device(&quot;cuda&quot;)</span></span>
79
+ <span class="line"><span></span></span>
80
+ <span class="line"><span># define E2E-TTS model</span></span>
81
+ <span class="line"><span>from argparse import Namespace</span></span>
82
+ <span class="line"><span>from espnet.asr.asr_utils import get_model_conf</span></span>
83
+ <span class="line"><span>from espnet.asr.asr_utils import torch_load</span></span>
84
+ <span class="line"><span>from espnet.utils.dynamic_import import dynamic_import</span></span>
85
+ <span class="line"><span>idim, odim, train_args = get_model_conf(model_path)</span></span>
86
+ <span class="line"><span>model_class = dynamic_import(train_args.model_module)</span></span>
87
+ <span class="line"><span>model = model_class(idim, odim, train_args)</span></span>
88
+ <span class="line"><span>torch_load(model_path, model)</span></span>
89
+ <span class="line"><span>model = model.eval().to(device)</span></span>
90
+ <span class="line"><span>inference_args = Namespace(**{</span></span>
91
+ <span class="line"><span> &quot;threshold&quot;: 0.5,&quot;minlenratio&quot;: 0.0, &quot;maxlenratio&quot;: 10.0,</span></span>
92
+ <span class="line"><span> # Only for Tacotron 2</span></span>
93
+ <span class="line"><span> &quot;use_attention_constraint&quot;: True, &quot;backward_window&quot;: 1,&quot;forward_window&quot;:3,</span></span>
94
+ <span class="line"><span> # Only for fastspeech (lower than 1.0 is faster speech, higher than 1.0 is slower speech)</span></span>
95
+ <span class="line"><span> &quot;fastspeech_alpha&quot;: 1.0,</span></span>
96
+ <span class="line"><span> })</span></span>
97
+ <span class="line"><span></span></span>
98
+ <span class="line"><span># define neural vocoder</span></span>
99
+ <span class="line"><span>from parallel_wavegan.utils import load_model</span></span>
100
+ <span class="line"><span>fs = 22050</span></span>
101
+ <span class="line"><span>vocoder = load_model(vocoder_path)</span></span>
102
+ <span class="line"><span>vocoder.remove_weight_norm()</span></span>
103
+ <span class="line"><span>vocoder = vocoder.eval().to(device)</span></span>
104
+ <span class="line"><span></span></span>
105
+ <span class="line"><span># define text frontend</span></span>
106
+ <span class="line"><span>from tacotron_cleaner.cleaners import custom_english_cleaners</span></span>
107
+ <span class="line"><span>from g2p_en import G2p</span></span>
108
+ <span class="line"><span>with open(dict_path) as f:</span></span>
109
+ <span class="line"><span> lines = f.readlines()</span></span>
110
+ <span class="line"><span>lines = [line.replace(&quot;\\n&quot;, &quot;&quot;).split(&quot; &quot;) for line in lines]</span></span>
111
+ <span class="line"><span>char_to_id = {c: int(i) for c, i in lines}</span></span>
112
+ <span class="line"><span>g2p = G2p()</span></span>
113
+ <span class="line"><span>def frontend(text):</span></span>
114
+ <span class="line"><span> &quot;&quot;&quot;Clean text and then convert to id sequence.&quot;&quot;&quot;</span></span>
115
+ <span class="line"><span> text = custom_english_cleaners(text)</span></span>
116
+ <span class="line"><span> </span></span>
117
+ <span class="line"><span> if trans_type == &quot;phn&quot;:</span></span>
118
+ <span class="line"><span> text = filter(lambda s: s != &quot; &quot;, g2p(text))</span></span>
119
+ <span class="line"><span> text = &quot; &quot;.join(text)</span></span>
120
+ <span class="line"><span> print(f&quot;Cleaned text: {text}&quot;)</span></span>
121
+ <span class="line"><span> charseq = text.split(&quot; &quot;)</span></span>
122
+ <span class="line"><span> else:</span></span>
123
+ <span class="line"><span> print(f&quot;Cleaned text: {text}&quot;)</span></span>
124
+ <span class="line"><span> charseq = list(text)</span></span>
125
+ <span class="line"><span> idseq = []</span></span>
126
+ <span class="line"><span> for c in charseq:</span></span>
127
+ <span class="line"><span> if c.isspace():</span></span>
128
+ <span class="line"><span> idseq += [char_to_id[&quot;&lt;space&gt;&quot;]]</span></span>
129
+ <span class="line"><span> elif c not in char_to_id.keys():</span></span>
130
+ <span class="line"><span> idseq += [char_to_id[&quot;&lt;unk&gt;&quot;]]</span></span>
131
+ <span class="line"><span> else:</span></span>
132
+ <span class="line"><span> idseq += [char_to_id[c]]</span></span>
133
+ <span class="line"><span> idseq += [idim - 1] # &lt;eos&gt;</span></span>
134
+ <span class="line"><span> return torch.LongTensor(idseq).view(-1).to(device)</span></span>
135
+ <span class="line"><span></span></span>
136
+ <span class="line"><span>import nltk</span></span>
137
+ <span class="line"><span>nltk.download(&#39;punkt&#39;)</span></span>
138
+ <span class="line"><span>print(&quot;Now ready to synthesize!&quot;)</span></span>
139
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div 
class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="synthesis" tabindex="-1"><a class="header-anchor" href="#synthesis"><span>Synthesis</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import time</span></span>
140
+ <span class="line"><span>print(&quot;Input your favorite sentence in English!&quot;)</span></span>
141
+ <span class="line"><span>input_text = input()</span></span>
142
+ <span class="line"><span>with torch.no_grad():</span></span>
143
+ <span class="line"><span> start = time.time()</span></span>
144
+ <span class="line"><span> x = frontend(input_text)</span></span>
145
+ <span class="line"><span> c, _, _ = model.inference(x, inference_args)</span></span>
146
+ <span class="line"><span> y = vocoder.inference(c)</span></span>
147
+ <span class="line"><span>rtf = (time.time() - start) / (len(y) / fs)</span></span>
148
+ <span class="line"><span>print(f&quot;RTF = {rtf:5f}&quot;)</span></span>
149
+ <span class="line"><span></span></span>
150
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
151
+ <span class="line"><span>display(Audio(y.view(-1).cpu().numpy(), rate=fs))</span></span>
152
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><hr><h2 id="japanese-demo" tabindex="-1"><a class="header-anchor" href="#japanese-demo"><span>Japanese demo</span></a></h2><h3 id="install-japanese-dependencies" tabindex="-1"><a class="header-anchor" href="#install-japanese-dependencies"><span>Install Japanese dependencies</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!pip install pyopenjtalk</span></span>
153
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="download-pretrained-models" tabindex="-1"><a class="header-anchor" href="#download-pretrained-models"><span>Download pretrained models</span></a></h3><p>Here we select Tacotron2 or Transformer. The vocoder model is Parallel WaveGAN.</p><h4 id="a-tacotron-2" tabindex="-1"><a class="header-anchor" href="#a-tacotron-2"><span>(a) Tacotron 2</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained models</span></span>
154
+ <span class="line"><span>import os</span></span>
155
+ <span class="line"><span>if not os.path.exists(&quot;downloads/jp/tacotron2&quot;):</span></span>
156
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \\</span></span>
157
+ <span class="line"><span> https://drive.google.com/open?id=1OwrUQzAmvjj1x9cDhnZPp6dqtsEqGEJM downloads/jp/tacotron2 tar.gz</span></span>
158
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \\</span></span>
159
+ <span class="line"><span> https://drive.google.com/open?id=1kp5M4VvmagDmYckFJa78WGqh1drb_P9t downloads/jp/tacotron2 tar.gz</span></span>
160
+ <span class="line"><span></span></span>
161
+ <span class="line"><span># set path</span></span>
162
+ <span class="line"><span>dict_path = &quot;downloads/jp/tacotron2/data/lang_1phn/train_no_dev_units.txt&quot;</span></span>
163
+ <span class="line"><span>model_path = &quot;downloads/jp/tacotron2/exp/train_no_dev_pytorch_train_pytorch_tacotron2_phn/results/model.last1.avg.best&quot;</span></span>
164
+ <span class="line"><span>vocoder_path = &quot;downloads/jp/tacotron2/jsut.parallel_wavegan.v1/checkpoint-400000steps.pkl&quot;</span></span>
165
+ <span class="line"><span></span></span>
166
+ <span class="line"><span>print(&quot;sucessfully finished download.&quot;)</span></span>
167
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="b-transformer-1" tabindex="-1"><a class="header-anchor" href="#b-transformer-1"><span>(b) Transformer</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained models</span></span>
168
+ <span class="line"><span>import os</span></span>
169
+ <span class="line"><span>if not os.path.exists(&quot;downloads/jp/transformer&quot;):</span></span>
170
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \\</span></span>
171
+ <span class="line"><span> https://drive.google.com/open?id=1OwrUQzAmvjj1x9cDhnZPp6dqtsEqGEJM downloads/jp/transformer tar.gz</span></span>
172
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \\</span></span>
173
+ <span class="line"><span> https://drive.google.com/open?id=1mEnZfBKqA4eT6Bn0eRZuP6lNzL-IL3VD downloads/jp/transformer tar.gz</span></span>
174
+ <span class="line"><span></span></span>
175
+ <span class="line"><span># set path</span></span>
176
+ <span class="line"><span>dict_path = &quot;downloads/jp/transformer/data/lang_1phn/train_no_dev_units.txt&quot;</span></span>
177
+ <span class="line"><span>model_path = &quot;downloads/jp/transformer/exp/train_no_dev_pytorch_train_pytorch_transformer_phn/results/model.last1.avg.best&quot;</span></span>
178
+ <span class="line"><span>vocoder_path = &quot;downloads/jp/transformer/jsut.parallel_wavegan.v1/checkpoint-400000steps.pkl&quot;</span></span>
179
+ <span class="line"><span></span></span>
180
+ <span class="line"><span>print(&quot;sucessfully finished download.&quot;)</span></span>
181
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="setup-1" tabindex="-1"><a class="header-anchor" href="#setup-1"><span>Setup</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># add path</span></span>
182
+ <span class="line"><span>import sys</span></span>
183
+ <span class="line"><span>sys.path.append(&quot;espnet&quot;)</span></span>
184
+ <span class="line"><span></span></span>
185
+ <span class="line"><span># define device</span></span>
186
+ <span class="line"><span>import torch</span></span>
187
+ <span class="line"><span>device = torch.device(&quot;cuda&quot;)</span></span>
188
+ <span class="line"><span></span></span>
189
+ <span class="line"><span># define E2E-TTS model</span></span>
190
+ <span class="line"><span>from argparse import Namespace</span></span>
191
+ <span class="line"><span>from espnet.asr.asr_utils import get_model_conf</span></span>
192
+ <span class="line"><span>from espnet.asr.asr_utils import torch_load</span></span>
193
+ <span class="line"><span>from espnet.utils.dynamic_import import dynamic_import</span></span>
194
+ <span class="line"><span>idim, odim, train_args = get_model_conf(model_path)</span></span>
195
+ <span class="line"><span>model_class = dynamic_import(train_args.model_module)</span></span>
196
+ <span class="line"><span>model = model_class(idim, odim, train_args)</span></span>
197
+ <span class="line"><span>torch_load(model_path, model)</span></span>
198
+ <span class="line"><span>model = model.eval().to(device)</span></span>
199
+ <span class="line"><span>inference_args = Namespace(**{&quot;threshold&quot;: 0.5, &quot;minlenratio&quot;: 0.0, &quot;maxlenratio&quot;: 10.0})</span></span>
200
+ <span class="line"><span></span></span>
201
+ <span class="line"><span># define neural vocoder</span></span>
202
+ <span class="line"><span>from parallel_wavegan.utils import load_model</span></span>
203
+ <span class="line"><span>fs = 24000</span></span>
204
+ <span class="line"><span>vocoder = load_model(vocoder_path)</span></span>
205
+ <span class="line"><span>vocoder.remove_weight_norm()</span></span>
206
+ <span class="line"><span>vocoder = vocoder.eval().to(device)</span></span>
207
+ <span class="line"><span></span></span>
208
+ <span class="line"><span># define text frontend</span></span>
209
+ <span class="line"><span>import pyopenjtalk</span></span>
210
+ <span class="line"><span>with open(dict_path) as f:</span></span>
211
+ <span class="line"><span> lines = f.readlines()</span></span>
212
+ <span class="line"><span>lines = [line.replace(&quot;\\n&quot;, &quot;&quot;).split(&quot; &quot;) for line in lines]</span></span>
213
+ <span class="line"><span>char_to_id = {c: int(i) for c, i in lines}</span></span>
214
+ <span class="line"><span>def frontend(text):</span></span>
215
+ <span class="line"><span> &quot;&quot;&quot;Clean text and then convert to id sequence.&quot;&quot;&quot;</span></span>
216
+ <span class="line"><span> text = pyopenjtalk.g2p(text, kana=False)</span></span>
217
+ <span class="line"><span> print(f&quot;Cleaned text: {text}&quot;)</span></span>
218
+ <span class="line"><span> charseq = text.split(&quot; &quot;)</span></span>
219
+ <span class="line"><span> idseq = []</span></span>
220
+ <span class="line"><span> for c in charseq:</span></span>
221
+ <span class="line"><span> if c.isspace():</span></span>
222
+ <span class="line"><span> idseq += [char_to_id[&quot;&lt;space&gt;&quot;]]</span></span>
223
+ <span class="line"><span> elif c not in char_to_id.keys():</span></span>
224
+ <span class="line"><span> idseq += [char_to_id[&quot;&lt;unk&gt;&quot;]]</span></span>
225
+ <span class="line"><span> else:</span></span>
226
+ <span class="line"><span> idseq += [char_to_id[c]]</span></span>
227
+ <span class="line"><span> idseq += [idim - 1] # &lt;eos&gt;</span></span>
228
+ <span class="line"><span> return torch.LongTensor(idseq).view(-1).to(device)</span></span>
229
+ <span class="line"><span></span></span>
230
+ <span class="line"><span>frontend(&quot;初回の辞書のインストールが必要です&quot;)</span></span>
231
+ <span class="line"><span>print(&quot;Now ready to synthesize!&quot;)</span></span>
232
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="synthesis-1" tabindex="-1"><a class="header-anchor" href="#synthesis-1"><span>Synthesis</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span 
class="line"><span>import time</span></span>
233
+ <span class="line"><span>print(&quot;日本語で好きな文章を入力してください&quot;)</span></span>
234
+ <span class="line"><span>input_text = input()</span></span>
235
+ <span class="line"><span></span></span>
236
+ <span class="line"><span>with torch.no_grad():</span></span>
237
+ <span class="line"><span> start = time.time()</span></span>
238
+ <span class="line"><span> x = frontend(input_text)</span></span>
239
+ <span class="line"><span> c, _, _ = model.inference(x, inference_args)</span></span>
240
+ <span class="line"><span> y = vocoder.inference(c)</span></span>
241
+ <span class="line"><span>rtf = (time.time() - start) / (len(y) / fs)</span></span>
242
+ <span class="line"><span>print(f&quot;RTF = {rtf:5f}&quot;)</span></span>
243
+ <span class="line"><span></span></span>
244
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
245
+ <span class="line"><span>display(Audio(y.view(-1).cpu().numpy(), rate=fs))</span></span>
246
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><hr><h2 id="mandarin-demo" tabindex="-1"><a class="header-anchor" href="#mandarin-demo"><span>Mandarin demo</span></a></h2><p><strong>IMPORTANT NOTE</strong>: The author cannot understand Mandarin. The text front-end part might have some bugs.</p><h3 id="install-mandarin-dependencies" tabindex="-1"><a class="header-anchor" href="#install-mandarin-dependencies"><span>Install Mandarin dependencies</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!pip install pypinyin</span></span>
247
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="download-pretrained-models-1" tabindex="-1"><a class="header-anchor" href="#download-pretrained-models-1"><span>Download pretrained models</span></a></h3><p>You can select Transformer or FastSpeech.</p><h4 id="a-transformer" tabindex="-1"><a class="header-anchor" href="#a-transformer"><span>(a) Transformer</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained models</span></span>
248
+ <span class="line"><span>import os</span></span>
249
+ <span class="line"><span>if not os.path.exists(&quot;downloads/zh/transformer&quot;):</span></span>
250
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \\</span></span>
251
+ <span class="line"><span> https://drive.google.com/open?id=10M6H88jEUGbRWBmU1Ff2VaTmOAeL8CEy downloads/zh/transformer tar.gz</span></span>
252
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \\</span></span>
253
+ <span class="line"><span> https://drive.google.com/open?id=1bTSygvonv5TS6-iuYsOIUWpN2atGnyhZ downloads/zh/transformer tar.gz</span></span>
254
+ <span class="line"><span></span></span>
255
+ <span class="line"><span># set path</span></span>
256
+ <span class="line"><span>dict_path = &quot;downloads/zh/transformer/data/lang_phn/train_no_dev_units.txt&quot;</span></span>
257
+ <span class="line"><span>model_path = &quot;downloads/zh/transformer/exp/train_no_dev_pytorch_train_pytorch_transformer.v1.single/results/model.last1.avg.best&quot;</span></span>
258
+ <span class="line"><span>vocoder_path = &quot;downloads/zh/transformer/csmsc.parallel_wavegan.v1/checkpoint-400000steps.pkl&quot;</span></span>
259
+ <span class="line"><span></span></span>
260
+ <span class="line"><span>print(&quot;sucessfully finished download.&quot;)</span></span>
261
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="b-fastspeech" tabindex="-1"><a class="header-anchor" href="#b-fastspeech"><span>(b) FastSpeech</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained models</span></span>
262
+ <span class="line"><span>import os</span></span>
263
+ <span class="line"><span>if not os.path.exists(&quot;downloads/zh/fastspeech&quot;):</span></span>
264
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \\</span></span>
265
+ <span class="line"><span> https://drive.google.com/open?id=10M6H88jEUGbRWBmU1Ff2VaTmOAeL8CEy downloads/zh/fastspeech tar.gz</span></span>
266
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \\</span></span>
267
+ <span class="line"><span> https://drive.google.com/open?id=1T8thxkAxjGFPXPWPTcKLvHnd6lG0-82R downloads/zh/fastspeech tar.gz </span></span>
268
+ <span class="line"><span></span></span>
269
+ <span class="line"><span># set path</span></span>
270
+ <span class="line"><span>dict_path = &quot;downloads/zh/fastspeech/data/lang_phn/train_no_dev_units.txt&quot;</span></span>
271
+ <span class="line"><span>model_path = &quot;downloads/zh/fastspeech/exp/train_no_dev_pytorch_train_fastspeech.v3.single/results/model.last1.avg.best&quot;</span></span>
272
+ <span class="line"><span>vocoder_path = &quot;downloads/zh/fastspeech/csmsc.parallel_wavegan.v1/checkpoint-400000steps.pkl&quot;</span></span>
273
+ <span class="line"><span></span></span>
274
+ <span class="line"><span>print(&quot;sucessfully finished download.&quot;)</span></span>
275
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="setup-2" tabindex="-1"><a class="header-anchor" href="#setup-2"><span>Setup</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># add path</span></span>
276
+ <span class="line"><span>import sys</span></span>
277
+ <span class="line"><span>sys.path.append(&quot;espnet&quot;)</span></span>
278
+ <span class="line"><span></span></span>
279
+ <span class="line"><span># define device</span></span>
280
+ <span class="line"><span>import torch</span></span>
281
+ <span class="line"><span>device = torch.device(&quot;cuda&quot;)</span></span>
282
+ <span class="line"><span></span></span>
283
+ <span class="line"><span># define E2E-TTS model</span></span>
284
+ <span class="line"><span>from argparse import Namespace</span></span>
285
+ <span class="line"><span>from espnet.asr.asr_utils import get_model_conf</span></span>
286
+ <span class="line"><span>from espnet.asr.asr_utils import torch_load</span></span>
287
+ <span class="line"><span>from espnet.utils.dynamic_import import dynamic_import</span></span>
288
+ <span class="line"><span>idim, odim, train_args = get_model_conf(model_path)</span></span>
289
+ <span class="line"><span>model_class = dynamic_import(train_args.model_module)</span></span>
290
+ <span class="line"><span>model = model_class(idim, odim, train_args)</span></span>
291
+ <span class="line"><span>torch_load(model_path, model)</span></span>
292
+ <span class="line"><span>model = model.eval().to(device)</span></span>
293
+ <span class="line"><span>inference_args = Namespace(**{&quot;threshold&quot;: 0.5, &quot;minlenratio&quot;: 0.0, &quot;maxlenratio&quot;: 10.0})</span></span>
294
+ <span class="line"><span></span></span>
295
+ <span class="line"><span># define neural vocoder</span></span>
296
+ <span class="line"><span>from parallel_wavegan.utils import load_model</span></span>
297
+ <span class="line"><span>fs = 24000</span></span>
298
+ <span class="line"><span>vocoder = load_model(vocoder_path)</span></span>
299
+ <span class="line"><span>vocoder.remove_weight_norm()</span></span>
300
+ <span class="line"><span>vocoder = vocoder.eval().to(device)</span></span>
301
+ <span class="line"><span></span></span>
302
+ <span class="line"><span># define text frontend</span></span>
303
+ <span class="line"><span>from pypinyin import pinyin, Style</span></span>
304
+ <span class="line"><span>from pypinyin.style._utils import get_initials, get_finals</span></span>
305
+ <span class="line"><span>with open(dict_path) as f:</span></span>
306
+ <span class="line"><span> lines = f.readlines()</span></span>
307
+ <span class="line"><span>lines = [line.replace(&quot;\\n&quot;, &quot;&quot;).split(&quot; &quot;) for line in lines]</span></span>
308
+ <span class="line"><span>char_to_id = {c: int(i) for c, i in lines}</span></span>
309
+ <span class="line"><span>def frontend(text):</span></span>
310
+ <span class="line"><span> &quot;&quot;&quot;Clean text and then convert to id sequence.&quot;&quot;&quot;</span></span>
311
+ <span class="line"><span> text = pinyin(text, style=Style.TONE3)</span></span>
312
+ <span class="line"><span> text = [c[0] for c in text]</span></span>
313
+ <span class="line"><span> print(f&quot;Cleaned text: {text}&quot;)</span></span>
314
+ <span class="line"><span> idseq = []</span></span>
315
+ <span class="line"><span> for x in text:</span></span>
316
+ <span class="line"><span> c_init = get_initials(x, strict=True)</span></span>
317
+ <span class="line"><span> c_final = get_finals(x, strict=True)</span></span>
318
+ <span class="line"><span> for c in [c_init, c_final]:</span></span>
319
+ <span class="line"><span> if len(c) == 0:</span></span>
320
+ <span class="line"><span> continue</span></span>
321
+ <span class="line"><span> c = c.replace(&quot;ü&quot;, &quot;v&quot;)</span></span>
322
+ <span class="line"><span> c = c.replace(&quot;ui&quot;, &quot;uei&quot;)</span></span>
323
+ <span class="line"><span> c = c.replace(&quot;un&quot;, &quot;uen&quot;)</span></span>
324
+ <span class="line"><span> c = c.replace(&quot;iu&quot;, &quot;iou&quot;)</span></span>
325
+ <span class="line"><span> # Special rule: &quot;e5n&quot; -&gt; &quot;en5&quot;</span></span>
326
+ <span class="line"><span> if &quot;5&quot; in c:</span></span>
327
+ <span class="line"><span> c = c.replace(&quot;5&quot;, &quot;&quot;) + &quot;5&quot;</span></span>
328
+ <span class="line"><span> if c not in char_to_id.keys():</span></span>
329
+ <span class="line"><span> print(f&quot;WARN: {c} is not included in dict.&quot;)</span></span>
330
+ <span class="line"><span> idseq += [char_to_id[&quot;&lt;unk&gt;&quot;]]</span></span>
331
+ <span class="line"><span> else:</span></span>
332
+ <span class="line"><span> idseq += [char_to_id[c]]</span></span>
333
+ <span class="line"><span> idseq += [idim - 1] # &lt;eos&gt;</span></span>
334
+ <span class="line"><span> return torch.LongTensor(idseq).view(-1).to(device)</span></span>
335
+ <span class="line"><span></span></span>
336
+ <span class="line"><span>print(&quot;now ready to synthesize!&quot;)</span></span>
337
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div 
class="line-number"></div></div></div><h3 id="synthesis-2" tabindex="-1"><a class="header-anchor" href="#synthesis-2"><span>Synthesis</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import time</span></span>
338
+ <span class="line"><span>print(&quot;請用中文輸入您喜歡的句子!&quot;)</span></span>
339
+ <span class="line"><span>input_text = input()</span></span>
340
+ <span class="line"><span></span></span>
341
+ <span class="line"><span>with torch.no_grad():</span></span>
342
+ <span class="line"><span> start = time.time()</span></span>
343
+ <span class="line"><span> x = frontend(input_text)</span></span>
344
+ <span class="line"><span> c, _, _ = model.inference(x, inference_args)</span></span>
345
+ <span class="line"><span> y = vocoder.inference(c)</span></span>
346
+ <span class="line"><span>rtf = (time.time() - start) / (len(y) / fs)</span></span>
347
+ <span class="line"><span>print(f&quot;RTF = {rtf:5f}&quot;)</span></span>
348
+ <span class="line"><span></span></span>
349
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
350
+ <span class="line"><span>display(Audio(y.view(-1).cpu().numpy(), rate=fs))</span></span>
351
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div>`,54);function _(g,f){const a=l("ExternalLinkIcon");return p(),d("div",null,[n("p",null,[n("a",r,[c,e(a)])]),v,u,m,n("p",null,[s("Author: Tomoki Hayashi ("),n("a",h,[s("@kan-bayashi"),e(a)]),s(")")]),b])}const x=i(o,[["render",_],["__file","tts_realtime_demo.html.vue"]]),w=JSON.parse('{"path":"/espnet2/tts/tts_realtime_demo.html","title":"ESPnet real time E2E-TTS demonstration","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Install","slug":"install","link":"#install","children":[]},{"level":2,"title":"English demo","slug":"english-demo","link":"#english-demo","children":[{"level":3,"title":"Download pretrained feature generation model","slug":"download-pretrained-feature-generation-model","link":"#download-pretrained-feature-generation-model","children":[]},{"level":3,"title":"Download pretrained vocoder model","slug":"download-pretrained-vocoder-model","link":"#download-pretrained-vocoder-model","children":[]},{"level":3,"title":"Setup","slug":"setup","link":"#setup","children":[]},{"level":3,"title":"Synthesis","slug":"synthesis","link":"#synthesis","children":[]}]},{"level":2,"title":"Japanese demo","slug":"japanese-demo","link":"#japanese-demo","children":[{"level":3,"title":"Install Japanese dependencies","slug":"install-japanese-dependencies","link":"#install-japanese-dependencies","children":[]},{"level":3,"title":"Download pretrained 
models","slug":"download-pretrained-models","link":"#download-pretrained-models","children":[]},{"level":3,"title":"Setup","slug":"setup-1","link":"#setup-1","children":[]},{"level":3,"title":"Synthesis","slug":"synthesis-1","link":"#synthesis-1","children":[]}]},{"level":2,"title":"Mandarin demo","slug":"mandarin-demo","link":"#mandarin-demo","children":[{"level":3,"title":"Install Mandarin dependencies","slug":"install-mandarin-dependencies","link":"#install-mandarin-dependencies","children":[]},{"level":3,"title":"Download pretrained models","slug":"download-pretrained-models-1","link":"#download-pretrained-models-1","children":[]},{"level":3,"title":"Setup","slug":"setup-2","link":"#setup-2","children":[]},{"level":3,"title":"Synthesis","slug":"synthesis-2","link":"#synthesis-2","children":[]}]}],"git":{},"filePathRelative":"espnet2/tts/tts_realtime_demo.md"}');export{x as comp,w as data};
browserconfig.xml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <browserconfig>
3
+ <msapplication>
4
+ <tile>
5
+ <square150x150logo src="/images/icons/mstile-150x150.png"/>
6
+ <TileColor>#ffffff</TileColor>
7
+ </tile>
8
+ </msapplication>
9
+ </browserconfig>
espnet2/asr/asr_cli.html ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>Speech Recognition (Recipe) | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/asr_cli.html-BA-xBrC-.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" 
as="script"><link rel="prefetch" href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li 
class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span 
class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 1.002 0 0 0-.175.016a13.096 13.096 
0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime 
demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech 
Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" 
type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SE <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement 
Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SLU <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading active">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link route-link-active sidebar-item active" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#abstract" aria-label="Abstract"><!--[--><!--[--><!--]--> Abstract <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#installation" aria-label="Installation"><!--[--><!--[--><!--]--> Installation <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#espnet-command-line-usage-espnet-egs-xxx" aria-label="ESPnet command line usage (espnet/egs/xxx)"><!--[--><!--[--><!--]--> ESPnet command line usage (espnet/egs/xxx) <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#stage-0-2-data-preparation" aria-label="Stage 0 - 2 Data preparation"><!--[--><!--[--><!--]--> Stage 0 - 2 Data preparation 
<!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="#kaldi-style-directory-structure" aria-label="Kaldi-style directory structure"><!--[--><!--[--><!--]--> Kaldi-style directory structure <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#tips-essential-files-in-data-preparation" aria-label="TIPS: essential files in data preparation"><!--[--><!--[--><!--]--> TIPS: essential files in data preparation <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#raw-text-list" aria-label="raw text list"><!--[--><!--[--><!--]--> raw text list <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#tips-explore-datasets-with-data-json" aria-label="TIPS: explore datasets with data.json"><!--[--><!--[--><!--]--> TIPS: explore datasets with data.json <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#stage-3-4-nn-training" aria-label="Stage 3 - 4 NN Training"><!--[--><!--[--><!--]--> Stage 3 - 4 NN Training <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#tips-change-yaml-py" aria-label="TIPS: change_yaml.py"><!--[--><!--[--><!--]--> TIPS: change_yaml.py <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#tips-tensorboard" aria-label="TIPS: tensorboard"><!--[--><!--[--><!--]--> TIPS: tensorboard <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#decoding-and-evaluation" aria-label="Decoding and evaluation"><!--[--><!--[--><!--]--> Decoding and evaluation <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#recognize-speech-from-python" aria-label="Recognize speech from python"><!--[--><!--[--><!--]--> Recognize speech from python <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul></li><li><a 
class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">OTHERS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ST <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main 
class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><h1 id="speech-recognition-recipe" tabindex="-1"><a class="header-anchor" href="#speech-recognition-recipe"><span>Speech Recognition (Recipe)</span></a></h1><p>Author: <a href="https://github.com/ShigekiKarita" target="_blank" rel="noopener noreferrer">Shigeki Karita<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></p><p>July 29 2019</p><p>ESPnet Hackathon 2019 @Tokyo</p><h2 id="abstract" tabindex="-1"><a class="header-anchor" href="#abstract"><span>Abstract</span></a></h2><p>This example shows you a practical ASR example using ESPnet as a command line interface, and also as a library.</p><p>See also</p><ul><li>documetation https://espnet.github.io/espnet/</li><li>github https://github.com/espnet</li></ul><h2 id="installation" tabindex="-1"><a class="header-anchor" href="#installation"><span>Installation</span></a></h2><p>ESPnet depends on Kaldi ASR toolkit and Warp-CTC. This will take a few minutes.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># OS setup</span></span>
35
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">sudo apt-get install bc tree</span></span>
36
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cat /etc/os-release</span></span>
37
+ <span class="line"></span>
38
+ <span class="line"><span style="color:#6A9955;"># espnet setup</span></span>
39
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git clone https://github.com/espnet/espnet</span></span>
40
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd espnet; pip install -e .</span></span>
41
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">mkdir -p espnet/tools/venv/</span><span style="color:#DCDCAA;">bin</span><span style="color:#D4D4D4;">; touch espnet/tools/venv/</span><span style="color:#DCDCAA;">bin</span><span style="color:#D4D4D4;">/activate</span></span>
42
+ <span class="line"></span>
43
+ <span class="line"><span style="color:#6A9955;"># warp ctc setup</span></span>
44
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git clone https://github.com/espnet/warp-ctc -b pytorch-</span><span style="color:#B5CEA8;">1.1</span></span>
45
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd warp-ctc </span><span style="color:#F44747;">&amp;&amp;</span><span style="color:#D4D4D4;"> mkdir build </span><span style="color:#F44747;">&amp;&amp;</span><span style="color:#D4D4D4;"> cd build </span><span style="color:#F44747;">&amp;&amp;</span><span style="color:#D4D4D4;"> cmake .. </span><span style="color:#F44747;">&amp;&amp;</span><span style="color:#D4D4D4;"> make -j4</span></span>
46
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd warp-ctc/pytorch_binding </span><span style="color:#F44747;">&amp;&amp;</span><span style="color:#D4D4D4;"> python setup.py install </span></span>
47
+ <span class="line"></span>
48
+ <span class="line"><span style="color:#6A9955;"># kaldi setup</span></span>
49
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd ./espnet/tools; git clone https://github.com/kaldi-asr/kaldi</span></span>
50
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">echo </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;"> &gt; ./espnet/tools/kaldi/tools/extras/check_dependencies.sh </span><span style="color:#6A9955;"># ignore check</span></span>
51
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">chmod +x ./espnet/tools/kaldi/tools/extras/check_dependencies.sh</span></span>
52
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd ./espnet/tools/kaldi/tools; make sph2pipe sclite</span></span>
53
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">rm -rf espnet/tools/kaldi/tools/python</span></span>
54
+ <span class="line"><span style="color:#D4D4D4;">![ ! -e ubuntu16-featbin.tar.gz ] </span><span style="color:#F44747;">&amp;&amp;</span><span style="color:#D4D4D4;"> wget https://</span><span style="color:#B5CEA8;">18</span><span style="color:#D4D4D4;">-</span><span style="color:#B5CEA8;">198329952</span><span style="color:#D4D4D4;">-gh.circle-artifacts.com/</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">/home/circleci/repo/ubuntu16-featbin.tar.gz</span></span>
55
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">tar -xf ./ubuntu16-featbin.tar.gz</span></span>
56
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cp featbin/* espnet/tools/kaldi/src/featbin/</span></span>
57
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="espnet-command-line-usage-espnet-egs-xxx" tabindex="-1"><a class="header-anchor" href="#espnet-command-line-usage-espnet-egs-xxx"><span>ESPnet command line usage (espnet/egs/xxx)</span></a></h2><p>You can use the end-to-end script <code>run.sh</code> for reproducing systems reported in <code>espnet/egs/*/asr1/RESULTS.md</code>. Typically, we organize <code>run.sh</code> with several stages:</p><ol start="0"><li>Data download (if available)</li><li>Kaldi-style data preparation</li><li>Save python-friendly data (e.g., JSON, HDF5, etc)</li><li>Language model training</li><li>ASR model training</li><li>Decoding and evaluation</li></ol><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">ls espnet/egs</span></span>
58
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="stage-0-2-data-preparation" tabindex="-1"><a class="header-anchor" href="#stage-0-2-data-preparation"><span>Stage 0 - 2 Data preparation</span></a></h3><p>For example, if you add <code>--stop-stage 2</code>, you can stop the script before neural network training.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd espnet/egs/an4/asr1; ./run.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">ngpu </span><span style="color:#B5CEA8;">1</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop-stage </span><span style="color:#B5CEA8;">2</span></span>
59
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h2 id="kaldi-style-directory-structure" tabindex="-1"><a class="header-anchor" href="#kaldi-style-directory-structure"><span>Kaldi-style directory structure</span></a></h2><p>Always we organize each recipe placed in <code>egs/xxx/asr1</code> in Kaldi way:</p><ul><li><code>conf/</code>: kaldi configurations, e.g., speech feature</li><li><code>data/</code>: almost raw <a href="https://kaldi-asr.org/doc/data_prep.html" target="_blank" rel="noopener noreferrer">data prepared by Kaldi<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></li><li><code>exp/</code>: intermidiate files through experiments, e.g., log files, model parameters</li><li><code>fbank/</code>: speech feature binary files, e.g., <a href="https://kaldi-asr.org/doc/io.html" target="_blank" rel="noopener noreferrer">ark, scp<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></li><li><code>dump/</code>: 
ESPnet meta data for tranining, e.g., json, hdf5</li><li><code>local/</code>: corpus specific data preparation scripts</li><li><a href="https://github.com/kaldi-asr/kaldi/tree/master/egs/wsj/s5/steps" target="_blank" rel="noopener noreferrer">steps/<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a>, <a href="https://github.com/kaldi-asr/kaldi/tree/master/egs/wsj/s5/utils" target="_blank" rel="noopener noreferrer">utils/<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a>: Kaldi&#39;s helper scripts</li></ul><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">tree -L </span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;"> espnet/egs/an4/asr1</span></span>
60
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="tips-essential-files-in-data-preparation" tabindex="-1"><a class="header-anchor" href="#tips-essential-files-in-data-preparation"><span>TIPS: essential files in data preparation</span></a></h3><p>To create a new recipe, all you need is stage 1 that creates key-value pair files:</p><ul><li>speech<code>data/xxx/wav.scp</code></li><li>text<code>data/xxx/text</code></li></ul><h4 id="raw-speech-file-list" tabindex="-1"><a class="header-anchor" href="#raw-speech-file-list"><span>raw speech file list</span></a></h4><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">head espnet/egs/an4/asr1/data/train/wav.scp</span></span>
61
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="raw-text-list" tabindex="-1"><a class="header-anchor" href="#raw-text-list"><span>raw text list</span></a></h3><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">head espnet/egs/an4/asr1/data/train/text</span></span>
62
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="tips-explore-datasets-with-data-json" tabindex="-1"><a class="header-anchor" href="#tips-explore-datasets-with-data-json"><span>TIPS: explore datasets with data.json</span></a></h3><p>To explore datasets easily, ESPnet stores metadata <code>dump/xxx/data.json</code> in the stage 2.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> json</span></span>
63
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> matplotlib.pyplot </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> plt</span></span>
64
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> kaldiio</span></span>
65
+ <span class="line"></span>
66
+ <span class="line"><span style="color:#6A9955;"># load 10-th speech/text in data.json</span></span>
67
+ <span class="line"><span style="color:#D4D4D4;">root = </span><span style="color:#CE9178;">&quot;espnet/egs/an4/asr1&quot;</span></span>
68
+ <span class="line"><span style="color:#C586C0;">with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(root + </span><span style="color:#CE9178;">&quot;/dump/test/deltafalse/data.json&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
69
+ <span class="line"><span style="color:#D4D4D4;"> test_json = json.load(f)[</span><span style="color:#CE9178;">&quot;utts&quot;</span><span style="color:#D4D4D4;">]</span></span>
70
+ <span class="line"><span style="color:#D4D4D4;"> </span></span>
71
+ <span class="line"><span style="color:#D4D4D4;">key, info = </span><span style="color:#4EC9B0;">list</span><span style="color:#D4D4D4;">(test_json.items())[</span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;">]</span></span>
72
+ <span class="line"></span>
73
+ <span class="line"><span style="color:#6A9955;"># plot the speech feature</span></span>
74
+ <span class="line"><span style="color:#D4D4D4;">fbank = kaldiio.load_mat(info[</span><span style="color:#CE9178;">&quot;input&quot;</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#CE9178;">&quot;feat&quot;</span><span style="color:#D4D4D4;">])</span></span>
75
+ <span class="line"><span style="color:#D4D4D4;">plt.matshow(fbank.T[::-</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">])</span></span>
76
+ <span class="line"><span style="color:#D4D4D4;">plt.title(key + </span><span style="color:#CE9178;">&quot;: &quot;</span><span style="color:#D4D4D4;"> + info[</span><span style="color:#CE9178;">&quot;output&quot;</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">])</span></span>
77
+ <span class="line"></span>
78
+ <span class="line"><span style="color:#6A9955;"># print the key-value pair</span></span>
79
+ <span class="line"><span style="color:#D4D4D4;">key, info</span></span>
80
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="stage-3-4-nn-training" tabindex="-1"><a class="header-anchor" href="#stage-3-4-nn-training"><span>Stage 3 - 4 NN Training</span></a></h3><p>Let&#39;s go to the most interesting part...</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">tail espnet/egs/an4/asr1/conf/train_mtlalpha1.0.yaml</span></span>
81
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd espnet/egs/an4/asr1; ./run.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">ngpu </span><span style="color:#B5CEA8;">1</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">3</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop-stage </span><span style="color:#B5CEA8;">4</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train-config ./conf/train_mtlalpha1.0.yaml</span></span>
82
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="tips-change-yaml-py" tabindex="-1"><a class="header-anchor" href="#tips-change-yaml-py"><span>TIPS: change_yaml.py</span></a></h3><p>You can tweak YAML config by <strong>$(change_yaml.py xxx.yaml -a yyy=zzz)</strong></p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd espnet/egs/an4/asr1; source path.sh; \</span></span>
83
+ <span class="line"><span style="color:#D4D4D4;"> ./run.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">ngpu </span><span style="color:#B5CEA8;">1</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">4</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop-stage </span><span style="color:#B5CEA8;">4</span><span style="color:#D4D4D4;"> \</span></span>
84
+ <span class="line"><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train-config </span><span style="color:#F44747;">$</span><span style="color:#D4D4D4;">(change_yaml.py ./conf/train_mtlalpha1.0.yaml -a eunits=</span><span style="color:#B5CEA8;">100</span><span style="color:#D4D4D4;">)</span></span>
85
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="tips-tensorboard" tabindex="-1"><a class="header-anchor" href="#tips-tensorboard"><span>TIPS: tensorboard</span></a></h3><p>You can easily monitor effects of the config by tensorboard</p><h3 id="decoding-and-evaluation" tabindex="-1"><a class="header-anchor" href="#decoding-and-evaluation"><span>Decoding and evaluation</span></a></h3><p>decode config (<code>change_yaml.py</code> also works)</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cat espnet/egs/an4/asr1/conf/decode_ctcweight1.0.yaml</span></span>
86
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h4 id="command-line-usage" tabindex="-1"><a class="header-anchor" href="#command-line-usage"><span>Command line usage</span></a></h4><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">cd espnet/egs/an4/asr1; ./run.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">5</span></span>
87
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h4 id="asr-result-as-data-json" tabindex="-1"><a class="header-anchor" href="#asr-result-as-data-json"><span>ASR result as <code>data.json</code></span></a></h4><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">head -n20 espnet/egs/an4/asr1/exp/train_nodev_pytorch_train_mtlalpha1.0/decode_test_decode_ctcweight1.0_lm_word100/data.json</span></span>
88
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="recognize-speech-from-python" tabindex="-1"><a class="header-anchor" href="#recognize-speech-from-python"><span>Recognize speech from python</span></a></h3><p>Let&#39;s use ESPnet as a library and the trained model:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">ls espnet/egs/an4/asr1/exp/train_nodev_pytorch_train_mtlalpha1.0/results</span></span>
89
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h4 id="recap-load-speech-from-data-json" tabindex="-1"><a class="header-anchor" href="#recap-load-speech-from-data-json"><span>recap: load speech from data.json</span></a></h4><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> json</span></span>
90
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> matplotlib.pyplot </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> plt</span></span>
91
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> kaldiio</span></span>
92
+ <span class="line"></span>
93
+ <span class="line"><span style="color:#6A9955;"># load 10-th speech/text in data.json</span></span>
94
+ <span class="line"><span style="color:#D4D4D4;">root = </span><span style="color:#CE9178;">&quot;espnet/egs/an4/asr1&quot;</span></span>
95
+ <span class="line"><span style="color:#C586C0;">with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(root + </span><span style="color:#CE9178;">&quot;/dump/test/deltafalse/data.json&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
96
+ <span class="line"><span style="color:#D4D4D4;"> test_json = json.load(f)[</span><span style="color:#CE9178;">&quot;utts&quot;</span><span style="color:#D4D4D4;">]</span></span>
97
+ <span class="line"><span style="color:#D4D4D4;"> </span></span>
98
+ <span class="line"><span style="color:#D4D4D4;">key, info = </span><span style="color:#4EC9B0;">list</span><span style="color:#D4D4D4;">(test_json.items())[</span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;">]</span></span>
99
+ <span class="line"></span>
100
+ <span class="line"><span style="color:#6A9955;"># plot the speech feature</span></span>
101
+ <span class="line"><span style="color:#D4D4D4;">fbank = kaldiio.load_mat(info[</span><span style="color:#CE9178;">&quot;input&quot;</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#CE9178;">&quot;feat&quot;</span><span style="color:#D4D4D4;">])</span></span>
102
+ <span class="line"><span style="color:#D4D4D4;">plt.matshow(fbank.T[::-</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">])</span></span>
103
+ <span class="line"><span style="color:#D4D4D4;">plt.title(key + </span><span style="color:#CE9178;">&quot;: &quot;</span><span style="color:#D4D4D4;"> + info[</span><span style="color:#CE9178;">&quot;output&quot;</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">])</span></span>
104
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="load-model" tabindex="-1"><a class="header-anchor" href="#load-model"><span>load model</span></a></h4><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> json</span></span>
105
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> torch</span></span>
106
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> argparse</span></span>
107
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet.bin.asr_recog </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> get_parser</span></span>
108
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet.nets.pytorch_backend.e2e_asr </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> E2E</span></span>
109
+ <span class="line"></span>
110
+ <span class="line"><span style="color:#D4D4D4;">root = </span><span style="color:#CE9178;">&quot;espnet/egs/an4/asr1&quot;</span></span>
111
+ <span class="line"><span style="color:#D4D4D4;">model_dir = root + </span><span style="color:#CE9178;">&quot;/exp/train_nodev_pytorch_train_mtlalpha1.0/results&quot;</span></span>
112
+ <span class="line"></span>
113
+ <span class="line"><span style="color:#6A9955;"># load model</span></span>
114
+ <span class="line"><span style="color:#C586C0;">with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(model_dir + </span><span style="color:#CE9178;">&quot;/model.json&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
115
+ <span class="line"><span style="color:#D4D4D4;"> idim, odim, conf = json.load(f)</span></span>
116
+ <span class="line"><span style="color:#D4D4D4;">model = E2E(idim, odim, argparse.Namespace(**conf))</span></span>
117
+ <span class="line"><span style="color:#D4D4D4;">model.load_state_dict(torch.load(model_dir + </span><span style="color:#CE9178;">&quot;/model.loss.best&quot;</span><span style="color:#D4D4D4;">))</span></span>
118
+ <span class="line"><span style="color:#D4D4D4;">model.cpu().eval()</span></span>
119
+ <span class="line"></span>
120
+ <span class="line"><span style="color:#6A9955;"># load token dict</span></span>
121
+ <span class="line"><span style="color:#C586C0;">with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(root + </span><span style="color:#CE9178;">&quot;/data/lang_1char/train_nodev_units.txt&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
122
+ <span class="line"><span style="color:#D4D4D4;"> token_list = [entry.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">] </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> entry </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> f]</span></span>
123
+ <span class="line"><span style="color:#D4D4D4;">token_list.insert(</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&#39;&lt;blank&gt;&#39;</span><span style="color:#D4D4D4;">)</span></span>
124
+ <span class="line"><span style="color:#D4D4D4;">token_list.append(</span><span style="color:#CE9178;">&#39;&lt;eos&gt;&#39;</span><span style="color:#D4D4D4;">)</span></span>
125
+ <span class="line"></span>
126
+ <span class="line"><span style="color:#6A9955;"># recognize speech</span></span>
127
+ <span class="line"><span style="color:#D4D4D4;">parser = get_parser()</span></span>
128
+ <span class="line"><span style="color:#D4D4D4;">args = parser.parse_args([</span><span style="color:#CE9178;">&quot;--beam-size&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;2&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;--ctc-weight&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;1.0&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;--result-label&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;out.json&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;--model&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">])</span></span>
129
+ <span class="line"><span style="color:#D4D4D4;">result = model.recognize(fbank, args, token_list)</span></span>
130
+ <span class="line"><span style="color:#D4D4D4;">s = </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">.join(conf[</span><span style="color:#CE9178;">&quot;char_list&quot;</span><span style="color:#D4D4D4;">][y] </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> y </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> result[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#CE9178;">&quot;yseq&quot;</span><span style="color:#D4D4D4;">]).replace(</span><span style="color:#CE9178;">&quot;&lt;eos&gt;&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">).replace(</span><span style="color:#CE9178;">&quot;&lt;space&gt;&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot; &quot;</span><span style="color:#D4D4D4;">).replace(</span><span style="color:#CE9178;">&quot;&lt;blank&gt;&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">)</span></span>
131
+ <span class="line"></span>
132
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;groundtruth:&quot;</span><span style="color:#D4D4D4;">, info[</span><span style="color:#CE9178;">&quot;output&quot;</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">])</span></span>
133
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;prediction: &quot;</span><span style="color:#D4D4D4;">, s)</span></span>
134
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> os</span></span>
135
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> kaldiio</span></span>
136
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> IPython.display </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Audio</span></span>
137
+ <span class="line"></span>
138
+ <span class="line"></span>
139
+ <span class="line"><span style="color:#C586C0;">try</span><span style="color:#D4D4D4;">:</span></span>
140
+ <span class="line"><span style="color:#D4D4D4;"> d = os.getcwd()</span></span>
141
+ <span class="line"><span style="color:#D4D4D4;"> os.chdir(root)</span></span>
142
+ <span class="line"><span style="color:#D4D4D4;"> sr, wav = kaldiio.load_scp(</span><span style="color:#CE9178;">&quot;data/test/wav.scp&quot;</span><span style="color:#D4D4D4;">)[key]</span></span>
143
+ <span class="line"><span style="color:#C586C0;">finally</span><span style="color:#D4D4D4;">:</span></span>
144
+ <span class="line"><span style="color:#D4D4D4;"> os.chdir(d)</span></span>
145
+ <span class="line"><span style="color:#D4D4D4;">Audio(wav, </span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=sr)</span></span>
146
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><nav class="vp-page-nav" aria-label="page navigation"><!----><a class="route-link next" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><div class="hint">Next <span class="arrow right"></span></div><div class="link"><span>ESPnet2-ASR realtime demonstration</span></div><!--]--></a></nav><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
147
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
148
+ </body>
149
+ </html>
espnet2/asr/asr_library.html ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>Speech Recognition (Library) | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/asr_library.html-rEQwKTMV.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" 
as="script"><link rel="prefetch" href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a 
class="route-link route-link-active" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span 
class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 1.002 0 0 0-.175.016a13.096 13.096 
0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime 
demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/asr/asr_library.html" aria-label="Speech 
Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" 
type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SE <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement 
Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SLU <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading active">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link route-link-active sidebar-item active" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#installation" aria-label="Installation"><!--[--><!--[--><!--]--> Installation <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#espnet-data-preparation" aria-label="ESPnet data preparation"><!--[--><!--[--><!--]--> ESPnet data preparation <!--[--><!--]--><!--]--></a><!----></li><li><a 
class="route-link sidebar-item" href="#kaldi-style-directories" aria-label="Kaldi-style directories"><!--[--><!--[--><!--]--> Kaldi-style directories <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#espnet-as-a-library" aria-label="ESPnet as a library"><!--[--><!--[--><!--]--> ESPnet as a library <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#load-train-dev-dataset-1-4" aria-label="Load train/dev dataset (1/4)"><!--[--><!--[--><!--]--> Load train/dev dataset (1/4) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#create-minibatches-2-4" aria-label="Create minibatches (2/4)"><!--[--><!--[--><!--]--> Create minibatches (2/4) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#build-neural-networks-3-4" aria-label="Build neural networks (3/4)"><!--[--><!--[--><!--]--> Build neural networks (3/4) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#update-neural-networks-by-iterating-datasets-4-4" aria-label="Update neural networks by iterating datasets (4/4)"><!--[--><!--[--><!--]--> Update neural networks by iterating datasets (4/4) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#recognize-speech" aria-label="Recognize speech"><!--[--><!--[--><!--]--> Recognize speech <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer 
demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">OTHERS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ST <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><h1 id="speech-recognition-library" tabindex="-1"><a class="header-anchor" href="#speech-recognition-library"><span>Speech Recognition (Library)</span></a></h1><p>This example shows you a practical ASR example using ESPnet as a command line interface and library.</p><p>See also</p><ul><li>run in <a href="https://colab.research.google.com/github/espnet/notebook/blob/master/asr_library.ipynb" target="_blank" rel="noopener noreferrer">colab<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 
62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></li><li>documetation https://espnet.github.io/espnet/</li><li>github https://github.com/espnet</li></ul><p>Author: <a href="https://github.com/ShigekiKarita" target="_blank" rel="noopener noreferrer">Shigeki Karita<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></p><h2 id="installation" tabindex="-1"><a class="header-anchor" href="#installation"><span>Installation</span></a></h2><p>ESPnet depends on Kaldi ASR toolkit and Warp-CTC. This cell will take a few minutes.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># TODO(karita): put these lines in ./espnet/tools/setup_colab.sh</span></span>
35
+ <span class="line"><span># OS setup</span></span>
36
+ <span class="line"><span>!sudo apt-get install bc tree</span></span>
37
+ <span class="line"><span>!cat /etc/os-release</span></span>
38
+ <span class="line"><span></span></span>
39
+ <span class="line"><span># espnet setup</span></span>
40
+ <span class="line"><span>!git clone https://github.com/espnet/espnet</span></span>
41
+ <span class="line"><span>!cd espnet; pip install -e .</span></span>
42
+ <span class="line"><span>!mkdir espnet/tools/venv/bin; touch espnet/tools/venv/bin/activate</span></span>
43
+ <span class="line"><span></span></span>
44
+ <span class="line"><span># warp ctc setup</span></span>
45
+ <span class="line"><span>!git clone https://github.com/espnet/warp-ctc -b pytorch-1.1</span></span>
46
+ <span class="line"><span>!cd warp-ctc &amp;&amp; mkdir build &amp;&amp; cd build &amp;&amp; cmake .. &amp;&amp; make -j4</span></span>
47
+ <span class="line"><span>!cd warp-ctc/pytorch_binding &amp;&amp; python setup.py install </span></span>
48
+ <span class="line"><span></span></span>
49
+ <span class="line"><span># kaldi setup</span></span>
50
+ <span class="line"><span>!cd ./espnet/tools; git clone https://github.com/kaldi-asr/kaldi</span></span>
51
+ <span class="line"><span>!echo &quot;&quot; &gt; ./espnet/tools/kaldi/tools/extras/check_dependencies.sh # ignore check</span></span>
52
+ <span class="line"><span>!chmod +x ./espnet/tools/kaldi/tools/extras/check_dependencies.sh</span></span>
53
+ <span class="line"><span>!cd ./espnet/tools/kaldi/tools; make sph2pipe sclite</span></span>
54
+ <span class="line"><span>!rm -rf espnet/tools/kaldi/tools/python</span></span>
55
+ <span class="line"><span>![ ! -e ubuntu16-featbin.tar.gz ] &amp;&amp; wget https://18-198329952-gh.circle-artifacts.com/0/home/circleci/repo/ubuntu16-featbin.tar.gz</span></span>
56
+ <span class="line"><span>!tar -xf ./ubuntu16-featbin.tar.gz</span></span>
57
+ <span class="line"><span>!cp featbin/* espnet/tools/kaldi/src/featbin/</span></span>
58
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="espnet-data-preparation" tabindex="-1"><a class="header-anchor" href="#espnet-data-preparation"><span>ESPnet data preparation</span></a></h2><p>You can use the end-to-end script <code>run.sh</code> for reproducing systems reported in <code>espnet/egs/*/asr1/RESULTS.md</code>. Typically, we organize <code>run.sh</code> with several stages:</p><ol start="0"><li>Data download (if available)</li><li>Kaldi-style data preparation</li><li>Dump useful data for traning (e.g., JSON, HDF5, etc)</li><li>Lanuage model training</li><li>ASR model training</li><li>Decoding and evaluation</li></ol><p>For example, if you add <code>--stop-stage 2</code>, you can stop the script before neural network training.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cd espnet/egs/an4/asr1; ./run.sh --ngpu 1 --stop-stage 2</span></span>
59
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h2 id="kaldi-style-directories" tabindex="-1"><a class="header-anchor" href="#kaldi-style-directories"><span>Kaldi-style directories</span></a></h2><p>Always we organize each recipe placed in <code>egs/xxx/asr1</code> in Kaldi way. For example, the important directories are:</p><ul><li><code>conf/</code>: kaldi configurations, e.g., speech feature</li><li><code>data/</code>: almost raw <a href="https://kaldi-asr.org/doc/data_prep.html" target="_blank" rel="noopener noreferrer">data prepared by Kaldi<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></li><li><code>exp/</code>: intermidiate files through experiments, e.g., log files, model parameters</li><li><code>fbank/</code>: speech feature binary files, e.g., <a href="https://kaldi-asr.org/doc/io.html" target="_blank" rel="noopener noreferrer">ark, scp<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span></a></li><li><code>dump/</code>: ESPnet meta data for tranining, e.g., json, hdf5</li><li><code>local/</code>: corpus specific data preparation scripts</li><li><a href="https://github.com/kaldi-asr/kaldi/tree/master/egs/wsj/s5/steps" target="_blank" rel="noopener noreferrer">steps/<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a>, <a href="https://github.com/kaldi-asr/kaldi/tree/master/egs/wsj/s5/utils" target="_blank" rel="noopener noreferrer">utils/<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a>: Kaldi&#39;s helper scripts</li></ul><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!tree -L 1</span></span>
60
+ <span class="line"><span>!ls data/train</span></span>
61
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="espnet-as-a-library" tabindex="-1"><a class="header-anchor" href="#espnet-as-a-library"><span>ESPnet as a library</span></a></h2><p>Here we use ESPnet as a library to create a simple Python snippet for speech recognition. ESPnet &#39;s training script&#39;<code>asr_train.py</code> has three parts:</p><ol><li>Load train/dev dataset</li><li>Create minibatches</li><li>Build neural networks</li><li>Update neural networks by iterating datasets</li></ol><p>Let&#39;s implement these procedures from scratch!</p><h3 id="load-train-dev-dataset-1-4" tabindex="-1"><a class="header-anchor" href="#load-train-dev-dataset-1-4"><span>Load train/dev dataset (1/4)</span></a></h3><p>First, we will check how <code>run.sh</code> organized the JSON files and load the pair of the speech feature and its transcription.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import json</span></span>
62
+ <span class="line"><span>import matplotlib.pyplot as plt</span></span>
63
+ <span class="line"><span>import kaldiio</span></span>
64
+ <span class="line"><span></span></span>
65
+ <span class="line"><span>root = &quot;espnet/egs/an4/asr1&quot;</span></span>
66
+ <span class="line"><span>with open(root + &quot;/dump/train_nodev/deltafalse/data.json&quot;, &quot;r&quot;) as f:</span></span>
67
+ <span class="line"><span> train_json = json.load(f)[&quot;utts&quot;]</span></span>
68
+ <span class="line"><span>with open(root + &quot;/dump/train_dev/deltafalse/data.json&quot;, &quot;r&quot;) as f:</span></span>
69
+ <span class="line"><span> dev_json = json.load(f)[&quot;utts&quot;]</span></span>
70
+ <span class="line"><span> </span></span>
71
+ <span class="line"><span># the first training data for speech recognition</span></span>
72
+ <span class="line"><span>key, info = next(iter(train_json.items()))</span></span>
73
+ <span class="line"><span></span></span>
74
+ <span class="line"><span># plot the 80-dim fbank + 3-dim pitch speech feature</span></span>
75
+ <span class="line"><span>fbank = kaldiio.load_mat(info[&quot;input&quot;][0][&quot;feat&quot;])</span></span>
76
+ <span class="line"><span>plt.matshow(fbank.T[::-1])</span></span>
77
+ <span class="line"><span>plt.title(key + &quot;: &quot; + info[&quot;output&quot;][0][&quot;text&quot;])</span></span>
78
+ <span class="line"><span></span></span>
79
+ <span class="line"><span># print the key-value pair</span></span>
80
+ <span class="line"><span>key, info</span></span>
81
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="create-minibatches-2-4" tabindex="-1"><a class="header-anchor" href="#create-minibatches-2-4"><span>Create minibatches (2/4)</span></a></h3><p>To parallelize neural network training, we create minibatches that containes several sequence pairs by splitting datasets.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from espnet.utils.training.batchfy import make_batchset</span></span>
82
+ <span class="line"><span></span></span>
83
+ <span class="line"><span>batch_size = 32</span></span>
84
+ <span class="line"><span>trainset = make_batchset(train_json, batch_size)</span></span>
85
+ <span class="line"><span>devset = make_batchset(dev_json, batch_size)</span></span>
86
+ <span class="line"><span>assert len(devset[0]) == batch_size</span></span>
87
+ <span class="line"><span>devset[0][:3]</span></span>
88
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="build-neural-networks-3-4" tabindex="-1"><a class="header-anchor" href="#build-neural-networks-3-4"><span>Build neural networks (3/4)</span></a></h3><p>For simplicity, we use a predefined model: <a href="https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf" target="_blank" rel="noopener noreferrer">Transformer<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a>.</p><p>NOTE: You can also use your custom model in command line tools as <code>asr_train.py --model-module your_module:YourModel</code></p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import argparse</span></span>
89
+ <span class="line"><span>from espnet.bin.asr_train import get_parser</span></span>
90
+ <span class="line"><span>from espnet.nets.pytorch_backend.e2e_asr import E2E</span></span>
91
+ <span class="line"><span></span></span>
92
+ <span class="line"><span>parser = get_parser()</span></span>
93
+ <span class="line"><span>parser = E2E.add_arguments(parser)</span></span>
94
+ <span class="line"><span>config = parser.parse_args([</span></span>
95
+ <span class="line"><span> &quot;--mtlalpha&quot;, &quot;0.0&quot;, # weight for cross entropy and CTC loss</span></span>
96
+ <span class="line"><span> &quot;--outdir&quot;, &quot;out&quot;, &quot;--dict&quot;, &quot;&quot;]) # TODO: allow no arg</span></span>
97
+ <span class="line"><span></span></span>
98
+ <span class="line"><span>idim = info[&quot;input&quot;][0][&quot;shape&quot;][1]</span></span>
99
+ <span class="line"><span>odim = info[&quot;output&quot;][0][&quot;shape&quot;][1]</span></span>
100
+ <span class="line"><span>setattr(config, &quot;char_list&quot;, [])</span></span>
101
+ <span class="line"><span>model = E2E(idim, odim, config)</span></span>
102
+ <span class="line"><span>model</span></span>
103
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="update-neural-networks-by-iterating-datasets-4-4" tabindex="-1"><a class="header-anchor" href="#update-neural-networks-by-iterating-datasets-4-4"><span>Update neural networks by iterating datasets (4/4)</span></a></h3><p>Finaly, we got the training part.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import numpy</span></span>
104
+ <span class="line"><span>import torch</span></span>
105
+ <span class="line"><span>from torch.nn.utils.rnn import pad_sequence</span></span>
106
+ <span class="line"><span>from torch.nn.utils.clip_grad import clip_grad_norm_</span></span>
107
+ <span class="line"><span>from torch.utils.data import DataLoader</span></span>
108
+ <span class="line"><span></span></span>
109
+ <span class="line"><span>def collate(minibatch):</span></span>
110
+ <span class="line"><span> fbanks = []</span></span>
111
+ <span class="line"><span> tokens = []</span></span>
112
+ <span class="line"><span> for key, info in minibatch[0]:</span></span>
113
+ <span class="line"><span> fbanks.append(torch.tensor(kaldiio.load_mat(info[&quot;input&quot;][0][&quot;feat&quot;])))</span></span>
114
+ <span class="line"><span> tokens.append(torch.tensor([int(s) for s in info[&quot;output&quot;][0][&quot;tokenid&quot;].split()]))</span></span>
115
+ <span class="line"><span> ilens = torch.tensor([x.shape[0] for x in fbanks])</span></span>
116
+ <span class="line"><span> return pad_sequence(fbanks, batch_first=True), ilens, pad_sequence(tokens, batch_first=True)</span></span>
117
+ <span class="line"><span></span></span>
118
+ <span class="line"><span>train_loader = DataLoader(trainset, collate_fn=collate, shuffle=True, pin_memory=True)</span></span>
119
+ <span class="line"><span>dev_loader = DataLoader(devset, collate_fn=collate, pin_memory=True)</span></span>
120
+ <span class="line"><span>model.cuda()</span></span>
121
+ <span class="line"><span>optim = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.98))</span></span>
122
+ <span class="line"><span></span></span>
123
+ <span class="line"><span>n_iter = len(trainset)</span></span>
124
+ <span class="line"><span>n_epoch = 10</span></span>
125
+ <span class="line"><span>total_iter = n_iter * n_epoch</span></span>
126
+ <span class="line"><span>train_acc = []</span></span>
127
+ <span class="line"><span>valid_acc = []</span></span>
128
+ <span class="line"><span>for epoch in range(n_epoch):</span></span>
129
+ <span class="line"><span> # training</span></span>
130
+ <span class="line"><span> acc = []</span></span>
131
+ <span class="line"><span> model.train()</span></span>
132
+ <span class="line"><span> for data in train_loader:</span></span>
133
+ <span class="line"><span> loss = model(*[d.cuda() for d in data])</span></span>
134
+ <span class="line"><span> optim.zero_grad()</span></span>
135
+ <span class="line"><span> loss.backward()</span></span>
136
+ <span class="line"><span> acc.append(model.acc)</span></span>
137
+ <span class="line"><span> norm = clip_grad_norm_(model.parameters(), 10.0)</span></span>
138
+ <span class="line"><span> optim.step()</span></span>
139
+ <span class="line"><span> train_acc.append(numpy.mean(acc))</span></span>
140
+ <span class="line"><span></span></span>
141
+ <span class="line"><span> # validation</span></span>
142
+ <span class="line"><span> acc = []</span></span>
143
+ <span class="line"><span> model.eval()</span></span>
144
+ <span class="line"><span> for data in dev_loader:</span></span>
145
+ <span class="line"><span> model(*[d.cuda() for d in data])</span></span>
146
+ <span class="line"><span> acc.append(model.acc)</span></span>
147
+ <span class="line"><span> valid_acc.append(numpy.mean(acc))</span></span>
148
+ <span class="line"><span> print(f&quot;epoch: {epoch}, train acc: {train_acc[-1]:.3f}, dev acc: {valid_acc[-1]:.3f}&quot;)</span></span>
149
+ <span class="line"><span></span></span>
150
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import matplotlib.pyplot as plt</span></span>
151
+ <span class="line"><span></span></span>
152
+ <span class="line"><span>plt.plot(range(len(train_acc)), train_acc, label=&quot;train acc&quot;)</span></span>
153
+ <span class="line"><span>plt.plot(range(len(valid_acc)), valid_acc, label=&quot;dev acc&quot;)</span></span>
154
+ <span class="line"><span>plt.grid()</span></span>
155
+ <span class="line"><span>plt.legend()</span></span>
156
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>torch.save(model.state_dict(), &quot;best.pt&quot;)</span></span>
157
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="recognize-speech" tabindex="-1"><a class="header-anchor" href="#recognize-speech"><span>Recognize speech</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import json</span></span>
158
+ <span class="line"><span>import matplotlib.pyplot as plt</span></span>
159
+ <span class="line"><span>import kaldiio</span></span>
160
+ <span class="line"><span>from espnet.bin.asr_recog import get_parser</span></span>
161
+ <span class="line"><span></span></span>
162
+ <span class="line"><span># load data</span></span>
163
+ <span class="line"><span>root = &quot;espnet/egs/an4/asr1&quot;</span></span>
164
+ <span class="line"><span>with open(root + &quot;/dump/test/deltafalse/data.json&quot;, &quot;r&quot;) as f:</span></span>
165
+ <span class="line"><span> test_json = json.load(f)[&quot;utts&quot;]</span></span>
166
+ <span class="line"><span> </span></span>
167
+ <span class="line"><span>key, info = list(test_json.items())[10]</span></span>
168
+ <span class="line"><span></span></span>
169
+ <span class="line"><span># plot the 80-dim fbank + 3-dim pitch speech feature</span></span>
170
+ <span class="line"><span>fbank = kaldiio.load_mat(info[&quot;input&quot;][0][&quot;feat&quot;])</span></span>
171
+ <span class="line"><span>plt.matshow(fbank.T[::-1])</span></span>
172
+ <span class="line"><span>plt.title(key + &quot;: &quot; + info[&quot;output&quot;][0][&quot;text&quot;])</span></span>
173
+ <span class="line"><span></span></span>
174
+ <span class="line"><span># load token dict</span></span>
175
+ <span class="line"><span>with open(root + &quot;/data/lang_1char/train_nodev_units.txt&quot;, &quot;r&quot;) as f:</span></span>
176
+ <span class="line"><span> token_list = [entry.split()[0] for entry in f]</span></span>
177
+ <span class="line"><span>token_list.insert(0, &#39;&lt;blank&gt;&#39;)</span></span>
178
+ <span class="line"><span>token_list.append(&#39;&lt;eos&gt;&#39;)</span></span>
179
+ <span class="line"><span></span></span>
180
+ <span class="line"><span># recognize speech</span></span>
181
+ <span class="line"><span>parser = get_parser()</span></span>
182
+ <span class="line"><span>args = parser.parse_args([</span></span>
183
+ <span class="line"><span> &quot;--beam-size&quot;, &quot;1&quot;,</span></span>
184
+ <span class="line"><span> &quot;--ctc-weight&quot;, &quot;0&quot;,</span></span>
185
+ <span class="line"><span> &quot;--result-label&quot;, &quot;out.json&quot;,</span></span>
186
+ <span class="line"><span> &quot;--model&quot;, &quot;&quot;</span></span>
187
+ <span class="line"><span>])</span></span>
188
+ <span class="line"><span>model.cpu()</span></span>
189
+ <span class="line"><span>model.eval()</span></span>
190
+ <span class="line"><span></span></span>
191
+ <span class="line"><span>def to_str(result):</span></span>
192
+ <span class="line"><span> return &quot;&quot;.join(token_list[y] for y in result[0][&quot;yseq&quot;]) \</span></span>
193
+ <span class="line"><span> .replace(&quot;&lt;eos&gt;&quot;, &quot;&quot;).replace(&quot;&lt;space&gt;&quot;, &quot; &quot;).replace(&quot;&lt;blank&gt;&quot;, &quot;&quot;)</span></span>
194
+ <span class="line"><span></span></span>
195
+ <span class="line"><span>print(&quot;groundtruth:&quot;, info[&quot;output&quot;][0][&quot;text&quot;])</span></span>
196
+ <span class="line"><span>print(&quot;prediction: &quot;, to_str(model.recognize(fbank, args, token_list)))</span></span>
197
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span></span></span>
198
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><nav class="vp-page-nav" aria-label="page navigation"><a class="route-link prev" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><div class="hint"><span class="arrow left"></span> Prev</div><div class="link"><span>ESPnet2-ASR realtime demonstration</span></div><!--]--></a><a class="route-link next" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><div class="hint">Next <span class="arrow right"></span></div><div class="link"><span>Use transfer learning for ASR in ESPnet2</span></div><!--]--></a></nav><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
199
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
200
+ </body>
201
+ </html>
espnet2/asr/espnet2_asr_realtime_demo.html ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>ESPnet2-ASR realtime demonstration | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" as="script"><link 
rel="prefetch" href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li 
class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span 
class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 1.002 0 0 0-.175.016a13.096 13.096 
0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime 
demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech 
Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" 
type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SE <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement 
Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SLU <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading active">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link route-link-active sidebar-item active" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#asr-model-demo" aria-label="ASR model demo"><!--[--><!--[--><!--]--> ASR model demo <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#model-selection" aria-label="Model Selection"><!--[--><!--[--><!--]--> Model Selection <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#model-setup" aria-label="Model Setup"><!--[--><!--[--><!--]--> Model Setup <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" 
href="#recognize-our-example-recordings" aria-label="Recognize our example recordings"><!--[--><!--[--><!--]--> Recognize our example recordings <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#recognize-your-own-pre-recordings" aria-label="Recognize your own pre-recordings"><!--[--><!--[--><!--]--> Recognize your own pre-recordings <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#recognize-your-own-live-recordings" aria-label="Recognize your own live-recordings"><!--[--><!--[--><!--]--> Recognize your own live-recordings <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">OTHERS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p 
tabindex="0" class="sidebar-item sidebar-heading">ST <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><h1 id="espnet2-asr-realtime-demonstration" tabindex="-1"><a class="header-anchor" href="#espnet2-asr-realtime-demonstration"><span>ESPnet2-ASR realtime demonstration</span></a></h1><p>This notebook provides a demonstration of the realtime E2E-ASR using ESPnet2-ASR.</p><ul><li>ESPnet2-ASR: https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/asr1</li></ul><p>Author: Jiatong Shi (<a href="https://github.com/ftshijt" target="_blank" rel="noopener noreferrer">@ftshijt<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a>)</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># </span><span style="color:#569CD6;">NOTE</span><span style="color:#6A9955;">: pip shows imcompatible errors due to preinstalled libraries but you do not need to care</span></span>
35
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">pip install -q espnet==</span><span style="color:#B5CEA8;">0.10</span><span style="color:#D4D4D4;">.0</span></span>
36
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">pip install -q espnet_model_zoo</span></span>
37
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="asr-model-demo" tabindex="-1"><a class="header-anchor" href="#asr-model-demo"><span>ASR model demo</span></a></h2><h3 id="model-selection" tabindex="-1"><a class="header-anchor" href="#model-selection"><span>Model Selection</span></a></h3><p>Please select model shown in <a href="https://github.com/espnet/espnet_model_zoo/blob/master/espnet_model_zoo/table.csv" target="_blank" rel="noopener noreferrer">espnet_model_zoo<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></p><p>In this demonstration, we will show English, Japanese, Spanish, Mandrain, Multilingual ASR model, respectively</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;">#@title Choose English ASR model { run: &quot;auto&quot; }</span></span>
38
+ <span class="line"></span>
39
+ <span class="line"><span style="color:#D4D4D4;">lang = </span><span style="color:#CE9178;">&#39;en&#39;</span></span>
40
+ <span class="line"><span style="color:#D4D4D4;">fs = </span><span style="color:#B5CEA8;">16000</span><span style="color:#6A9955;"> #@param {type:&quot;integer&quot;}</span></span>
41
+ <span class="line"><span style="color:#D4D4D4;">tag = </span><span style="color:#CE9178;">&#39;Shinji Watanabe/spgispeech_asr_train_asr_conformer6_n_fft512_hop_length256_raw_en_unnorm_bpe5000_valid.acc.ave&#39;</span><span style="color:#6A9955;"> #@param [&quot;Shinji Watanabe/spgispeech_asr_train_asr_conformer6_n_fft512_hop_length256_raw_en_unnorm_bpe5000_valid.acc.ave&quot;, &quot;kamo-naoyuki/librispeech_asr_train_asr_conformer6_n_fft512_hop_length256_raw_en_bpe5000_scheduler_confwarmup_steps40000_optim_conflr0.0025_sp_valid.acc.ave&quot;] {type:&quot;string&quot;}</span></span>
42
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;">#@title Choose Japanese ASR model { run: &quot;auto&quot; }</span></span>
43
+ <span class="line"></span>
44
+ <span class="line"><span style="color:#D4D4D4;">lang = </span><span style="color:#CE9178;">&#39;ja&#39;</span></span>
45
+ <span class="line"><span style="color:#D4D4D4;">fs = </span><span style="color:#B5CEA8;">16000</span><span style="color:#6A9955;"> #@param {type:&quot;integer&quot;}</span></span>
46
+ <span class="line"><span style="color:#D4D4D4;">tag = </span><span style="color:#CE9178;">&#39;Shinji Watanabe/laborotv_asr_train_asr_conformer2_latest33_raw_char_sp_valid.acc.ave&#39;</span><span style="color:#6A9955;"> #@param [&quot;Shinji Watanabe/laborotv_asr_train_asr_conformer2_latest33_raw_char_sp_valid.acc.ave&quot;] {type:&quot;string&quot;}</span></span>
47
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;">#@title Choose Spanish ASR model { run: &quot;auto&quot; }</span></span>
48
+ <span class="line"></span>
49
+ <span class="line"><span style="color:#D4D4D4;">lang = </span><span style="color:#CE9178;">&#39;es&#39;</span></span>
50
+ <span class="line"><span style="color:#D4D4D4;">fs = </span><span style="color:#B5CEA8;">16000</span><span style="color:#6A9955;"> #@param {type:&quot;integer&quot;}</span></span>
51
+ <span class="line"><span style="color:#D4D4D4;">tag = </span><span style="color:#CE9178;">&#39;ftshijt/mls_asr_transformer_valid.acc.best&#39;</span><span style="color:#6A9955;"> #@param [&quot;ftshijt/mls_asr_transformer_valid.acc.best&quot;] {type:&quot;string&quot;}</span></span>
52
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;">#@title Choose Mandrain ASR model { run: &quot;auto&quot; }</span></span>
53
+ <span class="line"></span>
54
+ <span class="line"><span style="color:#D4D4D4;">lang = </span><span style="color:#CE9178;">&#39;zh&#39;</span></span>
55
+ <span class="line"><span style="color:#D4D4D4;">fs = </span><span style="color:#B5CEA8;">16000</span><span style="color:#6A9955;"> #@param {type:&quot;integer&quot;}</span></span>
56
+ <span class="line"><span style="color:#D4D4D4;">tag = </span><span style="color:#CE9178;">&#39;Emiru Tsunoo/aishell_asr_train_asr_streaming_transformer_raw_zh_char_sp_valid.acc.ave&#39;</span><span style="color:#6A9955;"> #@param [&quot; Emiru Tsunoo/aishell_asr_train_asr_streaming_transformer_raw_zh_char_sp_valid.acc.ave&quot;] {type:&quot;string&quot;}</span></span>
57
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;">#@title Choose Multilingual ASR model { run: &quot;auto&quot; }</span></span>
58
+ <span class="line"></span>
59
+ <span class="line"><span style="color:#D4D4D4;">lang = </span><span style="color:#CE9178;">&#39;multilingual&#39;</span></span>
60
+ <span class="line"><span style="color:#D4D4D4;">fs = </span><span style="color:#B5CEA8;">16000</span><span style="color:#6A9955;"> #@param {type:&quot;integer&quot;}</span></span>
61
+ <span class="line"><span style="color:#D4D4D4;">tag = </span><span style="color:#CE9178;">&#39;ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best&#39;</span><span style="color:#6A9955;"> #@param [&quot; ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best&quot;] {type:&quot;string&quot;}</span></span>
62
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="model-setup" tabindex="-1"><a class="header-anchor" href="#model-setup"><span>Model Setup</span></a></h3><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> time</span></span>
63
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> torch</span></span>
64
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> string</span></span>
65
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet_model_zoo.downloader </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> ModelDownloader</span></span>
66
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.asr_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
67
+ <span class="line"></span>
68
+ <span class="line"></span>
69
+ <span class="line"><span style="color:#D4D4D4;">d = ModelDownloader()</span></span>
70
+ <span class="line"><span style="color:#6A9955;"># It may takes a while to download and build models</span></span>
71
+ <span class="line"><span style="color:#D4D4D4;">speech2text = Speech2Text(</span></span>
72
+ <span class="line"><span style="color:#D4D4D4;"> **d.download_and_unpack(tag),</span></span>
73
+ <span class="line"><span style="color:#9CDCFE;"> device</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;cuda&quot;</span><span style="color:#D4D4D4;">,</span></span>
74
+ <span class="line"><span style="color:#9CDCFE;"> minlenratio</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0.0</span><span style="color:#D4D4D4;">,</span></span>
75
+ <span class="line"><span style="color:#9CDCFE;"> maxlenratio</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0.0</span><span style="color:#D4D4D4;">,</span></span>
76
+ <span class="line"><span style="color:#9CDCFE;"> ctc_weight</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0.3</span><span style="color:#D4D4D4;">,</span></span>
77
+ <span class="line"><span style="color:#9CDCFE;"> beam_size</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;">,</span></span>
78
+ <span class="line"><span style="color:#9CDCFE;"> batch_size</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">,</span></span>
79
+ <span class="line"><span style="color:#9CDCFE;"> nbest</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span></span>
80
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
81
+ <span class="line"></span>
82
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> text_normalizer</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">text</span><span style="color:#D4D4D4;">):</span></span>
83
+ <span class="line"><span style="color:#D4D4D4;"> text = text.upper()</span></span>
84
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> text.translate(</span><span style="color:#4EC9B0;">str</span><span style="color:#D4D4D4;">.maketrans(</span><span style="color:#CE9178;">&#39;&#39;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&#39;&#39;</span><span style="color:#D4D4D4;">, string.punctuation))</span></span>
85
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="recognize-our-example-recordings" tabindex="-1"><a class="header-anchor" href="#recognize-our-example-recordings"><span>Recognize our example recordings</span></a></h3><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git clone https://github.com/ftshijt/ESPNet_asr_egs.git</span></span>
86
+ <span class="line"></span>
87
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> pandas </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> pd</span></span>
88
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> soundfile</span></span>
89
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa.display</span></span>
90
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> IPython.display </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> display, Audio</span></span>
91
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> matplotlib.pyplot </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> plt</span></span>
92
+ <span class="line"></span>
93
+ <span class="line"></span>
94
+ <span class="line"><span style="color:#D4D4D4;">egs = pd.read_csv(</span><span style="color:#CE9178;">&quot;ESPNet_asr_egs/egs.csv&quot;</span><span style="color:#D4D4D4;">)</span></span>
95
+ <span class="line"><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> index, row </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> egs.iterrows():</span></span>
96
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#D4D4D4;"> row[</span><span style="color:#CE9178;">&quot;lang&quot;</span><span style="color:#D4D4D4;">] == lang </span><span style="color:#569CD6;">or</span><span style="color:#D4D4D4;"> lang == </span><span style="color:#CE9178;">&quot;multilingual&quot;</span><span style="color:#D4D4D4;">:</span></span>
97
+ <span class="line"><span style="color:#D4D4D4;"> speech, rate = soundfile.read(</span><span style="color:#CE9178;">&quot;ESPNet_asr_egs/&quot;</span><span style="color:#D4D4D4;"> + row[</span><span style="color:#CE9178;">&quot;path&quot;</span><span style="color:#D4D4D4;">])</span></span>
98
+ <span class="line"><span style="color:#C586C0;"> assert</span><span style="color:#D4D4D4;"> fs == </span><span style="color:#4EC9B0;">int</span><span style="color:#D4D4D4;">(row[</span><span style="color:#CE9178;">&quot;sr&quot;</span><span style="color:#D4D4D4;">])</span></span>
99
+ <span class="line"><span style="color:#D4D4D4;"> nbests = speech2text(speech)</span></span>
100
+ <span class="line"></span>
101
+ <span class="line"><span style="color:#D4D4D4;"> text, *_ = nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
102
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;Input Speech: ESPNet_asr_egs/</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">row[</span><span style="color:#CE9178;">&#39;path&#39;</span><span style="color:#D4D4D4;">]</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
103
+ <span class="line"><span style="color:#6A9955;"> # let us listen to samples</span></span>
104
+ <span class="line"><span style="color:#D4D4D4;"> display(Audio(speech, </span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=rate))</span></span>
105
+ <span class="line"><span style="color:#D4D4D4;"> librosa.display.waveplot(speech, </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=rate)</span></span>
106
+ <span class="line"><span style="color:#D4D4D4;"> plt.show()</span></span>
107
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;Reference text: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">text_normalizer(row[</span><span style="color:#CE9178;">&#39;text&#39;</span><span style="color:#D4D4D4;">])</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
108
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;ASR hypothesis: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">text_normalizer(text)</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
109
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;*&quot;</span><span style="color:#D4D4D4;"> * </span><span style="color:#B5CEA8;">50</span><span style="color:#D4D4D4;">)</span></span>
110
+ <span class="line"></span>
111
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="recognize-your-own-pre-recordings" tabindex="-1"><a class="header-anchor" href="#recognize-your-own-pre-recordings"><span>Recognize your own pre-recordings</span></a></h3><ol><li>Upload your own pre-recorded recordings</li><li>Recognize your voice with the ASR system</li></ol><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> google.colab </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> files</span></span>
112
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> IPython.display </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> display, Audio</span></span>
113
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> soundfile</span></span>
114
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa.display</span></span>
115
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> matplotlib.pyplot </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> plt</span></span>
116
+ <span class="line"></span>
117
+ <span class="line"><span style="color:#D4D4D4;">uploaded = files.upload()</span></span>
118
+ <span class="line"></span>
119
+ <span class="line"><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> file_name </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> uploaded.keys():</span></span>
120
+ <span class="line"><span style="color:#D4D4D4;"> speech, rate = soundfile.read(file_name)</span></span>
121
+ <span class="line"><span style="color:#C586C0;"> assert</span><span style="color:#D4D4D4;"> rate == fs, </span><span style="color:#CE9178;">&quot;mismatch in sampling rate&quot;</span></span>
122
+ <span class="line"><span style="color:#D4D4D4;"> nbests = speech2text(speech)</span></span>
123
+ <span class="line"><span style="color:#D4D4D4;"> text, *_ = nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
124
+ <span class="line"></span>
125
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;Input Speech: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">file_name</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
126
+ <span class="line"><span style="color:#D4D4D4;"> display(Audio(speech, </span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=rate))</span></span>
127
+ <span class="line"><span style="color:#D4D4D4;"> librosa.display.waveplot(speech, </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=rate)</span></span>
128
+ <span class="line"><span style="color:#D4D4D4;"> plt.show()</span></span>
129
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;ASR hypothesis: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">text_normalizer(text)</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
130
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;*&quot;</span><span style="color:#D4D4D4;"> * </span><span style="color:#B5CEA8;">50</span><span style="color:#D4D4D4;">)</span></span>
131
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="recognize-your-own-live-recordings" tabindex="-1"><a class="header-anchor" href="#recognize-your-own-live-recordings"><span>Recognize your own live-recordings</span></a></h3><ol><li>Record your own voice</li><li>Recognize your voice with the ASR system</li></ol><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># from https://gist.github.com/korakot/c21c3476c024ad6d56d5f48b0bca92be</span></span>
132
+ <span class="line"></span>
133
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> IPython.display </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Javascript</span></span>
134
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> google.colab </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> output</span></span>
135
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> base64 </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> b64decode</span></span>
136
+ <span class="line"></span>
137
+ <span class="line"><span style="color:#D4D4D4;">RECORD = </span><span style="color:#CE9178;">&quot;&quot;&quot;</span></span>
138
+ <span class="line"><span style="color:#CE9178;">const sleep = time =&gt; new Promise(resolve =&gt; setTimeout(resolve, time))</span></span>
139
+ <span class="line"><span style="color:#CE9178;">const b2text = blob =&gt; new Promise(resolve =&gt; {</span></span>
140
+ <span class="line"><span style="color:#CE9178;"> const reader = new FileReader()</span></span>
141
+ <span class="line"><span style="color:#CE9178;"> reader.onloadend = e =&gt; resolve(e.srcElement.result)</span></span>
142
+ <span class="line"><span style="color:#CE9178;"> reader.readAsDataURL(blob)</span></span>
143
+ <span class="line"><span style="color:#CE9178;">})</span></span>
144
+ <span class="line"><span style="color:#CE9178;">var record = time =&gt; new Promise(async resolve =&gt; {</span></span>
145
+ <span class="line"><span style="color:#CE9178;"> stream = await navigator.mediaDevices.getUserMedia({ audio: true })</span></span>
146
+ <span class="line"><span style="color:#CE9178;"> recorder = new MediaRecorder(stream)</span></span>
147
+ <span class="line"><span style="color:#CE9178;"> chunks = []</span></span>
148
+ <span class="line"><span style="color:#CE9178;"> recorder.ondataavailable = e =&gt; chunks.push(e.data)</span></span>
149
+ <span class="line"><span style="color:#CE9178;"> recorder.start()</span></span>
150
+ <span class="line"><span style="color:#CE9178;"> await sleep(time)</span></span>
151
+ <span class="line"><span style="color:#CE9178;"> recorder.onstop = async ()=&gt;{</span></span>
152
+ <span class="line"><span style="color:#CE9178;"> blob = new Blob(chunks)</span></span>
153
+ <span class="line"><span style="color:#CE9178;"> text = await b2text(blob)</span></span>
154
+ <span class="line"><span style="color:#CE9178;"> resolve(text)</span></span>
155
+ <span class="line"><span style="color:#CE9178;"> }</span></span>
156
+ <span class="line"><span style="color:#CE9178;"> recorder.stop()</span></span>
157
+ <span class="line"><span style="color:#CE9178;">})</span></span>
158
+ <span class="line"><span style="color:#CE9178;">&quot;&quot;&quot;</span></span>
159
+ <span class="line"></span>
160
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> record</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">sec</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">filename</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;audio.wav&#39;</span><span style="color:#D4D4D4;">):</span></span>
161
+ <span class="line"><span style="color:#D4D4D4;"> display(Javascript(RECORD))</span></span>
162
+ <span class="line"><span style="color:#D4D4D4;"> s = output.eval_js(</span><span style="color:#CE9178;">&#39;record(</span><span style="color:#569CD6;">%d</span><span style="color:#CE9178;">)&#39;</span><span style="color:#D4D4D4;"> % (sec * </span><span style="color:#B5CEA8;">1000</span><span style="color:#D4D4D4;">))</span></span>
163
+ <span class="line"><span style="color:#D4D4D4;"> b = b64decode(s.split(</span><span style="color:#CE9178;">&#39;,&#39;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">])</span></span>
164
+ <span class="line"><span style="color:#C586C0;"> with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(filename, </span><span style="color:#CE9178;">&#39;wb+&#39;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
165
+ <span class="line"><span style="color:#D4D4D4;"> f.write(b)</span></span>
166
+ <span class="line"></span>
167
+ <span class="line"><span style="color:#D4D4D4;">audio = </span><span style="color:#CE9178;">&#39;audio.wav&#39;</span></span>
168
+ <span class="line"><span style="color:#D4D4D4;">second = </span><span style="color:#B5CEA8;">5</span></span>
169
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;Speak to your microphone </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">second</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;"> sec...&quot;</span><span style="color:#D4D4D4;">)</span></span>
170
+ <span class="line"><span style="color:#D4D4D4;">record(second, audio)</span></span>
171
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;Done!&quot;</span><span style="color:#D4D4D4;">)</span></span>
172
+ <span class="line"></span>
173
+ <span class="line"></span>
174
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa</span></span>
175
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa.display</span></span>
176
+ <span class="line"><span style="color:#D4D4D4;">speech, rate = librosa.load(audio, </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">)</span></span>
177
+ <span class="line"><span style="color:#D4D4D4;">librosa.display.waveplot(speech, </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=rate)</span></span>
178
+ <span class="line"></span>
179
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> matplotlib.pyplot </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> plt</span></span>
180
+ <span class="line"><span style="color:#D4D4D4;">plt.show()</span></span>
181
+ <span class="line"></span>
182
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> pysndfile</span></span>
183
+ <span class="line"><span style="color:#D4D4D4;">pysndfile.sndio.write(</span><span style="color:#CE9178;">&#39;audio_ds.wav&#39;</span><span style="color:#D4D4D4;">, speech, </span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=rate, </span><span style="color:#9CDCFE;">format</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;wav&#39;</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">enc</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;pcm16&#39;</span><span style="color:#D4D4D4;">)</span></span>
184
+ <span class="line"></span>
185
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> IPython.display </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> display, Audio</span></span>
186
+ <span class="line"><span style="color:#D4D4D4;">display(Audio(speech, </span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=rate))</span></span>
187
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" 
tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">nbests = speech2text(speech)</span></span>
188
+ <span class="line"><span style="color:#D4D4D4;">text, *_ = nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
189
+ <span class="line"></span>
190
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;ASR hypothesis: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">text_normalizer(text)</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
191
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><nav class="vp-page-nav" aria-label="page navigation"><a class="route-link prev" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><div class="hint"><span class="arrow left"></span> Prev</div><div class="link"><span>Speech Recognition (Recipe)</span></div><!--]--></a><a class="route-link next" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><div class="hint">Next <span class="arrow right"></span></div><div class="link"><span>Speech Recognition (Library)</span></div><!--]--></a></nav><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
192
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
193
+ </body>
194
+ </html>
espnet2/asr/espnet2_asr_transfer_learning_demo.html ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>Use transfer learning for ASR in ESPnet2 | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" as="script"><link rel="prefetch" 
href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a 
class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span 
class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 1.002 0 0 0-.175.016a13.096 13.096 
0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime 
demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition 
(Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button 
class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SE <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement 
Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SLU <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading active">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link route-link-active sidebar-item active" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#why-using-such-pre-trained-models" aria-label="Why using such (pre-)trained 
models ?"><!--[--><!--[--><!--]--> Why using such (pre-)trained models ? <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#use-a-trained-model-from-espnet-repository-on-huggingface" aria-label="Use a trained model from ESPnet repository on HuggingFace."><!--[--><!--[--><!--]--> Use a trained model from ESPnet repository on HuggingFace. <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">OTHERS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ST <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><h1 id="use-transfer-learning-for-asr-in-espnet2" tabindex="-1"><a class="header-anchor" href="#use-transfer-learning-for-asr-in-espnet2"><span><strong>Use transfer learning for ASR in 
ESPnet2</strong></span></a></h1><p>Author : Dan Berrebbi (dberrebb@andrew.cmu.edu)</p><p>Date : April 11th, 2022</p><h1 id="abstract" tabindex="-1"><a class="header-anchor" href="#abstract"><span>Abstract</span></a></h1><p>In that tutorial, we will introduce several options to use pre-trained models/parameters for Automatic Speech Recognition (ASR) in ESPnet2. Available options are :</p><ul><li>use a local model you (or a collegue) have already trained,</li><li>use a trained model from <a href="https://huggingface.co/espnet" target="_blank" rel="noopener noreferrer">ESPnet repository on HuggingFace<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a>.</li></ul><p>We note that this is done for ASR training, so at <strong>stage 11</strong> of ESPnet2 models&#39; recipe.</p><h3 id="why-using-such-pre-trained-models" tabindex="-1"><a class="header-anchor" href="#why-using-such-pre-trained-models"><span>Why using such (pre-)trained models ?</span></a></h3><p>Several projects may involve making use of previously trained models, this is the reason why we developed ESPnet repository on HuggingFace for instance. Example of use cases are listed below (non-exhaustive):</p><ul><li>target a low resource language, a model trained from scratch may perform badly if trained with only few hours of data,</li><li>study robustness to shifts (domain, language ... 
shifts) of a model,</li><li>make use of massively trained multilingual models.</li><li>...</li></ul><h1 id="espnet-installation-about-10-minutes-in-total" tabindex="-1"><a class="header-anchor" href="#espnet-installation-about-10-minutes-in-total"><span>ESPnet installation (about 10 minutes in total)</span></a></h1><p>Please use the gpu environnement provided by google colab for runing this notebook.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git clone </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">depth </span><span style="color:#B5CEA8;">5</span><span style="color:#D4D4D4;"> https://github.com/espnet/espnet</span></span>
35
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># It takes 30 seconds</span></span>
36
+ <span class="line"><span style="color:#D4D4D4;">%cd /content/espnet/tools</span></span>
37
+ <span class="line"><span style="color:#D4D4D4;">!./setup_anaconda.sh anaconda espnet </span><span style="color:#B5CEA8;">3.9</span></span>
38
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># It may take ~8 minutes</span></span>
39
+ <span class="line"><span style="color:#D4D4D4;">%cd /content/espnet/tools</span></span>
40
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">make CUDA_VERSION=</span><span style="color:#B5CEA8;">10.2</span></span>
41
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h1 id="mini-an4-recipe-as-a-transfer-learning-example" tabindex="-1"><a class="header-anchor" href="#mini-an4-recipe-as-a-transfer-learning-example"><span>mini_an4 recipe as a transfer learning example</span></a></h1><p>In this example, we use the <strong>mini_an4</strong> data, which has only 4 utterances for training. This is of course too small to train an ASR model, but it enables to run all the decribed transfer learning models on a colab environnement. After having run and understood those models/instructions, you can apply it to any other recipe of ESPnet2 or a new recipe that you build. First, move to the recipe directory</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">%cd /content/espnet/egs2/mini_an4/asr1</span></span>
42
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p><strong>Add a configuration file</strong></p><p>As the mini_an4 does not contain any configuration file for ASR model, we add one here.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">config = {</span><span style="color:#CE9178;">&#39;accum_grad&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
43
+ <span class="line"><span style="color:#CE9178;"> &#39;batch_size&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
44
+ <span class="line"><span style="color:#CE9178;"> &#39;batch_type&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&#39;folded&#39;</span><span style="color:#D4D4D4;">,</span></span>
45
+ <span class="line"><span style="color:#CE9178;"> &#39;best_model_criterion&#39;</span><span style="color:#D4D4D4;">: [[</span><span style="color:#CE9178;">&#39;valid&#39;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&#39;acc&#39;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&#39;max&#39;</span><span style="color:#D4D4D4;">]],</span></span>
46
+ <span class="line"><span style="color:#CE9178;"> &#39;decoder&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&#39;transformer&#39;</span><span style="color:#D4D4D4;">,</span></span>
47
+ <span class="line"><span style="color:#CE9178;"> &#39;decoder_conf&#39;</span><span style="color:#D4D4D4;">: {</span><span style="color:#CE9178;">&#39;dropout_rate&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">0.1</span><span style="color:#D4D4D4;">,</span></span>
48
+ <span class="line"><span style="color:#CE9178;"> &#39;input_layer&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&#39;embed&#39;</span><span style="color:#D4D4D4;">,</span></span>
49
+ <span class="line"><span style="color:#CE9178;"> &#39;linear_units&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">2048</span><span style="color:#D4D4D4;">,</span></span>
50
+ <span class="line"><span style="color:#CE9178;"> &#39;num_blocks&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">6</span><span style="color:#D4D4D4;">},</span></span>
51
+ <span class="line"><span style="color:#CE9178;"> &#39;encoder&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&#39;transformer&#39;</span><span style="color:#D4D4D4;">,</span></span>
52
+ <span class="line"><span style="color:#CE9178;"> &#39;encoder_conf&#39;</span><span style="color:#D4D4D4;">: {</span><span style="color:#CE9178;">&#39;attention_dropout_rate&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">0.0</span><span style="color:#D4D4D4;">,</span></span>
53
+ <span class="line"><span style="color:#CE9178;"> &#39;attention_heads&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">4</span><span style="color:#D4D4D4;">,</span></span>
54
+ <span class="line"><span style="color:#CE9178;"> &#39;dropout_rate&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">0.1</span><span style="color:#D4D4D4;">,</span></span>
55
+ <span class="line"><span style="color:#CE9178;"> &#39;input_layer&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&#39;conv2d&#39;</span><span style="color:#D4D4D4;">,</span></span>
56
+ <span class="line"><span style="color:#CE9178;"> &#39;linear_units&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">2048</span><span style="color:#D4D4D4;">,</span></span>
57
+ <span class="line"><span style="color:#CE9178;"> &#39;num_blocks&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">12</span><span style="color:#D4D4D4;">,</span></span>
58
+ <span class="line"><span style="color:#CE9178;"> &#39;output_size&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">256</span><span style="color:#D4D4D4;">},</span></span>
59
+ <span class="line"><span style="color:#CE9178;"> &#39;grad_clip&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">5</span><span style="color:#D4D4D4;">,</span></span>
60
+ <span class="line"><span style="color:#CE9178;"> &#39;init&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&#39;xavier_uniform&#39;</span><span style="color:#D4D4D4;">,</span></span>
61
+ <span class="line"><span style="color:#CE9178;"> &#39;keep_nbest_models&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
62
+ <span class="line"><span style="color:#CE9178;"> &#39;max_epoch&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">5</span><span style="color:#D4D4D4;">,</span></span>
63
+ <span class="line"><span style="color:#CE9178;"> &#39;model_conf&#39;</span><span style="color:#D4D4D4;">: {</span><span style="color:#CE9178;">&#39;ctc_weight&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">0.3</span><span style="color:#D4D4D4;">,</span></span>
64
+ <span class="line"><span style="color:#CE9178;"> &#39;length_normalized_loss&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">False</span><span style="color:#D4D4D4;">,</span></span>
65
+ <span class="line"><span style="color:#CE9178;"> &#39;lsm_weight&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">0.1</span><span style="color:#D4D4D4;">},</span></span>
66
+ <span class="line"><span style="color:#CE9178;"> &#39;optim&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&#39;adam&#39;</span><span style="color:#D4D4D4;">,</span></span>
67
+ <span class="line"><span style="color:#CE9178;"> &#39;optim_conf&#39;</span><span style="color:#D4D4D4;">: {</span><span style="color:#CE9178;">&#39;lr&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">1.0</span><span style="color:#D4D4D4;">},</span></span>
68
+ <span class="line"><span style="color:#CE9178;"> &#39;patience&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">,</span></span>
69
+ <span class="line"><span style="color:#CE9178;"> &#39;scheduler&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&#39;noamlr&#39;</span><span style="color:#D4D4D4;">,</span></span>
70
+ <span class="line"><span style="color:#CE9178;"> &#39;scheduler_conf&#39;</span><span style="color:#D4D4D4;">: {</span><span style="color:#CE9178;">&#39;warmup_steps&#39;</span><span style="color:#D4D4D4;">: </span><span style="color:#B5CEA8;">1000</span><span style="color:#D4D4D4;">}}</span></span>
71
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> yaml</span></span>
72
+ <span class="line"><span style="color:#C586C0;">with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;conf/train_asr.yaml&quot;</span><span style="color:#D4D4D4;">,</span><span style="color:#CE9178;">&quot;w&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
73
+ <span class="line"><span style="color:#D4D4D4;"> yaml.dump(config, f)</span></span>
74
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p><strong>Data preparation (stage 1 - stage 5)</strong></p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">!./asr.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">1</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">5</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_nodev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">valid-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_dev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;test&quot;</span></span>
75
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p><strong>Stage 10: ASR collect stats</strong>:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># takes about 10 seconds</span></span>
76
+ <span class="line"><span style="color:#D4D4D4;">!./asr.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">10</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">10</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_nodev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">valid-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_dev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;test&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_config </span><span style="color:#CE9178;">&quot;conf/train_asr.yaml&quot;</span></span>
77
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p><strong>Stage 11: ASR training (from scratch)</strong></p><p>We train our model for only 5 epochs, just to have a pre-trained model.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># takes about 1-2 minutes</span></span>
78
+ <span class="line"><span style="color:#D4D4D4;">!./asr.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">11</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">11</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_nodev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">valid-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_dev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;test&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_config </span><span style="color:#CE9178;">&quot;conf/train_asr.yaml&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_tag </span><span style="color:#CE9178;">&quot;pre_trained_model&quot;</span></span>
79
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p><strong>Stage 11.2 : ASR training over a pre-trained model</strong></p><p>We train our new model over the previously trained model. (here as we use the same training data, this is not very useful, but again this is a toy example that is reproducible with any model.)</p><p><strong>Step 1</strong> : make sure your ASR model file has the proper ESPnet format (should be ok if trained with ESPnet). It just needs to be a &quot;.pth&quot; (or &quot;.pt&quot; or other extension) type pytorch model.</p><p><strong>Step 2</strong> : add the parameter <code>--pretrained_model path/to/your/pretrained/model/file.pth</code> to run.sh.</p><p><strong>Step 3</strong> : step 2 will initialize your new model with the parameters of the pre-trained model. Thus your new model will be trained with a strong initialization. However, if your new model have different parameter sizes for some parts of the model (e.g. last projection layer could be modified ...). This will lead to an error because of mismatches in size. To prevent this to happen, you can add the parameter <code>--ignore_init_mismatch true</code> in run.sh.</p><p><strong>Step 4 (Optional)</strong> : if you only want to use some specific parts of the pre-trained model, or exclude specific parts, you can specify it in the <code>--pretrained_model</code> argument by passing the component names with the following syntax : <code>--pretrained_model &lt;file_path&gt;:&lt;src_key&gt;:&lt;dst_key&gt;:&lt;exclude_Keys&gt;</code>. <code>src_key</code> are the parameters you want to keep from the pre-trained model. <code>dst_key</code> are the parameters you want to initialize in the new model with the <code>src_key</code>parameters. And <code>exclude_Keys</code> are the parameters from the pre-trained model that you do not want to use. 
You can leave <code>src_key</code> and <code>dst_key</code> fields empty and just fill <code>exclude_Keys</code> with the parameters that you want to drop. For instance, if you want to re-use encoder parameters but not decoder ones, syntax will be <code>--pretrained_model &lt;file_path&gt;:::decoder</code>. You can see the argument expected format in more details <a href="https://github.com/espnet/espnet/blob/e76c78c0c661ab37cc081d46d9b059dcb31292fe/espnet2/torch_utils/load_pretrained_model.py#L43-L53" target="_blank" rel="noopener noreferrer">here<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a>.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># takes about 1-2 minutes</span></span>
80
+ <span class="line"><span style="color:#D4D4D4;">!./asr.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">11</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">11</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_nodev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">valid-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_dev&quot;</span><span style="color:#D4D4D4;"> \</span></span>
81
+ <span class="line"><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;test&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_config </span><span style="color:#CE9178;">&quot;conf/train_asr.yaml&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_tag </span><span style="color:#CE9178;">&quot;transfer_learning_with_pre_trained_model&quot;</span><span style="color:#D4D4D4;">\</span></span>
82
+ <span class="line"><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">pretrained_model </span><span style="color:#CE9178;">&quot;/content/espnet/egs2/mini_an4/asr1/exp/asr_train_asr_raw_bpe30/valid.acc.ave.pth&quot;</span></span>
83
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p><strong>Stage 11.3 : ASR training over a HuggingFace pre-trained model</strong></p><p>We train our new model over the previously trained model from HuggingFace. Any model can be used, here we take a model trained on Bengali as an example. It can be found at https://huggingface.co/espnet/bn_openslr53.</p><h3 id="use-a-trained-model-from-espnet-repository-on-huggingface" tabindex="-1"><a class="header-anchor" href="#use-a-trained-model-from-espnet-repository-on-huggingface"><span>Use a trained model from ESPnet repository on HuggingFace.</span></a></h3><p><a href="https://huggingface.co/espnet" target="_blank" rel="noopener noreferrer">ESPnet repository on HuggingFace<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a> contains more than 200 pre-trained models, for a wide variety of languages and dataset, and we are actively expanding this repositories with new models every week! This enable any user to perform transfer learning with a wide variety of models without having to re-train them. 
In order to use our pre-trained models, the first step is to download the &quot;.pth&quot; model file from the <a href="https://huggingface.co/espnet" target="_blank" rel="noopener noreferrer">HuggingFace page<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a>. There are several easy ways to do it, either by manually downloading them (e.g. <code>wget https://huggingface.co/espnet/bn_openslr53/blob/main/exp/asr_train_asr_raw_bpe1000/41epoch.pth</code>), cloning it (<code>git clone https://huggingface.co/espnet/bn_openslr53</code>) or downloading it through an ESPnet recipe (described in the models&#39; pages on HuggingFace):</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>cd espnet</span></span>
84
+ <span class="line"><span>git checkout fa1b865352475b744c37f70440de1cc6b257ba70</span></span>
85
+ <span class="line"><span>pip install -e .</span></span>
86
+ <span class="line"><span>cd egs2/bn_openslr53/asr1</span></span>
87
+ <span class="line"><span>./run.sh --skip_data_prep false --skip_train true --download_model espnet/bn_openslr53</span></span>
88
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Then, as you have the &quot;.pth&quot; model file, you can follow the steps 1 to 4 from the previous section in order to use this pre-trained model.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">wget https://huggingface.co/espnet/bn_openslr53/resolve/main/exp/asr_train_asr_raw_bpe1000/</span><span style="color:#F44747;">41epoch</span><span style="color:#D4D4D4;">.pth</span></span>
89
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>The next command line will raise an error because of the size mismatch of some parameters, as mentioned before (step 3).</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># will fail in about 5 seconds</span></span>
90
+ <span class="line"><span style="color:#D4D4D4;">!./asr.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">11</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">11</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_nodev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">valid-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_dev&quot;</span><span style="color:#D4D4D4;"> \</span></span>
91
+ <span class="line"><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;test&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_config </span><span style="color:#CE9178;">&quot;conf/train_asr.yaml&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_tag </span><span style="color:#CE9178;">&quot;transfer_learning_with_pre_trained_model&quot;</span><span style="color:#D4D4D4;">\</span></span>
92
+ <span class="line"><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">pretrained_model </span><span style="color:#CE9178;">&quot;/content/espnet/egs2/mini_an4/asr1/41epoch.pth&quot;</span></span>
93
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>To solve this issue, as mentioned, we can use the <code>--ignore_init_mismatch &quot;true&quot;</code> parameter.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># takes about 1-2 minutes</span></span>
94
+ <span class="line"><span style="color:#D4D4D4;">!./asr.sh </span><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">stage </span><span style="color:#B5CEA8;">11</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">stop_stage </span><span style="color:#B5CEA8;">11</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">train-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_nodev&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">valid-</span><span style="color:#4EC9B0;">set</span><span style="color:#CE9178;"> &quot;train_dev&quot;</span><span style="color:#D4D4D4;"> \</span></span>
95
+ <span class="line"><span style="color:#F44747;">--</span><span style="color:#D4D4D4;">test_sets </span><span style="color:#CE9178;">&quot;test&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_config </span><span style="color:#CE9178;">&quot;conf/train_asr.yaml&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">asr_tag </span><span style="color:#CE9178;">&quot;transfer_learning_with_pre_trained_model_from_HF&quot;</span><span style="color:#D4D4D4;">\</span></span>
96
+ <span class="line"><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">pretrained_model </span><span style="color:#CE9178;">&quot;/content/espnet/egs2/mini_an4/asr1/41epoch.pth&quot;</span><span style="color:#F44747;"> --</span><span style="color:#D4D4D4;">ignore_init_mismatch </span><span style="color:#CE9178;">&quot;true&quot;</span><span style="color:#D4D4D4;"> </span></span>
97
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p><strong>Additional note about the <code>--ignore_init_mismatch true</code> option :</strong> This option is very convenient because in lots of transfer learning use cases, you will aim to use a model trained on a language X (e.g. X=English) for another language Y. Language Y may have a vocabulary (set of tokens) different from language X, for instance if you target Y=Totonac, a Mexican low resource language, your model may be stronger if you use a different set of bpes/tokens thatn the one used to train the English model. In that situation, the last layer (projection to vocabulary space) of your ASR model needs to be initialized from scratch and may be different in shape than the one of the English model. For that reason, you should use the <code>--ignore_init_mismatch true</code> option. It also enables to handle the case where the scripts are differents from languages X to Y.</p></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><nav class="vp-page-nav" aria-label="page navigation"><a class="route-link prev" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><div class="hint"><span class="arrow left"></span> Prev</div><div class="link"><span>Speech Recognition (Library)</span></div><!--]--></a><a class="route-link next" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><div class="hint">Next <span class="arrow right"></span></div><div class="link"><span>ESPnet2 real streaming Transformer demonstration</span></div><!--]--></a></nav><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
98
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
99
+ </body>
100
+ </html>
espnet2/asr/espnet2_streaming_asr_demo.html ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>ESPnet2 real streaming Transformer demonstration | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" as="script"><link 
rel="prefetch" href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a 
class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span 
class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 1.002 0 0 0-.175.016a13.096 13.096 
0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime 
demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition 
(Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button 
class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SE <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement 
Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SLU <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading active">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link route-link-active sidebar-item active" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer 
demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#train-a-streaming-transformer-model" aria-label="Train a streaming Transformer model"><!--[--><!--[--><!--]--> Train a streaming Transformer model <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#download-pre-trained-model-and-audio-file-for-demo" aria-label="Download pre-trained model and audio file for demo"><!--[--><!--[--><!--]--> Download pre-trained model and audio file for demo <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#for-mandarin-task-pretrained-using-aishell-1" aria-label="For Mandarin Task (Pretrained using AISHELL-1)"><!--[--><!--[--><!--]--> For Mandarin Task (Pretrained using AISHELL-1) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#for-english-task-pretrained-using-tedlium2" aria-label="For English Task (Pretrained using Tedlium2)"><!--[--><!--[--><!--]--> For English Task (Pretrained using Tedlium2) <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="#import-packages" aria-label="Import packages"><!--[--><!--[--><!--]--> Import packages <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#prepare-for-inference" aria-label="Prepare for inference"><!--[--><!--[--><!--]--> Prepare for inference <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#recognize-the-audio-file" aria-label="Recognize the audio file"><!--[--><!--[--><!--]--> Recognize the audio file <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#recognize-the-speech-from-speaker" aria-label="Recognize the speech from speaker"><!--[--><!--[--><!--]--> Recognize the speech 
from speaker <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#install-pyaudio" aria-label="Install pyaudio"><!--[--><!--[--><!--]--> Install pyaudio <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#streamingly-recognize-with-pyaudio" aria-label="Streamingly recognize with pyaudio"><!--[--><!--[--><!--]--> Streamingly recognize with pyaudio <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">OTHERS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ST <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><h1 id="espnet2-real-streaming-transformer-demonstration" tabindex="-1"><a class="header-anchor" href="#espnet2-real-streaming-transformer-demonstration"><span>ESPnet2 real streaming Transformer demonstration</span></a></h1><p>Details in &quot;Streaming Transformer ASR with Blockwise Synchronous Beam Search&quot; (https://arxiv.org/abs/2006.14941)</p><p>This local notebook provides 
a demonstration of streaming ASR based on Transformer using ESPnet2.</p><p>You can recognize a recorded audio file or a speech online.</p><p>Author: Keqi Deng (UCAS)</p><h2 id="train-a-streaming-transformer-model" tabindex="-1"><a class="header-anchor" href="#train-a-streaming-transformer-model"><span>Train a streaming Transformer model</span></a></h2><p>You can train a streaming Transformer model on your own corpus following the example of https://github.com/espnet/espnet/blob/master/egs2/aishell/asr1/run_streaming.sh</p><h2 id="download-pre-trained-model-and-audio-file-for-demo" tabindex="-1"><a class="header-anchor" href="#download-pre-trained-model-and-audio-file-for-demo"><span>Download pre-trained model and audio file for demo</span></a></h2><p>You can download the pre-trained model from the ESPnet_model_zoo or directly from Huggingface.</p><h3 id="for-mandarin-task-pretrained-using-aishell-1" tabindex="-1"><a class="header-anchor" href="#for-mandarin-task-pretrained-using-aishell-1"><span>For Mandarin Task (Pretrained using AISHELL-1)</span></a></h3><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">tag=</span><span style="color:#CE9178;">&#39;Emiru Tsunoo/aishell_asr_train_asr_streaming_transformer_raw_zh_char_sp_valid.acc.ave&#39;</span></span>
35
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="for-english-task-pretrained-using-tedlium2" tabindex="-1"><a class="header-anchor" href="#for-english-task-pretrained-using-tedlium2"><span>For English Task (Pretrained using Tedlium2)</span></a></h3><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">tag=</span><span style="color:#CE9178;">&#39;D-Keqi/espnet_asr_train_asr_streaming_transformer_raw_en_bpe500_sp_valid.acc.ave&#39;</span></span>
36
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h2 id="import-packages" tabindex="-1"><a class="header-anchor" href="#import-packages"><span>Import packages</span></a></h2><p>Make sure that you have installed the latest ESPnet</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> sys</span></span>
37
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> espnet</span></span>
38
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.asr_inference_streaming </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2TextStreaming</span></span>
39
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet_model_zoo.downloader </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> ModelDownloader</span></span>
40
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> argparse</span></span>
41
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> numpy </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> np</span></span>
42
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> wave</span></span>
43
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="prepare-for-inference" tabindex="-1"><a class="header-anchor" href="#prepare-for-inference"><span>Prepare for inference</span></a></h2><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">d=ModelDownloader()</span></span>
44
+ <span class="line"><span style="color:#D4D4D4;">speech2text = Speech2TextStreaming(</span></span>
45
+ <span class="line"><span style="color:#D4D4D4;"> **d.download_and_unpack(tag),</span></span>
46
+ <span class="line"><span style="color:#9CDCFE;"> token_type</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">None</span><span style="color:#D4D4D4;">,</span></span>
47
+ <span class="line"><span style="color:#9CDCFE;"> bpemodel</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">None</span><span style="color:#D4D4D4;">,</span></span>
48
+ <span class="line"><span style="color:#9CDCFE;"> maxlenratio</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0.0</span><span style="color:#D4D4D4;">,</span></span>
49
+ <span class="line"><span style="color:#9CDCFE;"> minlenratio</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0.0</span><span style="color:#D4D4D4;">,</span></span>
50
+ <span class="line"><span style="color:#9CDCFE;"> beam_size</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">20</span><span style="color:#D4D4D4;">,</span></span>
51
+ <span class="line"><span style="color:#9CDCFE;"> ctc_weight</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0.5</span><span style="color:#D4D4D4;">,</span></span>
52
+ <span class="line"><span style="color:#9CDCFE;"> lm_weight</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0.0</span><span style="color:#D4D4D4;">,</span></span>
53
+ <span class="line"><span style="color:#9CDCFE;"> penalty</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0.0</span><span style="color:#D4D4D4;">,</span></span>
54
+ <span class="line"><span style="color:#9CDCFE;"> nbest</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
55
+ <span class="line"><span style="color:#9CDCFE;"> device</span><span style="color:#D4D4D4;"> = </span><span style="color:#CE9178;">&quot;cpu&quot;</span><span style="color:#D4D4D4;">,</span></span>
56
+ <span class="line"><span style="color:#9CDCFE;"> disable_repetition_detection</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">True</span><span style="color:#D4D4D4;">,</span></span>
57
+ <span class="line"><span style="color:#9CDCFE;"> decoder_text_length_limit</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">,</span></span>
58
+ <span class="line"><span style="color:#9CDCFE;"> encoded_feat_length_limit</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">0</span></span>
59
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
60
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">prev_lines = </span><span style="color:#B5CEA8;">0</span></span>
61
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> progress_output</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">text</span><span style="color:#D4D4D4;">):</span></span>
62
+ <span class="line"><span style="color:#569CD6;"> global</span><span style="color:#D4D4D4;"> prev_lines</span></span>
63
+ <span class="line"><span style="color:#D4D4D4;"> lines=[</span><span style="color:#CE9178;">&#39;&#39;</span><span style="color:#D4D4D4;">]</span></span>
64
+ <span class="line"><span style="color:#C586C0;"> for</span><span style="color:#D4D4D4;"> i </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> text:</span></span>
65
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#DCDCAA;"> len</span><span style="color:#D4D4D4;">(lines[-</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">]) &gt; </span><span style="color:#B5CEA8;">100</span><span style="color:#D4D4D4;">:</span></span>
66
+ <span class="line"><span style="color:#D4D4D4;"> lines.append(</span><span style="color:#CE9178;">&#39;&#39;</span><span style="color:#D4D4D4;">)</span></span>
67
+ <span class="line"><span style="color:#D4D4D4;"> lines[-</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">] += i</span></span>
68
+ <span class="line"><span style="color:#C586C0;"> for</span><span style="color:#D4D4D4;"> i,line </span><span style="color:#C586C0;">in</span><span style="color:#DCDCAA;"> enumerate</span><span style="color:#D4D4D4;">(lines):</span></span>
69
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#D4D4D4;"> i == prev_lines:</span></span>
70
+ <span class="line"><span style="color:#D4D4D4;"> sys.stderr.write(</span><span style="color:#CE9178;">&#39;</span><span style="color:#D7BA7D;">\n\r</span><span style="color:#CE9178;">&#39;</span><span style="color:#D4D4D4;">)</span></span>
71
+ <span class="line"><span style="color:#C586C0;"> else</span><span style="color:#D4D4D4;">:</span></span>
72
+ <span class="line"><span style="color:#D4D4D4;"> sys.stderr.write(</span><span style="color:#CE9178;">&#39;</span><span style="color:#D7BA7D;">\r\033</span><span style="color:#CE9178;">[B</span><span style="color:#D7BA7D;">\033</span><span style="color:#CE9178;">[K&#39;</span><span style="color:#D4D4D4;">)</span></span>
73
+ <span class="line"><span style="color:#D4D4D4;"> sys.stderr.write(line)</span></span>
74
+ <span class="line"></span>
75
+ <span class="line"><span style="color:#D4D4D4;"> prev_lines = </span><span style="color:#DCDCAA;">len</span><span style="color:#D4D4D4;">(lines)</span></span>
76
+ <span class="line"><span style="color:#D4D4D4;"> sys.stderr.flush()</span></span>
77
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> recognize</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">wavfile</span><span style="color:#D4D4D4;">):</span></span>
78
+ <span class="line"><span style="color:#C586C0;"> with</span><span style="color:#D4D4D4;"> wave.open(wavfile, </span><span style="color:#CE9178;">&#39;rb&#39;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> wavfile:</span></span>
79
+ <span class="line"><span style="color:#D4D4D4;"> ch=wavfile.getnchannels()</span></span>
80
+ <span class="line"><span style="color:#D4D4D4;"> bits=wavfile.getsampwidth()</span></span>
81
+ <span class="line"><span style="color:#D4D4D4;"> rate=wavfile.getframerate()</span></span>
82
+ <span class="line"><span style="color:#D4D4D4;"> nframes=wavfile.getnframes()</span></span>
83
+ <span class="line"><span style="color:#D4D4D4;"> buf = wavfile.readframes(-</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">)</span></span>
84
+ <span class="line"><span style="color:#D4D4D4;"> data=np.frombuffer(buf, </span><span style="color:#9CDCFE;">dtype</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;int16&#39;</span><span style="color:#D4D4D4;">)</span></span>
85
+ <span class="line"><span style="color:#D4D4D4;"> speech = data.astype(np.float16)/</span><span style="color:#B5CEA8;">32767.0</span><span style="color:#6A9955;"> #32767 is the upper limit of 16-bit binary numbers and is used for the normalization of int to float.</span></span>
86
+ <span class="line"><span style="color:#D4D4D4;"> sim_chunk_length = </span><span style="color:#B5CEA8;">640</span></span>
87
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#D4D4D4;"> sim_chunk_length &gt; </span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">:</span></span>
88
+ <span class="line"><span style="color:#C586C0;"> for</span><span style="color:#D4D4D4;"> i </span><span style="color:#C586C0;">in</span><span style="color:#DCDCAA;"> range</span><span style="color:#D4D4D4;">(</span><span style="color:#DCDCAA;">len</span><span style="color:#D4D4D4;">(speech)//sim_chunk_length):</span></span>
89
+ <span class="line"><span style="color:#D4D4D4;"> results = speech2text(</span><span style="color:#9CDCFE;">speech</span><span style="color:#D4D4D4;">=speech[i*sim_chunk_length:(i+</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">)*sim_chunk_length], </span><span style="color:#9CDCFE;">is_final</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">False</span><span style="color:#D4D4D4;">)</span></span>
90
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#D4D4D4;"> results </span><span style="color:#569CD6;">is</span><span style="color:#569CD6;"> not</span><span style="color:#569CD6;"> None</span><span style="color:#569CD6;"> and</span><span style="color:#DCDCAA;"> len</span><span style="color:#D4D4D4;">(results) &gt; </span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">:</span></span>
91
+ <span class="line"><span style="color:#D4D4D4;"> nbests = [text </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> text, token, token_int, hyp </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> results]</span></span>
92
+ <span class="line"><span style="color:#D4D4D4;"> text = nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">] </span><span style="color:#C586C0;">if</span><span style="color:#D4D4D4;"> nbests </span><span style="color:#569CD6;">is</span><span style="color:#569CD6;"> not</span><span style="color:#569CD6;"> None</span><span style="color:#569CD6;"> and</span><span style="color:#DCDCAA;"> len</span><span style="color:#D4D4D4;">(nbests) &gt; </span><span style="color:#B5CEA8;">0</span><span style="color:#C586C0;"> else</span><span style="color:#CE9178;"> &quot;&quot;</span></span>
93
+ <span class="line"><span style="color:#D4D4D4;"> progress_output(nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">])</span></span>
94
+ <span class="line"><span style="color:#C586C0;"> else</span><span style="color:#D4D4D4;">:</span></span>
95
+ <span class="line"><span style="color:#D4D4D4;"> progress_output(</span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">)</span></span>
96
+ <span class="line"><span style="color:#D4D4D4;"> </span></span>
97
+ <span class="line"><span style="color:#D4D4D4;"> results = speech2text(speech[(i+</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">)*sim_chunk_length:</span><span style="color:#DCDCAA;">len</span><span style="color:#D4D4D4;">(speech)], </span><span style="color:#9CDCFE;">is_final</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">True</span><span style="color:#D4D4D4;">)</span></span>
98
+ <span class="line"><span style="color:#C586C0;"> else</span><span style="color:#D4D4D4;">:</span></span>
99
+ <span class="line"><span style="color:#D4D4D4;"> results = speech2text(speech, </span><span style="color:#9CDCFE;">is_final</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">True</span><span style="color:#D4D4D4;">)</span></span>
100
+ <span class="line"><span style="color:#D4D4D4;"> nbests = [text </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> text, token, token_int, hyp </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> results]</span></span>
101
+ <span class="line"><span style="color:#D4D4D4;"> progress_output(nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">])</span></span>
102
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="recognize-the-audio-file" tabindex="-1"><a class="header-anchor" href="#recognize-the-audio-file"><span>Recognize the audio file</span></a></h2><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;">#You can upload your own audio file for recognition, and also we provide some demo audio files that you can download from Google drive. </span></span>
103
+ <span class="line"><span style="color:#6A9955;">#For Mandarin task, the demo file comes from the AISSHELL-1: https://drive.google.com/file/d/1l8w93r8Bs5FtC3A-1ydEqFQdP4k6FiUL/view?usp=sharing</span></span>
104
+ <span class="line"><span style="color:#6A9955;">#wavfile=&#39;./BAC009S0724W0121.wav&#39;</span></span>
105
+ <span class="line"><span style="color:#6A9955;">#For English task, the demo file comes from the Librispeech: https://drive.google.com/file/d/1l71ZUNQ6qQk95T54H0tH_OEwZvWnEL4u/view?usp=sharing</span></span>
106
+ <span class="line"><span style="color:#6A9955;">#wavfile=&#39;./61-70968-0000.wav&#39;</span></span>
107
+ <span class="line"><span style="color:#D4D4D4;">recognize(wavfile)</span></span>
108
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="recognize-the-speech-from-speaker" tabindex="-1"><a class="header-anchor" href="#recognize-the-speech-from-speaker"><span>Recognize the speech from speaker</span></a></h2><h3 id="install-pyaudio" tabindex="-1"><a class="header-anchor" href="#install-pyaudio"><span>Install pyaudio</span></a></h3><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> pyaudio</span></span>
109
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="streamingly-recognize-with-pyaudio" tabindex="-1"><a class="header-anchor" href="#streamingly-recognize-with-pyaudio"><span>Streamingly recognize with pyaudio</span></a></h3><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">CHUNK=</span><span style="color:#B5CEA8;">2048</span></span>
110
+ <span class="line"><span style="color:#D4D4D4;">FORMAT=pyaudio.paInt16</span></span>
111
+ <span class="line"><span style="color:#D4D4D4;">CHANNELS=</span><span style="color:#B5CEA8;">1</span></span>
112
+ <span class="line"><span style="color:#D4D4D4;">RATE=</span><span style="color:#B5CEA8;">16000</span></span>
113
+ <span class="line"><span style="color:#D4D4D4;">RECORD_SECONDS=</span><span style="color:#B5CEA8;">5</span></span>
114
+ <span class="line"><span style="color:#D4D4D4;">p=pyaudio.PyAudio()</span></span>
115
+ <span class="line"><span style="color:#D4D4D4;">stream = p.open(</span><span style="color:#9CDCFE;">format</span><span style="color:#D4D4D4;">=FORMAT,</span><span style="color:#9CDCFE;">channels</span><span style="color:#D4D4D4;">=CHANNELS,</span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=RATE,</span><span style="color:#9CDCFE;">input</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">True</span><span style="color:#D4D4D4;">,</span><span style="color:#9CDCFE;">frames_per_buffer</span><span style="color:#D4D4D4;">=CHUNK)</span></span>
116
+ <span class="line"><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> i </span><span style="color:#C586C0;">in</span><span style="color:#DCDCAA;"> range</span><span style="color:#D4D4D4;">(</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">,</span><span style="color:#4EC9B0;">int</span><span style="color:#D4D4D4;">(RATE/CHUNK*RECORD_SECONDS)+</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">):</span></span>
117
+ <span class="line"><span style="color:#D4D4D4;"> data=stream.read(CHUNK)</span></span>
118
+ <span class="line"><span style="color:#D4D4D4;"> data=np.frombuffer(data, </span><span style="color:#9CDCFE;">dtype</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;int16&#39;</span><span style="color:#D4D4D4;">)</span></span>
119
+ <span class="line"><span style="color:#D4D4D4;"> data=data.astype(np.float16)/</span><span style="color:#B5CEA8;">32767.0</span><span style="color:#6A9955;"> #32767 is the upper limit of 16-bit binary numbers and is used for the normalization of int to float.</span></span>
120
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#D4D4D4;"> i==</span><span style="color:#4EC9B0;">int</span><span style="color:#D4D4D4;">(RATE/CHUNK*RECORD_SECONDS):</span></span>
121
+ <span class="line"><span style="color:#D4D4D4;"> results = speech2text(</span><span style="color:#9CDCFE;">speech</span><span style="color:#D4D4D4;">=data, </span><span style="color:#9CDCFE;">is_final</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">True</span><span style="color:#D4D4D4;">)</span></span>
122
+ <span class="line"><span style="color:#C586C0;"> break</span></span>
123
+ <span class="line"><span style="color:#D4D4D4;"> results = speech2text(</span><span style="color:#9CDCFE;">speech</span><span style="color:#D4D4D4;">=data, </span><span style="color:#9CDCFE;">is_final</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">False</span><span style="color:#D4D4D4;">)</span></span>
124
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#D4D4D4;"> results </span><span style="color:#569CD6;">is</span><span style="color:#569CD6;"> not</span><span style="color:#569CD6;"> None</span><span style="color:#569CD6;"> and</span><span style="color:#DCDCAA;"> len</span><span style="color:#D4D4D4;">(results) &gt; </span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">:</span></span>
125
+ <span class="line"><span style="color:#D4D4D4;"> nbests = [text </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> text, token, token_int, hyp </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> results]</span></span>
126
+ <span class="line"><span style="color:#D4D4D4;"> text = nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">] </span><span style="color:#C586C0;">if</span><span style="color:#D4D4D4;"> nbests </span><span style="color:#569CD6;">is</span><span style="color:#569CD6;"> not</span><span style="color:#569CD6;"> None</span><span style="color:#569CD6;"> and</span><span style="color:#DCDCAA;"> len</span><span style="color:#D4D4D4;">(nbests) &gt; </span><span style="color:#B5CEA8;">0</span><span style="color:#C586C0;"> else</span><span style="color:#CE9178;"> &quot;&quot;</span></span>
127
+ <span class="line"><span style="color:#D4D4D4;"> progress_output(nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">])</span></span>
128
+ <span class="line"><span style="color:#C586C0;"> else</span><span style="color:#D4D4D4;">:</span></span>
129
+ <span class="line"><span style="color:#D4D4D4;"> progress_output(</span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">)</span></span>
130
+ <span class="line"><span style="color:#D4D4D4;">nbests = [text </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> text, token, token_int, hyp </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> results]</span></span>
131
+ <span class="line"><span style="color:#D4D4D4;">progress_output(nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">])</span></span>
132
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><nav class="vp-page-nav" aria-label="page navigation"><a class="route-link prev" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><div class="hint"><span class="arrow left"></span> Prev</div><div class="link"><span>Use transfer learning for ASR in ESPnet2</span></div><!--]--></a><!----></nav><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
133
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
134
+ </body>
135
+ </html>
espnet2/others/onnx_conversion_demo.html ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>espnet_onnx demonstration | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/onnx_conversion_demo.html-D56NEMop.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" as="script"><link 
rel="prefetch" href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a 
class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span 
class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 1.002 0 0 0-.175.016a13.096 13.096 
0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime 
demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition 
(Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button 
class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SE <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement 
Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SLU <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real 
streaming Transformer demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading active">OTHERS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link route-link-active sidebar-item active" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#table-of-contents" aria-label="Table of Contents"><!--[--><!--[--><!--]--> Table of Contents <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#export-model-from-espnet-model-zoo" aria-label="Export model from espnet_model_zoo"><!--[--><!--[--><!--]--> Export model from espnet_model_zoo <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#export-from-custom-model" aria-label="Export from custom model"><!--[--><!--[--><!--]--> Export from custom model <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ST <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><h1 id="espnet-onnx-demonstration" tabindex="-1"><a class="header-anchor" href="#espnet-onnx-demonstration"><span>espnet_onnx 
demonstration</span></a></h1><p>This notebook provides a demonstration of how to export your trained model into onnx format. Currently only ASR is supported.</p><p>see also:</p><ul><li>ESPnet: https://github.com/espnet/espnet</li><li>espnet_onnx: https://github.com/Masao-Someki/espnet_onnx</li></ul><p>Author: <a href="https://github.com/Masao-Someki" target="_blank" rel="noopener noreferrer">Masao Someki<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></p><h2 id="table-of-contents" tabindex="-1"><a class="header-anchor" href="#table-of-contents"><span>Table of Contents</span></a></h2><ul><li>Install Dependency</li><li>Export your model</li><li>Inference with onnx</li><li>Using streaming model</li></ul><h1 id="install-dependency" tabindex="-1"><a class="header-anchor" href="#install-dependency"><span>Install Dependency</span></a></h1><p>To run this demo, you need to install the following packages.</p><ul><li>espnet_onnx</li><li>torch &gt;= 1.11.0 (already installed in Colab)</li><li>espnet</li><li>espnet_model_zoo</li><li>onnx</li></ul><p><code>torch</code>, <code>espnet</code>, <code>espnet_model_zoo</code>, <code>onnx</code> is required to run the exportation demo.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">pip install -U espnet_onnx espnet espnet_model_zoo 
onnx</span></span>
35
+ <span class="line"></span>
36
+ <span class="line"><span style="color:#6A9955;"># in this demo, we need to update scipy to avoid an error</span></span>
37
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">pip install -U scipy</span></span>
38
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h1 id="export-your-model" tabindex="-1"><a class="header-anchor" href="#export-your-model"><span>Export your model</span></a></h1><h2 id="export-model-from-espnet-model-zoo" tabindex="-1"><a class="header-anchor" href="#export-model-from-espnet-model-zoo"><span>Export model from espnet_model_zoo</span></a></h2><p>The easiest way to export a model is to use <code>espnet_model_zoo</code>. You can download, unpack, and export the pretrained models with <code>export_from_pretrained</code> method. <code>espnet_onnx</code> will save the onnx models into cache directory, which is <code>${HOME}/.cache/espnet_onnx</code> in default.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># export the model.</span></span>
39
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet_onnx.export </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> ModelExport</span></span>
40
+ <span class="line"></span>
41
+ <span class="line"><span style="color:#D4D4D4;">tag_name = </span><span style="color:#CE9178;">&#39;kamo-naoyuki/timit_asr_train_asr_raw_word_valid.acc.ave&#39;</span></span>
42
+ <span class="line"></span>
43
+ <span class="line"><span style="color:#D4D4D4;">m = ModelExport()</span></span>
44
+ <span class="line"><span style="color:#D4D4D4;">m.export_from_pretrained(tag_name)</span></span>
45
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="export-from-custom-model" tabindex="-1"><a class="header-anchor" href="#export-from-custom-model"><span>Export from custom model</span></a></h2><p><code>espnet_onnx</code> can also export your own trained model with <code>export</code> method.</p><p>The following script shows how to export from <code>espnet2.bin.asr_inference.Speech2Text</code> instance. You can also export from a zipped file, by using the <code>export_from_zip</code> function.<br> For this demonstration, I&#39;m using the <code>from_pretrained</code> method to load parameters, but you can load your own model.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># prepare the espnet2.bin.asr_inference.Speech2Text instance.</span></span>
46
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.asr_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
47
+ <span class="line"></span>
48
+ <span class="line"><span style="color:#D4D4D4;">tag_name = </span><span style="color:#CE9178;">&#39;kamo-naoyuki/timit_asr_train_asr_raw_word_valid.acc.ave&#39;</span></span>
49
+ <span class="line"><span style="color:#D4D4D4;">speech2text = Speech2Text.from_pretrained(tag_name)</span></span>
50
+ <span class="line"></span>
51
+ <span class="line"></span>
52
+ <span class="line"><span style="color:#6A9955;"># export model</span></span>
53
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet_onnx.export </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> ModelExport</span></span>
54
+ <span class="line"></span>
55
+ <span class="line"><span style="color:#D4D4D4;">sample_model_tag = </span><span style="color:#CE9178;">&#39;demo/sample_model_1&#39;</span></span>
56
+ <span class="line"><span style="color:#D4D4D4;">m = ModelExport()</span></span>
57
+ <span class="line"><span style="color:#D4D4D4;">m.export(</span></span>
58
+ <span class="line"><span style="color:#D4D4D4;"> speech2text,</span></span>
59
+ <span class="line"><span style="color:#D4D4D4;"> sample_model_tag,</span></span>
60
+ <span class="line"><span style="color:#9CDCFE;"> quantize</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">False</span></span>
61
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
62
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h1 id="inference-with-onnx" tabindex="-1"><a class="header-anchor" href="#inference-with-onnx"><span>Inference with onnx</span></a></h1><p>Now, let&#39;s use the exported models for inference.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># please provide the tag_name to specify exported model.</span></span>
63
+ <span class="line"><span style="color:#D4D4D4;">tag_name = </span><span style="color:#CE9178;">&#39;kamo-naoyuki/timit_asr_train_asr_raw_word_valid.acc.ave&#39;</span></span>
64
+ <span class="line"></span>
65
+ <span class="line"></span>
66
+ <span class="line"><span style="color:#6A9955;"># upload wav file and let&#39;s inference!</span></span>
67
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa</span></span>
68
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> google.colab </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> files</span></span>
69
+ <span class="line"></span>
70
+ <span class="line"><span style="color:#D4D4D4;">wav_file = files.upload()</span></span>
71
+ <span class="line"><span style="color:#D4D4D4;">y, sr = librosa.load(</span><span style="color:#4EC9B0;">list</span><span style="color:#D4D4D4;">(wav_file.keys())[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">], </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">)</span></span>
72
+ <span class="line"></span>
73
+ <span class="line"></span>
74
+ <span class="line"><span style="color:#6A9955;"># Use the exported onnx file to inference.</span></span>
75
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet_onnx </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
76
+ <span class="line"></span>
77
+ <span class="line"><span style="color:#D4D4D4;">speech2text = Speech2Text(tag_name)</span></span>
78
+ <span class="line"><span style="color:#D4D4D4;">nbest = speech2text(y)</span></span>
79
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(nbest[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">])</span></span>
80
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h1 id="using-streaming-model" tabindex="-1"><a class="header-anchor" href="#using-streaming-model"><span>Using streaming model</span></a></h1><p>Model exportation is exactly the same as non-streaming model. You can follow the <code>#Export your model</code> chapter.</p><p>As for streaming, you can specify the following configuration additionaly. Usually, these values should be the same as the training configuration.</p><ul><li>block_size</li><li>hop_size</li><li>look_ahead</li></ul><p>The length of the speech should be the same as <code>streaming_model.hop_size</code>. 
This value is calculated as follows</p><p>$$ \begin{align} h &amp;= \text{hop_size} * \text{encoder.subsample} * \text{stft.hop_length}\ \text{padding} &amp;= (\text{stft.n_fft} // \text{stft.hop_length}) * \text{stft.hop_length} \ \text{len(wav)} &amp;= h + \text{padding} \end{align} $$</p><p>For example, the length of the speech is 8704 with the following configuration.</p><ul><li>block_size = 40</li><li>hop_size = 16</li><li>look_ahead = 16</li><li>encoder.subsample = 4</li><li>stft.n_fft = 512</li><li>stft.hop_length = 128</li></ul><p>Now, let&#39;s demonstrate the streaming inference.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># Export the streaming model.</span></span>
81
+ <span class="line"><span style="color:#6A9955;"># Note that the following model is very large</span></span>
82
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet_onnx.export </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> ModelExport</span></span>
83
+ <span class="line"></span>
84
+ <span class="line"><span style="color:#D4D4D4;">tag_name = </span><span style="color:#CE9178;">&#39;D-Keqi/espnet_asr_train_asr_streaming_transformer_raw_en_bpe500_sp_valid.acc.ave&#39;</span></span>
85
+ <span class="line"></span>
86
+ <span class="line"><span style="color:#D4D4D4;">m = ModelExport()</span></span>
87
+ <span class="line"><span style="color:#D4D4D4;">m.export_from_pretrained(tag_name)</span></span>
88
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># In this tutorial, we will use the recorded wav file to simulate streaming.</span></span>
89
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa</span></span>
90
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet_onnx </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> StreamingSpeech2Text</span></span>
91
+ <span class="line"></span>
92
+ <span class="line"><span style="color:#D4D4D4;">tag_name = </span><span style="color:#CE9178;">&#39;D-Keqi/espnet_asr_train_asr_streaming_transformer_raw_en_bpe500_sp_valid.acc.ave&#39;</span></span>
93
+ <span class="line"><span style="color:#D4D4D4;">streaming_model = StreamingSpeech2Text(tag_name)</span></span>
94
+ <span class="line"></span>
95
+ <span class="line"><span style="color:#6A9955;"># upload wav file</span></span>
96
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> google.colab </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> files</span></span>
97
+ <span class="line"><span style="color:#D4D4D4;">wav_file = files.upload()</span></span>
98
+ <span class="line"><span style="color:#D4D4D4;">y, sr = librosa.load(</span><span style="color:#4EC9B0;">list</span><span style="color:#D4D4D4;">(wav_file.keys())[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">], </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">)</span></span>
99
+ <span class="line"></span>
100
+ <span class="line"><span style="color:#D4D4D4;">num_process = </span><span style="color:#DCDCAA;">len</span><span style="color:#D4D4D4;">(y) // streaming_model.hop_size + </span><span style="color:#B5CEA8;">1</span></span>
101
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;I will split your audio file into </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">num_process</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;"> blocks.&quot;</span><span style="color:#D4D4D4;">)</span></span>
102
+ <span class="line"></span>
103
+ <span class="line"><span style="color:#6A9955;"># simulate streaming.</span></span>
104
+ <span class="line"><span style="color:#D4D4D4;">streaming_model.start()</span></span>
105
+ <span class="line"><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> i </span><span style="color:#C586C0;">in</span><span style="color:#DCDCAA;"> range</span><span style="color:#D4D4D4;">(num_process):</span></span>
106
+ <span class="line"><span style="color:#6A9955;"> # prepare wav file</span></span>
107
+ <span class="line"><span style="color:#D4D4D4;"> start = i * streaming_model.hop_size</span></span>
108
+ <span class="line"><span style="color:#D4D4D4;"> end = (i + </span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">) * streaming_model.hop_size</span></span>
109
+ <span class="line"><span style="color:#D4D4D4;"> wav_streaming = y[start : end]</span></span>
110
+ <span class="line"></span>
111
+ <span class="line"><span style="color:#6A9955;"> # apply padding if len(wav_streaming) &lt; streaming_model.hop_size</span></span>
112
+ <span class="line"><span style="color:#D4D4D4;"> wav_streaming = streaming_model.pad(wav_streaming)</span></span>
113
+ <span class="line"><span style="color:#D4D4D4;"> </span></span>
114
+ <span class="line"><span style="color:#6A9955;"> # compute asr</span></span>
115
+ <span class="line"><span style="color:#D4D4D4;"> nbest = streaming_model(wav_streaming)</span></span>
116
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&#39;Result at position </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">i</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;"> : </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">nbest[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&#39;</span><span style="color:#D4D4D4;">)</span></span>
117
+ <span class="line"></span>
118
+ <span class="line"><span style="color:#D4D4D4;">final_nbest = streaming_model.end()</span></span>
119
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&#39;Final result : </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">final_nbest[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&#39;</span><span style="color:#D4D4D4;">)</span></span>
120
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><nav class="vp-page-nav" aria-label="page navigation"><!----><a class="route-link next" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><div class="hint">Next <span class="arrow right"></span></div><div class="link"><span>Pretrained Model</span></div><!--]--></a></nav><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
121
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
122
+ </body>
123
+ </html>
espnet2/others/pretrained.html ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>Pretrained Model | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/pretrained.html-JpE__EKJ.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" 
as="script"><link rel="prefetch" href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a 
class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span 
class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 1.002 0 0 0-.175.016a13.096 13.096 
0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime 
demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition 
(Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button 
class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SE <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement 
Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SLU <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real 
streaming Transformer demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading active">OTHERS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link route-link-active sidebar-item active" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#setup-envrionment" aria-label="Setup envrionment"><!--[--><!--[--><!--]--> Setup envrionment <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#recognize-speech-using-pretrained-models" aria-label="Recognize speech using pretrained models"><!--[--><!--[--><!--]--> Recognize speech using pretrained models <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#synthesize-speech-using-pretrained-models" aria-label="Synthesize speech using pretrained models"><!--[--><!--[--><!--]--> Synthesize speech using pretrained models <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ST <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><h1 id="pretrained-model" tabindex="-1"><a class="header-anchor" 
href="#pretrained-model"><span>Pretrained Model</span></a></h1><p>This is the example notebook of how-to-recognize and -synthesize speech using the ESPnet models.</p><p>See also:</p><ul><li>Tutorial: https://github.com/espnet/espnet/blob/master/doc/tutorial.md</li><li>Github: https://github.com/espnet</li></ul><p>Author: <a href="https://github.com/takenori-y" target="_blank" rel="noopener noreferrer">Takenori Yoshimura<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></p><p>Last update: 2019/07/28</p><h2 id="setup-envrionment" tabindex="-1"><a class="header-anchor" href="#setup-envrionment"><span>Setup envrionment</span></a></h2><p>Let&#39;s setup the environmet for the demonstration. It takes around 10 minues. Please keep waiting for a while.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># OS setup</span></span>
35
+ <span class="line"><span>!sudo apt-get install bc tree sox</span></span>
36
+ <span class="line"><span>!cat /etc/os-release</span></span>
37
+ <span class="line"><span></span></span>
38
+ <span class="line"><span># espnet setup</span></span>
39
+ <span class="line"><span>!git clone https://github.com/espnet/espnet</span></span>
40
+ <span class="line"><span>!cd espnet; pip install -e .</span></span>
41
+ <span class="line"><span></span></span>
42
+ <span class="line"><span># warp ctc setup</span></span>
43
+ <span class="line"><span>!git clone https://github.com/espnet/warp-ctc -b pytorch-1.1</span></span>
44
+ <span class="line"><span>!cd warp-ctc &amp;&amp; mkdir build &amp;&amp; cd build &amp;&amp; cmake .. &amp;&amp; make -j</span></span>
45
+ <span class="line"><span>!cd warp-ctc/pytorch_binding &amp;&amp; python setup.py install </span></span>
46
+ <span class="line"><span></span></span>
47
+ <span class="line"><span># kaldi setup</span></span>
48
+ <span class="line"><span>!cd /content/espnet/tools; git clone https://github.com/kaldi-asr/kaldi</span></span>
49
+ <span class="line"><span>!echo &quot;&quot; &gt; ./espnet/tools/kaldi/tools/extras/check_dependencies.sh</span></span>
50
+ <span class="line"><span>!chmod +x ./espnet/tools/kaldi/tools/extras/check_dependencies.sh</span></span>
51
+ <span class="line"><span>!cd ./espnet/tools/kaldi/tools; make sph2pipe sclite</span></span>
52
+ <span class="line"><span>!rm -rf espnet/tools/kaldi/tools/python</span></span>
53
+ <span class="line"><span>!wget https://18-198329952-gh.circle-artifacts.com/0/home/circleci/repo/ubuntu16-featbin.tar.gz</span></span>
54
+ <span class="line"><span>!tar -xf ./ubuntu16-featbin.tar.gz</span></span>
55
+ <span class="line"><span>!cp featbin/* espnet/tools/kaldi/src/featbin/</span></span>
56
+ <span class="line"><span></span></span>
57
+ <span class="line"><span># sentencepiece setup</span></span>
58
+ <span class="line"><span>!cd espnet/tools; make sentencepiece.done</span></span>
59
+ <span class="line"><span></span></span>
60
+ <span class="line"><span># make dummy activate</span></span>
61
+ <span class="line"><span>!mkdir -p espnet/tools/venv/bin</span></span>
62
+ <span class="line"><span>!touch espnet/tools/venv/bin/activate</span></span>
63
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="recognize-speech-using-pretrained-models" tabindex="-1"><a class="header-anchor" href="#recognize-speech-using-pretrained-models"><span>Recognize speech using pretrained models</span></a></h2><p>Let&#39;s recognize 7-minutes long audio speech as an example. 
Go to a recipe directory and run <code>recog_wav.sh</code> at the directory.</p><p>Available models are summarized <a href="https://github.com/espnet/espnet#asr-demo" target="_blank" rel="noopener noreferrer">here<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a>.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cd espnet/egs/tedlium2/asr1; bash ../../../utils/recog_wav.sh --models tedlium2.tacotron2.v1</span></span>
64
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>You can see the progress of the recognition.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cat espnet/egs/tedlium2/asr1/decode/TomWujec_2010U/log/decode.log</span></span>
65
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>You can change E2E model, language model, decoding parameters, etc. For the detail, see <code>recog_wav.sh</code>.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cat espnet/utils/recog_wav.sh</span></span>
66
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h2 id="synthesize-speech-using-pretrained-models" tabindex="-1"><a class="header-anchor" href="#synthesize-speech-using-pretrained-models"><span>Synthesize speech using pretrained models</span></a></h2><p>Let&#39;s synthesize speech using an E2E model. Go to a recipe directory and run <code>synth_wav.sh</code> at the directory.</p><p>Available models are summarized <a href="https://github.com/espnet/espnet#tts-demo" target="_blank" rel="noopener noreferrer">here<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a>.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cd espnet/egs/ljspeech/tts1; \</span></span>
67
+ <span class="line"><span>echo &quot;THIS IS A DEMONSTRATION OF TEXT TO SPEECH.&quot; &gt; example.txt; \</span></span>
68
+ <span class="line"><span>bash ../../../utils/synth_wav.sh --models ljspeech.tacotron2.v1 example.txt</span></span>
69
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Let&#39;s listen the synthesized speech!</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from google.colab import files</span></span>
70
+ <span class="line"><span></span></span>
71
+ <span class="line"><span>files.download(&#39;espnet/egs/ljspeech/tts1/decode/example/wav/example.wav&#39;)</span></span>
72
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>You can change E2E model, decoding parameters, etc. For the detail, see <code>synth_wav.sh</code>.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cat espnet/utils/synth_wav.sh</span></span>
73
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>We have a web storage to put your good trained models. If you want, please contact Shinji Watanabe <a href="mailto:shinjiw@ieee.org">shinjiw@ieee.org</a>.</p></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><nav class="vp-page-nav" aria-label="page navigation"><a class="route-link prev" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><div class="hint"><span class="arrow left"></span> Prev</div><div class="link"><span>espnet_onnx demonstration</span></div><!--]--></a><!----></nav><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
74
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
75
+ </body>
76
+ </html>
espnet2/se/espnet_se_demonstration_for_waspaa_2021.html ADDED
The diff for this file is too large to render. See raw diff
 
espnet2/se/se_demo.html ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>ESPnet Speech Enhancement Demonstration | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/se_demo.html-DY-mv2y8.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" 
as="script"><link rel="prefetch" href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li 
class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span 
class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 1.002 0 0 0-.175.016a13.096 13.096 
0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime 
demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech 
Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" 
type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading active">SE <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link route-link-active sidebar-item active" href="/espnet2/se/se_demo.html" aria-label="ESPnet 
Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#install" aria-label="Install"><!--[--><!--[--><!--]--> Install <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#speech-enhancement" aria-label="Speech Enhancement"><!--[--><!--[--><!--]--> Speech Enhancement <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#single-channel-enhancement-the-chime-example" aria-label="Single-Channel Enhancement, the CHiME example"><!--[--><!--[--><!--]--> Single-Channel Enhancement, the CHiME example <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#enhance-your-own-pre-recordings" aria-label="Enhance your own pre-recordings"><!--[--><!--[--><!--]--> Enhance your own pre-recordings <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#multi-channel-enhancement" aria-label="Multi-Channel Enhancement"><!--[--><!--[--><!--]--> Multi-Channel Enhancement <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="#speech-separation" aria-label="Speech Separation"><!--[--><!--[--><!--]--> Speech Separation <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#model-selection" aria-label="Model Selection"><!--[--><!--[--><!--]--> Model Selection <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#separate-speech-mixture" aria-label="Separate Speech Mixture"><!--[--><!--[--><!--]--> Separate Speech Mixture <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="#evluate-separated-speech-with-pretrained-asr-model" aria-label="Evluate separated speech with 
pretrained ASR model"><!--[--><!--[--><!--]--> Evluate separated speech with pretrained ASR model <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SLU <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer 
demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">OTHERS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ST <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><h1 id="espnet-speech-enhancement-demonstration" tabindex="-1"><a class="header-anchor" href="#espnet-speech-enhancement-demonstration"><span>ESPnet Speech Enhancement Demonstration</span></a></h1><p><a href="https://colab.research.google.com/drive/1fjRJCh96SoYLZPRxsjF9VDv4Q2VoIckI?usp=sharing" target="_blank" rel="noopener noreferrer"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"><span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" 
points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></p><p>This notebook provides a demonstration of the speech enhancement and separation using ESPnet2-SE.</p><ul><li>ESPnet2-SE: https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/enh1</li></ul><p>Author: Chenda Li (<a href="https://github.com/LiChenda" target="_blank" rel="noopener noreferrer">@LiChenda<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a>), Wangyou Zhang (<a href="https://github.com/Emrys365" target="_blank" rel="noopener noreferrer">@Emrys365<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a>)</p><h2 id="install" tabindex="-1"><a class="header-anchor" href="#install"><span>Install</span></a></h2><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span 
class="line"><span>%pip install -q espnet==0.10.1</span></span>
35
+ <span class="line"><span>%pip install -q espnet_model_zoo</span></span>
36
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="speech-enhancement" tabindex="-1"><a class="header-anchor" href="#speech-enhancement"><span>Speech Enhancement</span></a></h2><h3 id="single-channel-enhancement-the-chime-example" tabindex="-1"><a class="header-anchor" href="#single-channel-enhancement-the-chime-example"><span>Single-Channel Enhancement, the CHiME example</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Download one utterance from real noisy speech of CHiME4</span></span>
37
+ <span class="line"><span>!gdown --id 1SmrN5NFSg6JuQSs2sfy3ehD8OIcqK6wS -O /content/M05_440C0213_PED_REAL.wav</span></span>
38
+ <span class="line"><span>import os</span></span>
39
+ <span class="line"><span></span></span>
40
+ <span class="line"><span>import soundfile</span></span>
41
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
42
+ <span class="line"><span>mixwav_mc, sr = soundfile.read(&quot;/content/M05_440C0213_PED_REAL.wav&quot;)</span></span>
43
+ <span class="line"><span># mixwav.shape: num_samples, num_channels</span></span>
44
+ <span class="line"><span>mixwav_sc = mixwav_mc[:,4]</span></span>
45
+ <span class="line"><span>display(Audio(mixwav_mc.T, rate=sr))</span></span>
46
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="download-and-load-the-pretrained-conv-tasnet" tabindex="-1"><a class="header-anchor" href="#download-and-load-the-pretrained-conv-tasnet"><span>Download and load the pretrained Conv-Tasnet</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!gdown --id 17DMWdw84wF3fz3t7ia1zssdzhkpVQGZm -O /content/chime_tasnet_singlechannel.zip</span></span>
47
+ <span class="line"><span>!unzip /content/chime_tasnet_singlechannel.zip -d /content/enh_model_sc</span></span>
48
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Load the model</span></span>
49
+ <span class="line"><span># If you encounter error &quot;No module named &#39;espnet2&#39;&quot;, please re-run the 1st Cell. This might be a colab bug.</span></span>
50
+ <span class="line"><span>import sys</span></span>
51
+ <span class="line"><span>import soundfile</span></span>
52
+ <span class="line"><span>from espnet2.bin.enh_inference import SeparateSpeech</span></span>
53
+ <span class="line"><span></span></span>
54
+ <span class="line"><span></span></span>
55
+ <span class="line"><span>separate_speech = {}</span></span>
56
+ <span class="line"><span># For models downloaded from GoogleDrive, you can use the following script:</span></span>
57
+ <span class="line"><span>enh_model_sc = SeparateSpeech(</span></span>
58
+ <span class="line"><span> enh_train_config=&quot;/content/enh_model_sc/exp/enh_train_enh_conv_tasnet_raw/config.yaml&quot;,</span></span>
59
+ <span class="line"><span> enh_model_file=&quot;/content/enh_model_sc/exp/enh_train_enh_conv_tasnet_raw/5epoch.pth&quot;,</span></span>
60
+ <span class="line"><span> # for segment-wise process on long speech</span></span>
61
+ <span class="line"><span> normalize_segment_scale=False,</span></span>
62
+ <span class="line"><span> show_progressbar=True,</span></span>
63
+ <span class="line"><span> ref_channel=4,</span></span>
64
+ <span class="line"><span> normalize_output_wav=True,</span></span>
65
+ <span class="line"><span> device=&quot;cuda:0&quot;,</span></span>
66
+ <span class="line"><span>)</span></span>
67
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="enhance-the-single-channel-real-noisy-speech-in-chime4" tabindex="-1"><a class="header-anchor" href="#enhance-the-single-channel-real-noisy-speech-in-chime4"><span>Enhance the single-channel real noisy speech in CHiME4</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># play the enhanced single-channel speech</span></span>
68
+ <span class="line"><span>wave = enh_model_sc(mixwav_sc[None, ...], sr)</span></span>
69
+ <span class="line"><span>print(&quot;Input real noisy speech&quot;, flush=True)</span></span>
70
+ <span class="line"><span>display(Audio(mixwav_sc, rate=sr))</span></span>
71
+ <span class="line"><span>print(&quot;Enhanced speech&quot;, flush=True)</span></span>
72
+ <span class="line"><span>display(Audio(wave[0].squeeze(), rate=sr))</span></span>
73
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="enhance-your-own-pre-recordings" tabindex="-1"><a class="header-anchor" href="#enhance-your-own-pre-recordings"><span>Enhance your own pre-recordings</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from google.colab import files</span></span>
74
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
75
+ <span class="line"><span>import soundfile</span></span>
76
+ <span class="line"><span></span></span>
77
+ <span class="line"><span>uploaded = files.upload()</span></span>
78
+ <span class="line"><span></span></span>
79
+ <span class="line"><span>for file_name in uploaded.keys():</span></span>
80
+ <span class="line"><span> speech, rate = soundfile.read(file_name)</span></span>
81
+ <span class="line"><span> assert rate == sr, &quot;mismatch in sampling rate&quot;</span></span>
82
+ <span class="line"><span> wave = enh_model_sc(speech[None, ...], sr)</span></span>
83
+ <span class="line"><span> print(f&quot;Your input speech {file_name}&quot;, flush=True)</span></span>
84
+ <span class="line"><span> display(Audio(speech, rate=sr))</span></span>
85
+ <span class="line"><span> print(f&quot;Enhanced speech for {file_name}&quot;, flush=True)</span></span>
86
+ <span class="line"><span> display(Audio(wave[0].squeeze(), rate=sr))</span></span>
87
+ <span class="line"><span></span></span>
88
+ <span class="line"><span></span></span>
89
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="multi-channel-enhancement" tabindex="-1"><a class="header-anchor" href="#multi-channel-enhancement"><span>Multi-Channel Enhancement</span></a></h3><h4 id="download-and-load-the-pretrained-mvdr-neural-beamformer" tabindex="-1"><a class="header-anchor" href="#download-and-load-the-pretrained-mvdr-neural-beamformer"><span>Download and load the pretrained mvdr neural beamformer.</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Download the pretained enhancement model</span></span>
90
+ <span class="line"><span></span></span>
91
+ <span class="line"><span>!gdown --id 1FohDfBlOa7ipc9v2luY-QIFQ_GJ1iW_i -O /content/mvdr_beamformer_16k_se_raw_valid.zip</span></span>
92
+ <span class="line"><span>!unzip /content/mvdr_beamformer_16k_se_raw_valid.zip -d /content/enh_model_mc </span></span>
93
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Load the model</span></span>
94
+ <span class="line"><span># If you encounter error &quot;No module named &#39;espnet2&#39;&quot;, please re-run the 1st Cell. This might be a colab bug.</span></span>
95
+ <span class="line"><span>import sys</span></span>
96
+ <span class="line"><span>import soundfile</span></span>
97
+ <span class="line"><span>from espnet2.bin.enh_inference import SeparateSpeech</span></span>
98
+ <span class="line"><span></span></span>
99
+ <span class="line"><span></span></span>
100
+ <span class="line"><span>separate_speech = {}</span></span>
101
+ <span class="line"><span># For models downloaded from GoogleDrive, you can use the following script:</span></span>
102
+ <span class="line"><span>enh_model_mc = SeparateSpeech(</span></span>
103
+ <span class="line"><span> enh_train_config=&quot;/content/enh_model_mc/exp/enh_train_enh_beamformer_mvdr_raw/config.yaml&quot;,</span></span>
104
+ <span class="line"><span> enh_model_file=&quot;/content/enh_model_mc/exp/enh_train_enh_beamformer_mvdr_raw/11epoch.pth&quot;,</span></span>
105
+ <span class="line"><span> # for segment-wise process on long speech</span></span>
106
+ <span class="line"><span> normalize_segment_scale=False,</span></span>
107
+ <span class="line"><span> show_progressbar=True,</span></span>
108
+ <span class="line"><span> ref_channel=4,</span></span>
109
+ <span class="line"><span> normalize_output_wav=True,</span></span>
110
+ <span class="line"><span> device=&quot;cuda:0&quot;,</span></span>
111
+ <span class="line"><span>)</span></span>
112
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="enhance-the-multi-channel-real-noisy-speech-in-chime4" tabindex="-1"><a class="header-anchor" href="#enhance-the-multi-channel-real-noisy-speech-in-chime4"><span>Enhance the multi-channel real noisy speech in CHiME4</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>wave = enh_model_mc(mixwav_mc[None, ...], sr)</span></span>
113
+ <span class="line"><span>print(&quot;Input real noisy speech&quot;, flush=True)</span></span>
114
+ <span class="line"><span>display(Audio(mixwav_mc.T, rate=sr))</span></span>
115
+ <span class="line"><span>print(&quot;Enhanced speech&quot;, flush=True)</span></span>
116
+ <span class="line"><span>display(Audio(wave[0].squeeze(), rate=sr))</span></span>
117
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="speech-separation" tabindex="-1"><a class="header-anchor" href="#speech-separation"><span>Speech Separation</span></a></h2><h3 id="model-selection" tabindex="-1"><a class="header-anchor" href="#model-selection"><span>Model Selection</span></a></h3><p>Please select model shown in <a href="https://github.com/espnet/espnet_model_zoo/blob/master/espnet_model_zoo/table.csv" target="_blank" rel="noopener noreferrer">espnet_model_zoo<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></p><p>In this demonstration, we will show different speech separation models on wsj0_2mix.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>#@title Choose Speech Separation model { run: &quot;auto&quot; }</span></span>
118
+ <span class="line"><span></span></span>
119
+ <span class="line"><span>fs = 8000 #@param {type:&quot;integer&quot;}</span></span>
120
+ <span class="line"><span>tag = &quot;Chenda Li/wsj0_2mix_enh_train_enh_conv_tasnet_raw_valid.si_snr.ave&quot; #@param [&quot;Chenda Li/wsj0_2mix_enh_train_enh_conv_tasnet_raw_valid.si_snr.ave&quot;, &quot;Chenda Li/wsj0_2mix_enh_train_enh_rnn_tf_raw_valid.si_snr.ave&quot;, &quot;https://zenodo.org/record/4688000/files/enh_train_enh_dprnn_tasnet_raw_valid.si_snr.ave.zip&quot;]</span></span>
121
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># For models uploaded to Zenodo, you can use the following python script instead:</span></span>
122
+ <span class="line"><span>import sys</span></span>
123
+ <span class="line"><span>import soundfile</span></span>
124
+ <span class="line"><span>from espnet_model_zoo.downloader import ModelDownloader</span></span>
125
+ <span class="line"><span>from espnet2.bin.enh_inference import SeparateSpeech</span></span>
126
+ <span class="line"><span></span></span>
127
+ <span class="line"><span>d = ModelDownloader()</span></span>
128
+ <span class="line"><span></span></span>
129
+ <span class="line"><span>cfg = d.download_and_unpack(tag)</span></span>
130
+ <span class="line"><span>separate_speech = SeparateSpeech(</span></span>
131
+ <span class="line"><span> enh_train_config=cfg[&quot;train_config&quot;],</span></span>
132
+ <span class="line"><span> enh_model_file=cfg[&quot;model_file&quot;],</span></span>
133
+ <span class="line"><span> # for segment-wise process on long speech</span></span>
134
+ <span class="line"><span> segment_size=2.4,</span></span>
135
+ <span class="line"><span> hop_size=0.8,</span></span>
136
+ <span class="line"><span> normalize_segment_scale=False,</span></span>
137
+ <span class="line"><span> show_progressbar=True,</span></span>
138
+ <span class="line"><span> ref_channel=None,</span></span>
139
+ <span class="line"><span> normalize_output_wav=True,</span></span>
140
+ <span class="line"><span> device=&quot;cuda:0&quot;,</span></span>
141
+ <span class="line"><span>)</span></span>
142
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="separate-speech-mixture" tabindex="-1"><a class="header-anchor" href="#separate-speech-mixture"><span>Separate Speech Mixture</span></a></h3><h4 id="separate-the-example-in-wsj0-2mix-testing-set" tabindex="-1"><a class="header-anchor" href="#separate-the-example-in-wsj0-2mix-testing-set"><span>Separate the example in wsj0_2mix testing set</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!gdown --id 1ZCUkd_Lb7pO2rpPr4FqYdtJBZ7JMiInx -O /content/447c020t_1.2106_422a0112_-1.2106.wav</span></span>
143
+ <span class="line"><span></span></span>
144
+ <span class="line"><span>import os</span></span>
145
+ <span class="line"><span>import soundfile</span></span>
146
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
147
+ <span class="line"><span></span></span>
148
+ <span class="line"><span>mixwav, sr = soundfile.read(&quot;447c020t_1.2106_422a0112_-1.2106.wav&quot;)</span></span>
149
+ <span class="line"><span>waves_wsj = separate_speech(mixwav[None, ...], fs=sr)</span></span>
150
+ <span class="line"><span></span></span>
151
+ <span class="line"><span>print(&quot;Input mixture&quot;, flush=True)</span></span>
152
+ <span class="line"><span>display(Audio(mixwav, rate=sr))</span></span>
153
+ <span class="line"><span>print(f&quot;========= Separated speech with model {tag} =========&quot;, flush=True)</span></span>
154
+ <span class="line"><span>print(&quot;Separated spk1&quot;, flush=True)</span></span>
155
+ <span class="line"><span>display(Audio(waves_wsj[0].squeeze(), rate=sr))</span></span>
156
+ <span class="line"><span>print(&quot;Separated spk2&quot;, flush=True)</span></span>
157
+ <span class="line"><span>display(Audio(waves_wsj[1].squeeze(), rate=sr))</span></span>
158
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="separate-your-own-recordings" tabindex="-1"><a class="header-anchor" href="#separate-your-own-recordings"><span>Separate your own recordings</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from google.colab import files</span></span>
159
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
160
+ <span class="line"><span>import soundfile</span></span>
161
+ <span class="line"><span></span></span>
162
+ <span class="line"><span>uploaded = files.upload()</span></span>
163
+ <span class="line"><span></span></span>
164
+ <span class="line"><span>for file_name in uploaded.keys():</span></span>
165
+ <span class="line"><span> mixwav_yours, rate = soundfile.read(file_name)</span></span>
166
+ <span class="line"><span> assert rate == sr, &quot;mismatch in sampling rate&quot;</span></span>
167
+ <span class="line"><span> waves_yours = separate_speech(mixwav_yours[None, ...], fs=sr)</span></span>
168
+ <span class="line"><span> print(&quot;Input mixture&quot;, flush=True)</span></span>
169
+ <span class="line"><span> display(Audio(mixwav_yours, rate=sr))</span></span>
170
+ <span class="line"><span> print(f&quot;========= Separated speech with model {tag} =========&quot;, flush=True)</span></span>
171
+ <span class="line"><span> print(&quot;Separated spk1&quot;, flush=True)</span></span>
172
+ <span class="line"><span> display(Audio(waves_yours[0].squeeze(), rate=sr))</span></span>
173
+ <span class="line"><span> print(&quot;Separated spk2&quot;, flush=True)</span></span>
174
+ <span class="line"><span> display(Audio(waves_yours[1].squeeze(), rate=sr))</span></span>
175
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="show-spectrums-of-separated-speech" tabindex="-1"><a class="header-anchor" href="#show-spectrums-of-separated-speech"><span>Show spectrums of separated speech</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import matplotlib.pyplot as plt</span></span>
176
+ <span class="line"><span>import torch</span></span>
177
+ <span class="line"><span>from torch_complex.tensor import ComplexTensor</span></span>
178
+ <span class="line"><span></span></span>
179
+ <span class="line"><span>from espnet.asr.asr_utils import plot_spectrogram</span></span>
180
+ <span class="line"><span>from espnet2.layers.stft import Stft</span></span>
181
+ <span class="line"><span></span></span>
182
+ <span class="line"><span></span></span>
183
+ <span class="line"><span>stft = Stft(</span></span>
184
+ <span class="line"><span> n_fft=512,</span></span>
185
+ <span class="line"><span> win_length=None,</span></span>
186
+ <span class="line"><span> hop_length=128,</span></span>
187
+ <span class="line"><span> window=&quot;hann&quot;,</span></span>
188
+ <span class="line"><span>)</span></span>
189
+ <span class="line"><span>ilens = torch.LongTensor([len(mixwav)])</span></span>
190
+ <span class="line"><span># specs: (T, F)</span></span>
191
+ <span class="line"><span>spec_mix = ComplexTensor(</span></span>
192
+ <span class="line"><span> *torch.unbind(</span></span>
193
+ <span class="line"><span> stft(torch.as_tensor(mixwav).unsqueeze(0), ilens)[0].squeeze(),</span></span>
194
+ <span class="line"><span> dim=-1</span></span>
195
+ <span class="line"><span> )</span></span>
196
+ <span class="line"><span>)</span></span>
197
+ <span class="line"><span>spec_sep1 = ComplexTensor(</span></span>
198
+ <span class="line"><span> *torch.unbind(</span></span>
199
+ <span class="line"><span> stft(torch.as_tensor(waves_wsj[0]), ilens)[0].squeeze(),</span></span>
200
+ <span class="line"><span> dim=-1</span></span>
201
+ <span class="line"><span> )</span></span>
202
+ <span class="line"><span>)</span></span>
203
+ <span class="line"><span>spec_sep2 = ComplexTensor(</span></span>
204
+ <span class="line"><span> *torch.unbind(</span></span>
205
+ <span class="line"><span> stft(torch.as_tensor(waves_wsj[1]), ilens)[0].squeeze(),</span></span>
206
+ <span class="line"><span> dim=-1</span></span>
207
+ <span class="line"><span> )</span></span>
208
+ <span class="line"><span>)</span></span>
209
+ <span class="line"><span></span></span>
210
+ <span class="line"><span># freqs = torch.linspace(0, sr / 2, spec_mix.shape[1])</span></span>
211
+ <span class="line"><span># frames = torch.linspace(0, len(mixwav) / sr, spec_mix.shape[0])</span></span>
212
+ <span class="line"><span>samples = torch.linspace(0, len(mixwav) / sr, len(mixwav))</span></span>
213
+ <span class="line"><span>plt.figure(figsize=(24, 12))</span></span>
214
+ <span class="line"><span>plt.subplot(3, 2, 1)</span></span>
215
+ <span class="line"><span>plt.title(&#39;Mixture Spectrogram&#39;)</span></span>
216
+ <span class="line"><span>plot_spectrogram(</span></span>
217
+ <span class="line"><span> plt, abs(spec_mix).transpose(-1, -2).numpy(), fs=sr,</span></span>
218
+ <span class="line"><span> mode=&#39;db&#39;, frame_shift=None,</span></span>
219
+ <span class="line"><span> bottom=False, labelbottom=False</span></span>
220
+ <span class="line"><span>)</span></span>
221
+ <span class="line"><span>plt.subplot(3, 2, 2)</span></span>
222
+ <span class="line"><span>plt.title(&#39;Mixture Waveform&#39;)</span></span>
223
+ <span class="line"><span>plt.plot(samples, mixwav)</span></span>
224
+ <span class="line"><span>plt.xlim(0, len(mixwav) / sr)</span></span>
225
+ <span class="line"><span></span></span>
226
+ <span class="line"><span>plt.subplot(3, 2, 3)</span></span>
227
+ <span class="line"><span>plt.title(&#39;Separated Spectrogram (spk1)&#39;)</span></span>
228
+ <span class="line"><span>plot_spectrogram(</span></span>
229
+ <span class="line"><span> plt, abs(spec_sep1).transpose(-1, -2).numpy(), fs=sr,</span></span>
230
+ <span class="line"><span> mode=&#39;db&#39;, frame_shift=None,</span></span>
231
+ <span class="line"><span> bottom=False, labelbottom=False</span></span>
232
+ <span class="line"><span>)</span></span>
233
+ <span class="line"><span>plt.subplot(3, 2, 4)</span></span>
234
+ <span class="line"><span>plt.title(&#39;Separated Waveform (spk1)&#39;)</span></span>
235
+ <span class="line"><span>plt.plot(samples, waves_wsj[0].squeeze())</span></span>
236
+ <span class="line"><span>plt.xlim(0, len(mixwav) / sr)</span></span>
237
+ <span class="line"><span></span></span>
238
+ <span class="line"><span>plt.subplot(3, 2, 5)</span></span>
239
+ <span class="line"><span>plt.title(&#39;Separated Spectrogram (spk2)&#39;)</span></span>
240
+ <span class="line"><span>plot_spectrogram(</span></span>
241
+ <span class="line"><span> plt, abs(spec_sep2).transpose(-1, -2).numpy(), fs=sr,</span></span>
242
+ <span class="line"><span> mode=&#39;db&#39;, frame_shift=None,</span></span>
243
+ <span class="line"><span> bottom=False, labelbottom=False</span></span>
244
+ <span class="line"><span>)</span></span>
245
+ <span class="line"><span>plt.subplot(3, 2, 6)</span></span>
246
+ <span class="line"><span>plt.title(&#39;Separated Wavform (spk2)&#39;)</span></span>
247
+ <span class="line"><span>plt.plot(samples, waves_wsj[1].squeeze())</span></span>
248
+ <span class="line"><span>plt.xlim(0, len(mixwav) / sr)</span></span>
249
+ <span class="line"><span>plt.xlabel(&quot;Time (s)&quot;)</span></span>
250
+ <span class="line"><span>plt.show()</span></span>
251
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div 
class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="evluate-separated-speech-with-pretrained-asr-model" tabindex="-1"><a class="header-anchor" href="#evluate-separated-speech-with-pretrained-asr-model"><span>Evluate separated speech with pretrained ASR model</span></a></h2><p>The ground truths are:</p><p><code>text_1: SOME CRITICS INCLUDING HIGH REAGAN ADMINISTRATION OFFICIALS ARE RAISING THE ALARM THAT THE FED&#39;S POLICY IS TOO TIGHT AND COULD CAUSE A RECESSION NEXT YEAR</code></p><p><code>text_2: THE UNITED STATES UNDERTOOK TO DEFEND WESTERN EUROPE AGAINST SOVIET ATTACK</code></p><p>(This may take a while for the speech recognition.)</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import espnet_model_zoo</span></span>
252
+ <span class="line"><span>from espnet_model_zoo.downloader import ModelDownloader</span></span>
253
+ <span class="line"><span>from espnet2.bin.asr_inference import Speech2Text</span></span>
254
+ <span class="line"><span></span></span>
255
+ <span class="line"><span>wsj_8k_model_url=&quot;https://zenodo.org/record/4012264/files/asr_train_asr_transformer_raw_char_1gpu_valid.acc.ave.zip?download=1&quot;</span></span>
256
+ <span class="line"><span></span></span>
257
+ <span class="line"><span>d = ModelDownloader()</span></span>
258
+ <span class="line"><span>speech2text = Speech2Text(</span></span>
259
+ <span class="line"><span> **d.download_and_unpack(wsj_8k_model_url),</span></span>
260
+ <span class="line"><span> device=&quot;cuda:0&quot;,</span></span>
261
+ <span class="line"><span>)</span></span>
262
+ <span class="line"><span></span></span>
263
+ <span class="line"><span>text_est = [None, None]</span></span>
264
+ <span class="line"><span>text_est[0], *_ = speech2text(waves_wsj[0].squeeze())[0]</span></span>
265
+ <span class="line"><span>text_est[1], *_ = speech2text(waves_wsj[1].squeeze())[0]</span></span>
266
+ <span class="line"><span>text_m, *_ = speech2text(mixwav)[0]</span></span>
267
+ <span class="line"><span>print(&quot;Mix Speech to Text: &quot;, text_m)</span></span>
268
+ <span class="line"><span>print(&quot;Separated Speech 1 to Text: &quot;, text_est[0])</span></span>
269
+ <span class="line"><span>print(&quot;Separated Speech 2 to Text: &quot;, text_est[1])</span></span>
270
+ <span class="line"><span></span></span>
271
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import difflib</span></span>
272
+ <span class="line"><span>from itertools import permutations</span></span>
273
+ <span class="line"><span></span></span>
274
+ <span class="line"><span>import editdistance</span></span>
275
+ <span class="line"><span>import numpy as np</span></span>
276
+ <span class="line"><span></span></span>
277
+ <span class="line"><span>colors = dict(</span></span>
278
+ <span class="line"><span> red=lambda text: f&quot;\033[38;2;255;0;0m{text}\033[0m&quot; if text else &quot;&quot;,</span></span>
279
+ <span class="line"><span> green=lambda text: f&quot;\033[38;2;0;255;0m{text}\033[0m&quot; if text else &quot;&quot;,</span></span>
280
+ <span class="line"><span> yellow=lambda text: f&quot;\033[38;2;225;225;0m{text}\033[0m&quot; if text else &quot;&quot;,</span></span>
281
+ <span class="line"><span> white=lambda text: f&quot;\033[38;2;255;255;255m{text}\033[0m&quot; if text else &quot;&quot;,</span></span>
282
+ <span class="line"><span> black=lambda text: f&quot;\033[38;2;0;0;0m{text}\033[0m&quot; if text else &quot;&quot;,</span></span>
283
+ <span class="line"><span>)</span></span>
284
+ <span class="line"><span></span></span>
285
+ <span class="line"><span>def diff_strings(ref, est):</span></span>
286
+ <span class="line"><span> &quot;&quot;&quot;Reference: https://stackoverflow.com/a/64404008/7384873&quot;&quot;&quot;</span></span>
287
+ <span class="line"><span> ref_str, est_str, err_str = [], [], []</span></span>
288
+ <span class="line"><span> matcher = difflib.SequenceMatcher(None, ref, est)</span></span>
289
+ <span class="line"><span> for opcode, a0, a1, b0, b1 in matcher.get_opcodes():</span></span>
290
+ <span class="line"><span> if opcode == &quot;equal&quot;:</span></span>
291
+ <span class="line"><span> txt = ref[a0:a1]</span></span>
292
+ <span class="line"><span> ref_str.append(txt)</span></span>
293
+ <span class="line"><span> est_str.append(txt)</span></span>
294
+ <span class="line"><span> err_str.append(&quot; &quot; * (a1 - a0))</span></span>
295
+ <span class="line"><span> elif opcode == &quot;insert&quot;:</span></span>
296
+ <span class="line"><span> ref_str.append(&quot;*&quot; * (b1 - b0))</span></span>
297
+ <span class="line"><span> est_str.append(colors[&quot;green&quot;](est[b0:b1]))</span></span>
298
+ <span class="line"><span> err_str.append(colors[&quot;black&quot;](&quot;I&quot; * (b1 - b0)))</span></span>
299
+ <span class="line"><span> elif opcode == &quot;delete&quot;:</span></span>
300
+ <span class="line"><span> ref_str.append(ref[a0:a1])</span></span>
301
+ <span class="line"><span> est_str.append(colors[&quot;red&quot;](&quot;*&quot; * (a1 - a0)))</span></span>
302
+ <span class="line"><span> err_str.append(colors[&quot;black&quot;](&quot;D&quot; * (a1 - a0)))</span></span>
303
+ <span class="line"><span> elif opcode == &quot;replace&quot;:</span></span>
304
+ <span class="line"><span> diff = a1 - a0 - b1 + b0</span></span>
305
+ <span class="line"><span> if diff &gt;= 0:</span></span>
306
+ <span class="line"><span> txt_ref = ref[a0:a1]</span></span>
307
+ <span class="line"><span> txt_est = colors[&quot;yellow&quot;](est[b0:b1]) + colors[&quot;red&quot;](&quot;*&quot; * diff)</span></span>
308
+ <span class="line"><span> txt_err = &quot;S&quot; * (b1 - b0) + &quot;D&quot; * diff</span></span>
309
+ <span class="line"><span> elif diff &lt; 0:</span></span>
310
+ <span class="line"><span> txt_ref = ref[a0:a1] + &quot;*&quot; * -diff</span></span>
311
+ <span class="line"><span> txt_est = colors[&quot;yellow&quot;](est[b0:b1]) + colors[&quot;green&quot;](&quot;*&quot; * -diff)</span></span>
312
+ <span class="line"><span> txt_err = &quot;S&quot; * (b1 - b0) + &quot;I&quot; * -diff</span></span>
313
+ <span class="line"><span></span></span>
314
+ <span class="line"><span> ref_str.append(txt_ref)</span></span>
315
+ <span class="line"><span> est_str.append(txt_est)</span></span>
316
+ <span class="line"><span> err_str.append(colors[&quot;black&quot;](txt_err))</span></span>
317
+ <span class="line"><span> return &quot;&quot;.join(ref_str), &quot;&quot;.join(est_str), &quot;&quot;.join(err_str)</span></span>
318
+ <span class="line"><span></span></span>
319
+ <span class="line"><span></span></span>
320
+ <span class="line"><span>text_ref = [</span></span>
321
+ <span class="line"><span> &quot;SOME CRITICS INCLUDING HIGH REAGAN ADMINISTRATION OFFICIALS ARE RAISING THE ALARM THAT THE FED&#39;S POLICY IS TOO TIGHT AND COULD CAUSE A RECESSION NEXT YEAR&quot;,</span></span>
322
+ <span class="line"><span> &quot;THE UNITED STATES UNDERTOOK TO DEFEND WESTERN EUROPE AGAINST SOVIET ATTACK&quot;,</span></span>
323
+ <span class="line"><span>]</span></span>
324
+ <span class="line"><span></span></span>
325
+ <span class="line"><span>print(&quot;=====================&quot; , flush=True)</span></span>
326
+ <span class="line"><span>perms = list(permutations(range(2)))</span></span>
327
+ <span class="line"><span>string_edit = [</span></span>
328
+ <span class="line"><span> [</span></span>
329
+ <span class="line"><span> editdistance.eval(text_ref[m], text_est[n])</span></span>
330
+ <span class="line"><span> for m, n in enumerate(p)</span></span>
331
+ <span class="line"><span> ]</span></span>
332
+ <span class="line"><span> for p in perms</span></span>
333
+ <span class="line"><span>]</span></span>
334
+ <span class="line"><span></span></span>
335
+ <span class="line"><span>dist = [sum(edist) for edist in string_edit]</span></span>
336
+ <span class="line"><span>perm_idx = np.argmin(dist)</span></span>
337
+ <span class="line"><span>perm = perms[perm_idx]</span></span>
338
+ <span class="line"><span></span></span>
339
+ <span class="line"><span>for i, p in enumerate(perm):</span></span>
340
+ <span class="line"><span> print(&quot;\n--------------- Text %d ---------------&quot; % (i + 1), flush=True)</span></span>
341
+ <span class="line"><span> ref, est, err = diff_strings(text_ref[i], text_est[p])</span></span>
342
+ <span class="line"><span> print(&quot;REF: &quot; + ref + &quot;\n&quot; + &quot;HYP: &quot; + est + &quot;\n&quot; + &quot;ERR: &quot; + err, flush=True)</span></span>
343
+ <span class="line"><span> print(&quot;Edit Distance = {}\n&quot;.format(string_edit[perm_idx][i]), flush=True)</span></span>
344
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div 
class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><nav class="vp-page-nav" aria-label="page navigation"><!----><a class="route-link next" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><div class="hint">Next <span class="arrow right"></span></div><div class="link"><span>ESPnet Speech Enhancement Demonstration</span></div><!--]--></a></nav><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
345
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
346
+ </body>
347
+ </html>
espnet2/slu/espnet2_2pass_slu_demo.html ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>ESPNET 2 pass SLU Demonstration | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" as="script"><link 
rel="prefetch" href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li 
class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span 
class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 1.002 0 0 0-.175.016a13.096 13.096 
0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime 
demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech 
Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" 
type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SE <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement 
Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading active">SLU <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link route-link-active sidebar-item active" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#download-audio-file" aria-label="Download Audio File"><!--[--><!--[--><!--]--> Download Audio File <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#download-and-load-pretrained-first-pass-model" aria-label="Download and Load pretrained First Pass Model"><!--[--><!--[--><!--]--> Download and Load pretrained First Pass Model <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#download-and-load-pretrained-second-pass-model" aria-label="Download and Load pretrained Second Pass Model"><!--[--><!--[--><!--]--> Download and Load pretrained Second Pass Model <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_realtime_demo.html" 
aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">OTHERS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ST <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><h1 
id="espnet-2-pass-slu-demonstration" tabindex="-1"><a class="header-anchor" href="#espnet-2-pass-slu-demonstration"><span>ESPNET 2 pass SLU Demonstration</span></a></h1><p>This notebook provides a demonstration of the Two Pass End-to-End Spoken Language Understanding model</p><p>Paper Link: https://arxiv.org/abs/2207.06670</p><p>ESPnet2-SLU: https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/slu1</p><p>Author: Siddhant Arora</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">! python -m pip install transformers</span></span>
35
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git clone https://github.com/espnet/espnet /espnet</span></span>
36
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">pip install /espnet</span></span>
37
+ <span class="line"><span style="color:#D4D4D4;">%pip install -q espnet_model_zoo</span></span>
38
+ <span class="line"><span style="color:#D4D4D4;">%pip install fairseq@git+https://github.com//pytorch/fairseq.git@f2146bdc7abf293186de9449bfa2272775e39e1d</span><span style="color:#6A9955;">#egg=fairseq</span></span>
39
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="download-audio-file" tabindex="-1"><a class="header-anchor" href="#download-audio-file"><span>Download Audio File</span></a></h2><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># !gdown --id 1LxoxCoFgx3u8CvKb1loybGFtArKKPcAH -O /content/audio_file.wav</span></span>
40
+ <span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">gdown </span><span style="color:#F44747;">--</span><span style="color:#DCDCAA;">id</span><span style="color:#F44747;"> 18ANT62ittt7Ai2E8bQRlvT0ZVXXsf1eE</span><span style="color:#D4D4D4;"> -O /content/audio_file.wav</span></span>
41
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> os</span></span>
42
+ <span class="line"></span>
43
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> soundfile</span></span>
44
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> IPython.display </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> display, Audio</span></span>
45
+ <span class="line"><span style="color:#D4D4D4;">mixwav_mc, sr = soundfile.read(</span><span style="color:#CE9178;">&quot;/content/audio_file.wav&quot;</span><span style="color:#D4D4D4;">)</span></span>
46
+ <span class="line"><span style="color:#D4D4D4;">display(Audio(mixwav_mc.T, </span><span style="color:#9CDCFE;">rate</span><span style="color:#D4D4D4;">=sr))</span></span>
47
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="download-and-load-pretrained-first-pass-model" tabindex="-1"><a class="header-anchor" href="#download-and-load-pretrained-first-pass-model"><span>Download and Load pretrained First Pass Model</span></a></h2><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git lfs clone https://huggingface.co/espnet/siddhana_slurp_new_asr_train_asr_conformer_raw_en_word_valid.acc.ave_10best /content/slurp_first_pass_model</span></span>
48
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.asr_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
49
+ <span class="line"><span style="color:#D4D4D4;">speech2text_slurp = Speech2Text.from_pretrained(</span></span>
50
+ <span class="line"><span style="color:#9CDCFE;"> asr_train_config</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;/content/slurp_first_pass_model/exp/asr_train_asr_conformer_raw_en_word/config.yaml&quot;</span><span style="color:#D4D4D4;">,</span></span>
51
+ <span class="line"><span style="color:#9CDCFE;"> asr_model_file</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;/content/slurp_first_pass_model/exp/asr_train_asr_conformer_raw_en_word/valid.acc.ave_10best.pth&quot;</span><span style="color:#D4D4D4;">,</span></span>
52
+ <span class="line"><span style="color:#9CDCFE;"> nbest</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
53
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
54
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">nbests_orig = speech2text_slurp(mixwav_mc)</span></span>
55
+ <span class="line"><span style="color:#D4D4D4;">text, *_ = nbests_orig[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
56
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> text_normalizer</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">sub_word_transcript</span><span style="color:#D4D4D4;">):</span></span>
57
+ <span class="line"><span style="color:#D4D4D4;"> transcript = sub_word_transcript[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].replace(</span><span style="color:#CE9178;">&quot;▁&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">)</span></span>
58
+ <span class="line"><span style="color:#C586C0;"> for</span><span style="color:#D4D4D4;"> sub_word </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> sub_word_transcript[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:]:</span></span>
59
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#CE9178;"> &quot;▁&quot;</span><span style="color:#569CD6;"> in</span><span style="color:#D4D4D4;"> sub_word:</span></span>
60
+ <span class="line"><span style="color:#D4D4D4;"> transcript = transcript + </span><span style="color:#CE9178;">&quot; &quot;</span><span style="color:#D4D4D4;"> + sub_word.replace(</span><span style="color:#CE9178;">&quot;▁&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">)</span></span>
61
+ <span class="line"><span style="color:#C586C0;"> else</span><span style="color:#D4D4D4;">:</span></span>
62
+ <span class="line"><span style="color:#D4D4D4;"> transcript = transcript + sub_word</span></span>
63
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> transcript</span></span>
64
+ <span class="line"><span style="color:#D4D4D4;">intent_text=</span><span style="color:#CE9178;">&quot;{scenario: &quot;</span><span style="color:#D4D4D4;">+text.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].split(</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]+</span><span style="color:#CE9178;">&quot;, action: &quot;</span><span style="color:#D4D4D4;">+</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">.join(text.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].split(</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:])+</span><span style="color:#CE9178;">&quot;}&quot;</span></span>
65
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;INTENT: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">intent_text</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
66
+ <span class="line"><span style="color:#D4D4D4;">transcript=text_normalizer(text.split()[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:])</span></span>
67
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;ASR hypothesis: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">transcript</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
68
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;First pass SLU model fails to predict the correct action.&quot;</span><span style="color:#D4D4D4;">)</span></span>
69
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="download-and-load-pretrained-second-pass-model" tabindex="-1"><a class="header-anchor" href="#download-and-load-pretrained-second-pass-model"><span>Download and Load pretrained Second Pass Model</span></a></h2><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#F44747;">!</span><span style="color:#D4D4D4;">git lfs clone https://huggingface.co/espnet/slurp_slu_2pass /content/slurp_second_pass_model</span></span>
70
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.slu_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Understand</span></span>
71
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> transformers </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> AutoModel, AutoTokenizer</span></span>
72
+ <span class="line"><span style="color:#D4D4D4;">speech2text_second_pass_slurp = Speech2Understand.from_pretrained(</span></span>
73
+ <span class="line"><span style="color:#9CDCFE;"> slu_train_config</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;/content/slurp_second_pass_model/exp/slu_train_asr_bert_conformer_deliberation_raw_en_word/config.yaml&quot;</span><span style="color:#D4D4D4;">,</span></span>
74
+ <span class="line"><span style="color:#9CDCFE;"> slu_model_file</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;/content/slurp_second_pass_model/exp/slu_train_asr_bert_conformer_deliberation_raw_en_word/valid.acc.ave_10best.pth&quot;</span><span style="color:#D4D4D4;">,</span></span>
75
+ <span class="line"><span style="color:#9CDCFE;"> nbest</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
76
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
77
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.tasks.slu </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> SLUTask</span></span>
78
+ <span class="line"><span style="color:#D4D4D4;">preprocess_fn=SLUTask.build_preprocess_fn(</span></span>
79
+ <span class="line"><span style="color:#D4D4D4;"> speech2text_second_pass_slurp.asr_train_args, </span><span style="color:#569CD6;">False</span></span>
80
+ <span class="line"><span style="color:#D4D4D4;"> )</span></span>
81
+ <span class="line"></span>
82
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> numpy </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> np</span></span>
83
+ <span class="line"><span style="color:#D4D4D4;">transcript = preprocess_fn.text_cleaner(transcript)</span></span>
84
+ <span class="line"><span style="color:#D4D4D4;">tokens = preprocess_fn.transcript_tokenizer.text2tokens(transcript)</span></span>
85
+ <span class="line"><span style="color:#D4D4D4;">text_ints = np.array(preprocess_fn.transcript_token_id_converter.tokens2ids(tokens), </span><span style="color:#9CDCFE;">dtype</span><span style="color:#D4D4D4;">=np.int64)</span></span>
86
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> torch</span></span>
87
+ <span class="line"><span style="color:#D4D4D4;">nbests = speech2text_second_pass_slurp(mixwav_mc,torch.tensor(text_ints))</span></span>
88
+ <span class="line"><span style="color:#D4D4D4;">text1, *_ = nbests[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
89
+ <span class="line"><span style="color:#D4D4D4;">intent_text=</span><span style="color:#CE9178;">&quot;{scenario: &quot;</span><span style="color:#D4D4D4;">+text1.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].split(</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]+</span><span style="color:#CE9178;">&quot;, action: &quot;</span><span style="color:#D4D4D4;">+</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">.join(text1.split()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].split(</span><span style="color:#CE9178;">&quot;_&quot;</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:])+</span><span style="color:#CE9178;">&quot;}&quot;</span></span>
90
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;INTENT: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">intent_text</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
91
+ <span class="line"><span style="color:#D4D4D4;">transcript=text_normalizer(text1.split()[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:])</span></span>
92
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;ASR hypothesis: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">transcript</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">)</span></span>
93
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;Second pass SLU model successfully recognizes the correct action.&quot;</span><span style="color:#D4D4D4;">)</span></span>
94
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><!----><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
95
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
96
+ </body>
97
+ </html>
espnet2/st/st_demo.html ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>ESPnet Speech Translation Demonstration | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/st_demo.html-WLzB4ZGO.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" 
as="script"><link rel="prefetch" href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a 
class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span 
class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 1.002 0 0 0-.175.016a13.096 13.096 
0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime 
demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition 
(Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button 
class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SE <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement 
Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SLU <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real 
streaming Transformer demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">OTHERS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading active">ST <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link route-link-active sidebar-item active" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#install" aria-label="Install"><!--[--><!--[--><!--]--> Install <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#spanish-speech-english-text-translation" aria-label="Spanish speech -&gt; English text translation"><!--[--><!--[--><!--]--> Spanish speech -&gt; English text translation <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#english-translated-text-to-speech-synthesis" aria-label="English translated text-to-speech synthesis"><!--[--><!--[--><!--]--> English translated text-to-speech synthesis <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#check-decoding-log" aria-label="Check decoding log"><!--[--><!--[--><!--]--> Check decoding log <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#training-st-models-from-scratch" 
aria-label="Training ST models from scratch"><!--[--><!--[--><!--]--> Training ST models from scratch <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#details-of-espnet-tools" aria-label="Details of ESPnet tools"><!--[--><!--[--><!--]--> Details of ESPnet tools <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><h1 id="espnet-speech-translation-demonstration" tabindex="-1"><a class="header-anchor" href="#espnet-speech-translation-demonstration"><span>ESPnet Speech Translation Demonstration</span></a></h1><p><a href="https://colab.research.google.com/github/espnet/notebook/blob/master/st_demo.ipynb" target="_blank" rel="noopener noreferrer"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"><span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></p><p>See also</p><ul><li>ESPnet: https://github.com/espnet/espnet</li><li>ESPnet documentation: https://espnet.github.io/espnet/</li><li>TTS demo: https://colab.research.google.com/github/espnet/notebook/blob/master/tts_realtime_demo.ipynb</li></ul><p>Author: <a href="https://github.com/ShigekiKarita" target="_blank" rel="noopener noreferrer">Shigeki Karita<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" 
width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></p><h2 id="install" tabindex="-1"><a class="header-anchor" href="#install"><span>Install</span></a></h2><p>It takes around 3 minutes. Please keep waiting for a while.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># OS setup</span></span>
35
+ <span class="line"><span>!cat /etc/os-release</span></span>
36
+ <span class="line"><span>!apt-get install -qq bc tree sox</span></span>
37
+ <span class="line"><span></span></span>
38
+ <span class="line"><span># espnet and moses setup</span></span>
39
+ <span class="line"><span>!git clone -q https://github.com/ShigekiKarita/espnet.git</span></span>
40
+ <span class="line"><span>!pip install -q torch==1.1</span></span>
41
+ <span class="line"><span>!cd espnet; git checkout c0466d9a356c1a33f671a546426d7bc33b5b17e8; pip install -q -e .</span></span>
42
+ <span class="line"><span>!cd espnet/tools/; make moses.done</span></span>
43
+ <span class="line"><span></span></span>
44
+ <span class="line"><span># download pre-compiled warp-ctc and kaldi tools</span></span>
45
+ <span class="line"><span>!espnet/utils/download_from_google_drive.sh \</span></span>
46
+ <span class="line"><span> &quot;https://drive.google.com/open?id=13Y4tSygc8WtqzvAVGK_vRV9GlV7TRC0w&quot; espnet/tools tar.gz &gt; /dev/null</span></span>
47
+ <span class="line"><span></span></span>
48
+ <span class="line"><span># make dummy activate</span></span>
49
+ <span class="line"><span>!mkdir -p espnet/tools/venv/bin &amp;&amp; touch espnet/tools/venv/bin/activate</span></span>
50
+ <span class="line"><span>!echo &quot;setup done.&quot;</span></span>
51
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><hr><h2 id="spanish-speech-english-text-translation" tabindex="-1"><a class="header-anchor" href="#spanish-speech-english-text-translation"><span>Spanish speech -&gt; English text translation</span></a></h2><p>This audio says &quot;yo soy José.&quot;</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from IPython.display import display, Audio</span></span>
52
+ <span class="line"><span>display(Audio(&quot;/content/espnet/test_utils/st_test.wav&quot;, rate=16000))</span></span>
53
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>Let&#39;s translate this into English text by our pretrained Transformer ST model trained on the Fisher-CALLHOME Spanish dataset.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># move on the recipe directory</span></span>
54
+ <span class="line"><span>import os</span></span>
55
+ <span class="line"><span>os.chdir(&quot;/content/espnet/egs/fisher_callhome_spanish/st1&quot;)</span></span>
56
+ <span class="line"><span></span></span>
57
+ <span class="line"><span>!../../../utils/translate_wav.sh --models fisher_callhome_spanish.transformer.v1.es-en ../../../test_utils/st_test.wav | tee /content/translated.txt</span></span>
58
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>As seen above, we successfully obtained the result: <strong>&quot;Translated text: yes i&#39;m jose&quot;</strong>!</p><h2 id="english-translated-text-to-speech-synthesis" tabindex="-1"><a class="header-anchor" href="#english-translated-text-to-speech-synthesis"><span>English translated text-to-speech synthesis</span></a></h2><p>Now let&#39;s generate an <strong>English speech</strong> from the translated text using a pretrained ESPnet-TTS model.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!sed -n &#39;s/Translated text://p&#39; /content/translated.txt | tr &#39;[:lower:]&#39; &#39;[:upper:]&#39; | tee /content/translated_sed.txt</span></span>
59
+ <span class="line"><span>!../../../utils/synth_wav.sh /content/translated_sed.txt</span></span>
60
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import matplotlib.pyplot as plt</span></span>
61
+ <span class="line"><span>import kaldiio</span></span>
62
+ <span class="line"><span>fbank = next(iter(kaldiio.load_scp(&quot;decode/translated_sed/outputs/feats.scp&quot;).values()))</span></span>
63
+ <span class="line"><span>plt.matshow(fbank.T)</span></span>
64
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from IPython.display import display, Audio</span></span>
65
+ <span class="line"><span>display(Audio(&quot;decode/translated_sed/wav_wnv/translated_sed_gen.wav&quot;))</span></span>
66
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>Successfully, it says &quot;Yes I&#39;m Jose&quot;! For more TTS demo, visit https://colab.research.google.com/github/espnet/notebook/blob/master/tts_realtime_demo.ipynb</p><h2 id="check-decoding-log" tabindex="-1"><a class="header-anchor" href="#check-decoding-log"><span>Check decoding log</span></a></h2><p>After the translation, you will find <code>&lt;decode_dir&gt;/&lt;wav name&gt;/result.json</code> for details;</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cat decode/st_test/result.json</span></span>
67
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>and <code>&lt;decode_dir&gt;/&lt;wav name&gt;/log/decode.log</code> for runtime log;</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cat decode/st_test/log/decode.log</span></span>
68
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Let&#39;s calculate real-time factor (RTF) of the ST decoding from the <code>decode.log</code></p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from dateutil import parser</span></span>
69
+ <span class="line"><span>from subprocess import PIPE, run</span></span>
70
+ <span class="line"><span></span></span>
71
+ <span class="line"><span># calc input duration (seconds)</span></span>
72
+ <span class="line"><span>input_sec = float(run([&quot;soxi&quot;, &quot;-D&quot;, &quot;/content/espnet/test_utils/st_test.wav&quot;], stdout=PIPE).stdout)</span></span>
73
+ <span class="line"><span></span></span>
74
+ <span class="line"><span># calc NN decoding time</span></span>
75
+ <span class="line"><span>with open(&quot;decode/st_test/log/decode.log&quot;, &quot;r&quot;) as f:</span></span>
76
+ <span class="line"><span> times = [parser.parse(x.split(&quot;(&quot;)[0]) for x in f if &quot;e2e_st_transformer&quot; in x]</span></span>
77
+ <span class="line"><span>decode_sec = (times[-1] - times[0]).total_seconds()</span></span>
78
+ <span class="line"><span></span></span>
79
+ <span class="line"><span># get real-time factor (RTF)</span></span>
80
+ <span class="line"><span>print(&quot;Input duration:\t&quot;, input_sec, &quot;sec&quot;)</span></span>
81
+ <span class="line"><span>print(&quot;NN decoding:\t&quot;, decode_sec, &quot;sec&quot;)</span></span>
82
+ <span class="line"><span>print(&quot;Real-time factor:\t&quot;, decode_sec / input_sec)</span></span>
83
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>As you can see above, ESPnet-ST can <strong>translate speech faster than the input</strong> (it should be RTF &lt; 1.0).</p><h2 id="training-st-models-from-scratch" tabindex="-1"><a class="header-anchor" href="#training-st-models-from-scratch"><span>Training ST models from scratch</span></a></h2><p>We provide <a href="https://kaldi-asr.org/doc/kaldi_for_dummies.html" target="_blank" rel="noopener noreferrer">Kaldi-style recipes<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a> for ST as well as <a href="https://colab.research.google.com/github/espnet/notebook/blob/master/asr_cli.ipynb" target="_blank" rel="noopener noreferrer">ASR<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" 
d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a> and <a href="https://colab.research.google.com/github/espnet/notebook/blob/master/tts_cli.ipynb" target="_blank" rel="noopener noreferrer">TTS<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a> as all-in-one bash script <code>run.sh</code>:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cd /content/espnet/egs/must_c/st1/ &amp;&amp; ./run.sh --must-c /content</span></span>
84
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>However, it takes too much time to finish downloading the dataset. So we cancel the cell above.</p><h2 id="details-of-espnet-tools" tabindex="-1"><a class="header-anchor" href="#details-of-espnet-tools"><span>Details of ESPnet tools</span></a></h2><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!../../../utils/translate_wav.sh --help</span></span>
85
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!../../../utils/synth_wav.sh --help</span></span>
86
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span></span></span>
87
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><!----><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
88
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
89
+ </body>
90
+ </html>
espnet2/tts/espnet2_tts_realtime_demo.html ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>ESPnet2-TTS realtime demonstration | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" as="script"><link 
rel="prefetch" href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a 
class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span 
class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 1.002 0 0 0-.175.016a13.096 13.096 
0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS 
realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition 
(Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" 
aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading active">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link route-link-active sidebar-item active" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#installation" aria-label="Installation"><!--[--><!--[--><!--]--> Installation <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link 
sidebar-item" href="#single-speaker-model-demo" aria-label="Single speaker model demo"><!--[--><!--[--><!--]--> Single speaker model demo <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#model-selection" aria-label="Model Selection"><!--[--><!--[--><!--]--> Model Selection <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#model-setup" aria-label="Model Setup"><!--[--><!--[--><!--]--> Model Setup <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#synthesis" aria-label="Synthesis"><!--[--><!--[--><!--]--> Synthesis <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="#multi-speaker-model-demo" aria-label="Multi-speaker Model Demo"><!--[--><!--[--><!--]--> Multi-speaker Model Demo <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#model-selection-1" aria-label="Model Selection"><!--[--><!--[--><!--]--> Model Selection <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#model-setup-1" aria-label="Model Setup"><!--[--><!--[--><!--]--> Model Setup <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#speaker-selection" aria-label="Speaker selection"><!--[--><!--[--><!--]--> Speaker selection <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#synthesis-1" aria-label="Synthesis"><!--[--><!--[--><!--]--> Synthesis <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SE <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement 
Demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SLU <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration 
<!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">OTHERS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ST <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><p><a href="https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_tts_realtime_demo.ipynb" target="_blank" rel="noopener noreferrer"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"><span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></p><h1 id="espnet2-tts-realtime-demonstration" tabindex="-1"><a class="header-anchor" 
href="#espnet2-tts-realtime-demonstration"><span>ESPnet2-TTS realtime demonstration</span></a></h1><p>This notebook provides a demonstration of the realtime E2E-TTS using ESPnet2-TTS and ParallelWaveGAN repo.</p><ul><li>ESPnet2-TTS: https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1</li><li>ParallelWaveGAN: https://github.com/kan-bayashi/ParallelWaveGAN</li></ul><p>Author: Tomoki Hayashi (<a href="https://github.com/kan-bayashi" target="_blank" rel="noopener noreferrer">@kan-bayashi<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a>)</p><h2 id="installation" tabindex="-1"><a class="header-anchor" href="#installation"><span>Installation</span></a></h2><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># NOTE: pip shows imcompatible errors due to preinstalled libraries but you do not need to care</span></span>
35
+ <span class="line"><span>!pip install -q espnet==202308 pypinyin==0.44.0 parallel_wavegan==0.5.4 gdown==4.4.0 espnet_model_zoo</span></span>
36
+ <span class="line"><span></span></span>
37
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="single-speaker-model-demo" tabindex="-1"><a class="header-anchor" href="#single-speaker-model-demo"><span>Single speaker model demo</span></a></h2><h3 id="model-selection" tabindex="-1"><a class="header-anchor" href="#model-selection"><span>Model Selection</span></a></h3><p>Please select model: English, Japanese, and Mandarin are supported.</p><p>You can try end-to-end text2wav model &amp; combination of text2mel and vocoder.<br> If you use text2wav model, you do not need to use vocoder (automatically disabled).</p><p><strong>Text2wav models</strong>:</p><ul><li>VITS</li></ul><p><strong>Text2mel models</strong>:</p><ul><li>Tacotron2</li><li>Transformer-TTS</li><li>(Conformer) FastSpeech</li><li>(Conformer) FastSpeech2</li></ul><p><strong>Vocoders</strong>:</p><ul><li>Parallel WaveGAN</li><li>Multi-band MelGAN</li><li>HiFiGAN</li><li>Style MelGAN.</li></ul><blockquote><p>The terms of use follow that of each corpus. 
We use the following corpora:</p></blockquote><ul><li><code>ljspeech_*</code>: LJSpeech dataset <ul><li>https://keithito.com/LJ-Speech-Dataset/</li></ul></li><li><code>jsut_*</code>: JSUT corpus <ul><li>https://sites.google.com/site/shinnosuketakamichi/publication/jsut</li></ul></li><li><code>jvs_*</code>: JVS corpus + JSUT corpus <ul><li>https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_corpus</li><li>https://sites.google.com/site/shinnosuketakamichi/publication/jsut</li></ul></li><li><code>tsukuyomi_*</code>: つくよみちゃんコーパス + JSUT corpus <ul><li>https://tyc.rei-yumesaki.net/material/corpus/</li><li>https://sites.google.com/site/shinnosuketakamichi/publication/jsut</li></ul></li><li><code>csmsc_*</code>: Chinese Standard Mandarin Speech Corpus <ul><li>https://www.data-baker.com/open_source.html</li></ul></li></ul><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>#@title Choose English model { run: &quot;auto&quot; }</span></span>
38
+ <span class="line"><span>lang = &#39;English&#39;</span></span>
39
+ <span class="line"><span>tag = &#39;kan-bayashi/ljspeech_vits&#39; #@param [&quot;kan-bayashi/ljspeech_tacotron2&quot;, &quot;kan-bayashi/ljspeech_fastspeech&quot;, &quot;kan-bayashi/ljspeech_fastspeech2&quot;, &quot;kan-bayashi/ljspeech_conformer_fastspeech2&quot;, &quot;kan-bayashi/ljspeech_joint_finetune_conformer_fastspeech2_hifigan&quot;, &quot;kan-bayashi/ljspeech_joint_train_conformer_fastspeech2_hifigan&quot;, &quot;kan-bayashi/ljspeech_vits&quot;] {type:&quot;string&quot;}</span></span>
40
+ <span class="line"><span>vocoder_tag = &quot;none&quot; #@param [&quot;none&quot;, &quot;parallel_wavegan/ljspeech_parallel_wavegan.v1&quot;, &quot;parallel_wavegan/ljspeech_full_band_melgan.v2&quot;, &quot;parallel_wavegan/ljspeech_multi_band_melgan.v2&quot;, &quot;parallel_wavegan/ljspeech_hifigan.v1&quot;, &quot;parallel_wavegan/ljspeech_style_melgan.v1&quot;] {type:&quot;string&quot;}</span></span>
41
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>#@title Choose Japanese model { run: &quot;auto&quot; }</span></span>
42
+ <span class="line"><span>lang = &#39;Japanese&#39;</span></span>
43
+ <span class="line"><span>tag = &#39;kan-bayashi/jsut_full_band_vits_prosody&#39; #@param [&quot;kan-bayashi/jsut_tacotron2&quot;, &quot;kan-bayashi/jsut_transformer&quot;, &quot;kan-bayashi/jsut_fastspeech&quot;, &quot;kan-bayashi/jsut_fastspeech2&quot;, &quot;kan-bayashi/jsut_conformer_fastspeech2&quot;, &quot;kan-bayashi/jsut_conformer_fastspeech2_accent&quot;, &quot;kan-bayashi/jsut_conformer_fastspeech2_accent_with_pause&quot;, &quot;kan-bayashi/jsut_vits_accent_with_pause&quot;, &quot;kan-bayashi/jsut_full_band_vits_accent_with_pause&quot;, &quot;kan-bayashi/jsut_tacotron2_prosody&quot;, &quot;kan-bayashi/jsut_transformer_prosody&quot;, &quot;kan-bayashi/jsut_conformer_fastspeech2_tacotron2_prosody&quot;, &quot;kan-bayashi/jsut_vits_prosody&quot;, &quot;kan-bayashi/jsut_full_band_vits_prosody&quot;, &quot;kan-bayashi/jvs_jvs010_vits_prosody&quot;, &quot;kan-bayashi/tsukuyomi_full_band_vits_prosody&quot;] {type:&quot;string&quot;}</span></span>
44
+ <span class="line"><span>vocoder_tag = &#39;none&#39; #@param [&quot;none&quot;, &quot;parallel_wavegan/jsut_parallel_wavegan.v1&quot;, &quot;parallel_wavegan/jsut_multi_band_melgan.v2&quot;, &quot;parallel_wavegan/jsut_style_melgan.v1&quot;, &quot;parallel_wavegan/jsut_hifigan.v1&quot;] {type:&quot;string&quot;}</span></span>
45
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>#@title Choose Mandarin model { run: &quot;auto&quot; }</span></span>
46
+ <span class="line"><span>lang = &#39;Mandarin&#39;</span></span>
47
+ <span class="line"><span>tag = &#39;kan-bayashi/csmsc_full_band_vits&#39; #@param [&quot;kan-bayashi/csmsc_tacotron2&quot;, &quot;kan-bayashi/csmsc_transformer&quot;, &quot;kan-bayashi/csmsc_fastspeech&quot;, &quot;kan-bayashi/csmsc_fastspeech2&quot;, &quot;kan-bayashi/csmsc_conformer_fastspeech2&quot;, &quot;kan-bayashi/csmsc_vits&quot;, &quot;kan-bayashi/csmsc_full_band_vits&quot;] {type: &quot;string&quot;}</span></span>
48
+ <span class="line"><span>vocoder_tag = &quot;none&quot; #@param [&quot;none&quot;, &quot;parallel_wavegan/csmsc_parallel_wavegan.v1&quot;, &quot;parallel_wavegan/csmsc_multi_band_melgan.v2&quot;, &quot;parallel_wavegan/csmsc_hifigan.v1&quot;, &quot;parallel_wavegan/csmsc_style_melgan.v1&quot;] {type:&quot;string&quot;}</span></span>
49
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="model-setup" tabindex="-1"><a class="header-anchor" href="#model-setup"><span>Model Setup</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from espnet2.bin.tts_inference import Text2Speech</span></span>
50
+ <span class="line"><span>from espnet2.utils.types import str_or_none</span></span>
51
+ <span class="line"><span></span></span>
52
+ <span class="line"><span>text2speech = Text2Speech.from_pretrained(</span></span>
53
+ <span class="line"><span> model_tag=str_or_none(tag),</span></span>
54
+ <span class="line"><span> vocoder_tag=str_or_none(vocoder_tag),</span></span>
55
+ <span class="line"><span> device=&quot;cuda&quot;,</span></span>
56
+ <span class="line"><span> # Only for Tacotron 2 &amp; Transformer</span></span>
57
+ <span class="line"><span> threshold=0.5,</span></span>
58
+ <span class="line"><span> # Only for Tacotron 2</span></span>
59
+ <span class="line"><span> minlenratio=0.0,</span></span>
60
+ <span class="line"><span> maxlenratio=10.0,</span></span>
61
+ <span class="line"><span> use_att_constraint=False,</span></span>
62
+ <span class="line"><span> backward_window=1,</span></span>
63
+ <span class="line"><span> forward_window=3,</span></span>
64
+ <span class="line"><span> # Only for FastSpeech &amp; FastSpeech2 &amp; VITS</span></span>
65
+ <span class="line"><span> speed_control_alpha=1.0,</span></span>
66
+ <span class="line"><span> # Only for VITS</span></span>
67
+ <span class="line"><span> noise_scale=0.333,</span></span>
68
+ <span class="line"><span> noise_scale_dur=0.333,</span></span>
69
+ <span class="line"><span>)</span></span>
70
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="synthesis" tabindex="-1"><a class="header-anchor" href="#synthesis"><span>Synthesis</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import time</span></span>
71
+ <span class="line"><span>import torch</span></span>
72
+ <span class="line"><span></span></span>
73
+ <span class="line"><span># decide the input sentence by yourself</span></span>
74
+ <span class="line"><span>print(f&quot;Input your favorite sentence in {lang}.&quot;)</span></span>
75
+ <span class="line"><span>x = input()</span></span>
76
+ <span class="line"><span></span></span>
77
+ <span class="line"><span># synthesis</span></span>
78
+ <span class="line"><span>with torch.no_grad():</span></span>
79
+ <span class="line"><span> start = time.time()</span></span>
80
+ <span class="line"><span> wav = text2speech(x)[&quot;wav&quot;]</span></span>
81
+ <span class="line"><span>rtf = (time.time() - start) / (len(wav) / text2speech.fs)</span></span>
82
+ <span class="line"><span>print(f&quot;RTF = {rtf:5f}&quot;)</span></span>
83
+ <span class="line"><span></span></span>
84
+ <span class="line"><span># let us listen to generated samples</span></span>
85
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
86
+ <span class="line"><span>display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))</span></span>
87
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="multi-speaker-model-demo" tabindex="-1"><a class="header-anchor" href="#multi-speaker-model-demo"><span>Multi-speaker Model Demo</span></a></h2><h3 id="model-selection-1" tabindex="-1"><a class="header-anchor" href="#model-selection-1"><span>Model Selection</span></a></h3><p>Now we provide only English multi-speaker pretrained model.</p><blockquote><p>The terms of use follow that of each corpus. We use the following corpora:</p></blockquote><ul><li><code>libritts_*</code>: LibriTTS corpus <ul><li>http://www.openslr.org/60</li></ul></li><li><code>vctk_*</code>: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit <ul><li>http://www.udialogue.org/download/cstr-vctk-corpus.html</li></ul></li></ul><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>#@title English multi-speaker pretrained model { run: &quot;auto&quot; }</span></span>
88
+ <span class="line"><span>lang = &#39;English&#39;</span></span>
89
+ <span class="line"><span>tag = &#39;kan-bayashi/vctk_full_band_multi_spk_vits&#39; #@param [&quot;kan-bayashi/vctk_gst_tacotron2&quot;, &quot;kan-bayashi/vctk_gst_transformer&quot;, &quot;kan-bayashi/vctk_xvector_tacotron2&quot;, &quot;kan-bayashi/vctk_xvector_transformer&quot;, &quot;kan-bayashi/vctk_xvector_conformer_fastspeech2&quot;, &quot;kan-bayashi/vctk_gst+xvector_tacotron2&quot;, &quot;kan-bayashi/vctk_gst+xvector_transformer&quot;, &quot;kan-bayashi/vctk_gst+xvector_conformer_fastspeech2&quot;, &quot;kan-bayashi/vctk_multi_spk_vits&quot;, &quot;kan-bayashi/vctk_full_band_multi_spk_vits&quot;, &quot;kan-bayashi/libritts_xvector_transformer&quot;, &quot;kan-bayashi/libritts_xvector_conformer_fastspeech2&quot;, &quot;kan-bayashi/libritts_gst+xvector_transformer&quot;, &quot;kan-bayashi/libritts_gst+xvector_conformer_fastspeech2&quot;, &quot;kan-bayashi/libritts_xvector_vits&quot;] {type:&quot;string&quot;}</span></span>
90
+ <span class="line"><span>vocoder_tag = &quot;none&quot; #@param [&quot;none&quot;, &quot;parallel_wavegan/vctk_parallel_wavegan.v1.long&quot;, &quot;parallel_wavegan/vctk_multi_band_melgan.v2&quot;, &quot;parallel_wavegan/vctk_style_melgan.v1&quot;, &quot;parallel_wavegan/vctk_hifigan.v1&quot;, &quot;parallel_wavegan/libritts_parallel_wavegan.v1.long&quot;, &quot;parallel_wavegan/libritts_multi_band_melgan.v2&quot;, &quot;parallel_wavegan/libritts_hifigan.v1&quot;, &quot;parallel_wavegan/libritts_style_melgan.v1&quot;] {type:&quot;string&quot;}</span></span>
91
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="model-setup-1" tabindex="-1"><a class="header-anchor" href="#model-setup-1"><span>Model Setup</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from espnet2.bin.tts_inference import Text2Speech</span></span>
92
+ <span class="line"><span>from espnet2.utils.types import str_or_none</span></span>
93
+ <span class="line"><span></span></span>
94
+ <span class="line"><span>text2speech = Text2Speech.from_pretrained(</span></span>
95
+ <span class="line"><span> model_tag=str_or_none(tag),</span></span>
96
+ <span class="line"><span> vocoder_tag=str_or_none(vocoder_tag),</span></span>
97
+ <span class="line"><span> device=&quot;cuda&quot;,</span></span>
98
+ <span class="line"><span> # Only for Tacotron 2 &amp; Transformer</span></span>
99
+ <span class="line"><span> threshold=0.5,</span></span>
100
+ <span class="line"><span> # Only for Tacotron 2</span></span>
101
+ <span class="line"><span> minlenratio=0.0,</span></span>
102
+ <span class="line"><span> maxlenratio=10.0,</span></span>
103
+ <span class="line"><span> use_att_constraint=False,</span></span>
104
+ <span class="line"><span> backward_window=1,</span></span>
105
+ <span class="line"><span> forward_window=3,</span></span>
106
+ <span class="line"><span> # Only for FastSpeech &amp; FastSpeech2 &amp; VITS</span></span>
107
+ <span class="line"><span> speed_control_alpha=1.0,</span></span>
108
+ <span class="line"><span> # Only for VITS</span></span>
109
+ <span class="line"><span> noise_scale=0.333,</span></span>
110
+ <span class="line"><span> noise_scale_dur=0.333,</span></span>
111
+ <span class="line"><span>)</span></span>
112
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="speaker-selection" tabindex="-1"><a class="header-anchor" href="#speaker-selection"><span>Speaker selection</span></a></h3><p>For multi-speaker model, we need to provide X-vector and/or the reference speech to decide the speaker characteristics.<br> For X-vector, you can select the speaker from the dumped x-vectors.<br> For the reference speech, you can use any speech but please make sure the sampling rate is matched.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import glob</span></span>
113
+ <span class="line"><span>import os</span></span>
114
+ <span class="line"><span>import numpy as np</span></span>
115
+ <span class="line"><span>import kaldiio</span></span>
116
+ <span class="line"><span></span></span>
117
+ <span class="line"><span># Get model directory path</span></span>
118
+ <span class="line"><span>from espnet_model_zoo.downloader import ModelDownloader</span></span>
119
+ <span class="line"><span>d = ModelDownloader()</span></span>
120
+ <span class="line"><span>model_dir = os.path.dirname(d.download_and_unpack(tag)[&quot;train_config&quot;])</span></span>
121
+ <span class="line"><span></span></span>
122
+ <span class="line"><span># X-vector selection</span></span>
123
+ <span class="line"><span>spembs = None</span></span>
124
+ <span class="line"><span>if text2speech.use_spembs:</span></span>
125
+ <span class="line"><span> xvector_ark = [p for p in glob.glob(f&quot;{model_dir}/../../dump/**/spk_xvector.ark&quot;, recursive=True) if &quot;tr&quot; in p][0]</span></span>
126
+ <span class="line"><span> xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}</span></span>
127
+ <span class="line"><span> spks = list(xvectors.keys())</span></span>
128
+ <span class="line"><span></span></span>
129
+ <span class="line"><span> # randomly select speaker</span></span>
130
+ <span class="line"><span> random_spk_idx = np.random.randint(0, len(spks))</span></span>
131
+ <span class="line"><span> spk = spks[random_spk_idx]</span></span>
132
+ <span class="line"><span> spembs = xvectors[spk]</span></span>
133
+ <span class="line"><span> print(f&quot;selected spk: {spk}&quot;)</span></span>
134
+ <span class="line"><span></span></span>
135
+ <span class="line"><span># Speaker ID selection</span></span>
136
+ <span class="line"><span>sids = None</span></span>
137
+ <span class="line"><span>if text2speech.use_sids:</span></span>
138
+ <span class="line"><span> spk2sid = glob.glob(f&quot;{model_dir}/../../dump/**/spk2sid&quot;, recursive=True)[0]</span></span>
139
+ <span class="line"><span> with open(spk2sid) as f:</span></span>
140
+ <span class="line"><span> lines = [line.strip() for line in f.readlines()]</span></span>
141
+ <span class="line"><span> sid2spk = {int(line.split()[1]): line.split()[0] for line in lines}</span></span>
142
+ <span class="line"><span> </span></span>
143
+ <span class="line"><span> # randomly select speaker</span></span>
144
+ <span class="line"><span> sids = np.array(np.random.randint(1, len(sid2spk)))</span></span>
145
+ <span class="line"><span> spk = sid2spk[int(sids)]</span></span>
146
+ <span class="line"><span> print(f&quot;selected spk: {spk}&quot;)</span></span>
147
+ <span class="line"><span></span></span>
148
+ <span class="line"><span># Reference speech selection for GST</span></span>
149
+ <span class="line"><span>speech = None</span></span>
150
+ <span class="line"><span>if text2speech.use_speech:</span></span>
151
+ <span class="line"><span> # you can change here to load your own reference speech</span></span>
152
+ <span class="line"><span> # e.g.</span></span>
153
+ <span class="line"><span> # import soundfile as sf</span></span>
154
+ <span class="line"><span> # speech, fs = sf.read(&quot;/path/to/reference.wav&quot;)</span></span>
155
+ <span class="line"><span> # speech = torch.from_numpy(speech).float()</span></span>
156
+ <span class="line"><span> speech = torch.randn(50000,) * 0.01</span></span>
157
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="synthesis-1" tabindex="-1"><a class="header-anchor" href="#synthesis-1"><span>Synthesis</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import time</span></span>
158
+ <span class="line"><span>import torch</span></span>
159
+ <span class="line"><span></span></span>
160
+ <span class="line"><span># decide the input sentence by yourself</span></span>
161
+ <span class="line"><span>print(f&quot;Input your favorite sentence in {lang}.&quot;)</span></span>
162
+ <span class="line"><span>x = input()</span></span>
163
+ <span class="line"><span></span></span>
164
+ <span class="line"><span># synthesis</span></span>
165
+ <span class="line"><span>with torch.no_grad():</span></span>
166
+ <span class="line"><span> start = time.time()</span></span>
167
+ <span class="line"><span> wav = text2speech(x, speech=speech, spembs=spembs, sids=sids)[&quot;wav&quot;]</span></span>
168
+ <span class="line"><span>rtf = (time.time() - start) / (len(wav) / text2speech.fs)</span></span>
169
+ <span class="line"><span>print(f&quot;RTF = {rtf:5f}&quot;)</span></span>
170
+ <span class="line"><span></span></span>
171
+ <span class="line"><span># let us listen to generated samples</span></span>
172
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
173
+ <span class="line"><span>display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))</span></span>
174
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><nav class="vp-page-nav" aria-label="page navigation"><a class="route-link prev" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><div class="hint"><span class="arrow left"></span> Prev</div><div class="link"><span>ESPnet real time E2E-TTS demonstration</span></div><!--]--></a><!----></nav><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
175
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
176
+ </body>
177
+ </html>
espnet2/tts/tts_cli.html ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>Text-to-Speech (Recipe) | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/tts_cli.html-BfB21gs4.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" 
as="script"><link rel="prefetch" href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a 
class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span 
class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 1.002 0 0 0-.175.016a13.096 13.096 
0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS 
realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition 
(Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" 
aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading active">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link route-link-active sidebar-item active" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#setup-envrionment" aria-label="Setup envrionment"><!--[--><!--[--><!--]--> Setup envrionment <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#run-the-recipe" aria-label="Run the recipe"><!--[--><!--[--><!--]--> Run the recipe <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#stage-1-data-download" aria-label="Stage -1: Data download"><!--[--><!--[--><!--]--> Stage -1: Data download <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#stage-0-data-preparation" aria-label="Stage 
0: Data preparation"><!--[--><!--[--><!--]--> Stage 0: Data preparation <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#stage-1-feature-extration" aria-label="Stage 1: Feature extration"><!--[--><!--[--><!--]--> Stage 1: Feature extration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#stage-2-dictionary-and-json-preparation" aria-label="Stage 2: Dictionary and json preparation"><!--[--><!--[--><!--]--> Stage 2: Dictionary and json preparation <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#stage-3-network-training" aria-label="Stage 3: Network training"><!--[--><!--[--><!--]--> Stage 3: Network training <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#stage-4-network-decoding" aria-label="Stage 4: Network decoding"><!--[--><!--[--><!--]--> Stage 4: Network decoding <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#stage-5-waveform-synthesis" aria-label="Stage 5: Waveform synthesis"><!--[--><!--[--><!--]--> Stage 5: Waveform synthesis <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="#next-step" aria-label="NEXT step"><!--[--><!--[--><!--]--> NEXT step <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SE <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a 
class="route-link sidebar-item" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SLU <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_streaming_asr_demo.html" 
aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">OTHERS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ST <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><h1 id="text-to-speech-recipe" tabindex="-1"><a class="header-anchor" href="#text-to-speech-recipe"><span>Text-to-Speech (Recipe)</span></a></h1><p>This is the example notebook of how-to-run the ESPnet TTS recipe using an4 dataset.<br> You can understand the overview of TTS recipe through this notebook within an hour!</p><p>See also:</p><ul><li>Documentaion: <a href="https://espnet.github.io/espnet" target="_blank" rel="noopener noreferrer">https://espnet.github.io/espnet<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" 
d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></li><li>Github: <a href="https://github.com/espnet" target="_blank" rel="noopener noreferrer">https://github.com/espnet<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></li></ul><p>Author: <a href="https://github.com/kan-bayashi" target="_blank" rel="noopener noreferrer">Tomoki Hayashi<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></p><p>Last update: 2019/07/25</p><h2 id="setup-envrionment" tabindex="-1"><a class="header-anchor" href="#setup-envrionment"><span>Setup envrionment</span></a></h2><p>First, let&#39;s setup the environmet to run the recipe.<br> It take around 10 minues. 
Please keep waiting for a while.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># OS setup</span></span>
35
+ <span class="line"><span>!sudo apt-get install bc tree</span></span>
36
+ <span class="line"><span>!cat /etc/os-release</span></span>
37
+ <span class="line"><span></span></span>
38
+ <span class="line"><span># espnet setup</span></span>
39
+ <span class="line"><span>!git clone https://github.com/espnet/espnet</span></span>
40
+ <span class="line"><span>!cd espnet; pip install -e .</span></span>
41
+ <span class="line"><span></span></span>
42
+ <span class="line"><span># warp ctc setup</span></span>
43
+ <span class="line"><span>!git clone https://github.com/espnet/warp-ctc -b pytorch-1.1</span></span>
44
+ <span class="line"><span>!cd warp-ctc &amp;&amp; mkdir build &amp;&amp; cd build &amp;&amp; cmake .. &amp;&amp; make -j</span></span>
45
+ <span class="line"><span>!cd warp-ctc/pytorch_binding &amp;&amp; python setup.py install </span></span>
46
+ <span class="line"><span></span></span>
47
+ <span class="line"><span># kaldi setup</span></span>
48
+ <span class="line"><span>!cd /content/espnet/tools; git clone https://github.com/kaldi-asr/kaldi</span></span>
49
+ <span class="line"><span>!echo &quot;&quot; &gt; ./espnet/tools/kaldi/tools/extras/check_dependencies.sh # ignore check</span></span>
50
+ <span class="line"><span>!chmod +x ./espnet/tools/kaldi/tools/extras/check_dependencies.sh</span></span>
51
+ <span class="line"><span>!cd ./espnet/tools/kaldi/tools; make sph2pipe sclite</span></span>
52
+ <span class="line"><span>!rm -rf espnet/tools/kaldi/tools/python</span></span>
53
+ <span class="line"><span>!wget https://18-198329952-gh.circle-artifacts.com/0/home/circleci/repo/ubuntu16-featbin.tar.gz</span></span>
54
+ <span class="line"><span>!tar -xf ./ubuntu16-featbin.tar.gz # take a few minutes</span></span>
55
+ <span class="line"><span>!cp featbin/* espnet/tools/kaldi/src/featbin/</span></span>
56
+ <span class="line"><span></span></span>
57
+ <span class="line"><span># make dummy activate</span></span>
58
+ <span class="line"><span>!mkdir -p espnet/tools/venv/bin</span></span>
59
+ <span class="line"><span>!touch espnet/tools/venv/bin/activate</span></span>
60
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="run-the-recipe" tabindex="-1"><a class="header-anchor" href="#run-the-recipe"><span>Run the recipe</span></a></h2><p>Now ready to run the recipe!<br> We use the most simplest recipe <code>egs/an4/tts1</code> as an example.</p><blockquote><p>Unfortunately, <code>egs/an4/tts1</code> is too small to generate reasonable speech.<br> But you can understand the flow or TTS recipe through this recipe since all of the TTS recipes has the exactly same flow.</p></blockquote><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># Let&#39;s go to an4 recipe!</span></span>
61
+ <span class="line"><span>import os</span></span>
62
+ <span class="line"><span>os.chdir(&quot;/content/espnet/egs/an4/tts1&quot;)</span></span>
63
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Before running the recipe, let us check the recipe structure.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!tree -L 1</span></span>
64
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Each recipe has the same structure and files.</p><ul><li><strong>run.sh</strong>: Main script of the recipe. Once you run this script, all of the processing will be conducted from data download, preparation, feature extraction, training, and decoding.</li><li><strong>cmd.sh</strong>: Command configuration source file about how-to-run each processing. You can modify this script if you want to run the script through job control system e.g. Slurm or Torque.</li><li><strong>path.sh</strong>: Path configuration source file. Basically, we do not have to touch.</li><li><strong>conf/</strong>: Directory containing configuration files.</li><li><strong>local/</strong>: Directory containing the recipe-specific scripts e.g. data preparation.</li><li><strong>steps/</strong> and <strong>utils/</strong>: Directory containing kaldi tools.</li></ul><p>Main script <strong>run.sh</strong> consists of several stages:</p><ul><li><strong>stage -1</strong>: Download data if the data is available online.</li><li><strong>stage 0</strong>: Prepare data to make kaldi-stype data directory.</li><li><strong>stage 1</strong>: Extract feature vector, calculate statistics, and perform normalization.</li><li><strong>stage 2</strong>: Prepare a dictionary and make json files for training.</li><li><strong>stage 3</strong>: Train the E2E-TTS network.</li><li><strong>stage 4</strong>: Decode mel-spectrogram using the trained network.</li><li><strong>stage 5</strong>: Generate a waveform from a generated mel-spectrogram using Griffin-Lim.</li></ul><p>Currently, we support the following networks:</p><ul><li>Tacotron2: <a href="https://arxiv.org/abs/1712.05884" target="_blank" rel="noopener noreferrer">Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" 
aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></li><li>Transformer: <a href="https://arxiv.org/pdf/1809.08895.pdf" target="_blank" rel="noopener noreferrer">Neural Speech Synthesis with Transformer Network<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></li><li>FastSpeech: <a href="https://arxiv.org/pdf/1905.09263.pdf" target="_blank" rel="noopener noreferrer">FastSpeech: Fast, Robust and Controllable Text to Speech<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></li></ul><p>Let us check each stage step-by-step via <strong>--stage</strong> and 
<strong>--stop_stage</strong> options!</p><h3 id="stage-1-data-download" tabindex="-1"><a class="header-anchor" href="#stage-1-data-download"><span>Stage -1: Data download</span></a></h3><p>This stage downloads dataset if the dataset is available online.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!./run.sh --stage -1 --stop_stage -1</span></span>
65
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!tree -L 1</span></span>
66
+ <span class="line"><span>!ls downloads/</span></span>
67
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>You can see <strong>downloads</strong> directory is cretead, which containing donwloaded an4 dataset.</p><h3 id="stage-0-data-preparation" tabindex="-1"><a class="header-anchor" href="#stage-0-data-preparation"><span>Stage 0: Data preparation</span></a></h3><p>This stage creates kaldi-style data directories.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!./run.sh --stage 0 --stop_stage 0</span></span>
68
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!tree -L 1 data</span></span>
69
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Through the data preparation stage, kaldi-style data directories will be created.<br> Here, <strong>data/train/</strong> is corresponding to training set, and <strong>data/test</strong> is corresponding to evaluation set.<br> Each directory has the same following files:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls data/*</span></span>
70
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>The above four files are all we have to prepare to create new recipes.<br> Let&#39;s check each file.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!head -n 3 data/train/{wav.scp,text,utt2spk,spk2utt}</span></span>
71
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Each file contains the following information:</p><ul><li><strong>wav.scp</strong>: List of audio path. Each line has <code>&lt;utt_id&gt; &lt;wavfile_path or command pipe&gt;</code>. <code>&lt;utt_id&gt;</code> must be unique.</li><li><strong>text</strong>: List of transcriptions. Each line has <code>&lt;utt_id&gt; &lt;transcription&gt;</code>. In the case of TTS, we assume that <code>&lt;transcription&gt;</code> is cleaned.</li><li><strong>utt2spk</strong>: List of correspondence table between utterances and speakers. Each line has <code>&lt;utt_id&gt; &lt;speaker_id&gt;</code>.</li><li><strong>spk2utt</strong>: List of correspondence table between speakers and utterances. Each lien has <code>&lt;speaker_id&gt; &lt;utt_id&gt; ... &lt;utt_id&gt; </code>. This file can be automatically created from <strong>utt2spk</strong>.</li></ul><p>In the ESPnet, speaker information is not used for any processing.<br> Therefore, <strong>utt2spk</strong> and <strong>spk2utt</strong> can be a dummy.</p><h3 id="stage-1-feature-extration" tabindex="-1"><a class="header-anchor" href="#stage-1-feature-extration"><span>Stage 1: Feature extration</span></a></h3><p>This stage performs the following processing:</p><ol><li>Mel-spectrogram extraction</li><li>Data split into training and validation set</li><li>Statistics (mean and variance) calculation</li><li>Normalization</li></ol><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!./run.sh --stage 1 --stop_stage 1 --nj 4</span></span>
72
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Raw filterbanks are saved in <strong>fbank/</strong> directory with ark/scp format.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls fbank</span></span>
73
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p><strong>.ark</strong> is binary file and <strong>.scp</strong> contain the correspondence between <code>&lt;utt_id&gt;</code> and <code>&lt;path_in_ark&gt;</code>.<br> Since feature extraction can be performed for split small sets in parallel, raw_fbank is split into <code>raw_fbank_*.{1..N}.{scp,ark}.</code></p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!head -n 3 fbank/raw_fbank_train.1.scp</span></span>
74
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>These files can be loaded in python via <strong>kaldiio</strong> as follows:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import kaldiio</span></span>
75
+ <span class="line"><span>import matplotlib.pyplot as plt</span></span>
76
+ <span class="line"><span></span></span>
77
+ <span class="line"><span># load scp file</span></span>
78
+ <span class="line"><span>scp_dict = kaldiio.load_scp(&quot;fbank/raw_fbank_train.1.scp&quot;)</span></span>
79
+ <span class="line"><span>for key in scp_dict:</span></span>
80
+ <span class="line"><span> plt.imshow(scp_dict[key].T[::-1])</span></span>
81
+ <span class="line"><span> plt.title(key)</span></span>
82
+ <span class="line"><span> plt.colorbar()</span></span>
83
+ <span class="line"><span> plt.show()</span></span>
84
+ <span class="line"><span> break</span></span>
85
+ <span class="line"><span> </span></span>
86
+ <span class="line"><span># load ark file</span></span>
87
+ <span class="line"><span>ark_generator = kaldiio.load_ark(&quot;fbank/raw_fbank_train.1.ark&quot;)</span></span>
88
+ <span class="line"><span>for key, array in ark_generator:</span></span>
89
+ <span class="line"><span> plt.imshow(array.T[::-1])</span></span>
90
+ <span class="line"><span> plt.title(key)</span></span>
91
+ <span class="line"><span> plt.colorbar()</span></span>
92
+ <span class="line"><span> plt.show()</span></span>
93
+ <span class="line"><span> break</span></span>
94
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>After raw mel-spectrogram extraction, some files are added in <strong>data/train/</strong>.<br><strong>feats.scp</strong> is concatenated scp file of <strong>fbank/raw_fbank_train.{1..N}.scp</strong>.<br><strong>utt2num_frames</strong> has the number of feature frames of each <code>&lt;utt_id&gt;</code>.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls data/train</span></span>
95
+ <span class="line"><span>!head -n 3 data/train/{feats.scp,utt2num_frames}</span></span>
96
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>And <strong>data/train/</strong> directory is split into two directory:</p><ul><li><strong>data/train_nodev/</strong>: data directory for training</li><li><strong>data/train_dev/</strong>: data directory for validation</li></ul><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls data</span></span>
97
+ <span class="line"><span>!ls data/train_*</span></span>
98
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>You can find <strong>cmvn.ark</strong> in <strong>data/train_nodev</strong>, which is the calculated statistics file.<br> This file also can be loaded in python via kaldiio.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># load cmvn.ark file (Be careful not load_ark, but load_mat)</span></span>
99
+ <span class="line"><span>cmvn = kaldiio.load_mat(&quot;data/train_nodev/cmvn.ark&quot;)</span></span>
100
+ <span class="line"><span></span></span>
101
+ <span class="line"><span># cmvn consists of mean and variance, the last dimension of mean represents the number of frames.</span></span>
102
+ <span class="line"><span>print(&quot;cmvn shape = &quot;+ str(cmvn.shape))</span></span>
103
+ <span class="line"><span></span></span>
104
+ <span class="line"><span># calculate mean and variance</span></span>
105
+ <span class="line"><span>mu = cmvn[0, :-1] / cmvn[0, -1]</span></span>
106
+ <span class="line"><span>var = cmvn[1, :-1] / cmvn[0, -1]</span></span>
107
+ <span class="line"><span></span></span>
108
+ <span class="line"><span># show mean</span></span>
109
+ <span class="line"><span>print(&quot;mean = &quot; + str(mu))</span></span>
110
+ <span class="line"><span>print(&quot;variance = &quot; + str(var))</span></span>
111
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Normalzed features for training, validation and evaluation set are dumped in <strong>dump/{train_nodev,train_dev,test}/</strong>.<br> There ark and scp can be loaded as the same as the above procedure.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls dump/*</span></span>
112
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="stage-2-dictionary-and-json-preparation" tabindex="-1"><a class="header-anchor" href="#stage-2-dictionary-and-json-preparation"><span>Stage 2: Dictionary and json preparation</span></a></h3><p>This stage creates dictrionary from <strong>data/train_nodev/text</strong> and makes json file for training.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!./run.sh --stage 2 --stop_stage 2</span></span>
113
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Dictrionary file will be created in <strong>data/lang_1char/</strong>.<br> Dictionary file consists of <code>&lt;token&gt;</code> <code>&lt;token index&gt;</code>.<br> Here, <code>&lt;token index&gt;</code> starts from 1 because 0 is used as padding index.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls data</span></span>
114
+ <span class="line"><span>!cat data/lang_1char/train_nodev_units.txt</span></span>
115
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>Json file will be created for training / validation /evaludation sets and they are saved as <strong>dump/{train_nodev,train_dev,test}/data.json</strong>.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls dump/*/*.json</span></span>
116
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Each json file contains all of the information in the data directory.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!head -n 27 dump/train_nodev/data.json</span></span>
117
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><ul><li>&quot;shape&quot;: Shape of the input or output sequence. Here input shape [63, 80] represents the number of frames = 63 and the dimension of mel-spectrogram = 80.</li><li>&quot;text&quot;: Original transcription.</li><li>&quot;token&quot;: Token sequence of original transcription.</li><li>&quot;tokenid&quot; Token id sequence of original transcription, which is converted using the dictionary.</li></ul><p>Now ready to start training!</p><h3 id="stage-3-network-training" tabindex="-1"><a class="header-anchor" href="#stage-3-network-training"><span>Stage 3: Network training</span></a></h3><p>This stage performs training of the network.<br> Network training configurations are written as <strong>.yaml</strong> format file.<br> Let us check the default cofiguration <strong>conf/train_pytroch_tacotron2.yaml</strong>.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cat conf/train_pytorch_tacotron2.yaml</span></span>
118
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>You can modify this configuration file to change the hyperparameters.<br> Here, let&#39;s change the number of epochs for this demonstration.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># TODO(kan-bayashi): Change here to use change_yaml.py</span></span>
119
+ <span class="line"><span>!cat conf/train_pytorch_tacotron2.yaml | sed -e &quot;s/epochs: 50/epochs: 3/g&quot; &gt; conf/train_pytorch_tacotron2_sample.yaml</span></span>
120
+ <span class="line"><span>!cat conf/train_pytorch_tacotron2_sample.yaml</span></span>
121
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Let&#39;s train the network.<br> You can specify the config file via <strong>--train_config</strong> option. It takes several minutes.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!./run.sh --stage 3 --stop_stage 3 --train_config conf/train_pytorch_tacotron2_sample.yaml --verbose 1</span></span>
122
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>You can see the training log in <code>exp/train_*/train.log</code>.</p><p>The models are saved in <code>exp/train_*/results/</code> directory.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/{results,results/att_ws}</span></span>
123
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p><code>exp/train_*/results/*.png</code> are the figures of training curve.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>from IPython.display import Image, display_png</span></span>
124
+ <span class="line"><span>print(&quot;all loss curve&quot;)</span></span>
125
+ <span class="line"><span>display_png(Image(&quot;exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/results/all_loss.png&quot;))</span></span>
126
+ <span class="line"><span>print(&quot;l1 loss curve&quot;)</span></span>
127
+ <span class="line"><span>display_png(Image(&quot;exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/results/l1_loss.png&quot;))</span></span>
128
+ <span class="line"><span>print(&quot;mse loss curve&quot;)</span></span>
129
+ <span class="line"><span>display_png(Image(&quot;exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/results/mse_loss.png&quot;))</span></span>
130
+ <span class="line"><span>print(&quot;bce loss curve&quot;)</span></span>
131
+ <span class="line"><span>display_png(Image(&quot;exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/results/bce_loss.png&quot;))</span></span>
132
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p><code>exp/train_*/results/att_ws/.png</code> are the figures of attention weights in each epoch.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>print(&quot;Attention weights of initial epoch&quot;)</span></span>
133
+ <span class="line"><span>display_png(Image(&quot;exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/results/att_ws/fash-cen1-b.ep.1.png&quot;))</span></span>
134
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p><code>exp/train_*/results/model.loss.best</code> contains only the model parameters.<br> On the other hand, <code>exp/train_*/results/snapshot</code> contains the model parameters, optimizer states, and iterator states.<br> So you can restart from the training by specifying the snapshot file with <strong>--resume</strong> option.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># resume training from snapshot.ep.2</span></span>
135
+ <span class="line"><span>!./run.sh --stage 3 --stop_stage 3 --train_config conf/train_pytorch_tacotron2_sample.yaml --resume exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/results/snapshot.ep.2 --verbose 1</span></span>
136
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!cat exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/train.log</span></span>
137
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Also, we support tensorboard.<br> You can see the training log through tensorboard.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>%load_ext tensorboard</span></span>
138
+ <span class="line"><span>%tensorboard --logdir tensorboard/train_nodev_pytorch_train_pytorch_tacotron2_sample/</span></span>
139
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="stage-4-network-decoding" tabindex="-1"><a class="header-anchor" href="#stage-4-network-decoding"><span>Stage 4: Network decoding</span></a></h3><p>This stage performs decoding using the trained model to generate mel-spectrogram from a given text.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!./run.sh --stage 4 --stop_stage 4 --nj 8 --train_config conf/train_pytorch_tacotron2_sample.yaml </span></span>
140
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Generated features are saved as ark/scp format.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/outputs_model.loss.best_decode/*</span></span>
141
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>We can specify the model or snapshot to be used for decoding via <strong>--model</strong>.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!./run.sh --stage 4 --stop_stage 4 --nj 8 --train_config conf/train_pytorch_tacotron2_sample.yaml --model snapshot.ep.2</span></span>
142
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/outputs_snapshot.ep.2_decode/*</span></span>
143
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="stage-5-waveform-synthesis" tabindex="-1"><a class="header-anchor" href="#stage-5-waveform-synthesis"><span>Stage 5: Waveform synthesis</span></a></h3><p>Finally, in this stage, we generate waveform using Grrifin-Lim algorithm.<br> First, we perform de-normalization to convert the generated mel-spectrogram into the original scale.<br> Then we apply Grrifin-Lim algorithm to restore phase components and apply inverse STFT to generate waveforms.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!./run.sh --stage 5 --stop_stage 5 --nj 8 --train_config conf/train_pytorch_tacotron2_sample.yaml --griffin_lim_iters 50</span></span>
144
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>Generated wav files are saved in <code>exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/outputs_model.loss.best_decode_denorm/*/wav</code></p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!ls exp/train_nodev_pytorch_train_pytorch_tacotron2_sample/outputs_model.loss.best_decode_denorm/*/wav</span></span>
145
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!tree -L 3</span></span>
146
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h2 id="next-step" tabindex="-1"><a class="header-anchor" href="#next-step"><span>NEXT step</span></a></h2><ul><li>Try pretrained model to generate speech.</li><li>Try a large single speaker dataset recipe <strong>egs/ljspeech/tts1</strong>.</li><li>Try a large multi-speaker recipe <strong>egs/libritts/tts1</strong>.</li><li>Make the original recipe using your own dataset.</li></ul></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><nav class="vp-page-nav" aria-label="page navigation"><!----><a class="route-link next" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><div class="hint">Next <span class="arrow right"></span></div><div class="link"><span>ESPnet real time E2E-TTS demonstration</span></div><!--]--></a></nav><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
147
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
148
+ </body>
149
+ </html>
espnet2/tts/tts_realtime_demo.html ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>ESPnet real time E2E-TTS demonstration | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/tts_realtime_demo.html-BKOGq7as.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link 
rel="prefetch" href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a 
class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span 
class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 1.002 0 0 0-.175.016a13.096 13.096 
0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS 
realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition 
(Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" 
aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading active">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link route-link-active sidebar-item active" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#install" aria-label="Install"><!--[--><!--[--><!--]--> Install <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#english-demo" aria-label="English demo"><!--[--><!--[--><!--]--> English demo <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" 
href="#download-pretrained-feature-generation-model" aria-label="Download pretrained feature generation model"><!--[--><!--[--><!--]--> Download pretrained feature generation model <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#download-pretrained-vocoder-model" aria-label="Download pretrained vocoder model"><!--[--><!--[--><!--]--> Download pretrained vocoder model <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#setup" aria-label="Setup"><!--[--><!--[--><!--]--> Setup <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#synthesis" aria-label="Synthesis"><!--[--><!--[--><!--]--> Synthesis <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="#japanese-demo" aria-label="Japanese demo"><!--[--><!--[--><!--]--> Japanese demo <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#install-japanese-dependencies" aria-label="Install Japanese dependencies"><!--[--><!--[--><!--]--> Install Japanese dependencies <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#download-pretrained-models" aria-label="Download pretrained models"><!--[--><!--[--><!--]--> Download pretrained models <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#setup-1" aria-label="Setup"><!--[--><!--[--><!--]--> Setup <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#synthesis-1" aria-label="Synthesis"><!--[--><!--[--><!--]--> Synthesis <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="#mandarin-demo" aria-label="Mandarin demo"><!--[--><!--[--><!--]--> Mandarin demo <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#install-mandarin-dependencies" 
aria-label="Install Mandarin dependencies"><!--[--><!--[--><!--]--> Install Mandarin dependencies <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#download-pretrained-models-1" aria-label="Download pretrained models"><!--[--><!--[--><!--]--> Download pretrained models <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#setup-2" aria-label="Setup"><!--[--><!--[--><!--]--> Setup <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#synthesis-2" aria-label="Synthesis"><!--[--><!--[--><!--]--> Synthesis <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SE <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">SLU <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item 
sidebar-heading">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">OTHERS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ST <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" 
href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><p><a href="https://colab.research.google.com/github/espnet/notebook/blob/master/tts_realtime_demo.ipynb" target="_blank" rel="noopener noreferrer"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"><span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a></p><h1 id="espnet-real-time-e2e-tts-demonstration" tabindex="-1"><a class="header-anchor" href="#espnet-real-time-e2e-tts-demonstration"><span>ESPnet real time E2E-TTS demonstration</span></a></h1><p>This notebook provides a demonstration of the realtime E2E-TTS using ESPnet-TTS and ParallelWaveGAN (+ MelGAN).</p><ul><li>ESPnet: https://github.com/espnet/espnet</li><li>ParallelWaveGAN: https://github.com/kan-bayashi/ParallelWaveGAN</li></ul><p>Author: Tomoki Hayashi (<a href="https://github.com/kan-bayashi" target="_blank" rel="noopener noreferrer">@kan-bayashi<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" 
d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a>)</p><h2 id="install" tabindex="-1"><a class="header-anchor" href="#install"><span>Install</span></a></h2><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># install minimal components</span></span>
35
+ <span class="line"><span>!pip install -q parallel_wavegan PyYaml unidecode ConfigArgparse g2p_en espnet_tts_frontend</span></span>
36
+ <span class="line"><span>!pip install --upgrade --no-cache-dir gdown</span></span>
37
+ <span class="line"><span>!git clone -q https://github.com/espnet/espnet.git</span></span>
38
+ <span class="line"><span>!cd espnet &amp;&amp; git fetch &amp;&amp; git checkout -b v.0.9.1 refs/tags/v.0.9.1</span></span>
39
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><hr><h2 id="english-demo" tabindex="-1"><a class="header-anchor" href="#english-demo"><span>English demo</span></a></h2><h3 id="download-pretrained-feature-generation-model" tabindex="-1"><a class="header-anchor" href="#download-pretrained-feature-generation-model"><span>Download pretrained feature generation model</span></a></h3><p>You can select one from three models. Please only run the seletected model cells.</p><h4 id="a-tacotron2" tabindex="-1"><a class="header-anchor" href="#a-tacotron2"><span>(a) Tacotron2</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained model</span></span>
40
+ <span class="line"><span>import os</span></span>
41
+ <span class="line"><span>if not os.path.exists(&quot;downloads/en/tacotron2&quot;):</span></span>
42
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \</span></span>
43
+ <span class="line"><span> https://drive.google.com/open?id=1lFfeyewyOsxaNO-DEWy9iSz6qB9ZS1UR downloads/en/tacotron2 tar.gz</span></span>
44
+ <span class="line"><span></span></span>
45
+ <span class="line"><span># set path</span></span>
46
+ <span class="line"><span>trans_type = &quot;phn&quot;</span></span>
47
+ <span class="line"><span>dict_path = &quot;downloads/en/tacotron2/data/lang_1phn/phn_train_no_dev_units.txt&quot;</span></span>
48
+ <span class="line"><span>model_path = &quot;downloads/en/tacotron2/exp/phn_train_no_dev_pytorch_train_pytorch_tacotron2.v3/results/model.last1.avg.best&quot;</span></span>
49
+ <span class="line"><span></span></span>
50
+ <span class="line"><span>print(&quot;sucessfully finished download.&quot;)</span></span>
51
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="b-transformer" tabindex="-1"><a class="header-anchor" href="#b-transformer"><span>(b) Transformer</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained model</span></span>
52
+ <span class="line"><span>import os</span></span>
53
+ <span class="line"><span>if not os.path.exists(&quot;downloads/en/transformer&quot;):</span></span>
54
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \</span></span>
55
+ <span class="line"><span> https://drive.google.com/open?id=1z8KSOWVBjK-_Ws4RxVN4NTx-Buy03-7c downloads/en/transformer tar.gz</span></span>
56
+ <span class="line"><span></span></span>
57
+ <span class="line"><span># set path</span></span>
58
+ <span class="line"><span>trans_type = &quot;phn&quot;</span></span>
59
+ <span class="line"><span>dict_path = &quot;downloads/en/transformer/data/lang_1phn/phn_train_no_dev_units.txt&quot;</span></span>
60
+ <span class="line"><span>model_path = &quot;downloads/en/transformer/exp/phn_train_no_dev_pytorch_train_pytorch_transformer.v3.single/results/model.last1.avg.best&quot;</span></span>
61
+ <span class="line"><span></span></span>
62
+ <span class="line"><span>print(&quot;sucessfully finished download.&quot;)</span></span>
63
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="c-fastspeech" tabindex="-1"><a class="header-anchor" href="#c-fastspeech"><span>(c) FastSpeech</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained model</span></span>
64
+ <span class="line"><span>import os</span></span>
65
+ <span class="line"><span>if not os.path.exists(&quot;downloads/en/fastspeech&quot;):</span></span>
66
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \</span></span>
67
+ <span class="line"><span> https://drive.google.com/open?id=1P9I4qag8wAcJiTCPawt6WCKBqUfJFtFp downloads/en/fastspeech tar.gz</span></span>
68
+ <span class="line"><span></span></span>
69
+ <span class="line"><span># set path</span></span>
70
+ <span class="line"><span>trans_type = &quot;phn&quot;</span></span>
71
+ <span class="line"><span>dict_path = &quot;downloads/en/fastspeech/data/lang_1phn/phn_train_no_dev_units.txt&quot;</span></span>
72
+ <span class="line"><span>model_path = &quot;downloads/en/fastspeech/exp/phn_train_no_dev_pytorch_train_tacotron2.v3_fastspeech.v4.single/results/model.last1.avg.best&quot;</span></span>
73
+ <span class="line"><span></span></span>
74
+ <span class="line"><span>print(&quot;Sucessfully finished download.&quot;)</span></span>
75
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="download-pretrained-vocoder-model" tabindex="-1"><a class="header-anchor" href="#download-pretrained-vocoder-model"><span>Download pretrained vocoder model</span></a></h3><p>You can select one from two models. Please only run the seletected model cells.</p><h4 id="a-parallel-wavegan" tabindex="-1"><a class="header-anchor" href="#a-parallel-wavegan"><span>(a) Parallel WaveGAN</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained model</span></span>
76
+ <span class="line"><span>import os</span></span>
77
+ <span class="line"><span>if not os.path.exists(&quot;downloads/en/parallel_wavegan&quot;):</span></span>
78
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \</span></span>
79
+ <span class="line"><span> https://drive.google.com/open?id=1Grn7X9wD35UcDJ5F7chwdTqTa4U7DeVB downloads/en/parallel_wavegan tar.gz</span></span>
80
+ <span class="line"><span></span></span>
81
+ <span class="line"><span># set path</span></span>
82
+ <span class="line"><span>vocoder_path = &quot;downloads/en/parallel_wavegan/ljspeech.parallel_wavegan.v2/checkpoint-400000steps.pkl&quot;</span></span>
83
+ <span class="line"><span></span></span>
84
+ <span class="line"><span>print(&quot;Sucessfully finished download.&quot;)</span></span>
85
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="b-melgan" tabindex="-1"><a class="header-anchor" href="#b-melgan"><span>(b) MelGAN</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained model</span></span>
86
+ <span class="line"><span>import os</span></span>
87
+ <span class="line"><span>if not os.path.exists(&quot;downloads/en/melgan&quot;):</span></span>
88
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \</span></span>
89
+ <span class="line"><span> https://drive.google.com/open?id=1_a8faVA5OGCzIcJNw4blQYjfG4oA9VEt downloads/en/melgan tar.gz</span></span>
90
+ <span class="line"><span></span></span>
91
+ <span class="line"><span># set path</span></span>
92
+ <span class="line"><span>vocoder_path = &quot;downloads/en/melgan/train_nodev_ljspeech_melgan.v3.long/checkpoint-4000000steps.pkl&quot;</span></span>
93
+ <span class="line"><span></span></span>
94
+ <span class="line"><span>print(&quot;Sucessfully finished download.&quot;)</span></span>
95
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="c-multi-band-melgan" tabindex="-1"><a class="header-anchor" href="#c-multi-band-melgan"><span>(c) Multi-band MelGAN</span></a></h4><p>This is an <strong>EXPERIMENTAL</strong> model.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained model</span></span>
96
+ <span class="line"><span>import os</span></span>
97
+ <span class="line"><span>if not os.path.exists(&quot;downloads/en/mb-melgan&quot;):</span></span>
98
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \</span></span>
99
+ <span class="line"><span> https://drive.google.com/open?id=1rGG5y15uy4WZ-lJy8NPVTkmB_6VhC20V downloads/en/mb-melgan tar.gz</span></span>
100
+ <span class="line"><span></span></span>
101
+ <span class="line"><span># set path</span></span>
102
+ <span class="line"><span>vocoder_path = &quot;downloads/en/mb-melgan/train_nodev_ljspeech_multi_band_melgan.v1/checkpoint-1000000steps.pkl&quot;</span></span>
103
+ <span class="line"><span></span></span>
104
+ <span class="line"><span>print(&quot;Sucessfully finished download.&quot;)</span></span>
105
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="setup" tabindex="-1"><a class="header-anchor" href="#setup"><span>Setup</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># add path</span></span>
106
+ <span class="line"><span>import sys</span></span>
107
+ <span class="line"><span>sys.path.append(&quot;espnet&quot;)</span></span>
108
+ <span class="line"><span></span></span>
109
+ <span class="line"><span># define device</span></span>
110
+ <span class="line"><span>import torch</span></span>
111
+ <span class="line"><span>device = torch.device(&quot;cuda&quot;)</span></span>
112
+ <span class="line"><span></span></span>
113
+ <span class="line"><span># define E2E-TTS model</span></span>
114
+ <span class="line"><span>from argparse import Namespace</span></span>
115
+ <span class="line"><span>from espnet.asr.asr_utils import get_model_conf</span></span>
116
+ <span class="line"><span>from espnet.asr.asr_utils import torch_load</span></span>
117
+ <span class="line"><span>from espnet.utils.dynamic_import import dynamic_import</span></span>
118
+ <span class="line"><span>idim, odim, train_args = get_model_conf(model_path)</span></span>
119
+ <span class="line"><span>model_class = dynamic_import(train_args.model_module)</span></span>
120
+ <span class="line"><span>model = model_class(idim, odim, train_args)</span></span>
121
+ <span class="line"><span>torch_load(model_path, model)</span></span>
122
+ <span class="line"><span>model = model.eval().to(device)</span></span>
123
+ <span class="line"><span>inference_args = Namespace(**{</span></span>
124
+ <span class="line"><span> &quot;threshold&quot;: 0.5,&quot;minlenratio&quot;: 0.0, &quot;maxlenratio&quot;: 10.0,</span></span>
125
+ <span class="line"><span> # Only for Tacotron 2</span></span>
126
+ <span class="line"><span> &quot;use_attention_constraint&quot;: True, &quot;backward_window&quot;: 1,&quot;forward_window&quot;:3,</span></span>
127
+ <span class="line"><span> # Only for fastspeech (lower than 1.0 is faster speech, higher than 1.0 is slower speech)</span></span>
128
+ <span class="line"><span> &quot;fastspeech_alpha&quot;: 1.0,</span></span>
129
+ <span class="line"><span> })</span></span>
130
+ <span class="line"><span></span></span>
131
+ <span class="line"><span># define neural vocoder</span></span>
132
+ <span class="line"><span>from parallel_wavegan.utils import load_model</span></span>
133
+ <span class="line"><span>fs = 22050</span></span>
134
+ <span class="line"><span>vocoder = load_model(vocoder_path)</span></span>
135
+ <span class="line"><span>vocoder.remove_weight_norm()</span></span>
136
+ <span class="line"><span>vocoder = vocoder.eval().to(device)</span></span>
137
+ <span class="line"><span></span></span>
138
+ <span class="line"><span># define text frontend</span></span>
139
+ <span class="line"><span>from tacotron_cleaner.cleaners import custom_english_cleaners</span></span>
140
+ <span class="line"><span>from g2p_en import G2p</span></span>
141
+ <span class="line"><span>with open(dict_path) as f:</span></span>
142
+ <span class="line"><span> lines = f.readlines()</span></span>
143
+ <span class="line"><span>lines = [line.replace(&quot;\n&quot;, &quot;&quot;).split(&quot; &quot;) for line in lines]</span></span>
144
+ <span class="line"><span>char_to_id = {c: int(i) for c, i in lines}</span></span>
145
+ <span class="line"><span>g2p = G2p()</span></span>
146
+ <span class="line"><span>def frontend(text):</span></span>
147
+ <span class="line"><span> &quot;&quot;&quot;Clean text and then convert to id sequence.&quot;&quot;&quot;</span></span>
148
+ <span class="line"><span> text = custom_english_cleaners(text)</span></span>
149
+ <span class="line"><span> </span></span>
150
+ <span class="line"><span> if trans_type == &quot;phn&quot;:</span></span>
151
+ <span class="line"><span> text = filter(lambda s: s != &quot; &quot;, g2p(text))</span></span>
152
+ <span class="line"><span> text = &quot; &quot;.join(text)</span></span>
153
+ <span class="line"><span> print(f&quot;Cleaned text: {text}&quot;)</span></span>
154
+ <span class="line"><span> charseq = text.split(&quot; &quot;)</span></span>
155
+ <span class="line"><span> else:</span></span>
156
+ <span class="line"><span> print(f&quot;Cleaned text: {text}&quot;)</span></span>
157
+ <span class="line"><span> charseq = list(text)</span></span>
158
+ <span class="line"><span> idseq = []</span></span>
159
+ <span class="line"><span> for c in charseq:</span></span>
160
+ <span class="line"><span> if c.isspace():</span></span>
161
+ <span class="line"><span> idseq += [char_to_id[&quot;&lt;space&gt;&quot;]]</span></span>
162
+ <span class="line"><span> elif c not in char_to_id.keys():</span></span>
163
+ <span class="line"><span> idseq += [char_to_id[&quot;&lt;unk&gt;&quot;]]</span></span>
164
+ <span class="line"><span> else:</span></span>
165
+ <span class="line"><span> idseq += [char_to_id[c]]</span></span>
166
+ <span class="line"><span> idseq += [idim - 1] # &lt;eos&gt;</span></span>
167
+ <span class="line"><span> return torch.LongTensor(idseq).view(-1).to(device)</span></span>
168
+ <span class="line"><span></span></span>
169
+ <span class="line"><span>import nltk</span></span>
170
+ <span class="line"><span>nltk.download(&#39;punkt&#39;)</span></span>
171
+ <span class="line"><span>print(&quot;Now ready to synthesize!&quot;)</span></span>
172
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div 
class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="synthesis" tabindex="-1"><a class="header-anchor" href="#synthesis"><span>Synthesis</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import time</span></span>
173
+ <span class="line"><span>print(&quot;Input your favorite sentence in English!&quot;)</span></span>
174
+ <span class="line"><span>input_text = input()</span></span>
175
+ <span class="line"><span>with torch.no_grad():</span></span>
176
+ <span class="line"><span> start = time.time()</span></span>
177
+ <span class="line"><span> x = frontend(input_text)</span></span>
178
+ <span class="line"><span> c, _, _ = model.inference(x, inference_args)</span></span>
179
+ <span class="line"><span> y = vocoder.inference(c)</span></span>
180
+ <span class="line"><span>rtf = (time.time() - start) / (len(y) / fs)</span></span>
181
+ <span class="line"><span>print(f&quot;RTF = {rtf:5f}&quot;)</span></span>
182
+ <span class="line"><span></span></span>
183
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
184
+ <span class="line"><span>display(Audio(y.view(-1).cpu().numpy(), rate=fs))</span></span>
185
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><hr><h2 id="japanese-demo" tabindex="-1"><a class="header-anchor" href="#japanese-demo"><span>Japanese demo</span></a></h2><h3 id="install-japanese-dependencies" tabindex="-1"><a class="header-anchor" href="#install-japanese-dependencies"><span>Install Japanese dependencies</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!pip install pyopenjtalk</span></span>
186
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="download-pretrained-models" tabindex="-1"><a class="header-anchor" href="#download-pretrained-models"><span>Download pretrained models</span></a></h3><p>Here we select Tacotron2 or Transformer. The vocoder model is Parallel WaveGAN.</p><h4 id="a-tacotron-2" tabindex="-1"><a class="header-anchor" href="#a-tacotron-2"><span>(a) Tacotron 2</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained models</span></span>
187
+ <span class="line"><span>import os</span></span>
188
+ <span class="line"><span>if not os.path.exists(&quot;downloads/jp/tacotron2&quot;):</span></span>
189
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \</span></span>
190
+ <span class="line"><span> https://drive.google.com/open?id=1OwrUQzAmvjj1x9cDhnZPp6dqtsEqGEJM downloads/jp/tacotron2 tar.gz</span></span>
191
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \</span></span>
192
+ <span class="line"><span> https://drive.google.com/open?id=1kp5M4VvmagDmYckFJa78WGqh1drb_P9t downloads/jp/tacotron2 tar.gz</span></span>
193
+ <span class="line"><span></span></span>
194
+ <span class="line"><span># set path</span></span>
195
+ <span class="line"><span>dict_path = &quot;downloads/jp/tacotron2/data/lang_1phn/train_no_dev_units.txt&quot;</span></span>
196
+ <span class="line"><span>model_path = &quot;downloads/jp/tacotron2/exp/train_no_dev_pytorch_train_pytorch_tacotron2_phn/results/model.last1.avg.best&quot;</span></span>
197
+ <span class="line"><span>vocoder_path = &quot;downloads/jp/tacotron2/jsut.parallel_wavegan.v1/checkpoint-400000steps.pkl&quot;</span></span>
198
+ <span class="line"><span></span></span>
199
+ <span class="line"><span>print(&quot;sucessfully finished download.&quot;)</span></span>
200
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="b-transformer-1" tabindex="-1"><a class="header-anchor" href="#b-transformer-1"><span>(b) Transformer</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained models</span></span>
201
+ <span class="line"><span>import os</span></span>
202
+ <span class="line"><span>if not os.path.exists(&quot;downloads/jp/transformer&quot;):</span></span>
203
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \</span></span>
204
+ <span class="line"><span> https://drive.google.com/open?id=1OwrUQzAmvjj1x9cDhnZPp6dqtsEqGEJM downloads/jp/transformer tar.gz</span></span>
205
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \</span></span>
206
+ <span class="line"><span> https://drive.google.com/open?id=1mEnZfBKqA4eT6Bn0eRZuP6lNzL-IL3VD downloads/jp/transformer tar.gz</span></span>
207
+ <span class="line"><span></span></span>
208
+ <span class="line"><span># set path</span></span>
209
+ <span class="line"><span>dict_path = &quot;downloads/jp/transformer/data/lang_1phn/train_no_dev_units.txt&quot;</span></span>
210
+ <span class="line"><span>model_path = &quot;downloads/jp/transformer/exp/train_no_dev_pytorch_train_pytorch_transformer_phn/results/model.last1.avg.best&quot;</span></span>
211
+ <span class="line"><span>vocoder_path = &quot;downloads/jp/transformer/jsut.parallel_wavegan.v1/checkpoint-400000steps.pkl&quot;</span></span>
212
+ <span class="line"><span></span></span>
213
+ <span class="line"><span>print(&quot;sucessfully finished download.&quot;)</span></span>
214
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="setup-1" tabindex="-1"><a class="header-anchor" href="#setup-1"><span>Setup</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># add path</span></span>
215
+ <span class="line"><span>import sys</span></span>
216
+ <span class="line"><span>sys.path.append(&quot;espnet&quot;)</span></span>
217
+ <span class="line"><span></span></span>
218
+ <span class="line"><span># define device</span></span>
219
+ <span class="line"><span>import torch</span></span>
220
+ <span class="line"><span>device = torch.device(&quot;cuda&quot;)</span></span>
221
+ <span class="line"><span></span></span>
222
+ <span class="line"><span># define E2E-TTS model</span></span>
223
+ <span class="line"><span>from argparse import Namespace</span></span>
224
+ <span class="line"><span>from espnet.asr.asr_utils import get_model_conf</span></span>
225
+ <span class="line"><span>from espnet.asr.asr_utils import torch_load</span></span>
226
+ <span class="line"><span>from espnet.utils.dynamic_import import dynamic_import</span></span>
227
+ <span class="line"><span>idim, odim, train_args = get_model_conf(model_path)</span></span>
228
+ <span class="line"><span>model_class = dynamic_import(train_args.model_module)</span></span>
229
+ <span class="line"><span>model = model_class(idim, odim, train_args)</span></span>
230
+ <span class="line"><span>torch_load(model_path, model)</span></span>
231
+ <span class="line"><span>model = model.eval().to(device)</span></span>
232
+ <span class="line"><span>inference_args = Namespace(**{&quot;threshold&quot;: 0.5, &quot;minlenratio&quot;: 0.0, &quot;maxlenratio&quot;: 10.0})</span></span>
233
+ <span class="line"><span></span></span>
234
+ <span class="line"><span># define neural vocoder</span></span>
235
+ <span class="line"><span>from parallel_wavegan.utils import load_model</span></span>
236
+ <span class="line"><span>fs = 24000</span></span>
237
+ <span class="line"><span>vocoder = load_model(vocoder_path)</span></span>
238
+ <span class="line"><span>vocoder.remove_weight_norm()</span></span>
239
+ <span class="line"><span>vocoder = vocoder.eval().to(device)</span></span>
240
+ <span class="line"><span></span></span>
241
+ <span class="line"><span># define text frontend</span></span>
242
+ <span class="line"><span>import pyopenjtalk</span></span>
243
+ <span class="line"><span>with open(dict_path) as f:</span></span>
244
+ <span class="line"><span> lines = f.readlines()</span></span>
245
+ <span class="line"><span>lines = [line.replace(&quot;\n&quot;, &quot;&quot;).split(&quot; &quot;) for line in lines]</span></span>
246
+ <span class="line"><span>char_to_id = {c: int(i) for c, i in lines}</span></span>
247
+ <span class="line"><span>def frontend(text):</span></span>
248
+ <span class="line"><span> &quot;&quot;&quot;Clean text and then convert to id sequence.&quot;&quot;&quot;</span></span>
249
+ <span class="line"><span> text = pyopenjtalk.g2p(text, kana=False)</span></span>
250
+ <span class="line"><span> print(f&quot;Cleaned text: {text}&quot;)</span></span>
251
+ <span class="line"><span> charseq = text.split(&quot; &quot;)</span></span>
252
+ <span class="line"><span> idseq = []</span></span>
253
+ <span class="line"><span> for c in charseq:</span></span>
254
+ <span class="line"><span> if c.isspace():</span></span>
255
+ <span class="line"><span> idseq += [char_to_id[&quot;&lt;space&gt;&quot;]]</span></span>
256
+ <span class="line"><span> elif c not in char_to_id.keys():</span></span>
257
+ <span class="line"><span> idseq += [char_to_id[&quot;&lt;unk&gt;&quot;]]</span></span>
258
+ <span class="line"><span> else:</span></span>
259
+ <span class="line"><span> idseq += [char_to_id[c]]</span></span>
260
+ <span class="line"><span> idseq += [idim - 1] # &lt;eos&gt;</span></span>
261
+ <span class="line"><span> return torch.LongTensor(idseq).view(-1).to(device)</span></span>
262
+ <span class="line"><span></span></span>
263
+ <span class="line"><span>frontend(&quot;初回の辞書のインストールが必要です&quot;)</span></span>
264
+ <span class="line"><span>print(&quot;Now ready to synthesize!&quot;)</span></span>
265
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="synthesis-1" tabindex="-1"><a class="header-anchor" href="#synthesis-1"><span>Synthesis</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span 
class="line"><span>import time</span></span>
266
+ <span class="line"><span>print(&quot;日本語で好きな文章を入力してください&quot;)</span></span>
267
+ <span class="line"><span>input_text = input()</span></span>
268
+ <span class="line"><span></span></span>
269
+ <span class="line"><span>with torch.no_grad():</span></span>
270
+ <span class="line"><span> start = time.time()</span></span>
271
+ <span class="line"><span> x = frontend(input_text)</span></span>
272
+ <span class="line"><span> c, _, _ = model.inference(x, inference_args)</span></span>
273
+ <span class="line"><span> y = vocoder.inference(c)</span></span>
274
+ <span class="line"><span>rtf = (time.time() - start) / (len(y) / fs)</span></span>
275
+ <span class="line"><span>print(f&quot;RTF = {rtf:5f}&quot;)</span></span>
276
+ <span class="line"><span></span></span>
277
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
278
+ <span class="line"><span>display(Audio(y.view(-1).cpu().numpy(), rate=fs))</span></span>
279
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><hr><h2 id="mandarin-demo" tabindex="-1"><a class="header-anchor" href="#mandarin-demo"><span>Mandarin demo</span></a></h2><p><strong>IMPORTANT NOTE</strong>: The author cannot understand Mandarin. The text front-end part might have some bugs.</p><h3 id="install-mandarin-dependencies" tabindex="-1"><a class="header-anchor" href="#install-mandarin-dependencies"><span>Install Mandarin dependencies</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>!pip install pypinyin</span></span>
280
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h3 id="download-pretrained-models-1" tabindex="-1"><a class="header-anchor" href="#download-pretrained-models-1"><span>Download pretrained models</span></a></h3><p>You can select Transformer or FastSpeech.</p><h4 id="a-transformer" tabindex="-1"><a class="header-anchor" href="#a-transformer"><span>(a) Transformer</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained models</span></span>
281
+ <span class="line"><span>import os</span></span>
282
+ <span class="line"><span>if not os.path.exists(&quot;downloads/zh/transformer&quot;):</span></span>
283
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \</span></span>
284
+ <span class="line"><span> https://drive.google.com/open?id=10M6H88jEUGbRWBmU1Ff2VaTmOAeL8CEy downloads/zh/transformer tar.gz</span></span>
285
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \</span></span>
286
+ <span class="line"><span> https://drive.google.com/open?id=1bTSygvonv5TS6-iuYsOIUWpN2atGnyhZ downloads/zh/transformer tar.gz</span></span>
287
+ <span class="line"><span></span></span>
288
+ <span class="line"><span># set path</span></span>
289
+ <span class="line"><span>dict_path = &quot;downloads/zh/transformer/data/lang_phn/train_no_dev_units.txt&quot;</span></span>
290
+ <span class="line"><span>model_path = &quot;downloads/zh/transformer/exp/train_no_dev_pytorch_train_pytorch_transformer.v1.single/results/model.last1.avg.best&quot;</span></span>
291
+ <span class="line"><span>vocoder_path = &quot;downloads/zh/transformer/csmsc.parallel_wavegan.v1/checkpoint-400000steps.pkl&quot;</span></span>
292
+ <span class="line"><span></span></span>
293
+ <span class="line"><span>print(&quot;sucessfully finished download.&quot;)</span></span>
294
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="b-fastspeech" tabindex="-1"><a class="header-anchor" href="#b-fastspeech"><span>(b) FastSpeech</span></a></h4><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># download pretrained models</span></span>
295
+ <span class="line"><span>import os</span></span>
296
+ <span class="line"><span>if not os.path.exists(&quot;downloads/zh/fastspeech&quot;):</span></span>
297
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \</span></span>
298
+ <span class="line"><span> https://drive.google.com/open?id=10M6H88jEUGbRWBmU1Ff2VaTmOAeL8CEy downloads/zh/fastspeech tar.gz</span></span>
299
+ <span class="line"><span> !./espnet/utils/download_from_google_drive.sh \</span></span>
300
+ <span class="line"><span> https://drive.google.com/open?id=1T8thxkAxjGFPXPWPTcKLvHnd6lG0-82R downloads/zh/fastspeech tar.gz </span></span>
301
+ <span class="line"><span></span></span>
302
+ <span class="line"><span># set path</span></span>
303
+ <span class="line"><span>dict_path = &quot;downloads/zh/fastspeech/data/lang_phn/train_no_dev_units.txt&quot;</span></span>
304
+ <span class="line"><span>model_path = &quot;downloads/zh/fastspeech/exp/train_no_dev_pytorch_train_fastspeech.v3.single/results/model.last1.avg.best&quot;</span></span>
305
+ <span class="line"><span>vocoder_path = &quot;downloads/zh/fastspeech/csmsc.parallel_wavegan.v1/checkpoint-400000steps.pkl&quot;</span></span>
306
+ <span class="line"><span></span></span>
307
+ <span class="line"><span>print(&quot;sucessfully finished download.&quot;)</span></span>
308
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h3 id="setup-2" tabindex="-1"><a class="header-anchor" href="#setup-2"><span>Setup</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span># add path</span></span>
309
+ <span class="line"><span>import sys</span></span>
310
+ <span class="line"><span>sys.path.append(&quot;espnet&quot;)</span></span>
311
+ <span class="line"><span></span></span>
312
+ <span class="line"><span># define device</span></span>
313
+ <span class="line"><span>import torch</span></span>
314
+ <span class="line"><span>device = torch.device(&quot;cuda&quot;)</span></span>
315
+ <span class="line"><span></span></span>
316
+ <span class="line"><span># define E2E-TTS model</span></span>
317
+ <span class="line"><span>from argparse import Namespace</span></span>
318
+ <span class="line"><span>from espnet.asr.asr_utils import get_model_conf</span></span>
319
+ <span class="line"><span>from espnet.asr.asr_utils import torch_load</span></span>
320
+ <span class="line"><span>from espnet.utils.dynamic_import import dynamic_import</span></span>
321
+ <span class="line"><span>idim, odim, train_args = get_model_conf(model_path)</span></span>
322
+ <span class="line"><span>model_class = dynamic_import(train_args.model_module)</span></span>
323
+ <span class="line"><span>model = model_class(idim, odim, train_args)</span></span>
324
+ <span class="line"><span>torch_load(model_path, model)</span></span>
325
+ <span class="line"><span>model = model.eval().to(device)</span></span>
326
+ <span class="line"><span>inference_args = Namespace(**{&quot;threshold&quot;: 0.5, &quot;minlenratio&quot;: 0.0, &quot;maxlenratio&quot;: 10.0})</span></span>
327
+ <span class="line"><span></span></span>
328
+ <span class="line"><span># define neural vocoder</span></span>
329
+ <span class="line"><span>from parallel_wavegan.utils import load_model</span></span>
330
+ <span class="line"><span>fs = 24000</span></span>
331
+ <span class="line"><span>vocoder = load_model(vocoder_path)</span></span>
332
+ <span class="line"><span>vocoder.remove_weight_norm()</span></span>
333
+ <span class="line"><span>vocoder = vocoder.eval().to(device)</span></span>
334
+ <span class="line"><span></span></span>
335
+ <span class="line"><span># define text frontend</span></span>
336
+ <span class="line"><span>from pypinyin import pinyin, Style</span></span>
337
+ <span class="line"><span>from pypinyin.style._utils import get_initials, get_finals</span></span>
338
+ <span class="line"><span>with open(dict_path) as f:</span></span>
339
+ <span class="line"><span> lines = f.readlines()</span></span>
340
+ <span class="line"><span>lines = [line.replace(&quot;\n&quot;, &quot;&quot;).split(&quot; &quot;) for line in lines]</span></span>
341
+ <span class="line"><span>char_to_id = {c: int(i) for c, i in lines}</span></span>
342
+ <span class="line"><span>def frontend(text):</span></span>
343
+ <span class="line"><span> &quot;&quot;&quot;Clean text and then convert to id sequence.&quot;&quot;&quot;</span></span>
344
+ <span class="line"><span> text = pinyin(text, style=Style.TONE3)</span></span>
345
+ <span class="line"><span> text = [c[0] for c in text]</span></span>
346
+ <span class="line"><span> print(f&quot;Cleaned text: {text}&quot;)</span></span>
347
+ <span class="line"><span> idseq = []</span></span>
348
+ <span class="line"><span> for x in text:</span></span>
349
+ <span class="line"><span> c_init = get_initials(x, strict=True)</span></span>
350
+ <span class="line"><span> c_final = get_finals(x, strict=True)</span></span>
351
+ <span class="line"><span> for c in [c_init, c_final]:</span></span>
352
+ <span class="line"><span> if len(c) == 0:</span></span>
353
+ <span class="line"><span> continue</span></span>
354
+ <span class="line"><span> c = c.replace(&quot;ü&quot;, &quot;v&quot;)</span></span>
355
+ <span class="line"><span> c = c.replace(&quot;ui&quot;, &quot;uei&quot;)</span></span>
356
+ <span class="line"><span> c = c.replace(&quot;un&quot;, &quot;uen&quot;)</span></span>
357
+ <span class="line"><span> c = c.replace(&quot;iu&quot;, &quot;iou&quot;)</span></span>
358
+ <span class="line"><span> # Special rule: &quot;e5n&quot; -&gt; &quot;en5&quot;</span></span>
359
+ <span class="line"><span> if &quot;5&quot; in c:</span></span>
360
+ <span class="line"><span> c = c.replace(&quot;5&quot;, &quot;&quot;) + &quot;5&quot;</span></span>
361
+ <span class="line"><span> if c not in char_to_id.keys():</span></span>
362
+ <span class="line"><span> print(f&quot;WARN: {c} is not included in dict.&quot;)</span></span>
363
+ <span class="line"><span> idseq += [char_to_id[&quot;&lt;unk&gt;&quot;]]</span></span>
364
+ <span class="line"><span> else:</span></span>
365
+ <span class="line"><span> idseq += [char_to_id[c]]</span></span>
366
+ <span class="line"><span> idseq += [idim - 1] # &lt;eos&gt;</span></span>
367
+ <span class="line"><span> return torch.LongTensor(idseq).view(-1).to(device)</span></span>
368
+ <span class="line"><span></span></span>
369
+ <span class="line"><span>print(&quot;now ready to synthesize!&quot;)</span></span>
370
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div 
class="line-number"></div></div></div><h3 id="synthesis-2" tabindex="-1"><a class="header-anchor" href="#synthesis-2"><span>Synthesis</span></a></h3><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>import time</span></span>
371
+ <span class="line"><span>print(&quot;請用中文輸入您喜歡的句子!&quot;)</span></span>
372
+ <span class="line"><span>input_text = input()</span></span>
373
+ <span class="line"><span></span></span>
374
+ <span class="line"><span>with torch.no_grad():</span></span>
375
+ <span class="line"><span> start = time.time()</span></span>
376
+ <span class="line"><span> x = frontend(input_text)</span></span>
377
+ <span class="line"><span> c, _, _ = model.inference(x, inference_args)</span></span>
378
+ <span class="line"><span> y = vocoder.inference(c)</span></span>
379
+ <span class="line"><span>rtf = (time.time() - start) / (len(y) / fs)</span></span>
380
+ <span class="line"><span>print(f&quot;RTF = {rtf:5f}&quot;)</span></span>
381
+ <span class="line"><span></span></span>
382
+ <span class="line"><span>from IPython.display import display, Audio</span></span>
383
+ <span class="line"><span>display(Audio(y.view(-1).cpu().numpy(), rate=fs))</span></span>
384
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><nav class="vp-page-nav" aria-label="page navigation"><a class="route-link prev" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><div class="hint"><span class="arrow left"></span> Prev</div><div class="link"><span>Text-to-Speech (Recipe)</span></div><!--]--></a><a class="route-link next" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><div class="hint">Next <span class="arrow right"></span></div><div class="link"><span>ESPnet2-TTS realtime demonstration</span></div><!--]--></a></nav><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
385
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
386
+ </body>
387
+ </html>
espnetez/asr/finetune_owsm.html ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>OWSM finetuning with custom dataset | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/finetune_owsm.html-ICOQYZj2.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link 
rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a 
class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span 
class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li 
class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 
1.002 0 0 0-.175.016a13.096 13.096 0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime 
demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition 
(Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" 
aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li 
class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading active">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link route-link-active sidebar-item active" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#data-preparation" aria-label="Data Preparation"><!--[--><!--[--><!--]--> Data Preparation <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#setup-training-configs-and-model" aria-label="Setup training configs and model"><!--[--><!--[--><!--]--> Setup training configs and model <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#wrap-with-espneteasydataset" aria-label="Wrap with ESPnetEasyDataset"><!--[--><!--[--><!--]--> Wrap with ESPnetEasyDataset <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#training" aria-label="Training"><!--[--><!--[--><!--]--> Training <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#inference" aria-label="Inference"><!--[--><!--[--><!--]--> Inference <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#results" aria-label="Results"><!--[--><!--[--><!--]--> Results <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><h1 id="owsm-finetuning-with-custom-dataset" tabindex="-1"><a class="header-anchor" href="#owsm-finetuning-with-custom-dataset"><span>OWSM finetuning with custom dataset</span></a></h1><p>This Jupyter notebook provides a step-by-step guide on using the ESPnetEasy module to finetune owsm model. 
In this demonstration, we will leverage the custom dataset to finetune an OWSM model for ASR task.</p><h2 id="data-preparation" tabindex="-1"><a class="header-anchor" href="#data-preparation"><span>Data Preparation</span></a></h2><p>For this tutorial, we assume that we have the custom dataset with 654 audio with the following directory structure:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>audio</span></span>
35
+ <span class="line"><span>├── 001 [420 entries exceeds filelimit, not opening dir]</span></span>
36
+ <span class="line"><span>└── 002 [234 entries exceeds filelimit, not opening dir]</span></span>
37
+ <span class="line"><span>transcription</span></span>
38
+ <span class="line"><span>└── owsm_v3.1</span></span>
39
+ <span class="line"><span> ├── 001.csv</span></span>
40
+ <span class="line"><span> └── 002.csv</span></span>
41
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>The csv files contains the audio path, text, and text_ctc data in Japanese. For example, the csv constains the following data:</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>audio/001/00014.wav,しゃべるたびに追いかけてくるんですけど,なんかしゃべるたびにおいかけてくるんですけど</span></span>
42
+ <span class="line"><span>audio/001/00015.wav,え、どうしよう,えどうしよう</span></span>
43
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> os</span></span>
44
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> glob </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> glob</span></span>
45
+ <span class="line"></span>
46
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> numpy </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> np</span></span>
47
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa</span></span>
48
+ <span class="line"></span>
49
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> torch</span></span>
50
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.s2t_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
51
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.layers.create_lora_adapter </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> create_lora_adapter</span></span>
52
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> espnetez </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> ez</span></span>
53
+ <span class="line"></span>
54
+ <span class="line"><span style="color:#6A9955;"># Define hyper parameters</span></span>
55
+ <span class="line"><span style="color:#D4D4D4;">DUMP_DIR = </span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;./dump&quot;</span></span>
56
+ <span class="line"><span style="color:#D4D4D4;">CSV_DIR = </span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;./transcription&quot;</span></span>
57
+ <span class="line"><span style="color:#D4D4D4;">EXP_DIR = </span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;./exp/finetune&quot;</span></span>
58
+ <span class="line"><span style="color:#D4D4D4;">STATS_DIR = </span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;./exp/stats_finetune&quot;</span></span>
59
+ <span class="line"></span>
60
+ <span class="line"><span style="color:#D4D4D4;">FINETUNE_MODEL = </span><span style="color:#CE9178;">&quot;espnet/owsm_v3.1_ebf&quot;</span></span>
61
+ <span class="line"><span style="color:#D4D4D4;">LORA_TARGET = [</span></span>
62
+ <span class="line"><span style="color:#CE9178;"> &quot;w_1&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;w_2&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;merge_proj&quot;</span></span>
63
+ <span class="line"><span style="color:#D4D4D4;">]</span></span>
64
+ <span class="line"><span style="color:#D4D4D4;">LANGUAGE = </span><span style="color:#CE9178;">&quot;jpn&quot;</span></span>
65
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="setup-training-configs-and-model" tabindex="-1"><a class="header-anchor" href="#setup-training-configs-and-model"><span>Setup training configs and model</span></a></h2><p>Since we are going to finetune an OWSM model for ASR task, we will use the tokenizer and TokenIDConverter of the OWSM model. We will also use the training config as the default parameter sets, and update them with the finetuning configuration.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">pretrained_model = Speech2Text.from_pretrained(</span></span>
66
+ <span class="line"><span style="color:#D4D4D4;"> FINETUNE_MODEL,</span></span>
67
+ <span class="line"><span style="color:#9CDCFE;"> category_sym</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;&lt;</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">LANGUAGE</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&gt;&quot;</span><span style="color:#D4D4D4;">,</span></span>
68
+ <span class="line"><span style="color:#9CDCFE;"> beam_size</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;">,</span></span>
69
+ <span class="line"><span style="color:#D4D4D4;">) </span><span style="color:#6A9955;"># Load model to extract configs.</span></span>
70
+ <span class="line"><span style="color:#D4D4D4;">pretrain_config = </span><span style="color:#DCDCAA;">vars</span><span style="color:#D4D4D4;">(pretrained_model.s2t_train_args)</span></span>
71
+ <span class="line"><span style="color:#D4D4D4;">tokenizer = pretrained_model.tokenizer</span></span>
72
+ <span class="line"><span style="color:#D4D4D4;">converter = pretrained_model.converter</span></span>
73
+ <span class="line"><span style="color:#C586C0;">del</span><span style="color:#D4D4D4;"> pretrained_model</span></span>
74
+ <span class="line"></span>
75
+ <span class="line"><span style="color:#D4D4D4;">finetune_config = ez.config.update_finetune_config(</span></span>
76
+ <span class="line"><span style="color:#CE9178;"> &#39;s2t&#39;</span><span style="color:#D4D4D4;">,</span></span>
77
+ <span class="line"><span style="color:#D4D4D4;"> pretrain_config,</span></span>
78
+ <span class="line"><span style="color:#569CD6;"> f</span><span style="color:#CE9178;">&quot;./config/finetune_with_lora.yaml&quot;</span></span>
79
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
80
+ <span class="line"></span>
81
+ <span class="line"><span style="color:#6A9955;"># define model loading function</span></span>
82
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> count_parameters</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">model</span><span style="color:#D4D4D4;">):</span></span>
83
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#DCDCAA;"> sum</span><span style="color:#D4D4D4;">(p.numel() </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> p </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> model.parameters() </span><span style="color:#C586C0;">if</span><span style="color:#D4D4D4;"> p.requires_grad)</span></span>
84
+ <span class="line"></span>
85
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> build_model_fn</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">args</span><span style="color:#D4D4D4;">):</span></span>
86
+ <span class="line"><span style="color:#D4D4D4;"> pretrained_model = Speech2Text.from_pretrained(</span></span>
87
+ <span class="line"><span style="color:#D4D4D4;"> FINETUNE_MODEL,</span></span>
88
+ <span class="line"><span style="color:#9CDCFE;"> category_sym</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;&lt;</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">LANGUAGE</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&gt;&quot;</span><span style="color:#D4D4D4;">,</span></span>
89
+ <span class="line"><span style="color:#9CDCFE;"> beam_size</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;">,</span></span>
90
+ <span class="line"><span style="color:#D4D4D4;"> )</span></span>
91
+ <span class="line"><span style="color:#D4D4D4;"> model = pretrained_model.s2t_model</span></span>
92
+ <span class="line"><span style="color:#D4D4D4;"> model.train()</span></span>
93
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&#39;Trainable parameters: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">count_parameters(model)</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&#39;</span><span style="color:#D4D4D4;">)</span></span>
94
+ <span class="line"><span style="color:#6A9955;"> # apply lora</span></span>
95
+ <span class="line"><span style="color:#D4D4D4;"> create_lora_adapter(model, </span><span style="color:#9CDCFE;">target_modules</span><span style="color:#D4D4D4;">=LORA_TARGET)</span></span>
96
+ <span class="line"><span style="color:#DCDCAA;"> print</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&#39;Trainable parameters after LORA: </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">count_parameters(model)</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&#39;</span><span style="color:#D4D4D4;">)</span></span>
97
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> model</span></span>
98
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="wrap-with-espneteasydataset" tabindex="-1"><a class="header-anchor" href="#wrap-with-espneteasydataset"><span>Wrap with ESPnetEasyDataset</span></a></h2><p>Before initiating the training process, it is crucial to adapt the dataset to the ESPnet format. The dataset class should output tokenized text and audio files in <code>np.array</code> format.</p><p>Then let&#39;s define the custom dataset class. The owsm finetuning requires <code>audio</code>, <code>text</code>, <code>text_prev</code> and <code>text_ctc</code> data. You can use your custom-defined dataset, huggingface <code>datasets</code> library, or <code>lhotse</code> library, or any other dataloader that you want to use.</p><p>When you try to use custom-defined dataset, you should define the <code>data_info</code> dictionary. 
It defines the mapping between the output of your model and the input of ESPnet models.</p><p><strong>Note</strong>:</p><ul><li>Currently we do not support the custom dataloader that feeds processed feature.</li></ul><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">LANGUAGE = </span></span>
99
+ <span class="line"><span style="color:#6A9955;"># custom dataset class</span></span>
100
+ <span class="line"><span style="color:#569CD6;">class</span><span style="color:#4EC9B0;"> CustomDataset</span><span style="color:#D4D4D4;">(</span><span style="color:#4EC9B0;">torch</span><span style="color:#D4D4D4;">.</span><span style="color:#4EC9B0;">utils</span><span style="color:#D4D4D4;">.</span><span style="color:#4EC9B0;">data</span><span style="color:#D4D4D4;">.</span><span style="color:#4EC9B0;">Dataset</span><span style="color:#D4D4D4;">):</span></span>
101
+ <span class="line"><span style="color:#569CD6;"> def</span><span style="color:#DCDCAA;"> __init__</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">self</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">data_list</span><span style="color:#D4D4D4;">):</span></span>
102
+ <span class="line"><span style="color:#6A9955;"> # data_list is a list of tuples (audio_path, text, text_ctc)</span></span>
103
+ <span class="line"><span style="color:#569CD6;"> self</span><span style="color:#D4D4D4;">.data = data_list</span></span>
104
+ <span class="line"></span>
105
+ <span class="line"><span style="color:#569CD6;"> def</span><span style="color:#DCDCAA;"> __len__</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">self</span><span style="color:#D4D4D4;">):</span></span>
106
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#DCDCAA;"> len</span><span style="color:#D4D4D4;">(</span><span style="color:#569CD6;">self</span><span style="color:#D4D4D4;">.data)</span></span>
107
+ <span class="line"></span>
108
+ <span class="line"><span style="color:#569CD6;"> def</span><span style="color:#DCDCAA;"> __getitem__</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">self</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">idx</span><span style="color:#D4D4D4;">):</span></span>
109
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#569CD6;"> self</span><span style="color:#D4D4D4;">._parse_single_data(</span><span style="color:#569CD6;">self</span><span style="color:#D4D4D4;">.data[idx])</span></span>
110
+ <span class="line"></span>
111
+ <span class="line"><span style="color:#569CD6;"> def</span><span style="color:#DCDCAA;"> _parse_single_data</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">self</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">d</span><span style="color:#D4D4D4;">):</span></span>
112
+ <span class="line"><span style="color:#D4D4D4;"> text = </span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;&lt;</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">LANGUAGE</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&gt;&lt;asr&gt;&lt;notimestamps&gt; </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">d[</span><span style="color:#CE9178;">&#39;transcript&#39;</span><span style="color:#D4D4D4;">]</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span></span>
113
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> {</span></span>
114
+ <span class="line"><span style="color:#CE9178;"> &quot;audio_path&quot;</span><span style="color:#D4D4D4;">: d[</span><span style="color:#CE9178;">&quot;audio_path&quot;</span><span style="color:#D4D4D4;">],</span></span>
115
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: text,</span></span>
116
+ <span class="line"><span style="color:#CE9178;"> &quot;text_prev&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#CE9178;">&quot;&lt;na&gt;&quot;</span><span style="color:#D4D4D4;">,</span></span>
117
+ <span class="line"><span style="color:#CE9178;"> &quot;text_ctc&quot;</span><span style="color:#D4D4D4;">: d[</span><span style="color:#CE9178;">&#39;text_ctc&#39;</span><span style="color:#D4D4D4;">],</span></span>
118
+ <span class="line"><span style="color:#D4D4D4;"> }</span></span>
119
+ <span class="line"></span>
120
+ <span class="line"></span>
121
+ <span class="line"><span style="color:#D4D4D4;">data_list = []</span></span>
122
+ <span class="line"><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> csv_file </span><span style="color:#C586C0;">in</span><span style="color:#DCDCAA;"> sorted</span><span style="color:#D4D4D4;">(glob(os.path.join(CSV_DIR, </span><span style="color:#CE9178;">&quot;*.csv&quot;</span><span style="color:#D4D4D4;">))):</span></span>
123
+ <span class="line"><span style="color:#C586C0;"> with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(csv_file, </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">encoding</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;utf-8&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
124
+ <span class="line"><span style="color:#D4D4D4;"> data_list += f.readlines()[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">:] </span><span style="color:#6A9955;"># skip header</span></span>
125
+ <span class="line"></span>
126
+ <span class="line"><span style="color:#D4D4D4;">validation_examples = </span><span style="color:#B5CEA8;">20</span></span>
127
+ <span class="line"><span style="color:#D4D4D4;">train_dataset = CustomDataset(data_list[:-validation_examples])</span></span>
128
+ <span class="line"><span style="color:#D4D4D4;">valid_dataset = CustomDataset(data_list[-validation_examples:])</span></span>
129
+ <span class="line"></span>
130
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> tokenize</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">text</span><span style="color:#D4D4D4;">):</span></span>
131
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> np.array(converter.tokens2ids(tokenizer.text2tokens(text)))</span></span>
132
+ <span class="line"></span>
133
+ <span class="line"><span style="color:#6A9955;"># The output of CustomDatasetInstance[idx] will converted to np.array</span></span>
134
+ <span class="line"><span style="color:#6A9955;"># with the functions defined in the data_info dictionary.</span></span>
135
+ <span class="line"><span style="color:#D4D4D4;">data_info = {</span></span>
136
+ <span class="line"><span style="color:#CE9178;"> &quot;speech&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: librosa.load(d[</span><span style="color:#CE9178;">&quot;audio_path&quot;</span><span style="color:#D4D4D4;">], </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">)[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">],</span></span>
137
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(d[</span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">]),</span></span>
138
+ <span class="line"><span style="color:#CE9178;"> &quot;text_prev&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(d[</span><span style="color:#CE9178;">&quot;text_prev&quot;</span><span style="color:#D4D4D4;">]),</span></span>
139
+ <span class="line"><span style="color:#CE9178;"> &quot;text_ctc&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(d[</span><span style="color:#CE9178;">&quot;text_ctc&quot;</span><span style="color:#D4D4D4;">]),</span></span>
140
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
141
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Or if you want to use <code>datasets</code> library or <code>lhotse</code> library:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># Datasets library</span></span>
142
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> datasets </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> load_dataset, Audio</span></span>
143
+ <span class="line"></span>
144
+ <span class="line"><span style="color:#D4D4D4;">train_dataset = load_dataset(</span><span style="color:#CE9178;">&quot;audiofolder&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">data_dir</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;/path/to/huggingface_dataset&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">split</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&#39;train[:-</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">validation_examples</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">]&#39;</span><span style="color:#D4D4D4;">)</span></span>
145
+ <span class="line"><span style="color:#D4D4D4;">valid_dataset = load_dataset(</span><span style="color:#CE9178;">&quot;audiofolder&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">data_dir</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;/path/to/huggingface_dataset&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">split</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&#39;train[-</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">validation_examples</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">:]&#39;</span><span style="color:#D4D4D4;">)</span></span>
146
+ <span class="line"><span style="color:#D4D4D4;">train_dataset = train_dataset.cast_column(</span><span style="color:#CE9178;">&quot;audio&quot;</span><span style="color:#D4D4D4;">, Audio(</span><span style="color:#9CDCFE;">sampling_rate</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">))</span></span>
147
+ <span class="line"><span style="color:#D4D4D4;">valid_dataset = valid_dataset.cast_column(</span><span style="color:#CE9178;">&quot;audio&quot;</span><span style="color:#D4D4D4;">, Audio(</span><span style="color:#9CDCFE;">sampling_rate</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">))</span></span>
148
+ <span class="line"><span style="color:#D4D4D4;">data_info = {</span></span>
149
+ <span class="line"><span style="color:#CE9178;"> &quot;speech&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: d[</span><span style="color:#CE9178;">&#39;audio&#39;</span><span style="color:#D4D4D4;">][</span><span style="color:#CE9178;">&#39;array&#39;</span><span style="color:#D4D4D4;">],</span></span>
150
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;&lt;</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">LANGUAGE</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&gt;&lt;asr&gt;&lt;notimestamps&gt; </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">d[</span><span style="color:#CE9178;">&#39;transcript&#39;</span><span style="color:#D4D4D4;">]</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">),</span></span>
151
+ <span class="line"><span style="color:#CE9178;"> &quot;text_prev&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(</span><span style="color:#CE9178;">&quot;&lt;na&gt;&quot;</span><span style="color:#D4D4D4;">),</span></span>
152
+ <span class="line"><span style="color:#CE9178;"> &quot;text_ctc&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(d[</span><span style="color:#CE9178;">&quot;text_ctc&quot;</span><span style="color:#D4D4D4;">]),</span></span>
153
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
154
+ <span class="line"></span>
155
+ <span class="line"><span style="color:#6A9955;"># Or lhotse library. The following code is from the official document.</span></span>
156
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> pathlib </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Path</span></span>
157
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> lhotse </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> CutSet</span></span>
158
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> lhotse.recipes </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> download_librispeech, prepare_librispeech</span></span>
159
+ <span class="line"></span>
160
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> load_audio</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">audio_path</span><span style="color:#D4D4D4;">):</span></span>
161
+ <span class="line"><span style="color:#D4D4D4;"> y, _ = librosa.load(audio_path, </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">)</span></span>
162
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> y</span></span>
163
+ <span class="line"></span>
164
+ <span class="line"><span style="color:#D4D4D4;">root_dir = Path(</span><span style="color:#CE9178;">&quot;data&quot;</span><span style="color:#D4D4D4;">)</span></span>
165
+ <span class="line"><span style="color:#D4D4D4;">tmp_dir = Path(</span><span style="color:#CE9178;">&quot;tmp&quot;</span><span style="color:#D4D4D4;">)</span></span>
166
+ <span class="line"><span style="color:#D4D4D4;">tmp_dir.mkdir(</span><span style="color:#9CDCFE;">exist_ok</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">True</span><span style="color:#D4D4D4;">)</span></span>
167
+ <span class="line"><span style="color:#D4D4D4;">num_jobs = os.cpu_count() - </span><span style="color:#B5CEA8;">1</span></span>
168
+ <span class="line"></span>
169
+ <span class="line"><span style="color:#D4D4D4;">libri_variant = </span><span style="color:#CE9178;">&quot;mini_librispeech&quot;</span></span>
170
+ <span class="line"><span style="color:#D4D4D4;">libri_root = download_librispeech(root_dir, </span><span style="color:#9CDCFE;">dataset_parts</span><span style="color:#D4D4D4;">=libri_variant)</span></span>
171
+ <span class="line"><span style="color:#D4D4D4;">libri = prepare_librispeech(</span></span>
172
+ <span class="line"><span style="color:#D4D4D4;"> libri_root, </span><span style="color:#9CDCFE;">dataset_parts</span><span style="color:#D4D4D4;">=libri_variant, </span><span style="color:#9CDCFE;">output_dir</span><span style="color:#D4D4D4;">=root_dir, </span><span style="color:#9CDCFE;">num_jobs</span><span style="color:#D4D4D4;">=num_jobs</span></span>
173
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
174
+ <span class="line"><span style="color:#D4D4D4;">train_dataset = CutSet.from_manifests(**libri[</span><span style="color:#CE9178;">&quot;train-clean-5&quot;</span><span style="color:#D4D4D4;">])</span></span>
175
+ <span class="line"><span style="color:#D4D4D4;">valid_dataset = CutSet.from_manifests(**libri[</span><span style="color:#CE9178;">&quot;dev-clean-2&quot;</span><span style="color:#D4D4D4;">])</span></span>
176
+ <span class="line"><span style="color:#D4D4D4;">data_info = {</span></span>
177
+ <span class="line"><span style="color:#CE9178;"> &quot;speech&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: load_audio(d.recording.sources[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].source),</span></span>
178
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(</span><span style="color:#569CD6;">f</span><span style="color:#CE9178;">&quot;&lt;</span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">LANGUAGE</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&gt;&lt;asr&gt;&lt;notimestamps&gt; </span><span style="color:#569CD6;">{</span><span style="color:#D4D4D4;">d.supervisions[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].text</span><span style="color:#569CD6;">}</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">),</span></span>
179
+ <span class="line"><span style="color:#CE9178;"> &quot;text_prev&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(</span><span style="color:#CE9178;">&quot;&lt;na&gt;&quot;</span><span style="color:#D4D4D4;">),</span></span>
180
+ <span class="line"><span style="color:#CE9178;"> &quot;text_ctc&quot;</span><span style="color:#D4D4D4;">: </span><span style="color:#569CD6;">lambda</span><span style="color:#9CDCFE;"> d</span><span style="color:#D4D4D4;">: tokenize(d.supervisions[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">].text),</span></span>
181
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
182
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>And finally you need to wrap your custom dataset with ESPnetEasyDataset.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># Convert into ESPnet-Easy dataset format</span></span>
183
+ <span class="line"><span style="color:#D4D4D4;">train_dataset = ez.dataset.ESPnetEasyDataset(train_dataset, </span><span style="color:#9CDCFE;">data_info</span><span style="color:#D4D4D4;">=data_info)</span></span>
184
+ <span class="line"><span style="color:#D4D4D4;">valid_dataset = ez.dataset.ESPnetEasyDataset(valid_dataset, </span><span style="color:#9CDCFE;">data_info</span><span style="color:#D4D4D4;">=data_info)</span></span>
185
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="training" tabindex="-1"><a class="header-anchor" href="#training"><span>Training</span></a></h2><p>While the configuration remains consistent with other notebooks, the instantiation arguments for the Trainer class differ in this case. As we have not generated dump files, we can disregard arguments related to dump files and directly provide the train/valid dataset classes.</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>trainer = Trainer(</span></span>
186
+ <span class="line"><span> ...</span></span>
187
+ <span class="line"><span> train_dataset=your_train_dataset_instance,</span></span>
188
+ <span class="line"><span> train_dataset=your_valid_dataset_instance,</span></span>
189
+ <span class="line"><span> ...</span></span>
190
+ <span class="line"><span>)</span></span>
191
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">trainer = ez.Trainer(</span></span>
192
+ <span class="line"><span style="color:#9CDCFE;"> task</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;s2t&#39;</span><span style="color:#D4D4D4;">,</span></span>
193
+ <span class="line"><span style="color:#9CDCFE;"> train_config</span><span style="color:#D4D4D4;">=finetune_config,</span></span>
194
+ <span class="line"><span style="color:#9CDCFE;"> train_dataset</span><span style="color:#D4D4D4;">=train_dataset,</span></span>
195
+ <span class="line"><span style="color:#9CDCFE;"> valid_dataset</span><span style="color:#D4D4D4;">=valid_dataset,</span></span>
196
+ <span class="line"><span style="color:#9CDCFE;"> build_model_fn</span><span style="color:#D4D4D4;">=build_model_fn, </span><span style="color:#6A9955;"># provide the pre-trained model</span></span>
197
+ <span class="line"><span style="color:#9CDCFE;"> data_info</span><span style="color:#D4D4D4;">=data_info,</span></span>
198
+ <span class="line"><span style="color:#9CDCFE;"> output_dir</span><span style="color:#D4D4D4;">=EXP_DIR,</span></span>
199
+ <span class="line"><span style="color:#9CDCFE;"> stats_dir</span><span style="color:#D4D4D4;">=STATS_DIR,</span></span>
200
+ <span class="line"><span style="color:#9CDCFE;"> ngpu</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span></span>
201
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
202
+ <span class="line"><span style="color:#D4D4D4;">trainer.collect_stats()</span></span>
203
+ <span class="line"><span style="color:#D4D4D4;">trainer.train()</span></span>
204
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="inference" tabindex="-1"><a class="header-anchor" href="#inference"><span>Inference</span></a></h2><p>When training is done, we can use the inference API to generate the transcription, but don&#39;t forget to apply lora before loading the model!</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">DEVICE = </span><span style="color:#CE9178;">&quot;cuda&quot;</span></span>
205
+ <span class="line"></span>
206
+ <span class="line"><span style="color:#D4D4D4;">model = Speech2Text.from_pretrained(</span></span>
207
+ <span class="line"><span style="color:#CE9178;"> &quot;espnet/owsm_v3.1_ebf&quot;</span><span style="color:#D4D4D4;">,</span></span>
208
+ <span class="line"><span style="color:#9CDCFE;"> category_sym</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;&lt;jpn&gt;&quot;</span><span style="color:#D4D4D4;">,</span></span>
209
+ <span class="line"><span style="color:#9CDCFE;"> beam_size</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">10</span><span style="color:#D4D4D4;">,</span></span>
210
+ <span class="line"><span style="color:#9CDCFE;"> device</span><span style="color:#D4D4D4;">=DEVICE</span></span>
211
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
212
+ <span class="line"><span style="color:#D4D4D4;">create_lora_adapter(model.s2t_model, </span><span style="color:#9CDCFE;">target_modules</span><span style="color:#D4D4D4;">=LORA_TARGET)</span></span>
213
+ <span class="line"><span style="color:#D4D4D4;">model.s2t_model.eval()</span></span>
214
+ <span class="line"><span style="color:#D4D4D4;">d = torch.load(</span><span style="color:#CE9178;">&quot;./exp/finetune/1epoch.pth&quot;</span><span style="color:#D4D4D4;">)</span></span>
215
+ <span class="line"><span style="color:#D4D4D4;">model.s2t_model.load_state_dict(d)</span></span>
216
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="results" tabindex="-1"><a class="header-anchor" href="#results"><span>Results</span></a></h2><p>As a result, the finetuned owsm-v3.1 could successfully transcribe the audio files.</p><p><strong>Example</strong></p><ul><li>before finetune: 出してこの時間二のどりを。</li><li>after finetune: ダンスでこの世界に彩りを。</li></ul></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><nav class="vp-page-nav" aria-label="page navigation"><a class="route-link prev" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><div class="hint"><span class="arrow left"></span> Prev</div><div class="link"><span>Sample demo for ESPnet-Easy!</span></div><!--]--></a><!----></nav><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
217
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
218
+ </body>
219
+ </html>
espnetez/asr/finetune_with_lora.html ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>Finetune Model with ESPnet-Easy | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/finetune_with_lora.html-3NfoQDOl.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link 
rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" as="script"><link rel="prefetch" href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a 
class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span 
class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 1.002 0 0 0-.175.016a13.096 13.096 
0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime 
demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition 
(Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" 
aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading active">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link route-link-active sidebar-item active" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#load-a-pretrained-model" aria-label="Load a pretrained model"><!--[--><!--[--><!--]--> Load a pretrained model <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#training" aria-label="Training"><!--[--><!--[--><!--]--> Training <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><h1 id="finetune-model-with-espnet-easy" tabindex="-1"><a class="header-anchor" href="#finetune-model-with-espnet-easy"><span>Finetune Model with ESPnet-Easy</span></a></h1><p>In this notebook, we will explore the process of finetuning a pretrained model using the Librispeech-100 dataset. We&#39;ll start by downloading a pretrained model from the Hugging Face model hub and apply Low-Rank Adaptation (LoRA) techniques to reduce the number of training parameters.</p><p>In this notebook, we assume that the dump files have been already created. 
If you need guidance on creating the dump files, you can refer to the <code>training.ipynb</code> notebook.</p><p>First, we need to install the <code>loralib</code> package.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">%pip install loralib</span></span>
35
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><p>As with the <code>training.ipynb</code> notebook, we need to provide a dictionary to specify the file path and type for each data.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">DUMP_DIR = </span><span style="color:#CE9178;">&quot;./dump/libri100&quot;</span></span>
36
+ <span class="line"><span style="color:#D4D4D4;">data_info = {</span></span>
37
+ <span class="line"><span style="color:#CE9178;"> &quot;speech&quot;</span><span style="color:#D4D4D4;">: [</span><span style="color:#CE9178;">&quot;wav.scp&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;sound&quot;</span><span style="color:#D4D4D4;">],</span></span>
38
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: [</span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">],</span></span>
39
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
40
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="load-a-pretrained-model" tabindex="-1"><a class="header-anchor" href="#load-a-pretrained-model"><span>Load a pretrained model</span></a></h2><p>In ESPnet-Easy, you have the flexibility to define a custom model using the <code>build_model_fn</code> method. Additionally, you can load a pretrained model when needed.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.asr_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
41
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.layers.create_lora_adapter </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> create_lora_adapter</span></span>
42
+ <span class="line"></span>
43
+ <span class="line"></span>
44
+ <span class="line"><span style="color:#569CD6;">def</span><span style="color:#DCDCAA;"> build_model_fn</span><span style="color:#D4D4D4;">(</span><span style="color:#9CDCFE;">args</span><span style="color:#D4D4D4;">):</span></span>
45
+ <span class="line"><span style="color:#D4D4D4;"> pretrained_model = Speech2Text.from_pretrained(</span><span style="color:#CE9178;">&#39;pyf98/librispeech_conformer_hop_length160&#39;</span><span style="color:#D4D4D4;">)</span></span>
46
+ <span class="line"><span style="color:#D4D4D4;"> model = pretrained_model.asr_model</span></span>
47
+ <span class="line"><span style="color:#D4D4D4;"> model.train()</span></span>
48
+ <span class="line"></span>
49
+ <span class="line"><span style="color:#6A9955;"> # apply lora</span></span>
50
+ <span class="line"><span style="color:#D4D4D4;"> create_lora_adapter(model, </span><span style="color:#9CDCFE;">target_modules</span><span style="color:#D4D4D4;">=[</span><span style="color:#CE9178;">&#39;linear_q&#39;</span><span style="color:#D4D4D4;">])</span></span>
51
+ <span class="line"><span style="color:#C586C0;"> return</span><span style="color:#D4D4D4;"> model</span></span>
52
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>When working with a pretrained model, the configuration is inherited from the model by default. To activate the LoRA model, it&#39;s essential to set the <code>use_lora</code> parameter to <code>True</code>. This configuration update can be easily achieved using the <code>update_finetune_config</code> method.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> espnetez </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> ez</span></span>
53
+ <span class="line"></span>
54
+ <span class="line"></span>
55
+ <span class="line"><span style="color:#D4D4D4;">pretrained_model = Speech2Text.from_pretrained(</span><span style="color:#CE9178;">&#39;pyf98/librispeech_conformer_hop_length160&#39;</span><span style="color:#D4D4D4;">)</span></span>
56
+ <span class="line"><span style="color:#D4D4D4;">pretrain_config = </span><span style="color:#DCDCAA;">vars</span><span style="color:#D4D4D4;">(pretrained_model.asr_train_args)</span></span>
57
+ <span class="line"><span style="color:#C586C0;">del</span><span style="color:#D4D4D4;"> pretrained_model</span></span>
58
+ <span class="line"></span>
59
+ <span class="line"><span style="color:#D4D4D4;">finetune_config = ez.config.update_finetune_config(</span></span>
60
+ <span class="line"><span style="color:#CE9178;"> &#39;asr&#39;</span><span style="color:#D4D4D4;">,</span></span>
61
+ <span class="line"><span style="color:#D4D4D4;"> pretrain_config,</span></span>
62
+ <span class="line"><span style="color:#CE9178;"> &#39;config/finetune_with_lora.yaml&#39;</span></span>
63
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
64
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="training" tabindex="-1"><a class="header-anchor" href="#training"><span>Training</span></a></h2><p>Finally, let&#39;s start training.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">EXP_DIR = </span><span style="color:#CE9178;">&quot;exp/finetune&quot;</span></span>
65
+ <span class="line"><span style="color:#D4D4D4;">STATS_DIR = </span><span style="color:#CE9178;">&quot;exp/stats_finetune&quot;</span></span>
66
+ <span class="line"></span>
67
+ <span class="line"><span style="color:#D4D4D4;">trainer = ez.Trainer(</span></span>
68
+ <span class="line"><span style="color:#9CDCFE;"> task</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;asr&#39;</span><span style="color:#D4D4D4;">,</span></span>
69
+ <span class="line"><span style="color:#9CDCFE;"> train_config</span><span style="color:#D4D4D4;">=finetune_config,</span></span>
70
+ <span class="line"><span style="color:#9CDCFE;"> train_dump_dir</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;dump/libri100/train&quot;</span><span style="color:#D4D4D4;">,</span></span>
71
+ <span class="line"><span style="color:#9CDCFE;"> valid_dump_dir</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;dump/libri100/dev&quot;</span><span style="color:#D4D4D4;">,</span></span>
72
+ <span class="line"><span style="color:#9CDCFE;"> build_model_fn</span><span style="color:#D4D4D4;">=build_model_fn, </span><span style="color:#6A9955;"># provide the pre-trained model</span></span>
73
+ <span class="line"><span style="color:#9CDCFE;"> data_info</span><span style="color:#D4D4D4;">=data_info,</span></span>
74
+ <span class="line"><span style="color:#9CDCFE;"> output_dir</span><span style="color:#D4D4D4;">=EXP_DIR,</span></span>
75
+ <span class="line"><span style="color:#9CDCFE;"> stats_dir</span><span style="color:#D4D4D4;">=STATS_DIR,</span></span>
76
+ <span class="line"><span style="color:#9CDCFE;"> ngpu</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span></span>
77
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
78
+ <span class="line"><span style="color:#D4D4D4;">trainer.collect_stats()</span></span>
79
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">trainer.train()</span></span>
80
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><nav class="vp-page-nav" aria-label="page navigation"><!----><a class="route-link next" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><div class="hint">Next <span class="arrow right"></span></div><div class="link"><span>Sample demo for ESPnet-Easy!</span></div><!--]--></a></nav><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
81
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
82
+ </body>
83
+ </html>
espnetez/asr/train.html ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>Sample demo for ESPnet-Easy! | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/train.html-BQ-t2Cs4.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link 
rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" as="script"><link rel="prefetch" href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/tacotron2.html-Ds1AKES7.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a 
class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span 
class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 1.002 0 0 0-.175.016a13.096 13.096 
0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime 
demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition 
(Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" 
aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading active">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link route-link-active sidebar-item active" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#data-preparation" aria-label="Data Preparation"><!--[--><!--[--><!--]--> Data Preparation <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#train-sentencepiece-model" aria-label="Train sentencepiece model"><!--[--><!--[--><!--]--> Train sentencepiece model <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#configure-training-process" aria-label="Configure Training Process"><!--[--><!--[--><!--]--> Configure Training Process <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#training" aria-label="Training"><!--[--><!--[--><!--]--> Training <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#inference" aria-label="Inference"><!--[--><!--[--><!--]--> Inference <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><li><a class="route-link sidebar-item" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><h1 id="sample-demo-for-espnet-easy" tabindex="-1"><a class="header-anchor" href="#sample-demo-for-espnet-easy"><span>Sample demo for ESPnet-Easy!</span></a></h1><p>In this notebook, we will demonstrate how to train an Automatic Speech Recognition (ASR) model using the Librispeech-100 dataset. The process in this notebook follows the same dataset preparation approach as the kaldi-style dataset. 
If you are interested in fine-tuning pretrained models, please refer to the libri100_finetune.ipynb file.</p><p>Before proceeding, please ensure that you have already downloaded the Librispeech-100 dataset from <a href="https://www.openslr.org/12" target="_blank" rel="noopener noreferrer">OpenSLR<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a> and have placed the data in a directory of your choice. In this notebook, we assume that you have stored the dataset in the <code>/hdd/dataset/</code> directory. If your dataset is located in a different directory, please make sure to replace <code>/hdd/dataset/</code> with the actual path to your dataset.</p><h2 id="data-preparation" tabindex="-1"><a class="header-anchor" href="#data-preparation"><span>Data Preparation</span></a></h2><p>This notebook follows the data preparation steps outlined in <code>asr.sh</code>. Initially, we will create a dump file to store information about the data, including the data ID, audio path, and transcriptions.</p><p>ESPnet-Easy supports various types of datasets, including:</p><ol><li><p>Dictionary-based dataset with the following structure:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">{</span></span>
35
+ <span class="line"><span style="color:#CE9178;"> &quot;data_id&quot;</span><span style="color:#D4D4D4;">: {</span></span>
36
+ <span class="line"><span style="color:#CE9178;"> &quot;speech&quot;</span><span style="color:#D4D4D4;">: path_to_speech_file,</span></span>
37
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: transcription</span></span>
38
+ <span class="line"><span style="color:#D4D4D4;"> }</span></span>
39
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
40
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div></li><li><p>List of datasets with the following structure:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">[</span></span>
41
+ <span class="line"><span style="color:#D4D4D4;"> {</span></span>
42
+ <span class="line"><span style="color:#CE9178;"> &quot;speech&quot;</span><span style="color:#D4D4D4;">: path_to_speech_file,</span></span>
43
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: transcription</span></span>
44
+ <span class="line"><span style="color:#D4D4D4;"> }</span></span>
45
+ <span class="line"><span style="color:#D4D4D4;">]</span></span>
46
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div></li></ol><p>If you choose to use a dictionary-based dataset, it&#39;s essential to ensure that each <code>data_id</code> is unique. ESPnet-Easy also accepts a dump file that may have already been created by <code>asr.sh</code>. However, in this notebook, we will create the dump file from scratch.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># Need to install espnet if you don&#39;t have it</span></span>
47
+ <span class="line"><span style="color:#D4D4D4;">%pip install -U espnet</span></span>
48
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>Now, let&#39;s create dump files!<br> Please note that you will need to provide a dictionary to specify the file path and type for each data. This dictionary should have the following format:</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">{</span></span>
49
+ <span class="line"><span style="color:#CE9178;"> &quot;data_name&quot;</span><span style="color:#D4D4D4;">: [</span><span style="color:#CE9178;">&quot;dump_file_name&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;dump_format&quot;</span><span style="color:#D4D4D4;">]</span></span>
50
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
51
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> os</span></span>
52
+ <span class="line"></span>
53
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> espnetez </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> ez</span></span>
54
+ <span class="line"></span>
55
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> local.data_prep </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> create_dataset</span></span>
56
+ <span class="line"></span>
57
+ <span class="line"></span>
58
+ <span class="line"><span style="color:#D4D4D4;">DUMP_DIR = </span><span style="color:#CE9178;">&quot;./dump/libri100&quot;</span></span>
59
+ <span class="line"><span style="color:#D4D4D4;">LIBRI_100_DIRS = [</span></span>
60
+ <span class="line"><span style="color:#D4D4D4;"> [</span><span style="color:#CE9178;">&quot;/hdd/database/librispeech-100/LibriSpeech/train-clean-100&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;train&quot;</span><span style="color:#D4D4D4;">],</span></span>
61
+ <span class="line"><span style="color:#D4D4D4;"> [</span><span style="color:#CE9178;">&quot;/hdd/database/librispeech-100/LibriSpeech/dev-clean&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;dev-clean&quot;</span><span style="color:#D4D4D4;">],</span></span>
62
+ <span class="line"><span style="color:#D4D4D4;"> [</span><span style="color:#CE9178;">&quot;/hdd/database/librispeech-100/LibriSpeech/dev-other&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;dev-other&quot;</span><span style="color:#D4D4D4;">],</span></span>
63
+ <span class="line"><span style="color:#D4D4D4;">]</span></span>
64
+ <span class="line"><span style="color:#D4D4D4;">data_info = {</span></span>
65
+ <span class="line"><span style="color:#CE9178;"> &quot;speech&quot;</span><span style="color:#D4D4D4;">: [</span><span style="color:#CE9178;">&quot;wav.scp&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;sound&quot;</span><span style="color:#D4D4D4;">],</span></span>
66
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: [</span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">],</span></span>
67
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
68
+ <span class="line"></span>
69
+ <span class="line"></span>
70
+ <span class="line"><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> d, n </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> LIBRI_100_DIRS:</span></span>
71
+ <span class="line"><span style="color:#D4D4D4;"> dump_dir = os.path.join(DUMP_DIR, n)</span></span>
72
+ <span class="line"><span style="color:#C586C0;"> if</span><span style="color:#569CD6;"> not</span><span style="color:#D4D4D4;"> os.path.exists(dump_dir):</span></span>
73
+ <span class="line"><span style="color:#D4D4D4;"> os.makedirs(dump_dir)</span></span>
74
+ <span class="line"></span>
75
+ <span class="line"><span style="color:#D4D4D4;"> dataset = create_dataset(d)</span></span>
76
+ <span class="line"><span style="color:#D4D4D4;"> ez.data.create_dump_file(dump_dir, dataset, data_info)</span></span>
77
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>For the validation files, you have two directories: <code>dev-clean</code> and <code>dev-other</code>. To create a unified dev dataset, you can use the <code>ez.data.join_dumps</code> function.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">ez.data.join_dumps(</span></span>
78
+ <span class="line"><span style="color:#D4D4D4;"> [</span><span style="color:#CE9178;">&quot;./dump/libri100/dev-clean&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;./dump/libri100/dev-other&quot;</span><span style="color:#D4D4D4;">], </span><span style="color:#CE9178;">&quot;./dump/libri100/dev&quot;</span></span>
79
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
80
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Now you have dataset files in the <code>dump</code> directory. It looks like this:</p><p>wav.scp</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>1255-138279-0008 /hdd/database/librispeech-100/LibriSpeech/dev-other/1255/138279/1255-138279-0008.flac</span></span>
81
+ <span class="line"><span>1255-138279-0022 /hdd/database/librispeech-100/LibriSpeech/dev-other/1255/138279/1255-138279-0022.flac</span></span>
82
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><p>text</p><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span>1255-138279-0008 TWO THREE</span></span>
83
+ <span class="line"><span>1255-138279-0022 IF I SAID SO OF COURSE I WILL</span></span>
84
+ <span class="line"><span></span></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="train-sentencepiece-model" tabindex="-1"><a class="header-anchor" href="#train-sentencepiece-model"><span>Train sentencepiece model</span></a></h2><p>To train a SentencePiece model, we require a text file for training. Let&#39;s begin by creating the training file.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># generate training texts from the training data</span></span>
85
+ <span class="line"><span style="color:#6A9955;"># you can select several datasets to train sentencepiece.</span></span>
86
+ <span class="line"><span style="color:#D4D4D4;">ez.preprocess.prepare_sentences([</span><span style="color:#CE9178;">&quot;dump/libri100/train/text&quot;</span><span style="color:#D4D4D4;">], </span><span style="color:#CE9178;">&quot;dump/spm&quot;</span><span style="color:#D4D4D4;">)</span></span>
87
+ <span class="line"></span>
88
+ <span class="line"><span style="color:#D4D4D4;">ez.preprocess.train_sentencepiece(</span></span>
89
+ <span class="line"><span style="color:#CE9178;"> &quot;dump/spm/train.txt&quot;</span><span style="color:#D4D4D4;">,</span></span>
90
+ <span class="line"><span style="color:#CE9178;"> &quot;data/bpemodel&quot;</span><span style="color:#D4D4D4;">,</span></span>
91
+ <span class="line"><span style="color:#9CDCFE;"> vocab_size</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">5000</span><span style="color:#D4D4D4;">,</span></span>
92
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
93
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="configure-training-process" tabindex="-1"><a class="header-anchor" href="#configure-training-process"><span>Configure Training Process</span></a></h2><p>For configuring the training process, you can utilize the configuration files already provided by ESPnet contributors. To use a configuration file, you&#39;ll need to create a YAML file on your local machine. For instance, you can use the <a href="train_asr_e-branchformer_size256_mlp1024_linear1024_e12_mactrue_edrop0.0_ddrop0.0.yaml">e-branchformer config</a>.</p><p>In my case, I&#39;ve made a modification to the <code>batch_bins</code> parameter, changing it from <code>16000000</code> to <code>1600000</code> to run training on my GPU (RTX2080ti).</p><h2 id="training" tabindex="-1"><a class="header-anchor" href="#training"><span>Training</span></a></h2><p>To prepare the stats file before training, you can execute the <code>collect_stats</code> method. This step is required before the training process and ensuring accurate statistics for the model.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> espnetez </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> ez</span></span>
94
+ <span class="line"></span>
95
+ <span class="line"><span style="color:#D4D4D4;">EXP_DIR = </span><span style="color:#CE9178;">&quot;exp/train_asr_branchformer_e24_amp&quot;</span></span>
96
+ <span class="line"><span style="color:#D4D4D4;">STATS_DIR = </span><span style="color:#CE9178;">&quot;exp/stats&quot;</span></span>
97
+ <span class="line"></span>
98
+ <span class="line"><span style="color:#6A9955;"># load config</span></span>
99
+ <span class="line"><span style="color:#D4D4D4;">training_config = ez.config.from_yaml(</span></span>
100
+ <span class="line"><span style="color:#CE9178;"> &quot;asr&quot;</span><span style="color:#D4D4D4;">,</span></span>
101
+ <span class="line"><span style="color:#CE9178;"> &quot;config/train_asr_e_branchformer_size256_mlp1024_linear1024_e12_mactrue_edrop0.0_ddrop0.0.yaml&quot;</span><span style="color:#D4D4D4;">,</span></span>
102
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
103
+ <span class="line"><span style="color:#D4D4D4;">preprocessor_config = ez.utils.load_yaml(</span><span style="color:#CE9178;">&quot;config/preprocess.yaml&quot;</span><span style="color:#D4D4D4;">)</span></span>
104
+ <span class="line"><span style="color:#D4D4D4;">training_config.update(preprocessor_config)</span></span>
105
+ <span class="line"></span>
106
+ <span class="line"><span style="color:#C586C0;">with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(preprocessor_config[</span><span style="color:#CE9178;">&quot;token_list&quot;</span><span style="color:#D4D4D4;">], </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
107
+ <span class="line"><span style="color:#D4D4D4;"> training_config[</span><span style="color:#CE9178;">&quot;token_list&quot;</span><span style="color:#D4D4D4;">] = [t.replace(</span><span style="color:#CE9178;">&quot;</span><span style="color:#D7BA7D;">\n</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> t </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> f.readlines()]</span></span>
108
+ <span class="line"></span>
109
+ <span class="line"><span style="color:#6A9955;"># Define the Trainer class</span></span>
110
+ <span class="line"><span style="color:#D4D4D4;">trainer = ez.Trainer(</span></span>
111
+ <span class="line"><span style="color:#9CDCFE;"> task</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;asr&#39;</span><span style="color:#D4D4D4;">,</span></span>
112
+ <span class="line"><span style="color:#9CDCFE;"> train_config</span><span style="color:#D4D4D4;">=training_config,</span></span>
113
+ <span class="line"><span style="color:#9CDCFE;"> train_dump_dir</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;dump/libri100/train&quot;</span><span style="color:#D4D4D4;">,</span></span>
114
+ <span class="line"><span style="color:#9CDCFE;"> valid_dump_dir</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;dump/libri100/dev&quot;</span><span style="color:#D4D4D4;">,</span></span>
115
+ <span class="line"><span style="color:#9CDCFE;"> data_info</span><span style="color:#D4D4D4;">=data_info,</span></span>
116
+ <span class="line"><span style="color:#9CDCFE;"> output_dir</span><span style="color:#D4D4D4;">=EXP_DIR,</span></span>
117
+ <span class="line"><span style="color:#9CDCFE;"> stats_dir</span><span style="color:#D4D4D4;">=STATS_DIR,</span></span>
118
+ <span class="line"><span style="color:#9CDCFE;"> ngpu</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
119
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
120
+ <span class="line"><span style="color:#D4D4D4;">trainer.collect_stats()</span></span>
121
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Finally, we are ready to begin the training process!</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">trainer.train()</span></span>
122
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h2 id="inference" tabindex="-1"><a class="header-anchor" href="#inference"><span>Inference</span></a></h2><p>You can just use the inference API of the ESPnet.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> librosa</span></span>
123
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.asr_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Speech2Text</span></span>
124
+ <span class="line"></span>
125
+ <span class="line"><span style="color:#D4D4D4;">m = Speech2Text(</span></span>
126
+ <span class="line"><span style="color:#CE9178;"> &quot;./exp/train_asr_branchformer_e24_amp/config.yaml&quot;</span><span style="color:#D4D4D4;">,</span></span>
127
+ <span class="line"><span style="color:#CE9178;"> &quot;./exp/train_asr_branchformer_e24_amp/valid.acc.best.pth&quot;</span><span style="color:#D4D4D4;">,</span></span>
128
+ <span class="line"><span style="color:#9CDCFE;"> beam_size</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">10</span></span>
129
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
130
+ <span class="line"></span>
131
+ <span class="line"><span style="color:#C586C0;">with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;./dump/libri100/dev/wav.scp&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
132
+ <span class="line"><span style="color:#D4D4D4;"> sample_path = f.readlines()[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">]</span></span>
133
+ <span class="line"><span style="color:#D4D4D4;"> </span></span>
134
+ <span class="line"><span style="color:#D4D4D4;">y, sr = librosa.load(sample_path.split()[</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">], </span><span style="color:#9CDCFE;">sr</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">, </span><span style="color:#9CDCFE;">mono</span><span style="color:#D4D4D4;">=</span><span style="color:#569CD6;">True</span><span style="color:#D4D4D4;">)</span></span>
135
+ <span class="line"><span style="color:#D4D4D4;">output = m(y)</span></span>
136
+ <span class="line"><span style="color:#DCDCAA;">print</span><span style="color:#D4D4D4;">(output[</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">][</span><span style="color:#B5CEA8;">0</span><span style="color:#D4D4D4;">])</span></span>
137
+ <span class="line"></span>
138
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><nav class="vp-page-nav" aria-label="page navigation"><a class="route-link prev" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><div class="hint"><span class="arrow left"></span> Prev</div><div class="link"><span>Finetune Model with ESPnet-Easy</span></div><!--]--></a><a class="route-link next" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><div class="hint">Next <span class="arrow right"></span></div><div class="link"><span>OWSM finetuning with custom dataset</span></div><!--]--></a></nav><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
139
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
140
+ </body>
141
+ </html>
espnetez/tts/tacotron2.html ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en-US">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <meta name="generator" content="VuePress 2.0.0-rc.9" />
7
+ <style>
8
+ :root {
9
+ --c-bg: #fff;
10
+ }
11
+ html.dark {
12
+ --c-bg: #22272e;
13
+ }
14
+ html,
15
+ body {
16
+ background-color: var(--c-bg);
17
+ }
18
+ </style>
19
+ <script>
20
+ const userMode = localStorage.getItem('vuepress-color-scheme')
21
+ const systemDarkMode =
22
+ window.matchMedia &&
23
+ window.matchMedia('(prefers-color-scheme: dark)').matches
24
+ if (userMode === 'dark' || (userMode !== 'light' && systemDarkMode)) {
25
+ document.documentElement.classList.toggle('dark', true)
26
+ }
27
+ </script>
28
+ <link rel="manifest" href="/manifest.webmanifest"><meta name="application-name" content="Example"><meta name="apple-mobile-web-app-title" content="Example"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="msapplication-TileColor" content="#3eaf7c"><meta name="theme-color" content="#3eaf7c"><title>TTS demo for ESPnet-Easy! | </title><meta name="description" content=" ">
29
+ <link rel="preload" href="/assets/style-SNWc1iKP.css" as="style"><link rel="stylesheet" href="/assets/style-SNWc1iKP.css">
30
+ <link rel="modulepreload" href="/assets/app-DTS6SjJz.js"><link rel="modulepreload" href="/assets/tacotron2.html-Ds1AKES7.js">
31
+ <link rel="prefetch" href="/assets/index.html-DGcx4T0I.js" as="script"><link rel="prefetch" href="/assets/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html-CJ8-yKXK.js" as="script"><link rel="prefetch" href="/assets/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html-BDY4p1H1.js" as="script"><link rel="prefetch" href="/assets/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html-DDjiuGQB.js" as="script"><link rel="prefetch" href="/assets/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html-DnIftUJK.js" as="script"><link rel="prefetch" href="/assets/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html-D9GsoT-_.js" as="script"><link rel="prefetch" href="/assets/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html-DobzmqH0.js" as="script"><link rel="prefetch" href="/assets/espnet2_tutorial_2021_CMU_11751_18781.html-BY6Z52B4.js" as="script"><link rel="prefetch" href="/assets/asr_cli.html-BA-xBrC-.js" as="script"><link rel="prefetch" href="/assets/asr_library.html-rEQwKTMV.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_realtime_demo.html-BnK1Wovv.js" as="script"><link rel="prefetch" href="/assets/espnet2_asr_transfer_learning_demo.html-DJeSoTyY.js" as="script"><link rel="prefetch" href="/assets/espnet2_streaming_asr_demo.html-Yx-OX8AZ.js" as="script"><link rel="prefetch" href="/assets/espnet_se_demonstration_for_waspaa_2021.html--JdDNbEo.js" as="script"><link rel="prefetch" href="/assets/se_demo.html-DY-mv2y8.js" as="script"><link rel="prefetch" href="/assets/onnx_conversion_demo.html-D56NEMop.js" as="script"><link rel="prefetch" href="/assets/pretrained.html-JpE__EKJ.js" as="script"><link rel="prefetch" href="/assets/espnet2_2pass_slu_demo.html-BmvJ92Ni.js" as="script"><link rel="prefetch" href="/assets/st_demo.html-WLzB4ZGO.js" as="script"><link rel="prefetch" href="/assets/espnet2_tts_realtime_demo.html-BdxLBr1c.js" as="script"><link rel="prefetch" href="/assets/tts_cli.html-BfB21gs4.js" as="script"><link 
rel="prefetch" href="/assets/tts_realtime_demo.html-BKOGq7as.js" as="script"><link rel="prefetch" href="/assets/finetune_owsm.html-ICOQYZj2.js" as="script"><link rel="prefetch" href="/assets/finetune_with_lora.html-3NfoQDOl.js" as="script"><link rel="prefetch" href="/assets/train.html-BQ-t2Cs4.js" as="script"><link rel="prefetch" href="/assets/404.html-DN7291h8.js" as="script"><link rel="prefetch" href="/assets/NpmBadge-rh9tvaXX.js" as="script">
32
+ </head>
33
+ <body>
34
+ <div id="app"><!--[--><div class="theme-container"><!--[--><header class="navbar"><div class="toggle-sidebar-button" title="toggle sidebar" aria-expanded="false" role="button" tabindex="0"><div class="icon" aria-hidden="true"><span></span><span></span><span></span></div></div><span><a class="route-link" href="/"><img class="logo" src="/images/espnet_logo1.png" alt=" "><span class="site-name can-hide" aria-hidden="true"> </span></a></span><div class="navbar-items-wrapper" style=""><!--[--><!--]--><nav class="navbar-items can-hide" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 
71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" 
href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a 
class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition (Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span 
class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><button class="toggle-color-mode-button" title="toggle color mode"><svg style="" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M16 12.005a4 4 0 1 1-4 4a4.005 4.005 0 0 1 4-4m0-2a6 6 0 1 0 6 6a6 6 0 0 0-6-6z" fill="currentColor"></path><path d="M5.394 6.813l1.414-1.415l3.506 3.506L8.9 10.318z" fill="currentColor"></path><path d="M2 15.005h5v2H2z" fill="currentColor"></path><path d="M5.394 25.197L8.9 21.691l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 25.005h2v5h-2z" fill="currentColor"></path><path d="M21.687 23.106l1.414-1.415l3.506 3.506l-1.414 1.414z" fill="currentColor"></path><path d="M25 15.005h5v2h-5z" fill="currentColor"></path><path d="M21.687 8.904l3.506-3.506l1.414 1.415l-3.506 3.505z" fill="currentColor"></path><path d="M15 2.005h2v5h-2z" fill="currentColor"></path></svg><svg style="display:none;" class="icon" focusable="false" viewBox="0 0 32 32"><path d="M13.502 5.414a15.075 15.075 0 0 0 11.594 18.194a11.113 11.113 0 0 1-7.975 3.39c-.138 0-.278.005-.418 0a11.094 11.094 0 0 1-3.2-21.584M14.98 3a1.002 1.002 0 0 0-.175.016a13.096 13.096 
0 0 0 1.825 25.981c.164.006.328 0 .49 0a13.072 13.072 0 0 0 10.703-5.555a1.01 1.01 0 0 0-.783-1.565A13.08 13.08 0 0 1 15.89 4.38A1.015 1.015 0 0 0 14.98 3z" fill="currentColor"></path></svg></button><form class="search-box" role="search"><input type="search" placeholder="Search" autocomplete="off" spellcheck="false" value><!----></form></div></header><!--]--><div class="sidebar-mask"></div><!--[--><aside class="sidebar"><nav class="navbar-items" aria-label="site navigation"><!--[--><div class="navbar-item"><a class="external-link" href="https://github.com/espnet/espnet" rel="noopener noreferrer" target="_blank" aria-label="Github"><!--[--><!--[--><!--]--> Github <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://huggingface.co/espnet" rel="noopener noreferrer" target="_blank" aria-label="HuggingFace"><!--[--><!--[--><!--]--> HuggingFace <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new 
window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><a class="external-link" href="https://espnet.github.io/espnet" rel="noopener noreferrer" target="_blank" aria-label="Docs"><!--[--><!--[--><!--]--> Docs <span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span><!--[--><!--]--><!--]--></a></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="espnet2"><span class="title">espnet2</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_cli.html" aria-label="Text-to-Speech (Recipe)"><!--[--><!--[--><!--]--> Text-to-Speech (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/tts_realtime_demo.html" aria-label="ESPnet real time E2E-TTS demonstration"><!--[--><!--[--><!--]--> ESPnet real time E2E-TTS demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/tts/espnet2_tts_realtime_demo.html" aria-label="ESPnet2-TTS realtime 
demonstration"><!--[--><!--[--><!--]--> ESPnet2-TTS realtime demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SE</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/se_demo.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/se/espnet_se_demonstration_for_waspaa_2021.html" aria-label="ESPnet Speech Enhancement Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Enhancement Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>SLU</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/slu/espnet2_2pass_slu_demo.html" aria-label="ESPNET 2 pass SLU Demonstration"><!--[--><!--[--><!--]--> ESPNET 2 pass SLU Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_cli.html" aria-label="Speech Recognition (Recipe)"><!--[--><!--[--><!--]--> Speech Recognition (Recipe) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_realtime_demo.html" aria-label="ESPnet2-ASR realtime demonstration"><!--[--><!--[--><!--]--> ESPnet2-ASR realtime demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/asr_library.html" aria-label="Speech Recognition 
(Library)"><!--[--><!--[--><!--]--> Speech Recognition (Library) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_asr_transfer_learning_demo.html" aria-label="Use transfer learning for ASR in ESPnet2"><!--[--><!--[--><!--]--> Use transfer learning for ASR in ESPnet2 <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/asr/espnet2_streaming_asr_demo.html" aria-label="ESPnet2 real streaming Transformer demonstration"><!--[--><!--[--><!--]--> ESPnet2 real streaming Transformer demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>OTHERS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/onnx_conversion_demo.html" aria-label="espnet_onnx demonstration"><!--[--><!--[--><!--]--> espnet_onnx demonstration <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/others/pretrained.html" aria-label="Pretrained Model"><!--[--><!--[--><!--]--> Pretrained Model <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ST</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnet2/st/st_demo.html" aria-label="ESPnet Speech Translation Demonstration"><!--[--><!--[--><!--]--> ESPnet Speech Translation Demonstration <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="espnetez"><span class="title">espnetez</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" 
aria-label="espnetez"><span class="title">espnetez</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>TTS</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link route-link-active" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><li class="navbar-dropdown-item"><!--[--><h4 class="navbar-dropdown-subtitle"><span>ASR</span></h4><ul class="navbar-dropdown-subitem-wrapper"><!--[--><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-subitem"><a class="route-link" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a></li><!--]--></ul><!--]--></li><!--]--></ul></div></div><div class="navbar-item"><div class="navbar-dropdown-wrapper"><button class="navbar-dropdown-title" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="arrow down"></span></button><button class="navbar-dropdown-title-mobile" type="button" aria-label="tutorials"><span class="title">tutorials</span><span class="right arrow"></span></button><ul style="display:none;" class="navbar-dropdown"><!--[--><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_tutorial_2021_CMU_11751_18781.html" aria-label="CMU 11751/18781 2021: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 2021: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpokenLanguageUnderstanding_CMU_11492_692_Spring2023(Assignment6).html" aria-label="CMU 11492/11692 Spring 2023: Spoken Language Understanding"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Spoken Language Understanding <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/SpeechEnhancement_CMU_11492_692_Spring2023(Assignment7).html" aria-label="CMU 11492/11692 Spring 2023: Speech Enhancement"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Speech Enhancement <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/DataPreparation_CMU_11492_692_Spring2023(Assignment0).html" aria-label="CMU 11492/11692 Spring 2023: Data preparation"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Data preparation <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a 
class="route-link" href="/tutorials/espnet2_new_task_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task)"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial2 (New task) <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/espnet2_recipe_tutorial_CMU_11751_18781_Fall2022.html" aria-label="CMU 11751/18781 Fall 2022: ESPnet Tutorial"><!--[--><!--[--><!--]--> CMU 11751/18781 Fall 2022: ESPnet Tutorial <!--[--><!--]--><!--]--></a></li><li class="navbar-dropdown-item"><a class="route-link" href="/tutorials/TextToSpeech_CMU_11492_692_Spring2023(Assignment8).html" aria-label="CMU 11492/11692 Spring 2023: Text to Speech"><!--[--><!--[--><!--]--> CMU 11492/11692 Spring 2023: Text to Speech <!--[--><!--]--><!--]--></a></li><!--]--></ul></div></div><!--]--></nav><!--[--><!--]--><ul class="sidebar-items"><!--[--><li><p tabindex="0" class="sidebar-item sidebar-heading active">TTS <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link route-link-active sidebar-item active" href="/espnetez/tts/tacotron2.html" aria-label="TTS demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> TTS demo for ESPnet-Easy! 
<!--[--><!--]--><!--]--></a><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="#data-preparation" aria-label="Data preparation"><!--[--><!--[--><!--]--> Data preparation <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#generate-token-list" aria-label="Generate token list"><!--[--><!--[--><!--]--> Generate token list <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#training" aria-label="Training"><!--[--><!--[--><!--]--> Training <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="#inference" aria-label="Inference"><!--[--><!--[--><!--]--> Inference <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul></li><li><p tabindex="0" class="sidebar-item sidebar-heading">ASR <!----></p><ul style="" class="sidebar-item-children"><!--[--><li><a class="route-link sidebar-item" href="/espnetez/asr/finetune_with_lora.html" aria-label="Finetune Model with ESPnet-Easy"><!--[--><!--[--><!--]--> Finetune Model with ESPnet-Easy <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnetez/asr/train.html" aria-label="Sample demo for ESPnet-Easy!"><!--[--><!--[--><!--]--> Sample demo for ESPnet-Easy! <!--[--><!--]--><!--]--></a><!----></li><li><a class="route-link sidebar-item" href="/espnetez/asr/finetune_owsm.html" aria-label="OWSM finetuning with custom dataset"><!--[--><!--[--><!--]--> OWSM finetuning with custom dataset <!--[--><!--]--><!--]--></a><!----></li><!--]--></ul></li><!--]--></ul><!--[--><!--]--></aside><!--]--><!--[--><main class="page"><!--[--><!--]--><div class="theme-default-content"><!--[--><!--]--><div><h1 id="tts-demo-for-espnet-easy" tabindex="-1"><a class="header-anchor" href="#tts-demo-for-espnet-easy"><span>TTS demo for ESPnet-Easy!</span></a></h1><p>In this notebook, we will demonstrate how to train an Text to Speech (TTS) model using the LJSpeech dataset. 
Basic flow of data preparation and training is the same with ASR.</p><p>Before proceeding, please ensure that you have already downloaded the LJSpeech dataset from <a href="https://keithito.com/LJ-Speech-Dataset/" target="_blank" rel="noopener noreferrer">here<span><svg class="external-link-icon" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" focusable="false" x="0px" y="0px" viewBox="0 0 100 100" width="15" height="15"><path fill="currentColor" d="M18.8,85.1h56l0,0c2.2,0,4-1.8,4-4v-32h-8v28h-48v-48h28v-8h-32l0,0c-2.2,0-4,1.8-4,4v56C14.8,83.3,16.6,85.1,18.8,85.1z"></path><polygon fill="currentColor" points="45.7,48.7 51.3,54.3 77.2,28.5 77.2,37.2 85.2,37.2 85.2,14.9 62.8,14.9 62.8,22.9 71.5,22.9"></polygon></svg><span class="external-link-icon-sr-only">open in new window</span></span></a> and have placed the data in a directory of your choice. In this notebook, we assume that you have stored the dataset in the <code>/hdd/dataset/</code> directory. If your dataset is located in a different directory, please make sure to replace <code>/hdd/dataset/</code> with the actual path to your dataset.</p><h2 id="data-preparation" tabindex="-1"><a class="header-anchor" href="#data-preparation"><span>Data preparation</span></a></h2><p>First, let&#39;s create dump files!<br> The format of the dump files is the same as the ASR dump files.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">{</span></span>
35
+ <span class="line"><span style="color:#CE9178;"> &quot;data_name&quot;</span><span style="color:#D4D4D4;">: [</span><span style="color:#CE9178;">&quot;dump_file_name&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;dump_format&quot;</span><span style="color:#D4D4D4;">]</span></span>
36
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
37
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> os</span></span>
38
+ <span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> espnetez </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> ez</span></span>
39
+ <span class="line"></span>
40
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> local.data_prep </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> get_dataset</span></span>
41
+ <span class="line"></span>
42
+ <span class="line"></span>
43
+ <span class="line"><span style="color:#D4D4D4;">DUMP_DIR = </span><span style="color:#CE9178;">&quot;./dump/ljspeech&quot;</span></span>
44
+ <span class="line"><span style="color:#D4D4D4;">LJS_DIRS = </span><span style="color:#CE9178;">&quot;/hdd/database/LJSpeech-1.1&quot;</span></span>
45
+ <span class="line"><span style="color:#D4D4D4;">data_info = {</span></span>
46
+ <span class="line"><span style="color:#CE9178;"> &quot;speech&quot;</span><span style="color:#D4D4D4;">: [</span><span style="color:#CE9178;">&quot;wav.scp&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;sound&quot;</span><span style="color:#D4D4D4;">],</span></span>
47
+ <span class="line"><span style="color:#CE9178;"> &quot;text&quot;</span><span style="color:#D4D4D4;">: [</span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;text&quot;</span><span style="color:#D4D4D4;">],</span></span>
48
+ <span class="line"><span style="color:#D4D4D4;">}</span></span>
49
+ <span class="line"></span>
50
+ <span class="line"><span style="color:#D4D4D4;">train_dataset, test_dataset = get_dataset(LJS_DIRS)</span></span>
51
+ <span class="line"></span>
52
+ <span class="line"><span style="color:#D4D4D4;">train_dir = os.path.join(DUMP_DIR, </span><span style="color:#CE9178;">&quot;train&quot;</span><span style="color:#D4D4D4;">)</span></span>
53
+ <span class="line"><span style="color:#D4D4D4;">test_dir = os.path.join(DUMP_DIR, </span><span style="color:#CE9178;">&quot;test&quot;</span><span style="color:#D4D4D4;">)</span></span>
54
+ <span class="line"></span>
55
+ <span class="line"><span style="color:#D4D4D4;">ez.data.create_dump_file(train_dir, train_dataset, data_info)</span></span>
56
+ <span class="line"><span style="color:#D4D4D4;">ez.data.create_dump_file(test_dir, test_dataset, data_info)</span></span>
57
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="generate-token-list" tabindex="-1"><a class="header-anchor" href="#generate-token-list"><span>Generate token list</span></a></h2><p>To generate a token list, we need to run <code>espnet2.bin.tokenize_text</code> script. ESPnet-Easy has a wrapper function for this script.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#6A9955;"># generate training texts from the training data</span></span>
58
+ <span class="line"><span style="color:#6A9955;"># you can select several datasets to train sentencepiece.</span></span>
59
+ <span class="line"><span style="color:#D4D4D4;">ez.preprocess.prepare_sentences([</span><span style="color:#CE9178;">&quot;dump/ljspeech/train/text&quot;</span><span style="color:#D4D4D4;">], </span><span style="color:#CE9178;">&quot;data/&quot;</span><span style="color:#D4D4D4;">)</span></span>
60
+ <span class="line"><span style="color:#D4D4D4;">ez.preprocess.tokenize(</span></span>
61
+ <span class="line"><span style="color:#9CDCFE;"> input</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;data/train.txt&quot;</span><span style="color:#D4D4D4;">,</span></span>
62
+ <span class="line"><span style="color:#9CDCFE;"> output</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;data/tokenized.txt&quot;</span><span style="color:#D4D4D4;">,</span></span>
63
+ <span class="line"><span style="color:#9CDCFE;"> token_type</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;phn&quot;</span><span style="color:#D4D4D4;">,</span></span>
64
+ <span class="line"><span style="color:#9CDCFE;"> cleaner</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;tacotron&quot;</span><span style="color:#D4D4D4;">,</span></span>
65
+ <span class="line"><span style="color:#9CDCFE;"> g2p</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;g2p_en&quot;</span></span>
66
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
67
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h2 id="training" tabindex="-1"><a class="header-anchor" href="#training"><span>Training</span></a></h2><p>To prepare the stats file before training, you can execute the <code>collect_stats</code> method. This step is required before the training process and ensuring accurate statistics for the model.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">EXP_DIR = </span><span style="color:#CE9178;">&quot;exp/train_tts&quot;</span></span>
68
+ <span class="line"><span style="color:#D4D4D4;">STATS_DIR = </span><span style="color:#CE9178;">&quot;exp/stats&quot;</span></span>
69
+ <span class="line"></span>
70
+ <span class="line"><span style="color:#6A9955;"># load config</span></span>
71
+ <span class="line"><span style="color:#D4D4D4;">training_config = ez.config.from_yaml(</span></span>
72
+ <span class="line"><span style="color:#CE9178;"> &quot;tts&quot;</span><span style="color:#D4D4D4;">,</span></span>
73
+ <span class="line"><span style="color:#CE9178;"> &quot;tacotron2.yaml&quot;</span><span style="color:#D4D4D4;">,</span></span>
74
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
75
+ <span class="line"><span style="color:#C586C0;">with</span><span style="color:#DCDCAA;"> open</span><span style="color:#D4D4D4;">(</span><span style="color:#CE9178;">&quot;data/tokenized.txt&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;r&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> f:</span></span>
76
+ <span class="line"><span style="color:#D4D4D4;"> training_config[</span><span style="color:#CE9178;">&quot;token_list&quot;</span><span style="color:#D4D4D4;">] = [t.replace(</span><span style="color:#CE9178;">&quot;</span><span style="color:#D7BA7D;">\n</span><span style="color:#CE9178;">&quot;</span><span style="color:#D4D4D4;">, </span><span style="color:#CE9178;">&quot;&quot;</span><span style="color:#D4D4D4;">) </span><span style="color:#C586C0;">for</span><span style="color:#D4D4D4;"> t </span><span style="color:#C586C0;">in</span><span style="color:#D4D4D4;"> f.readlines()]</span></span>
77
+ <span class="line"></span>
78
+ <span class="line"><span style="color:#6A9955;"># Define the Trainer class</span></span>
79
+ <span class="line"><span style="color:#D4D4D4;">trainer = ez.Trainer(</span></span>
80
+ <span class="line"><span style="color:#9CDCFE;"> task</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&#39;tts&#39;</span><span style="color:#D4D4D4;">,</span></span>
81
+ <span class="line"><span style="color:#9CDCFE;"> train_config</span><span style="color:#D4D4D4;">=training_config,</span></span>
82
+ <span class="line"><span style="color:#9CDCFE;"> train_dump_dir</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;dump/ljspeech/train&quot;</span><span style="color:#D4D4D4;">,</span></span>
83
+ <span class="line"><span style="color:#9CDCFE;"> valid_dump_dir</span><span style="color:#D4D4D4;">=</span><span style="color:#CE9178;">&quot;dump/ljspeech/test&quot;</span><span style="color:#D4D4D4;">,</span></span>
84
+ <span class="line"><span style="color:#9CDCFE;"> data_info</span><span style="color:#D4D4D4;">=data_info,</span></span>
85
+ <span class="line"><span style="color:#9CDCFE;"> output_dir</span><span style="color:#D4D4D4;">=EXP_DIR,</span></span>
86
+ <span class="line"><span style="color:#9CDCFE;"> stats_dir</span><span style="color:#D4D4D4;">=STATS_DIR,</span></span>
87
+ <span class="line"><span style="color:#9CDCFE;"> ngpu</span><span style="color:#D4D4D4;">=</span><span style="color:#B5CEA8;">1</span><span style="color:#D4D4D4;">,</span></span>
88
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
89
+ <span class="line"><span style="color:#D4D4D4;">trainer.collect_stats()</span></span>
90
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Finally, we are ready to begin the training process!</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#D4D4D4;">trainer.train()</span></span>
91
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div></div></div><h2 id="inference" tabindex="-1"><a class="header-anchor" href="#inference"><span>Inference</span></a></h2><p>You can just use the inference API of the ESPnet.</p><div class="language-python line-numbers-mode" data-ext="py" data-title="py"><pre class="shiki dark-plus" style="background-color:#1E1E1E;color:#D4D4D4;" tabindex="0"><code><span class="line"><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> soundfile </span><span style="color:#C586C0;">as</span><span style="color:#D4D4D4;"> sf</span></span>
92
+ <span class="line"><span style="color:#C586C0;">from</span><span style="color:#D4D4D4;"> espnet2.bin.tts_inference </span><span style="color:#C586C0;">import</span><span style="color:#D4D4D4;"> Text2Speech</span></span>
93
+ <span class="line"></span>
94
+ <span class="line"><span style="color:#D4D4D4;">m = Text2Speech(</span></span>
95
+ <span class="line"><span style="color:#CE9178;"> &quot;./exp/finetune/config.yaml&quot;</span><span style="color:#D4D4D4;">,</span></span>
96
+ <span class="line"><span style="color:#CE9178;"> &quot;./exp/finetune/valid.loss.ave.pth&quot;</span><span style="color:#D4D4D4;">,</span></span>
97
+ <span class="line"><span style="color:#D4D4D4;">)</span></span>
98
+ <span class="line"></span>
99
+ <span class="line"><span style="color:#D4D4D4;">text = </span><span style="color:#CE9178;">&quot;hello world&quot;</span></span>
100
+ <span class="line"><span style="color:#D4D4D4;">output = m(text)[</span><span style="color:#CE9178;">&#39;wav&#39;</span><span style="color:#D4D4D4;">]</span></span>
101
+ <span class="line"><span style="color:#D4D4D4;">sf.write(</span><span style="color:#CE9178;">&quot;output.wav&quot;</span><span style="color:#D4D4D4;">, output, </span><span style="color:#B5CEA8;">16000</span><span style="color:#D4D4D4;">)</span></span>
102
+ <span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div></div><!--[--><!--]--></div><footer class="vp-page-meta"><!----><div class="vp-meta-item git-info"><!----><!----></div></footer><!----><!--[--><!--]--></main><!--]--></div><!--[--><!----><!--]--><!--]--></div>
103
+ <script type="module" src="/assets/app-DTS6SjJz.js" defer></script>
104
+ </body>
105
+ </html>