{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "5920c653-448e-43b3-93eb-12d7073ad352", "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/espnet/tools/anaconda/envs/espnet/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from espnet2.bin.asr_inference import Speech2Text\n", "from espnet2.bin.asr_align import CTCSegmentation\n", "import soundfile\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "id": "83058587-1a8a-4b01-92ff-e9125fbe55a3", "metadata": { "tags": [] }, "outputs": [], "source": [ "import torch\n", "torch.set_num_threads(1)" ] }, { "cell_type": "markdown", "id": "32eb58d1-5edd-4cc1-9585-daa7f16efd05", "metadata": {}, "source": [ "## Load model" ] }, { "cell_type": "code", "execution_count": 3, "id": "5e4670d6-0949-48cf-b6b1-d9cc4cf3ad65", "metadata": { "tags": [] }, "outputs": [], "source": [ "speech2text = Speech2Text(\"exp/config.yaml\", \"exp/valid.acc.ave_10best.pth\", quantize_asr_model=True, quantize_lm=True)" ] }, { "cell_type": "markdown", "id": "3192656d-6dce-4973-a649-f7ab0c72c386", "metadata": { "tags": [] }, "source": [ "## Load example audiofile to transcribe" ] }, { "cell_type": "code", "execution_count": 4, "id": "e8120e8e-3718-4a1a-ab7a-46ef98a6bc11", "metadata": { "tags": [] }, "outputs": [], "source": [ "speech, rate = soundfile.read(\"example_audio/emt16k.wav\")\n", "assert rate == 16000" ] }, { "cell_type": "code", "execution_count": 5, "id": "eec8d4b2-c27a-4780-aeed-8aa7538f70e5", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 2.64 s, sys: 6.23 ms, total: 2.65 s\n", "Wall time: 2.66 s\n" ] } ], "source": [ "%time text, *_ = speech2text(speech)" ] }, { "cell_type": "code", "execution_count": 6, "id": "39f41a8b-94c3-42d6-a989-6c7183a6f94d", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mina tahaksin homme täna ja homme kui saan kolm krampsumas ise müüki panna\n" ] } ], "source": [ "print(text[0])" ] }, { "cell_type": "code", "execution_count": 7, "id": "812060a6-90de-4134-8d1f-9f3d98853bc2", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Input File : 'example_audio/emt16k.wav'\n", "Channels : 1\n", "Sample Rate : 16000\n", "Precision : 16-bit\n", "Duration : 00:00:12.74 = 203815 samples ~ 955.383 CDDA sectors\n", "File Size : 408k\n", "Bit Rate : 256k\n", "Sample Encoding: 16-bit Signed Integer PCM\n", "\n" ] } ], "source": [ "!soxi example_audio/emt16k.wav" ] }, { "cell_type": "markdown", "id": "7d07e8a4-1dbf-4a79-bdf0-aeaeb160ba19", "metadata": {}, "source": [ "## Example token level alignment" ] }, { "cell_type": "code", "execution_count": 8, "id": "e6b7331c-52f1-4162-b564-2e6a08b325b0", "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING:root:No RNN model detected; memory consumption may be high.\n" ] } ], "source": [ "aligner = CTCSegmentation(\"exp/config.yaml\", \"exp/valid.acc.ave_10best.pth\" , kaldi_style_text=False, blank_transition_cost_zero=True)\n", "segments = aligner(speech, text[0].split())" ] }, { "cell_type": "code", "execution_count": 9, "id": "e6d18b5f-3d2a-4fcf-bf4e-00480e58094a", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "utt_0000 utt 0.36 0.78 -0.0001 mina\n", "utt_0001 utt 0.78 1.19 -0.0003 tahaksin\n", "utt_0002 utt 1.19 1.59 -0.0017 homme\n", "utt_0003 utt 1.67 2.19 -0.0001 täna\n", "utt_0004 utt 3.24 3.76 -0.0037 ja\n", "utt_0005 utt 3.76 4.28 -0.0000 homme\n", "utt_0006 utt 5.61 6.13 -0.0001 kui\n", "utt_0007 utt 6.17 6.69 -0.0009 saan\n", "utt_0008 utt 7.98 8.50 -0.2285 kolm\n", "utt_0009 utt 8.50 9.34 -0.1062 krampsumas\n", "utt_0010 utt 9.34 9.54 -0.1183 ise\n", "utt_0011 utt 9.54 10.07 -0.2588 müüki\n", "utt_0012 utt 10.07 10.31 -0.1041 panna\n", "\n" ] } ], "source": [ "print(segments)" ] }, { "cell_type": "markdown", "id": "77f82a7d-08dc-40cb-88e5-48ef8c36af7d", "metadata": { "tags": [] }, "source": [ "## Get timestamps with some correction" ] }, { "cell_type": "code", "execution_count": 10, "id": "ae9f7e3f-b75d-4bcb-98d1-ae2f037fb4af", "metadata": { "tags": [] }, "outputs": [], "source": [ "def get_timestamps(aligner, speech, text, time_correction=0.2):\n", " tokens=text.split()\n", " segments = aligner(speech, tokens)\n", " df=pd.DataFrame(segments.segments)\n", " df.columns=['start', 'end', 'confidence']\n", " df['start']=df.start+time_correction\n", " df['end']=df.end+time_correction\n", " df['words']=tokens\n", " return df" ] }, { "cell_type": "code", "execution_count": 11, "id": "0215d312-1896-43f1-9782-c92aced787b7", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 2.96 s, sys: 19 ms, total: 2.98 s\n", "Wall time: 2.98 s\n" ] } ], "source": [ "speech, rate = soundfile.read(\"example_audio/oden_kypsis16k_subset2.wav\")\n", "assert rate == 16000\n", "\n", "%time text, *_ = speech2text(speech)" ] }, { "cell_type": "code", "execution_count": 12, "id": "d31d6840-3a80-411a-969c-05f4a5e3e9a1", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Input File : 'example_audio/oden_kypsis16k_subset2.wav'\n", "Channels : 1\n", "Sample Rate : 16000\n", "Precision : 16-bit\n", "Duration : 00:00:09.19 = 146983 samples ~ 688.983 CDDA sectors\n", "File Size : 294k\n", "Bit Rate : 256k\n", "Sample Encoding: 16-bit Signed Integer PCM\n", "\n" ] } ], "source": [ "!soxi example_audio/oden_kypsis16k_subset2.wav" ] }, { "cell_type": "code", "execution_count": 13, "id": "53f3b63f-9b40-432b-b58c-f5b7223252ed", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 309 ms, sys: 8.51 ms, total: 318 ms\n", "Wall time: 312 ms\n" ] } ], "source": [ "%time df_times=get_timestamps(aligner, speech, text[0])" ] }, { "cell_type": "code", "execution_count": 14, "id": "1b4dd747-4be2-4ace-a301-6207f7dd9a71", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " | start | \n", "end | \n", "confidence | \n", "words | \n", "
---|---|---|---|---|
0 | \n", "0.260173 | \n", "0.661328 | \n", "-0.049087 | \n", "klikid | \n", "
1 | \n", "0.661328 | \n", "0.821789 | \n", "-0.003573 | \n", "neid | \n", "
2 | \n", "0.823233 | \n", "1.784560 | \n", "-0.001952 | \n", "allserva | \n", "
3 | \n", "1.784560 | \n", "1.985137 | \n", "-0.034099 | \n", "tekivad | \n", "
4 | \n", "2.548197 | \n", "3.068255 | \n", "-0.000037 | \n", "need | \n", "
5 | \n", "3.068255 | \n", "4.031025 | \n", "-0.008919 | \n", "lubaküpsiseid | \n", "
6 | \n", "4.754546 | \n", "5.274604 | \n", "-0.000385 | \n", "mis | \n", "
7 | \n", "5.274604 | \n", "5.415008 | \n", "-0.078755 | \n", "on | \n", "
8 | \n", "5.415008 | \n", "5.555412 | \n", "-0.000224 | \n", "nagu | \n", "
9 | \n", "5.555412 | \n", "5.836220 | \n", "-0.000488 | \n", "ilusti | \n", "
10 | \n", "5.836220 | \n", "6.117029 | \n", "-0.002274 | \n", "kohati | \n", "
11 | \n", "6.238818 | \n", "7.039684 | \n", "-0.013956 | \n", "tõlgitud | \n", "
12 | \n", "7.039684 | \n", "7.240261 | \n", "-0.002010 | \n", "eesti | \n", "
13 | \n", "7.240261 | \n", "7.681531 | \n", "-0.002761 | \n", "keelde | \n", "
14 | \n", "7.803320 | \n", "8.323378 | \n", "-0.001533 | \n", "see | \n", "
15 | \n", "8.323378 | \n", "8.644302 | \n", "-0.044506 | \n", "idee | \n", "
16 | \n", "8.644302 | \n", "9.326264 | \n", "-0.215737 | \n", "arusaadavamaks | \n", "