{ "cells": [ { "cell_type": "code", "execution_count": 80, "id": "5920c653-448e-43b3-93eb-12d7073ad352", "metadata": { "tags": [] }, "outputs": [], "source": [ "from espnet2.bin.asr_inference import Speech2Text\n", "from espnet2.bin.asr_align import CTCSegmentation\n", "import soundfile\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 81, "id": "83058587-1a8a-4b01-92ff-e9125fbe55a3", "metadata": { "tags": [] }, "outputs": [], "source": [ "import torch\n", "torch.set_num_threads(1)" ] }, { "cell_type": "markdown", "id": "32eb58d1-5edd-4cc1-9585-daa7f16efd05", "metadata": {}, "source": [ "## Load model" ] }, { "cell_type": "code", "execution_count": 82, "id": "5e4670d6-0949-48cf-b6b1-d9cc4cf3ad65", "metadata": { "tags": [] }, "outputs": [], "source": [ "speech2text = Speech2Text(\"exp/config.yaml\", \"exp/valid.acc.ave_10best.pth\")" ] }, { "cell_type": "markdown", "id": "3192656d-6dce-4973-a649-f7ab0c72c386", "metadata": { "tags": [] }, "source": [ "## Load example audiofile to transcribe" ] }, { "cell_type": "code", "execution_count": 92, "id": "e8120e8e-3718-4a1a-ab7a-46ef98a6bc11", "metadata": { "tags": [] }, "outputs": [], "source": [ "speech, rate = soundfile.read(\"example_audio/emt16k.wav\")\n", "assert rate == 16000" ] }, { "cell_type": "code", "execution_count": 93, "id": "eec8d4b2-c27a-4780-aeed-8aa7538f70e5", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.67 s, sys: 191 ms, total: 3.87 s\n", "Wall time: 3.86 s\n" ] } ], "source": [ "%time text, *_ = speech2text(speech)" ] }, { "cell_type": "code", "execution_count": 94, "id": "39f41a8b-94c3-42d6-a989-6c7183a6f94d", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mina tahaksin homme täna ja homme kui saanud on kui krampsumas ise veiki panna\n" ] } ], "source": [ "print(text[0])" ] }, { "cell_type": "code", "execution_count": 116, "id": 
"812060a6-90de-4134-8d1f-9f3d98853bc2", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Input File : 'example_audio/emt16k.wav'\n", "Channels : 1\n", "Sample Rate : 16000\n", "Precision : 16-bit\n", "Duration : 00:00:12.74 = 203815 samples ~ 955.383 CDDA sectors\n", "File Size : 408k\n", "Bit Rate : 256k\n", "Sample Encoding: 16-bit Signed Integer PCM\n", "\n" ] } ], "source": [ "!soxi example_audio/emt16k.wav" ] }, { "cell_type": "markdown", "id": "7d07e8a4-1dbf-4a79-bdf0-aeaeb160ba19", "metadata": {}, "source": [ "## Example token level alignment" ] }, { "cell_type": "code", "execution_count": 95, "id": "e6b7331c-52f1-4162-b564-2e6a08b325b0", "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING:root:No RNN model detected; memory consumption may be high.\n" ] } ], "source": [ "aligner = CTCSegmentation(\"exp/config.yaml\", \"exp/valid.acc.ave_10best.pth\" , kaldi_style_text=False, blank_transition_cost_zero=True)\n", "segments = aligner(speech, text[0].split())" ] }, { "cell_type": "code", "execution_count": 96, "id": "e6d18b5f-3d2a-4fcf-bf4e-00480e58094a", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "utt_0000 utt 0.36 0.78 -0.0001 mina\n", "utt_0001 utt 0.78 1.19 -0.0003 tahaksin\n", "utt_0002 utt 1.19 1.59 -0.0017 homme\n", "utt_0003 utt 1.67 2.19 -0.0001 täna\n", "utt_0004 utt 3.24 3.76 -0.0037 ja\n", "utt_0005 utt 3.76 4.28 -0.0000 homme\n", "utt_0006 utt 5.61 6.13 -0.0001 kui\n", "utt_0007 utt 6.17 6.69 -0.0009 saanud\n", "utt_0008 utt 6.81 7.33 -0.0018 on\n", "utt_0009 utt 7.98 8.50 -0.0862 kui\n", "utt_0010 utt 8.50 9.34 -0.1062 krampsumas\n", "utt_0011 utt 9.34 9.54 -0.1183 ise\n", "utt_0012 utt 9.54 10.07 -0.2033 veiki\n", "utt_0013 utt 10.07 10.31 -0.1041 panna\n", "\n" ] } ], "source": [ "print(segments)" ] }, { "cell_type": "markdown", "id": "77f82a7d-08dc-40cb-88e5-48ef8c36af7d", "metadata": { 
def get_timestamps(aligner, speech, text, time_correction=0.2):
    """Align the whitespace-separated words of ``text`` to ``speech`` as a table.

    Args:
        aligner: Callable invoked as ``aligner(speech, tokens)``. Its return
            value must expose a ``.segments`` sequence of
            ``(start, end, confidence)`` rows, one row per token.
        speech: Audio handed to ``aligner`` unchanged.
        text: Transcript; it is split on whitespace to obtain the tokens.
        time_correction: Constant offset added to every start and end time
            (presumably seconds, matching the aligner's timestamps -- confirm).

    Returns:
        pandas.DataFrame with the columns ``start``, ``end``, ``confidence``
        and ``words``, one row per token. An alignment without any segments
        yields an empty frame that still carries these four columns.

    Raises:
        ValueError: If the number of segments differs from the number of
            tokens (raised by pandas when the ``words`` column is attached).
    """
    tokens = text.split()
    alignment = aligner(speech, tokens)
    # Name the columns at construction time: assigning ``df.columns`` after the
    # fact raises a length-mismatch error when the alignment has no rows.
    df = pd.DataFrame(alignment.segments, columns=["start", "end", "confidence"])
    df["start"] = df["start"] + time_correction
    df["end"] = df["end"] + time_correction
    df["words"] = tokens
    return df
"Wall time: 501 ms\n" ] } ], "source": [ "%time df_times=get_timestamps(aligner, speech, text[0])" ] }, { "cell_type": "code", "execution_count": 109, "id": "1b4dd747-4be2-4ace-a301-6207f7dd9a71", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " | start | \n", "end | \n", "confidence | \n", "words | \n", "
---|---|---|---|---|
0 | \n", "0.260154 | \n", "0.661184 | \n", "-0.107317 | \n", "klikid | \n", "
1 | \n", "0.661184 | \n", "0.821596 | \n", "-0.001331 | \n", "neid | \n", "
2 | \n", "0.822883 | \n", "1.784067 | \n", "-0.002136 | \n", "allserva | \n", "
3 | \n", "1.784067 | \n", "1.984582 | \n", "-0.041078 | \n", "tekivad | \n", "
4 | \n", "2.547310 | \n", "3.067362 | \n", "-0.008251 | \n", "need | \n", "
5 | \n", "3.067362 | \n", "4.029833 | \n", "-0.007814 | \n", "lubaküpsiseid | \n", "
6 | \n", "4.752973 | \n", "5.273025 | \n", "-0.000333 | \n", "mis | \n", "
7 | \n", "5.273025 | \n", "5.413385 | \n", "-0.063720 | \n", "on | \n", "
8 | \n", "5.413385 | \n", "5.553745 | \n", "-0.000231 | \n", "nagu | \n", "
9 | \n", "5.553745 | \n", "5.834466 | \n", "-0.000573 | \n", "ilusti | \n", "
10 | \n", "5.834466 | \n", "6.115187 | \n", "-0.001930 | \n", "kohati | \n", "
11 | \n", "6.236783 | \n", "7.037555 | \n", "-0.004937 | \n", "tõlgitud | \n", "
12 | \n", "7.037555 | \n", "7.238070 | \n", "-0.001092 | \n", "eesti | \n", "
13 | \n", "7.238070 | \n", "7.679202 | \n", "-0.003088 | \n", "keelde | \n", "
14 | \n", "7.800798 | \n", "8.320850 | \n", "-0.001126 | \n", "see | \n", "
15 | \n", "8.320850 | \n", "8.601571 | \n", "-0.033408 | \n", "idee | \n", "
16 | \n", "8.601571 | \n", "9.363527 | \n", "-0.032846 | \n", "arusaadavamaks | \n", "
17 | \n", "9.363527 | \n", "9.584093 | \n", "-0.390966 | \n", "ma | \n", "
18 | \n", "9.584093 | \n", "9.764557 | \n", "-0.053868 | \n", "tean | \n", "
19 | \n", "9.764557 | \n", "9.924968 | \n", "-0.000163 | \n", "et | \n", "