{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "5920c653-448e-43b3-93eb-12d7073ad352", "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/espnet/tools/anaconda/envs/espnet/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from espnet2.bin.asr_inference import Speech2Text\n", "from espnet2.bin.asr_align import CTCSegmentation\n", "import soundfile\n", "import pandas as pd\n", "import torch" ] }, { "cell_type": "code", "execution_count": 2, "id": "83058587-1a8a-4b01-92ff-e9125fbe55a3", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Restrict PyTorch to one intra-op CPU thread.\n", "torch.set_num_threads(1)\n", "\n", "# Model files shared by the recognizer and the aligner below.\n", "CONFIG_PATH = \"exp/config.yaml\"\n", "MODEL_PATH = \"exp/valid.acc.ave_10best.pth\"" ] }, { "cell_type": "markdown", "id": "32eb58d1-5edd-4cc1-9585-daa7f16efd05", "metadata": {}, "source": [ "## Load model" ] }, { "cell_type": "code", "execution_count": 3, "id": "5e4670d6-0949-48cf-b6b1-d9cc4cf3ad65", "metadata": { "tags": [] }, "outputs": [], "source": [ "speech2text = Speech2Text(CONFIG_PATH, MODEL_PATH, quantize_asr_model=True, quantize_lm=True)" ] }, { "cell_type": "markdown", "id": "3192656d-6dce-4973-a649-f7ab0c72c386", "metadata": { "tags": [] }, "source": [ "## Load example audio file to transcribe" ] }, { "cell_type": "code", "execution_count": 4, "id": "e8120e8e-3718-4a1a-ab7a-46ef98a6bc11", "metadata": { "tags": [] }, "outputs": [], "source": [ "speech, rate = soundfile.read(\"example_audio/emt16k.wav\")\n", "assert rate == 16000" ] }, { "cell_type": "code", "execution_count": 5, "id": "eec8d4b2-c27a-4780-aeed-8aa7538f70e5", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 2.64 s, sys: 6.23 ms, total: 2.65 s\n", "Wall time: 2.66 s\n" ] } ], "source": [ "%time text, *_ = speech2text(speech)" ] }, { "cell_type": 
"code", "execution_count": 6, "id": "39f41a8b-94c3-42d6-a989-6c7183a6f94d", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mina tahaksin homme täna ja homme kui saan kolm krampsumas ise müüki panna\n" ] } ], "source": [ "print(text[0])" ] }, { "cell_type": "code", "execution_count": 7, "id": "812060a6-90de-4134-8d1f-9f3d98853bc2", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Input File : 'example_audio/emt16k.wav'\n", "Channels : 1\n", "Sample Rate : 16000\n", "Precision : 16-bit\n", "Duration : 00:00:12.74 = 203815 samples ~ 955.383 CDDA sectors\n", "File Size : 408k\n", "Bit Rate : 256k\n", "Sample Encoding: 16-bit Signed Integer PCM\n", "\n" ] } ], "source": [ "!soxi example_audio/emt16k.wav" ] }, { "cell_type": "markdown", "id": "7d07e8a4-1dbf-4a79-bdf0-aeaeb160ba19", "metadata": {}, "source": [ "## Example token-level alignment" ] }, { "cell_type": "code", "execution_count": 8, "id": "e6b7331c-52f1-4162-b564-2e6a08b325b0", "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING:root:No RNN model detected; memory consumption may be high.\n" ] } ], "source": [ "aligner = CTCSegmentation(CONFIG_PATH, MODEL_PATH, kaldi_style_text=False, blank_transition_cost_zero=True)\n", "segments = aligner(speech, text[0].split())" ] }, { "cell_type": "code", "execution_count": 9, "id": "e6d18b5f-3d2a-4fcf-bf4e-00480e58094a", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "utt_0000 utt 0.36 0.78 -0.0001 mina\n", "utt_0001 utt 0.78 1.19 -0.0003 tahaksin\n", "utt_0002 utt 1.19 1.59 -0.0017 homme\n", "utt_0003 utt 1.67 2.19 -0.0001 täna\n", "utt_0004 utt 3.24 3.76 -0.0037 ja\n", "utt_0005 utt 3.76 4.28 -0.0000 homme\n", "utt_0006 utt 5.61 6.13 -0.0001 kui\n", "utt_0007 utt 6.17 6.69 -0.0009 saan\n", "utt_0008 utt 7.98 8.50 -0.2285 kolm\n", 
"utt_0009 utt 8.50 9.34 -0.1062 krampsumas\n", "utt_0010 utt 9.34 9.54 -0.1183 ise\n", "utt_0011 utt 9.54 10.07 -0.2588 müüki\n", "utt_0012 utt 10.07 10.31 -0.1041 panna\n", "\n" ] } ], "source": [ "print(segments)" ] }, { "cell_type": "markdown", "id": "77f82a7d-08dc-40cb-88e5-48ef8c36af7d", "metadata": { "tags": [] }, "source": [ "## Get timestamps with some correction" ] }, { "cell_type": "code", "execution_count": 10, "id": "ae9f7e3f-b75d-4bcb-98d1-ae2f037fb4af", "metadata": { "tags": [] }, "outputs": [], "source": [
"def get_timestamps(aligner, speech, text, time_correction=0.2):\n",
"    \"\"\"Align a transcript to its audio and return one timestamped row per word.\n",
"\n",
"    Args:\n",
"        aligner: Callable invoked as ``aligner(speech, tokens)``; its result must\n",
"            expose ``.segments`` with one (start, end, confidence) row per token.\n",
"        speech: Audio samples, passed to ``aligner`` unchanged.\n",
"        text: Transcript string; it is split on whitespace into words.\n",
"        time_correction: Constant offset added to every start and end time\n",
"            (same unit as the aligner's times, presumably seconds).\n",
"\n",
"    Returns:\n",
"        pandas.DataFrame with columns ``start``, ``end``, ``confidence`` and\n",
"        ``words``. An empty transcript yields an empty frame with these columns.\n",
"\n",
"    Note:\n",
"        Shifted times are not clipped to the audio length, so the last ``end``\n",
"        may exceed the duration of the recording.\n",
"    \"\"\"\n",
"    columns = ['start', 'end', 'confidence']\n",
"    tokens = text.split()\n",
"    if not tokens:\n",
"        # Nothing to align (e.g. the recognizer returned an empty hypothesis).\n",
"        return pd.DataFrame(columns=columns + ['words'])\n",
"    segments = aligner(speech, tokens)\n",
"    # Name the columns at construction time instead of assigning df.columns afterwards.\n",
"    df = pd.DataFrame(segments.segments, columns=columns)\n",
"    df['start'] = df['start'] + time_correction\n",
"    df['end'] = df['end'] + time_correction\n",
"    df['words'] = tokens\n",
"    return df"
] }, { "cell_type": "code", "execution_count": 11, "id": "0215d312-1896-43f1-9782-c92aced787b7", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 2.96 s, sys: 19 ms, total: 2.98 s\n", "Wall time: 2.98 s\n" ] } ], "source": [ "speech, rate = soundfile.read(\"example_audio/oden_kypsis16k_subset2.wav\")\n", "assert rate == 16000\n", "\n", "%time text, *_ = speech2text(speech)" ] }, { "cell_type": "code", "execution_count": 12, "id": "d31d6840-3a80-411a-969c-05f4a5e3e9a1", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Input File : 'example_audio/oden_kypsis16k_subset2.wav'\n", "Channels : 1\n", "Sample Rate : 16000\n", "Precision : 16-bit\n", "Duration : 00:00:09.19 = 146983 samples ~ 688.983 CDDA sectors\n", "File Size : 294k\n", "Bit Rate : 256k\n", "Sample Encoding: 16-bit Signed Integer PCM\n", "\n" ] } ], "source": [ "!soxi example_audio/oden_kypsis16k_subset2.wav" ] }, { "cell_type": "code", "execution_count": 13, "id": "53f3b63f-9b40-432b-b58c-f5b7223252ed", "metadata": { "tags": [] }, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 309 ms, sys: 8.51 ms, total: 318 ms\n", "Wall time: 312 ms\n" ] } ], "source": [ "%time df_times=get_timestamps(aligner, speech, text[0])" ] }, { "cell_type": "code", "execution_count": 14, "id": "1b4dd747-4be2-4ace-a301-6207f7dd9a71", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
startendconfidencewords
00.2601730.661328-0.049087klikid
10.6613280.821789-0.003573neid
20.8232331.784560-0.001952allserva
31.7845601.985137-0.034099tekivad
42.5481973.068255-0.000037need
53.0682554.031025-0.008919lubaküpsiseid
64.7545465.274604-0.000385mis
75.2746045.415008-0.078755on
85.4150085.555412-0.000224nagu
95.5554125.836220-0.000488ilusti
105.8362206.117029-0.002274kohati
116.2388187.039684-0.013956tõlgitud
127.0396847.240261-0.002010eesti
137.2402617.681531-0.002761keelde
147.8033208.323378-0.001533see
158.3233788.644302-0.044506idee
168.6443029.326264-0.215737arusaadavamaks
\n", "
" ], "text/plain": [ " start end confidence words\n", "0 0.260173 0.661328 -0.049087 klikid\n", "1 0.661328 0.821789 -0.003573 neid\n", "2 0.823233 1.784560 -0.001952 allserva\n", "3 1.784560 1.985137 -0.034099 tekivad\n", "4 2.548197 3.068255 -0.000037 need\n", "5 3.068255 4.031025 -0.008919 lubaküpsiseid\n", "6 4.754546 5.274604 -0.000385 mis\n", "7 5.274604 5.415008 -0.078755 on\n", "8 5.415008 5.555412 -0.000224 nagu\n", "9 5.555412 5.836220 -0.000488 ilusti\n", "10 5.836220 6.117029 -0.002274 kohati\n", "11 6.238818 7.039684 -0.013956 tõlgitud\n", "12 7.039684 7.240261 -0.002010 eesti\n", "13 7.240261 7.681531 -0.002761 keelde\n", "14 7.803320 8.323378 -0.001533 see\n", "15 8.323378 8.644302 -0.044506 idee\n", "16 8.644302 9.326264 -0.215737 arusaadavamaks" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_times.head(20)" ] }, { "cell_type": "code", "execution_count": null, "id": "7a4be2b1-5e0f-4558-8097-b37be0b83785", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "1e9d45ad-c8fc-4bab-9285-b82ff3903702", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 5 }