{ "cells": [ { "cell_type": "code", "execution_count": 80, "id": "5920c653-448e-43b3-93eb-12d7073ad352", "metadata": { "tags": [] }, "outputs": [], "source": [ "from espnet2.bin.asr_inference import Speech2Text\n", "from espnet2.bin.asr_align import CTCSegmentation\n", "import soundfile\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 81, "id": "83058587-1a8a-4b01-92ff-e9125fbe55a3", "metadata": { "tags": [] }, "outputs": [], "source": [ "import torch\n", "torch.set_num_threads(1)" ] }, { "cell_type": "markdown", "id": "32eb58d1-5edd-4cc1-9585-daa7f16efd05", "metadata": {}, "source": [ "## Load model" ] }, { "cell_type": "code", "execution_count": 82, "id": "5e4670d6-0949-48cf-b6b1-d9cc4cf3ad65", "metadata": { "tags": [] }, "outputs": [], "source": [ "speech2text = Speech2Text(\"exp/config.yaml\", \"exp/valid.acc.ave_10best.pth\")" ] }, { "cell_type": "markdown", "id": "3192656d-6dce-4973-a649-f7ab0c72c386", "metadata": { "tags": [] }, "source": [ "## Load example audio file to transcribe" ] }, { "cell_type": "code", "execution_count": 92, "id": "e8120e8e-3718-4a1a-ab7a-46ef98a6bc11", "metadata": { "tags": [] }, "outputs": [], "source": [ "speech, rate = soundfile.read(\"example_audio/emt16k.wav\")\n", "assert rate == 16000" ] }, { "cell_type": "code", "execution_count": 93, "id": "eec8d4b2-c27a-4780-aeed-8aa7538f70e5", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.67 s, sys: 191 ms, total: 3.87 s\n", "Wall time: 3.86 s\n" ] } ], "source": [ "%time text, *_ = speech2text(speech)" ] }, { "cell_type": "code", "execution_count": 94, "id": "39f41a8b-94c3-42d6-a989-6c7183a6f94d", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mina tahaksin homme täna ja homme kui saanud on kui krampsumas ise veiki panna\n" ] } ], "source": [ "print(text[0])" ] }, { "cell_type": "code", "execution_count": 116, "id": 
"812060a6-90de-4134-8d1f-9f3d98853bc2", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Input File : 'example_audio/emt16k.wav'\n", "Channels : 1\n", "Sample Rate : 16000\n", "Precision : 16-bit\n", "Duration : 00:00:12.74 = 203815 samples ~ 955.383 CDDA sectors\n", "File Size : 408k\n", "Bit Rate : 256k\n", "Sample Encoding: 16-bit Signed Integer PCM\n", "\n" ] } ], "source": [ "!soxi example_audio/emt16k.wav" ] }, { "cell_type": "markdown", "id": "7d07e8a4-1dbf-4a79-bdf0-aeaeb160ba19", "metadata": {}, "source": [ "## Example token-level alignment" ] }, { "cell_type": "code", "execution_count": 95, "id": "e6b7331c-52f1-4162-b564-2e6a08b325b0", "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING:root:No RNN model detected; memory consumption may be high.\n" ] } ], "source": [ "aligner = CTCSegmentation(\"exp/config.yaml\", \"exp/valid.acc.ave_10best.pth\" , kaldi_style_text=False, blank_transition_cost_zero=True)\n", "segments = aligner(speech, text[0].split())" ] }, { "cell_type": "code", "execution_count": 96, "id": "e6d18b5f-3d2a-4fcf-bf4e-00480e58094a", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "utt_0000 utt 0.36 0.78 -0.0001 mina\n", "utt_0001 utt 0.78 1.19 -0.0003 tahaksin\n", "utt_0002 utt 1.19 1.59 -0.0017 homme\n", "utt_0003 utt 1.67 2.19 -0.0001 täna\n", "utt_0004 utt 3.24 3.76 -0.0037 ja\n", "utt_0005 utt 3.76 4.28 -0.0000 homme\n", "utt_0006 utt 5.61 6.13 -0.0001 kui\n", "utt_0007 utt 6.17 6.69 -0.0009 saanud\n", "utt_0008 utt 6.81 7.33 -0.0018 on\n", "utt_0009 utt 7.98 8.50 -0.0862 kui\n", "utt_0010 utt 8.50 9.34 -0.1062 krampsumas\n", "utt_0011 utt 9.34 9.54 -0.1183 ise\n", "utt_0012 utt 9.54 10.07 -0.2033 veiki\n", "utt_0013 utt 10.07 10.31 -0.1041 panna\n", "\n" ] } ], "source": [ "print(segments)" ] }, { "cell_type": "markdown", "id": "77f82a7d-08dc-40cb-88e5-48ef8c36af7d", "metadata": { 
"tags": [] }, "source": [ "## Get word timestamps with a constant time correction" ] }, { "cell_type": "code", "execution_count": 97, "id": "ae9f7e3f-b75d-4bcb-98d1-ae2f037fb4af", "metadata": { "tags": [] }, "outputs": [], "source": [
 "def get_timestamps(aligner, speech, text, time_correction=0.2):\n",
 "    \"\"\"Align the words of `text` to `speech` and return one row per word.\n",
 "\n",
 "    Args:\n",
 "        aligner: Callable invoked as `aligner(speech, tokens)`; its result is\n",
 "            expected to expose `.segments`, one (start, end, confidence)\n",
 "            record per token.\n",
 "        speech: Audio samples, passed to `aligner` unchanged.\n",
 "        text: Transcript string; split on whitespace into the tokens to align.\n",
 "        time_correction: Constant offset added to every start and end time\n",
 "            (presumably seconds, like the aligner times; TODO confirm).\n",
 "\n",
 "    Returns:\n",
 "        pandas.DataFrame with columns start, end, confidence and words; it has\n",
 "        no rows when `text` has no tokens and the aligner returns no segments.\n",
 "\n",
 "    Raises:\n",
 "        ValueError: Raised by pandas when the number of segments differs from\n",
 "            the number of tokens.\n",
 "    \"\"\"\n",
 "    tokens = text.split()\n",
 "    alignment = aligner(speech, tokens)\n",
 "    # Name the columns at construction: renaming afterwards fails on an empty\n",
 "    # segment list because that frame has no columns to rename.\n",
 "    df = pd.DataFrame(alignment.segments, columns=['start', 'end', 'confidence'])\n",
 "    # Shift both boundaries by the same offset so word durations are unchanged.\n",
 "    df['start'] = df['start'] + time_correction\n",
 "    df['end'] = df['end'] + time_correction\n",
 "    df['words'] = tokens\n",
 "    return df"
 ] }, { "cell_type": "code", "execution_count": 128, "id": "93aa6281-3b73-47b7-93ca-e90fedd8d398", "metadata": { "tags": [] }, "outputs": [], "source": [ "torch.set_num_threads(5)" ] }, { "cell_type": "code", "execution_count": 131, "id": "0215d312-1896-43f1-9782-c92aced787b7", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 14.4 s, sys: 89.6 ms, total: 14.5 s\n", "Wall time: 2.9 s\n" ] } ], "source": [ "speech, rate = soundfile.read(\"example_audio/oden_kypsis16k_subset2.wav\")\n", "assert rate == 16000\n", "\n", "%time text, *_ = speech2text(speech)" ] }, { "cell_type": "code", "execution_count": 132, "id": "d31d6840-3a80-411a-969c-05f4a5e3e9a1", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Input File : 'example_audio/oden_kypsis16k_subset2.wav'\n", "Channels : 1\n", "Sample Rate : 16000\n", "Precision : 16-bit\n", "Duration : 00:00:09.19 = 146983 samples ~ 688.983 CDDA sectors\n", "File Size : 294k\n", "Bit Rate : 256k\n", "Sample Encoding: 16-bit Signed Integer PCM\n", "\n" ] } ], "source": [ "!soxi example_audio/oden_kypsis16k_subset2.wav" ] }, { "cell_type": "code", "execution_count": 108, "id": "53f3b63f-9b40-432b-b58c-f5b7223252ed", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 474 ms, sys: 30.2 ms, total: 504 ms\n", 
"Wall time: 501 ms\n" ] } ], "source": [ "%time df_times=get_timestamps(aligner, speech, text[0])" ] }, { "cell_type": "code", "execution_count": 109, "id": "1b4dd747-4be2-4ace-a301-6207f7dd9a71", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
startendconfidencewords
00.2601540.661184-0.107317klikid
10.6611840.821596-0.001331neid
20.8228831.784067-0.002136allserva
31.7840671.984582-0.041078tekivad
42.5473103.067362-0.008251need
53.0673624.029833-0.007814lubaküpsiseid
64.7529735.273025-0.000333mis
75.2730255.413385-0.063720on
85.4133855.553745-0.000231nagu
95.5537455.834466-0.000573ilusti
105.8344666.115187-0.001930kohati
116.2367837.037555-0.004937tõlgitud
127.0375557.238070-0.001092eesti
137.2380707.679202-0.003088keelde
147.8007988.320850-0.001126see
158.3208508.601571-0.033408idee
168.6015719.363527-0.032846arusaadavamaks
179.3635279.584093-0.390966ma
189.5840939.764557-0.053868tean
199.7645579.924968-0.000163et
\n", "
" ], "text/plain": [ " start end confidence words\n", "0 0.260154 0.661184 -0.107317 klikid\n", "1 0.661184 0.821596 -0.001331 neid\n", "2 0.822883 1.784067 -0.002136 allserva\n", "3 1.784067 1.984582 -0.041078 tekivad\n", "4 2.547310 3.067362 -0.008251 need\n", "5 3.067362 4.029833 -0.007814 lubaküpsiseid\n", "6 4.752973 5.273025 -0.000333 mis\n", "7 5.273025 5.413385 -0.063720 on\n", "8 5.413385 5.553745 -0.000231 nagu\n", "9 5.553745 5.834466 -0.000573 ilusti\n", "10 5.834466 6.115187 -0.001930 kohati\n", "11 6.236783 7.037555 -0.004937 tõlgitud\n", "12 7.037555 7.238070 -0.001092 eesti\n", "13 7.238070 7.679202 -0.003088 keelde\n", "14 7.800798 8.320850 -0.001126 see\n", "15 8.320850 8.601571 -0.033408 idee\n", "16 8.601571 9.363527 -0.032846 arusaadavamaks\n", "17 9.363527 9.584093 -0.390966 ma\n", "18 9.584093 9.764557 -0.053868 tean\n", "19 9.764557 9.924968 -0.000163 et" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_times.head(20)" ] }, { "cell_type": "code", "execution_count": null, "id": "7a4be2b1-5e0f-4558-8097-b37be0b83785", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 5 }