In [1]:
from espnet2.bin.asr_inference import Speech2Text
from espnet2.bin.asr_align import CTCSegmentation
import soundfile
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
# Limit PyTorch to a single CPU thread for intra-op parallelism.
# NOTE(review): presumably so the %time figures below reflect single-threaded inference — confirm.
torch.set_num_threads(1)

## Load model

In [3]:
# Build the recognizer from the experiment's config and model checkpoint,
# with the quantize_asr_model / quantize_lm options switched on.
speech2text = Speech2Text(
    "exp/config.yaml",
    "exp/valid.acc.ave_10best.pth",
    quantize_asr_model=True,
    quantize_lm=True,
)

## Load an example audio file to transcribe

In [4]:
# Read the example recording: `speech` receives the samples, `rate` the sample rate.
speech, rate = soundfile.read("example_audio/emt16k.wav")
# The notebook only proceeds with 16 kHz audio; say what was found if the check fails.
assert rate == 16000, f"expected 16000 Hz audio, got {rate} Hz"

In [5]:
# Run recognition on the loaded audio. The star-unpacking keeps only the first
# item of the returned sequence in `text` and discards the rest; the transcript
# string itself is then read from text[0] in later cells.
# %time reports the CPU and wall-clock time of this single call.
%time text, *_ = speech2text(speech)

CPU times: user 2.64 s, sys: 6.23 ms, total: 2.65 s
Wall time: 2.66 s


In [6]:
# text[0] is the recognized transcript string.
print(text[0])

mina tahaksin homme täna ja homme kui saan kolm krampsumas ise müüki panna


In [7]:
# Shell out to `soxi` to show the file's header info (channels, sample rate, duration, encoding).
!soxi example_audio/emt16k.wav


Input File     : 'example_audio/emt16k.wav'
Channels       : 1
Sample Rate    : 16000
Precision      : 16-bit
Duration       : 00:00:12.74 = 203815 samples ~ 955.383 CDDA sectors
File Size      : 408k
Bit Rate       : 256k
Sample Encoding: 16-bit Signed Integer PCM



## Example token-level alignment

In [8]:
# Build the CTC-segmentation aligner from the same config and checkpoint files
# that the recognizer was loaded from.
aligner = CTCSegmentation(
    "exp/config.yaml",
    "exp/valid.acc.ave_10best.pth",
    kaldi_style_text=False,
    blank_transition_cost_zero=True,
)
# Align the whitespace-separated words of the transcript against the audio.
segments = aligner(speech, text[0].split())



In [9]:
# Printing the alignment result lists one line per aligned word:
# an utterance id, a name, start and end times, a score, and the word itself.
print(segments)

utt_0000 utt 0.36 0.78 -0.0001 mina
utt_0001 utt 0.78 1.19 -0.0003 tahaksin
utt_0002 utt 1.19 1.59 -0.0017 homme
utt_0003 utt 1.67 2.19 -0.0001 täna
utt_0004 utt 3.24 3.76 -0.0037 ja
utt_0005 utt 3.76 4.28 -0.0000 homme
utt_0006 utt 5.61 6.13 -0.0001 kui
utt_0007 utt 6.17 6.69 -0.0009 saan
utt_0008 utt 7.98 8.50 -0.2285 kolm
utt_0009 utt 8.50 9.34 -0.1062 krampsumas
utt_0010 utt 9.34 9.54 -0.1183 ise
utt_0011 utt 9.54 10.07 -0.2588 müüki
utt_0012 utt 10.07 10.31 -0.1041 panna



## Get word timestamps with a constant time-offset correction

In [10]:
def get_timestamps(aligner, speech, text, time_correction=0.2):
    """Align ``text`` to ``speech`` and return one timestamp row per token.

    Parameters
    ----------
    aligner : callable
        Called as ``aligner(speech, tokens)``. Its result must expose a
        ``segments`` attribute holding one ``(start, end, confidence)``
        entry per token.
    speech : array-like
        Audio samples; passed to ``aligner`` unchanged.
    text : str
        Transcript; split on whitespace to obtain the tokens to align.
    time_correction : float, default 0.2
        Constant offset added to every start and end time returned by the
        aligner.

    Returns
    -------
    pandas.DataFrame
        Columns ``start``, ``end``, ``confidence`` and ``words``, one row per
        token. When ``text`` contains no tokens the frame is empty but still
        has these four columns.
    """
    time_columns = ['start', 'end', 'confidence']
    tokens = text.split()
    if not tokens:
        # Nothing to align: skip the aligner call and hand back an empty frame
        # with the usual columns, so callers see a stable schema instead of a
        # column length-mismatch error.
        return pd.DataFrame(columns=time_columns + ['words'])

    segments = aligner(speech, tokens)
    # Name the columns at construction time rather than renaming afterwards.
    df = pd.DataFrame(segments.segments, columns=time_columns)
    # Shift both boundaries by the same constant offset.
    df[['start', 'end']] = df[['start', 'end']] + time_correction
    df['words'] = tokens
    return df

In [11]:
speech, rate = soundfile.read("example_audio/oden_kypsis16k_subset2.wav")
assert rate == 16000

%time text, *_ = speech2text(speech)

CPU times: user 2.96 s, sys: 19 ms, total: 2.98 s
Wall time: 2.98 s


In [12]:
# Shell out to `soxi` to show the second file's header info (channels, sample rate, duration, encoding).
!soxi example_audio/oden_kypsis16k_subset2.wav


Input File     : 'example_audio/oden_kypsis16k_subset2.wav'
Channels       : 1
Sample Rate    : 16000
Precision      : 16-bit
Duration       : 00:00:09.19 = 146983 samples ~ 688.983 CDDA sectors
File Size      : 294k
Bit Rate       : 256k
Sample Encoding: 16-bit Signed Integer PCM



In [13]:
# Align the second transcript and collect per-word timestamps in a DataFrame,
# using get_timestamps' default time_correction; %time reports the cost of the call.
%time df_times=get_timestamps(aligner, speech, text[0])

CPU times: user 309 ms, sys: 8.51 ms, total: 318 ms
Wall time: 312 ms


In [14]:
# Display up to the first 20 rows of the timestamp table.
df_times.head(20)

Unnamed: 0,start,end,confidence,words
0,0.260173,0.661328,-0.049087,klikid
1,0.661328,0.821789,-0.003573,neid
2,0.823233,1.78456,-0.001952,allserva
3,1.78456,1.985137,-0.034099,tekivad
4,2.548197,3.068255,-3.7e-05,need
5,3.068255,4.031025,-0.008919,lubaküpsiseid
6,4.754546,5.274604,-0.000385,mis
7,5.274604,5.415008,-0.078755,on
8,5.415008,5.555412,-0.000224,nagu
9,5.555412,5.83622,-0.000488,ilusti
