#!/usr/bin/env python3

# Copyright (c)  2023  Xiaomi Corporation
# Author: Fangjun Kuang
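
# This script decodes jfk.wav with an exported Paraformer ONNX model using
# onnxruntime. It expects the following files in the current directory:
#   - model.onnx   the exported Paraformer model
#   - am.mvn       global CMVN statistics
#   - tokens.txt   the token table
#   - jfk.wav      a 16 kHz test wave file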

import kaldi_native_fbank as knf
import librosa
import numpy as np
import onnxruntime


def load_cmvn():
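    # am.mvn stores the global CMVN statistics in a Kaldi-style text format.
    # The two lines of interest look roughly like
    #   <LearnRateCoef> 0 [ v_1 v_2 ... v_N ]
    # with one value per LFR feature dimension (7 * 80 = 560 here). The first
    # such line holds the negated mean and the second the inverse standard
    # deviation, hence the slice [3:-1] below, which drops the tag, the
    # coefficient, and the brackets.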
    neg_mean = None
    inv_std = None

    with open("am.mvn") as f:
        for line in f:
            if not line.startswith("<LearnRateCoef>"):
                continue
            t = line.split()[3:-1]
            t = list(map(float, t))

            if neg_mean is None:
                neg_mean = np.array(t, dtype=np.float32)
            else:
                inv_std = np.array(t, dtype=np.float32)

    return neg_mean, inv_std


def compute_feat():
    sample_rate = 16000
    samples, _ = librosa.load("jfk.wav", sr=sample_rate)
    opts = knf.FbankOptions()
    opts.frame_opts.dither = 0
    opts.frame_opts.snip_edges = False
    opts.frame_opts.samp_freq = sample_rate
    opts.mel_opts.num_bins = 80

    online_fbank = knf.OnlineFbank(opts)
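    # librosa returns float samples in [-1, 1], while the Kaldi-style fbank
    # expects samples in the 16-bit PCM range, hence the scaling by 32768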
    online_fbank.accept_waveform(sample_rate, (samples * 32768).tolist())
    online_fbank.input_finished()

    features = np.stack(
        [online_fbank.get_frame(i) for i in range(online_fbank.num_frames_ready)]
    )
    assert features.data.contiguous is True
    assert features.dtype == np.float32, features.dtype

    # LFR (low frame rate): stack lfr_m consecutive frames into a single
    # 7 * 80 = 560-dim frame and advance by lfr_n frames, reducing the
    # frame rate from 10 ms to 60 ms
    window_size = 7  # lfr_m
    window_shift = 6  # lfr_n

    T = (features.shape[0] - window_size) // window_shift + 1
    # Build the overlapping windows as a zero-copy view; strides are in bytes
    # and each float32 entry takes 4 bytes
    features = np.lib.stride_tricks.as_strided(
        features,
        shape=(T, features.shape[1] * window_size),
        strides=((window_shift * features.shape[1]) * 4, 4),
    )
    neg_mean, inv_std = load_cmvn()
    # Global CMVN: (x - mean) / std, with the mean stored negated and the
    # standard deviation stored as its reciprocal
    features = (features + neg_mean) * inv_std
    return features


# tokens.txt in paraformer has only one column,
# while it has two columns in sherpa-onnx.
# This function can handle tokens.txt from both paraformer and sherpa-onnx.
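#
# Illustrative layouts (contents are made up):
#   paraformer tokens.txt :  <token>
#   sherpa-onnx tokens.txt:  <token> <id>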
def load_tokens():
    ans = dict()
    with open("tokens.txt", encoding="utf-8") as f:
        for i, line in enumerate(f):
            # Keep only the first column so both layouts are handled
            ans[i] = line.strip().split()[0]
    return ans


def main():
    features = compute_feat()  # (T, 7 * 80)
    features = np.expand_dims(features, axis=0)  # (1, T, 7 * 80), batch size 1
    features_length = np.array([features.shape[1]], dtype=np.int32)

    session_opts = onnxruntime.SessionOptions()
    session_opts.log_severity_level = 3  # error level
    sess = onnxruntime.InferenceSession("model.onnx", session_opts)

    inputs = {
        "speech": features,
        "speech_lengths": features_length,
    }
    output_names = ["logits"]

    try:
        outputs = sess.run(output_names, input_feed=inputs)
    except Exception:
        # Inference can fail on degenerate input, e.g. pure silence or noise
        print("Input wav is silence or noise")
        return

    # outputs[0] has shape (N, T, vocab_size); remove the batch dimension
    log_probs = outputs[0].squeeze(0)
    # Greedy search: take the most likely token at every position
    y = log_probs.argmax(axis=-1)

    tokens = load_tokens()
    # Skip ids 0 and 2, which map to special symbols in tokens.txt
    text = "".join([tokens[i] for i in y if i not in (0, 2)])
    print(text)


if __name__ == "__main__":
    main()