import gradio as gr
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_io as tfio
from huggingface_hub import from_pretrained_keras


model = from_pretrained_keras("keras-io/ctc_asr", compile=False)

characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
# Mapping characters to integers
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# Mapping integers back to original characters
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
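
# Sanity-check sketch (assuming StringLookup's defaults of num_oov_indices=1 and no
# mask token): index 0 is reserved for the OOV token "", so the vocabulary starts at 1.
#   char_to_num(tf.constant(["a", "b", " "]))  ->  [1, 2, 30]
#   num_to_char([1, 2, 30])                    ->  ["a", "b", " "]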

# An integer scalar Tensor. The window length in samples.
frame_length = 256
# An integer scalar Tensor. The number of samples to step.
frame_step = 160
# An integer scalar Tensor. The size of the FFT to apply.
# If not provided, uses the smallest power of 2 enclosing frame_length.
fft_length = 384
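
# Rough geometry sketch (assuming SAMPLE_RATE = 22050 below): a 1-second clip has
# 22050 samples, so tf.signal.stft yields 1 + (22050 - 256) // 160 = 137 frames with
# fft_length // 2 + 1 = 193 frequency bins each, i.e. a [137, 193] spectrogram.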

# Target sample rate for inference; LJSpeech, which the model was trained on, uses 22.05 kHz
SAMPLE_RATE = 22050


def decode_batch_predictions(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, you can use beam search
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Iterate over the results and get back the text
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text
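
# Beam-search alternative (sketch; the beam_width value here is just an assumption):
#   results = keras.backend.ctc_decode(
#       pred, input_length=input_len, greedy=False, beam_width=100
#   )[0][0]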


def load_16k_audio_wav(filename):
    # Read file content
    file_content = tf.io.read_file(filename)

    # Decode audio wave
    audio_wav, sample_rate = tf.audio.decode_wav(file_content, desired_channels=1)
    audio_wav = tf.squeeze(audio_wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)

    # Resample to the model's sample rate (note: SAMPLE_RATE is 22.05 kHz, not 16 kHz,
    # despite this function's name)
    audio_wav = tfio.audio.resample(
        audio_wav, rate_in=sample_rate, rate_out=SAMPLE_RATE
    )

    return audio_wav


def mic_to_tensor(recorded_audio_file):
    # Gradio's microphone component returns a (sample_rate, numpy array) tuple
    sample_rate, audio = recorded_audio_file

    audio_wav = tf.constant(audio, dtype=tf.float32)
    # Down-mix stereo recordings to mono
    if tf.rank(audio_wav) > 1:
        audio_wav = tf.reduce_mean(audio_wav, axis=1)
    # Resample to the sample rate the model expects
    audio_wav = tfio.audio.resample(
        audio_wav, rate_in=sample_rate, rate_out=SAMPLE_RATE
    )

    # Peak-normalise to [-1, 1]; the small floor guards against silent clips
    peak = tf.reduce_max(tf.abs(audio_wav))
    audio_wav = audio_wav / tf.maximum(peak, 1e-9)

    return audio_wav


def tensor_to_predictions(audio_tensor):
    # 1. Cast the waveform to float32
    audio_tensor = tf.cast(audio_tensor, tf.float32)

    # 2. Compute the short-time Fourier transform (spectrogram)
    spectrogram = tf.signal.stft(
        audio_tensor,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )

    # 3. We only need the magnitude; take the absolute value and compress it
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)

    # 4. Normalise each frame to zero mean and unit variance
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)

    # 5. Add a batch dimension before running inference
    spectrogram = tf.expand_dims(spectrogram, axis=0)

    batch_predictions = model.predict(spectrogram)
    batch_predictions = decode_batch_predictions(batch_predictions)
    return batch_predictions
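
# End-to-end usage sketch (the file path here is hypothetical, shown for illustration only):
#   transcription = tensor_to_predictions(load_16k_audio_wav("sample.wav"))[0]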


def clear_inputs_and_outputs():
    # Reset both audio inputs and the output label
    return [None, None, None]


def predict(recorded_audio_file, uploaded_audio_file):
    # Prefer the microphone recording if present; otherwise read the uploaded wav file
    if recorded_audio_file:
        audio_tensor = mic_to_tensor(recorded_audio_file)
    else:
        audio_tensor = load_16k_audio_wav(uploaded_audio_file)

    prediction = tensor_to_predictions(audio_tensor)[0]
    return prediction


# Build and launch the Gradio demo
if __name__ == "__main__":
    demo = gr.Blocks()

    with demo:
        gr.Markdown(
            """
            <center><h1>Automatic Speech Recognition using CTC</h1></center>
            This space demonstrates Automatic Speech Recognition with a Keras model trained on the LJSpeech dataset.<br>
            Record your own voice or upload a wav file, and the model will transcribe the English speech.<br><br>
            """
        )
        with gr.Row():
            # Inputs
            with gr.Column():
                mic_input = gr.Audio(source="microphone", label="Record your own voice")
                upl_input = gr.Audio(
                    source="upload", type="filepath", label="Upload a wav file"
                )

                with gr.Row():
                    clr_btn = gr.Button(value="Clear", variant="secondary")
                    prd_btn = gr.Button(value="Predict")

            # Outputs
            with gr.Column():
                lbl_output = gr.Label(label="Text")

        # Credits
        with gr.Row():
            gr.Markdown(
                """
                <h4>Credits</h4>
                Author: <a href="https://twitter.com/anuragcomm"> Anurag Singh</a>.<br>
                Based on the Keras example <a href="https://keras.io/examples/audio/ctc_asr">Automatic Speech Recognition using CTC</a> by <a href="https://rbouadjenek.github.io/">Mohamed Reda Bouadjenek</a> and <a href="https://www.linkedin.com/in/parkerhuynh/">Ngoc Dung Huynh</a>.<br>
                Check out the model <a href="https://huggingface.co/keras-io/ctc_asr">here</a>.
                """
            )

        clr_btn.click(
            fn=clear_inputs_and_outputs,
            inputs=[],
            outputs=[mic_input, upl_input, lbl_output],
        )
        prd_btn.click(
            fn=predict,
            inputs=[mic_input, upl_input],
            outputs=[lbl_output],
        )

    demo.launch(debug=True)