csukuangfj committed
Commit
9e8a5d8
1 Parent(s): 4777950

add app and model

Files changed (4)
  1. app.py +294 -0
  2. examples.py +52 -0
  3. model.py +121 -0
  4. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,294 @@
+ #!/usr/bin/env python3
+ #
+ # Copyright 2022-2024 Xiaomi Corp. (authors: Fangjun Kuang)
+ #
+ # See LICENSE for clarification regarding multiple authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # References:
+ # https://gradio.app/docs/#dropdown
+
+ import logging
+ import os
+ import tempfile
+ import time
+ import urllib.request
+ from datetime import datetime
+
+ from examples import examples
+ import gradio as gr
+ import soundfile as sf
+
+ from model import decode, get_pretrained_model, whisper_models
+
+
+ def convert_to_wav(in_filename: str) -> str:
+     """Convert the input audio file to a wave file"""
+     out_filename = in_filename + ".wav"
+     logging.info(f"Converting '{in_filename}' to '{out_filename}'")
+
+     _ = os.system(
+         f"ffmpeg -hide_banner -i '{in_filename}' -ar 16000 -ac 1 '{out_filename}' -y"
+     )
+
+     return out_filename
+
+
+ def build_html_output(s: str, style: str = "result_item_success"):
+     return f"""
+     <div class='result'>
+         <div class='result_item {style}'>
+           {s}
+         </div>
+     </div>
+     """
+
+
+ def process_url(
+     repo_id: str,
+     url: str,
+ ):
+     logging.info(f"Processing URL: {url}")
+     with tempfile.NamedTemporaryFile() as f:
+         try:
+             urllib.request.urlretrieve(url, f.name)
+
+             return process(
+                 in_filename=f.name,
+                 repo_id=repo_id,
+             )
+         except Exception as e:
+             logging.info(str(e))
+             return "", build_html_output(str(e), "result_item_error")
+
+
+ def process_uploaded_file(
+     repo_id: str,
+     in_filename: str,
+ ):
+     if in_filename is None or in_filename == "":
+         return "", build_html_output(
+             "Please first upload a file and then click "
+             'the button "submit for recognition"',
+             "result_item_error",
+         )
+
+     logging.info(f"Processing uploaded file: {in_filename}")
+     try:
+         return process(
+             in_filename=in_filename,
+             repo_id=repo_id,
+         )
+     except Exception as e:
+         logging.info(str(e))
+         return "", build_html_output(str(e), "result_item_error")
+
+
+ def process_microphone(
+     repo_id: str,
+     in_filename: str,
+ ):
+     if in_filename is None or in_filename == "":
+         return "", build_html_output(
+             "Please first click 'Record from microphone', speak, "
+             "click 'Stop recording', and then "
+             "click the button 'submit for recognition'",
+             "result_item_error",
+         )
+
+     logging.info(f"Processing microphone: {in_filename}")
+     try:
+         return process(
+             in_filename=in_filename,
+             repo_id=repo_id,
+         )
+     except Exception as e:
+         logging.info(str(e))
+         return "", build_html_output(str(e), "result_item_error")
+
+
+ def process(
+     repo_id: str,
+     in_filename: str,
+ ):
+     logging.info(f"repo_id: {repo_id}")
+     logging.info(f"in_filename: {in_filename}")
+
+     filename = convert_to_wav(in_filename)
+
+     now = datetime.now()
+     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
+     logging.info(f"Started at {date_time}")
+
+     start = time.time()
+
+     slid = get_pretrained_model(repo_id)
+
+     lang = decode(slid, filename)
+
+     date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
+     end = time.time()
+
+     info = sf.info(filename)
+     duration = info.duration
+
+     elapsed = end - start
+     rtf = elapsed / duration
+
+     logging.info(f"Finished at {date_time}. Elapsed: {elapsed:.3f} s")
+
+     info = f"""
+     Wave duration  : {duration: .3f} s <br/>
+     Processing time: {elapsed: .3f} s <br/>
+     RTF: {elapsed: .3f}/{duration: .3f} = {rtf:.3f} <br/>
+     """
+     if rtf > 1:
+         info += (
+             "<br/>We are loading the model for the first run. "
+             "Please run again to measure the real RTF.<br/>"
+         )
+
+     logging.info(info)
+     logging.info(f"\nrepo_id: {repo_id}\nDetected language: {lang}")
+
+     return lang, build_html_output(info)
+
+
+ title = "# Spoken Language Identification: [Next-gen Kaldi](https://github.com/k2-fsa) + [Whisper](https://github.com/openai/whisper/)"
+ description = """
+ This space shows how to do spoken language identification with [Next-gen Kaldi](https://github.com/k2-fsa)
+ using [Whisper](https://github.com/openai/whisper/) multilingual models.
+
+ It is running on a machine with 2 vCPUs and 16 GB of RAM within a Docker container provided by Hugging Face.
+
+ For more information, please visit the following link:
+
+ - <https://github.com/k2-fsa/sherpa-onnx>
+
+ If you want to deploy it locally, please see
+ <https://k2-fsa.github.io/sherpa/onnx>
+ """
+
+ # css style is copied from
+ # https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
+ css = """
+ .result {display:flex;flex-direction:column}
+ .result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
+ .result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
+ .result_item_error {background-color:#ff7070;color:white;align-self:start}
+ """
+
+
+ demo = gr.Blocks(css=css)
+
+
+ with demo:
+     gr.Markdown(title)
+     model_choices = list(whisper_models.keys())
+
+     model_dropdown = gr.Dropdown(
+         choices=model_choices,
+         label="Select a model",
+         value=model_choices[0],
+     )
+
+     with gr.Tabs():
+         with gr.TabItem("Upload from disk"):
+             uploaded_file = gr.Audio(
+                 sources=["upload"],  # Choose between "microphone", "upload"
+                 type="filepath",
+                 label="Upload from disk",
+             )
+             upload_button = gr.Button("Submit for recognition")
+             uploaded_output = gr.Textbox(label="Detected language from uploaded file")
+             uploaded_html_info = gr.HTML(label="Info")
+
+             gr.Examples(
+                 examples=examples,
+                 inputs=[
+                     model_dropdown,
+                     uploaded_file,
+                 ],
+                 outputs=[uploaded_output, uploaded_html_info],
+                 fn=process_uploaded_file,
+             )
+
+         with gr.TabItem("Record from microphone"):
+             microphone = gr.Audio(
+                 sources=["microphone"],  # Choose between "microphone", "upload"
+                 type="filepath",
+                 label="Record from microphone",
+             )
+
+             record_button = gr.Button("Submit for recognition")
+             recorded_output = gr.Textbox(label="Detected language from recording")
+             recorded_html_info = gr.HTML(label="Info")
+
+             gr.Examples(
+                 examples=examples,
+                 inputs=[
+                     model_dropdown,
+                     microphone,
+                 ],
+                 outputs=[recorded_output, recorded_html_info],
+                 fn=process_microphone,
+             )
+
+         with gr.TabItem("From URL"):
+             url_textbox = gr.Textbox(
+                 max_lines=1,
+                 placeholder="URL to an audio file",
+                 label="URL",
+                 interactive=True,
+             )
+
+             url_button = gr.Button("Submit for recognition")
+             url_output = gr.Textbox(label="Detected language from URL")
+             url_html_info = gr.HTML(label="Info")
+
+     upload_button.click(
+         process_uploaded_file,
+         inputs=[
+             model_dropdown,
+             uploaded_file,
+         ],
+         outputs=[uploaded_output, uploaded_html_info],
+     )
+
+     record_button.click(
+         process_microphone,
+         inputs=[
+             model_dropdown,
+             microphone,
+         ],
+         outputs=[recorded_output, recorded_html_info],
+     )
+
+     url_button.click(
+         process_url,
+         inputs=[
+             model_dropdown,
+             url_textbox,
+         ],
+         outputs=[url_output, url_html_info],
+     )
+
+     gr.Markdown(description)
+
+ if __name__ == "__main__":
+     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+     logging.basicConfig(format=formatter, level=logging.INFO)
+
+     demo.launch()
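
Note: every UI path above funnels into process(), which returns the detected language plus an HTML info block (duration, elapsed time, RTF). A minimal sketch of driving that handler outside Gradio, assuming ffmpeg is on PATH; the file name example.wav is hypothetical:

    # Sketch: call the app.py handler directly, bypassing the UI.
    from app import process

    lang, html_info = process(repo_id="tiny", in_filename="example.wav")
    print(lang)       # e.g. "German"
    print(html_info)  # duration / elapsed / RTF summary as HTML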
examples.py ADDED
@@ -0,0 +1,52 @@
+ #!/usr/bin/env python3
+ #
+ # Copyright 2022-2024 Xiaomi Corp. (authors: Fangjun Kuang)
+ #
+ # See LICENSE for clarification regarding multiple authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ wavs = [
+     "ar-arabic.wav",
+     "bg-bulgarian.wav",
+     "cs-czech.wav",
+     "da-danish.wav",
+     "de-german.wav",
+     "el-greek.wav",
+     "en-english.wav",
+     "es-spanish.wav",
+     "fa-persian.wav",
+     "fi-finnish.wav",
+     "fr-french.wav",
+     "hi-hindi.wav",
+     "hr-croatian.wav",
+     "id-indonesian.wav",
+     "it-italian.wav",
+     "ja-japanese.wav",
+     "ko-korean.wav",
+     "nl-dutch.wav",
+     "no-norwegian.wav",
+     "po-polish.wav",
+     "pt-portuguese.wav",
+     "ro-romanian.wav",
+     "ru-russian.wav",
+     "sk-slovak.wav",
+     "sv-swedish.wav",
+     "ta-tamil.wav",
+     "tl-tagalog.wav",
+     "tr-turkish.wav",
+     "uk-ukrainian.wav",
+     "zh-chinese.wav",
+ ]
+
+ examples = [["tiny", f"./test_wavs/{w}"] for w in wavs]
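
Each row of examples pairs the default model name with a sample wav path, matching the positional inputs=[model_dropdown, uploaded_file] wiring in app.py. For instance, the first row expands to:

    # examples[0]
    ["tiny", "./test_wavs/ar-arabic.wav"]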
model.py ADDED
@@ -0,0 +1,121 @@
+ # Copyright 2022-2024 Xiaomi Corp. (authors: Fangjun Kuang)
+ #
+ # See LICENSE for clarification regarding multiple authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import wave
+ from functools import lru_cache
+ from typing import Tuple
+
+ import numpy as np
+ import sherpa_onnx
+ from huggingface_hub import hf_hub_download
+ from iso639 import Lang
+
+ sample_rate = 16000
+
+
+ def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
+     """
+     Args:
+       wave_filename:
+         Path to a wave file. It should be single channel and each sample should
+         be 16-bit. Its sample rate does not need to be 16 kHz.
+     Returns:
+       Return a tuple containing:
+        - A 1-D array of dtype np.float32 containing the samples, which are
+          normalized to the range [-1, 1].
+        - Sample rate of the wave file.
+     """
+
+     with wave.open(wave_filename) as f:
+         assert f.getnchannels() == 1, f.getnchannels()
+         assert f.getsampwidth() == 2, f.getsampwidth()  # it is in bytes
+         num_samples = f.getnframes()
+         samples = f.readframes(num_samples)
+         samples_int16 = np.frombuffer(samples, dtype=np.int16)
+         samples_float32 = samples_int16.astype(np.float32)
+
+         samples_float32 = samples_float32 / 32768
+         return samples_float32, f.getframerate()
+
+
+ def decode(
+     slid: sherpa_onnx.SpokenLanguageIdentification,
+     filename: str,
+ ) -> str:
+     s = slid.create_stream()
+     samples, sample_rate = read_wave(filename)
+     s.accept_waveform(sample_rate, samples)
+     lang = slid.compute(s)
+     if lang == "":
+         return "Unknown"
+
+     try:
+         return Lang(lang).name
+     except Exception:
+         return lang
+
+
+ def _get_nn_model_filename(
+     repo_id: str,
+     filename: str,
+     subfolder: str = ".",
+ ) -> str:
+     nn_model_filename = hf_hub_download(
+         repo_id=repo_id,
+         filename=filename,
+         subfolder=subfolder,
+     )
+     return nn_model_filename
+
+
+ @lru_cache(maxsize=8)
+ def get_pretrained_model(name: str) -> sherpa_onnx.SpokenLanguageIdentification:
+     assert name in (
+         "tiny",
+         "base",
+         "small",
+         "medium",
+     ), name
+     full_repo_id = "csukuangfj/sherpa-onnx-whisper-" + name
+     encoder = _get_nn_model_filename(
+         repo_id=full_repo_id,
+         filename=f"{name}-encoder.int8.onnx",
+     )
+
+     decoder = _get_nn_model_filename(
+         repo_id=full_repo_id,
+         filename=f"{name}-decoder.int8.onnx",
+     )
+
+     config = sherpa_onnx.SpokenLanguageIdentificationConfig(
+         whisper=sherpa_onnx.SpokenLanguageIdentificationWhisperConfig(
+             encoder=encoder,
+             decoder=decoder,
+         ),
+         num_threads=2,
+         debug=1,
+         provider="cpu",
+     )
+
+     return sherpa_onnx.SpokenLanguageIdentification(config)
+
+
+ whisper_models = {  # only the keys are used; app.py builds its dropdown from them
+     "tiny": get_pretrained_model,
+     "base": get_pretrained_model,
+     "small": get_pretrained_model,
+     "medium": get_pretrained_model,
+ }
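
For reference, a minimal sketch of using model.py on its own (the model files are fetched from the Hugging Face Hub on first use; the wav must satisfy read_wave's constraints: single channel, 16-bit PCM):

    # Sketch: offline language identification without the Gradio front end.
    from model import decode, get_pretrained_model

    slid = get_pretrained_model("tiny")  # cached by @lru_cache on later calls
    print(decode(slid, "./test_wavs/de-german.wav"))  # e.g. "German"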
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ soundfile
+ numpy
+
+ huggingface_hub
+ sherpa-onnx>=1.9.12
+ iso639-lang