File size: 2,864 Bytes
a356f8e
 
 
 
e927cf5
 
a356f8e
 
 
 
 
 
 
 
 
 
 
 
 
 
76e0282
 
 
 
a356f8e
 
 
 
e927cf5
 
 
 
 
 
 
 
a356f8e
 
e927cf5
a356f8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7529d20
 
a356f8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76e0282
a356f8e
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import logging
import warnings

import gradio as gr
import librosa
# import torchaudio
from transformers import pipeline
from transformers.utils.logging import disable_progress_bar

# Silence noisy third-party warnings so the demo's server logs stay readable.
warnings.filterwarnings("ignore")

# Hide the transformers download/progress bars in the logs.
disable_progress_bar()

# ISO-8601-style timestamps. NOTE(review): the trailing "Z" is a literal in the
# format string, not an actual UTC conversion -- %(asctime)s is local time;
# confirm intent before relying on these timestamps being UTC.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] [%(name)s] %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%SZ",
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Fine-tuned French wav2vec2-CTC checkpoint on the Hugging Face Hub.
MODEL_NAME = "bofenghuang/asr-wav2vec2-ctc-french"
# Sample rate (Hz) the model expects; audio is resampled to this on load.
SAMPLE_RATE = 16_000

# Build the ASR pipeline once at import time so every request reuses it.
pipe = pipeline(model=MODEL_NAME)
logger.info("ASR pipeline has been initialized")


def process_audio_file(audio_file):
    """Load an audio file as a mono waveform at SAMPLE_RATE Hz.

    Parameters
    ----------
    audio_file : str
        Path to the audio file to load.

    Returns
    -------
    numpy.ndarray
        1-D float waveform resampled to SAMPLE_RATE when necessary.
    """
    # sr=None loads at the file's native sample rate. The previous code used
    # librosa's default (sr=22_050), which implicitly resampled every file
    # native -> 22_050 -> 16_000: two resampling passes instead of one,
    # degrading quality and doing the work twice.
    waveform, sample_rate = librosa.load(audio_file, sr=None, mono=True)

    # Resample only when the native rate differs from the model's rate.
    if sample_rate != SAMPLE_RATE:
        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=SAMPLE_RATE)

    return waveform


def transcribe(microphone_audio_file, uploaded_audio_file):
    """Transcribe audio coming from the microphone or an uploaded file.

    The microphone recording wins when both inputs are provided (a warning is
    prepended to the result); an error string is returned when neither is.

    Parameters
    ----------
    microphone_audio_file : str or None
        Filepath of the microphone recording, if any.
    uploaded_audio_file : str or None
        Filepath of the uploaded audio, if any.

    Returns
    -------
    str
        The transcription, optionally prefixed with a warning, or an
        error message when no input was given.
    """
    # Guard clause: no input at all.
    if microphone_audio_file is None and uploaded_audio_file is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warning_message = ""
    if microphone_audio_file is not None and uploaded_audio_file is not None:
        warning_message = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # Microphone takes precedence over the upload.
    audio_file = microphone_audio_file if microphone_audio_file is not None else uploaded_audio_file

    audio_data = process_audio_file(audio_file)

    # Chunked inference with striding lets the pipeline handle audio of
    # arbitrary length instead of failing on long inputs.
    text = pipe(audio_data, chunk_length_s=30, stride_length_s=5)["text"]
    # Lazy %-style args: formatting is skipped when the log level is disabled.
    logger.info("Transcription for %s: %s", audio_file, text)

    return warning_message + text


# Gradio UI: two optional audio inputs (microphone recording or file upload)
# feeding `transcribe`; the result is rendered as plain text.
# NOTE(review): `source=`/`optional=` on gr.Audio, the `layout` kwarg, and
# `enable_queue` were removed in Gradio 3.x+. This file evidently targets an
# older pinned Gradio release -- verify the pin before upgrading.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="Record something...", optional=True),
        gr.Audio(source="upload", type="filepath", label="Upload some audio file...", optional=True),
    ],
    outputs="text",
    layout="horizontal",
    # theme="huggingface",
    title="Speech-to-Text in French",
    # Typo fix: "the the" -> "the" in the user-facing description.
    description=f"Realtime demo for French automatic speech recognition. Demo uses the fine-tuned checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and πŸ€— Transformers to transcribe audio files of arbitrary length.",
    allow_flagging="never",
)

# iface.launch(server_name="0.0.0.0", debug=True, share=True)
# Queue requests so long transcriptions don't trip the default request timeout.
iface.launch(enable_queue=True)