Realtime-whisper-large-v3-turbo

Running on Zero

File size: 1,489 Bytes

5d52c32
6c226f9
 
 
d790c0b
88183ad
1e8d252
6c226f9
17f14b2
f696e7e
6c226f9
 
 
 
 
f696e7e
6c226f9
 
 
5d52c32
3da85d4
1e8d252
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3da85d4
 
3df1d51
46704ba
4731eae
1e8d252
 
3da85d4
3df1d51

import spaces
import torch
import gradio as gr
from transformers import pipeline
import tempfile
import os
import uuid

# Checkpoint identifier and the batch size forwarded to the pipeline on each call.
MODEL_NAME = "ylacombe/whisper-large-v3-turbo"
BATCH_SIZE = 8

# Use CUDA device index 0 when torch reports a GPU; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Speech-recognition pipeline, constructed once at import time.
pipe = pipeline(
    "automatic-speech-recognition",
    model=MODEL_NAME,
    device=device,
    chunk_length_s=30,
)

@spaces.GPU
def transcribe(inputs, previous_transcription):
    """Transcribe one streamed audio chunk and append it to the running text.

    Args:
        inputs: ``(sample_rate, samples)`` pair from the streaming audio
            component, where ``samples`` is a NumPy array shaped
            ``(frames,)`` or ``(frames, channels)`` (assumes the component's
            default ``type="numpy"`` -- confirm if that is ever changed).
            ``None`` is tolerated and treated as "no new audio".
        previous_transcription: Text accumulated so far.

    Returns:
        ``previous_transcription`` with the newly recognised text appended,
        or ``previous_transcription`` unchanged when there is no audio or
        transcription fails (the error is printed, never raised, so one bad
        chunk does not break the stream).
    """
    # Function-scope import so this block does not rely on a file-level change.
    import wave

    if inputs is None:
        return previous_transcription

    # Unique name so overlapping stream calls never share a file.
    filepath = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.wav")

    try:
        sample_rate, samples = inputs

        # WAV PCM needs integer samples: scale floats (assumed to be in
        # [-1, 1]) to 16-bit; integer input is expected to be int16 already.
        if samples.dtype.kind == "f":
            samples = (samples.clip(-1.0, 1.0) * 32767.0).astype("<i2")
        else:
            samples = samples.astype("<i2")
        channels = 1 if samples.ndim == 1 else samples.shape[1]

        # Write a real WAV container (header + frames). Dumping the raw array
        # bytes produces a headerless file the decoder cannot interpret.
        with wave.open(filepath, "wb") as wav_file:
            wav_file.setnchannels(channels)
            wav_file.setsampwidth(2)  # 16-bit samples
            wav_file.setframerate(int(sample_rate))
            wav_file.writeframes(samples.tobytes())

        result = pipe(
            filepath,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": "transcribe"},
            return_timestamps=True,
        )
        return previous_transcription + result["text"]
    except Exception as e:
        print(f"Error during transcription: {e}")
        return previous_transcription  # Keep the text so far if this chunk fails
    finally:
        # Always remove the temporary file, including when transcription raised.
        if os.path.exists(filepath):
            os.remove(filepath)

# Single-column UI: a streaming audio input above the transcription textbox.
with gr.Blocks() as demo, gr.Column():
    input_audio_microphone = gr.Audio(streaming=True)
    output = gr.Textbox(value="", label="Transcription")

    # The textbox is wired as both an input and the output, so every stream
    # call receives the text shown so far and returns the extended text.
    input_audio_microphone.stream(
        fn=transcribe,
        inputs=[input_audio_microphone, output],
        outputs=[output],
        stream_every=2,
        time_limit=45,
        concurrency_limit=None,
    )

demo.queue().launch()