import torch
import torchaudio
import gradio as gr
import time
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# ✅ 1️⃣ Use "whisper-medium" for a good balance of speed and accuracy
device = "cpu"
torch_dtype = torch.float32
MODEL_NAME = "openai/whisper-medium"

# ✅ 2️⃣ Load Whisper model on CPU
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
)
model.to(device)

# ✅ 3️⃣ Speed up execution with torch.compile()
model = torch.compile(model)  # ✅ Faster inference on CPU
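# torch.compile requires PyTorch 2.0+; the first forward pass pays a one-time
# compilation cost, so the initial transcription will be noticeably slower.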

# ✅ 4️⃣ Load processor & pipeline
processor = AutoProcessor.from_pretrained(MODEL_NAME)
processor.feature_extractor.sampling_rate = 16000  # ✅ Whisper expects 16 kHz (the extractor's default)

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=10,  # ✅ Longer chunks for better accuracy
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={"num_beams": 5, "language": "en"},  # ✅ Beam search (deterministic, so no temperature) + English
)
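
# Quick sanity check (hypothetical file path; file decoding relies on ffmpeg):
#   print(pipe("sample.wav")["text"])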

# ✅ 5️⃣ Real-time streaming transcription (microphone)
def stream_transcribe(stream, new_chunk):
    start_time = time.time()
    try:
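        # Gradio streams (sample_rate, numpy array) tuples; the array is
        # typically int16 at the microphone's native rate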
        sr, y = new_chunk

        # ✅ Convert stereo to mono
        if y.ndim > 1:
            y = y.mean(axis=1)

        # ✅ Normalize to [-1, 1], guarding against silent (all-zero) chunks
        y = y.astype(np.float32)
        peak = np.max(np.abs(y))
        if peak > 0:
            y /= peak

        # ✅ Resample audio to 16 kHz using torchaudio
        y_tensor = torch.tensor(y)
        y_resampled = torchaudio.functional.resample(y_tensor, orig_freq=sr, new_freq=16000).numpy()

        # ✅ Append to the running stream
        if stream is not None:
            stream = np.concatenate([stream, y_resampled])
        else:
            stream = y_resampled
            
        # ✅ Run transcription with optimized parameters
        transcription = pipe({"sampling_rate": 16000, "raw": stream})["text"]
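        # Note: the entire accumulated stream is re-transcribed on every chunk,
        # so per-chunk latency grows with recording length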
        latency = time.time() - start_time

        return stream, transcription, f"{latency:.2f} sec"

    except Exception as e:
        print(f"Error: {e}")
        return stream, str(e), "Error"

# ✅ 6️⃣ Transcription for file upload
def transcribe(inputs, previous_transcription):
    start_time = time.time()
    try:
        sample_rate, audio_data = inputs

        # ✅ Mono float32 in [-1, 1]; torchaudio's resample needs a float tensor
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)
        audio_data = audio_data.astype(np.float32)
        peak = np.max(np.abs(audio_data))
        if peak > 0:
            audio_data /= peak
        audio_tensor = torch.tensor(audio_data)
        resampled_audio = torchaudio.functional.resample(audio_tensor, orig_freq=sample_rate, new_freq=16000).numpy()

        transcription = pipe({"sampling_rate": 16000, "raw": resampled_audio})["text"]

        previous_transcription += transcription
        latency = time.time() - start_time

        return previous_transcription, f"{latency:.2f} sec"

    except Exception as e:
        print(f"Error: {e}")
        return previous_transcription, "Error"

# ✅ 7️⃣ Clear function
def clear():
    return ""

# ✅ 8️⃣ Gradio interface (microphone streaming)
with gr.Blocks() as microphone:
    gr.Markdown(f"# Whisper Medium - High Accuracy Transcription (Optimized CPU) πŸŽ™οΈ")
    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for best speech-to-text performance.")

    with gr.Row():
        input_audio_microphone = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
        output = gr.Textbox(label="Live Transcription", value="")
        latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0")

    with gr.Row():
        clear_button = gr.Button("Clear Output")

    state = gr.State()
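    # state accumulates the waveform between streaming callbacks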
    input_audio_microphone.stream(
        stream_transcribe, [state, input_audio_microphone], 
        [state, output, latency_textbox], time_limit=30, stream_every=1
    )
    clear_button.click(lambda: (None, ""), outputs=[state, output])  # reset both stream and text

# ✅ 9️⃣ Gradio interface (file upload)
with gr.Blocks() as file:
    gr.Markdown(f"# Upload Audio File for Transcription 🎡")
    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for better transcription accuracy.")

    with gr.Row():
        input_audio = gr.Audio(sources=["upload"], type="numpy")
        output = gr.Textbox(label="Transcription", value="")
        latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0")

    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_button = gr.Button("Clear Output")

    submit_button.click(transcribe, [input_audio, output], [output, latency_textbox])
    clear_button.click(clear, outputs=[output])

# ✅ 🔟 Final Gradio app
with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    gr.TabbedInterface([microphone, file], ["Microphone", "Upload Audio"])

# ✅ 1️⃣1️⃣ Run Gradio locally
if __name__ == "__main__":
    demo.launch()
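
# To launch (assuming this file is saved as app.py):
#   pip install torch torchaudio transformers gradio numpy
#   python app.py
# Gradio serves on http://127.0.0.1:7860 by default; pass share=True to
# demo.launch() for a temporary public link.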