# Hugging Face Space app: compare four Arabic Whisper-style ASR models
# on the same microphone recording.
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoFeatureExtractor
import torch
import numpy as np

# The four Arabic-focused, Whisper-style ASR checkpoints to compare.
# Each must be compatible with WhisperTokenizer / WhisperFeatureExtractor.
model_ids = [
    "IJyad/whisper-large-v3-Tarteel",
    "deepdml/whisper-medium-ar-quran-mix-norm",
    "naazimsnh02/whisper-large-v3-turbo-ar-quran",
    "Habib-HF/tarbiyah-ai-whisper-medium-merged",
]

# Lazily-populated cache of ASR pipelines keyed by model id, so each
# checkpoint is loaded onto the device at most once (saves GPU VRAM).
_registry = {}
def _get_pipeline(model_id):
    """Return a cached ASR pipeline for ``model_id``, building it on first use."""
    cached = _registry.get(model_id)
    if cached is None:
        # The "automatic-speech-recognition" task wires up the matching
        # tokenizer and feature extractor for Whisper-style checkpoints.
        cached = pipeline(
            "automatic-speech-recognition",
            model=model_id,
            device=0 if torch.cuda.is_available() else -1,
        )
        _registry[model_id] = cached
    return cached
# Single transcription function that runs all 4 models on one mic capture.
def compare_on_mic(audio):
    """Transcribe one Gradio microphone recording with every model in ``model_ids``.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray] | None
        Gradio ``type="numpy"`` payload ``(sample_rate, samples)``. The mic
        delivers integer PCM (int16 from browsers), shaped ``(n,)`` for mono
        or ``(n, channels)`` for multi-channel audio.

    Returns
    -------
    list[str]
        Five strings: one Markdown-formatted transcription per model,
        followed by all of them merged into a single comparison string.
    """
    if audio is None:
        return ["No audio input"] * 5
    sr, y = audio
    # Whisper expects float32 samples in [-1.0, 1.0]. Rescale any integer
    # PCM by its dtype's full range (int16 -> /32768.0, the standard
    # Whisper normalization); coerce float input to float32 as-is.
    if np.issubdtype(y.dtype, np.integer):
        y = y.astype(np.float32) / float(np.iinfo(y.dtype).max + 1)
    else:
        y = y.astype(np.float32, copy=False)
    # Downmix to mono. Gradio audio is channel-last, i.e. (samples, channels),
    # so average over axis=1 — axis=0 would collapse the whole recording
    # into a single value per channel and destroy the signal.
    if y.ndim > 1:
        y = y.mean(axis=1)
    all_texts = []
    for model_id in model_ids:
        try:
            pipe = _get_pipeline(model_id)
            # Pass the normalized float32 waveform with its sampling rate.
            result = pipe({"sampling_rate": sr, "raw": y})
            text = result["text"].strip()
        except Exception as e:
            # Surface per-model failures inline without aborting the others.
            text = f"[Error: {str(e)[:80]}]"
        all_texts.append(f"**{model_id.split('/')[-1]}**: {text}")
    merged_text = "\n\n".join(all_texts)
    return all_texts + [merged_text]  # 4 individual + 1 merged
# Build the Gradio layout: mic input, one column per model, one merged view.
with gr.Blocks(title="Compare 4 Arabic Quran Whisper Models") as demo:
    gr.Markdown("""
    # Compare Whisper‑style ASR models on mic samples
    Click **Record** and speak (preferably Arabic Qur’ān / tajweed content).
    All 4 models will transcribe the **same** mic buffer side‑by‑side.
    """)

    with gr.Row():
        mic_input = gr.Microphone(
            label="🎙️ Mic Input",
            type="numpy",
            interactive=True,
        )

    # One heading + transcription box per model, built in a loop so the
    # layout stays in sync with the number of models.
    headings = (
        "### 1. `IJyad/whisper-large-v3-Tarteel`",
        "### 2. `deepdml/whisper-medium-ar-quran-mix-norm`",
        "### 3. `naazimsnh02/whisper-large-v3-turbo-ar-quran`",
        "### 4. `Habib-HF/tarbiyah-ai-whisper-medium-merged`",
    )
    model_outputs = []
    with gr.Row():
        for heading in headings:
            with gr.Column():
                gr.Markdown(heading)
                model_outputs.append(gr.Textbox(label="Transcription", lines=4))

    # One big comparison box so differences are visible at a glance.
    with gr.Row():
        gr.Markdown("### Side‑by‑side comparison")
        out_all = gr.Textbox(label="All models together", lines=8)

    # Wire the mic to the inference function: 4 per-model boxes + merged box.
    mic_input.change(
        fn=compare_on_mic,
        inputs=[mic_input],
        outputs=[*model_outputs, out_all],
    )

demo.launch(debug=False)  # Hugging Face Spaces will override host/port