# Hugging Face Space app: compare four Arabic Whisper-style ASR models
# on the same microphone recording.
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoFeatureExtractor
import torch
import numpy as np

# The four Arabic-focused, Whisper-style ASR checkpoints to compare.
# Each must be compatible with WhisperTokenizer / WhisperFeatureExtractor.
model_ids = [
    "IJyad/whisper-large-v3-Tarteel",
    "deepdml/whisper-medium-ar-quran-mix-norm",
    "naazimsnh02/whisper-large-v3-turbo-ar-quran",
    "Habib-HF/tarbiyah-ai-whisper-medium-merged",
]

# Lazily-populated cache of ASR pipelines keyed by model id, so each
# checkpoint is loaded onto the device at most once (saves GPU VRAM).
_registry = {}
def _get_pipeline(model_id):
    """Return a cached ASR pipeline for ``model_id``, building it on first use."""
    cached = _registry.get(model_id)
    if cached is None:
        # The "automatic-speech-recognition" task wires up the matching
        # tokenizer and feature extractor for Whisper-style checkpoints.
        cached = pipeline(
            "automatic-speech-recognition",
            model=model_id,
            device=0 if torch.cuda.is_available() else -1,
        )
        _registry[model_id] = cached
    return cached
# Single transcription function that runs all 4 models on one mic capture.
def compare_on_mic(audio):
    """Transcribe one Gradio microphone recording with every model in ``model_ids``.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray] | None
        Gradio ``type="numpy"`` payload ``(sample_rate, samples)``. The mic
        delivers integer PCM (int16 from browsers), shaped ``(n,)`` for mono
        or ``(n, channels)`` for multi-channel audio.

    Returns
    -------
    list[str]
        Five strings: one Markdown-formatted transcription per model,
        followed by all of them merged into a single comparison string.
    """
    if audio is None:
        return ["No audio input"] * 5
    sr, y = audio
    # Whisper expects float32 samples in [-1.0, 1.0]. Rescale any integer
    # PCM by its dtype's full range (int16 -> /32768.0, the standard
    # Whisper normalization); coerce float input to float32 as-is.
    if np.issubdtype(y.dtype, np.integer):
        y = y.astype(np.float32) / float(np.iinfo(y.dtype).max + 1)
    else:
        y = y.astype(np.float32, copy=False)
    # Downmix to mono. Gradio audio is channel-last, i.e. (samples, channels),
    # so average over axis=1 — axis=0 would collapse the whole recording
    # into a single value per channel and destroy the signal.
    if y.ndim > 1:
        y = y.mean(axis=1)
    all_texts = []
    for model_id in model_ids:
        try:
            pipe = _get_pipeline(model_id)
            # Pass the normalized float32 waveform with its sampling rate.
            result = pipe({"sampling_rate": sr, "raw": y})
            text = result["text"].strip()
        except Exception as e:
            # Surface per-model failures inline without aborting the others.
            text = f"[Error: {str(e)[:80]}]"
        all_texts.append(f"**{model_id.split('/')[-1]}**: {text}")
    merged_text = "\n\n".join(all_texts)
    return all_texts + [merged_text]  # 4 individual + 1 merged
# Build the Gradio layout: mic input, one column per model, one merged view.
with gr.Blocks(title="Compare 4 Arabic Quran Whisper Models") as demo:
    gr.Markdown("""
    # Compare Whisper‑style ASR models on mic samples
    Click **Record** and speak (preferably Arabic Qur’ān / tajweed content).
    All 4 models will transcribe the **same** mic buffer side‑by‑side.
    """)

    with gr.Row():
        mic_input = gr.Microphone(
            label="🎙️ Mic Input",
            type="numpy",
            interactive=True,
        )

    # One heading + transcription box per model, built in a loop so the
    # layout stays in sync with the number of models.
    headings = (
        "### 1. `IJyad/whisper-large-v3-Tarteel`",
        "### 2. `deepdml/whisper-medium-ar-quran-mix-norm`",
        "### 3. `naazimsnh02/whisper-large-v3-turbo-ar-quran`",
        "### 4. `Habib-HF/tarbiyah-ai-whisper-medium-merged`",
    )
    model_outputs = []
    with gr.Row():
        for heading in headings:
            with gr.Column():
                gr.Markdown(heading)
                model_outputs.append(gr.Textbox(label="Transcription", lines=4))

    # One big comparison box so differences are visible at a glance.
    with gr.Row():
        gr.Markdown("### Side‑by‑side comparison")
        out_all = gr.Textbox(label="All models together", lines=8)

    # Wire the mic to the inference function: 4 per-model boxes + merged box.
    mic_input.change(
        fn=compare_on_mic,
        inputs=[mic_input],
        outputs=[*model_outputs, out_all],
    )

demo.launch(debug=False)  # Hugging Face Spaces will override host/port