import gradio as gr
import torchaudio
import soundfile as sf
import torch
from transformers import pipeline

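# Load both fine-tuned Whisper checkpoints once at startup so every request
# reuses the same pipeline objects.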
models = {
    "moulsot_v0.1_2500": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.1_2500"),
    "moulsot_v0.2_1000": pipeline("automatic-speech-recognition", model="01Yassine/moulsot_v0.2_1000"),
}
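# Work around the transformers deprecation of forced_decoder_ids: stash the old
# value on the generation config, then clear it so generate() relies on the
# pipeline's language/task handling instead.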
for m in models.values():
    m.model.generation_config.input_ids = m.model.generation_config.forced_decoder_ids
    m.model.generation_config.forced_decoder_ids = None


def load_audio(audio_path):
    """Robustly load any audio file into (waveform, sr)."""
    try:
        waveform, sr = torchaudio.load(audio_path)
    except Exception:
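        # torchaudio could not decode the file; fall back to soundfile, which
        # returns data as (frames,) or (frames, channels).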
        data, sr = sf.read(audio_path)
        waveform = torch.tensor(data, dtype=torch.float32).T
    if waveform.ndim == 1:
        waveform = waveform.unsqueeze(0)
    return waveform, sr


def ensure_mono_16k(audio_path):
    """Convert audio to mono + 16 kHz."""
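    # Whisper expects 16 kHz mono input: downmix by averaging channels, then resample if needed.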
    waveform, sr = load_audio(audio_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)
        sr = 16000
    return waveform, sr


def trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01):
    """Trim leading silence, keeping at most keep_ms ms of it."""
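    # Mean absolute amplitude per sample across channels; values above `threshold` count as speech.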
    energy = waveform.abs().mean(dim=0)
    non_silence_idx = (energy > threshold).nonzero(as_tuple=True)[0]
    if len(non_silence_idx) == 0:
        return waveform
    first_non_silence = non_silence_idx[0].item()
    keep_samples = int(sr * (keep_ms / 1000.0))
    start = max(0, first_non_silence - keep_samples)
    return waveform[:, start:]


def preprocess_audio(audio_path):
    waveform, sr = ensure_mono_16k(audio_path)
    waveform = trim_leading_silence(waveform, sr, keep_ms=100, threshold=0.01)
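    # Write the processed clip to a temporary WAV so the ASR pipelines can read it by path.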
    tmp_path = "/tmp/processed_trimmed.wav"
    torchaudio.save(tmp_path, waveform, sr)
    return tmp_path


def transcribe(audio, selected_model):
    if audio is None:
        return "Please record or upload an audio file.", "Please record or upload an audio file."

    processed_audio = preprocess_audio(audio)

    pipe_selected = models[selected_model]
    other_model = [k for k in models if k != selected_model][0]
    pipe_other = models[other_model]
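
    # Run both models on the same preprocessed clip so the outputs are directly comparable.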
    result_selected = pipe_selected(processed_audio)["text"]
    result_other = pipe_other(processed_audio)["text"]

    return result_selected, result_other


title = "🎙️ Moulsot ASR Comparison"
description = """
Compare two fine-tuned Whisper models for **Darija ASR**:
- 🟩 **moulsot_v0.1_2500**
- 🟦 **moulsot_v0.2_1000**

You can **record** or **upload** an audio sample.
The app automatically:
- converts to **16 kHz mono**
- **removes leading silence** (≤ 0.1 s)

Then both models transcribe the result side by side.
"""
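# UI layout: audio input and model picker side by side, a transcribe button,
# and one output box per model.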
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}\n{description}")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="🎤 Record or Upload Audio"
        )
        model_choice = gr.Radio(
            ["moulsot_v0.1_2500", "moulsot_v0.2_1000"],
            label="Choose Primary Model",
            value="moulsot_v0.1_2500"
        )

    transcribe_btn = gr.Button("Transcribe")

    with gr.Row():
        output_selected = gr.Textbox(label="🟩 Selected Model Output")
        output_other = gr.Textbox(label="🟦 Other Model Output")

    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, model_choice],
        outputs=[output_selected, output_other]
    )


if __name__ == "__main__":
    demo.launch()