"""Gradio app: transcribe an uploaded audio file and run speaker diarization.

Transcription uses either a fine-tuned Yoruba HF pipeline (model choice
"transcriber") or an OpenAI Whisper model; diarization uses
pyannote/speaker-diarization-3.1.
"""
import os
import tempfile

import gradio as gr
import whisper
from pyannote.audio import Pipeline
from transformers import pipeline

# Hugging Face access token required by the gated pyannote diarization model.
# NOTE(review): the original code referenced an undefined name HF_TOKEN
# (NameError at runtime); read it from the environment instead -- confirm
# the deployment actually sets this variable.
HF_TOKEN = os.environ.get("HF_TOKEN")


def load_models(model_size):
    """Load the requested ASR backend.

    Args:
        model_size: The literal string "transcriber" (fine-tuned HF
            pipeline) or a Whisper size name ("base", "small", ...).

    Returns:
        (transcriber, whisper_model): exactly one of the two is non-None.
    """
    if model_size == "transcriber":
        transcriber = pipeline(
            "automatic-speech-recognition",
            model="clinifyemr/yoruba-model-finetuned",
        )
        return transcriber, None
    return None, whisper.load_model(model_size)


def _audio_to_temp_wav(audio_file):
    """Copy *audio_file* (filepath str or file-like object) to a temp .wav.

    gr.Audio(type="filepath") hands the callback a path string, not a file
    object -- the original code called .seek()/.read() on it and crashed.

    Returns the temp file path; the caller is responsible for deleting it.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        if isinstance(audio_file, (str, os.PathLike)):
            with open(audio_file, "rb") as src:
                tmp.write(src.read())
        else:
            # Keep backward compatibility with file-like callers.
            audio_file.seek(0)
            tmp.write(audio_file.read())
        return tmp.name


def process_audio(audio_file, num_speakers, model_size):
    """Transcribe *audio_file* and run speaker diarization on it.

    Args:
        audio_file: Path (or file-like object) of the uploaded audio.
        num_speakers: Minimum number of speakers to assume (max is 5).
        model_size: Whisper size name or "transcriber".

    Returns:
        (transcription_text, segments) on success, where segments is a
        JSON-serializable list of {"start", "end", "speaker"} dicts;
        (None, {"error": message}) on failure.  The original returned Flask
        jsonify() responses, which both fail outside a Flask app context and
        cannot be unpacked by gradio_interface.
    """
    if audio_file is None:
        return None, {"error": "Audio file is required"}

    tmp_path = None
    try:
        transcriber, whisper_model = load_models(model_size)
        tmp_path = _audio_to_temp_wav(audio_file)

        if transcriber is not None:
            transcription_text = transcriber(tmp_path)["text"]
        else:
            transcription_text = whisper_model.transcribe(tmp_path)["text"]

        diarization_pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1", use_auth_token=HF_TOKEN
        )
        diarization = diarization_pipeline(
            tmp_path, min_speakers=num_speakers, max_speakers=5
        )
        # pyannote Annotation objects are not JSON-serializable (and
        # Timeline has no .json() method, which the original called);
        # flatten the speaker turns into plain dicts instead.
        segments = [
            {"start": turn.start, "end": turn.end, "speaker": speaker}
            for turn, _, speaker in diarization.itertracks(yield_label=True)
        ]
        return transcription_text, segments
    except Exception as e:
        return None, {"error": f"Error processing audio file: {e}"}
    finally:
        # Always clean up the temp copy exactly once.  The original error
        # path could hit an unbound tmp_path (UnboundLocalError) or try to
        # delete a file already removed on the success path.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)


def gradio_interface(audio_file, num_speakers, model_size):
    """Gradio callback: returns (transcription text, diarization JSON)."""
    transcription, diarization = process_audio(audio_file, num_speakers, model_size)
    if transcription is None:
        # diarization carries the {"error": ...} payload in this case.
        return "Error in processing audio file", diarization
    return transcription, diarization


iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Dropdown(choices=[1, 2, 3, 4, 5], label="Number of Speakers"),
        gr.Dropdown(
            choices=["base", "small", "medium", "large", "transcriber"],
            label="Model Selection",
        ),
    ],
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.JSON(label="Diarization Output"),
    ],
    title="Audio Transcription and Speaker Diarization",
    description="Upload your audio file to transcribe and analyze speaker diarization.",
)

if __name__ == "__main__":
    iface.launch()