# Hugging Face Space by DereAbdulhameed — "Update app.py" (commit 1073b8a, verified)
import os
import tempfile
from functools import lru_cache

import gradio as gr
import whisper
from pyannote.audio import Pipeline
from pydub import AudioSegment
from transformers import pipeline
@lru_cache(maxsize=None)
def load_models(model_size):
    """Load (and cache) the ASR backend selected by *model_size*.

    Parameters
    ----------
    model_size : str
        ``"transcriber"`` to use the fine-tuned Yoruba model served through
        a ``transformers`` pipeline; any other value is treated as a Whisper
        checkpoint name (``"base"``, ``"small"``, ``"medium"``, ``"large"``).

    Returns
    -------
    tuple
        ``(transformers_pipeline, None)`` for ``"transcriber"``, otherwise
        ``(None, whisper_model)`` — exactly one slot is ever populated.

    Notes
    -----
    ``lru_cache`` keeps each loaded model alive across requests; the original
    code re-downloaded/re-initialized the model on every call, which is very
    expensive for a web app.  The key space is the small, fixed set of
    model-size strings, so the cache is bounded in practice.
    """
    if model_size == "transcriber":
        # Fine-tuned Yoruba ASR model served via a transformers pipeline.
        transcriber = pipeline(
            "automatic-speech-recognition",
            model="clinifyemr/yoruba-model-finetuned",
        )
        return transcriber, None
    # Anything else is interpreted as a Whisper checkpoint size.
    return None, whisper.load_model(model_size)
from flask import jsonify
import tempfile
import os
import io
def process_audio(audio_file, num_speakers, model_size):
    """Transcribe an audio file and run speaker diarization on it.

    Parameters
    ----------
    audio_file : str or file-like or None
        Path to an audio file on disk (what ``gr.Audio(type="filepath")``
        supplies), or an open binary file-like object.
    num_speakers : int
        Minimum number of speakers to assume during diarization
        (``max_speakers`` is capped at 5, matching the UI dropdown).
    model_size : str
        Passed straight to ``load_models`` — ``"transcriber"`` or a
        Whisper checkpoint name.

    Returns
    -------
    tuple
        ``(transcription_text, diarization_segments)`` on success, where
        ``diarization_segments`` is a JSON-serializable list of
        ``{"start", "end", "speaker"}`` dicts; ``(error_message, None)``
        on failure.  This matches the two-value unpacking performed by
        ``gradio_interface`` — the original returned a ``flask.jsonify``
        Response (which also crashes outside a Flask app context) that the
        caller could not unpack.
    """
    if audio_file is None:
        return "Audio file is required", None

    transcriber, whisper_model = load_models(model_size)

    tmp_path = None  # only set when we had to spill a file-like to disk
    try:
        if isinstance(audio_file, str):
            # gr.Audio(type="filepath") hands us a path already on disk.
            audio_path = audio_file
        else:
            # File-like object: write it to a temporary .wav so the ASR and
            # diarization pipelines (which expect a path) can read it.
            audio_file.seek(0)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                tmp.write(audio_file.read())
                tmp_path = tmp.name
            audio_path = tmp_path

        if transcriber:
            transcription_text = transcriber(audio_path)["text"]
        elif whisper_model:
            transcription_text = whisper_model.transcribe(audio_path)["text"]
        else:
            transcription_text = None
        if transcription_text is None:
            raise ValueError("No transcription results")

        # NOTE(review): HF_TOKEN was previously an undefined name (NameError
        # at runtime); the gated-model token is now read from the environment.
        diarization_pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=os.environ.get("HF_TOKEN"),
        )
        diarization = diarization_pipeline(
            audio_path, min_speakers=num_speakers, max_speakers=5
        )
        # Flatten the pyannote Annotation into plain dicts so gr.JSON can
        # render it (Timeline objects are not JSON-serializable as-is).
        segments = [
            {"start": turn.start, "end": turn.end, "speaker": speaker}
            for turn, _, speaker in diarization.itertracks(yield_label=True)
        ]
        return transcription_text, segments
    except Exception as e:
        return f"Error processing audio file: {e}", None
    finally:
        # Remove the temp file on success AND failure; the original leaked
        # it when an exception fired before tmp_path was assigned, and also
        # crashed on the unbound name in that case.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)
def gradio_interface(audio_file, num_speakers, model_size):
    """Gradio callback: delegate to process_audio and adapt its result.

    Returns a ``(transcription, diarization)`` pair for the two output
    components, substituting fixed error strings when either half of the
    result is missing.
    """
    transcription, diarization = process_audio(
        audio_file, num_speakers, model_size
    )
    result_incomplete = transcription is None or diarization is None
    if result_incomplete:
        return "Error in processing audio file", "No diarization result"
    return transcription, diarization
# Wire the callback into a Gradio UI: one audio upload plus two dropdowns in,
# a transcription textbox and a diarization JSON viewer out.
iface = gr.Interface(
fn=gradio_interface,
inputs=[
# type="filepath" means the callback receives a path string, not a file object.
gr.Audio(type="filepath", label="Upload Audio"),
gr.Dropdown(choices=[1,2,3,4,5], label="Number of Speakers"),
# "transcriber" selects the fine-tuned HF model; the rest are Whisper sizes.
gr.Dropdown(choices=['base', 'small', 'medium', 'large', 'transcriber'], label="Model Selection")
],
outputs=[
gr.Textbox(label="Transcription"),
gr.JSON(label="Diarization Output")
],
title="Audio Transcription and Speaker Diarization",
description="Upload your audio file to transcribe and analyze speaker diarization."
)
# Launch at import time — the usual pattern for a Hugging Face Spaces app.py.
iface.launch()