# Gradio app: speaker diarization (pyannote.audio) combined with Whisper transcription.
import gradio as gr
from pyannote.audio import Pipeline
import torch
import whisper
from huggingface_hub import login
import os
import traceback
# Authenticate with Hugging Face and load the pyannote diarization pipeline.
# Whatever goes wrong, `diarization_pipeline` ends up as None so the module
# still imports and the UI can report the problem instead of crashing.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    print("WARNING: HF_TOKEN environment variable not found. Please set it in the Space settings.")
    diarization_pipeline = None
else:
    try:
        login(token=hf_token)
        print("Successfully logged in to Hugging Face")

        # Initialize the diarization pipeline
        print("Loading pyannote/speaker-diarization-3.1 pipeline...")
        diarization_pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=hf_token,
        )
        # from_pretrained can hand back None instead of raising (pyannote.audio
        # 3.x does this when the repository is gated or the token has no
        # access). Turn that into an explicit error so we never announce a
        # successful load or call `.to()` on None.
        if diarization_pipeline is None:
            raise RuntimeError(
                "Pipeline.from_pretrained returned None; check that the token "
                "can access pyannote/speaker-diarization-3.1 and that its "
                "user conditions have been accepted."
            )
        print("Diarization pipeline loaded successfully!")

        # Send pipeline to GPU if available
        if torch.cuda.is_available():
            print("GPU detected, moving pipeline to GPU")
            diarization_pipeline.to(torch.device("cuda"))
        else:
            print("No GPU detected, using CPU")
    except Exception as e:
        # Top-level boundary: log everything and fall back to "not loaded".
        print(f"Error loading diarization pipeline: {e}")
        print(f"Error type: {type(e).__name__}")
        print("Traceback:")
        traceback.print_exc()
        diarization_pipeline = None
# Load the Whisper "base" speech-to-text model. A failure leaves
# `whisper_model` as None so the app can still start; the error type and
# traceback are printed, matching how the diarization loader reports failures.
try:
    print("Loading Whisper model...")
    whisper_model = whisper.load_model("base")
    print("Whisper model loaded successfully!")
except Exception as e:
    print(f"Error loading Whisper model: {e}")
    print(f"Error type: {type(e).__name__}")
    print("Traceback:")
    traceback.print_exc()
    whisper_model = None
def _speaker_for_span(turns, start, end):
    """Return the label of the speaker turn that best covers ``[start, end]``.

    Args:
        turns: Iterable of ``(turn_start, turn_end, label)`` tuples.
        start: Start time of the span, in the same unit as the turns.
        end: End time of the span, in the same unit as the turns.

    Returns:
        The label of the turn sharing the most time with the span. A turn
        that only touches the span at a boundary is used when nothing
        overlaps more; the earliest turn wins ties. ``None`` when no turn
        intersects the span at all.
    """
    best_label = None
    best_overlap = -1.0
    for turn_start, turn_end, label in turns:
        # Length of the intersection of the two intervals; negative means
        # the turn and the span are disjoint.
        overlap = min(end, turn_end) - max(start, turn_start)
        if overlap >= 0 and overlap > best_overlap:
            best_label, best_overlap = label, overlap
    return best_label


def transcribe_with_diarization(audio_file):
    """Transcribe an audio file and label each segment with its speaker.

    Whisper produces timestamped text segments (language fixed to "pt"),
    the diarization pipeline produces speaker turns, and each segment is
    attributed to the turn it overlaps the most.

    Args:
        audio_file: Path of the audio file to process, or None when nothing
            was uploaded.

    Returns:
        A string: either a speaker-count summary followed by one
        ``[speaker] (start - end): text`` line per segment, or a
        human-readable message when a model is missing, no file was given,
        nothing was transcribed, or processing failed. Errors are returned
        as text rather than raised.
    """
    if diarization_pipeline is None:
        return "❌ Diarization pipeline not loaded. Please ensure HF_TOKEN is set and you have access to pyannote/speaker-diarization-3.1."
    if whisper_model is None:
        return "❌ Whisper model not loaded."
    if audio_file is None:
        return "Please upload an audio file."

    try:
        print(f"Processing audio file: {audio_file}")

        # Step 1: Transcribe with Whisper
        print("Transcribing audio with Whisper...")
        transcription_result = whisper_model.transcribe(audio_file, language="pt")
        segments = transcription_result["segments"]
        print(f"Transcription complete. Found {len(segments)} segments")

        # Step 2: Diarize with pyannote
        print("Performing speaker diarization...")
        diarization = diarization_pipeline(audio_file)
        print("Diarization complete")

        # Walk the diarization once and keep plain tuples, instead of
        # re-iterating the tracks for every transcription segment.
        turns = [
            (turn.start, turn.end, label)
            for turn, _, label in diarization.itertracks(yield_label=True)
        ]

        # Step 3: Match transcription segments with speaker labels
        results = []
        for segment in segments:
            start_time = segment["start"]
            end_time = segment["end"]
            text = segment["text"].strip()
            speaker = _speaker_for_span(turns, start_time, end_time) or "Unknown"
            results.append(f"[{speaker}] ({start_time:.1f}s - {end_time:.1f}s): {text}")

        if not results:
            return "No transcription available."

        # Add summary
        speakers = {label for _, _, label in turns}
        summary = f"Found {len(speakers)} speakers in the conversation.\n\n"
        return summary + "\n".join(results)
    except Exception as e:
        # UI boundary: log the traceback and show the message to the user.
        error_msg = f"Error processing audio: {str(e)}"
        print(error_msg)
        traceback.print_exc()
        return error_msg
# Build the Gradio UI: one audio upload (handed to the callback as a file
# path) in, one multi-line text box with the labelled transcript out.
_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
_transcript_output = gr.Textbox(
    label="Transcription with Speaker Identification",
    lines=20,
)
demo = gr.Interface(
    fn=transcribe_with_diarization,
    inputs=_audio_input,
    outputs=_transcript_output,
    title="Speaker Diarization + Transcription",
    description="Upload an audio file to identify different speakers and transcribe what they said. Uses pyannote for speaker identification and Whisper for transcription.",
    examples=[],
    cache_examples=False,
)
# Start the Gradio server only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()