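"""Gradio app combining speaker diarization and transcription.

Uses pyannote/speaker-diarization-3.1 for speaker identification and
OpenAI Whisper for transcription. Requires an HF_TOKEN environment
variable with access to the gated pyannote model (accept its terms on
the Hugging Face Hub first).
"""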
import gradio as gr
from pyannote.audio import Pipeline
import torch
import whisper
from huggingface_hub import login
import os
import traceback

# Login to Hugging Face if token is available
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    print("WARNING: HF_TOKEN environment variable not found. Please set it in the Space settings.")
    diarization_pipeline = None
else:
    try:
        login(token=hf_token)
        print("Successfully logged in to Hugging Face")
        
        # Initialize the diarization pipeline
        print("Loading pyannote/speaker-diarization-3.1 pipeline...")
        diarization_pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=hf_token
        )
        print("Diarization pipeline loaded successfully!")
        
        # Send pipeline to GPU if available
        if torch.cuda.is_available():
            print("GPU detected, moving pipeline to GPU")
            diarization_pipeline.to(torch.device("cuda"))
        else:
            print("No GPU detected, using CPU")
            
    except Exception as e:
        print(f"Error loading diarization pipeline: {e}")
        print(f"Error type: {type(e).__name__}")
        print("Traceback:")
        traceback.print_exc()
        diarization_pipeline = None

# Load Whisper model
try:
    print("Loading Whisper model...")
    whisper_model = whisper.load_model("base")
    print("Whisper model loaded successfully!")
except Exception as e:
    print(f"Error loading Whisper model: {e}")
    whisper_model = None

def transcribe_with_diarization(audio_file):
    """Process audio file for both diarization and transcription"""
    if diarization_pipeline is None:
        return "❌ Diarization pipeline not loaded. Please ensure HF_TOKEN is set and you have access to pyannote/speaker-diarization-3.1."
    
    if whisper_model is None:
        return "❌ Whisper model not loaded."
    
    if audio_file is None:
        return "Please upload an audio file."
    
    try:
        print(f"Processing audio file: {audio_file}")
        
        # Step 1: Transcribe with Whisper (language is hardcoded to
        # Portuguese; drop language= to let Whisper auto-detect)
        print("Transcribing audio with Whisper...")
        transcription_result = whisper_model.transcribe(audio_file, language="pt")
        segments = transcription_result["segments"]
        print(f"Transcription complete. Found {len(segments)} segments")
        
        # Step 2: Diarize with pyannote
        print("Performing speaker diarization...")
        diarization = diarization_pipeline(audio_file)
        print("Diarization complete")
        
        # Step 3: Match transcription segments with speaker labels
        results = []
        
        for segment in segments:
            start_time = segment['start']
            end_time = segment['end']
            text = segment['text'].strip()
            
            # Attribute the segment to the speaker whose turn overlaps it
            # the most. (Checking only the segment endpoints misses turns
            # fully contained in a segment and misattributes segments that
            # span a speaker change.)
            speaker = None
            best_overlap = 0.0
            for turn, _, label in diarization.itertracks(yield_label=True):
                overlap = min(turn.end, end_time) - max(turn.start, start_time)
                if overlap > best_overlap:
                    best_overlap = overlap
                    speaker = label
            
            if speaker:
                results.append(f"[{speaker}] ({start_time:.1f}s - {end_time:.1f}s): {text}")
            else:
                results.append(f"[Unknown] ({start_time:.1f}s - {end_time:.1f}s): {text}")
        
        if not results:
            return "No transcription available."
        
        # Add summary; labels() returns the distinct speaker labels
        speakers = diarization.labels()
        
        summary = f"Found {len(speakers)} speakers in the conversation.\n\n"
        return summary + "\n".join(results)
        
    except Exception as e:
        error_msg = f"Error processing audio: {str(e)}"
        print(error_msg)
        traceback.print_exc()
        return error_msg

# Create Gradio interface
demo = gr.Interface(
    fn=transcribe_with_diarization,
    inputs=gr.Audio(type="filepath", label="Upload Audio File"),
    outputs=gr.Textbox(label="Transcription with Speaker Identification", lines=20),
    title="Speaker Diarization + Transcription",
    description="Upload an audio file to identify different speakers and transcribe what they said. Uses pyannote for speaker identification and Whisper for transcription.",
    examples=[],
    cache_examples=False
)
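
# For long recordings, enabling Gradio's request queue before launching
# (demo.queue()) helps avoid request timeouts.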

if __name__ == "__main__":
    demo.launch()