asdrolf
committed on
Commit
•
6c746fd
1
Parent(s):
10287ea
ok
Browse files
app.py
CHANGED
@@ -1,4 +1,78 @@
|
|
1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
from faster_whisper import WhisperModel
|
3 |
+
from pyannote.audio import Pipeline
|
4 |
+
import pyannote.core
|
5 |
+
from collections import defaultdict
|
6 |
|
7 |
+
# Assign a speaker label to each transcription segment by time overlap
# with the diarization timeline.
def assign_speakers_to_segments(diarization, transcription_segments):
    """Label each transcription segment with its dominant speaker.

    Args:
        diarization: pyannote-style annotation exposing
            ``itertracks(yield_label=True)`` that yields
            ``(segment, track, speaker)`` tuples where ``segment`` has
            ``start``/``end`` attributes (seconds).
        transcription_segments: iterable of objects with ``start``, ``end``
            and ``text`` attributes (faster-whisper segments).

    Returns:
        list[tuple[str, str]]: one ``(speaker, text)`` pair per
        transcription segment; ``"Unknown"`` when no diarized speech
        overlaps the segment at all.
    """
    speaker_segments = []

    # Index the diarization turns by speaker: speaker -> [(start, end), ...]
    diarization_dict = defaultdict(list)
    for segment, _, speaker in diarization.itertracks(yield_label=True):
        diarization_dict[speaker].append((segment.start, segment.end))

    for transcription_segment in transcription_segments:
        start, end = transcription_segment.start, transcription_segment.end
        speakers_count = defaultdict(float)

        # Accumulate, per speaker, how long they talk inside this segment.
        for speaker, times in diarization_dict.items():
            for seg_start, seg_end in times:
                # Intersection of the speaker turn with the transcription segment.
                overlap_start = max(start, seg_start)
                overlap_end = min(end, seg_end)
                overlap_duration = max(0, overlap_end - overlap_start)
                # BUG FIX: only record speakers that actually overlap.
                # The previous unconditional "+= 0.0" inserted every speaker
                # into the defaultdict, which made speakers_count always
                # non-empty and the "Unknown" fallback below unreachable.
                if overlap_duration > 0:
                    speakers_count[speaker] += overlap_duration

        # Pick the speaker with the largest total overlap; on a tie the
        # first-seen speaker (diarization order) wins.
        if speakers_count:
            speaker = max(speakers_count, key=speakers_count.get)
        else:
            speaker = "Unknown"

        # Pair the segment's text with the chosen speaker.
        speaker_segments.append((speaker, transcription_segment.text))

    return speaker_segments
|
39 |
+
|
40 |
+
# Main Streamlit entry point: upload UI, processing, and result display.
def main():
    """Render the diarization + transcription Streamlit app.

    Lets the user upload a WAV file, runs diarization and transcription
    on it, and writes one "Speaker X: text" line per segment.
    """
    import os
    import tempfile  # local imports: avoid touching the file's import block

    st.title("Aplicación de Diarización y Transcripción de Audio")

    # WAV upload widget; returns None until a file is provided.
    audio_file = st.file_uploader("Cargar archivo de audio", type=['wav'])
    if audio_file is None:
        return

    # BUG FIX: write the upload to a unique temp file instead of a shared
    # hard-coded "audio_temp.wav" (which concurrent sessions would clobber),
    # and remove it afterwards.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(audio_file.getbuffer())
        tmp_path = tmp.name
    try:
        # Run both models on the uploaded audio.
        speaker_segments = process_audio(tmp_path)

        # Show the labeled transcript.
        for speaker, text in speaker_segments:
            st.write(f"Speaker {speaker}: {text}")
    finally:
        os.remove(tmp_path)
|
58 |
+
|
59 |
+
# Run diarization + transcription on one audio file and merge the results.
def process_audio(audio_path):
    """Diarize and transcribe *audio_path*.

    Args:
        audio_path: path to a WAV file on disk.

    Returns:
        list[tuple[str, str]]: ``(speaker, text)`` pairs, as produced by
        ``assign_speakers_to_segments``.
    """
    import os  # local import: avoids touching the file's import block

    # Diarization model.
    # FIX: read the Hugging Face token from the environment instead of
    # shipping a hard-coded placeholder; falls back to the old placeholder
    # so existing behavior is unchanged when HF_TOKEN is unset.
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=os.environ.get("HF_TOKEN", "tu_token_aqui"),
    )

    # Transcription model (CPU with int8 quantization to keep memory modest).
    model = WhisperModel("large-v3", device="cpu", compute_type="int8")

    # Run both models on the same file.
    diarization = pipeline(audio_path)
    segments, _info = model.transcribe(audio_path)

    # Merge: attach a speaker label to every transcription segment.
    return assign_speakers_to_segments(diarization, segments)
|
76 |
+
|
77 |
+
# Script entry point: launch the Streamlit app when executed directly.
if __name__ == "__main__":
    main()
|