Update app.py
app.py
CHANGED
```diff
@@ -41,14 +41,15 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
-    chunk_length_s=30,
     device=device,
     model_kwargs={"low_cpu_mem_usage": True},
+    return_timestamps="word"
 )
 
 
 
 
+
 def associate_speakers_with_timestamps(transcription_result, diarization, tolerance=0.1, min_segment_duration=0.5):
     word_segments = transcription_result['chunks']
     diarization_segments = list(diarization.itertracks(yield_label=True))
```
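This hunk trades the fixed `chunk_length_s=30` for word-level timestamps. With `return_timestamps="word"`, the pipeline result carries a `chunks` list alongside the full text, which is what `associate_speakers_with_timestamps` reads via `transcription_result['chunks']`. A minimal sketch of that structure, assuming the `pipe` defined above; `"meeting.wav"` is a placeholder path:

```python
# Shape of the pipeline output once return_timestamps="word" is set.
result = pipe("meeting.wav")
print(result["text"])               # full transcript as one string
for word in result["chunks"]:       # one entry per recognized word
    start, end = word["timestamp"]  # (start_sec, end_sec); end can be None on the final word
    print(f"[{start}-{end}] {word['text']}")
```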
```diff
@@ -123,9 +124,11 @@ def parse_simplified_diarization(simplified_text):
 def process_transcription(*args):
     generator = transcribe_and_diarize(*args)
     for progress_message, raw_text, speaker_transcription in generator:
-
+        yield progress_message, raw_text, ""  # stream the raw transcription
+
+    # once the transcription is finished, run the diarization
     simplified_diarization = simplify_diarization_output(speaker_transcription)
-
+    yield progress_message, raw_text, simplified_diarization
 
 def process_yt_transcription(*args):
     html_embed, raw_text, speaker_transcription = yt_transcribe(*args)
```
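`process_transcription` is now a generator, and Gradio treats generator handlers as streaming updates: every `yield` pushes one value per bound output component, positionally. A self-contained sketch of that mechanism, with illustrative names:

```python
import time
import gradio as gr

def count_up(n):
    # each yield pushes a fresh value to the bound output component
    total = ""
    for i in range(int(n)):
        total += f"{i} "
        time.sleep(0.5)
        yield total  # one value per output component, in order

with gr.Blocks() as demo:
    n = gr.Number(value=5)
    out = gr.Textbox()
    btn = gr.Button("Start")
    btn.click(count_up, inputs=[n], outputs=[out])

demo.launch()
```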
```diff
@@ -173,6 +176,10 @@ def display_progress(progress_state):
     """)
 
 @spaces.GPU(duration=120)
+def stream_transcription(audio):
+    for result in pipe(audio, chunk_length_s=10, stride_length_s=(4, 2)):
+        yield result["text"]
+
 def transcribe_and_diarize(file_path, task, progress=gr.Progress()):
     progress(0, desc="Initialisation...")
     yield "Chargement du fichier...", None, None
```
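One thing to note about `stream_transcription`: when the transformers ASR pipeline is called on a single waveform, it returns one result dict rather than an iterator, so per-chunk streaming is usually achieved by splitting the audio before calling the pipeline. A rough sketch under that assumption; `iter_segments`, `SAMPLE_RATE`, and `stream_transcription_alt` are illustrative names, not part of the commit:

```python
SAMPLE_RATE = 16000  # assumed sampling rate of the decoded waveform

def iter_segments(audio, seconds=10):
    # hypothetical helper: cut the 1-D waveform into fixed-size windows
    step = seconds * SAMPLE_RATE
    for start in range(0, len(audio), step):
        yield {"raw": audio[start:start + step], "sampling_rate": SAMPLE_RATE}

def stream_transcription_alt(audio):
    # calling pipe on each window returns a single dict per call,
    # so partial text can be yielded as soon as a window is decoded
    for segment in iter_segments(audio):
        yield pipe(segment)["text"]
```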
```diff
@@ -180,10 +187,12 @@ def transcribe_and_diarize(file_path, task, progress=gr.Progress()):
     progress(0.2, desc="Préparation de l'audio...")
     yield "Préparation de l'audio...", None, None
 
-    progress(0.
-
-
-
+    progress(0.3, desc="Laissez moi quelques minutes pour déchiffrer les voix et rédiger l'audio 🤓 ✍️ ...")
+    transcription = ""
+    for chunk in stream_transcription(audio_np):
+        transcription += chunk
+        yield "Transcription en cours...", transcription, []
+
     progress(0.6, desc=" C'est fait 😮💨 ! Je m'active à fusionner tout ça, un instant, J'y suis presque...")
     if diarization_pipeline:
         diarization = diarization_pipeline(file_path)
```
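After this step, each transcribed word has to be attributed to a speaker turn; pyannote exposes the turns through `itertracks(yield_label=True)`, as seen in `associate_speakers_with_timestamps` above. A hypothetical illustration of the matching step, pairing a word's midpoint with the turn that contains it (`speaker_for_word` is not part of the commit):

```python
def speaker_for_word(word, diarization):
    # hypothetical illustration: pick the diarization turn whose span
    # contains the word's midpoint; itertracks(yield_label=True) is the
    # pyannote API for iterating labeled speaker turns
    start, end = word["timestamp"]
    mid = (start + (end if end is not None else start)) / 2
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        if turn.start <= mid <= turn.end:
            return speaker
    return "UNKNOWN"
```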
```diff
@@ -391,7 +400,7 @@ with demo:
     progress_display = gr.Markdown(label="État de la progression")
 
     with gr.Accordion("Résultats 📊", open=True):
-
+        transcription_output = gr.Textbox(label="📝 Transcription brute", info="Texte généré par le modèle. Modifiable si nécessaire.")
         speaker_output = gr.Textbox(label="👥 Diarisation (format simplifié)", info="Identification des locuteurs. Format : 'SPEAKER_XX: texte'")
     with gr.Accordion("Métadonnées (optionnel) 📌", open=False):
         audio_duration = gr.Textbox(label="⏱️ Durée de l'audio (mm:ss)")
```
```diff
@@ -474,10 +483,11 @@ with demo:
     """)
 
     # wire the buttons to the appropriate functions
-
-
-
-
+    submit_button.click(
+        process_transcription,
+        inputs=[audio_input],
+        outputs=[progress_output, transcription_output, diarization_output],
+        show_progress=True,
     )
 
     format_button.click(
```
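Since `process_transcription` yields three values per step, the `outputs` list on `submit_button.click` names exactly three components, and Gradio fills them positionally from each yielded tuple; `show_progress=True` additionally overlays the built-in progress indicator while the generator runs.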