Woziii committed
Commit 51ecff6 · verified · 1 Parent(s): 7ef4d9e

Update app.py

Files changed (1):
  1. app.py (+41 -109)
app.py CHANGED
@@ -49,7 +49,7 @@ pipe = pipeline(
 
 
 
-def associate_speakers_with_timestamps(transcription_result, diarization, tolerance=0.02, min_segment_duration=0.05, max_words_to_merge=20):
+def associate_speakers_with_timestamps(transcription_result, diarization, tolerance=0.02, min_segment_duration=0.05):
     word_segments = transcription_result['chunks']
     diarization_segments = list(diarization.itertracks(yield_label=True))
     speaker_transcription = []
@@ -60,18 +60,14 @@ def associate_speakers_with_timestamps(transcription_result, diarization, tolera
     def flush_current_segment():
         nonlocal current_speaker, current_text
         if current_speaker and current_text:
-            speaker_transcription.append({
-                "speaker": current_speaker,
-                "text": ' '.join(current_text),
-                "start": word_segments[len(speaker_transcription)]['timestamp'][0],
-                "end": word_segments[len(speaker_transcription) + len(current_text) - 1]['timestamp'][1]
-            })
+            speaker_transcription.append((current_speaker, ' '.join(current_text)))
         current_text = []
 
     for word in word_segments:
         word_start, word_end = word['timestamp']
         word_text = word['text']
 
+        # Trouver le segment de diarisation correspondant
         matching_segment = None
         for segment, _, speaker in diarization_segments:
             if segment.start - tolerance <= word_start < segment.end + tolerance:
@@ -84,77 +80,32 @@ def associate_speakers_with_timestamps(transcription_result, diarization, tolera
                 flush_current_segment()
                 current_speaker = speaker
 
+            # Gérer les pauses longues
             if word_start - last_word_end > 1.0: # Pause de plus d'une seconde
                 flush_current_segment()
 
             current_text.append(word_text)
             last_word_end = word_end
         else:
+            # Si aucun segment ne correspond, attribuer au dernier locuteur connu
             if current_speaker:
                 current_text.append(word_text)
             else:
+                # Si c'est le premier mot sans correspondance, créer un nouveau segment
                 current_speaker = "SPEAKER_UNKNOWN"
                 current_text.append(word_text)
 
     flush_current_segment()
 
-    def detect_interruptions(transcription, time_threshold=0.5):
-        for i in range(len(transcription) - 1):
-            current_end = transcription[i]['end']
-            next_start = transcription[i+1]['start']
-            if next_start - current_end < time_threshold:
-                transcription[i]['text'] += ' [...]'
-                transcription[i+1]['text'] = '[...] ' + transcription[i+1]['text']
-        return transcription
-
-    speaker_transcription = detect_interruptions(speaker_transcription)
-
-    def post_process_transcription(transcription, max_words):
-        processed = []
-        current_speaker = None
-        current_text = []
-        current_start = None
-        current_end = None
-
-        for segment in transcription:
-            if segment['speaker'] == current_speaker and len(' '.join(current_text + [segment['text']]).split()) <= max_words:
-                current_text.append(segment['text'])
-                current_end = segment['end']
-            else:
-                if current_speaker:
-                    processed.append({
-                        "speaker": current_speaker,
-                        "text": ' '.join(current_text),
-                        "start": current_start,
-                        "end": current_end
-                    })
-                current_speaker = segment['speaker']
-                current_text = [segment['text']]
-                current_start = segment['start']
-                current_end = segment['end']
-
-        if current_speaker:
-            processed.append({
-                "speaker": current_speaker,
-                "text": ' '.join(current_text),
-                "start": current_start,
-                "end": current_end
-            })
-
-        return processed
-
-    merged_transcription = post_process_transcription(speaker_transcription, max_words_to_merge)
-
-    speakers = sorted(set(segment['speaker'] for segment in merged_transcription))
-    metadata = {
-        "speaker_count": len(speakers),
-        "speakers": speakers
-    }
+    # Fusionner les segments courts du même locuteur
+    merged_transcription = []
+    for speaker, text in speaker_transcription:
+        if not merged_transcription or merged_transcription[-1][0] != speaker or len(text.split()) > 3:
+            merged_transcription.append((speaker, text))
+        else:
+            merged_transcription[-1] = (speaker, merged_transcription[-1][1] + " " + text)
 
-    return {
-        "transcription": merged_transcription,
-        "metadata": metadata
-    }
+    return merged_transcription
 
 def simplify_diarization_output(speaker_transcription):
     simplified = []
@@ -245,46 +196,31 @@ def transcribe_and_diarize(file_path, task, progress=gr.Progress()):
     progress(1.0, desc="Terminé!")
     return "Transcription terminée!", transcription_result['text'], speaker_transcription
 
-def format_to_markdown(transcription_result, audio_duration=None, location=None, speaker_age=None, context=None, custom_speaker_names=None):
-    if isinstance(transcription_result, dict):
-        metadata = transcription_result.get("metadata", {})
-        transcription = transcription_result.get("transcription", [])
-    else:
-        metadata = {}
-        transcription = transcription_result
-
-    speaker_count = metadata.get("speaker_count", "non spécifié")
-    speakers = metadata.get("speakers", [])
-
-    metadata_text = "\n".join([
-        f"- **Date de traitement** : '{datetime.now().strftime('%d/%m/%Y %H:%M')}'",
-        f"- **Durée de l'audio** : '{audio_duration if audio_duration else '[à remplir]'} secondes'",
-        f"- **Lieu** : '{location if location else '[non spécifié]'}'",
-        f"- **Âge de l'intervenant** : '{f'{speaker_age} ans' if speaker_age else '[non spécifié]'}'",
-        f"- **Contexte** : '{context if context else '[non spécifié]'}'",
-        f"- **Nombre d'interlocuteurs** : '{speaker_count}'",
-        f"- **Interlocuteurs bruts** : '{', '.join(speakers)}'"
-    ])
-
+def format_to_markdown(transcription_text, speaker_transcription, audio_duration=None, location=None, speaker_age=None, context=None):
+    metadata = {
+        "Date de traitement": datetime.now().strftime('%d/%m/%Y %H:%M'),
+        "Durée de l'audio": f"{audio_duration} secondes" if audio_duration else "[à remplir]",
+        "Lieu": location if location else "[non spécifié]",
+        "Âge de l'intervenant": f"{speaker_age} ans" if speaker_age else "[non spécifié]",
+        "Contexte": context if context else "[non spécifié]"
+    }
+
+    metadata_text = "\n".join([f"- **{key}** : '{value}'" for key, value in metadata.items()])
+
     try:
-        formatted_transcription = []
-        for segment in transcription:
-            speaker = segment['speaker']
-            text = segment['text']
-            start_time = format_time(segment['start'])
-            end_time = format_time(segment['end'])
-
-            if custom_speaker_names and speaker in custom_speaker_names:
-                display_speaker = custom_speaker_names[speaker]
-            else:
-                display_speaker = speaker
-
-            formatted_transcription.append(f"**[{start_time} - {end_time}] {display_speaker}**: {text}")
-
-        transcription_text = "\n\n".join(formatted_transcription)
+        if isinstance(speaker_transcription, str):
+            speaker_transcription = parse_simplified_diarization(speaker_transcription)
+
+        if isinstance(speaker_transcription, list) and all(isinstance(item, tuple) and len(item) == 2 for item in speaker_transcription):
+            formatted_transcription = []
+            for speaker, text in speaker_transcription:
+                formatted_transcription.append(f"**{speaker}**: {text}")
+            transcription_text = "\n\n".join(formatted_transcription)
+        else:
+            raise ValueError("Invalid speaker transcription format")
     except Exception as e:
         print(f"Error formatting speaker transcription: {e}")
-        transcription_text = "Error formatting speaker transcription. Using raw transcription instead.\n\n" + str(transcription)
+        transcription_text = "Error formatting speaker transcription. Using raw transcription instead.\n\n" + transcription_text
 
     formatted_output = f"""
     # Transcription Formatée
@@ -297,9 +233,6 @@ def format_to_markdown(transcription_result, audio_duration=None, location=None,
     """
     return formatted_output
 
-def format_time(seconds):
-    return f"{int(seconds // 60):02d}:{int(seconds % 60):02d}"
-
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     HTML_str = (
@@ -464,7 +397,6 @@ with demo:
         audio_duration = gr.Textbox(label="⏱️ Durée de l'audio (mm:ss)")
         location = gr.Textbox(label="📍 Lieu de l'enregistrement")
         speaker_age = gr.Number(label="👤 Âge de l'intervenant principal")
-        custom_speaker_names = gr.TextArea(label="Noms personnalisés des locuteurs (format: SPEAKER_00: Nom1, SPEAKER_01: Nom2)")
         context = gr.Textbox(label="📝 Contexte de l'enregistrement")
 
         format_button = gr.Button("✨ Générer la transcription formatée", elem_classes="button-secondary")
@@ -529,7 +461,7 @@ with demo:
     - Modèles :
     - [Whisper-médium](https://huggingface.co/openai/whisper-medium) : Model size - 764M params - Tensor type F32 -
     - [speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1) : Model size - Unknow - Tensor type F32 -
-    - Version : V.2.0.0-Bêta
+    - Version : V.2.0.2-Bêta
    - Langues : FR, EN
    - Copyright : cc-by-nc-4.0
    - [En savoir +](https://huggingface.co/spaces/Woziii/scribe/blob/main/README.md)
@@ -543,9 +475,9 @@ with demo:
    )
 
    format_button.click(
-        format_to_markdown,
-        inputs=[raw_output, speaker_output, audio_duration, location, speaker_age, context, custom_speaker_names],
-        outputs=formatted_output
+        format_to_markdown,
+        inputs=[raw_output, speaker_output, audio_duration, location, speaker_age, context],
+        outputs=formatted_output
    )
 
    mic_transcribe_button.click(
@@ -574,4 +506,4 @@ with demo:
 
 
 if __name__ == "__main__":
-    demo.queue().launch()
+    demo.queue().launch()
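
For readers skimming the diff, here is a minimal, self-contained sketch of the two behaviours this commit introduces: associate_speakers_with_timestamps now returns a plain list of (speaker, text) tuples and folds a short follow-on segment (3 words or fewer) from the same speaker into the previous tuple, and the new format_to_markdown body renders each tuple as a "**SPEAKER**: text" block. The helper names and sample data below are illustrative only and are not part of app.py.

# Illustrative sketch of the tuple-based flow added in this commit (not app.py code).

def merge_short_segments(speaker_transcription):
    # Same rule as the new code: fold a segment into the previous one when the
    # speaker is unchanged and the segment has 3 words or fewer.
    merged = []
    for speaker, text in speaker_transcription:
        if not merged or merged[-1][0] != speaker or len(text.split()) > 3:
            merged.append((speaker, text))
        else:
            merged[-1] = (speaker, merged[-1][1] + " " + text)
    return merged

def render_markdown(speaker_transcription):
    # Mirrors the new format_to_markdown rendering: one "**SPEAKER**: text" block per tuple.
    return "\n\n".join(f"**{speaker}**: {text}" for speaker, text in speaker_transcription)

if __name__ == "__main__":
    sample = [
        ("SPEAKER_00", "Bonjour à tous"),
        ("SPEAKER_00", "et bienvenue"),  # 2 words, same speaker: merged into the previous tuple
        ("SPEAKER_01", "Merci beaucoup pour l'invitation"),
    ]
    print(render_markdown(merge_short_segments(sample)))
    # **SPEAKER_00**: Bonjour à tous et bienvenue
    #
    # **SPEAKER_01**: Merci beaucoup pour l'invitation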