oceansweep committed on
Commit
74055ee
1 Parent(s): 0f1bc91

Upload 3 files

App_Function_Libraries/Audio/Audio_Files.py CHANGED
@@ -19,25 +19,25 @@ import logging
19
  import os
20
  import subprocess
21
  import tempfile
 
22
  import uuid
23
  from datetime import datetime
24
  from pathlib import Path
25
-
 
26
  import requests
27
  import yt_dlp
28
-
29
- from App_Function_Libraries.Audio.Audio_Transcription_Lib import speech_to_text
30
- from App_Function_Libraries.Chunk_Lib import improved_chunking_process
31
  #
32
  # Local Imports
33
- from App_Function_Libraries.DB.DB_Manager import add_media_to_database, add_media_with_keywords, \
34
  check_media_and_whisper_model
35
- from App_Function_Libraries.Summarization.Summarization_General_Lib import save_transcription_and_summary, perform_transcription, \
36
- perform_summarization
37
- from App_Function_Libraries.Utils.Utils import create_download_directory, save_segments_to_json, downloaded_files, \
38
- sanitize_filename
39
  from App_Function_Libraries.Video_DL_Ingestion_Lib import extract_metadata
40
-
 
41
  #
42
  #######################################################################################################################
43
  # Function Definitions
@@ -106,168 +106,34 @@ def download_audio_file(url, current_whisper_model="", use_cookies=False, cookie
106
  logging.error(f"Unexpected error downloading audio file: {str(e)}")
107
  raise
108
 
109
-
110
- def process_audio(
111
- audio_file_path,
112
- num_speakers=2,
113
- whisper_model="small.en",
114
- custom_prompt_input=None,
115
- offset=0,
116
- api_name=None,
117
- api_key=None,
118
- vad_filter=False,
119
- rolling_summarization=False,
120
- detail_level=0.01,
121
- keywords="default,no_keyword_set",
122
- chunk_text_by_words=False,
123
- max_words=0,
124
- chunk_text_by_sentences=False,
125
- max_sentences=0,
126
- chunk_text_by_paragraphs=False,
127
- max_paragraphs=0,
128
- chunk_text_by_tokens=False,
129
- max_tokens=0
130
- ):
131
- try:
132
-
133
- # Perform transcription
134
- audio_file_path, segments = perform_transcription(audio_file_path, offset, whisper_model, vad_filter)
135
-
136
- if audio_file_path is None or segments is None:
137
- logging.error("Process_Audio: Transcription failed or segments not available.")
138
- return "Process_Audio: Transcription failed.", None, None, None, None, None
139
-
140
- logging.debug(f"Process_Audio: Transcription audio_file: {audio_file_path}")
141
- logging.debug(f"Process_Audio: Transcription segments: {segments}")
142
-
143
- transcription_text = {'audio_file': audio_file_path, 'transcription': segments}
144
- logging.debug(f"Process_Audio: Transcription text: {transcription_text}")
145
-
146
- # Save segments to JSON
147
- segments_json_path = save_segments_to_json(segments)
148
-
149
- # Perform summarization
150
- summary_text = None
151
- if api_name:
152
- if rolling_summarization is not None:
153
- pass
154
- # FIXME rolling summarization
155
- # summary_text = rolling_summarize_function(
156
- # transcription_text,
157
- # detail=detail_level,
158
- # api_name=api_name,
159
- # api_key=api_key,
160
- # custom_prompt=custom_prompt_input,
161
- # chunk_by_words=chunk_text_by_words,
162
- # max_words=max_words,
163
- # chunk_by_sentences=chunk_text_by_sentences,
164
- # max_sentences=max_sentences,
165
- # chunk_by_paragraphs=chunk_text_by_paragraphs,
166
- # max_paragraphs=max_paragraphs,
167
- # chunk_by_tokens=chunk_text_by_tokens,
168
- # max_tokens=max_tokens
169
- # )
170
- else:
171
- summary_text = perform_summarization(api_name, segments_json_path, custom_prompt_input, api_key)
172
-
173
- if summary_text is None:
174
- logging.error("Summary text is None. Check summarization function.")
175
- summary_file_path = None
176
- else:
177
- summary_text = 'Summary not available'
178
- summary_file_path = None
179
-
180
- # Save transcription and summary
181
- download_path = create_download_directory("Audio_Processing")
182
- json_file_path, summary_file_path = save_transcription_and_summary(transcription_text, summary_text,
183
- download_path)
184
-
185
- # Update function call to add_media_to_database so that it properly applies the title, author and file type
186
- # Add to database
187
- add_media_to_database(None, {'title': 'Audio File', 'author': 'Unknown'}, segments, summary_text, keywords,
188
- custom_prompt_input, whisper_model)
189
-
190
- return transcription_text, summary_text, json_file_path, summary_file_path, None, None
191
-
192
- except Exception as e:
193
- logging.error(f"Error in process_audio: {str(e)}")
194
- return str(e), None, None, None, None, None
195
-
196
-
197
- def process_single_audio(audio_file_path, whisper_model, api_name, api_key, keep_original,custom_keywords, source,
198
- custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
199
- use_multi_level_chunking, chunk_language):
200
- progress = []
201
- transcription = ""
202
- summary = ""
203
-
204
- def update_progress(message):
205
- progress.append(message)
206
- return "\n".join(progress)
207
-
208
- try:
209
- # Check file size before processing
210
- file_size = os.path.getsize(audio_file_path)
211
- if file_size > MAX_FILE_SIZE:
212
- update_progress(f"File size ({file_size / (1024 * 1024):.2f} MB) exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f} MB. Skipping this file.")
213
- return "\n".join(progress), "", ""
214
-
215
- # Perform transcription
216
- update_progress("Starting transcription...")
217
- segments = speech_to_text(audio_file_path, whisper_model=whisper_model)
218
- transcription = " ".join([segment['Text'] for segment in segments])
219
- update_progress("Audio transcribed successfully.")
220
-
221
- # Perform summarization if API is provided
222
- if api_name and api_key:
223
- update_progress("Starting summarization...")
224
- summary = perform_summarization(api_name, transcription, "Summarize the following audio transcript",
225
- api_key)
226
- update_progress("Audio summarized successfully.")
227
- else:
228
- summary = "No summary available"
229
-
230
- # Prepare keywords
231
- keywords = "audio,transcription"
232
- if custom_keywords:
233
- keywords += f",{custom_keywords}"
234
-
235
- # Add to database
236
- add_media_with_keywords(
237
- url=source,
238
- title=os.path.basename(audio_file_path),
239
- media_type='audio',
240
- content=transcription,
241
- keywords=keywords,
242
- prompt="Summarize the following audio transcript",
243
- summary=summary,
244
- transcription_model=whisper_model,
245
- author="Unknown",
246
- ingestion_date=None # This will use the current date
247
- )
248
- update_progress("Audio file added to database successfully.")
249
-
250
- if not keep_original and source != "Uploaded File":
251
- os.remove(audio_file_path)
252
- update_progress(f"Temporary file {audio_file_path} removed.")
253
- elif keep_original and source != "Uploaded File":
254
- update_progress(f"Original audio file kept at: {audio_file_path}")
255
-
256
- except Exception as e:
257
- update_progress(f"Error processing {source}: {str(e)}")
258
- transcription = f"Error: {str(e)}"
259
- summary = "No summary due to error"
260
-
261
- return "\n".join(progress), transcription, summary
262
-
263
-
264
  def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key, use_cookies, cookies, keep_original,
265
  custom_keywords, custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap,
266
- use_adaptive_chunking, use_multi_level_chunking, chunk_language, diarize):
267
  progress = []
268
- temp_files = []
269
  all_transcriptions = []
270
  all_summaries = []
271
 
272
  def update_progress(message):
273
  progress.append(message)
@@ -335,6 +201,12 @@ def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key
335
  audio_file_path = download_audio_file(url, use_cookies, cookies)
336
  if not os.path.exists(audio_file_path):
337
  update_progress(f"Downloaded file not found: {audio_file_path}")
338
  continue
339
 
340
  temp_files.append(audio_file_path)
@@ -344,6 +216,12 @@ def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key
344
  reencoded_mp3_path = reencode_mp3(audio_file_path)
345
  if not os.path.exists(reencoded_mp3_path):
346
  update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
347
  continue
348
 
349
  temp_files.append(reencoded_mp3_path)
@@ -352,6 +230,12 @@ def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key
352
  wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
353
  if not os.path.exists(wav_file_path):
354
  update_progress(f"Converted WAV file not found: {wav_file_path}")
355
  continue
356
 
357
  temp_files.append(wav_file_path)
@@ -370,20 +254,36 @@ def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key
370
  segments = segments['segments']
371
 
372
  if isinstance(segments, list):
373
- transcription = " ".join([segment.get('Text', '') for segment in segments])
374
  update_progress("Audio transcribed successfully.")
375
  else:
376
  update_progress("Unexpected segments format received from speech_to_text.")
377
  logging.error(f"Unexpected segments format: {segments}")
378
  continue
379
 
380
  if not transcription.strip():
381
  update_progress("Transcription is empty.")
382
  else:
383
  # Apply chunking
384
  chunked_text = improved_chunking_process(transcription, chunk_options)
385
 
386
  # Summarize
 
387
  if api_name:
388
  try:
389
  summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
@@ -391,16 +291,25 @@ def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key
391
  except Exception as e:
392
  logging.error(f"Error during summarization: {str(e)}")
393
  summary = "Summary generation failed"
394
  else:
395
  summary = "No summary available (API not provided)"
396
 
397
  all_transcriptions.append(transcription)
398
  all_summaries.append(summary)
399
 
 
400
  # Add to database
401
  add_media_with_keywords(
402
  url=url,
403
- title=os.path.basename(wav_file_path),
404
  media_type='audio',
405
  content=transcription,
406
  keywords=custom_keywords,
@@ -411,79 +320,129 @@ def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key
411
  ingestion_date=datetime.now().strftime('%Y-%m-%d')
412
  )
413
  update_progress("Audio file processed and added to database.")
414
 
415
  # Process uploaded file if provided
416
  if audio_file:
 
417
  if os.path.getsize(audio_file.name) > MAX_FILE_SIZE:
418
  update_progress(
419
  f"Uploaded file size exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f}MB. Skipping this file.")
420
  else:
421
- # Re-encode MP3 to fix potential issues
422
- reencoded_mp3_path = reencode_mp3(audio_file.name)
423
- if not os.path.exists(reencoded_mp3_path):
424
- update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
425
- return update_progress("Processing failed: Re-encoded file not found"), "", ""
426
-
427
- temp_files.append(reencoded_mp3_path)
428
-
429
- # Convert re-encoded MP3 to WAV
430
- wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
431
- if not os.path.exists(wav_file_path):
432
- update_progress(f"Converted WAV file not found: {wav_file_path}")
433
- return update_progress("Processing failed: Converted WAV file not found"), "", ""
434
-
435
- temp_files.append(wav_file_path)
436
-
437
- # Initialize transcription
438
- transcription = ""
439
-
440
- if diarize:
441
- segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=True)
442
- else:
443
- segments = speech_to_text(wav_file_path, whisper_model=whisper_model)
444
-
445
- # Handle segments nested under 'segments' key
446
- if isinstance(segments, dict) and 'segments' in segments:
447
- segments = segments['segments']
448
-
449
- if isinstance(segments, list):
450
- transcription = " ".join([segment.get('Text', '') for segment in segments])
451
- else:
452
- update_progress("Unexpected segments format received from speech_to_text.")
453
- logging.error(f"Unexpected segments format: {segments}")
454
-
455
- chunked_text = improved_chunking_process(transcription, chunk_options)
456
 
457
- if api_name and api_key:
458
- try:
459
- summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
460
- update_progress("Audio summarized successfully.")
461
- except Exception as e:
462
- logging.error(f"Error during summarization: {str(e)}")
463
- summary = "Summary generation failed"
464
- else:
465
- summary = "No summary available (API not provided)"
466
 
467
- all_transcriptions.append(transcription)
468
- all_summaries.append(summary)
469
 
470
- add_media_with_keywords(
471
- url="Uploaded File",
472
- title=os.path.basename(wav_file_path),
473
- media_type='audio',
474
- content=transcription,
475
- keywords=custom_keywords,
476
- prompt=custom_prompt_input,
477
- summary=summary,
478
- transcription_model=whisper_model,
479
- author="Unknown",
480
- ingestion_date=datetime.now().strftime('%Y-%m-%d')
481
- )
482
- update_progress("Uploaded file processed and added to database.")
483
 
484
- # Final cleanup
485
- if not keep_original:
486
- cleanup_files()
487
 
488
  final_progress = update_progress("All processing complete.")
489
  final_transcriptions = "\n\n".join(all_transcriptions)
@@ -493,10 +452,39 @@ def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key
493
 
494
  except Exception as e:
495
  logging.error(f"Error processing audio files: {str(e)}")
496
  cleanup_files()
497
  return update_progress(f"Processing failed: {str(e)}"), "", ""
498
 
499
 
500
  def download_youtube_audio(url):
501
  try:
502
  # Determine ffmpeg path based on the operating system.
@@ -564,12 +552,55 @@ def download_youtube_audio(url):
564
  def process_podcast(url, title, author, keywords, custom_prompt, api_name, api_key, whisper_model,
565
  keep_original=False, enable_diarization=False, use_cookies=False, cookies=None,
566
  chunk_method=None, max_chunk_size=300, chunk_overlap=0, use_adaptive_chunking=False,
567
- use_multi_level_chunking=False, chunk_language='english'):
568
- progress = []
569
  error_message = ""
570
  temp_files = []
571
 
572
  def update_progress(message):
573
  progress.append(message)
574
  return "\n".join(progress)
575
 
@@ -583,13 +614,21 @@ def process_podcast(url, title, author, keywords, custom_prompt, api_name, api_k
583
  except Exception as e:
584
  update_progress(f"Failed to remove temporary file {file}: {str(e)}")
585
 
586
  try:
587
- # Download podcast
588
- audio_file = download_audio_file(url, use_cookies, cookies)
589
  temp_files.append(audio_file)
590
  update_progress("Podcast downloaded successfully.")
591
 
592
- # Extract metadata
593
  metadata = extract_metadata(url)
594
  title = title or metadata.get('title', 'Unknown Podcast')
595
  author = author or metadata.get('uploader', 'Unknown Author')
@@ -607,7 +646,7 @@ Duration: {metadata.get('duration', 'N/A')} seconds
607
  Description: {metadata.get('description', 'N/A')}
608
  """
609
 
610
- # Update keywords
611
  new_keywords = []
612
  if metadata.get('series'):
613
  new_keywords.append(f"series:{metadata['series']}")
@@ -617,22 +656,36 @@ Description: {metadata.get('description', 'N/A')}
617
  new_keywords.append(f"season:{metadata['season']}")
618
 
619
  keywords = f"{keywords},{','.join(new_keywords)}" if keywords else ','.join(new_keywords)
620
-
621
  update_progress(f"Metadata extracted - Title: {title}, Author: {author}, Keywords: {keywords}")
622
 
623
- # Transcribe the podcast
624
  try:
625
  if enable_diarization:
626
  segments = speech_to_text(audio_file, whisper_model=whisper_model, diarize=True)
627
  else:
628
  segments = speech_to_text(audio_file, whisper_model=whisper_model)
629
- transcription = " ".join([segment['Text'] for segment in segments])
630
- update_progress("Podcast transcribed successfully.")
631
  except Exception as e:
632
  error_message = f"Transcription failed: {str(e)}"
633
- raise
634
 
635
- # Apply chunking
636
  chunk_options = {
637
  'method': chunk_method,
638
  'max_size': max_chunk_size,
@@ -646,17 +699,19 @@ Description: {metadata.get('description', 'N/A')}
646
  # Combine metadata and transcription
647
  full_content = metadata_text + "\n\nTranscription:\n" + transcription
648
 
649
- # Summarize if API is provided
650
  summary = None
651
- if api_name and api_key:
652
  try:
653
  summary = perform_summarization(api_name, chunked_text, custom_prompt, api_key)
654
  update_progress("Podcast summarized successfully.")
655
  except Exception as e:
656
  error_message = f"Summarization failed: {str(e)}"
657
- raise
658
 
659
- # Add to database
660
  try:
661
  add_media_with_keywords(
662
  url=url,
@@ -673,18 +728,57 @@ Description: {metadata.get('description', 'N/A')}
673
  update_progress("Podcast added to database successfully.")
674
  except Exception as e:
675
  error_message = f"Error adding podcast to database: {str(e)}"
676
- raise
677
 
678
- # Cleanup
679
  cleanup_files()
680
 
681
- return (update_progress("Processing complete."), full_content, summary or "No summary generated.",
682
  title, author, keywords, error_message)
683
 
684
  except Exception as e:
685
  logging.error(f"Error processing podcast: {str(e)}")
686
  cleanup_files()
687
- return update_progress(f"Processing failed: {str(e)}"), "", "", "", "", "", str(e)
 
688
 
689
 
690
  #
 
19
  import os
20
  import subprocess
21
  import tempfile
22
+ import time
23
  import uuid
24
  from datetime import datetime
25
  from pathlib import Path
26
+ #
27
+ # External Imports
28
  import requests
29
  import yt_dlp
30
  #
31
  # Local Imports
32
+ from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords, \
33
  check_media_and_whisper_model
34
+ from App_Function_Libraries.Metrics.metrics_logger import log_counter, log_histogram
35
+ from App_Function_Libraries.Summarization.Summarization_General_Lib import perform_summarization
36
+ from App_Function_Libraries.Utils.Utils import downloaded_files, \
37
+ sanitize_filename, generate_unique_id, temp_files
38
  from App_Function_Libraries.Video_DL_Ingestion_Lib import extract_metadata
39
+ from App_Function_Libraries.Audio.Audio_Transcription_Lib import speech_to_text
40
+ from App_Function_Libraries.Chunk_Lib import improved_chunking_process
41
  #
42
  #######################################################################################################################
43
  # Function Definitions
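
The commit threads log_counter and log_histogram from App_Function_Libraries.Metrics.metrics_logger through every stage of the audio pipeline below. For reference, a minimal, hypothetical shim that matches only the call shape used in this diff (the real module lives elsewhere in the repo and may record to an actual metrics backend):

import logging

# Hypothetical stand-in for App_Function_Libraries.Metrics.metrics_logger.
# Signatures mirror the calls made in this commit: metric_name, value, labels.
def log_counter(metric_name, value=1, labels=None):
    logging.info("counter %s +%s %s", metric_name, value, labels or {})

def log_histogram(metric_name, value, labels=None):
    logging.info("histogram %s=%.3f %s", metric_name, float(value), labels or {})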
 
106
  logging.error(f"Unexpected error downloading audio file: {str(e)}")
107
  raise
108
 
109
  def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key, use_cookies, cookies, keep_original,
110
  custom_keywords, custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap,
111
+ use_adaptive_chunking, use_multi_level_chunking, chunk_language, diarize,
112
+ keep_timestamps, custom_title):
113
+
114
+ start_time = time.time() # Start time for processing
115
+ processed_count = 0
116
+ failed_count = 0
117
  progress = []
 
118
  all_transcriptions = []
119
  all_summaries = []
120
+ # v2: format the transcription with optional timestamps
121
+ def format_transcription_with_timestamps(segments):
122
+ if keep_timestamps:
123
+ formatted_segments = []
124
+ for segment in segments:
125
+ start = segment.get('Time_Start', 0)
126
+ end = segment.get('Time_End', 0)
127
+ text = segment.get('Text', '').strip() # Ensure text is stripped of leading/trailing spaces
128
+
129
+ # Add the formatted timestamp and text to the list, followed by a newline
130
+ formatted_segments.append(f"[{start:.2f}-{end:.2f}] {text}")
131
+
132
+ # Join the segments with a newline to ensure proper formatting
133
+ return "\n".join(formatted_segments)
134
+ else:
135
+ # Join the text without timestamps
136
+ return "\n".join([segment.get('Text', '').strip() for segment in segments])
137
 
138
  def update_progress(message):
139
  progress.append(message)
 
201
  audio_file_path = download_audio_file(url, use_cookies, cookies)
202
  if not os.path.exists(audio_file_path):
203
  update_progress(f"Downloaded file not found: {audio_file_path}")
204
+ failed_count += 1
205
+ log_counter(
206
+ metric_name="audio_files_failed_total",
207
+ labels={"whisper_model": whisper_model, "api_name": api_name},
208
+ value=1
209
+ )
210
  continue
211
 
212
  temp_files.append(audio_file_path)
 
216
  reencoded_mp3_path = reencode_mp3(audio_file_path)
217
  if not os.path.exists(reencoded_mp3_path):
218
  update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
219
+ failed_count += 1
220
+ log_counter(
221
+ metric_name="audio_files_failed_total",
222
+ labels={"whisper_model": whisper_model, "api_name": api_name},
223
+ value=1
224
+ )
225
  continue
226
 
227
  temp_files.append(reencoded_mp3_path)
 
230
  wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
231
  if not os.path.exists(wav_file_path):
232
  update_progress(f"Converted WAV file not found: {wav_file_path}")
233
+ failed_count += 1
234
+ log_counter(
235
+ metric_name="audio_files_failed_total",
236
+ labels={"whisper_model": whisper_model, "api_name": api_name},
237
+ value=1
238
+ )
239
  continue
240
 
241
  temp_files.append(wav_file_path)
 
254
  segments = segments['segments']
255
 
256
  if isinstance(segments, list):
257
+ # Log first 5 segments for debugging
258
+ logging.debug(f"Segments before formatting: {segments[:5]}")
259
+ transcription = format_transcription_with_timestamps(segments)
260
+ logging.debug(f"Formatted transcription (first 500 chars): {transcription[:500]}")
261
  update_progress("Audio transcribed successfully.")
262
  else:
263
  update_progress("Unexpected segments format received from speech_to_text.")
264
  logging.error(f"Unexpected segments format: {segments}")
265
+ failed_count += 1
266
+ log_counter(
267
+ metric_name="audio_files_failed_total",
268
+ labels={"whisper_model": whisper_model, "api_name": api_name},
269
+ value=1
270
+ )
271
  continue
272
 
273
  if not transcription.strip():
274
  update_progress("Transcription is empty.")
275
+ failed_count += 1
276
+ log_counter(
277
+ metric_name="audio_files_failed_total",
278
+ labels={"whisper_model": whisper_model, "api_name": api_name},
279
+ value=1
280
+ )
281
  else:
282
  # Apply chunking
283
  chunked_text = improved_chunking_process(transcription, chunk_options)
284
 
285
  # Summarize
286
+ logging.debug(f"Audio Transcription API Name: {api_name}")
287
  if api_name:
288
  try:
289
  summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
 
291
  except Exception as e:
292
  logging.error(f"Error during summarization: {str(e)}")
293
  summary = "Summary generation failed"
294
+ failed_count += 1
295
+ log_counter(
296
+ metric_name="audio_files_failed_total",
297
+ labels={"whisper_model": whisper_model, "api_name": api_name},
298
+ value=1
299
+ )
300
  else:
301
  summary = "No summary available (API not provided)"
302
 
303
  all_transcriptions.append(transcription)
304
  all_summaries.append(summary)
305
 
306
+ # Use custom_title if provided, otherwise use the original filename
307
+ title = custom_title if custom_title else os.path.basename(wav_file_path)
308
+
309
  # Add to database
310
  add_media_with_keywords(
311
  url=url,
312
+ title=title,
313
  media_type='audio',
314
  content=transcription,
315
  keywords=custom_keywords,
 
320
  ingestion_date=datetime.now().strftime('%Y-%m-%d')
321
  )
322
  update_progress("Audio file processed and added to database.")
323
+ processed_count += 1
324
+ log_counter(
325
+ metric_name="audio_files_processed_total",
326
+ labels={"whisper_model": whisper_model, "api_name": api_name},
327
+ value=1
328
+ )
329
 
330
  # Process uploaded file if provided
331
  if audio_file:
332
+ url = generate_unique_id()
333
  if os.path.getsize(audio_file.name) > MAX_FILE_SIZE:
334
  update_progress(
335
  f"Uploaded file size exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f}MB. Skipping this file.")
336
  else:
337
+ try:
338
+ # Re-encode MP3 to fix potential issues
339
+ reencoded_mp3_path = reencode_mp3(audio_file.name)
340
+ if not os.path.exists(reencoded_mp3_path):
341
+ update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
342
+ return update_progress("Processing failed: Re-encoded file not found"), "", ""
343
+
344
+ temp_files.append(reencoded_mp3_path)
345
+
346
+ # Convert re-encoded MP3 to WAV
347
+ wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
348
+ if not os.path.exists(wav_file_path):
349
+ update_progress(f"Converted WAV file not found: {wav_file_path}")
350
+ return update_progress("Processing failed: Converted WAV file not found"), "", ""
351
+
352
+ temp_files.append(wav_file_path)
353
+
354
+ # Initialize transcription
355
+ transcription = ""
356
+
357
+ if diarize:
358
+ segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=True)
359
+ else:
360
+ segments = speech_to_text(wav_file_path, whisper_model=whisper_model)
361
+
362
+ # Handle segments nested under 'segments' key
363
+ if isinstance(segments, dict) and 'segments' in segments:
364
+ segments = segments['segments']
365
+
366
+ if isinstance(segments, list):
367
+ transcription = format_transcription_with_timestamps(segments)
368
+ else:
369
+ update_progress("Unexpected segments format received from speech_to_text.")
370
+ logging.error(f"Unexpected segments format: {segments}")
371
+
372
+ chunked_text = improved_chunking_process(transcription, chunk_options)
373
+
374
+ logging.debug(f"Audio Transcription API Name: {api_name}")
375
+ if api_name:
376
+ try:
377
+ summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
378
+ update_progress("Audio summarized successfully.")
379
+ except Exception as e:
380
+ logging.error(f"Error during summarization: {str(e)}")
381
+ summary = "Summary generation failed"
382
+ else:
383
+ summary = "No summary available (API not provided)"
384
+
385
+ all_transcriptions.append(transcription)
386
+ all_summaries.append(summary)
387
+
388
+ # Use custom_title if provided, otherwise use the original filename
389
+ title = custom_title if custom_title else os.path.basename(wav_file_path)
390
+
391
+ add_media_with_keywords(
392
+ url="Uploaded File",
393
+ title=title,
394
+ media_type='audio',
395
+ content=transcription,
396
+ keywords=custom_keywords,
397
+ prompt=custom_prompt_input,
398
+ summary=summary,
399
+ transcription_model=whisper_model,
400
+ author="Unknown",
401
+ ingestion_date=datetime.now().strftime('%Y-%m-%d')
402
+ )
403
+ update_progress("Uploaded file processed and added to database.")
404
+ processed_count += 1
405
+ log_counter(
406
+ metric_name="audio_files_processed_total",
407
+ labels={"whisper_model": whisper_model, "api_name": api_name},
408
+ value=1
409
+ )
410
+ except Exception as e:
411
+ update_progress(f"Error processing uploaded file: {str(e)}")
412
+ logging.error(f"Error processing uploaded file: {str(e)}")
413
+ failed_count += 1
414
+ log_counter(
415
+ metric_name="audio_files_failed_total",
416
+ labels={"whisper_model": whisper_model, "api_name": api_name},
417
+ value=1
418
+ )
419
+ return update_progress("Processing failed: Error processing uploaded file"), "", ""
420
+ # Final cleanup
421
+ if not keep_original:
422
+ cleanup_files()
423
 
424
+ end_time = time.time()
425
+ processing_time = end_time - start_time
426
+ # Log processing time
427
+ log_histogram(
428
+ metric_name="audio_processing_time_seconds",
429
+ value=processing_time,
430
+ labels={"whisper_model": whisper_model, "api_name": api_name}
431
+ )
 
432
 
433
+ # Optionally, log total counts
434
+ log_counter(
435
+ metric_name="total_audio_files_processed",
436
+ labels={"whisper_model": whisper_model, "api_name": api_name},
437
+ value=processed_count
438
+ )
439
 
440
+ log_counter(
441
+ metric_name="total_audio_files_failed",
442
+ labels={"whisper_model": whisper_model, "api_name": api_name},
443
+ value=failed_count
444
+ )
445
 
446
 
447
  final_progress = update_progress("All processing complete.")
448
  final_transcriptions = "\n\n".join(all_transcriptions)
 
452
 
453
  except Exception as e:
454
  logging.error(f"Error processing audio files: {str(e)}")
455
+ log_counter(
456
+ metric_name="audio_files_failed_total",
457
+ labels={"whisper_model": whisper_model, "api_name": api_name},
458
+ value=1
459
+ )
460
  cleanup_files()
461
  return update_progress(f"Processing failed: {str(e)}"), "", ""
462
 
463
 
464
+ def format_transcription_with_timestamps(segments, keep_timestamps):
465
+ """
466
+ Formats the transcription segments with or without timestamps.
467
+
468
+ Parameters:
469
+ segments (list): List of transcription segments.
470
+ keep_timestamps (bool): Whether to include timestamps.
471
+
472
+ Returns:
473
+ str: Formatted transcription.
474
+ """
475
+ if keep_timestamps:
476
+ formatted_segments = []
477
+ for segment in segments:
478
+ start = segment.get('Time_Start', 0)
479
+ end = segment.get('Time_End', 0)
480
+ text = segment.get('Text', '').strip()
481
+
482
+ formatted_segments.append(f"[{start:.2f}-{end:.2f}] {text}")
483
+ return "\n".join(formatted_segments)
484
+ else:
485
+ return "\n".join([segment.get('Text', '').strip() for segment in segments])
486
+
487
+
488
  def download_youtube_audio(url):
489
  try:
490
  # Determine ffmpeg path based on the operating system.
 
552
  def process_podcast(url, title, author, keywords, custom_prompt, api_name, api_key, whisper_model,
553
  keep_original=False, enable_diarization=False, use_cookies=False, cookies=None,
554
  chunk_method=None, max_chunk_size=300, chunk_overlap=0, use_adaptive_chunking=False,
555
+ use_multi_level_chunking=False, chunk_language='english', keep_timestamps=True):
556
+ """
557
+ Processes a podcast by downloading the audio, transcribing it, summarizing the transcription,
558
+ and adding the results to the database. Metrics are logged throughout the process.
559
+
560
+ Parameters:
561
+ url (str): URL of the podcast.
562
+ title (str): Title of the podcast.
563
+ author (str): Author of the podcast.
564
+ keywords (str): Comma-separated keywords.
565
+ custom_prompt (str): Custom prompt for summarization.
566
+ api_name (str): API name for summarization.
567
+ api_key (str): API key for summarization.
568
+ whisper_model (str): Whisper model to use for transcription.
569
+ keep_original (bool): Whether to keep the original audio file.
570
+ enable_diarization (bool): Whether to enable speaker diarization.
571
+ use_cookies (bool): Whether to use cookies for authenticated downloads.
572
+ cookies (str): JSON-formatted cookies string.
573
+ chunk_method (str): Method for chunking text.
574
+ max_chunk_size (int): Maximum size for each text chunk.
575
+ chunk_overlap (int): Overlap size between chunks.
576
+ use_adaptive_chunking (bool): Whether to use adaptive chunking.
577
+ use_multi_level_chunking (bool): Whether to use multi-level chunking.
578
+ chunk_language (str): Language for chunking.
579
+ keep_timestamps (bool): Whether to keep timestamps in transcription.
580
+
581
+ Returns:
582
+ tuple: (progress_message, transcription, summary, title, author, keywords, error_message)
583
+ """
584
+ start_time = time.time() # Start time for processing
585
  error_message = ""
586
  temp_files = []
587
 
588
+ # Define labels for metrics
589
+ labels = {
590
+ "whisper_model": whisper_model,
591
+ "api_name": api_name if api_name else "None"
592
+ }
593
+
594
  def update_progress(message):
595
+ """
596
+ Updates the progress messages.
597
+
598
+ Parameters:
599
+ message (str): Progress message to append.
600
+
601
+ Returns:
602
+ str: Combined progress messages.
603
+ """
604
  progress.append(message)
605
  return "\n".join(progress)
606
 
 
614
  except Exception as e:
615
  update_progress(f"Failed to remove temporary file {file}: {str(e)}")
616
 
617
+ progress = [] # Initialize progress messages
618
+
619
  try:
620
+ # Handle cookies if required
621
+ if use_cookies:
622
+ cookies = json.loads(cookies)
623
+
624
+ # Download the podcast audio file
625
+ audio_file = download_audio_file(url, whisper_model, use_cookies, cookies)
626
+ if not audio_file:
627
+ raise RuntimeError("Failed to download podcast audio.")
628
  temp_files.append(audio_file)
629
  update_progress("Podcast downloaded successfully.")
630
 
631
+ # Extract metadata from the podcast
632
  metadata = extract_metadata(url)
633
  title = title or metadata.get('title', 'Unknown Podcast')
634
  author = author or metadata.get('uploader', 'Unknown Author')
 
646
  Description: {metadata.get('description', 'N/A')}
647
  """
648
 
649
+ # Update keywords with metadata information
650
  new_keywords = []
651
  if metadata.get('series'):
652
  new_keywords.append(f"series:{metadata['series']}")
 
656
  new_keywords.append(f"season:{metadata['season']}")
657
 
658
  keywords = f"{keywords},{','.join(new_keywords)}" if keywords else ','.join(new_keywords)
 
659
  update_progress(f"Metadata extracted - Title: {title}, Author: {author}, Keywords: {keywords}")
660
 
661
+ # Transcribe the podcast audio
662
  try:
663
  if enable_diarization:
664
  segments = speech_to_text(audio_file, whisper_model=whisper_model, diarize=True)
665
  else:
666
  segments = speech_to_text(audio_file, whisper_model=whisper_model)
667
+ # Seems like this could be optimized... FIXME
668
+ def format_segment(segment):
669
+ start = segment.get('Time_Start', 0)
670
+ end = segment.get('Time_End', 0)
671
+ text = segment.get('Text', '')
+ return f"[{start:.2f}-{end:.2f}] {text}"
672
+
673
+ if isinstance(segments, dict) and 'segments' in segments:
674
+ segments = segments['segments']
675
+
676
+ if isinstance(segments, list):
677
+ transcription = format_transcription_with_timestamps(segments, keep_timestamps)
678
+ update_progress("Podcast transcribed successfully.")
679
+ else:
680
+ raise ValueError("Unexpected segments format received from speech_to_text.")
681
+
682
+ if not transcription.strip():
683
+ raise ValueError("Transcription is empty.")
684
  except Exception as e:
685
  error_message = f"Transcription failed: {str(e)}"
686
+ raise RuntimeError(error_message)
687
 
688
+ # Apply chunking to the transcription
689
  chunk_options = {
690
  'method': chunk_method,
691
  'max_size': max_chunk_size,
 
699
  # Combine metadata and transcription
700
  full_content = metadata_text + "\n\nTranscription:\n" + transcription
701
 
702
+ # Summarize the transcription if API is provided
703
  summary = None
704
+ if api_name:
705
  try:
706
  summary = perform_summarization(api_name, chunked_text, custom_prompt, api_key)
707
  update_progress("Podcast summarized successfully.")
708
  except Exception as e:
709
  error_message = f"Summarization failed: {str(e)}"
710
+ raise RuntimeError(error_message)
711
+ else:
712
+ summary = "No summary available (API not provided)"
713
 
714
+ # Add the processed podcast to the database
715
  try:
716
  add_media_with_keywords(
717
  url=url,
 
728
  update_progress("Podcast added to database successfully.")
729
  except Exception as e:
730
  error_message = f"Error adding podcast to database: {str(e)}"
731
+ raise RuntimeError(error_message)
732
 
733
+ # Cleanup temporary files if required
734
  cleanup_files()
735
 
736
+ # Calculate processing time
737
+ end_time = time.time()
738
+ processing_time = end_time - start_time
739
+
740
+ # Log successful processing
741
+ log_counter(
742
+ metric_name="podcasts_processed_total",
743
+ labels=labels,
744
+ value=1
745
+ )
746
+
747
+ # Log processing time
748
+ log_histogram(
749
+ metric_name="podcast_processing_time_seconds",
750
+ value=processing_time,
751
+ labels=labels
752
+ )
753
+
754
+ # Return the final outputs
755
+ final_progress = update_progress("Processing complete.")
756
+ return (final_progress, full_content, summary or "No summary generated.",
757
  title, author, keywords, error_message)
758
 
759
  except Exception as e:
760
+ # Calculate processing time up to the point of failure
761
+ end_time = time.time()
762
+ processing_time = end_time - start_time
763
+
764
+ # Log failed processing
765
+ log_counter(
766
+ metric_name="podcasts_failed_total",
767
+ labels=labels,
768
+ value=1
769
+ )
770
+
771
+ # Log processing time even on failure
772
+ log_histogram(
773
+ metric_name="podcast_processing_time_seconds",
774
+ value=processing_time,
775
+ labels=labels
776
+ )
777
+
778
  logging.error(f"Error processing podcast: {str(e)}")
779
  cleanup_files()
780
+ final_progress = update_progress(f"Processing failed: {str(e)}")
781
+ return (final_progress, "", "", "", "", "", str(e))
782
 
783
 
784
  #
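
As a quick illustration of the new module-level format_transcription_with_timestamps helper above, assuming the segment shape that speech_to_text emits (Time_Start/Time_End/Text; the sample values here are made up):

from App_Function_Libraries.Audio.Audio_Files import format_transcription_with_timestamps

segments = [
    {"Time_Start": 0.0, "Time_End": 2.5, "Text": "Hello and welcome."},
    {"Time_Start": 2.5, "Time_End": 6.1, "Text": " Today: audio ingestion."},
]

# keep_timestamps=True -> one "[start-end] text" line per segment, times to two decimals
print(format_transcription_with_timestamps(segments, keep_timestamps=True))
# [0.00-2.50] Hello and welcome.
# [2.50-6.10] Today: audio ingestion.

# keep_timestamps=False -> just the stripped text, one segment per line
print(format_transcription_with_timestamps(segments, keep_timestamps=False))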
App_Function_Libraries/Audio/Audio_Transcription_Lib.py CHANGED
@@ -1,277 +1,335 @@
1
- # Audio_Transcription_Lib.py
2
- #########################################
3
- # Transcription Library
4
- # This library is used to perform transcription of audio files.
5
- # Currently, uses faster_whisper for transcription.
6
- #
7
- ####################
8
- # Function List
9
- #
10
- # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
11
- # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False)
12
- #
13
- ####################
14
- #
15
- # Import necessary libraries to run solo for testing
16
- import gc
17
- import json
18
- import logging
19
- import os
20
- import queue
21
- import sys
22
- import subprocess
23
- import tempfile
24
- import threading
25
- import time
26
- # DEBUG Imports
27
- #from memory_profiler import profile
28
- #import pyaudio
29
- from faster_whisper import WhisperModel as OriginalWhisperModel
30
- from typing import Optional, Union, List, Dict, Any
31
- #
32
- # Import Local
33
- from App_Function_Libraries.Utils.Utils import load_comprehensive_config
34
- #
35
- #######################################################################################################################
36
- # Function Definitions
37
- #
38
-
39
- # Convert video .m4a into .wav using ffmpeg
40
- # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
41
- # https://www.gyan.dev/ffmpeg/builds/
42
- #
43
-
44
-
45
- whisper_model_instance = None
46
- config = load_comprehensive_config()
47
- processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
48
-
49
-
50
-
51
- class WhisperModel(OriginalWhisperModel):
52
- tldw_dir = os.path.dirname(os.path.dirname(__file__))
53
- default_download_root = os.path.join(tldw_dir, 'App_Function_Libraries', 'models', 'Whisper')
54
-
55
- valid_model_sizes = [
56
- "tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium",
57
- "large-v1", "large-v2", "large-v3", "large", "distil-large-v2", "distil-medium.en",
58
- "distil-small.en", "distil-large-v3"
59
- ]
60
-
61
- def __init__(
62
- self,
63
- model_size_or_path: str,
64
- device: str = "auto",
65
- device_index: Union[int, List[int]] = 0,
66
- compute_type: str = "default",
67
- cpu_threads: int = 16,
68
- num_workers: int = 1,
69
- download_root: Optional[str] = None,
70
- local_files_only: bool = False,
71
- files: Optional[Dict[str, Any]] = None,
72
- **model_kwargs: Any
73
- ):
74
- if download_root is None:
75
- download_root = self.default_download_root
76
-
77
- os.makedirs(download_root, exist_ok=True)
78
-
79
- # FIXME - validate....
80
- # Also write an integration test...
81
- # Check if model_size_or_path is a valid model size
82
- if model_size_or_path in self.valid_model_sizes:
83
- # It's a model size, so we'll use the download_root
84
- model_path = os.path.join(download_root, model_size_or_path)
85
- if not os.path.isdir(model_path):
86
- # If it doesn't exist, we'll let the parent class download it
87
- model_size_or_path = model_size_or_path # Keep the original model size
88
- else:
89
- # If it exists, use the full path
90
- model_size_or_path = model_path
91
- else:
92
- # It's not a valid model size, so assume it's a path
93
- model_size_or_path = os.path.abspath(model_size_or_path)
94
-
95
- super().__init__(
96
- model_size_or_path,
97
- device=device,
98
- device_index=device_index,
99
- compute_type=compute_type,
100
- cpu_threads=cpu_threads,
101
- num_workers=num_workers,
102
- download_root=download_root,
103
- local_files_only=local_files_only,
104
- # Maybe? idk, FIXME
105
- # files=files,
106
- # **model_kwargs
107
- )
108
-
109
- def get_whisper_model(model_name, device):
110
- global whisper_model_instance
111
- if whisper_model_instance is None:
112
- logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
113
- whisper_model_instance = WhisperModel(model_name, device=device)
114
- return whisper_model_instance
115
-
116
- # # FIXME: This is a temporary solution.
117
- # # This doesn't clear older models, which means potentially a lot of memory is being used...
118
- # def get_whisper_model(model_name, device):
119
- # global whisper_model_instance
120
- # if whisper_model_instance is None:
121
- # from faster_whisper import WhisperModel
122
- # logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
123
- #
124
- # # FIXME - add logic to detect if the model is already downloaded
125
- # # want to first check if the model is already downloaded
126
- # # if not, download it using the existing logic in 'WhisperModel'
127
- # # https://github.com/SYSTRAN/faster-whisper/blob/d57c5b40b06e59ec44240d93485a95799548af50/faster_whisper/transcribe.py#L584
128
- # # Designated path should be `tldw/App_Function_Libraries/models/Whisper/`
129
- # WhisperModel.download_root = os.path.join(os.path.dirname(__file__), 'models', 'Whisper')
130
- # os.makedirs(WhisperModel.download_root, exist_ok=True)
131
- # whisper_model_instance = WhisperModel(model_name, device=device)
132
- # return whisper_model_instance
133
-
134
-
135
- # os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
136
- #DEBUG
137
- #@profile
138
- def convert_to_wav(video_file_path, offset=0, overwrite=False):
139
- out_path = os.path.splitext(video_file_path)[0] + ".wav"
140
-
141
- if os.path.exists(out_path) and not overwrite:
142
- print(f"File '{out_path}' already exists. Skipping conversion.")
143
- logging.info(f"Skipping conversion as file already exists: {out_path}")
144
- return out_path
145
- print("Starting conversion process of .m4a to .WAV")
146
- out_path = os.path.splitext(video_file_path)[0] + ".wav"
147
-
148
- try:
149
- if os.name == "nt":
150
- logging.debug("ffmpeg being ran on windows")
151
-
152
- if sys.platform.startswith('win'):
153
- ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
154
- logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
155
- else:
156
- ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems
157
-
158
- command = [
159
- ffmpeg_cmd, # Assuming the working directory is correctly set where .\Bin exists
160
- "-ss", "00:00:00", # Start at the beginning of the video
161
- "-i", video_file_path,
162
- "-ar", "16000", # Audio sample rate
163
- "-ac", "1", # Number of audio channels
164
- "-c:a", "pcm_s16le", # Audio codec
165
- out_path
166
- ]
167
- try:
168
- # Redirect stdin from null device to prevent ffmpeg from waiting for input
169
- with open(os.devnull, 'rb') as null_file:
170
- result = subprocess.run(command, stdin=null_file, text=True, capture_output=True)
171
- if result.returncode == 0:
172
- logging.info("FFmpeg executed successfully")
173
- logging.debug("FFmpeg output: %s", result.stdout)
174
- else:
175
- logging.error("Error in running FFmpeg")
176
- logging.error("FFmpeg stderr: %s", result.stderr)
177
- raise RuntimeError(f"FFmpeg error: {result.stderr}")
178
- except Exception as e:
179
- logging.error("Error occurred - ffmpeg doesn't like windows")
180
- raise RuntimeError("ffmpeg failed")
181
- elif os.name == "posix":
182
- os.system(f'ffmpeg -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
183
- else:
184
- raise RuntimeError("Unsupported operating system")
185
- logging.info("Conversion to WAV completed: %s", out_path)
186
- except subprocess.CalledProcessError as e:
187
- logging.error("Error executing FFmpeg command: %s", str(e))
188
- raise RuntimeError("Error converting video file to WAV")
189
- except Exception as e:
190
- logging.error("speech-to-text: Error transcribing audio: %s", str(e))
191
- return {"error": str(e)}
192
- gc.collect()
193
- return out_path
194
-
195
-
196
- # Transcribe .wav into .segments.json
197
- #DEBUG
198
- #@profile
199
- def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
200
- global whisper_model_instance, processing_choice
201
- logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)
202
-
203
- time_start = time.time()
204
- if audio_file_path is None:
205
- raise ValueError("speech-to-text: No audio file provided")
206
- logging.info("speech-to-text: Audio file path: %s", audio_file_path)
207
-
208
- try:
209
- _, file_ending = os.path.splitext(audio_file_path)
210
- out_file = audio_file_path.replace(file_ending, ".segments.json")
211
- prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")
212
- if os.path.exists(out_file):
213
- logging.info("speech-to-text: Segments file already exists: %s", out_file)
214
- with open(out_file) as f:
215
- global segments
216
- segments = json.load(f)
217
- return segments
218
-
219
- logging.info('speech-to-text: Starting transcription...')
220
- options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
221
- transcribe_options = dict(task="transcribe", **options)
222
- # use function and config at top of file
223
- logging.debug("speech-to-text: Using whisper model: %s", whisper_model)
224
- whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
225
- segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)
226
-
227
- segments = []
228
- for segment_chunk in segments_raw:
229
- chunk = {
230
- "Time_Start": segment_chunk.start,
231
- "Time_End": segment_chunk.end,
232
- "Text": segment_chunk.text
233
- }
234
- logging.debug("Segment: %s", chunk)
235
- segments.append(chunk)
236
- # Print to verify its working
237
- print(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
238
-
239
- # Log it as well.
240
- logging.debug(
241
- f"Transcribed Segment: {segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
242
-
243
- if segments:
244
- segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
245
-
246
- if not segments:
247
- raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")
248
- logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)
249
-
250
- # Save the segments to a JSON file - prettified and non-prettified
251
- # FIXME so this is an optional flag to save either the prettified json file or the normal one
252
- save_json = True
253
- if save_json:
254
- logging.info("speech-to-text: Saving segments to JSON file")
255
- output_data = {'segments': segments}
256
-
257
- logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
258
- with open(prettified_out_file, 'w') as f:
259
- json.dump(output_data, f, indent=2)
260
-
261
- logging.info("speech-to-text: Saving JSON to %s", out_file)
262
- with open(out_file, 'w') as f:
263
- json.dump(output_data, f)
264
-
265
- logging.debug(f"speech-to-text: returning {segments[:500]}")
266
- gc.collect()
267
- return segments
268
-
269
- except Exception as e:
270
- logging.error("speech-to-text: Error transcribing audio: %s", str(e))
271
- raise RuntimeError("speech-to-text: Error transcribing audio")
272
-
273
-
274
-
275
- #
276
- #
277
  #######################################################################################################################
 
1
+ # Audio_Transcription_Lib.py
2
+ #########################################
3
+ # Transcription Library
4
+ # This library is used to perform transcription of audio files.
5
+ # Currently, uses faster_whisper for transcription.
6
+ #
7
+ ####################
8
+ # Function List
9
+ #
10
+ # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
11
+ # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False)
12
+ #
13
+ ####################
14
+ #
15
+ # Import necessary libraries to run solo for testing
16
+ import gc
17
+ import json
18
+ import logging
19
+ import multiprocessing
20
+ import os
21
+ import queue
22
+ import sys
23
+ import subprocess
24
+ import tempfile
25
+ import threading
26
+ import time
27
+ # DEBUG Imports
28
+ #from memory_profiler import profile
29
+ import pyaudio
30
+ from faster_whisper import WhisperModel as OriginalWhisperModel
31
+ from typing import Optional, Union, List, Dict, Any
32
+ #
33
+ # Import Local
34
+ from App_Function_Libraries.Utils.Utils import load_comprehensive_config
35
+ from App_Function_Libraries.Metrics.metrics_logger import log_counter, log_histogram
36
+ #
37
+ #######################################################################################################################
38
+ # Function Definitions
39
+ #
40
+
41
+ # Convert video .m4a into .wav using ffmpeg
42
+ # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
43
+ # https://www.gyan.dev/ffmpeg/builds/
44
+ #
45
+
46
+
47
+ whisper_model_instance = None
48
+ config = load_comprehensive_config()
49
+ processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
50
+ total_thread_count = multiprocessing.cpu_count()
51
+
52
+
53
+ class WhisperModel(OriginalWhisperModel):
54
+ tldw_dir = os.path.dirname(os.path.dirname(__file__))
55
+ default_download_root = os.path.join(tldw_dir, 'models', 'Whisper')
56
+
57
+ valid_model_sizes = [
58
+ "tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium",
59
+ "large-v1", "large-v2", "large-v3", "large", "distil-large-v2", "distil-medium.en",
60
+ "distil-small.en", "distil-large-v3",
61
+ ]
62
+
63
+ def __init__(
64
+ self,
65
+ model_size_or_path: str,
66
+ device: str = processing_choice,
67
+ device_index: Union[int, List[int]] = 0,
68
+ compute_type: str = "default",
69
+ cpu_threads: int = 0,  # was total_thread_count; FIXME - I think this should be 0
70
+ num_workers: int = 1,
71
+ download_root: Optional[str] = None,
72
+ local_files_only: bool = False,
73
+ files: Optional[Dict[str, Any]] = None,
74
+ **model_kwargs: Any
75
+ ):
76
+ if download_root is None:
77
+ download_root = self.default_download_root
78
+
79
+ os.makedirs(download_root, exist_ok=True)
80
+
81
+ # FIXME - validate....
82
+ # Also write an integration test...
83
+ # Check if model_size_or_path is a valid model size
84
+ if model_size_or_path in self.valid_model_sizes:
85
+ # It's a model size, so we'll use the download_root
86
+ model_path = os.path.join(download_root, model_size_or_path)
87
+ if not os.path.isdir(model_path):
88
+ # If it doesn't exist, we'll let the parent class download it
89
+ model_size_or_path = model_size_or_path # Keep the original model size
90
+ else:
91
+ # If it exists, use the full path
92
+ model_size_or_path = model_path
93
+ else:
94
+ # It's not a valid model size, so assume it's a path
95
+ model_size_or_path = os.path.abspath(model_size_or_path)
96
+
97
+ super().__init__(
98
+ model_size_or_path,
99
+ device=device,
100
+ device_index=device_index,
101
+ compute_type=compute_type,
102
+ cpu_threads=cpu_threads,
103
+ num_workers=num_workers,
104
+ download_root=download_root,
105
+ local_files_only=local_files_only,
106
+ # Maybe? idk, FIXME
107
+ # files=files,
108
+ # **model_kwargs
109
+ )
110
+
111
+ def get_whisper_model(model_name, device):
112
+ global whisper_model_instance
113
+ if whisper_model_instance is None:
114
+ logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
115
+ whisper_model_instance = WhisperModel(model_name, device=device)
116
+ return whisper_model_instance
117
+
118
+ # os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
119
+ #DEBUG
120
+ #@profile
121
+ def convert_to_wav(video_file_path, offset=0, overwrite=False):
122
+ log_counter("convert_to_wav_attempt", labels={"file_path": video_file_path})
123
+ start_time = time.time()
124
+
125
+ out_path = os.path.splitext(video_file_path)[0] + ".wav"
126
+
127
+ if os.path.exists(out_path) and not overwrite:
128
+ print(f"File '{out_path}' already exists. Skipping conversion.")
129
+ logging.info(f"Skipping conversion as file already exists: {out_path}")
130
+ log_counter("convert_to_wav_skipped", labels={"file_path": video_file_path})
131
+ return out_path
132
+
133
+ print("Starting conversion process of .m4a to .WAV")
134
+ out_path = os.path.splitext(video_file_path)[0] + ".wav"
135
+
136
+ try:
137
+ if os.name == "nt":
138
+ logging.debug("ffmpeg being ran on windows")
139
+
140
+ if sys.platform.startswith('win'):
141
+ ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
142
+ logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
143
+ else:
144
+ ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems
145
+
146
+ command = [
147
+ ffmpeg_cmd, # Assuming the working directory is correctly set where .\Bin exists
148
+ "-ss", "00:00:00", # Start at the beginning of the video
149
+ "-i", video_file_path,
150
+ "-ar", "16000", # Audio sample rate
151
+ "-ac", "1", # Number of audio channels
152
+ "-c:a", "pcm_s16le", # Audio codec
153
+ out_path
154
+ ]
155
+ try:
156
+ # Redirect stdin from null device to prevent ffmpeg from waiting for input
157
+ with open(os.devnull, 'rb') as null_file:
158
+ result = subprocess.run(command, stdin=null_file, text=True, capture_output=True)
159
+ if result.returncode == 0:
160
+ logging.info("FFmpeg executed successfully")
161
+ logging.debug("FFmpeg output: %s", result.stdout)
162
+ else:
163
+ logging.error("Error in running FFmpeg")
164
+ logging.error("FFmpeg stderr: %s", result.stderr)
165
+ raise RuntimeError(f"FFmpeg error: {result.stderr}")
166
+ except Exception as e:
167
+ logging.error("Error occurred - ffmpeg doesn't like windows")
168
+ raise RuntimeError("ffmpeg failed")
169
+ elif os.name == "posix":
170
+ os.system(f'ffmpeg -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
171
+ else:
172
+ raise RuntimeError("Unsupported operating system")
173
+ logging.info("Conversion to WAV completed: %s", out_path)
174
+ log_counter("convert_to_wav_success", labels={"file_path": video_file_path})
175
+ except Exception as e:
176
+ logging.error("speech-to-text: Error transcribing audio: %s", str(e))
177
+ log_counter("convert_to_wav_error", labels={"file_path": video_file_path, "error": str(e)})
178
+ return {"error": str(e)}
179
+
180
+ conversion_time = time.time() - start_time
181
+ log_histogram("convert_to_wav_duration", conversion_time, labels={"file_path": video_file_path})
182
+
183
+ gc.collect()
184
+ return out_path
185
+
186
+
187
+ # Transcribe .wav into .segments.json
188
+ #DEBUG
189
+ #@profile
190
+ # FIXME - I feel like the `vad_filter` should be enabled by default....
191
+ def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
192
+ log_counter("speech_to_text_attempt", labels={"file_path": audio_file_path, "model": whisper_model})
193
+ time_start = time.time()
194
+
195
+ if audio_file_path is None:
196
+ log_counter("speech_to_text_error", labels={"error": "No audio file provided"})
197
+ raise ValueError("speech-to-text: No audio file provided")
198
+ logging.info("speech-to-text: Audio file path: %s", audio_file_path)
199
+
200
+ try:
201
+ _, file_ending = os.path.splitext(audio_file_path)
202
+ out_file = audio_file_path.replace(file_ending, "-whisper_model-"+whisper_model+".segments.json")
203
+ prettified_out_file = audio_file_path.replace(file_ending, "-whisper_model-"+whisper_model+".segments_pretty.json")
204
+ if os.path.exists(out_file):
205
+ logging.info("speech-to-text: Segments file already exists: %s", out_file)
206
+ with open(out_file) as f:
207
+ global segments
208
+ segments = json.load(f)
209
+ return segments
210
+
211
+ logging.info('speech-to-text: Starting transcription...')
212
+ # FIXME - revisit this
213
+ options = dict(language=selected_source_lang, beam_size=10, best_of=10, vad_filter=vad_filter)
214
+ transcribe_options = dict(task="transcribe", **options)
215
+ # use function and config at top of file
216
+ logging.debug("speech-to-text: Using whisper model: %s", whisper_model)
217
+ whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
218
+ # faster_whisper transcription right here - FIXME -test batching - ha
219
+ segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)
220
+
221
+ segments = []
222
+ for segment_chunk in segments_raw:
223
+ chunk = {
224
+ "Time_Start": segment_chunk.start,
225
+ "Time_End": segment_chunk.end,
226
+ "Text": segment_chunk.text
227
+ }
228
+ logging.debug("Segment: %s", chunk)
229
+ segments.append(chunk)
230
+ # Print to verify its working
231
+ logging.info(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
232
+
233
+ # Log it as well.
234
+ logging.debug(
235
+ f"Transcribed Segment: {segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
236
+
237
+ if segments:
238
+ segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
239
+
240
+ if not segments:
241
+ log_counter("speech_to_text_error", labels={"error": "No transcription produced"})
242
+ raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")
243
+
244
+ transcription_time = time.time() - time_start
245
+ logging.info("speech-to-text: Transcription completed in %.2f seconds", transcription_time)
246
+ log_histogram("speech_to_text_duration", transcription_time, labels={"file_path": audio_file_path, "model": whisper_model})
247
+ log_counter("speech_to_text_success", labels={"file_path": audio_file_path, "model": whisper_model})
248
+ # Save the segments to a JSON file - prettified and non-prettified
249
+ # FIXME refactor so this is an optional flag to save either the prettified json file or the normal one
250
+ save_json = True
251
+ if save_json:
252
+ logging.info("speech-to-text: Saving segments to JSON file")
253
+ output_data = {'segments': segments}
254
+ logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
255
+ with open(prettified_out_file, 'w') as f:
256
+ json.dump(output_data, f, indent=2)
257
+
258
+ logging.info("speech-to-text: Saving JSON to %s", out_file)
259
+ with open(out_file, 'w') as f:
260
+ json.dump(output_data, f)
261
+
262
+ logging.debug(f"speech-to-text: returning {segments[:500]}")
263
+ gc.collect()
264
+ return segments
265
+
266
+ except Exception as e:
267
+ logging.error("speech-to-text: Error transcribing audio: %s", str(e))
268
+ log_counter("speech_to_text_error", labels={"file_path": audio_file_path, "model": whisper_model, "error": str(e)})
269
+ raise RuntimeError("speech-to-text: Error transcribing audio")
270
+
271
+
272
+ def record_audio(duration, sample_rate=16000, chunk_size=1024):
273
+ log_counter("record_audio_attempt", labels={"duration": duration})
274
+ p = pyaudio.PyAudio()
275
+ stream = p.open(format=pyaudio.paInt16,
276
+ channels=1,
277
+ rate=sample_rate,
278
+ input=True,
279
+ frames_per_buffer=chunk_size)
280
+
281
+ print("Recording...")
282
+ frames = []
283
+ stop_recording = threading.Event()
284
+ audio_queue = queue.Queue()
285
+
286
+ def audio_callback():
287
+ for _ in range(0, int(sample_rate / chunk_size * duration)):
288
+ if stop_recording.is_set():
289
+ break
290
+ data = stream.read(chunk_size)
291
+ audio_queue.put(data)
292
+
293
+ audio_thread = threading.Thread(target=audio_callback)
294
+ audio_thread.start()
295
+
296
+ return p, stream, audio_queue, stop_recording, audio_thread
297
+
298
+
299
+ def stop_recording(p, stream, audio_queue, stop_recording_event, audio_thread):
300
+ log_counter("stop_recording_attempt")
301
+ start_time = time.time()
302
+ stop_recording_event.set()
303
+ audio_thread.join()
304
+
305
+ frames = []
306
+ while not audio_queue.empty():
307
+ frames.append(audio_queue.get())
308
+
309
+ print("Recording finished.")
310
+
311
+ stream.stop_stream()
312
+ stream.close()
313
+ p.terminate()
314
+
315
+ stop_time = time.time() - start_time
316
+ log_histogram("stop_recording_duration", stop_time)
317
+ log_counter("stop_recording_success")
318
+ return b''.join(frames)
319
+
320
+ def save_audio_temp(audio_data, sample_rate=16000):
321
+ log_counter("save_audio_temp_attempt")
322
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
323
+ import wave
324
+ wf = wave.open(temp_file.name, 'wb')
325
+ wf.setnchannels(1)
326
+ wf.setsampwidth(2)
327
+ wf.setframerate(sample_rate)
328
+ wf.writeframes(audio_data)
329
+ wf.close()
330
+ log_counter("save_audio_temp_success")
331
+ return temp_file.name
332
+
333
+ #
334
+ #
335
  #######################################################################################################################
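
The new recording helpers at the end of Audio_Transcription_Lib.py are designed to chain together. A minimal usage sketch (names and signatures come from the additions above; the duration and the downstream transcription call are illustrative assumptions):

import time
from App_Function_Libraries.Audio.Audio_Transcription_Lib import (
    record_audio, stop_recording, save_audio_temp, speech_to_text,
)

duration = 5  # seconds; illustrative value

# record_audio starts a capture thread and returns immediately, so the
# caller waits out the capture window before stopping.
p, stream, audio_queue, stop_event, audio_thread = record_audio(duration)
time.sleep(duration)

# stop_recording drains the queue, closes the PyAudio stream, and returns
# the raw PCM bytes that were captured.
audio_bytes = stop_recording(p, stream, audio_queue, stop_event, audio_thread)

# save_audio_temp wraps the bytes in a 16 kHz mono WAV file and returns its
# path, which can be fed straight into speech_to_text.
wav_path = save_audio_temp(audio_bytes)
segments = speech_to_text(wav_path, whisper_model='small.en')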