Spaces:

oceansweep
/

tldw

Running

App Files Community

oceansweep committed on Oct 16

Commit

74055ee

•

1 Parent(s): 0f1bc91

Upload 3 files

Browse files

Files changed (2) hide show

App_Function_Libraries/Audio/Audio_Files.py +345 -251
App_Function_Libraries/Audio/Audio_Transcription_Lib.py +334 -276

App_Function_Libraries/Audio/Audio_Files.py CHANGED Viewed

@@ -19,25 +19,25 @@ import logging
 import os
 import subprocess
 import tempfile
 import uuid
 from datetime import datetime
 from pathlib import Path
 import requests
 import yt_dlp
-from App_Function_Libraries.Audio.Audio_Transcription_Lib import speech_to_text
-from App_Function_Libraries.Chunk_Lib import improved_chunking_process
 #
 # Local Imports
-from App_Function_Libraries.DB.DB_Manager import add_media_to_database, add_media_with_keywords, \
     check_media_and_whisper_model
-from App_Function_Libraries.Summarization.Summarization_General_Lib import save_transcription_and_summary, perform_transcription, \
-    perform_summarization
-from App_Function_Libraries.Utils.Utils import create_download_directory, save_segments_to_json, downloaded_files, \
-    sanitize_filename
 from App_Function_Libraries.Video_DL_Ingestion_Lib import extract_metadata
 #
 #######################################################################################################################
 # Function Definitions
@@ -106,168 +106,34 @@ def download_audio_file(url, current_whisper_model="", use_cookies=False, cookie
         logging.error(f"Unexpected error downloading audio file: {str(e)}")
         raise
-def process_audio(
-        audio_file_path,
-        num_speakers=2,
-        whisper_model="small.en",
-        custom_prompt_input=None,
-        offset=0,
-        api_name=None,
-        api_key=None,
-        vad_filter=False,
-        rolling_summarization=False,
-        detail_level=0.01,
-        keywords="default,no_keyword_set",
-        chunk_text_by_words=False,
-        max_words=0,
-        chunk_text_by_sentences=False,
-        max_sentences=0,
-        chunk_text_by_paragraphs=False,
-        max_paragraphs=0,
-        chunk_text_by_tokens=False,
-        max_tokens=0
-):
-    try:
-        # Perform transcription
-        audio_file_path, segments = perform_transcription(audio_file_path, offset, whisper_model, vad_filter)
-        if audio_file_path is None or segments is None:
-            logging.error("Process_Audio: Transcription failed or segments not available.")
-            return "Process_Audio: Transcription failed.", None, None, None, None, None
-        logging.debug(f"Process_Audio: Transcription audio_file: {audio_file_path}")
-        logging.debug(f"Process_Audio: Transcription segments: {segments}")
-        transcription_text = {'audio_file': audio_file_path, 'transcription': segments}
-        logging.debug(f"Process_Audio: Transcription text: {transcription_text}")
-        # Save segments to JSON
-        segments_json_path = save_segments_to_json(segments)
-        # Perform summarization
-        summary_text = None
-        if api_name:
-            if rolling_summarization is not None:
-                pass
-                # FIXME rolling summarization
-                # summary_text = rolling_summarize_function(
-                #     transcription_text,
-                #     detail=detail_level,
-                #     api_name=api_name,
-                #     api_key=api_key,
-                #     custom_prompt=custom_prompt_input,
-                #     chunk_by_words=chunk_text_by_words,
-                #     max_words=max_words,
-                #     chunk_by_sentences=chunk_text_by_sentences,
-                #     max_sentences=max_sentences,
-                #     chunk_by_paragraphs=chunk_text_by_paragraphs,
-                #     max_paragraphs=max_paragraphs,
-                #     chunk_by_tokens=chunk_text_by_tokens,
-                #     max_tokens=max_tokens
-                # )
-            else:
-                summary_text = perform_summarization(api_name, segments_json_path, custom_prompt_input, api_key)
-            if summary_text is None:
-                logging.error("Summary text is None. Check summarization function.")
-                summary_file_path = None
-        else:
-            summary_text = 'Summary not available'
-            summary_file_path = None
-        # Save transcription and summary
-        download_path = create_download_directory("Audio_Processing")
-        json_file_path, summary_file_path = save_transcription_and_summary(transcription_text, summary_text,
-                                                                           download_path)
-        # Update function call to add_media_to_database so that it properly applies the title, author and file type
-        # Add to database
-        add_media_to_database(None, {'title': 'Audio File', 'author': 'Unknown'}, segments, summary_text, keywords,
-                              custom_prompt_input, whisper_model)
-        return transcription_text, summary_text, json_file_path, summary_file_path, None, None
-    except Exception as e:
-        logging.error(f"Error in process_audio: {str(e)}")
-        return str(e), None, None, None, None, None
-def process_single_audio(audio_file_path, whisper_model, api_name, api_key, keep_original,custom_keywords, source,
-                         custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
-                         use_multi_level_chunking, chunk_language):
-    progress = []
-    transcription = ""
-    summary = ""
-    def update_progress(message):
-        progress.append(message)
-        return "\n".join(progress)
-    try:
-        # Check file size before processing
-        file_size = os.path.getsize(audio_file_path)
-        if file_size > MAX_FILE_SIZE:
-            update_progress(f"File size ({file_size / (1024 * 1024):.2f} MB) exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f} MB. Skipping this file.")
-            return "\n".join(progress), "", ""
-        # Perform transcription
-        update_progress("Starting transcription...")
-        segments = speech_to_text(audio_file_path, whisper_model=whisper_model)
-        transcription = " ".join([segment['Text'] for segment in segments])
-        update_progress("Audio transcribed successfully.")
-        # Perform summarization if API is provided
-        if api_name and api_key:
-            update_progress("Starting summarization...")
-            summary = perform_summarization(api_name, transcription, "Summarize the following audio transcript",
-                                            api_key)
-            update_progress("Audio summarized successfully.")
-        else:
-            summary = "No summary available"
-        # Prepare keywords
-        keywords = "audio,transcription"
-        if custom_keywords:
-            keywords += f",{custom_keywords}"
-        # Add to database
-        add_media_with_keywords(
-            url=source,
-            title=os.path.basename(audio_file_path),
-            media_type='audio',
-            content=transcription,
-            keywords=keywords,
-            prompt="Summarize the following audio transcript",
-            summary=summary,
-            transcription_model=whisper_model,
-            author="Unknown",
-            ingestion_date=None  # This will use the current date
-        )
-        update_progress("Audio file added to database successfully.")
-        if not keep_original and source != "Uploaded File":
-            os.remove(audio_file_path)
-            update_progress(f"Temporary file {audio_file_path} removed.")
-        elif keep_original and source != "Uploaded File":
-            update_progress(f"Original audio file kept at: {audio_file_path}")
-    except Exception as e:
-        update_progress(f"Error processing {source}: {str(e)}")
-        transcription = f"Error: {str(e)}"
-        summary = "No summary due to error"
-    return "\n".join(progress), transcription, summary
 def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key, use_cookies, cookies, keep_original,
                         custom_keywords, custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap,
-                        use_adaptive_chunking, use_multi_level_chunking, chunk_language, diarize):
     progress = []
-    temp_files = []
     all_transcriptions = []
     all_summaries = []
     def update_progress(message):
         progress.append(message)
@@ -335,6 +201,12 @@ def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key
             audio_file_path = download_audio_file(url, use_cookies, cookies)
             if not os.path.exists(audio_file_path):
                 update_progress(f"Downloaded file not found: {audio_file_path}")
                 continue
             temp_files.append(audio_file_path)
@@ -344,6 +216,12 @@ def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key
             reencoded_mp3_path = reencode_mp3(audio_file_path)
             if not os.path.exists(reencoded_mp3_path):
                 update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
                 continue
             temp_files.append(reencoded_mp3_path)
@@ -352,6 +230,12 @@ def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key
             wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
             if not os.path.exists(wav_file_path):
                 update_progress(f"Converted WAV file not found: {wav_file_path}")
                 continue
             temp_files.append(wav_file_path)
@@ -370,20 +254,36 @@ def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key
                 segments = segments['segments']
             if isinstance(segments, list):
-                transcription = " ".join([segment.get('Text', '') for segment in segments])
                 update_progress("Audio transcribed successfully.")
             else:
                 update_progress("Unexpected segments format received from speech_to_text.")
                 logging.error(f"Unexpected segments format: {segments}")
                 continue
             if not transcription.strip():
                 update_progress("Transcription is empty.")
             else:
                 # Apply chunking
                 chunked_text = improved_chunking_process(transcription, chunk_options)
                 # Summarize
                 if api_name:
                     try:
                         summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
@@ -391,16 +291,25 @@ def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key
                     except Exception as e:
                         logging.error(f"Error during summarization: {str(e)}")
                         summary = "Summary generation failed"
                 else:
                     summary = "No summary available (API not provided)"
                 all_transcriptions.append(transcription)
                 all_summaries.append(summary)
                 # Add to database
                 add_media_with_keywords(
                     url=url,
-                    title=os.path.basename(wav_file_path),
                     media_type='audio',
                     content=transcription,
                     keywords=custom_keywords,
@@ -411,79 +320,129 @@ def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key
                     ingestion_date=datetime.now().strftime('%Y-%m-%d')
                 )
                 update_progress("Audio file processed and added to database.")
         # Process uploaded file if provided
         if audio_file:
             if os.path.getsize(audio_file.name) > MAX_FILE_SIZE:
                 update_progress(
                     f"Uploaded file size exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f}MB. Skipping this file.")
             else:
-                # Re-encode MP3 to fix potential issues
-                reencoded_mp3_path = reencode_mp3(audio_file.name)
-                if not os.path.exists(reencoded_mp3_path):
-                    update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
-                    return update_progress("Processing failed: Re-encoded file not found"), "", ""
-                temp_files.append(reencoded_mp3_path)
-                # Convert re-encoded MP3 to WAV
-                wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
-                if not os.path.exists(wav_file_path):
-                    update_progress(f"Converted WAV file not found: {wav_file_path}")
-                    return update_progress("Processing failed: Converted WAV file not found"), "", ""
-                temp_files.append(wav_file_path)
-                # Initialize transcription
-                transcription = ""
-                if diarize:
-                    segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=True)
-                else:
-                    segments = speech_to_text(wav_file_path, whisper_model=whisper_model)
-                # Handle segments nested under 'segments' key
-                if isinstance(segments, dict) and 'segments' in segments:
-                    segments = segments['segments']
-                if isinstance(segments, list):
-                    transcription = " ".join([segment.get('Text', '') for segment in segments])
-                else:
-                    update_progress("Unexpected segments format received from speech_to_text.")
-                    logging.error(f"Unexpected segments format: {segments}")
-                chunked_text = improved_chunking_process(transcription, chunk_options)
-                if api_name and api_key:
-                    try:
-                        summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
-                        update_progress("Audio summarized successfully.")
-                    except Exception as e:
-                        logging.error(f"Error during summarization: {str(e)}")
-                        summary = "Summary generation failed"
-                else:
-                    summary = "No summary available (API not provided)"
-                all_transcriptions.append(transcription)
-                all_summaries.append(summary)
-                add_media_with_keywords(
-                    url="Uploaded File",
-                    title=os.path.basename(wav_file_path),
-                    media_type='audio',
-                    content=transcription,
-                    keywords=custom_keywords,
-                    prompt=custom_prompt_input,
-                    summary=summary,
-                    transcription_model=whisper_model,
-                    author="Unknown",
-                    ingestion_date=datetime.now().strftime('%Y-%m-%d')
-                )
-                update_progress("Uploaded file processed and added to database.")
-        # Final cleanup
-        if not keep_original:
-            cleanup_files()
         final_progress = update_progress("All processing complete.")
         final_transcriptions = "\n\n".join(all_transcriptions)
@@ -493,10 +452,39 @@ def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key
     except Exception as e:
         logging.error(f"Error processing audio files: {str(e)}")
         cleanup_files()
         return update_progress(f"Processing failed: {str(e)}"), "", ""
 def download_youtube_audio(url):
     try:
         # Determine ffmpeg path based on the operating system.
@@ -564,12 +552,55 @@ def download_youtube_audio(url):
 def process_podcast(url, title, author, keywords, custom_prompt, api_name, api_key, whisper_model,
                     keep_original=False, enable_diarization=False, use_cookies=False, cookies=None,
                     chunk_method=None, max_chunk_size=300, chunk_overlap=0, use_adaptive_chunking=False,
-                    use_multi_level_chunking=False, chunk_language='english'):
-    progress = []
     error_message = ""
     temp_files = []
     def update_progress(message):
         progress.append(message)
         return "\n".join(progress)
@@ -583,13 +614,21 @@ def process_podcast(url, title, author, keywords, custom_prompt, api_name, api_k
                 except Exception as e:
                     update_progress(f"Failed to remove temporary file {file}: {str(e)}")
     try:
-        # Download podcast
-        audio_file = download_audio_file(url, use_cookies, cookies)
         temp_files.append(audio_file)
         update_progress("Podcast downloaded successfully.")
-        # Extract metadata
         metadata = extract_metadata(url)
         title = title or metadata.get('title', 'Unknown Podcast')
         author = author or metadata.get('uploader', 'Unknown Author')
@@ -607,7 +646,7 @@ Duration: {metadata.get('duration', 'N/A')} seconds
 Description: {metadata.get('description', 'N/A')}
 """
-        # Update keywords
         new_keywords = []
         if metadata.get('series'):
             new_keywords.append(f"series:{metadata['series']}")
@@ -617,22 +656,36 @@ Description: {metadata.get('description', 'N/A')}
             new_keywords.append(f"season:{metadata['season']}")
         keywords = f"{keywords},{','.join(new_keywords)}" if keywords else ','.join(new_keywords)
         update_progress(f"Metadata extracted - Title: {title}, Author: {author}, Keywords: {keywords}")
-        # Transcribe the podcast
         try:
             if enable_diarization:
                 segments = speech_to_text(audio_file, whisper_model=whisper_model, diarize=True)
             else:
                 segments = speech_to_text(audio_file, whisper_model=whisper_model)
-            transcription = " ".join([segment['Text'] for segment in segments])
-            update_progress("Podcast transcribed successfully.")
         except Exception as e:
             error_message = f"Transcription failed: {str(e)}"
-            raise
-        # Apply chunking
         chunk_options = {
             'method': chunk_method,
             'max_size': max_chunk_size,
@@ -646,17 +699,19 @@ Description: {metadata.get('description', 'N/A')}
         # Combine metadata and transcription
         full_content = metadata_text + "\n\nTranscription:\n" + transcription
-        # Summarize if API is provided
         summary = None
-        if api_name and api_key:
             try:
                 summary = perform_summarization(api_name, chunked_text, custom_prompt, api_key)
                 update_progress("Podcast summarized successfully.")
             except Exception as e:
                 error_message = f"Summarization failed: {str(e)}"
-                raise
-        # Add to database
         try:
             add_media_with_keywords(
                 url=url,
@@ -673,18 +728,57 @@ Description: {metadata.get('description', 'N/A')}
             update_progress("Podcast added to database successfully.")
         except Exception as e:
             error_message = f"Error adding podcast to database: {str(e)}"
-            raise
-        # Cleanup
         cleanup_files()
-        return (update_progress("Processing complete."), full_content, summary or "No summary generated.",
                 title, author, keywords, error_message)
     except Exception as e:
         logging.error(f"Error processing podcast: {str(e)}")
         cleanup_files()
-        return update_progress(f"Processing failed: {str(e)}"), "", "", "", "", "", str(e)
 #

 import os
 import subprocess
 import tempfile
+import time
 import uuid
 from datetime import datetime
 from pathlib import Path
+#
+# External Imports
 import requests
 import yt_dlp
 #
 # Local Imports
+from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords, \
     check_media_and_whisper_model
+from App_Function_Libraries.Metrics.metrics_logger import log_counter, log_histogram
+from App_Function_Libraries.Summarization.Summarization_General_Lib import perform_summarization
+from App_Function_Libraries.Utils.Utils import downloaded_files, \
+    sanitize_filename, generate_unique_id, temp_files
 from App_Function_Libraries.Video_DL_Ingestion_Lib import extract_metadata
+from App_Function_Libraries.Audio.Audio_Transcription_Lib import speech_to_text
+from App_Function_Libraries.Chunk_Lib import improved_chunking_process
 #
 #######################################################################################################################
 # Function Definitions
         logging.error(f"Unexpected error downloading audio file: {str(e)}")
         raise
 def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key, use_cookies, cookies, keep_original,
                         custom_keywords, custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap,
+                        use_adaptive_chunking, use_multi_level_chunking, chunk_language, diarize,
+                        keep_timestamps, custom_title):
+    start_time = time.time()  # Start time for processing
+    processed_count = 0
+    failed_count = 0
     progress = []
     all_transcriptions = []
     all_summaries = []
+    #v2
+    def format_transcription_with_timestamps(segments):
+        if keep_timestamps:
+            formatted_segments = []
+            for segment in segments:
+                start = segment.get('Time_Start', 0)
+                end = segment.get('Time_End', 0)
+                text = segment.get('Text', '').strip()  # Ensure text is stripped of leading/trailing spaces
+                # Add the formatted timestamp and text to the list, followed by a newline
+                formatted_segments.append(f"[{start:.2f}-{end:.2f}] {text}")
+            # Join the segments with a newline to ensure proper formatting
+            return "\n".join(formatted_segments)
+        else:
+            # Join the text without timestamps
+            return "\n".join([segment.get('Text', '').strip() for segment in segments])
     def update_progress(message):
         progress.append(message)
             audio_file_path = download_audio_file(url, use_cookies, cookies)
             if not os.path.exists(audio_file_path):
                 update_progress(f"Downloaded file not found: {audio_file_path}")
+                failed_count += 1
+                log_counter(
+                    metric_name="audio_files_failed_total",
+                    labels={"whisper_model": whisper_model, "api_name": api_name},
+                    value=1
+                )
                 continue
             temp_files.append(audio_file_path)
             reencoded_mp3_path = reencode_mp3(audio_file_path)
             if not os.path.exists(reencoded_mp3_path):
                 update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
+                failed_count += 1
+                log_counter(
+                    metric_name="audio_files_failed_total",
+                    labels={"whisper_model": whisper_model, "api_name": api_name},
+                    value=1
+                )
                 continue
             temp_files.append(reencoded_mp3_path)
             wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
             if not os.path.exists(wav_file_path):
                 update_progress(f"Converted WAV file not found: {wav_file_path}")
+                failed_count += 1
+                log_counter(
+                    metric_name="audio_files_failed_total",
+                    labels={"whisper_model": whisper_model, "api_name": api_name},
+                    value=1
+                )
                 continue
             temp_files.append(wav_file_path)
                 segments = segments['segments']
             if isinstance(segments, list):
+                # Log first 5 segments for debugging
+                logging.debug(f"Segments before formatting: {segments[:5]}")
+                transcription = format_transcription_with_timestamps(segments)
+                logging.debug(f"Formatted transcription (first 500 chars): {transcription[:500]}")
                 update_progress("Audio transcribed successfully.")
             else:
                 update_progress("Unexpected segments format received from speech_to_text.")
                 logging.error(f"Unexpected segments format: {segments}")
+                failed_count += 1
+                log_counter(
+                    metric_name="audio_files_failed_total",
+                    labels={"whisper_model": whisper_model, "api_name": api_name},
+                    value=1
+                )
                 continue
             if not transcription.strip():
                 update_progress("Transcription is empty.")
+                failed_count += 1
+                log_counter(
+                    metric_name="audio_files_failed_total",
+                    labels={"whisper_model": whisper_model, "api_name": api_name},
+                    value=1
+                )
             else:
                 # Apply chunking
                 chunked_text = improved_chunking_process(transcription, chunk_options)
                 # Summarize
+                logging.debug(f"Audio Transcription API Name: {api_name}")
                 if api_name:
                     try:
                         summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
                     except Exception as e:
                         logging.error(f"Error during summarization: {str(e)}")
                         summary = "Summary generation failed"
+                        failed_count += 1
+                        log_counter(
+                            metric_name="audio_files_failed_total",
+                            labels={"whisper_model": whisper_model, "api_name": api_name},
+                            value=1
+                        )
                 else:
                     summary = "No summary available (API not provided)"
                 all_transcriptions.append(transcription)
                 all_summaries.append(summary)
+                # Use custom_title if provided, otherwise use the original filename
+                title = custom_title if custom_title else os.path.basename(wav_file_path)
                 # Add to database
                 add_media_with_keywords(
                     url=url,
+                    title=title,
                     media_type='audio',
                     content=transcription,
                     keywords=custom_keywords,
                     ingestion_date=datetime.now().strftime('%Y-%m-%d')
                 )
                 update_progress("Audio file processed and added to database.")
+                processed_count += 1
+                log_counter(
+                    metric_name="audio_files_processed_total",
+                    labels={"whisper_model": whisper_model, "api_name": api_name},
+                    value=1
+                )
         # Process uploaded file if provided
         if audio_file:
+            url = generate_unique_id()
             if os.path.getsize(audio_file.name) > MAX_FILE_SIZE:
                 update_progress(
                     f"Uploaded file size exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f}MB. Skipping this file.")
             else:
+                try:
+                    # Re-encode MP3 to fix potential issues
+                    reencoded_mp3_path = reencode_mp3(audio_file.name)
+                    if not os.path.exists(reencoded_mp3_path):
+                        update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
+                        return update_progress("Processing failed: Re-encoded file not found"), "", ""
+                    temp_files.append(reencoded_mp3_path)
+                    # Convert re-encoded MP3 to WAV
+                    wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
+                    if not os.path.exists(wav_file_path):
+                        update_progress(f"Converted WAV file not found: {wav_file_path}")
+                        return update_progress("Processing failed: Converted WAV file not found"), "", ""
+                    temp_files.append(wav_file_path)
+                    # Initialize transcription
+                    transcription = ""
+                    if diarize:
+                        segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=True)
+                    else:
+                        segments = speech_to_text(wav_file_path, whisper_model=whisper_model)
+                    # Handle segments nested under 'segments' key
+                    if isinstance(segments, dict) and 'segments' in segments:
+                        segments = segments['segments']
+                    if isinstance(segments, list):
+                        transcription = format_transcription_with_timestamps(segments)
+                    else:
+                        update_progress("Unexpected segments format received from speech_to_text.")
+                        logging.error(f"Unexpected segments format: {segments}")
+                    chunked_text = improved_chunking_process(transcription, chunk_options)
+                    logging.debug(f"Audio Transcription API Name: {api_name}")
+                    if api_name:
+                        try:
+                            summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
+                            update_progress("Audio summarized successfully.")
+                        except Exception as e:
+                            logging.error(f"Error during summarization: {str(e)}")
+                            summary = "Summary generation failed"
+                    else:
+                        summary = "No summary available (API not provided)"
+                    all_transcriptions.append(transcription)
+                    all_summaries.append(summary)
+                    # Use custom_title if provided, otherwise use the original filename
+                    title = custom_title if custom_title else os.path.basename(wav_file_path)
+                    add_media_with_keywords(
+                        url="Uploaded File",
+                        title=title,
+                        media_type='audio',
+                        content=transcription,
+                        keywords=custom_keywords,
+                        prompt=custom_prompt_input,
+                        summary=summary,
+                        transcription_model=whisper_model,
+                        author="Unknown",
+                        ingestion_date=datetime.now().strftime('%Y-%m-%d')
+                    )
+                    update_progress("Uploaded file processed and added to database.")
+                    processed_count += 1
+                    log_counter(
+                        metric_name="audio_files_processed_total",
+                        labels={"whisper_model": whisper_model, "api_name": api_name},
+                        value=1
+                    )
+                except Exception as e:
+                    update_progress(f"Error processing uploaded file: {str(e)}")
+                    logging.error(f"Error processing uploaded file: {str(e)}")
+                    failed_count += 1
+                    log_counter(
+                        metric_name="audio_files_failed_total",
+                        labels={"whisper_model": whisper_model, "api_name": api_name},
+                        value=1
+                    )
+                    return update_progress("Processing failed: Error processing uploaded file"), "", ""
+        # Final cleanup
+        if not keep_original:
+            cleanup_files()
+        end_time = time.time()
+        processing_time = end_time - start_time
+        # Log processing time
+        log_histogram(
+            metric_name="audio_processing_time_seconds",
+            value=processing_time,
+            labels={"whisper_model": whisper_model, "api_name": api_name}
+        )
+        # Optionally, log total counts
+        log_counter(
+            metric_name="total_audio_files_processed",
+            labels={"whisper_model": whisper_model, "api_name": api_name},
+            value=processed_count
+        )
+        log_counter(
+            metric_name="total_audio_files_failed",
+            labels={"whisper_model": whisper_model, "api_name": api_name},
+            value=failed_count
+        )
         final_progress = update_progress("All processing complete.")
         final_transcriptions = "\n\n".join(all_transcriptions)
     except Exception as e:
         logging.error(f"Error processing audio files: {str(e)}")
+        log_counter(
+            metric_name="audio_files_failed_total",
+            labels={"whisper_model": whisper_model, "api_name": api_name},
+            value=1
+        )
         cleanup_files()
         return update_progress(f"Processing failed: {str(e)}"), "", ""
+def format_transcription_with_timestamps(segments, keep_timestamps):
+    """
+    Formats the transcription segments with or without timestamps.
+    Each segment is read through its 'Time_Start', 'Time_End' and 'Text' keys;
+    missing or None values fall back to 0 (times) or '' (text).
+    Parameters:
+        segments (list): List of transcription segment dicts. None or an empty
+            list yields an empty string.
+        keep_timestamps (bool): Whether to include timestamps.
+    Returns:
+        str: One line per segment, prefixed with "[start-end] " (two decimals)
+            when keep_timestamps is true.
+    """
+    lines = []
+    for segment in segments or []:
+        # `or` also covers keys that are present but hold None (e.g. JSON null),
+        # which a plain .get(key, default) would pass straight through.
+        text = (segment.get('Text') or '').strip()
+        if keep_timestamps:
+            start = segment.get('Time_Start') or 0
+            end = segment.get('Time_End') or 0
+            lines.append(f"[{start:.2f}-{end:.2f}] {text}")
+        else:
+            lines.append(text)
+    return "\n".join(lines)
 def download_youtube_audio(url):
     try:
         # Determine ffmpeg path based on the operating system.
 def process_podcast(url, title, author, keywords, custom_prompt, api_name, api_key, whisper_model,
                     keep_original=False, enable_diarization=False, use_cookies=False, cookies=None,
                     chunk_method=None, max_chunk_size=300, chunk_overlap=0, use_adaptive_chunking=False,
+                    use_multi_level_chunking=False, chunk_language='english', keep_timestamps=True):
+    """
+    Processes a podcast by downloading the audio, transcribing it, summarizing the transcription,
+    and adding the results to the database. Metrics are logged throughout the process.
+    Parameters:
+        url (str): URL of the podcast.
+        title (str): Title of the podcast.
+        author (str): Author of the podcast.
+        keywords (str): Comma-separated keywords.
+        custom_prompt (str): Custom prompt for summarization.
+        api_name (str): API name for summarization.
+        api_key (str): API key for summarization.
+        whisper_model (str): Whisper model to use for transcription.
+        keep_original (bool): Whether to keep the original audio file.
+        enable_diarization (bool): Whether to enable speaker diarization.
+        use_cookies (bool): Whether to use cookies for authenticated downloads.
+        cookies (str): JSON-formatted cookies string.
+        chunk_method (str): Method for chunking text.
+        max_chunk_size (int): Maximum size for each text chunk.
+        chunk_overlap (int): Overlap size between chunks.
+        use_adaptive_chunking (bool): Whether to use adaptive chunking.
+        use_multi_level_chunking (bool): Whether to use multi-level chunking.
+        chunk_language (str): Language for chunking.
+        keep_timestamps (bool): Whether to keep timestamps in transcription.
+    Returns:
+        tuple: (progress_message, transcription, summary, title, author, keywords, error_message)
+    """
+    start_time = time.time()  # Start time for processing
     error_message = ""
     temp_files = []
+    # Define labels for metrics
+    labels = {
+        "whisper_model": whisper_model,
+        "api_name": api_name if api_name else "None"
+    }
     def update_progress(message):
+        """
+        Updates the progress messages.
+        Parameters:
+            message (str): Progress message to append.
+        Returns:
+            str: Combined progress messages.
+        """
         progress.append(message)
         return "\n".join(progress)
                 except Exception as e:
                     update_progress(f"Failed to remove temporary file {file}: {str(e)}")
+    progress = []  # Initialize progress messages
     try:
+        # Handle cookies if required
+        if use_cookies:
+            cookies = json.loads(cookies)
+        # Download the podcast audio file
+        audio_file = download_audio_file(url, whisper_model, use_cookies, cookies)
+        if not audio_file:
+            raise RuntimeError("Failed to download podcast audio.")
         temp_files.append(audio_file)
         update_progress("Podcast downloaded successfully.")
+        # Extract metadata from the podcast
         metadata = extract_metadata(url)
         title = title or metadata.get('title', 'Unknown Podcast')
         author = author or metadata.get('uploader', 'Unknown Author')
 Description: {metadata.get('description', 'N/A')}
 """
+        # Update keywords with metadata information
         new_keywords = []
         if metadata.get('series'):
             new_keywords.append(f"series:{metadata['series']}")
             new_keywords.append(f"season:{metadata['season']}")
         keywords = f"{keywords},{','.join(new_keywords)}" if keywords else ','.join(new_keywords)
         update_progress(f"Metadata extracted - Title: {title}, Author: {author}, Keywords: {keywords}")
+        # Transcribe the podcast audio
         try:
             if enable_diarization:
                 segments = speech_to_text(audio_file, whisper_model=whisper_model, diarize=True)
             else:
                 segments = speech_to_text(audio_file, whisper_model=whisper_model)
+            # Seems like this could be optimized... FIXME
+            def format_segment(segment):
+                start = segment.get('start', 0)
+                end = segment.get('end', 0)
+                text = segment.get('Text', '')
+            if isinstance(segments, dict) and 'segments' in segments:
+                segments = segments['segments']
+            if isinstance(segments, list):
+                transcription = format_transcription_with_timestamps(segments, keep_timestamps)
+                update_progress("Podcast transcribed successfully.")
+            else:
+                raise ValueError("Unexpected segments format received from speech_to_text.")
+            if not transcription.strip():
+                raise ValueError("Transcription is empty.")
         except Exception as e:
             error_message = f"Transcription failed: {str(e)}"
+            raise RuntimeError(error_message)
+        # Apply chunking to the transcription
         chunk_options = {
             'method': chunk_method,
             'max_size': max_chunk_size,
         # Combine metadata and transcription
         full_content = metadata_text + "\n\nTranscription:\n" + transcription
+        # Summarize the transcription if API is provided
         summary = None
+        if api_name:
             try:
                 summary = perform_summarization(api_name, chunked_text, custom_prompt, api_key)
                 update_progress("Podcast summarized successfully.")
             except Exception as e:
                 error_message = f"Summarization failed: {str(e)}"
+                raise RuntimeError(error_message)
+        else:
+            summary = "No summary available (API not provided)"
+        # Add the processed podcast to the database
         try:
             add_media_with_keywords(
                 url=url,
             update_progress("Podcast added to database successfully.")
         except Exception as e:
             error_message = f"Error adding podcast to database: {str(e)}"
+            raise RuntimeError(error_message)
+        # Cleanup temporary files if required
         cleanup_files()
+        # Calculate processing time
+        end_time = time.time()
+        processing_time = end_time - start_time
+        # Log successful processing
+        log_counter(
+            metric_name="podcasts_processed_total",
+            labels=labels,
+            value=1
+        )
+        # Log processing time
+        log_histogram(
+            metric_name="podcast_processing_time_seconds",
+            value=processing_time,
+            labels=labels
+        )
+        # Return the final outputs
+        final_progress = update_progress("Processing complete.")
+        return (final_progress, full_content, summary or "No summary generated.",
                 title, author, keywords, error_message)
     except Exception as e:
+        # Calculate processing time up to the point of failure
+        end_time = time.time()
+        processing_time = end_time - start_time
+        # Log failed processing
+        log_counter(
+            metric_name="podcasts_failed_total",
+            labels=labels,
+            value=1
+        )
+        # Log processing time even on failure
+        log_histogram(
+            metric_name="podcast_processing_time_seconds",
+            value=processing_time,
+            labels=labels
+        )
         logging.error(f"Error processing podcast: {str(e)}")
         cleanup_files()
+        final_progress = update_progress(f"Processing failed: {str(e)}")
+        return (final_progress, "", "", "", "", "", str(e))
 #

App_Function_Libraries/Audio/Audio_Transcription_Lib.py CHANGED Viewed

@@ -1,277 +1,335 @@
-# Audio_Transcription_Lib.py
-#########################################
-# Transcription Library
-# This library is used to perform transcription of audio files.
-# Currently, uses faster_whisper for transcription.
-#
-####################
-# Function List
-#
-# 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
-# 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False)
-#
-####################
-#
-# Import necessary libraries to run solo for testing
-import gc
-import json
-import logging
-import os
-import queue
-import sys
-import subprocess
-import tempfile
-import threading
-import time
-# DEBUG Imports
-#from memory_profiler import profile
-#import pyaudio
-from faster_whisper import WhisperModel as OriginalWhisperModel
-from typing import Optional, Union, List, Dict, Any
-#
-# Import Local
-from App_Function_Libraries.Utils.Utils import load_comprehensive_config
-#
-#######################################################################################################################
-# Function Definitions
-#
-# Convert video .m4a into .wav using ffmpeg
-#   ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
-#       https://www.gyan.dev/ffmpeg/builds/
-#
-whisper_model_instance = None
-config = load_comprehensive_config()
-processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
-class WhisperModel(OriginalWhisperModel):
-    tldw_dir = os.path.dirname(os.path.dirname(__file__))
-    default_download_root = os.path.join(tldw_dir, 'App_Function_Libraries', 'models', 'Whisper')
-    valid_model_sizes = [
-        "tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium",
-        "large-v1", "large-v2", "large-v3", "large", "distil-large-v2", "distil-medium.en",
-        "distil-small.en", "distil-large-v3"
-    ]
-    def __init__(
-        self,
-        model_size_or_path: str,
-        device: str = "auto",
-        device_index: Union[int, List[int]] = 0,
-        compute_type: str = "default",
-        cpu_threads: int = 16,
-        num_workers: int = 1,
-        download_root: Optional[str] = None,
-        local_files_only: bool = False,
-        files: Optional[Dict[str, Any]] = None,
-        **model_kwargs: Any
-    ):
-        if download_root is None:
-            download_root = self.default_download_root
-        os.makedirs(download_root, exist_ok=True)
-        # FIXME - validate....
-        # Also write an integration test...
-        # Check if model_size_or_path is a valid model size
-        if model_size_or_path in self.valid_model_sizes:
-            # It's a model size, so we'll use the download_root
-            model_path = os.path.join(download_root, model_size_or_path)
-            if not os.path.isdir(model_path):
-                # If it doesn't exist, we'll let the parent class download it
-                model_size_or_path = model_size_or_path  # Keep the original model size
-            else:
-                # If it exists, use the full path
-                model_size_or_path = model_path
-        else:
-            # It's not a valid model size, so assume it's a path
-            model_size_or_path = os.path.abspath(model_size_or_path)
-        super().__init__(
-            model_size_or_path,
-            device=device,
-            device_index=device_index,
-            compute_type=compute_type,
-            cpu_threads=cpu_threads,
-            num_workers=num_workers,
-            download_root=download_root,
-            local_files_only=local_files_only,
-# Maybe? idk, FIXME
-#            files=files,
-#            **model_kwargs
-        )
-def get_whisper_model(model_name, device):
-    global whisper_model_instance
-    if whisper_model_instance is None:
-        logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
-        whisper_model_instance = WhisperModel(model_name, device=device)
-    return whisper_model_instance
-# # FIXME: This is a temporary solution.
-# # This doesn't clear older models, which means potentially a lot of memory is being used...
-# def get_whisper_model(model_name, device):
-#     global whisper_model_instance
-#     if whisper_model_instance is None:
-#         from faster_whisper import WhisperModel
-#         logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
-#
-#         # FIXME - add logic to detect if the model is already downloaded
-#         # want to first check if the model is already downloaded
-#         # if not, download it using the existing logic in 'WhisperModel'
-#         # https://github.com/SYSTRAN/faster-whisper/blob/d57c5b40b06e59ec44240d93485a95799548af50/faster_whisper/transcribe.py#L584
-#         # Designated path should be `tldw/App_Function_Libraries/models/Whisper/`
-#         WhisperModel.download_root = os.path.join(os.path.dirname(__file__), 'models', 'Whisper')
-#         os.makedirs(WhisperModel.download_root, exist_ok=True)
-#         whisper_model_instance = WhisperModel(model_name, device=device)
-#     return whisper_model_instance
-# os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
-#DEBUG
-#@profile
-def convert_to_wav(video_file_path, offset=0, overwrite=False):
-    out_path = os.path.splitext(video_file_path)[0] + ".wav"
-    if os.path.exists(out_path) and not overwrite:
-        print(f"File '{out_path}' already exists. Skipping conversion.")
-        logging.info(f"Skipping conversion as file already exists: {out_path}")
-        return out_path
-    print("Starting conversion process of .m4a to .WAV")
-    out_path = os.path.splitext(video_file_path)[0] + ".wav"
-    try:
-        if os.name == "nt":
-            logging.debug("ffmpeg being ran on windows")
-            if sys.platform.startswith('win'):
-                ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
-                logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
-            else:
-                ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH for non-Windows systems
-            command = [
-                ffmpeg_cmd,  # Assuming the working directory is correctly set where .\Bin exists
-                "-ss", "00:00:00",  # Start at the beginning of the video
-                "-i", video_file_path,
-                "-ar", "16000",  # Audio sample rate
-                "-ac", "1",  # Number of audio channels
-                "-c:a", "pcm_s16le",  # Audio codec
-                out_path
-            ]
-            try:
-                # Redirect stdin from null device to prevent ffmpeg from waiting for input
-                with open(os.devnull, 'rb') as null_file:
-                    result = subprocess.run(command, stdin=null_file, text=True, capture_output=True)
-                if result.returncode == 0:
-                    logging.info("FFmpeg executed successfully")
-                    logging.debug("FFmpeg output: %s", result.stdout)
-                else:
-                    logging.error("Error in running FFmpeg")
-                    logging.error("FFmpeg stderr: %s", result.stderr)
-                    raise RuntimeError(f"FFmpeg error: {result.stderr}")
-            except Exception as e:
-                logging.error("Error occurred - ffmpeg doesn't like windows")
-                raise RuntimeError("ffmpeg failed")
-        elif os.name == "posix":
-            os.system(f'ffmpeg -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
-        else:
-            raise RuntimeError("Unsupported operating system")
-        logging.info("Conversion to WAV completed: %s", out_path)
-    except subprocess.CalledProcessError as e:
-        logging.error("Error executing FFmpeg command: %s", str(e))
-        raise RuntimeError("Error converting video file to WAV")
-    except Exception as e:
-        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
-        return {"error": str(e)}
-    gc.collect()
-    return out_path
-# Transcribe .wav into .segments.json
-#DEBUG
-#@profile
-def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
-    global whisper_model_instance, processing_choice
-    logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)
-    time_start = time.time()
-    if audio_file_path is None:
-        raise ValueError("speech-to-text: No audio file provided")
-    logging.info("speech-to-text: Audio file path: %s", audio_file_path)
-    try:
-        _, file_ending = os.path.splitext(audio_file_path)
-        out_file = audio_file_path.replace(file_ending, ".segments.json")
-        prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")
-        if os.path.exists(out_file):
-            logging.info("speech-to-text: Segments file already exists: %s", out_file)
-            with open(out_file) as f:
-                global segments
-                segments = json.load(f)
-            return segments
-        logging.info('speech-to-text: Starting transcription...')
-        options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
-        transcribe_options = dict(task="transcribe", **options)
-        # use function and config at top of file
-        logging.debug("speech-to-text: Using whisper model: %s", whisper_model)
-        whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
-        segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)
-        segments = []
-        for segment_chunk in segments_raw:
-            chunk = {
-                "Time_Start": segment_chunk.start,
-                "Time_End": segment_chunk.end,
-                "Text": segment_chunk.text
-            }
-            logging.debug("Segment: %s", chunk)
-            segments.append(chunk)
-            # Print to verify its working
-            print(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
-            # Log it as well.
-            logging.debug(
-                f"Transcribed Segment: {segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
-        if segments:
-            segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
-        if not segments:
-            raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")
-        logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)
-        # Save the segments to a JSON file - prettified and non-prettified
-        # FIXME so this is an optional flag to save either the prettified json file or the normal one
-        save_json = True
-        if save_json:
-            logging.info("speech-to-text: Saving segments to JSON file")
-            output_data = {'segments': segments}
-            logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
-            with open(prettified_out_file, 'w') as f:
-                json.dump(output_data, f, indent=2)
-            logging.info("speech-to-text: Saving JSON to %s", out_file)
-            with open(out_file, 'w') as f:
-                json.dump(output_data, f)
-        logging.debug(f"speech-to-text: returning {segments[:500]}")
-        gc.collect()
-        return segments
-    except Exception as e:
-        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
-        raise RuntimeError("speech-to-text: Error transcribing audio")
-#
-#
 #######################################################################################################################

+# Audio_Transcription_Lib.py
+#########################################
+# Transcription Library
+# This library is used to perform transcription of audio files.
+# Currently, uses faster_whisper for transcription.
+#
+####################
+# Function List
+#
+# 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
+# 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False)
+#
+####################
+#
+# Import necessary libraries to run solo for testing
+import gc
+import json
+import logging
+import multiprocessing
+import os
+import queue
+import sys
+import subprocess
+import tempfile
+import threading
+import time
+# DEBUG Imports
+#from memory_profiler import profile
+import pyaudio
+from faster_whisper import WhisperModel as OriginalWhisperModel
+from typing import Optional, Union, List, Dict, Any
+#
+# Import Local
+from App_Function_Libraries.Utils.Utils import load_comprehensive_config
+from App_Function_Libraries.Metrics.metrics_logger import log_counter, log_histogram
+#
+#######################################################################################################################
+# Function Definitions
+#
+# Convert video .m4a into .wav using ffmpeg
+#   ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
+#       https://www.gyan.dev/ffmpeg/builds/
+#
+whisper_model_instance = None
+config = load_comprehensive_config()
+processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
+total_thread_count = multiprocessing.cpu_count()
+class WhisperModel(OriginalWhisperModel):
+    """
+    faster_whisper WhisperModel subclass that looks for known model sizes in a local download directory.
+    A name listed in `valid_model_sizes` is replaced by `<download_root>/<name>` when that
+    directory exists; otherwise the bare name is handed to the parent class unchanged.
+    Any other value is treated as a filesystem path and made absolute.
+    NOTE: `files` and `model_kwargs` are accepted but are not forwarded to the parent constructor.
+    """
+    # Directory one level above the directory that contains this file.
+    tldw_dir = os.path.dirname(os.path.dirname(__file__))
+    # Used when the caller does not pass `download_root`.
+    default_download_root = os.path.join(tldw_dir, 'models', 'Whisper')
+    valid_model_sizes = [
+        "tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium",
+        "large-v1", "large-v2", "large-v3", "large", "distil-large-v2", "distil-medium.en",
+        "distil-small.en", "distil-large-v3",
+    ]
+    def __init__(
+        self,
+        model_size_or_path: str,
+        device: str = processing_choice,
+        device_index: Union[int, List[int]] = 0,
+        compute_type: str = "default",
+        cpu_threads: int = 0,#total_thread_count, FIXME - I think this should be 0
+        num_workers: int = 1,
+        download_root: Optional[str] = None,
+        local_files_only: bool = False,
+        files: Optional[Dict[str, Any]] = None,
+        **model_kwargs: Any
+    ):
+        root = self.default_download_root if download_root is None else download_root
+        os.makedirs(root, exist_ok=True)
+        # FIXME - validate....
+        # Also write an integration test...
+        if model_size_or_path not in self.valid_model_sizes:
+            # Not a known size name, so assume it's a path
+            resolved = os.path.abspath(model_size_or_path)
+        else:
+            # Known size name: use the local copy when present, else pass the name through
+            local_copy = os.path.join(root, model_size_or_path)
+            resolved = local_copy if os.path.isdir(local_copy) else model_size_or_path
+        super().__init__(
+            resolved,
+            device=device,
+            device_index=device_index,
+            compute_type=compute_type,
+            cpu_threads=cpu_threads,
+            num_workers=num_workers,
+            download_root=root,
+            local_files_only=local_files_only,
+# Maybe? idk, FIXME
+#            files=files,
+#            **model_kwargs
+        )
+def get_whisper_model(model_name, device):
+    """
+    Returns the shared WhisperModel for (model_name, device), loading it when needed.
+    The module-level `whisper_model_instance` holds a single model. It is reused only
+    when it was created by this function for the same model name and device; any other
+    request replaces it with a freshly constructed model.
+    Parameters:
+        model_name (str): Model size or path passed to WhisperModel.
+        device (str): Device passed to WhisperModel as `device`.
+    Returns:
+        WhisperModel: The cached or newly created model instance.
+    """
+    global whisper_model_instance
+    requested = (model_name, device)
+    # Keying the cache on (model_name, device) prevents a request for a different
+    # model from being served by whichever model happened to be loaded first.
+    cached_key = getattr(whisper_model_instance, '_tldw_cache_key', None)
+    if whisper_model_instance is None or cached_key != requested:
+        logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
+        # Drop our reference to the previous model before constructing the replacement.
+        whisper_model_instance = None
+        model = WhisperModel(model_name, device=device)
+        model._tldw_cache_key = requested
+        whisper_model_instance = model
+    return whisper_model_instance
+# os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
+#DEBUG
+#@profile
+def convert_to_wav(video_file_path, offset=0, overwrite=False):
+    """
+    Converts a media file to a 16 kHz, mono, 16-bit PCM .wav file using ffmpeg.
+    The output path is the input path with its extension replaced by ".wav".
+    Parameters:
+        video_file_path (str): Path of the media file to convert.
+        offset (int): Currently unused; conversion always starts at 00:00:00.
+        overwrite (bool): When False and the .wav file already exists, conversion is skipped.
+    Returns:
+        str: Path of the .wav file on success or when conversion was skipped.
+        dict: {"error": <message>} when conversion failed (kept for existing callers).
+    """
+    log_counter("convert_to_wav_attempt", labels={"file_path": video_file_path})
+    start_time = time.time()
+    out_path = os.path.splitext(video_file_path)[0] + ".wav"
+    if os.path.exists(out_path) and not overwrite:
+        print(f"File '{out_path}' already exists. Skipping conversion.")
+        logging.info(f"Skipping conversion as file already exists: {out_path}")
+        log_counter("convert_to_wav_skipped", labels={"file_path": video_file_path})
+        return out_path
+    print("Starting conversion process of .m4a to .WAV")
+    try:
+        if os.name == "nt":
+            logging.debug("ffmpeg being ran on windows")
+            # Assuming the working directory is correctly set where .\Bin exists
+            ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
+        elif os.name == "posix":
+            ffmpeg_cmd = "ffmpeg"  # Assume 'ffmpeg' is in PATH
+        else:
+            raise RuntimeError("Unsupported operating system")
+        logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
+        command = [
+            ffmpeg_cmd,
+            "-y",  # Never prompt: an existing output is only reached here when overwrite was requested
+            "-ss", "00:00:00",  # Start at the beginning of the video
+            "-i", video_file_path,
+            "-ar", "16000",  # Audio sample rate
+            "-ac", "1",  # Number of audio channels
+            "-c:a", "pcm_s16le",  # Audio codec
+            out_path
+        ]
+        # An argument list (no shell) keeps quotes/metacharacters in the path from being
+        # interpreted; stdin is the null device so ffmpeg can never wait for input.
+        result = subprocess.run(
+            command,
+            stdin=subprocess.DEVNULL,
+            text=True,
+            errors="replace",
+            capture_output=True
+        )
+        if result.returncode != 0:
+            logging.error("Error in running FFmpeg")
+            logging.error("FFmpeg stderr: %s", result.stderr)
+            raise RuntimeError(f"FFmpeg error: {result.stderr}")
+        logging.info("FFmpeg executed successfully")
+        logging.debug("FFmpeg output: %s", result.stdout)
+        logging.info("Conversion to WAV completed: %s", out_path)
+        log_counter("convert_to_wav_success", labels={"file_path": video_file_path})
+    except Exception as e:
+        logging.error("convert_to_wav: Error converting file to WAV: %s", str(e))
+        log_counter("convert_to_wav_error", labels={"file_path": video_file_path, "error": str(e)})
+        return {"error": str(e)}
+    conversion_time = time.time() - start_time
+    log_histogram("convert_to_wav_duration", conversion_time, labels={"file_path": video_file_path})
+    gc.collect()
+    return out_path
+# Transcribe .wav into .segments.json
+#DEBUG
+#@profile
+# FIXME - I feel like the `vad_filter` should be enabled by default....
+def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
+    """
+    Transcribes an audio file with faster_whisper and caches the segments as JSON next to it.
+    Results are written to "<audio path without extension>-whisper_model-<model>.segments.json"
+    (plus a "segments_pretty.json" variant); when that file already exists its segments are
+    returned without running the model.
+    Parameters:
+        audio_file_path (str): Path of the audio file to transcribe.
+        selected_source_lang (str): Passed to transcribe() as `language`.
+        whisper_model (str): Model size or path handed to get_whisper_model.
+        vad_filter (bool): Passed to transcribe() as `vad_filter`.
+        diarize (bool): Accepted for caller compatibility; not used by this function.
+    Returns:
+        list: Segment dicts with 'Time_Start', 'Time_End' and 'Text' keys. The first
+            segment's text is prefixed with a note naming the whisper model used.
+    Raises:
+        ValueError: If audio_file_path is None.
+        RuntimeError: If anything fails while loading the cache or transcribing
+            (including when no segments are produced); the cause is chained.
+    """
+    log_counter("speech_to_text_attempt", labels={"file_path": audio_file_path, "model": whisper_model})
+    time_start = time.time()
+    if audio_file_path is None:
+        log_counter("speech_to_text_error", labels={"error": "No audio file provided"})
+        raise ValueError("speech-to-text: No audio file provided")
+    logging.info("speech-to-text: Audio file path: %s", audio_file_path)
+    try:
+        # Build names from the extension-less path: str.replace() on the extension would
+        # rewrite every occurrence inside the path, and an empty extension would insert
+        # the suffix between every character.
+        base_path, _ = os.path.splitext(audio_file_path)
+        out_file = base_path + "-whisper_model-" + whisper_model + ".segments.json"
+        prettified_out_file = base_path + "-whisper_model-" + whisper_model + ".segments_pretty.json"
+        if os.path.exists(out_file):
+            logging.info("speech-to-text: Segments file already exists: %s", out_file)
+            with open(out_file, encoding="utf-8") as f:
+                cached = json.load(f)
+            # Files are saved as {'segments': [...]}; unwrap so cached and fresh
+            # results have the same shape for callers.
+            if isinstance(cached, dict) and 'segments' in cached:
+                cached = cached['segments']
+            return cached
+        logging.info('speech-to-text: Starting transcription...')
+        # FIXME - revisit this
+        options = dict(language=selected_source_lang, beam_size=10, best_of=10, vad_filter=vad_filter)
+        transcribe_options = dict(task="transcribe", **options)
+        # use function and config at top of file
+        logging.debug("speech-to-text: Using whisper model: %s", whisper_model)
+        model = get_whisper_model(whisper_model, processing_choice)
+        # faster_whisper transcription right here - FIXME -test batching - ha
+        segments_raw, _info = model.transcribe(audio_file_path, **transcribe_options)
+        segments = []
+        for segment_chunk in segments_raw:
+            chunk = {
+                "Time_Start": segment_chunk.start,
+                "Time_End": segment_chunk.end,
+                "Text": segment_chunk.text
+            }
+            logging.debug("Segment: %s", chunk)
+            segments.append(chunk)
+            # Progress output so long transcriptions can be followed in the log
+            logging.info(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
+        if not segments:
+            log_counter("speech_to_text_error", labels={"error": "No transcription produced"})
+            raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")
+        segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
+        transcription_time = time.time() - time_start
+        logging.info("speech-to-text: Transcription completed in %.2f seconds", transcription_time)
+        log_histogram("speech_to_text_duration", transcription_time, labels={"file_path": audio_file_path, "model": whisper_model})
+        log_counter("speech_to_text_success", labels={"file_path": audio_file_path, "model": whisper_model})
+        # Save the segments to a JSON file - prettified and non-prettified
+        # FIXME refactor so this is an optional flag to save either the prettified json file or the normal one
+        save_json = True
+        if save_json:
+            logging.info("speech-to-text: Saving segments to JSON file")
+            output_data = {'segments': segments}
+            logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
+            with open(prettified_out_file, 'w', encoding="utf-8") as f:
+                json.dump(output_data, f, indent=2)
+            logging.info("speech-to-text: Saving JSON to %s", out_file)
+            with open(out_file, 'w', encoding="utf-8") as f:
+                json.dump(output_data, f)
+        logging.debug(f"speech-to-text: returning {segments[:500]}")
+        gc.collect()
+        return segments
+    except Exception as e:
+        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
+        log_counter("speech_to_text_error", labels={"file_path": audio_file_path, "model": whisper_model, "error": str(e)})
+        # Chain the cause so the original traceback is not lost.
+        raise RuntimeError("speech-to-text: Error transcribing audio") from e
+def record_audio(duration, sample_rate=16000, chunk_size=1024):
+    log_counter("record_audio_attempt", labels={"duration": duration})
+    p = pyaudio.PyAudio()
+    stream = p.open(format=pyaudio.paInt16,
+                    channels=1,
+                    rate=sample_rate,
+                    input=True,
+                    frames_per_buffer=chunk_size)
+    print("Recording...")
+    frames = []
+    stop_recording = threading.Event()
+    audio_queue = queue.Queue()
+    def audio_callback():
+        for _ in range(0, int(sample_rate / chunk_size * duration)):
+            if stop_recording.is_set():
+                break
+            data = stream.read(chunk_size)
+            audio_queue.put(data)
+    audio_thread = threading.Thread(target=audio_callback)
+    audio_thread.start()
+    return p, stream, audio_queue, stop_recording, audio_thread
+def stop_recording(p, stream, audio_queue, stop_recording_event, audio_thread):
+    log_counter("stop_recording_attempt")
+    start_time = time.time()
+    stop_recording_event.set()
+    audio_thread.join()
+    frames = []
+    while not audio_queue.empty():
+        frames.append(audio_queue.get())
+    print("Recording finished.")
+    stream.stop_stream()
+    stream.close()
+    p.terminate()
+    stop_time = time.time() - start_time
+    log_histogram("stop_recording_duration", stop_time)
+    log_counter("stop_recording_success")
+    return b''.join(frames)
+def save_audio_temp(audio_data, sample_rate=16000):
+    log_counter("save_audio_temp_attempt")
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+        import wave
+        wf = wave.open(temp_file.name, 'wb')
+        wf.setnchannels(1)
+        wf.setsampwidth(2)
+        wf.setframerate(sample_rate)
+        wf.writeframes(audio_data)
+        wf.close()
+        log_counter("save_audio_temp_success")
+        return temp_file.name
+#
+#
 #######################################################################################################################