Jatin Mehra
Enhance documentation across API and content generation modules with detailed module-level docstrings and function descriptions
4a108da
#!/usr/bin/env python3
"""
Podcast Preprocessing Module
This module handles the audio preprocessing pipeline for the podcast-to-blog generator.
It provides functionality for:
1. Loading and transcribing audio files using Groq's Whisper models
2. Processing and cleaning transcription text
3. Language detection and mapping to appropriate model codes
The module relies on Groq's API for transcription services and supports multiple languages.
Dependencies:
- groq: API client for Groq services
- dotenv: For loading environment variables
- logging: For detailed operation logging
"""
import logging
import os
import json
from groq import Groq
import dotenv
# Pull settings from a .env file (if one is found) into the process environment.
dotenv.load_dotenv()

# Log records go to logs/preprocessing.log; the directory is created on demand
# because FileHandler opens the file as soon as it is constructed.
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, "preprocessing.log")

# Root logger: INFO and above, written to the log file and to a stream handler.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.FileHandler(log_file), logging.StreamHandler()],
)

# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)
def map_language_code(language):
    """
    Map a language name to the language code used for Whisper transcription.

    The lookup is case-insensitive and ignores surrounding whitespace. A value
    that already is one of the supported two-letter codes (e.g. "en") is
    returned unchanged, so callers may pass either the name or the code.

    Args:
        language (str): Language name in English (e.g., "english", "french")
            or one of the supported two-letter codes (e.g., "en", "fr").

    Returns:
        str or None: Two-letter language code if a mapping exists, None
        otherwise (including when ``language`` is not a string).

    Example:
        >>> map_language_code("english")
        'en'
        >>> map_language_code("  German ")
        'de'
        >>> map_language_code("fr")
        'fr'
    """
    logger.debug(f"Mapping language: {language}")

    # This table can be expanded to include more languages as needed.
    language_map = {
        "english": "en",
        "french": "fr",
        "spanish": "es",
        "hindi": "hi",
        "german": "de",
        "italian": "it",
    }

    # Non-string input (e.g. None) cannot be mapped; report it the same way as
    # an unknown language instead of failing on .lower().
    if not isinstance(language, str):
        logger.warning(f"Could not map language: {language}")
        return None

    normalized = language.strip().lower()
    mapped_code = language_map.get(normalized)

    # Accept a supported code that was passed directly instead of a name.
    if mapped_code is None and normalized in language_map.values():
        mapped_code = normalized

    if mapped_code:
        logger.info(f"Mapped '{normalized}' to language code '{mapped_code}'")
    else:
        logger.warning(f"Could not map language: {normalized}")
    return mapped_code
def process_transcript(transcript):
    """
    Clean a raw transcript so it reads as a single line of text.

    Whitespace is normalized: every run of whitespace (spaces, tabs, newlines,
    carriage returns) is collapsed into a single space, and leading/trailing
    whitespace is removed. Words separated only by a line break therefore stay
    separated by exactly one space.

    Args:
        transcript (str): Raw transcript text from the transcription service.

    Returns:
        str: Cleaned transcript text; an empty string if the input contains
        only whitespace.

    Note:
        This function can be expanded with further normalization if needed,
        such as fixing common transcription artifacts, normalizing
        punctuation, or formatting speaker labels.
    """
    logger.info("Processing transcript...")

    # split() with no arguments splits on any whitespace run and drops empty
    # pieces, so joining with one space both removes line breaks and collapses
    # repeated spaces in a single pass.
    cleaned_transcript = " ".join(transcript.split())
    logger.debug("Removed newline and carriage return characters.")

    logger.info("Transcript processing complete.")
    return cleaned_transcript
def load_transcript(model_type="whisper-large-v3-turbo", file_path=None, language=None):
    """
    Transcribe an audio file with Groq's Whisper API and return cleaned text.

    Steps, in order:
    1. Check that ``file_path`` points to an existing regular file
    2. Create the Groq client
    3. Resolve ``language`` to a language code (or leave it to auto-detection)
    4. Send the audio file for transcription
    5. Clean the returned text with ``process_transcript``

    Args:
        model_type (str, optional): The Whisper model variant to use.
            Defaults to "whisper-large-v3-turbo".
        file_path (str, optional): Path to the audio file to transcribe.
            The file extension is not checked by this function.
        language (str, optional): Language of the audio content, resolved
            through ``map_language_code``. If falsy (None or ""), no language
            is sent and auto-detection is used.

    Returns:
        str: Processed transcript text

    Raises:
        FileNotFoundError: If ``file_path`` is empty, does not exist, or is
            not a regular file (e.g. a directory)
        ValueError: If ``language`` cannot be mapped to a language code
        Exception: Any error raised while reading the file, calling the API
            or processing the result is logged and re-raised unchanged

    Note:
        This function requires a valid Groq API key to be set in the environment.
    """
    logger.info(f"Loading transcript from '{file_path}' using model '{model_type}'. Specified language: {language}")

    # isfile rather than exists: a directory can never be opened as audio, so
    # report it up front with the documented exception type.
    if not file_path or not os.path.isfile(file_path):
        logger.error(f"Audio file not found at path: {file_path}")
        raise FileNotFoundError(f"Audio file not found at path: {file_path}")

    # Initialize the Groq client
    logger.debug("Initializing Groq client")
    client = Groq()
    logger.info("Groq client initialized")

    request_options = {
        "model": model_type,        # Model to use for transcription
        "response_format": "text",  # Get simple text response
        "temperature": 0.0,         # Lower temperature for more deterministic results
    }

    if language:
        whisper_language_code = map_language_code(language)
        if not whisper_language_code:
            logger.error(f"Unsupported language specified: {language}")
            raise ValueError(f"Unsupported language: {language}")
        logger.info(f"Using specified language code for transcription: {whisper_language_code}")
        request_options["language"] = whisper_language_code
    else:
        # Leave the optional `language` argument out entirely (instead of
        # passing None) so the request carries no language field at all.
        logger.info("Language not specified, auto-detection will be used")

    # Load and transcribe
    logger.info(f"Starting transcription for '{file_path}'...")
    try:
        with open(file_path, "rb") as file:
            response = client.audio.transcriptions.create(file=file, **request_options)
        # With response_format="text" the response is used directly as the
        # transcript string.
        logger.info("Transcription successful.")
        return process_transcript(response)
    except Exception as e:
        logger.error(f"Error during transcription: {e}", exc_info=True)
        raise  # Re-raise the exception after logging
# Example usage
if __name__ == "__main__":
    # Manual smoke test for the transcription pipeline. When run directly,
    # this script:
    #   1. Takes a sample audio file
    #   2. Transcribes it using the specified model and language
    #   3. Saves the processed transcript to a text file
    logger.info("Starting preprocessing script example.")

    file_path = "audio/Clean code challenge - Silicon Valley Season 5, Ep6.mp3"
    # English language name understood by map_language_code (e.g. "english",
    # "french"); set to None to let the service auto-detect the language.
    language = 'english'
    output_file = "transcript.txt"

    try:
        logger.info(f"Attempting to load transcript for: {file_path}")
        transcript = load_transcript(model_type="whisper-large-v3-turbo", file_path=file_path, language=language)

        # Save the transcript to a file. UTF-8 is set explicitly so that
        # non-ASCII transcripts (e.g. Hindi) are written correctly regardless
        # of the platform's default encoding.
        logger.info(f"Saving transcript to '{output_file}'")
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(transcript)
        logger.info("Transcript saved successfully.")
    except FileNotFoundError as e:
        logger.error(f"File not found error during example execution: {e}")
        print(f"Error: {e}")
    except ValueError as e:
        logger.error(f"Value error during example execution: {e}")
        print(f"Error: {e}")
    except Exception as e:
        logger.error(f"An unexpected error occurred during example execution: {e}", exc_info=True)
        print(f"An unexpected error occurred: {e}")

    logger.info("Preprocessing script example finished.")