Jatin Mehra
Enhance documentation across API and content generation modules with detailed module-level docstrings and function descriptions
4a108da
#!/usr/bin/env python3 | |
""" | |
Podcast Preprocessing Module

This module handles the audio preprocessing pipeline for the podcast-to-blog generator.
It provides functionality for:
1. Loading and transcribing audio files using Groq's Whisper models
2. Processing and cleaning transcription text
3. Mapping language names to the language codes passed to the model
   (when no language is given, detection is left to the transcription model)

The module relies on Groq's API for transcription services and supports multiple languages.

Dependencies:
- groq: API client for Groq services
- dotenv: For loading environment variables
- logging: For detailed operation logging
""" | |
import logging | |
import os | |
import json | |
from groq import Groq | |
import dotenv | |
# Populate os.environ from a local .env file (if one is present).
dotenv.load_dotenv()

# Log files live in ./logs; create the directory on first use.
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)

# Every record goes both to logs/preprocessing.log and to the console.
log_file = os.path.join(log_dir, "preprocessing.log")
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    handlers=[logging.FileHandler(log_file), logging.StreamHandler()],
)

# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)
def map_language_code(language):
    """
    Map a language name (or an already-valid code) to a Whisper language code.

    The lookup is case-insensitive and ignores surrounding whitespace. Besides
    English language names, the two-letter codes known to this function are
    accepted and returned unchanged, so both "english" and "en" yield "en".

    Args:
        language (str): Language name in English (e.g., "english", "french")
            or one of the supported two-letter codes (e.g., "en", "fr").

    Returns:
        str or None: Two-letter language code if the input is recognised,
        None otherwise (including when the input is not a string).

    Example:
        >>> map_language_code("english")
        'en'
        >>> map_language_code("German")
        'de'
        >>> map_language_code("fr")
        'fr'
    """
    # This table can be expanded to include more languages as needed.
    language_map = {
        "english": "en",
        "french": "fr",
        "spanish": "es",
        "hindi": "hi",
        "german": "de",
        "italian": "it",
    }

    # Guard against None / non-string input instead of failing on .lower().
    if not isinstance(language, str):
        logger.warning(f"Could not map language: {language}")
        return None

    logger.debug(f"Mapping language: {language}")
    normalized = language.strip().lower()

    mapped_code = language_map.get(normalized)
    if mapped_code is None and normalized in language_map.values():
        # The caller already supplied a supported code; pass it through.
        mapped_code = normalized

    if mapped_code:
        logger.info(f"Mapped '{normalized}' to language code '{mapped_code}'")
    else:
        logger.warning(f"Could not map language: {normalized}")
    return mapped_code
def process_transcript(transcript):
    """
    Clean a raw transcript so it is a single, evenly spaced line of text.

    Every run of whitespace (spaces, tabs, newlines, carriage returns) is
    collapsed into a single space, and leading/trailing whitespace is removed.
    Collapsing instead of deleting line-break characters keeps words that were
    separated only by a line break from being glued together.

    Args:
        transcript (str): Raw transcript text from the transcription service.
            None is treated as an empty transcript.

    Returns:
        str: Cleaned transcript text; an empty string if the input was None
        or contained only whitespace.

    Note:
        This function can be expanded to include more sophisticated text
        normalization if needed, such as fixing transcription artifacts,
        normalizing punctuation, or formatting speaker labels.
    """
    logger.info("Processing transcript...")

    if transcript is None:
        logger.warning("No transcript text received; returning an empty string.")
        return ""

    # str.split() with no argument splits on any whitespace run and drops
    # empty pieces, so joining with a single space normalizes all spacing.
    cleaned_transcript = " ".join(transcript.split())
    logger.debug("Collapsed line breaks and repeated whitespace.")

    logger.info("Transcript processing complete.")
    return cleaned_transcript
def load_transcript(model_type="whisper-large-v3-turbo", file_path=None, language=None):
    """
    Load and transcribe an audio file using Groq's Whisper API.

    Steps performed:
    1. Validate that the audio file exists and is a regular file
    2. Validate and map the requested language (if any)
    3. Initialize the Groq client
    4. Transcribe the audio file
    5. Clean the transcription result with process_transcript

    All input validation happens before the client is created, so bad
    arguments are reported without needing API credentials.

    Args:
        model_type (str, optional): The Whisper model variant to use.
            Defaults to "whisper-large-v3-turbo".
        file_path (str, optional): Path to the audio file to transcribe.
        language (str, optional): Language of the audio content, in a form
            accepted by map_language_code. If provided, it is sent to the
            API to force transcription in that language. If None or empty,
            no language is sent and the service auto-detects it.

    Returns:
        str: Processed transcript text

    Raises:
        FileNotFoundError: If file_path is empty or is not an existing file
        ValueError: If the specified language cannot be mapped to a code
        Exception: Any error raised while opening the file or calling the
            API is logged and re-raised unchanged

    Note:
        Groq() is constructed without arguments, so credentials are
        presumably read from the environment (e.g. a Groq API key loaded
        from .env) -- confirm against the Groq client documentation.
    """
    logger.info(f"Loading transcript from '{file_path}' using model '{model_type}'. Specified language: {language}")

    # isfile() (rather than exists()) also rejects directories, which would
    # otherwise fail later inside open() with a less helpful error.
    if not file_path or not os.path.isfile(file_path):
        logger.error(f"Audio file not found at path: {file_path}")
        raise FileNotFoundError(f"Audio file not found at path: {file_path}")

    # Resolve the language up front; None means "let the service auto-detect".
    whisper_language_code = None
    if language:
        whisper_language_code = map_language_code(language)
        if not whisper_language_code:
            logger.error(f"Unsupported language specified: {language}")
            raise ValueError(f"Unsupported language: {language}")
        logger.info(f"Using specified language code for transcription: {whisper_language_code}")
    else:
        logger.info("Language not specified, auto-detection will be used")

    # Initialize the Groq client
    logger.debug("Initializing Groq client")
    client = Groq()
    logger.info("Groq client initialized")

    request_args = {
        "model": model_type,        # Model to use for transcription
        "response_format": "text",  # Ask for a plain-text response
        "temperature": 0.0,         # Lower temperature for more deterministic results
    }
    # Only send the optional language field when one was actually chosen,
    # instead of passing an explicit None to the API.
    if whisper_language_code is not None:
        request_args["language"] = whisper_language_code

    logger.info(f"Starting transcription for '{file_path}'...")
    try:
        with open(file_path, "rb") as audio_file:
            response = client.audio.transcriptions.create(file=audio_file, **request_args)
        logger.info("Transcription successful.")

        # With response_format="text" the response is expected to be a plain
        # string; fall back to a `.text` attribute in case the client wraps
        # it in an object (NOTE(review): confirm against the SDK version).
        if isinstance(response, str):
            transcript = response
        else:
            transcript = getattr(response, "text", response)
        return process_transcript(transcript)
    except Exception as e:
        # Log with traceback, then let the caller decide how to handle it.
        logger.exception(f"Error during transcription: {e}")
        raise
# Example usage
if __name__ == "__main__":
    # Example script to demonstrate the module's functionality.
    # When run directly, this script:
    #   1. Takes a sample audio file
    #   2. Transcribes it using the specified model and language
    #   3. Saves the processed transcript to a text file
    # This provides a convenient way to test the transcription pipeline
    # without invoking the full API.
    logger.info("Starting preprocessing script example.")

    file_path = "audio/Clean code challenge - Silicon Valley Season 5, Ep6.mp3"
    # Language name understood by map_language_code (e.g., "english", "french");
    # set to None to let the transcription service auto-detect the language.
    language = "english"
    output_file = "transcript.txt"

    try:
        logger.info(f"Attempting to load transcript for: {file_path}")
        transcript = load_transcript(
            model_type="whisper-large-v3-turbo",
            file_path=file_path,
            language=language,
        )

        # Write as UTF-8 explicitly: transcripts may contain non-ASCII text
        # (e.g. Hindi), and the platform default encoding may not handle it.
        logger.info(f"Saving transcript to '{output_file}'")
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(transcript)
        logger.info("Transcript saved successfully.")
    except FileNotFoundError as e:
        logger.error(f"File not found error during example execution: {e}")
        print(f"Error: {e}")
    except ValueError as e:
        logger.error(f"Value error during example execution: {e}")
        print(f"Error: {e}")
    except Exception as e:
        logger.error(f"An unexpected error occurred during example execution: {e}", exc_info=True)
        print(f"An unexpected error occurred: {e}")

    logger.info("Preprocessing script example finished.")