Spaces:

jatinmehra
/

AI-Powered-Podcast-to-Blog-Generator

Sleeping

AI-Powered-Podcast-to-Blog-Generator / modules /preprocessing.py

Jatin Mehra

Enhance documentation across API and content generation modules with detailed module-level docstrings and function descriptions

4a108da 4 months ago

raw

history blame contribute delete

8.62 kB

	#!/usr/bin/env python3
	"""
	Podcast Preprocessing Module

	This module handles the audio preprocessing pipeline for the podcast-to-blog generator.
	It provides functionality for:
	1. Loading and transcribing audio files using Groq's Whisper models
	2. Processing and cleaning transcription text
	3. Language detection and mapping to appropriate model codes

	The module relies on Groq's API for transcription services and supports multiple languages.

	Dependencies:
	- groq: API client for Groq services
	- dotenv: For loading environment variables
	- logging: For detailed operation logging
	"""

	import logging
	import os
	import json
	from groq import Groq
	import dotenv

	# Read settings from a local .env file into the process environment
	# (presumably where the Groq credentials live -- confirm against deployment).
	dotenv.load_dotenv()

	# Location of this module's log output
	log_dir = "logs"
	log_file = os.path.join(log_dir, "preprocessing.log")

	# The file handler below opens log_file immediately, so the directory has to
	# exist before logging is configured.
	os.makedirs(log_dir, exist_ok=True)

	# Send every record at INFO and above both to the log file and to the console
	logging.basicConfig(
	    level=logging.INFO,
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    handlers=[logging.FileHandler(log_file), logging.StreamHandler()],
	)

	# Module-level logger used by every function in this file
	logger = logging.getLogger(__name__)

	def map_language_code(language):
	    """
	    Map a language name (or an already-valid code) to a Whisper language code.

	    The lookup is case-insensitive and ignores surrounding whitespace. A value
	    that is already one of the supported two-letter codes (e.g. "en") is
	    returned unchanged, so callers may pass either "english" or "en".

	    Args:
	        language (str): Language name in English (e.g., "english", "french")
	            or one of the supported two-letter codes (e.g., "en", "fr").

	    Returns:
	        str or None: Two-letter language code if the language is supported,
	        None otherwise (including for empty or non-string input).

	    Example:
	        >>> map_language_code("english")
	        'en'
	        >>> map_language_code(" German ")
	        'de'
	        >>> map_language_code("fr")
	        'fr'
	    """
	    logger.debug(f"Mapping language: {language}")
	    # This mapping can be expanded to include more languages as needed
	    language_map = {
	        "english": "en",
	        "french": "fr",
	        "spanish": "es",
	        "hindi": "hi",
	        "german": "de",
	        "italian": "it",
	    }

	    # Non-string input (e.g. None) cannot be normalised; report it as
	    # unmapped instead of failing with AttributeError on .lower().
	    if not isinstance(language, str):
	        logger.warning(f"Could not map language: {language}")
	        return None

	    language = language.strip().lower()
	    mapped_code = language_map.get(language)
	    if mapped_code is None and language in language_map.values():
	        # The caller already supplied a supported code; pass it through.
	        mapped_code = language

	    if mapped_code:
	        logger.info(f"Mapped '{language}' to language code '{mapped_code}'")
	    else:
	        logger.warning(f"Could not map language: {language}")
	    return mapped_code


	def process_transcript(transcript):
	    """
	    Clean a raw transcript by normalizing all of its whitespace.

	    Every run of whitespace (newlines, carriage returns, tabs, repeated
	    spaces) is collapsed into a single space, and leading/trailing whitespace
	    is dropped. Treating carriage returns as separators, rather than deleting
	    them, keeps words on either side of a lone "\\r" from being joined.

	    Args:
	        transcript (str): Raw transcript text from the transcription service

	    Returns:
	        str: The transcript as a single line with single-space separators.
	        Empty or None input yields an empty string.

	    Note:
	        This function can be expanded to include more sophisticated text
	        normalization if needed, such as fixing transcription artifacts,
	        normalizing punctuation, or formatting speaker labels.
	    """
	    logger.info("Processing transcript...")

	    # str.split() with no separator splits on any whitespace run and discards
	    # empty leading/trailing fields, so re-joining with one space both removes
	    # line breaks and collapses repeated spaces in a single pass.
	    cleaned_transcript = " ".join((transcript or "").split())
	    logger.debug("Removed newline and carriage return characters.")

	    logger.info("Transcript processing complete.")
	    return cleaned_transcript

	def load_transcript(model_type="whisper-large-v3-turbo", file_path=None, language=None):
	    """
	    Load and transcribe an audio file using Groq's Whisper API.

	    Steps, in order:
	    1. Check that ``file_path`` points at an existing regular file
	    2. Resolve ``language`` to a Whisper language code (or choose auto-detection)
	    3. Create the Groq client
	    4. Send the audio file for transcription
	    5. Clean the returned text with ``process_transcript``

	    Input validation runs before the client is created so that a bad path or
	    an unsupported language is reported as such, rather than being masked by
	    a client-construction failure.

	    Args:
	        model_type (str, optional): The Whisper model variant to use.
	            Defaults to "whisper-large-v3-turbo".
	        file_path (str, optional): Path to the audio file to transcribe.
	            No extension check is performed here; the service decides
	            whether the format is acceptable.
	        language (str, optional): Language of the audio content, as accepted
	            by ``map_language_code``. If provided, forces transcription in
	            that language. If None or empty, the language field is omitted
	            from the request so the service auto-detects it.

	    Returns:
	        str: Processed transcript text

	    Raises:
	        FileNotFoundError: If ``file_path`` is empty or is not an existing file
	        ValueError: If an unsupported language is specified
	        Exception: Any error raised while creating the client, reading the
	            file, calling the API, or processing the result; transcription
	            errors are logged and re-raised unchanged

	    Note:
	        The Groq client is constructed with no arguments, so credentials are
	        presumably taken from the environment (e.g. a Groq API key) -- confirm
	        against the Groq client documentation.
	    """
	    logger.info(f"Loading transcript from '{file_path}' using model '{model_type}'. Specified language: {language}")

	    # isfile (not exists) so that a directory path is rejected here with the
	    # documented FileNotFoundError instead of failing later inside open().
	    if not file_path or not os.path.isfile(file_path):
	        logger.error(f"Audio file not found at path: {file_path}")
	        raise FileNotFoundError(f"Audio file not found at path: {file_path}")

	    # The `language` parameter can be specified to force a language,
	    # otherwise auto-detection is used.
	    whisper_language_code = None
	    if language:
	        whisper_language_code = map_language_code(language)
	        if not whisper_language_code:
	            logger.error(f"Unsupported language specified: {language}")
	            raise ValueError(f"Unsupported language: {language}")
	        logger.info(f"Using specified language code for transcription: {whisper_language_code}")
	    else:
	        logger.info("Language not specified, auto-detection will be used")

	    # Initialize the Groq client only once the inputs are known to be valid
	    logger.debug("Initializing Groq client")
	    client = Groq()
	    logger.info("Groq client initialized")

	    request_options = {
	        "model": model_type,  # Required model to use for transcription
	        "response_format": "text",  # Get simple text response
	        "temperature": 0.0,  # Lower temperature for more deterministic results
	    }
	    # Leave the optional language field out entirely for auto-detection
	    # rather than sending an explicit null/empty value.
	    if whisper_language_code is not None:
	        request_options["language"] = whisper_language_code

	    # Load and transcribe
	    logger.info(f"Starting transcription for '{file_path}'...")
	    try:
	        with open(file_path, "rb") as file:
	            response = client.audio.transcriptions.create(file=file, **request_options)

	        # The response is already a string when using response_format="text"
	        logger.info("Transcription successful.")
	        return process_transcript(response)
	    except Exception as e:
	        logger.error(f"Error during transcription: {e}", exc_info=True)
	        raise  # Re-raise the exception after logging


	# Example usage
	if __name__ == "__main__":
	    # Example script to demonstrate the module's functionality.
	    #
	    # When run directly, this script:
	    # 1. Takes a sample audio file
	    # 2. Transcribes it using the specified model and language
	    # 3. Saves the processed transcript to a text file
	    #
	    # This provides a convenient way to test the transcription pipeline
	    # without invoking the full API.
	    logger.info("Starting preprocessing script example.")
	    # Load a transcript from a file
	    file_path = "audio/Clean code challenge - Silicon Valley Season 5, Ep6.mp3"
	    # English language name such as "english" or "french" (see
	    # map_language_code); set to None to let the service auto-detect.
	    language = 'english'
	    output_file = "transcript.txt"

	    try:
	        logger.info(f"Attempting to load transcript for: {file_path}")
	        transcript = load_transcript(model_type="whisper-large-v3-turbo", file_path=file_path, language=language)
	        # Save the transcript to a file. UTF-8 is set explicitly because the
	        # platform default encoding may be unable to represent non-ASCII
	        # transcripts (e.g. Hindi).
	        logger.info(f"Saving transcript to '{output_file}'")
	        with open(output_file, "w", encoding="utf-8") as f:
	            f.write(transcript)
	        logger.info("Transcript saved successfully.")
	    except FileNotFoundError as e:
	        logger.error(f"File not found error during example execution: {e}")
	        print(f"Error: {e}")
	    except ValueError as e:
	        logger.error(f"Value error during example execution: {e}")
	        print(f"Error: {e}")
	    except Exception as e:
	        logger.error(f"An unexpected error occurred during example execution: {e}", exc_info=True)
	        print(f"An unexpected error occurred: {e}")

	    logger.info("Preprocessing script example finished.")