#!/usr/bin/env python3
"""
Podcast Preprocessing Module

This module handles the audio preprocessing pipeline for the podcast-to-blog generator.
It provides functionality for:
1. Loading and transcribing audio files using Groq's Whisper models
2. Processing and cleaning transcription text
3. Language detection and mapping to appropriate model codes

The module relies on Groq's API for transcription services and supports multiple languages.

Dependencies:
- groq: API client for Groq services
- dotenv: For loading environment variables
- logging: For detailed operation logging
"""

import logging
import os
from groq import Groq
import dotenv

# Load environment variables
dotenv.load_dotenv()

# Create logs directory if it doesn't exist
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)

# Configure logging
log_file = os.path.join(log_dir, "preprocessing.log")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler() 
    ]
)

logger = logging.getLogger(__name__)

def map_language_code(language):
    """
    Map common language names to Groq Whisper model language codes.
    
    This function converts user-friendly language names to the ISO language
    codes needed for the Whisper transcription model.
    
    Args:
        language (str): Language name in English (e.g., "english", "french")
        
    Returns:
        str or None: Two-letter ISO language code if mapping exists, None otherwise
        
    Example:
        >>> map_language_code("english")
        'en'
        >>> map_language_code("german")
        'de'
    """
    logger.debug(f"Mapping language: {language}")
    # This function can be expanded to include more languages as needed
    language = language.lower()
    language_map = {
        "english": "en",
        "french": "fr",
        "spanish": "es",
        "hindi": "hi",
        "german": "de",
        "italian": "it",
    }
    mapped_code = language_map.get(language, None)
    if mapped_code:
        logger.info(f"Mapped '{language}' to language code '{mapped_code}'")
    else:
        logger.warning(f"Could not map language: {language}")
    return mapped_code


def process_transcript(transcript):
    """
    Process a raw transcript to clean and format it for better readability.
    
    This function performs text normalization operations to prepare the transcript
    for further processing and content generation:
    - Removes newline and carriage return characters
    - Normalizes whitespace
    - Additional text cleanup as needed
    
    Args:
        transcript (str): Raw transcript text from the transcription service
        
    Returns:
        str: Cleaned and formatted transcript text
        
    Note:
        This function can be expanded to include more sophisticated text 
        normalization if needed, such as speaker diarization or paragraph 
        segmentation.
    """
    logger.info("Processing transcript...")
    # Remove unwanted characters
    cleaned_transcript = transcript.replace("\n", " ").replace("\r", "")
    logger.debug("Removed newline and carriage return characters.")

    # Additional processing can be implemented here:
    # - Remove repeated spaces
    # - Fix common transcription artifacts
    # - Normalize punctuation
    # - Format speaker labels if available
    
    logger.info("Transcript processing complete.")
    return cleaned_transcript
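

# The helper below is a minimal sketch of the "additional processing" steps
# noted in process_transcript (collapsing repeated whitespace and trimming the
# ends). Its name and its use of `re` are illustrative assumptions; nothing in
# the current pipeline calls it.
def normalize_whitespace(text):
    """Collapse runs of whitespace into single spaces and strip the ends."""
    import re  # local import so this optional sketch stays self-contained
    return re.sub(r"\s+", " ", text).strip()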

def load_transcript(model_type="whisper-large-v3-turbo", file_path=None, language=None):
    """
    Load and transcribe an audio file using Groq's Whisper API.
    
    This function handles the entire audio transcription process:
    1. Validates the audio file existence
    2. Initializes the Groq client
    3. Handles language specification or auto-detection
    4. Transcribes the audio file
    5. Processes the transcription result
    
    Args:
        model_type (str, optional): The Whisper model variant to use.
            Defaults to "whisper-large-v3-turbo".
        file_path (str, optional): Path to the audio file to transcribe.
            Must be a supported audio format (.mp3, .wav, .m4a, .ogg).
        language (str, optional): Language of the audio content.
            If provided, forces transcription in that language.
            If None, language auto-detection is used.
            
    Returns:
        str: Processed transcript text
        
    Raises:
        FileNotFoundError: If the audio file doesn't exist at the specified path
        ValueError: If an unsupported language is specified
        Exception: For other API or processing errors
        
    Note:
        This function requires a valid Groq API key to be set in the environment.
    """
    logger.info(f"Loading transcript from '{file_path}' using model '{model_type}'. Specified language: {language}")
    if not file_path or not os.path.exists(file_path):
        logger.error(f"Audio file not found at path: {file_path}")
        raise FileNotFoundError(f"Audio file not found at path: {file_path}")

    # Initialize the Groq client
    logger.debug("Initializing Groq client")
    client = Groq()
    logger.info("Groq client initialized")

    # The `language` parameter can be specified to force a language;
    # otherwise the API will attempt auto-detection.
    whisper_language_code = None
    if language:
        whisper_language_code = map_language_code(language)
        if not whisper_language_code:
            logger.error(f"Unsupported language specified: {language}")
            raise ValueError(f"Unsupported language: {language}")
        logger.info(f"Using specified language code for transcription: {whisper_language_code}")
    else:
        # Auto-detect the language
        logger.info("Language not specified, auto-detection will be used")
        whisper_language_code = None  # Explicitly set to None for clarity

    # Load and transcribe
    logger.info(f"Starting transcription for '{file_path}'...")
    try:
        with open(file_path, "rb") as file:
            # Create a transcription of the audio file
            response = client.audio.transcriptions.create(
                file=file,  # Required audio file
                model=model_type,  # Required model to use for transcription
                response_format="text",  # Get simple text response
                language=whisper_language_code,  # Optional language parameter
                temperature=0.0  # Lower temperature for more deterministic results
            )
            
            # The response is already a string when using response_format="text"
            transcript = response
            logger.info("Transcription successful.")
            return process_transcript(transcript)
    except Exception as e:
        logger.error(f"Error during transcription: {e}", exc_info=True)
        raise  # Re-raise the exception after logging


# Example usage
if __name__ == "__main__":
    """
    Example script to demonstrate the module's functionality.
    
    When run directly, this script:
    1. Takes a sample audio file
    2. Transcribes it using the specified model and language
    3. Saves the processed transcript to a text file
    
    This provides a convenient way to test the transcription pipeline
    without running the rest of the podcast-to-blog pipeline.
    """
    logger.info("Starting preprocessing script example.")
    # Load a transcript from a file
    file_path = "audio/Clean code challenge - Silicon Valley Season 5, Ep6.mp3"
    language = "english"  # Set to a supported language name (e.g., "english", "french"), or None for auto-detection
    output_file = "transcript.txt"

    try:
        logger.info(f"Attempting to load transcript for: {file_path}")
        transcript = load_transcript(model_type="whisper-large-v3-turbo", file_path=file_path, language=language)
        # Save the transcript to a file
        logger.info(f"Saving transcript to '{output_file}'")
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(transcript)
        logger.info("Transcript saved successfully.")
    except FileNotFoundError as e:
        logger.error(f"File not found error during example execution: {e}")
        print(f"Error: {e}")
    except ValueError as e:
        logger.error(f"Value error during example execution: {e}")
        print(f"Error: {e}")
    except Exception as e:
        logger.error(f"An unexpected error occurred during example execution: {e}", exc_info=True)
        print(f"An unexpected error occurred: {e}")

    logger.info("Preprocessing script example finished.")