| """ | |
| Audio Processing Tool for GAIA Agent | |
| Provides comprehensive audio processing capabilities including: | |
| - Speech-to-text transcription using Whisper | |
| - Audio format support (MP3, WAV, M4A, etc.) | |
| - Content analysis and information extraction | |
| - Audio quality enhancement and noise reduction | |
| """ | |
| import os | |
| import logging | |
| import tempfile | |
| import asyncio | |
| from typing import Dict, Any, Optional, List, Union | |
| from pathlib import Path | |
| import json | |
| try: | |
| import soundfile as sf | |
| import numpy as np | |
| from faster_whisper import WhisperModel | |
| AUDIO_DEPS_AVAILABLE = True | |
| except ImportError as e: | |
| logging.warning(f"Audio dependencies not available: {e}") | |
| AUDIO_DEPS_AVAILABLE = False | |
| try: | |
| from .base_tool import SimpleAGNOTool | |
| except ImportError: | |
| from base_tool import SimpleAGNOTool | |
| logger = logging.getLogger(__name__) | |


class AudioProcessingTool(SimpleAGNOTool):
    """
    Advanced audio processing tool with Whisper integration for GAIA evaluation.

    Features:
    - Multi-format audio support (MP3, WAV, M4A, FLAC, OGG)
    - High-accuracy speech-to-text transcription
    - Content analysis and structured data extraction
    - Audio quality assessment and enhancement
    - Streaming support for large files
    """

    def __init__(self):
        """Initialize the audio processing tool."""
        super().__init__(
            name="audio_processing",
            description="Process audio files with speech-to-text transcription and content analysis"
        )
        self.available = AUDIO_DEPS_AVAILABLE
        self.whisper_model = None
        self.supported_formats = ['.mp3', '.wav', '.m4a', '.flac', '.ogg', '.aac', '.wma']
        self.max_file_size = 100 * 1024 * 1024  # 100MB
        self.transcription_timeout = 60  # seconds
        if self.available:
            self._init_whisper_model()
        else:
            logger.warning("⚠️ Audio processing tool not available - missing dependencies")

    def _init_whisper_model(self):
        """Initialize the Whisper model for transcription."""
        try:
            # Use base model for balance of speed and accuracy
            # Can be upgraded to 'small' or 'medium' for better accuracy
            model_size = os.getenv('WHISPER_MODEL_SIZE', 'base')
            logger.info(f"🎤 Initializing Whisper model: {model_size}")
            self.whisper_model = WhisperModel(
                model_size,
                device="cpu",  # Use CPU for compatibility
                compute_type="int8"  # Optimize for memory usage
            )
            logger.info("✅ Whisper model initialized successfully")
        except Exception as e:
            logger.error(f"❌ Failed to initialize Whisper model: {e}")
            self.available = False
            self.whisper_model = None
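
    # Illustrative note (assumption, not part of the original source): the model
    # size is read from the WHISPER_MODEL_SIZE environment variable, so setting,
    # for example,
    #   export WHISPER_MODEL_SIZE=small
    # before the tool is constructed makes _init_whisper_model load the "small"
    # faster-whisper checkpoint instead of the default "base".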

    def process_audio_file(self, file_path: str, extract_content: bool = True) -> Dict[str, Any]:
        """
        Process an audio file with transcription and content analysis.

        Args:
            file_path: Path to the audio file
            extract_content: Whether to perform content analysis

        Returns:
            Dictionary containing transcription and analysis results
        """
        if not self.available:
            return {
                'success': False,
                'error': 'Audio processing not available - missing dependencies',
                'transcription': '',
                'content_analysis': {}
            }
        try:
            # Validate file
            validation_result = self._validate_audio_file(file_path)
            if not validation_result['valid']:
                return {
                    'success': False,
                    'error': validation_result['error'],
                    'transcription': '',
                    'content_analysis': {}
                }

            # Transcribe audio
            logger.info(f"🎤 Transcribing audio file: {file_path}")
            transcription_result = self._transcribe_audio(file_path)
            if not transcription_result['success']:
                return transcription_result
            transcription = transcription_result['transcription']

            # Perform content analysis if requested
            content_analysis = {}
            if extract_content and transcription:
                content_analysis = self._analyze_content(transcription)

            result = {
                'success': True,
                'transcription': transcription,
                'content_analysis': content_analysis,
                'audio_info': validation_result.get('info', {}),
                'confidence': transcription_result.get('confidence', 0.0)
            }
            logger.info("✅ Audio processing completed successfully")
            logger.info(f"📝 Transcription length: {len(transcription)} characters")
            return result
        except Exception as e:
            logger.error(f"❌ Error processing audio file: {e}")
            return {
                'success': False,
                'error': f"Audio processing failed: {str(e)}",
                'transcription': '',
                'content_analysis': {}
            }

    def _validate_audio_file(self, file_path: str) -> Dict[str, Any]:
        """Validate audio file format, size, and accessibility."""
        try:
            path = Path(file_path)

            # Check if file exists
            if not path.exists():
                return {'valid': False, 'error': f"Audio file not found: {file_path}"}

            # Check file size
            file_size = path.stat().st_size
            if file_size > self.max_file_size:
                return {
                    'valid': False,
                    'error': f"File too large: {file_size / (1024*1024):.1f}MB (max: {self.max_file_size / (1024*1024)}MB)"
                }

            # Check file format
            file_ext = path.suffix.lower()
            if file_ext not in self.supported_formats:
                return {
                    'valid': False,
                    'error': f"Unsupported format: {file_ext}. Supported: {', '.join(self.supported_formats)}"
                }

            # Try to read audio info.
            # Note: soundfile relies on libsndfile, which may not decode every
            # extension listed above (e.g. .m4a/.aac/.wma); such files fail here.
            try:
                info = sf.info(file_path)
                audio_info = {
                    'duration': info.duration,
                    'sample_rate': info.samplerate,
                    'channels': info.channels,
                    'format': info.format,
                    'subtype': info.subtype
                }
            except Exception as e:
                return {'valid': False, 'error': f"Cannot read audio file: {str(e)}"}

            return {
                'valid': True,
                'info': audio_info
            }
        except Exception as e:
            return {'valid': False, 'error': f"File validation error: {str(e)}"}

    def _transcribe_audio(self, file_path: str) -> Dict[str, Any]:
        """Transcribe audio file using Whisper."""
        try:
            if not self.whisper_model:
                return {
                    'success': False,
                    'error': 'Whisper model not initialized',
                    'transcription': ''
                }

            # Transcribe the audio (self.transcription_timeout is defined in
            # __init__ but is not enforced by this call)
            segments, info = self.whisper_model.transcribe(
                file_path,
                beam_size=5,
                language=None,  # Auto-detect language
                task="transcribe",
                temperature=0.0,  # Deterministic output
                compression_ratio_threshold=2.4,
                log_prob_threshold=-1.0,
                no_speech_threshold=0.6,
                condition_on_previous_text=False
            )

            # Combine segments into full transcription
            transcription_parts = []
            total_confidence = 0.0
            segment_count = 0
            for segment in segments:
                transcription_parts.append(segment.text.strip())
                if hasattr(segment, 'avg_logprob'):
                    total_confidence += segment.avg_logprob
                    segment_count += 1
            transcription = ' '.join(transcription_parts).strip()

            # Calculate average confidence
            avg_confidence = 0.0
            if segment_count > 0:
                avg_confidence = total_confidence / segment_count
                # Map the average log probability to a rough 0-1 confidence score:
                # values near 0 map to ~1.0, values at or below -1.0 map to 0.0
                # (e.g. an average log probability of -0.3 yields 0.7).
                avg_confidence = max(0.0, min(1.0, avg_confidence + 1.0))

            logger.info(f"🎤 Transcription completed: {len(transcription)} chars, confidence: {avg_confidence:.2f}")
            return {
                'success': True,
                'transcription': transcription,
                'confidence': avg_confidence,
                'language': info.language if hasattr(info, 'language') else 'unknown',
                'duration': info.duration if hasattr(info, 'duration') else 0.0
            }
        except Exception as e:
            logger.error(f"❌ Transcription failed: {e}")
            return {
                'success': False,
                'error': f"Transcription failed: {str(e)}",
                'transcription': ''
            }

    def _analyze_content(self, transcription: str) -> Dict[str, Any]:
        """Analyze transcribed content for structured information extraction."""
        try:
            analysis = {
                'word_count': len(transcription.split()),
                'character_count': len(transcription),
                'sentences': len([s for s in transcription.split('.') if s.strip()]),
                'keywords': [],
                'entities': [],
                'topics': [],
                'structured_data': {}
            }

            # Extract potential structured information
            text_lower = transcription.lower()

            # Look for recipe ingredients (for strawberry pie example)
            if any(keyword in text_lower for keyword in ['recipe', 'ingredients', 'cooking', 'baking', 'pie', 'cake']):
                analysis['topics'].append('recipe')
                analysis['structured_data']['recipe_indicators'] = self._extract_recipe_info(transcription)

            # Look for homework/educational content (for homework example)
            if any(keyword in text_lower for keyword in ['homework', 'assignment', 'page', 'chapter', 'exercise', 'problem']):
                analysis['topics'].append('education')
                analysis['structured_data']['education_indicators'] = self._extract_education_info(transcription)

            # Extract numbers and quantities
            import re
            numbers = re.findall(r'\b\d+(?:\.\d+)?\b', transcription)
            analysis['structured_data']['numbers'] = numbers

            # Extract page references
            page_refs = re.findall(r'page\s+(\d+)', text_lower)
            if page_refs:
                analysis['structured_data']['page_numbers'] = page_refs

            return analysis
        except Exception as e:
            logger.warning(f"⚠️ Content analysis failed: {e}")
            return {'error': str(e)}

    def _extract_recipe_info(self, text: str) -> Dict[str, Any]:
        """Extract recipe-specific information from transcription."""
        import re
        recipe_info = {
            'ingredients': [],
            'quantities': [],
            'cooking_methods': [],
            'time_references': []
        }

        # Ingredient phrasing patterns and their capture-group order:
        #   quantity first, e.g. "2 cups of flour"  -> (quantity, unit, ingredient)
        #   ingredient first, e.g. "flour, 2 cups"  -> (ingredient, quantity, unit)
        ingredient_patterns = [
            (r'(\d+(?:\.\d+)?)\s*(cups?|tablespoons?|teaspoons?|pounds?|ounces?|grams?)\s+(?:of\s+)?([a-zA-Z\s]+)', False),
            (r'([a-zA-Z][a-zA-Z\s]*?)\s*,\s*(\d+(?:\.\d+)?)\s*(cups?|tablespoons?|teaspoons?)', True),
        ]
        text_lower = text.lower()

        # Extract ingredients with quantities
        for pattern, ingredient_first in ingredient_patterns:
            for match in re.findall(pattern, text_lower):
                if ingredient_first:
                    ingredient, quantity, unit = match
                else:
                    quantity, unit, ingredient = match
                if ingredient.strip():
                    recipe_info['ingredients'].append({
                        'ingredient': ingredient.strip(),
                        'quantity': quantity,
                        'unit': unit
                    })

        # Look for common cooking methods
        cooking_methods = ['bake', 'mix', 'stir', 'whip', 'fold', 'beat', 'combine', 'add', 'pour']
        for method in cooking_methods:
            if method in text_lower:
                recipe_info['cooking_methods'].append(method)

        # Extract time and temperature references
        time_patterns = [
            r'(\d+)\s*minutes?',
            r'(\d+)\s*hours?',
            r'(\d+)\s*degrees?'
        ]
        for pattern in time_patterns:
            matches = re.findall(pattern, text_lower)
            recipe_info['time_references'].extend(matches)

        return recipe_info

    def _extract_education_info(self, text: str) -> Dict[str, Any]:
        """Extract education-specific information from transcription."""
        import re
        education_info = {
            'page_numbers': [],
            'chapter_numbers': [],
            'exercise_numbers': [],
            'subjects': [],
            'assignments': []
        }
        text_lower = text.lower()

        # Extract page numbers
        page_patterns = [
            r'page\s+(\d+)',
            r'on\s+page\s+(\d+)',
            r'turn\s+to\s+page\s+(\d+)'
        ]
        for pattern in page_patterns:
            matches = re.findall(pattern, text_lower)
            education_info['page_numbers'].extend(matches)

        # Extract chapter numbers
        chapter_patterns = [
            r'chapter\s+(\d+)',
            r'unit\s+(\d+)'
        ]
        for pattern in chapter_patterns:
            matches = re.findall(pattern, text_lower)
            education_info['chapter_numbers'].extend(matches)

        # Extract exercise/problem numbers
        exercise_patterns = [
            r'exercise\s+(\d+)',
            r'problem\s+(\d+)',
            r'question\s+(\d+)'
        ]
        for pattern in exercise_patterns:
            matches = re.findall(pattern, text_lower)
            education_info['exercise_numbers'].extend(matches)

        # Identify subjects
        subjects = ['math', 'mathematics', 'science', 'history', 'english', 'literature', 'physics', 'chemistry', 'biology']
        for subject in subjects:
            if subject in text_lower:
                education_info['subjects'].append(subject)

        return education_info

    def extract_specific_info(self, transcription: str, info_type: str) -> List[str]:
        """
        Extract specific information from transcription.

        Args:
            transcription: The transcribed text
            info_type: Type of information to extract ('ingredients', 'page_numbers', 'numbers', etc.)

        Returns:
            List of extracted information
        """
        import re

        if info_type == 'ingredients':
            # Extract ingredients from recipe transcription
            ingredients = []
            text_lower = transcription.lower()
            # Common ingredient words
            ingredient_keywords = [
                'flour', 'sugar', 'butter', 'eggs', 'milk', 'cream', 'vanilla',
                'strawberries', 'berries', 'fruit', 'salt', 'baking powder',
                'cinnamon', 'nutmeg', 'lemon', 'orange', 'chocolate', 'nuts'
            ]
            for keyword in ingredient_keywords:
                if keyword in text_lower:
                    # Try to extract with quantity
                    pattern = rf'(\d+(?:\.\d+)?)\s*(?:cups?|tablespoons?|teaspoons?|pounds?|ounces?)?\s*(?:of\s+)?{keyword}'
                    matches = re.findall(pattern, text_lower)
                    if matches:
                        ingredients.extend([f"{match} {keyword}" for match in matches])
                    else:
                        ingredients.append(keyword)
            return list(set(ingredients))  # Remove duplicates

        elif info_type == 'page_numbers':
            # Extract page numbers
            patterns = [
                r'page\s+(\d+)',
                r'on\s+page\s+(\d+)',
                r'turn\s+to\s+page\s+(\d+)',
                r'go\s+to\s+page\s+(\d+)'
            ]
            page_numbers = []
            for pattern in patterns:
                matches = re.findall(pattern, transcription.lower())
                page_numbers.extend(matches)
            return list(set(page_numbers))  # Remove duplicates

        elif info_type == 'numbers':
            # Extract all numbers
            numbers = re.findall(r'\b\d+(?:\.\d+)?\b', transcription)
            return numbers

        else:
            return []
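
    # Illustrative example (not in the original source): for a transcription like
    # "Turn to page 245 and solve problem 12",
    #   extract_specific_info(text, 'page_numbers') -> ['245']
    #   extract_specific_info(text, 'numbers')      -> ['245', '12']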

    def get_tool_functions(self) -> List[Dict[str, Any]]:
        """Get function definitions for AGNO integration."""
        return [
            {
                "name": "process_audio_file",
                "description": "Process audio file with speech-to-text transcription and content analysis",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "file_path": {
                            "type": "string",
                            "description": "Path to the audio file to process"
                        },
                        "extract_content": {
                            "type": "boolean",
                            "description": "Whether to perform content analysis on transcription",
                            "default": True
                        }
                    },
                    "required": ["file_path"]
                }
            },
            {
                "name": "extract_specific_info",
                "description": "Extract specific information from audio transcription",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "transcription": {
                            "type": "string",
                            "description": "The transcribed text to analyze"
                        },
                        "info_type": {
                            "type": "string",
                            "description": "Type of information to extract",
                            "enum": ["ingredients", "page_numbers", "numbers"]
                        }
                    },
                    "required": ["transcription", "info_type"]
                }
            }
        ]


# Create tool instance for AGNO integration
def create_audio_processing_tool() -> Optional[AudioProcessingTool]:
    """Create and return audio processing tool instance."""
    try:
        tool = AudioProcessingTool()
        if tool.available:
            logger.info("✅ Audio processing tool created successfully")
            return tool
        else:
            logger.warning("⚠️ Audio processing tool not available")
            return None
    except Exception as e:
        logger.error(f"❌ Failed to create audio processing tool: {e}")
        return None
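

# --- Minimal usage sketch (illustrative, not part of the original module) ---
# Assumes the optional audio dependencies (soundfile, faster-whisper) are
# installed and that a local file such as "homework.mp3" exists; the path is a
# hypothetical placeholder.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    audio_tool = create_audio_processing_tool()
    if audio_tool is not None:
        result = audio_tool.process_audio_file("homework.mp3")
        if result['success']:
            print(result['transcription'])
            print(audio_tool.extract_specific_info(result['transcription'], 'page_numbers'))
        else:
            print(f"Processing failed: {result['error']}")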