# One requirement specifier per line; lines starting with '#' are comments.

# Core dependencies
torch>=2.0.0
transformers>=4.34.0
gradio>=4.13.0

# Audio processing and model dependencies
datasets[audio]>=2.14.0
evaluate>=0.4.0
jiwer>=3.0.0

# Optimization and acceleration
bitsandbytes>=0.41.1
accelerate>=0.24.1
xformers>=0.0.27

# Hugging Face integration
huggingface_hub>=0.19.3
peft
spaces

# Image processing
Pillow>=9.0.0

# Additional required packages for whisper model
librosa
soundfile
ffmpeg-python