LyricExtractor

Sleeping

App Files Files Community

eyov committed on Nov 10

Commit

cef05ee

•

1 Parent(s): 0c49d55

Upload 4 files

Browse files

Files changed (4) hide show

app.py +115 -0
demucs_handler.py +101 -0
requirements.txt +8 -0
whisper_handler.py +80 -0

app.py ADDED Viewed

	@@ -0,0 +1,115 @@

+import os
+import sys
+import logging
+import gradio as gr
+import shutil
+from demucs_handler import DemucsProcessor, check_dependencies, configure_model
+from whisper_handler import WhisperTranscriber
+import tempfile
+import torch
+import torchaudio
+import soundfile as sf
+import librosa
+import numpy as np
+# Configure the root logger: INFO level, timestamped "time - level - message" lines
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+def validate_environment():
+    """Check that torch, torchaudio and demucs can be imported.
+
+    On success the PyTorch version, the Torchaudio version and CUDA
+    availability are logged at INFO level. If any of the three imports
+    fails, the error is logged and the process exits with status 1.
+    """
+    try:
+        import torch
+        import torchaudio
+        import demucs  # imported only to prove the package is installed
+    except ImportError as import_error:
+        logging.error(f"Environment validation failed: {import_error}")
+        sys.exit(1)
+    else:
+        logging.info(f"PyTorch version: {torch.__version__}")
+        logging.info(f"Torchaudio version: {torchaudio.__version__}")
+        logging.info(f"CUDA available: {torch.cuda.is_available()}")
+def create_interface():
+    """Build the Gradio interface for vocal isolation and lyric transcription.
+
+    Validates the environment, creates one ``DemucsProcessor`` and one
+    default ``WhisperTranscriber``, and wires them into a ``gr.Interface``
+    whose handler is the nested ``process_audio`` function.
+
+    Returns:
+        The configured ``gr.Interface`` (not yet launched).
+    """
+    validate_environment()
+    processor = DemucsProcessor()
+    default_model = "medium"
+    # Single-slot cache: only the most recently requested Whisper model is
+    # kept, so switching sizes in the dropdown replaces the loaded model.
+    loaded = {
+        "size": default_model,
+        "transcriber": WhisperTranscriber(model_size=default_model),
+    }
+    def get_transcriber(model_size):
+        """Return a transcriber for ``model_size``, reloading only on change."""
+        if loaded["size"] != model_size:
+            logging.info(f"Loading Whisper model: {model_size}")
+            loaded["transcriber"] = WhisperTranscriber(model_size=model_size)
+            loaded["size"] = model_size
+        return loaded["transcriber"]
+    def process_audio(audio_file, whisper_model=default_model, progress=gr.Progress()):
+        """Isolate the vocals of ``audio_file`` and transcribe them.
+
+        Args:
+            audio_file: Path of the uploaded audio file, or None.
+            whisper_model: Whisper model size selected in the dropdown.
+            progress: Gradio progress tracker.
+
+        Returns:
+            ``((sample_rate, samples), lyrics)`` on success, or
+            ``(None, message)`` when there is no input or processing fails.
+        """
+        if audio_file is None:
+            return None, "Please upload an audio file."
+        vocals_path = None
+        try:
+            progress(0, desc="Starting processing")
+            logging.info(f"Processing file: {audio_file}")
+            # Everything written inside temp_dir is removed when the block exits.
+            with tempfile.TemporaryDirectory() as temp_dir:
+                temp_audio_path = os.path.join(temp_dir, "input.wav")
+                # Re-encode the upload as a 44.1 kHz WAV before separation
+                audio, sr = librosa.load(audio_file, sr=44100)
+                sf.write(temp_audio_path, audio, samplerate=sr)
+                progress(0.1, desc="Separating vocals")
+                try:
+                    vocals_path = processor.separate_vocals(temp_audio_path)
+                except RuntimeError as e:
+                    logging.error(f"Vocal separation failed: {str(e)}")
+                    return None, f"Vocal separation failed: {str(e)}"
+                # Load the isolated vocals for playback at their native rate
+                vocals_audio, vocals_sr = librosa.load(vocals_path, sr=None)
+                progress(0.75, desc="Transcribing")
+                lyrics = get_transcriber(whisper_model).transcribe(vocals_path)
+                progress(1.0, desc="Processing complete")
+                # Return the audio data tuple and lyrics
+                return (vocals_sr, vocals_audio), lyrics
+        except Exception as e:
+            # UI boundary: report the failure to the user instead of crashing
+            error_message = f"Processing error: {str(e)}"
+            logging.exception(error_message)
+            return None, error_message
+        finally:
+            # separate_vocals() hands back a temp file it does not delete,
+            # so it has to be removed here on every exit path.
+            if vocals_path and os.path.exists(vocals_path):
+                try:
+                    os.remove(vocals_path)
+                except OSError as cleanup_error:
+                    logging.warning(
+                        f"Could not remove temporary file {vocals_path}: {cleanup_error}"
+                    )
+    interface = gr.Interface(
+        fn=process_audio,
+        inputs=[
+            gr.Audio(label="Upload Audio File", type="filepath"),
+            gr.Dropdown(
+                choices=["tiny", "base", "small", "medium", "large-v2"],
+                value=default_model,
+                label="Whisper Model Size"
+            )
+        ],
+        outputs=[
+            gr.Audio(label="Isolated Vocals", type="numpy"),
+            gr.Textbox(label="Transcribed Lyrics", lines=10, max_lines=20)
+        ],
+        title="Audio Lyrics Extractor",
+        description="Upload an audio file to extract vocals and transcribe lyrics",
+        analytics_enabled=False
+    )
+    return interface
+if __name__ == "__main__":
+    # Stop before any model is loaded when a required package is missing.
+    if not check_dependencies():
+        print("Please install missing dependencies")
+        # sys.exit instead of the site-provided exit(), which is meant for
+        # interactive sessions and is not guaranteed to be defined.
+        sys.exit(1)
+    interface = create_interface()
+    interface.launch()

demucs_handler.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import torch
+import torchaudio
+from demucs.pretrained import get_model
+from demucs.apply import apply_model
+import tempfile
+import os
+import numpy as np
+import librosa
+class DemucsProcessor:
+    """Isolate the vocal stem of an audio file with a pretrained Demucs model."""
+    def __init__(self, model_name="htdemucs"):
+        """Load ``model_name`` and move it to CUDA when available, else CPU."""
+        self.model_name = model_name
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Using device: {self.device}")
+        self.model = get_model(model_name)
+        self.model.to(self.device)
+        self.sources = self.model.sources
+        print(f"Model loaded successfully on {self.device}")
+        print(f"Available sources: {self.sources}")
+    def load_audio(self, file_path):
+        """Load ``file_path`` as a float waveform with at least two channels.
+
+        torchaudio is tried first; librosa (resampling to 44100 Hz) is the
+        fallback. A single-channel signal is duplicated into two channels,
+        and the waveform is resampled to the model's ``samplerate`` when the
+        model exposes one and it differs from the file's rate.
+
+        Returns:
+            ``(waveform, sample_rate)`` with waveform shaped (channels, samples).
+
+        Raises:
+            RuntimeError: if both loaders fail.
+        """
+        try:
+            waveform, sample_rate = torchaudio.load(file_path)
+        except Exception as load_error:
+            print(f"Error loading with torchaudio: {load_error}")
+            try:
+                # Fallback to librosa
+                audio, sample_rate = librosa.load(file_path, sr=44100, mono=False)
+            except Exception as fallback_error:
+                raise RuntimeError(
+                    f"Failed to load audio: {str(fallback_error)}"
+                ) from fallback_error
+            waveform = torch.from_numpy(audio)
+        else:
+            print(f"Audio loaded - Shape: {waveform.shape}, Sample rate: {sample_rate}")
+        # Handle mono input: add a channel axis, then duplicate the channel
+        if waveform.dim() == 1:
+            waveform = waveform.unsqueeze(0)
+        if waveform.shape[0] == 1:
+            waveform = waveform.repeat(2, 1)
+        # Keep both loader paths consistent: feed the model audio at its own
+        # sample rate (no-op when the model does not advertise one).
+        target_rate = getattr(self.model, "samplerate", sample_rate)
+        if sample_rate != target_rate:
+            waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)
+            sample_rate = target_rate
+        return waveform, sample_rate
+    def separate_vocals(self, audio_path):
+        """Write the isolated vocals of ``audio_path`` to a temporary WAV file.
+
+        Returns:
+            Path of the WAV file. The file is not deleted here; the caller
+            is responsible for removing it.
+
+        Raises:
+            RuntimeError: if loading, separation or saving fails.
+        """
+        output_path = None
+        try:
+            # Load audio
+            waveform, sample_rate = self.load_audio(audio_path)
+            print(f"Audio loaded - Shape: {waveform.shape}, Sample rate: {sample_rate}")
+            # Move to the model's device and add the batch dimension
+            batch = waveform.to(self.device).unsqueeze(0)
+            # apply_model is called with its default settings
+            with torch.no_grad():
+                sources = apply_model(self.model, batch)
+            # Get vocals
+            vocals = sources[:, self.sources.index('vocals')]
+            # Reserve a path and close the handle before saving, so the file
+            # is not held open while torchaudio writes to it.
+            handle, output_path = tempfile.mkstemp(suffix='.wav')
+            os.close(handle)
+            torchaudio.save(
+                output_path,
+                vocals.squeeze(0).cpu(),
+                sample_rate,
+                format='wav'
+            )
+            return output_path
+        except Exception as e:
+            # Best-effort removal of a partially written output file
+            if output_path and os.path.exists(output_path):
+                try:
+                    os.remove(output_path)
+                except OSError:
+                    pass
+            raise RuntimeError(f"Separation failed: {str(e)}") from e
+def configure_model():
+    """Return the default separation settings as a dict.
+
+    Keys: ``segment_size`` (16 when CUDA is available, otherwise 4),
+    ``overlap`` (0.1), ``sample_rate`` (44100) and ``channels`` (2).
+    """
+    if torch.cuda.is_available():
+        segment_size = 16
+    else:
+        segment_size = 4
+    return dict(
+        segment_size=segment_size,
+        overlap=0.1,
+        sample_rate=44100,
+        channels=2,
+    )
+def check_dependencies():
+    """Report whether every package the separator needs can be imported.
+
+    Tries to import torch, torchaudio, librosa, demucs and
+    ``demucs.pretrained.get_model``. Prints a confirmation and returns True
+    when all imports succeed; prints the import error and returns False
+    otherwise.
+    """
+    try:
+        # Imported only to prove availability; the names are not used here.
+        import torch
+        import torchaudio
+        import librosa
+        import demucs
+        from demucs.pretrained import get_model
+    except ImportError as e:
+        print(f"Missing dependency: {str(e)}")
+        return False
+    print("All required packages are installed correctly")
+    return True

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+gradio==4.0.2
+demucs==4.0.1
+transformers==4.31.0
+torch==2.0.1
+torchaudio==2.0.2
+torchvision==0.15.2
+soundfile
+librosa

whisper_handler.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import torch
+from transformers import pipeline
+import librosa
+import soundfile as sf
+import numpy as np
+class WhisperTranscriber:
+    """Transcribe audio files with a Hugging Face Whisper ASR pipeline."""
+    def __init__(self, model_size="medium"):
+        """Create the ``openai/whisper-{model_size}`` pipeline.
+
+        The pipeline runs on ``cuda:0`` when CUDA is available, otherwise on
+        the CPU.
+        """
+        self.model_size = model_size
+        use_cuda = torch.cuda.is_available()
+        self.device = "cuda:0" if use_cuda else "cpu"
+        # Half precision is requested only on GPU; CPU inference uses
+        # float32 because float16 is poorly supported by CPU kernels.
+        dtype = torch.float16 if use_cuda else torch.float32
+        self.model = pipeline(
+            "automatic-speech-recognition",
+            model=f"openai/whisper-{model_size}",
+            chunk_length_s=30,
+            device=self.device,
+            batch_size=8,
+            torch_dtype=dtype,
+            return_timestamps=True
+            )
+    def preprocess_audio(self, audio_path, target_sr=16000):
+        """Load ``audio_path`` and prepare it for the pipeline.
+
+        The audio is loaded at its native rate, resampled to ``target_sr``,
+        pre-emphasised, normalized and trimmed of leading/trailing silence.
+
+        Returns:
+            ``(samples, target_sr)``.
+        """
+        # Load audio with librosa at the file's own sample rate
+        y, sr = librosa.load(audio_path, sr=None)
+        # Resample to target_sr (16 kHz by default)
+        y_resampled = librosa.resample(y=y, orig_sr=sr, target_sr=target_sr)
+        # Pre-emphasis filter (a high-frequency boost, not noise reduction)
+        y_cleaned = librosa.effects.preemphasis(y_resampled)
+        # Normalize audio
+        y_normalized = librosa.util.normalize(y_cleaned)
+        # Trim leading and trailing silence below the 30 dB threshold
+        y_filtered = librosa.effects.trim(
+            y_normalized,
+            top_db=30,
+            frame_length=2048,
+            hop_length=512
+        )[0]
+        return y_filtered, target_sr
+    @staticmethod
+    def _extract_text(result):
+        """Return the transcription text contained in a pipeline result.
+
+        A dict with ``"chunks"`` yields the stripped chunk texts joined by
+        single spaces; a dict without it yields ``result["text"]``; any
+        other value is returned unchanged.
+        """
+        if not isinstance(result, dict):
+            return result
+        if "chunks" in result:
+            # Strip each chunk so the join does not produce doubled spaces
+            return " ".join(chunk["text"].strip() for chunk in result["chunks"])
+        return result["text"]
+    def transcribe(self, audio_path):
+        """Transcribe the audio file at ``audio_path`` as English.
+
+        Returns:
+            The transcription, or an empty string when no samples remain
+            after preprocessing.
+
+        Raises:
+            Exception: any preprocessing or pipeline error is printed and
+                re-raised unchanged.
+        """
+        try:
+            # Preprocess audio
+            audio_data, sample_rate = self.preprocess_audio(audio_path)
+            print(f"Audio loaded and preprocessed - Shape: {audio_data.shape}, Sample rate: {sample_rate}")
+            # Nothing left to transcribe (e.g. an empty input file)
+            if audio_data.size == 0:
+                return ""
+            # Transcribe
+            result = self.model(
+                audio_data,
+                generate_kwargs={
+                    "task": "transcribe",
+                    "language": "en",
+                    "max_new_tokens": 256,
+                    # NOTE(review): added with the intent of reducing
+                    # hallucination; confirm it has that effect, since a
+                    # temperature normally only matters when sampling.
+                    "temperature": 0.7
+                }
+            )
+            return self._extract_text(result)
+        except Exception as e:
+            print(f"Error in transcribe: {str(e)}")
+            raise
+# Example usage: transcribe one file with the medium model and print the text
+if __name__ == "__main__":
+    demo_transcriber = WhisperTranscriber(model_size="medium")
+    demo_text = demo_transcriber.transcribe("path_to_your_audio_file.wav")
+    print(f"Transcription: {demo_text}")