eyov committed
Commit cef05ee
1 Parent(s): 0c49d55

Upload 4 files

Files changed (4)
  1. app.py +115 -0
  2. demucs_handler.py +101 -0
  3. requirements.txt +8 -0
  4. whisper_handler.py +80 -0
app.py ADDED
@@ -0,0 +1,115 @@
+ import os
+ import sys
+ import logging
+ import gradio as gr
+ import shutil
+ from demucs_handler import DemucsProcessor, check_dependencies, configure_model
+ from whisper_handler import WhisperTranscriber
+ import tempfile
+ import torch
+ import torchaudio
+ import soundfile as sf
+ import librosa
+ import numpy as np
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+
+ def validate_environment():
+     try:
+         import torch
+         import torchaudio
+         import demucs
+         logging.info(f"PyTorch version: {torch.__version__}")
+         logging.info(f"Torchaudio version: {torchaudio.__version__}")
+         logging.info(f"CUDA available: {torch.cuda.is_available()}")
+     except ImportError as e:
+         logging.error(f"Environment validation failed: {e}")
+         sys.exit(1)
+
+ def create_interface():
+     validate_environment()
+     processor = DemucsProcessor()
+     transcriber = WhisperTranscriber()
+
+     def process_audio(audio_file, whisper_model="base", progress=gr.Progress()):
+         # Note: whisper_model comes from the dropdown but is not used here;
+         # the transcriber above is created once with its default model size.
+         if audio_file is None:
+             return None, "Please upload an audio file."
+
+         temp_files = []
+         try:
+             progress(0, desc="Starting processing")
+             logging.info(f"Processing file: {audio_file}")
+
+             with tempfile.TemporaryDirectory() as temp_dir:
+                 temp_audio_path = os.path.join(temp_dir, "input.wav")
+                 vocals_output_path = os.path.join(temp_dir, "vocals.wav")
+
+                 # Convert the upload to a 44.1 kHz WAV first
+                 audio, sr = librosa.load(audio_file, sr=44100)
+                 sf.write(temp_audio_path, audio, samplerate=sr)
+                 temp_files.append(temp_audio_path)
+
+                 progress(0.1, desc="Separating vocals")
+                 try:
+                     vocals_path = processor.separate_vocals(temp_audio_path)
+                     # Copy vocals to output path
+                     shutil.copy2(vocals_path, vocals_output_path)
+                     temp_files.append(vocals_output_path)
+                 except RuntimeError as e:
+                     logging.error(f"Vocal separation failed: {str(e)}")
+                     return None, f"Vocal separation failed: {str(e)}"
+
+                 # Load the processed vocals for playback
+                 vocals_audio, vocals_sr = librosa.load(vocals_output_path, sr=None)
+
+                 progress(0.75, desc="Transcribing")
+                 lyrics = transcriber.transcribe(vocals_output_path)
+                 progress(1.0, desc="Processing complete")
+
+                 # Return the audio data tuple and lyrics
+                 return (vocals_sr, vocals_audio), lyrics
+
+         except Exception as e:
+             error_message = f"Processing error: {str(e)}"
+             logging.error(error_message)
+             return None, error_message
+         finally:
+             # Clean up temporary files
+             for file in temp_files:
+                 if file and os.path.exists(file):
+                     try:
+                         os.remove(file)
+                     except OSError:
+                         pass
+
+     interface = gr.Interface(
+         fn=process_audio,
+         inputs=[
+             gr.Audio(label="Upload Audio File", type="filepath"),
+             gr.Dropdown(
+                 choices=["tiny", "base", "small", "medium", "large-v2"],
+                 value="medium",
+                 label="Whisper Model Size"
+             )
+         ],
+         outputs=[
+             gr.Audio(label="Isolated Vocals", type="numpy"),
+             gr.Textbox(label="Transcribed Lyrics", lines=10, max_lines=20)
+         ],
+         title="Audio Lyrics Extractor",
+         description="Upload an audio file to extract vocals and transcribe lyrics",
+         analytics_enabled=False
+     )
+     return interface
+
+ if __name__ == "__main__":
+     if not check_dependencies():
+         print("Please install missing dependencies")
+         exit(1)
+     interface = create_interface()
+     interface.launch()
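
Note: process_audio receives whisper_model from the dropdown but never uses it; the WhisperTranscriber created in create_interface always runs with its default size. A minimal sketch of how the selection could be honoured without reloading a model on every request (hypothetical helper, not part of this commit):

    _transcribers = {}

    def get_transcriber(model_size):
        # Cache one WhisperTranscriber per requested size
        # (assumption: sizes match the dropdown choices above).
        if model_size not in _transcribers:
            _transcribers[model_size] = WhisperTranscriber(model_size=model_size)
        return _transcribers[model_size]

    # inside process_audio:
    # lyrics = get_transcriber(whisper_model).transcribe(vocals_output_path)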
demucs_handler.py ADDED
@@ -0,0 +1,101 @@
+ import torch
+ import torchaudio
+ from demucs.pretrained import get_model
+ from demucs.apply import apply_model
+ import tempfile
+ import os
+ import numpy as np
+ import librosa
+
+ class DemucsProcessor:
+     def __init__(self, model_name="htdemucs"):
+         self.model_name = model_name
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         print(f"Using device: {self.device}")
+
+         self.model = get_model(model_name)
+         self.model.to(self.device)
+         self.sources = self.model.sources
+         print(f"Model loaded successfully on {self.device}")
+         print(f"Available sources: {self.sources}")
+
+     def load_audio(self, file_path):
+         try:
+             waveform, sample_rate = torchaudio.load(file_path)
+             print(f"Audio loaded - Shape: {waveform.shape}, Sample rate: {sample_rate}")
+
+             # Handle mono input: promote to (channels, samples) and duplicate to stereo
+             if waveform.dim() == 1:
+                 waveform = waveform.unsqueeze(0)
+             if waveform.shape[0] == 1:
+                 waveform = waveform.repeat(2, 1)
+
+             return waveform, sample_rate
+         except Exception as e:
+             print(f"Error loading with torchaudio: {e}")
+             try:
+                 # Fallback to librosa
+                 audio, sr = librosa.load(file_path, sr=44100, mono=False)
+                 if audio.ndim == 1:
+                     audio = np.vstack([audio, audio])
+                 waveform = torch.from_numpy(audio)
+                 return waveform, sr
+             except Exception as e:
+                 raise RuntimeError(f"Failed to load audio: {str(e)}")
+
+     def separate_vocals(self, audio_path):
+         try:
+             # Load audio
+             waveform, sample_rate = self.load_audio(audio_path)
+             print(f"Audio loaded - Shape: {waveform.shape}, Sample rate: {sample_rate}")
+
+             # Move to the model's device, then add a batch dimension
+             waveform = waveform.to(self.device)
+             waveform = waveform.unsqueeze(0)
+
+             # Process the entire audio at once instead of segments
+             with torch.no_grad():
+                 sources = apply_model(self.model, waveform)
+
+             # Get vocals
+             vocals_idx = self.sources.index('vocals')
+             vocals = sources[:, vocals_idx]
+
+             # Save to a temporary file
+             with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
+                 torchaudio.save(
+                     tmp.name,
+                     vocals.squeeze(0).cpu(),
+                     sample_rate,
+                     format='wav'
+                 )
+             return tmp.name
+
+         except Exception as e:
+             raise RuntimeError(f"Separation failed: {str(e)}")
+
+ def configure_model():
+     return {
+         "segment_size": 16 if torch.cuda.is_available() else 4,  # larger segments when a GPU is available
+         "overlap": 0.1,
+         "sample_rate": 44100,
+         "channels": 2
+     }
+
+ def check_dependencies():
+     try:
+         import torch
+         import torchaudio
+         import librosa
+         import demucs
+         from demucs.pretrained import get_model
+
+         # Smoke-test tensor creation from a NumPy array
+         test_audio = np.random.random(44100)
+         test_tensor = torch.from_numpy(test_audio)
+
+         print("All required packages are installed correctly")
+         return True
+     except ImportError as e:
+         print(f"Missing dependency: {str(e)}")
+         return False
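
For reference, the handler can be exercised on its own; a minimal sketch, assuming a local file song.wav exists (the filename is hypothetical):

    from demucs_handler import DemucsProcessor, check_dependencies

    if check_dependencies():
        processor = DemucsProcessor()                       # loads htdemucs onto CPU or GPU
        vocals_wav = processor.separate_vocals("song.wav")  # returns the path of a temporary WAV
        print(f"Isolated vocals written to: {vocals_wav}")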
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ gradio==4.0.2
+ demucs==4.0.1
+ transformers==4.31.0
+ torch==2.0.1
+ torchaudio==2.0.2
+ torchvision==0.15.2
+ soundfile
+ librosa
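
With these pins in place, the app should run locally with pip install -r requirements.txt followed by python app.py, which exercises the check_dependencies and launch path at the bottom of app.py.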
whisper_handler.py ADDED
@@ -0,0 +1,80 @@
+ import torch
+ from transformers import pipeline
+ import librosa
+ import soundfile as sf
+ import numpy as np
+
+ class WhisperTranscriber:
+     def __init__(self, model_size="medium"):
+         self.model_size = model_size
+         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+         self.model = pipeline(
+             "automatic-speech-recognition",
+             model=f"openai/whisper-{model_size}",
+             chunk_length_s=30,
+             device=self.device,
+             batch_size=8,
+             # fp16 only makes sense on GPU; fall back to fp32 on CPU
+             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+             return_timestamps=True
+         )
+
+     def preprocess_audio(self, audio_path, target_sr=16000):
+         # Load audio with librosa for better preprocessing
+         y, sr = librosa.load(audio_path, sr=None)
+
+         # Resample to 16 kHz (Whisper's expected rate)
+         y_resampled = librosa.resample(y=y, orig_sr=sr, target_sr=target_sr)
+
+         # Apply a pre-emphasis filter to lift high frequencies
+         y_cleaned = librosa.effects.preemphasis(y_resampled)
+
+         # Normalize audio
+         y_normalized = librosa.util.normalize(y_cleaned)
+
+         # Trim leading/trailing silence and very quiet parts
+         y_filtered = librosa.effects.trim(
+             y_normalized,
+             top_db=30,
+             frame_length=2048,
+             hop_length=512
+         )[0]
+
+         return y_filtered, target_sr
+
+     def transcribe(self, audio_path):
+         try:
+             # Preprocess audio
+             audio_data, sample_rate = self.preprocess_audio(audio_path)
+             print(f"Audio loaded and preprocessed - Shape: {audio_data.shape}, Sample rate: {sample_rate}")
+
+             # Transcribe
+             result = self.model(
+                 audio_data,
+                 generate_kwargs={
+                     "task": "transcribe",
+                     "language": "en",
+                     "max_new_tokens": 256,
+                     "temperature": 0.7
+                 }
+             )
+
+             # Extract transcription with timestamps if available
+             if isinstance(result, dict):
+                 if "chunks" in result:
+                     transcription = " ".join([chunk["text"] for chunk in result["chunks"]])
+                 else:
+                     transcription = result["text"]
+             else:
+                 transcription = result
+
+             return transcription
+
+         except Exception as e:
+             print(f"Error in transcribe: {str(e)}")
+             raise
+
+ # Example usage
+ if __name__ == "__main__":
+     transcriber = WhisperTranscriber(model_size="medium")
+     transcription = transcriber.transcribe("path_to_your_audio_file.wav")
+     print(f"Transcription: {transcription}")
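
One detail worth noting: transcribe hands the pipeline a bare NumPy array, so the pipeline assumes the audio is already at the model's expected 16 kHz, which preprocess_audio guarantees. Passing the rate explicitly makes that assumption visible; a small sketch of how the call inside transcribe could read (same pipeline object, same inputs):

    result = self.model(
        {"raw": audio_data, "sampling_rate": sample_rate},  # state the 16 kHz rate explicitly
        generate_kwargs={"task": "transcribe", "language": "en"}
    )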