# NOTE: The following lines were non-code viewer chrome scraped along with
# the file (Hugging Face Space status, file size, commit hash, and the
# line-number gutter). They are commented out so the module parses.
# Spaces: Sleeping / Sleeping
# File size: 3,522 Bytes — commit cef05ee
import torch
import torchaudio
from demucs.pretrained import get_model
from demucs.apply import apply_model
import tempfile
import os
import numpy as np
import librosa
class DemucsProcessor:
    """Separate vocal stems from audio files using a pretrained Demucs model."""

    def __init__(self, model_name="htdemucs"):
        """Load the pretrained Demucs model onto GPU when available.

        Args:
            model_name: Name of the pretrained Demucs model to fetch.
        """
        self.model_name = model_name
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        self.model = get_model(model_name)
        self.model.to(self.device)
        # Ordered list of stem names (e.g. drums/bass/other/vocals);
        # the index of 'vocals' here selects the stem in separate_vocals.
        self.sources = self.model.sources
        print(f"Model loaded successfully on {self.device}")
        print(f"Available sources: {self.sources}")

    def load_audio(self, file_path):
        """Load an audio file as a stereo float tensor.

        Returns:
            Tuple (waveform, sample_rate) where waveform has shape
            (2, samples). Mono input is duplicated to stereo; extra
            channels beyond two are dropped.

        Raises:
            RuntimeError: if both torchaudio and the librosa fallback fail.
        """
        try:
            waveform, sample_rate = torchaudio.load(file_path)
            print(f"Audio loaded - Shape: {waveform.shape}, Sample rate: {sample_rate}")
            # Handle mono input
            if waveform.dim() == 1:
                waveform = waveform.unsqueeze(0)
            if waveform.shape[0] == 1:
                waveform = waveform.repeat(2, 1)
            elif waveform.shape[0] > 2:
                # Demucs expects stereo; keep the first two channels.
                waveform = waveform[:2]
            return waveform, sample_rate
        except Exception as e:
            print(f"Error loading with torchaudio: {e}")
            try:
                # Fallback to librosa
                audio, sr = librosa.load(file_path, sr=44100, mono=False)
                if audio.ndim == 1:
                    audio = np.vstack([audio, audio])
                # Force float32 so the tensor matches the model's dtype.
                waveform = torch.from_numpy(audio).float()
                return waveform, sr
            except Exception as e:
                raise RuntimeError(f"Failed to load audio: {str(e)}") from e

    def separate_vocals(self, audio_path):
        """Extract the vocal stem and write it to a temporary WAV file.

        Args:
            audio_path: Path of the input audio file.

        Returns:
            Path to the temporary WAV file holding the vocals; the caller
            is responsible for deleting it (delete=False below).

        Raises:
            RuntimeError: on any failure while loading or separating.
        """
        try:
            waveform, sample_rate = self.load_audio(audio_path)
            # Demucs is trained at a fixed sample rate; resample when the
            # file's rate differs, otherwise separation quality degrades.
            model_sr = self.model.samplerate
            if sample_rate != model_sr:
                waveform = torchaudio.functional.resample(
                    waveform, sample_rate, model_sr
                )
                sample_rate = model_sr
            waveform = waveform.to(self.device)
            # apply_model expects a batch dimension: (batch, channels, samples).
            waveform = waveform.unsqueeze(0)
            # Process the entire audio at once instead of segments.
            with torch.no_grad():
                sources = apply_model(self.model, waveform)
            # Pick the vocals stem out of the separated sources
            # (raises ValueError if the model has no 'vocals' stem).
            vocals_idx = self.sources.index('vocals')
            vocals = sources[:, vocals_idx]
            # Save to temporary file; delete=False keeps the path valid
            # after the context manager closes the handle.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
                torchaudio.save(
                    tmp.name,
                    vocals.squeeze(0).cpu(),
                    sample_rate,
                    format='wav'
                )
            return tmp.name
        except Exception as e:
            raise RuntimeError(f"Separation failed: {str(e)}") from e
def configure_model():
    """Return the default separation settings.

    The segment size is larger on CUDA machines, where memory allows
    processing longer windows per pass.
    """
    cuda_available = torch.cuda.is_available()
    return dict(
        segment_size=16 if cuda_available else 4,  # Increased from 8
        overlap=0.1,
        sample_rate=44100,
        channels=2,
    )
def check_dependencies():
    """Return True when every runtime dependency imports cleanly.

    Also smoke-tests NumPy -> torch tensor interop; prints a diagnostic
    and returns False on the first missing package.
    """
    try:
        import torch
        import torchaudio
        import librosa
        import demucs
        from demucs.pretrained import get_model

        # Smoke-test tensor creation from a NumPy buffer (one second at 44.1 kHz).
        probe = np.random.random(44100)
        _ = torch.from_numpy(probe)
        print("All required packages are installed correctly")
        return True
    except ImportError as e:
        print(f"Missing dependency: {str(e)}")
        return False
# (stray gutter character from the scraped page, commented out)