Commit 7dfa01d by root
1 parent: bc74a52
Files changed (3):
  1. app.py +30 -8
  2. requirements.txt +2 -1
  3. utils.py +41 -6
app.py CHANGED
@@ -4,9 +4,10 @@ import gradio as gr
 import torch
 import numpy as np
 from transformers import (
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
-    pipeline,
+    AutoModelForSequenceClassification,
+    AutoFeatureExtractor,
+    AutoTokenizer,
+    pipeline,
     AutoModelForCausalLM,
     BitsAndBytesConfig
 )
@@ -33,7 +34,15 @@ SAMPLE_RATE = 22050 # Standard sample rate for audio processing
 CUDA_AVAILABLE = ensure_cuda_availability()
 
 # Load genre classification model
-genre_tokenizer = AutoTokenizer.from_pretrained(GENRE_MODEL_NAME)
+try:
+    # Try to load feature extractor first (for audio models)
+    genre_processor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
+    print(f"Loaded feature extractor for genre classification model: {GENRE_MODEL_NAME}")
+except Exception as e:
+    print(f"Error loading feature extractor, using basic processing: {str(e)}")
+    genre_processor = None
+
+# Load the model
 genre_model = AutoModelForSequenceClassification.from_pretrained(GENRE_MODEL_NAME)
 
 # Load LLM with appropriate quantization for T4 GPU
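Note on the loading change above: AutoFeatureExtractor.from_pretrained generally only succeeds when the checkpoint ships a preprocessor_config.json, which is why the try/except fallback is needed. A minimal sketch for checking which preprocessor a checkpoint actually provides (probe_preprocessor is a hypothetical helper, not part of this commit):

from transformers import AutoFeatureExtractor, AutoTokenizer

def probe_preprocessor(model_name):
    """Hypothetical helper: report which preprocessor a checkpoint ships."""
    try:
        # Audio models publish a feature extractor config
        return type(AutoFeatureExtractor.from_pretrained(model_name)).__name__
    except Exception:
        try:
            # Text models publish a tokenizer instead
            return type(AutoTokenizer.from_pretrained(model_name)).__name__
        except Exception:
            return None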
@@ -72,12 +81,25 @@ def extract_audio_features(audio_file):
 
     return {
         "features": mfccs_mean,
-        "duration": duration
+        "duration": duration,
+        "waveform": y,
+        "sample_rate": sr
     }
 
-def classify_genre(audio_features):
+def classify_genre(audio_data):
     """Classify the genre of the audio using the loaded model."""
-    inputs = genre_tokenizer(str(audio_features), return_tensors="pt", truncation=True, max_length=512)
+    if genre_processor is not None:
+        # Use the feature extractor if available
+        inputs = genre_processor(
+            audio_data["waveform"],
+            sampling_rate=audio_data["sample_rate"],
+            return_tensors="pt"
+        )
+    else:
+        # Fallback to basic feature processing
+        # Convert MFCC features to tensor and reshape appropriately
+        features_tensor = torch.tensor(audio_data["features"]).unsqueeze(0)
+        inputs = {"input_features": features_tensor}
 
     with torch.no_grad():
         outputs = genre_model(**inputs)
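The hunk above ends at the forward pass. Downstream (not shown in this diff), classify_genre presumably reduces outputs.logits to (label, score) pairs for format_genre_results. A hedged sketch of that step, assuming genre_model is the sequence-classification model loaded earlier and that top-3 results are wanted:

import torch

with torch.no_grad():
    logits = genre_model(**inputs).logits        # shape: (1, num_labels)

probs = torch.softmax(logits, dim=-1)[0]         # normalize to probabilities
top = torch.topk(probs, k=min(3, probs.numel()))
top_genres = [
    (genre_model.config.id2label[i.item()], p.item())
    for p, i in zip(top.values, top.indices)
]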
@@ -140,7 +162,7 @@ def process_audio(audio_file):
     audio_data = extract_audio_features(audio_file)
 
     # Classify genre
-    top_genres = classify_genre(audio_data["features"])
+    top_genres = classify_genre(audio_data)
 
     # Format genre results using utility function
     genre_results = format_genre_results(top_genres)
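Taken together with the extract_audio_features change, the call-site fix above means process_audio now passes the whole feature dict, and classify_genre picks the waveform or MFCC path itself. A usage sketch under those assumptions ("song.wav" is a placeholder path):

audio_data = extract_audio_features("song.wav")  # features, duration, waveform, sample_rate
top_genres = classify_genre(audio_data)          # feature-extractor path if genre_processor
                                                 # loaded, otherwise the MFCC fallback
genre_results = format_genre_results(top_genres)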
requirements.txt CHANGED
@@ -9,4 +9,5 @@ huggingface-hub>=0.20.3
 bitsandbytes>=0.41.1
 sentencepiece>=0.1.99
 safetensors>=0.4.1
-scipy>=1.12.0
+scipy>=1.12.0
+soundfile>=0.12.1
utils.py CHANGED
@@ -4,8 +4,23 @@ import librosa
 
 def load_audio(audio_file, sr=22050):
     """Load an audio file and convert to mono if needed."""
-    y, sr = librosa.load(audio_file, sr=sr, mono=True)
-    return y, sr
+    try:
+        # Try to load audio with librosa
+        y, sr = librosa.load(audio_file, sr=sr, mono=True)
+        return y, sr
+    except Exception as e:
+        print(f"Error loading audio with librosa: {str(e)}")
+        # Fallback to basic loading if necessary
+        import soundfile as sf
+        try:
+            y, sr = sf.read(audio_file)
+            # Convert to mono if stereo
+            if len(y.shape) > 1:
+                y = y.mean(axis=1)
+            return y, sr
+        except Exception as e2:
+            print(f"Error loading audio with soundfile: {str(e2)}")
+            raise ValueError(f"Could not load audio file: {audio_file}")
 
 def extract_audio_duration(y, sr):
     """Get the duration of audio in seconds."""
@@ -13,9 +28,14 @@ def extract_audio_duration(y, sr):
 
 def extract_mfcc_features(y, sr, n_mfcc=20):
     """Extract MFCC features from audio."""
-    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
-    mfccs_mean = np.mean(mfccs.T, axis=0)
-    return mfccs_mean
+    try:
+        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
+        mfccs_mean = np.mean(mfccs.T, axis=0)
+        return mfccs_mean
+    except Exception as e:
+        print(f"Error extracting MFCCs: {str(e)}")
+        # Return a fallback feature vector if extraction fails
+        return np.zeros(n_mfcc)
 
 def calculate_lyrics_length(duration):
     """Calculate appropriate lyrics length based on audio duration."""
@@ -39,4 +59,19 @@ def ensure_cuda_availability():
         print(f"CUDA is available with {device_count} device(s). Using: {device_name}")
     else:
         print("CUDA is not available. Using CPU for inference.")
-    return cuda_available
+    return cuda_available
+
+def preprocess_audio_for_model(waveform, sample_rate, target_sample_rate=16000, max_length=16000):
+    """Preprocess audio for model input (resample, pad/trim)."""
+    # Resample if needed
+    if sample_rate != target_sample_rate:
+        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=target_sample_rate)
+
+    # Trim or pad to expected length
+    if len(waveform) > max_length:
+        waveform = waveform[:max_length]
+    elif len(waveform) < max_length:
+        padding = max_length - len(waveform)
+        waveform = np.pad(waveform, (0, padding), 'constant')
+
+    return waveform
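Note that max_length=16000 at the 16 kHz target is one second of audio, so longer clips get truncated. A hedged sketch of how the new helper could feed the feature extractor loaded in app.py (whether 16 kHz is the right target depends on the actual GENRE_MODEL_NAME checkpoint):

y, sr = load_audio("song.wav")                   # placeholder path
y = preprocess_audio_for_model(y, sr)            # resample to 16 kHz, pad/trim to 1 s
inputs = genre_processor(y, sampling_rate=16000, return_tensors="pt")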