Commit 7dfa01d by root
1 parent: bc74a52
Files changed (3):
  1. app.py +30 -8
  2. requirements.txt +2 -1
  3. utils.py +41 -6
app.py CHANGED
@@ -4,9 +4,10 @@ import gradio as gr
 import torch
 import numpy as np
 from transformers import (
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
-    pipeline,
+    AutoModelForSequenceClassification,
+    AutoFeatureExtractor,
+    AutoTokenizer,
+    pipeline,
     AutoModelForCausalLM,
     BitsAndBytesConfig
 )
@@ -33,7 +34,15 @@ SAMPLE_RATE = 22050 # Standard sample rate for audio processing
 CUDA_AVAILABLE = ensure_cuda_availability()
 
 # Load genre classification model
-genre_tokenizer = AutoTokenizer.from_pretrained(GENRE_MODEL_NAME)
+try:
+    # Try to load feature extractor first (for audio models)
+    genre_processor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
+    print(f"Loaded feature extractor for genre classification model: {GENRE_MODEL_NAME}")
+except Exception as e:
+    print(f"Error loading feature extractor, using basic processing: {str(e)}")
+    genre_processor = None
+
+# Load the model
 genre_model = AutoModelForSequenceClassification.from_pretrained(GENRE_MODEL_NAME)
 
 # Load LLM with appropriate quantization for T4 GPU
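Note on the loading change above: AutoFeatureExtractor.from_pretrained generally only succeeds when the checkpoint ships a preprocessor_config.json, which is why the try/except fallback is needed. A minimal sketch for checking which preprocessor a checkpoint actually provides (probe_preprocessor is a hypothetical helper, not part of this commit):

from transformers import AutoFeatureExtractor, AutoTokenizer

def probe_preprocessor(model_name):
    """Hypothetical helper: report which preprocessor a checkpoint ships."""
    try:
        # Audio models publish a feature extractor config
        return type(AutoFeatureExtractor.from_pretrained(model_name)).__name__
    except Exception:
        try:
            # Text models publish a tokenizer instead
            return type(AutoTokenizer.from_pretrained(model_name)).__name__
        except Exception:
            return None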
@@ -72,12 +81,25 @@ def extract_audio_features(audio_file):
 
     return {
         "features": mfccs_mean,
-        "duration": duration
+        "duration": duration,
+        "waveform": y,
+        "sample_rate": sr
     }
 
-def classify_genre(audio_features):
+def classify_genre(audio_data):
     """Classify the genre of the audio using the loaded model."""
-    inputs = genre_tokenizer(str(audio_features), return_tensors="pt", truncation=True, max_length=512)
+    if genre_processor is not None:
+        # Use the feature extractor if available
+        inputs = genre_processor(
+            audio_data["waveform"],
+            sampling_rate=audio_data["sample_rate"],
+            return_tensors="pt"
+        )
+    else:
+        # Fallback to basic feature processing
+        # Convert MFCC features to tensor and reshape appropriately
+        features_tensor = torch.tensor(audio_data["features"]).unsqueeze(0)
+        inputs = {"input_features": features_tensor}
 
     with torch.no_grad():
         outputs = genre_model(**inputs)
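The hunk above ends at the forward pass. Downstream (not shown in this diff), classify_genre presumably reduces outputs.logits to (label, score) pairs for format_genre_results. A hedged sketch of that step, assuming genre_model is the sequence-classification model loaded earlier and that top-3 results are wanted:

import torch

with torch.no_grad():
    logits = genre_model(**inputs).logits        # shape: (1, num_labels)

probs = torch.softmax(logits, dim=-1)[0]         # normalize to probabilities
top = torch.topk(probs, k=min(3, probs.numel()))
top_genres = [
    (genre_model.config.id2label[i.item()], p.item())
    for p, i in zip(top.values, top.indices)
]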
@@ -140,7 +162,7 @@ def process_audio(audio_file):
     audio_data = extract_audio_features(audio_file)
 
     # Classify genre
-    top_genres = classify_genre(audio_data["features"])
+    top_genres = classify_genre(audio_data)
 
     # Format genre results using utility function
     genre_results = format_genre_results(top_genres)
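Taken together with the extract_audio_features change, the call-site fix above means process_audio now passes the whole feature dict, and classify_genre picks the waveform or MFCC path itself. A usage sketch under those assumptions ("song.wav" is a placeholder path):

audio_data = extract_audio_features("song.wav")  # features, duration, waveform, sample_rate
top_genres = classify_genre(audio_data)          # feature-extractor path if genre_processor
                                                 # loaded, otherwise the MFCC fallback
genre_results = format_genre_results(top_genres)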
requirements.txt CHANGED
@@ -9,4 +9,5 @@ huggingface-hub>=0.20.3
 bitsandbytes>=0.41.1
 sentencepiece>=0.1.99
 safetensors>=0.4.1
-scipy>=1.12.0
+scipy>=1.12.0
+soundfile>=0.12.1
utils.py CHANGED
@@ -4,8 +4,23 @@ import librosa
 
 def load_audio(audio_file, sr=22050):
     """Load an audio file and convert to mono if needed."""
-    y, sr = librosa.load(audio_file, sr=sr, mono=True)
-    return y, sr
+    try:
+        # Try to load audio with librosa
+        y, sr = librosa.load(audio_file, sr=sr, mono=True)
+        return y, sr
+    except Exception as e:
+        print(f"Error loading audio with librosa: {str(e)}")
+        # Fallback to basic loading if necessary
+        import soundfile as sf
+        try:
+            y, sr = sf.read(audio_file)
+            # Convert to mono if stereo
+            if len(y.shape) > 1:
+                y = y.mean(axis=1)
+            return y, sr
+        except Exception as e2:
+            print(f"Error loading audio with soundfile: {str(e2)}")
+            raise ValueError(f"Could not load audio file: {audio_file}")
 
 def extract_audio_duration(y, sr):
     """Get the duration of audio in seconds."""
@@ -13,9 +28,14 @@ def extract_audio_duration(y, sr):
 
 def extract_mfcc_features(y, sr, n_mfcc=20):
     """Extract MFCC features from audio."""
-    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
-    mfccs_mean = np.mean(mfccs.T, axis=0)
-    return mfccs_mean
+    try:
+        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
+        mfccs_mean = np.mean(mfccs.T, axis=0)
+        return mfccs_mean
+    except Exception as e:
+        print(f"Error extracting MFCCs: {str(e)}")
+        # Return a fallback feature vector if extraction fails
+        return np.zeros(n_mfcc)
 
 def calculate_lyrics_length(duration):
     """Calculate appropriate lyrics length based on audio duration."""
@@ -39,4 +59,19 @@ def ensure_cuda_availability():
         print(f"CUDA is available with {device_count} device(s). Using: {device_name}")
     else:
         print("CUDA is not available. Using CPU for inference.")
-    return cuda_available
+    return cuda_available
+
+def preprocess_audio_for_model(waveform, sample_rate, target_sample_rate=16000, max_length=16000):
+    """Preprocess audio for model input (resample, pad/trim)."""
+    # Resample if needed
+    if sample_rate != target_sample_rate:
+        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=target_sample_rate)
+
+    # Trim or pad to expected length
+    if len(waveform) > max_length:
+        waveform = waveform[:max_length]
+    elif len(waveform) < max_length:
+        padding = max_length - len(waveform)
+        waveform = np.pad(waveform, (0, padding), 'constant')
+
+    return waveform
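Note that max_length=16000 at the 16 kHz target is one second of audio, so longer clips get truncated. A hedged sketch of how the new helper could feed the feature extractor loaded in app.py (whether 16 kHz is the right target depends on the actual GENRE_MODEL_NAME checkpoint):

y, sr = load_audio("song.wav")                   # placeholder path
y = preprocess_audio_for_model(y, sr)            # resample to 16 kHz, pad/trim to 1 s
inputs = genre_processor(y, sampling_rate=16000, return_tensors="pt")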