Kabatubare committed
Commit 30a5efb
1 Parent(s): 86776b4

Update app.py

Files changed (1)
  1. app.py +16 -16
app.py CHANGED
@@ -1,42 +1,42 @@
- import gradio as gr
+ import torch
+ import torch.nn.functional as F
  import librosa
  import numpy as np
- import torch
- import logging
+ import gradio as gr
  from transformers import AutoModelForAudioClassification
+ import logging

  logging.basicConfig(level=logging.INFO)

+ # Load your model here
  model_path = "./"
  model = AutoModelForAudioClassification.from_pretrained(model_path)

  def preprocess_audio(audio_path, sr=22050):
-     # Load and trim the audio file
      audio, sr = librosa.load(audio_path, sr=sr)
      audio, _ = librosa.effects.trim(audio)
      return audio, sr

  def extract_features(audio, sr):
-     # Extract various features from the audio
-     S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)
-     log_S = librosa.power_to_db(S, ref=np.max)
-     y_harmonic, y_percussive = librosa.effects.hpss(audio)
-     chroma = librosa.feature.chroma_cqt(y=y_harmonic, sr=sr)
-     contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
-     tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(audio), sr=sr)
+     S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, hop_length=512, n_fft=2048)
+     S_DB = librosa.power_to_db(S, ref=np.max)
+
+     # Reshape the spectrogram to a sequence of overlapping 16x16 patches
+     patches = librosa.util.frame(S_DB.flatten(), frame_length=16*16, hop_length=(16-6)*(16-6)).T
+     patches = patches.reshape(patches.shape[0], 16, 16)

-     # Stack features and add batch dimension
-     features = np.vstack([log_S, chroma, contrast, tonnetz])
-     features_tensor = torch.tensor(features).float().unsqueeze(0) # (1, feature_dim, time_steps)
+     # Linear projection layer equivalent (patch embedding layer)
+     patch_embeddings = patches.reshape(patches.shape[0], -1)
+     patch_embeddings = torch.tensor(patch_embeddings).float()

-     return features_tensor
+     # Assuming positional embeddings and [CLS] token embedding are handled within the model
+     return patch_embeddings.unsqueeze(0) # Add batch dimension for compatibility with model

  def predict_voice(audio_file_path):
      try:
          audio, sr = preprocess_audio(audio_file_path)
          features = extract_features(audio, sr)

-         # Model prediction
          with torch.no_grad():
              outputs = model(features)
              logits = outputs.logits
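
As a reference for the new feature path, here is a minimal sketch (not part of the commit) that runs the same mel-spectrogram and patch-framing steps on a synthetic clip to show the tensor shape the model receives; the sample rate, clip length, and random audio are assumptions made purely for illustration.

import numpy as np
import librosa
import torch

sr = 22050
audio = np.random.randn(sr * 3).astype(np.float32)  # 3 s of noise as a stand-in clip

# Same steps as the updated extract_features(): log-mel spectrogram, flattened
# and framed into 256-value (16x16) patches with a hop of (16-6)*(16-6) = 100.
S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, hop_length=512, n_fft=2048)
S_DB = librosa.power_to_db(S, ref=np.max)
patches = librosa.util.frame(S_DB.flatten(), frame_length=16*16, hop_length=(16-6)*(16-6)).T
features = torch.tensor(patches.reshape(patches.shape[0], -1)).float().unsqueeze(0)
print(features.shape)  # (1, n_patches, 256), the batch that predict_voice passes to the model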