Kabatubare committed
Commit 30a5efb
1 Parent(s): 86776b4

Update app.py

Files changed (1)
  1. app.py +16 -16
app.py CHANGED
@@ -1,42 +1,42 @@
- import gradio as gr
+ import torch
+ import torch.nn.functional as F
  import librosa
  import numpy as np
- import torch
- import logging
+ import gradio as gr
  from transformers import AutoModelForAudioClassification
+ import logging

  logging.basicConfig(level=logging.INFO)

+ # Load your model here
  model_path = "./"
  model = AutoModelForAudioClassification.from_pretrained(model_path)

  def preprocess_audio(audio_path, sr=22050):
-     # Load and trim the audio file
      audio, sr = librosa.load(audio_path, sr=sr)
      audio, _ = librosa.effects.trim(audio)
      return audio, sr

  def extract_features(audio, sr):
-     # Extract various features from the audio
-     S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)
-     log_S = librosa.power_to_db(S, ref=np.max)
-     y_harmonic, y_percussive = librosa.effects.hpss(audio)
-     chroma = librosa.feature.chroma_cqt(y=y_harmonic, sr=sr)
-     contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
-     tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(audio), sr=sr)
+     S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, hop_length=512, n_fft=2048)
+     S_DB = librosa.power_to_db(S, ref=np.max)
+
+     # Reshape the spectrogram to a sequence of overlapping 16x16 patches
+     patches = librosa.util.frame(S_DB.flatten(), frame_length=16*16, hop_length=(16-6)*(16-6)).T
+     patches = patches.reshape(patches.shape[0], 16, 16)

-     # Stack features and add batch dimension
-     features = np.vstack([log_S, chroma, contrast, tonnetz])
-     features_tensor = torch.tensor(features).float().unsqueeze(0) # (1, feature_dim, time_steps)
+     # Linear projection layer equivalent (patch embedding layer)
+     patch_embeddings = patches.reshape(patches.shape[0], -1)
+     patch_embeddings = torch.tensor(patch_embeddings).float()

-     return features_tensor
+     # Assuming positional embeddings and [CLS] token embedding are handled within the model
+     return patch_embeddings.unsqueeze(0) # Add batch dimension for compatibility with model

  def predict_voice(audio_file_path):
      try:
          audio, sr = preprocess_audio(audio_file_path)
          features = extract_features(audio, sr)

-         # Model prediction
          with torch.no_grad():
              outputs = model(features)
              logits = outputs.logits
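
As a reference for the new feature path, here is a minimal sketch (not part of the commit) that runs the same mel-spectrogram and patch-framing steps on a synthetic clip to show the tensor shape the model receives; the sample rate, clip length, and random audio are assumptions made purely for illustration.

import numpy as np
import librosa
import torch

sr = 22050
audio = np.random.randn(sr * 3).astype(np.float32)  # 3 s of noise as a stand-in clip

# Same steps as the updated extract_features(): log-mel spectrogram, flattened
# and framed into 256-value (16x16) patches with a hop of (16-6)*(16-6) = 100.
S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, hop_length=512, n_fft=2048)
S_DB = librosa.power_to_db(S, ref=np.max)
patches = librosa.util.frame(S_DB.flatten(), frame_length=16*16, hop_length=(16-6)*(16-6)).T
features = torch.tensor(patches.reshape(patches.shape[0], -1)).float().unsqueeze(0)
print(features.shape)  # (1, n_patches, 256), the batch that predict_voice passes to the model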