Spaces: Runtime error
mrneuralnet committed · Commit 3fb4562 · 1 Parent(s): f067c08
Initial commit
This view is limited to 50 files because it contains too many changes.
- .gitattributes +3 -33
- .gitignore +160 -0
- LICENSE +21 -0
- app.py +108 -0
- config.yaml +16 -0
- configs/finetuning/whisper_frontend_mesonet.yaml +16 -0
- configs/training/lcnn.yaml +14 -0
- configs/training/mesonet.yaml +15 -0
- configs/training/rawnet3.yaml +13 -0
- configs/training/specrnet.yaml +14 -0
- configs/training/whisper_frontend_lcnn.yaml +16 -0
- configs/training/whisper_frontend_lcnn_mfcc.yaml +15 -0
- configs/training/whisper_frontend_mesonet.yaml +16 -0
- configs/training/whisper_frontend_mesonet_mfcc.yaml +17 -0
- configs/training/whisper_frontend_specrnet.yaml +15 -0
- configs/training/whisper_frontend_specrnet_mfcc.yaml +16 -0
- configs/training/whisper_lcnn.yaml +15 -0
- configs/training/whisper_mesonet.yaml +16 -0
- configs/training/whisper_specrnet.yaml +15 -0
- download_whisper.py +29 -0
- evaluate_models.py +316 -0
- install.sh +6 -0
- mesonet_whisper_mfcc_finetuned.pth +3 -0
- sample_files/[FAKE] - jokowi - cupid [vocals].mp3 +3 -0
- sample_files/[REAL] - Obama at Rutgers: 'Ignorance Is Not a Virtue'_[cut_49sec].mp3 +3 -0
- sample_files/[REAL] - Obama's speech to the class of 2020 in 2 minutes | The Washington Post.wav +3 -0
- sample_files/[[FAKE] - y2mate.com - DeepFake AI generated synthetic video of Barack Obama.mp3 +3 -0
- src/__init__.py +3 -0
- src/commons.py +22 -0
- src/datasets/__init__.py +0 -0
- src/datasets/asvspoof_dataset.py +155 -0
- src/datasets/base_dataset.py +180 -0
- src/datasets/deepfake_asvspoof_dataset.py +86 -0
- src/datasets/detection_dataset.py +125 -0
- src/datasets/fakeavceleb_dataset.py +94 -0
- src/datasets/folder_dataset.py +75 -0
- src/datasets/in_the_wild_dataset.py +62 -0
- src/datasets/wavefake_dataset.py +85 -0
- src/frontends.py +72 -0
- src/metrics.py +15 -0
- src/models/__init__.py +0 -0
- src/models/assets/mel_filters.npz +0 -0
- src/models/assets/tiny_enc.en.pt +3 -0
- src/models/lcnn.py +247 -0
- src/models/meso_net.py +146 -0
- src/models/models.py +73 -0
- src/models/rawnet3.py +323 -0
- src/models/specrnet.py +226 -0
- src/models/whisper_lcnn.py +89 -0
- src/models/whisper_main.py +323 -0
.gitattributes
CHANGED
@@ -1,35 +1,5 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) Piotr Kawa, Marcin Plata, Michał Czuba, Piotr Szymański, Piotr Syga

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
app.py
ADDED
@@ -0,0 +1,108 @@
import base64
import json
import os, shutil
import re
import time
import uuid

import cv2

import numpy as np
import streamlit as st
from pydub import AudioSegment
import torch
import yaml
# from extract_video import extract_method_single_video

from utils import st_file_selector, img2base64
from evaluate_models import inference, load_model
from src import commons

import os

DEBUG = True

def main():
    st.markdown("###")
    uploaded_file = st.file_uploader('Upload an audio file', type=['wav', 'mp3'], accept_multiple_files=False)

    with st.spinner(f'Loading samples...'):
        while not os.path.isdir("sample_files"):
            time.sleep(1)
    st.markdown("### or")
    selected_file = st_file_selector(st, path='sample_files', key='selected', label='Choose a sample image/video')

    if uploaded_file:
        random_id = uuid.uuid1()
        ext = uploaded_file.name.split('.')[-1]

        base_folder = "temps"
        filename = "{}.{}".format(random_id, ext)
        file_type = uploaded_file.type.split("/")[0]
        filepath = f"{base_folder}/{filename}"

        uploaded_file_length = len(uploaded_file.getvalue())
        if uploaded_file_length > 0:
            with open(filepath, 'wb') as f:
                f.write(uploaded_file.read())
            st.audio(uploaded_file, format=ext)
    elif selected_file:
        base_folder = "sample_files"
        file_type = selected_file.split(".")[-1]
        filename = selected_file.split("/")[-1]
        filepath = f"{base_folder}/{selected_file}"

        st.write('file_type', file_type)
        with open(filepath, 'rb') as f:
            audio_bytes = f.read()
        st.audio(audio_bytes, format=file_type)
    else:
        return

    with st.spinner(f'Analyzing {file_type}...'):
        seed = config["data"].get("seed", 42)
        # fix all seeds - this should not actually change anything
        commons.set_seed(seed)

        result = inference(
            model,
            datasets_path=filepath,
            device=device,
        )
        result = result[0]

        if 'Real' == result[0]:
            st.success(f'Audio is real! \nprob:{result[1]}', icon="✅")
        else:
            st.error(f'Audio is fake! \nprob:{result[1]}', icon="🚨")

        st.divider()
        st.write('## Response JSON')
        st.write(result)


def setup():
    if not os.path.isdir("temps"):
        os.makedirs("temps")


if __name__ == "__main__":
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    with open('config.yaml', "r") as f:
        config = yaml.safe_load(f)

    model = load_model(config, device)

    st.title("Face Fake Detection")
    setup()
    main()
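Note that app.py imports st_file_selector and img2base64 from a local utils module that is not among the 50 files shown in this truncated view. For orientation only, a minimal sketch of what a directory-based st_file_selector could look like — the name and call shape come from the import above, the body is purely an assumption:

# Hypothetical sketch; the real utils.py is not visible in this diff view.
import os

def st_file_selector(st_obj, path='.', label='Select a file', key=None):
    # List regular files in `path` and let the user pick one via a selectbox.
    files = sorted(
        f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))
    )
    return st_obj.selectbox(label, files, key=key)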
config.yaml
ADDED
@@ -0,0 +1,16 @@
data:
  seed: 42

checkpoint:
  path: C:\Users\manfr\Projects\deepfake-whisper-features\mesonet_whisper_mfcc_finetuned.pth

model:
  name: whisper_frontend_mesonet
  optimizer:
    lr: 1.0e-06
    weight_decay: 0.0001
  parameters:
    fc1_dim: 1024
    freeze_encoder: false
    frontend_algorithm: ["mfcc"]
    input_channels: 2
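Note that checkpoint.path here is an absolute Windows path; inside a Linux-based Space container it cannot resolve, which is one plausible (but unconfirmed) explanation for the Space's "Runtime error" status. For reference, a sketch of the config keys that inference actually consumes, per app.py above and evaluate_models.py below:

# Sketch: the subset of config.yaml read at inference time.
import yaml

with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

seed = config["data"].get("seed", 42)             # fixed via commons.set_seed
ckpt_path = config["checkpoint"].get("path", [])  # weights file for load_model
model_name = config["model"]["name"]              # "whisper_frontend_mesonet"
params = config["model"]["parameters"]            # forwarded to models.get_model
print(model_name, ckpt_path, params)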
configs/finetuning/whisper_frontend_mesonet.yaml
ADDED
@@ -0,0 +1,16 @@
data:
  seed: 42

checkpoint:
  path: "trained_models/whisper_frontend_mesonet/ckpt.pth"

model:
  name: "whisper_frontend_mesonet"
  parameters:
    freeze_encoder: false
    input_channels: 2
    fc1_dim: 1024
    frontend_algorithm: ["lfcc"]
  optimizer:
    lr: 1.0e-06
    weight_decay: 0.0001
configs/training/lcnn.yaml
ADDED
@@ -0,0 +1,14 @@
data:
  seed: 42

checkpoint:
  path: ""

model:
  name: "lcnn"
  parameters:
    input_channels: 1
    frontend_algorithm: ["mfcc"]
  optimizer:
    lr: 0.0001
    weight_decay: 0.0001
configs/training/mesonet.yaml
ADDED
@@ -0,0 +1,15 @@
data:
  seed: 42

checkpoint:
  path: ""

model:
  name: "mesonet"
  parameters:
    input_channels: 1
    fc1_dim: 1024
    frontend_algorithm: ["lfcc"]
  optimizer:
    lr: 0.0001
    weight_decay: 0.0001
configs/training/rawnet3.yaml
ADDED
@@ -0,0 +1,13 @@
data:
  seed: 42

checkpoint:
  path: ""

model:
  name: "rawnet3"
  parameters: {}
  optimizer:
    lr: 0.001
    weight_decay: 0.00005 # 5e-5
configs/training/specrnet.yaml
ADDED
@@ -0,0 +1,14 @@
data:
  seed: 42

checkpoint:
  path: ""

model:
  name: "specrnet"
  parameters:
    input_channels: 1
    frontend_algorithm: ["lfcc"]
  optimizer:
    lr: 0.0001
    weight_decay: 0.0001
configs/training/whisper_frontend_lcnn.yaml
ADDED
@@ -0,0 +1,16 @@
data:
  seed: 42

checkpoint:
  path: ""

model:
  name: "whisper_frontend_lcnn"
  parameters:
    freeze_encoder: True
    input_channels: 2
    frontend_algorithm: ["lfcc"]
  optimizer:
    lr: 0.0001
    weight_decay: 0.0001
configs/training/whisper_frontend_lcnn_mfcc.yaml
ADDED
@@ -0,0 +1,15 @@
data:
  seed: 42

checkpoint:
  path: ""

model:
  name: "whisper_frontend_lcnn"
  parameters:
    freeze_encoder: True
    input_channels: 2
    frontend_algorithm: ["mfcc"]
  optimizer:
    lr: 0.0001
    weight_decay: 0.0001
configs/training/whisper_frontend_mesonet.yaml
ADDED
@@ -0,0 +1,16 @@
data:
  seed: 42

checkpoint:
  path: ""

model:
  name: "whisper_frontend_mesonet"
  parameters:
    freeze_encoder: True
    input_channels: 2
    fc1_dim: 1024
    frontend_algorithm: ["lfcc"]
  optimizer:
    lr: 0.0001
    weight_decay: 0.0001
configs/training/whisper_frontend_mesonet_mfcc.yaml
ADDED
@@ -0,0 +1,17 @@
data:
  seed: 42

checkpoint:
  path: ""

model:
  name: "whisper_frontend_mesonet"
  parameters:
    freeze_encoder: True
    input_channels: 2
    fc1_dim: 1024
    frontend_algorithm: ["mfcc"]
  optimizer:
    lr: 0.0001
    weight_decay: 0.0001
configs/training/whisper_frontend_specrnet.yaml
ADDED
@@ -0,0 +1,15 @@
data:
  seed: 42

checkpoint:
  path: ""

model:
  name: "whisper_frontend_specrnet"
  parameters:
    freeze_encoder: True
    input_channels: 2
    frontend_algorithm: ["lfcc"]
  optimizer:
    lr: 0.0001
    weight_decay: 0.0001
configs/training/whisper_frontend_specrnet_mfcc.yaml
ADDED
@@ -0,0 +1,16 @@
data:
  seed: 42

checkpoint:
  path: ""

model:
  name: "whisper_frontend_specrnet"
  parameters:
    freeze_encoder: True
    input_channels: 2
    frontend_algorithm: ["mfcc"]
  optimizer:
    lr: 0.0001
    weight_decay: 0.0001
configs/training/whisper_lcnn.yaml
ADDED
@@ -0,0 +1,15 @@
data:
  seed: 42

checkpoint:
  path: ""

model:
  name: "whisper_lcnn"
  parameters:
    freeze_encoder: True
    input_channels: 1
    frontend_algorithm: ["lfcc"]
  optimizer:
    lr: 0.0001
    weight_decay: 0.0001
configs/training/whisper_mesonet.yaml
ADDED
@@ -0,0 +1,16 @@
data:
  seed: 42

checkpoint:
  path: ""

model:
  name: "whisper_mesonet"
  parameters:
    freeze_encoder: True
    input_channels: 1
    fc1_dim: 1024
    frontend_algorithm: []
  optimizer:
    lr: 0.0001
    weight_decay: 0.0001
configs/training/whisper_specrnet.yaml
ADDED
@@ -0,0 +1,15 @@
data:
  seed: 42

checkpoint:
  path: ""

model:
  name: "whisper_specrnet"
  parameters:
    freeze_encoder: True
    input_channels: 1
    frontend_algorithm: ["lfcc"]
  optimizer:
    lr: 0.0001
    weight_decay: 0.0001
download_whisper.py
ADDED
@@ -0,0 +1,29 @@
# pip install git+https://github.com/openai/whisper.git
from collections import OrderedDict
import whisper
import torch

from src.commons import WHISPER_MODEL_WEIGHTS_PATH

def download_whisper():
    model = whisper.load_model("tiny.en")
    return model


def extract_and_save_encoder(model):
    model_ckpt = OrderedDict()

    model_ckpt['model_state_dict'] = OrderedDict()

    for key, value in model.encoder.state_dict().items():
        model_ckpt['model_state_dict'][f'encoder.{key}'] = value

    model_ckpt['dims'] = model.dims
    torch.save(model_ckpt, WHISPER_MODEL_WEIGHTS_PATH)


if __name__ == "__main__":
    model = download_whisper()
    print("Downloaded Whisper model!")
    extract_and_save_encoder(model)
    print(f"Saved encoder at '{WHISPER_MODEL_WEIGHTS_PATH}'")
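The checkpoint written above stores the encoder weights under keys prefixed with 'encoder.' plus the model dims, matching src/models/assets/tiny_enc.en.pt in this commit. A short sketch of inspecting that file after running the script:

# Sketch: inspect the encoder-only checkpoint written by download_whisper.py.
import torch

ckpt = torch.load("src/models/assets/tiny_enc.en.pt", map_location="cpu")
print(ckpt["dims"])               # Whisper ModelDimensions
state = ckpt["model_state_dict"]  # keys are prefixed with 'encoder.'
print(next(iter(state)))          # first key, prefixed with 'encoder.'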
evaluate_models.py
ADDED
@@ -0,0 +1,316 @@
import argparse
import logging
from pathlib import Path
from typing import Dict, List, Optional, Union
import sys

import torch
import yaml
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from torch.utils.data import DataLoader

from src import metrics, commons
from src.models import models
from src.datasets.base_dataset import SimpleAudioFakeDataset
from src.datasets.in_the_wild_dataset import InTheWildDataset
from src.datasets.folder_dataset import FolderDataset, FileDataset


def get_dataset(
    datasets_paths: List[Union[Path, str]],
    amount_to_use: Optional[int],
) -> SimpleAudioFakeDataset:
    data_val = FolderDataset(
        path=datasets_paths[0]
    )
    return data_val

def get_dataset_file(
    datasets_path,
    amount_to_use: Optional[int],
) -> SimpleAudioFakeDataset:
    data_val = FileDataset(
        path=datasets_path
    )
    return data_val


def evaluate_nn(
    model_paths: List[Path],
    datasets_paths: List[Union[Path, str]],
    model_config: Dict,
    device: str,
    amount_to_use: Optional[int] = None,
    batch_size: int = 8,
):
    logging.info("Loading data...")
    model_name, model_parameters = model_config["name"], model_config["parameters"]

    # Load model architecture
    model = models.get_model(
        model_name=model_name,
        config=model_parameters,
        device=device,
    )
    # If provided weights, apply corresponding ones (from an appropriate fold)
    if len(model_paths):
        state_dict = torch.load(model_paths, map_location=device)
        model.load_state_dict(state_dict)
    model = model.to(device)

    data_val = get_dataset(
        datasets_paths=datasets_paths,
        amount_to_use=amount_to_use,
    )

    logging.info(
        f"Testing '{model_name}' model, weights path: '{model_paths}', on {len(data_val)} audio files."
    )
    test_loader = DataLoader(
        data_val,
        batch_size=batch_size,
        shuffle=True,
        drop_last=False,
        num_workers=3,
    )

    batches_number = len(data_val) // batch_size
    num_correct = 0.0
    num_total = 0.0

    y_pred = torch.Tensor([]).to(device)
    y = torch.Tensor([]).to(device)
    y_pred_label = torch.Tensor([]).to(device)

    preds = []

    for i, (batch_x, _, batch_y, metadata) in enumerate(test_loader):
        model.eval()
        _, path, _, _ = metadata
        if i % 10 == 0:
            print(f"Batch [{i}/{batches_number}]")

        with torch.no_grad():
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            num_total += batch_x.size(0)

            batch_pred = model(batch_x).squeeze(1)
            batch_pred = torch.sigmoid(batch_pred)
            batch_pred_label = (batch_pred + 0.5).int()

            num_correct += (batch_pred_label == batch_y.int()).sum(dim=0).item()

            y_pred = torch.concat([y_pred, batch_pred], dim=0)
            y_pred_label = torch.concat([y_pred_label, batch_pred_label], dim=0)
            y = torch.concat([y, batch_y], dim=0)

    # `path` holds only the last batch's file paths; fine for single-file runs.
    for i in range(len(y_pred_label)):
        label = 'Fake' if y_pred_label[i] == 0 else 'Real'
        print(f'{path[i]}')
        print(f' Prediction: {label}')
        print(f' Probability: {y_pred[i]}')
        preds.append((label, y_pred[i].detach().cpu().item()))

    return preds

    # NOTE: the metric computation below is unreachable due to the early return above.
    eval_accuracy = (num_correct / num_total) * 100

    precision, recall, f1_score, support = precision_recall_fscore_support(
        y.cpu().numpy(), y_pred_label.cpu().numpy(), average="binary", beta=1.0
    )
    auc_score = roc_auc_score(y_true=y.cpu().numpy(), y_score=y_pred.cpu().numpy())

    # For EER flip values, following original evaluation implementation
    y_for_eer = 1 - y

    thresh, eer, fpr, tpr = metrics.calculate_eer(
        y=y_for_eer.cpu().numpy(),
        y_score=y_pred.cpu().numpy(),
    )

    eer_label = f"eval/eer"
    accuracy_label = f"eval/accuracy"
    precision_label = f"eval/precision"
    recall_label = f"eval/recall"
    f1_label = f"eval/f1_score"
    auc_label = f"eval/auc"

    logging.info(
        f"{eer_label}: {eer:.4f}, {accuracy_label}: {eval_accuracy:.4f}, {precision_label}: {precision:.4f}, {recall_label}: {recall:.4f}, {f1_label}: {f1_score:.4f}, {auc_label}: {auc_score:.4f}"
    )

def load_model(config, device):
    model_config = config['model']
    model_name, model_parameters = model_config["name"], model_config["parameters"]
    model_paths = config["checkpoint"].get("path", [])
    # Load model architecture
    model = models.get_model(
        model_name=model_name,
        config=model_parameters,
        device=device,
    )
    # If provided weights, apply corresponding ones (from an appropriate fold)
    if len(model_paths):
        state_dict = torch.load(model_paths, map_location=device)
        model.load_state_dict(state_dict)
    model = model.to(device)
    return model

def inference(
    model,
    datasets_path,
    device: str,
    amount_to_use: Optional[int] = None,
    batch_size: int = 8,
):
    logging.info("Loading data...")

    data_val = get_dataset_file(
        datasets_path=datasets_path,
        amount_to_use=amount_to_use,
    )

    test_loader = DataLoader(
        data_val,
        batch_size=batch_size,
        shuffle=True,
        drop_last=False,
        num_workers=3,
    )

    batches_number = len(data_val) // batch_size
    num_correct = 0.0
    num_total = 0.0

    y_pred = torch.Tensor([]).to(device)
    y = torch.Tensor([]).to(device)
    y_pred_label = torch.Tensor([]).to(device)

    preds = []

    for i, (batch_x, _, batch_y, metadata) in enumerate(test_loader):
        model.eval()
        _, path, _, _ = metadata
        if i % 10 == 0:
            print(f"Batch [{i}/{batches_number}]")

        with torch.no_grad():
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            num_total += batch_x.size(0)

            batch_pred = model(batch_x).squeeze(1)
            batch_pred = torch.sigmoid(batch_pred)
            batch_pred_label = (batch_pred + 0.5).int()

            num_correct += (batch_pred_label == batch_y.int()).sum(dim=0).item()

            y_pred = torch.concat([y_pred, batch_pred], dim=0)
            y_pred_label = torch.concat([y_pred_label, batch_pred_label], dim=0)
            y = torch.concat([y, batch_y], dim=0)

    for i in range(len(y_pred_label)):
        label = 'Fake' if y_pred_label[i] == 0 else 'Real'
        print(f'{path[i]}')
        print(f' Prediction: {label}')
        print(f' Probability: {y_pred[i]}')
        preds.append((label, y_pred[i].detach().cpu().item()))

    return preds

    # NOTE: the metric computation below is unreachable due to the early return above.
    eval_accuracy = (num_correct / num_total) * 100

    precision, recall, f1_score, support = precision_recall_fscore_support(
        y.cpu().numpy(), y_pred_label.cpu().numpy(), average="binary", beta=1.0
    )
    auc_score = roc_auc_score(y_true=y.cpu().numpy(), y_score=y_pred.cpu().numpy())

    # For EER flip values, following original evaluation implementation
    y_for_eer = 1 - y

    thresh, eer, fpr, tpr = metrics.calculate_eer(
        y=y_for_eer.cpu().numpy(),
        y_score=y_pred.cpu().numpy(),
    )

    eer_label = f"eval/eer"
    accuracy_label = f"eval/accuracy"
    precision_label = f"eval/precision"
    recall_label = f"eval/recall"
    f1_label = f"eval/f1_score"
    auc_label = f"eval/auc"

    logging.info(
        f"{eer_label}: {eer:.4f}, {accuracy_label}: {eval_accuracy:.4f}, {precision_label}: {precision:.4f}, {recall_label}: {recall:.4f}, {f1_label}: {f1_score:.4f}, {auc_label}: {auc_score:.4f}"
    )


def main(args):
    LOGGER = logging.getLogger()
    LOGGER.setLevel(logging.INFO)

    ch = logging.StreamHandler()
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    ch.setFormatter(formatter)
    LOGGER.addHandler(ch)
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    if not args.cpu and torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    with open(args.config, "r") as f:
        config = yaml.safe_load(f)

    seed = config["data"].get("seed", 42)
    # fix all seeds - this should not actually change anything
    commons.set_seed(seed)

    evaluate_nn(
        model_paths=config["checkpoint"].get("path", []),
        datasets_paths=[
            args.folder_path,
        ],
        model_config=config["model"],
        amount_to_use=args.amount,
        device=device,
    )


def parse_args():
    parser = argparse.ArgumentParser()

    # If assigned as None, then it won't be taken into account
    FOLDER_DATASET_PATH = "sample_files"

    parser.add_argument(
        "--folder_path", type=str, default=FOLDER_DATASET_PATH
    )

    default_model_config = "config.yaml"
    parser.add_argument(
        "--config",
        help="Model config file path (default: config.yaml)",
        type=str,
        default=default_model_config,
    )

    default_amount = None
    parser.add_argument(
        "--amount",
        "-a",
        help=f"Amount of files to load from each directory (default: {default_amount} - use all).",
        type=int,
        default=default_amount,
    )

    parser.add_argument("--cpu", "-c", help="Force using cpu", action="store_true")

    return parser.parse_args()


if __name__ == "__main__":
    main(parse_args())
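One detail worth calling out: (batch_pred + 0.5).int() simply rounds the sigmoid output, i.e. it thresholds the probability at 0.5, with label 1 printed as 'Real' and 0 as 'Fake'. A quick check of that equivalence:

# Sketch: adding 0.5 and truncating to int equals thresholding at 0.5.
import torch

p = torch.tensor([0.10, 0.49, 0.50, 0.90])
print((p + 0.5).int())   # tensor([0, 0, 1, 1], dtype=torch.int32)
print((p >= 0.5).int())  # same labels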
install.sh
ADDED
@@ -0,0 +1,6 @@
conda install pytorch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 cudatoolkit=11.3 -c pytorch -y

pip install asteroid-filterbanks==0.4.0
pip install librosa==0.9.2
pip install git+https://github.com/openai/whisper.git@7858aa9c08d98f75575035ecd6481f462d66ca27
pip install pandas==2.0.2
mesonet_whisper_mfcc_finetuned.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a34a00d0961303274e1cf7a2dc2b6e9f9d568ff0416300be1aaee1c2e2ceee12
size 32983925
sample_files/[FAKE] - jokowi - cupid [vocals].mp3
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1ce8dce41de4f44908c57deea26d4efe5a74f9a37700a76a94ac065e862304c0
size 775449
sample_files/[REAL] - Obama at Rutgers: 'Ignorance Is Not a Virtue'_[cut_49sec].mp3
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6694c6d329f8a372896808c1f7c1e487eec65e5ad2fb3d244d80729b211ac0c4
size 1950720
sample_files/[REAL] - Obama's speech to the class of 2020 in 2 minutes | The Washington Post.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3f045f2b80fdc136c63bfc5897dd9a3d34a3b60dba886cae297d4425db30d5d9
size 27507540
sample_files/[[FAKE] - y2mate.com - DeepFake AI generated synthetic video of Barack Obama.mp3
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fd7100f013cb23ae4af3e00594330838dcae39dc86d669ef8fd215a6a6d88f53
size 273900
src/__init__.py
ADDED
@@ -0,0 +1,3 @@
import logging

logging.getLogger(__name__).addHandler(logging.NullHandler())
src/commons.py
ADDED
@@ -0,0 +1,22 @@
"""Utility file for src toolkit."""
import os
import random

import numpy as np
import torch

WHISPER_MODEL_WEIGHTS_PATH = "src/models/assets/tiny_enc.en.pt"


def set_seed(seed: int):
    """Fix PRNG seed for reproducible experiments."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    os.environ["PYTHONHASHSEED"] = str(seed)
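set_seed pins Python's, NumPy's and PyTorch's generators (and disables cuDNN autotuning on GPU), so re-seeding replays the same random sequence; for example:

# Sketch: identical draws after re-seeding with set_seed.
from src.commons import set_seed
import torch

set_seed(42)
a = torch.rand(3)
set_seed(42)
b = torch.rand(3)
assert torch.equal(a, b)  # same seed -> same sequence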
src/datasets/__init__.py
ADDED
File without changes
src/datasets/asvspoof_dataset.py
ADDED
@@ -0,0 +1,155 @@
from pathlib import Path

import pandas as pd
if __name__ == "__main__":
    import sys
    sys.path.append(str(Path(__file__).parent.parent.parent.absolute()))

from src.datasets.base_dataset import SimpleAudioFakeDataset

ASVSPOOF_SPLIT = {
    "train": ['A01', 'A07', 'A08', 'A02', 'A09', 'A10', 'A03', 'A04', 'A05', 'A06', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16', 'A17', 'A18', 'A19'],
    "test": ['A01', 'A07', 'A08', 'A02', 'A09', 'A10', 'A03', 'A04', 'A05', 'A06', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16', 'A17', 'A18', 'A19'],
    "val": ['A01', 'A07', 'A08', 'A02', 'A09', 'A10', 'A03', 'A04', 'A05', 'A06', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16', 'A17', 'A18', 'A19'],
    "partition_ratio": [0.7, 0.15],
    "seed": 45,
}


class ASVSpoofDataset(SimpleAudioFakeDataset):

    protocol_folder_name = "ASVspoof2019_LA_cm_protocols"
    subset_dir_prefix = "ASVspoof2019_LA_"
    subsets = ("train", "dev", "eval")

    def __init__(self, path, subset="train", transform=None):
        super().__init__(subset, transform)
        self.path = path

        self.allowed_attacks = ASVSPOOF_SPLIT[subset]
        self.partition_ratio = ASVSPOOF_SPLIT["partition_ratio"]
        self.seed = ASVSPOOF_SPLIT["seed"]

        self.samples = pd.DataFrame()

        for subset in self.subsets:
            subset_dir = Path(self.path) / f"{self.subset_dir_prefix}{subset}"
            subset_protocol_path = self.get_protocol_path(subset)
            subset_samples = self.read_protocol(subset_dir, subset_protocol_path)

            self.samples = pd.concat([self.samples, subset_samples])

        self.transform = transform

    def get_protocol_path(self, subset):
        paths = list((Path(self.path) / self.protocol_folder_name).glob("*.txt"))
        for path in paths:
            if subset in Path(path).stem:
                return path

    def read_protocol(self, subset_dir, protocol_path):
        samples = {
            "user_id": [],
            "sample_name": [],
            "attack_type": [],
            "label": [],
            "path": []
        }

        real_samples = []
        fake_samples = []
        with open(protocol_path, "r") as file:
            for line in file:
                attack_type = line.strip().split(" ")[3]

                if attack_type == "-":
                    real_samples.append(line)
                elif attack_type in self.allowed_attacks:
                    fake_samples.append(line)

                if attack_type not in self.allowed_attacks:
                    continue

        fake_samples = self.split_samples(fake_samples)
        for line in fake_samples:
            samples = self.add_line_to_samples(samples, line, subset_dir)

        real_samples = self.split_samples(real_samples)
        for line in real_samples:
            samples = self.add_line_to_samples(samples, line, subset_dir)

        return pd.DataFrame(samples)

    @staticmethod
    def add_line_to_samples(samples, line, subset_dir):
        user_id, sample_name, _, attack_type, label = line.strip().split(" ")
        samples["user_id"].append(user_id)
        samples["sample_name"].append(sample_name)
        samples["attack_type"].append(attack_type)
        samples["label"].append(label)

        assert (subset_dir / "flac" / f"{sample_name}.flac").exists()
        samples["path"].append(subset_dir / "flac" / f"{sample_name}.flac")

        return samples

class ASVSpoof2019DatasetOriginal(ASVSpoofDataset):

    subsets = {"train": "train", "test": "dev", "val": "eval"}

    protocol_folder_name = "ASVspoof2019_LA_cm_protocols"
    subset_dir_prefix = "ASVspoof2019_LA_"
    subset_dirs_attacks = {
        "train": ["A01", "A02", "A03", "A04", "A05", "A06"],
        "dev": ["A01", "A02", "A03", "A04", "A05", "A06"],
        "eval": [
            "A07", "A08", "A09", "A10", "A11", "A12", "A13", "A14", "A15",
            "A16", "A17", "A18", "A19"
        ]
    }

    def __init__(self, path, fold_subset="train"):
        """
        Initialise object. Skip __init__ of ASVSpoofDataset due to different
        logic, but follow SimpleAudioFakeDataset constructor.
        """
        super(ASVSpoofDataset, self).__init__(float('inf'), fold_subset)
        self.path = path
        subset = self.subsets[fold_subset]
        self.allowed_attacks = self.subset_dirs_attacks[subset]
        subset_dir = Path(self.path) / f"{self.subset_dir_prefix}{subset}"
        subset_protocol_path = self.get_protocol_path(subset)
        self.samples = self.read_protocol(subset_dir, subset_protocol_path)

    def read_protocol(self, subset_dir, protocol_path):
        samples = {
            "user_id": [],
            "sample_name": [],
            "attack_type": [],
            "label": [],
            "path": []
        }

        real_samples = []
        fake_samples = []

        with open(protocol_path, "r") as file:
            for line in file:
                attack_type = line.strip().split(" ")[3]
                if attack_type == "-":
                    real_samples.append(line)
                elif attack_type in self.allowed_attacks:
                    fake_samples.append(line)
                else:
                    raise ValueError(
                        "Tried to load attack that shouldn't be here!"
                    )

        for line in fake_samples:
            samples = self.add_line_to_samples(samples, line, subset_dir)
        for line in real_samples:
            samples = self.add_line_to_samples(samples, line, subset_dir)

        return pd.DataFrame(samples)
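The partition_ratio [0.7, 0.15] used here is consumed by split_samples (defined in base_dataset.py below), which shuffles deterministically and cuts the sample list at the 70% and 85% marks, leaving 15% for val. A small illustration of the same np.split pattern:

# Sketch: how a [0.7, 0.15] partition splits 100 shuffled samples.
import numpy as np

samples = [f"sample_{i:03d}" for i in range(100)]
p, s = 0.7, 0.15
train, test, val = np.split(samples, [int(p * 100), int((p + s) * 100)])
print(len(train), len(test), len(val))  # 70 15 15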
src/datasets/base_dataset.py
ADDED
@@ -0,0 +1,180 @@
"""Base dataset classes."""
import logging
import math
import random

import numpy as np
import pandas as pd
import torch
import torchaudio
from torch.utils.data import Dataset
from torch.utils.data.dataset import T_co


LOGGER = logging.getLogger(__name__)

SAMPLING_RATE = 16_000
APPLY_NORMALIZATION = True
APPLY_TRIMMING = True
APPLY_PADDING = True
FRAMES_NUMBER = 480_000  # <- originally 64_600


SOX_SILENCE = [
    # trim all silence that is longer than 0.2s and louder than 1% volume (relative to the file)
    # from beginning and middle/end
    ["silence", "1", "0.2", "1%", "-1", "0.2", "1%"],
]


class SimpleAudioFakeDataset(Dataset):
    def __init__(
        self,
        subset,
        transform=None,
        return_label: bool = True,
        return_meta: bool = True,
    ):
        self.transform = transform

        self.subset = subset
        self.allowed_attacks = None
        self.partition_ratio = None
        self.seed = None
        self.return_label = return_label
        self.return_meta = return_meta

    def split_samples(self, samples_list):
        if isinstance(samples_list, pd.DataFrame):
            samples_list = samples_list.sort_values(by=list(samples_list.columns))
            samples_list = samples_list.sample(frac=1, random_state=self.seed)
        else:
            samples_list = sorted(samples_list)
            random.seed(self.seed)
            random.shuffle(samples_list)

        p, s = self.partition_ratio
        subsets = np.split(
            samples_list, [int(p * len(samples_list)), int((p + s) * len(samples_list))]
        )
        return dict(zip(["train", "test", "val"], subsets))[self.subset]

    def df2tuples(self):
        tuple_samples = []
        for i, elem in self.samples.iterrows():
            tuple_samples.append(
                (str(elem["path"]), elem["label"], elem["attack_type"])
            )

        self.samples = tuple_samples

        return self.samples

    def __getitem__(self, index) -> T_co:
        if isinstance(self.samples, pd.DataFrame):
            sample = self.samples.iloc[index]

            path = str(sample["path"])
            label = sample["label"]
            attack_type = sample["attack_type"]
            if type(attack_type) != str and math.isnan(attack_type):
                attack_type = "N/A"
        else:
            path, label, attack_type = self.samples[index]

        waveform, sample_rate = torchaudio.load(path, normalize=APPLY_NORMALIZATION)
        import librosa
        # waveform, sample_rate = librosa.load(path, sr=SAMPLING_RATE)
        # waveform = torch.tensor(waveform)
        print('waveform', waveform)
        real_sec_length = len(waveform[0]) / sample_rate

        waveform, sample_rate = apply_preprocessing(waveform, sample_rate)

        return_data = [waveform, sample_rate]
        if self.return_label:
            label = 1 if label == "bonafide" else 0
            return_data.append(label)

        if self.return_meta:
            return_data.append(
                (
                    attack_type,
                    path,
                    self.subset,
                    real_sec_length,
                )
            )
        return return_data

    def __len__(self):
        return len(self.samples)


def apply_preprocessing(
    waveform,
    sample_rate,
):
    if sample_rate != SAMPLING_RATE and SAMPLING_RATE != -1:
        waveform, sample_rate = resample_wave(waveform, sample_rate, SAMPLING_RATE)

    # Stereo to mono
    if waveform.dim() > 1 and waveform.shape[0] > 1:
        waveform = waveform[:1, ...]

    # Trim too long utterances...
    if APPLY_TRIMMING:
        waveform, sample_rate = apply_trim(waveform, sample_rate)

    # ... or pad too short ones.
    if APPLY_PADDING:
        waveform = apply_pad(waveform, FRAMES_NUMBER)

    return waveform, sample_rate


def resample_wave(waveform, sample_rate, target_sample_rate):
    # waveform, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
    #     waveform, sample_rate, [["rate", f"{target_sample_rate}"]]
    # )
    waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=target_sample_rate)
    return waveform, target_sample_rate


def resample_file(path, target_sample_rate, normalize=True):
    waveform, sample_rate = torchaudio.sox_effects.apply_effects_file(
        path, [["rate", f"{target_sample_rate}"]], normalize=normalize
    )

    return waveform, sample_rate


def apply_trim(waveform, sample_rate):
    # (
    #     waveform_trimmed,
    #     sample_rate_trimmed,
    # ) = torchaudio.sox_effects.apply_effects_tensor(waveform, sample_rate, SOX_SILENCE)
    ["silence", "1", "0.2", "1%", "-1", "0.2", "1%"],
    waveform_trimmed = torchaudio.functional.vad(waveform, sample_rate=sample_rate)

    if waveform_trimmed.size()[1] > 0:
        waveform = waveform_trimmed

    return waveform, sample_rate


def apply_pad(waveform, cut):
    """Pad wave by repeating signal until `cut` length is achieved."""
    waveform = waveform.squeeze(0)
    waveform_len = waveform.shape[0]

    if waveform_len >= cut:
        return waveform[:cut]

    # need to pad
    num_repeats = int(cut / waveform_len) + 1
    padded_waveform = torch.tile(waveform, (1, num_repeats))[:, :cut][0]

    return padded_waveform
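apply_pad tiles a short waveform until it reaches `cut` samples (FRAMES_NUMBER = 480,000, i.e. 30 s at the 16 kHz SAMPLING_RATE) and truncates anything longer; a toy run of the same logic:

# Sketch: repeat-padding a 4-sample waveform up to 10 samples, mirroring apply_pad.
import torch

wave = torch.tensor([1.0, 2.0, 3.0, 4.0])
cut = 10
num_repeats = int(cut / wave.shape[0]) + 1
padded = torch.tile(wave, (1, num_repeats))[:, :cut][0]
print(padded)  # tensor([1., 2., 3., 4., 1., 2., 3., 4., 1., 2.])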
src/datasets/deepfake_asvspoof_dataset.py
ADDED
@@ -0,0 +1,86 @@
import logging
from pathlib import Path

import pandas as pd

from src.datasets.base_dataset import SimpleAudioFakeDataset

DF_ASVSPOOF_SPLIT = {
    "partition_ratio": [0.7, 0.15],
    "seed": 45
}

LOGGER = logging.getLogger()

class DeepFakeASVSpoofDataset(SimpleAudioFakeDataset):

    protocol_file_name = "keys/CM/trial_metadata.txt"
    subset_dir_prefix = "ASVspoof2021_DF_eval"
    subset_parts = ("part00", "part01", "part02", "part03")

    def __init__(self, path, subset="train", transform=None):
        super().__init__(subset, transform)
        self.path = path

        self.partition_ratio = DF_ASVSPOOF_SPLIT["partition_ratio"]
        self.seed = DF_ASVSPOOF_SPLIT["seed"]

        self.flac_paths = self.get_file_references()
        self.samples = self.read_protocol()

        self.transform = transform
        LOGGER.info(f"Spoof: {len(self.samples[self.samples['label'] == 'spoof'])}")
        LOGGER.info(f"Original: {len(self.samples[self.samples['label'] == 'bonafide'])}")

    def get_file_references(self):
        flac_paths = {}
        for part in self.subset_parts:
            path = Path(self.path) / f"{self.subset_dir_prefix}_{part}" / self.subset_dir_prefix / "flac"
            flac_list = list(path.glob("*.flac"))

            for path in flac_list:
                flac_paths[path.stem] = path

        return flac_paths

    def read_protocol(self):
        samples = {
            "sample_name": [],
            "label": [],
            "path": [],
            "attack_type": [],
        }

        real_samples = []
        fake_samples = []
        with open(Path(self.path) / self.protocol_file_name, "r") as file:
            for line in file:
                label = line.strip().split(" ")[5]

                if label == "bonafide":
                    real_samples.append(line)
                elif label == "spoof":
                    fake_samples.append(line)

        fake_samples = self.split_samples(fake_samples)
        for line in fake_samples:
            samples = self.add_line_to_samples(samples, line)

        real_samples = self.split_samples(real_samples)
        for line in real_samples:
            samples = self.add_line_to_samples(samples, line)

        return pd.DataFrame(samples)

    def add_line_to_samples(self, samples, line):
        _, sample_name, _, _, _, label, _, _ = line.strip().split(" ")
        samples["sample_name"].append(sample_name)
        samples["label"].append(label)
        samples["attack_type"].append(label)

        sample_path = self.flac_paths[sample_name]
        assert sample_path.exists()
        samples["path"].append(sample_path)

        return samples
src/datasets/detection_dataset.py
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging
from typing import List, Optional

import pandas as pd

from src.datasets.base_dataset import SimpleAudioFakeDataset
from src.datasets.deepfake_asvspoof_dataset import DeepFakeASVSpoofDataset
from src.datasets.fakeavceleb_dataset import FakeAVCelebDataset
from src.datasets.wavefake_dataset import WaveFakeDataset
from src.datasets.asvspoof_dataset import ASVSpoof2019DatasetOriginal


LOGGER = logging.getLogger()


class DetectionDataset(SimpleAudioFakeDataset):
    def __init__(
        self,
        asvspoof_path=None,
        wavefake_path=None,
        fakeavceleb_path=None,
        asvspoof2019_path=None,
        subset: str = "val",
        transform=None,
        oversample: bool = True,
        undersample: bool = False,
        return_label: bool = True,
        reduced_number: Optional[int] = None,
        return_meta: bool = False,
    ):
        super().__init__(
            subset=subset,
            transform=transform,
            return_label=return_label,
            return_meta=return_meta,
        )
        datasets = self._init_datasets(
            asvspoof_path=asvspoof_path,
            wavefake_path=wavefake_path,
            fakeavceleb_path=fakeavceleb_path,
            asvspoof2019_path=asvspoof2019_path,
            subset=subset,
        )
        self.samples = pd.concat([ds.samples for ds in datasets], ignore_index=True)

        if oversample:
            self.oversample_dataset()
        elif undersample:
            self.undersample_dataset()

        if reduced_number:
            LOGGER.info(f"Using reduced number of samples - {reduced_number}!")
            self.samples = self.samples.sample(
                min(len(self.samples), reduced_number),
                random_state=42,
            )

    def _init_datasets(
        self,
        asvspoof_path: Optional[str],
        wavefake_path: Optional[str],
        fakeavceleb_path: Optional[str],
        asvspoof2019_path: Optional[str],
        subset: str,
    ) -> List[SimpleAudioFakeDataset]:
        datasets = []

        if asvspoof_path is not None:
            asvspoof_dataset = DeepFakeASVSpoofDataset(asvspoof_path, subset=subset)
            datasets.append(asvspoof_dataset)

        if wavefake_path is not None:
            wavefake_dataset = WaveFakeDataset(wavefake_path, subset=subset)
            datasets.append(wavefake_dataset)

        if fakeavceleb_path is not None:
            fakeavceleb_dataset = FakeAVCelebDataset(fakeavceleb_path, subset=subset)
            datasets.append(fakeavceleb_dataset)

        if asvspoof2019_path is not None:
            la_dataset = ASVSpoof2019DatasetOriginal(
                asvspoof2019_path, fold_subset=subset
            )
            datasets.append(la_dataset)

        return datasets

    def oversample_dataset(self):
        samples = self.samples.groupby(by=["label"])
        bona_length = len(samples.groups["bonafide"])
        spoof_length = len(samples.groups["spoof"])

        diff_length = spoof_length - bona_length

        if diff_length < 0:
            raise NotImplementedError

        if diff_length > 0:
            bonafide = samples.get_group("bonafide").sample(diff_length, replace=True)
            self.samples = pd.concat([self.samples, bonafide], ignore_index=True)

    def undersample_dataset(self):
        samples = self.samples.groupby(by=["label"])
        bona_length = len(samples.groups["bonafide"])
        spoof_length = len(samples.groups["spoof"])

        if spoof_length < bona_length:
            raise NotImplementedError

        if spoof_length > bona_length:
            spoofs = samples.get_group("spoof").sample(bona_length, replace=True)
            self.samples = pd.concat(
                [samples.get_group("bonafide"), spoofs], ignore_index=True
            )

    def get_bonafide_only(self):
        samples = self.samples.groupby(by=["label"])
        self.samples = samples.get_group("bonafide")
        return self.samples

    def get_spoof_only(self):
        samples = self.samples.groupby(by=["label"])
        self.samples = samples.get_group("spoof")
        return self.samples
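A minimal usage sketch of the merged dataset; the dataset locations below are placeholders, not paths shipped with this repo.

# Usage sketch - the paths are hypothetical.
from src.datasets.detection_dataset import DetectionDataset

train_ds = DetectionDataset(
    asvspoof_path="/data/ASVspoof2021_DF",  # placeholder location
    wavefake_path="/data/WaveFake",         # placeholder location
    subset="train",
    oversample=True,  # duplicates bonafide rows until the classes balance
)
print(len(train_ds.samples))
print(train_ds.samples["label"].value_counts())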
src/datasets/fakeavceleb_dataset.py
ADDED
@@ -0,0 +1,94 @@
from pathlib import Path

import pandas as pd

from src.datasets.base_dataset import SimpleAudioFakeDataset

FAKEAVCELEB_SPLIT = {
    "train": ['faceswap-wav2lip', 'fsgan-wav2lip', 'wav2lip', 'rtvc'],
    "test": ['faceswap-wav2lip', 'fsgan-wav2lip', 'wav2lip', 'rtvc'],
    "val": ['faceswap-wav2lip', 'fsgan-wav2lip', 'wav2lip', 'rtvc'],
    "partition_ratio": [0.7, 0.15],
    "seed": 45
}


class FakeAVCelebDataset(SimpleAudioFakeDataset):

    audio_folder = "FakeAVCeleb-audio"
    audio_extension = ".mp3"
    metadata_file = Path(audio_folder) / "meta_data.csv"
    subsets = ("train", "dev", "eval")

    def __init__(self, path, subset="train", transform=None):
        super().__init__(subset, transform)
        self.path = path

        self.subset = subset
        self.allowed_attacks = FAKEAVCELEB_SPLIT[subset]
        self.partition_ratio = FAKEAVCELEB_SPLIT["partition_ratio"]
        self.seed = FAKEAVCELEB_SPLIT["seed"]

        self.metadata = self.get_metadata()

        self.samples = pd.concat([self.get_fake_samples(), self.get_real_samples()], ignore_index=True)

    def get_metadata(self):
        md = pd.read_csv(Path(self.path) / self.metadata_file)
        md["audio_type"] = md["type"].apply(lambda x: x.split("-")[-1])
        return md

    def get_fake_samples(self):
        samples = {
            "user_id": [],
            "sample_name": [],
            "attack_type": [],
            "label": [],
            "path": []
        }

        for attack_name in self.allowed_attacks:
            fake_samples = self.metadata[
                (self.metadata["method"] == attack_name) & (self.metadata["audio_type"] == "FakeAudio")
            ]

            samples_list = fake_samples.iterrows()
            samples_list = self.split_samples(samples_list)

            for _, sample in samples_list:
                samples["user_id"].append(sample["source"])
                samples["sample_name"].append(Path(sample["filename"]).stem)
                samples["attack_type"].append(sample["method"])
                samples["label"].append("spoof")
                samples["path"].append(self.get_file_path(sample))

        return pd.DataFrame(samples)

    def get_real_samples(self):
        samples = {
            "user_id": [],
            "sample_name": [],
            "attack_type": [],
            "label": [],
            "path": []
        }

        samples_list = self.metadata[
            (self.metadata["method"] == "real") & (self.metadata["audio_type"] == "RealAudio")
        ]

        samples_list = self.split_samples(samples_list)

        for index, sample in samples_list.iterrows():
            samples["user_id"].append(sample["source"])
            samples["sample_name"].append(Path(sample["filename"]).stem)
            samples["attack_type"].append("-")
            samples["label"].append("bonafide")
            samples["path"].append(self.get_file_path(sample))

        return pd.DataFrame(samples)

    def get_file_path(self, sample):
        path = "/".join([self.audio_folder, *sample["path"].split("/")[1:]])
        return Path(self.path) / path / Path(sample["filename"]).with_suffix(self.audio_extension)
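The audio_type column used above comes from splitting the metadata "type" field on "-"; a quick sketch, where the type values are assumptions about FakeAVCeleb's meta_data.csv conventions:

# Assumed "type" values; only the split logic mirrors get_metadata above.
for t in ["RealVideo-RealAudio", "FakeVideo-FakeAudio", "RealVideo-FakeAudio"]:
    print(t, "->", t.split("-")[-1])  # RealAudio / FakeAudio / FakeAudio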
src/datasets/folder_dataset.py
ADDED
@@ -0,0 +1,75 @@
import pandas as pd
import os
from pathlib import Path

from src.datasets.base_dataset import SimpleAudioFakeDataset


class FolderDataset(SimpleAudioFakeDataset):

    def __init__(
        self,
        path,
        subset="test",
        transform=None,
    ):
        super().__init__(subset=subset, transform=transform)
        self.path = path
        self.samples = self.read_samples()

    def read_samples(self):
        path = Path(self.path)
        print('ori path', path)
        print('list', os.listdir(path))

        samples = []
        for filepath in os.listdir(path):
            samples.append({
                'path': path / filepath,
                'label': '',
                'attack_type': '',
            })

        samples = pd.DataFrame(samples)
        print('samples', samples)
        return samples


class FileDataset(SimpleAudioFakeDataset):

    def __init__(
        self,
        path,
        subset="test",
        transform=None,
    ):
        super().__init__(subset=subset, transform=transform)
        self.path = path
        self.samples = self.read_samples()

    def read_samples(self):
        path = Path(self.path)

        samples = [{'path': path, 'label': '', 'attack_type': ''}]

        samples = pd.DataFrame(samples)
        print('samples', samples)
        return samples


if __name__ == "__main__":
    # The original demo instantiated InTheWildDataset, which is not defined
    # in this module; exercise FolderDataset instead.
    dataset = FolderDataset(
        path="../datasets/release_in_the_wild",
        subset="test",
    )

    print(len(dataset))
    print(dataset[0])
src/datasets/in_the_wild_dataset.py
ADDED
@@ -0,0 +1,62 @@
import numpy as np
import pandas as pd
from pathlib import Path

from src.datasets.base_dataset import SimpleAudioFakeDataset


class InTheWildDataset(SimpleAudioFakeDataset):

    def __init__(
        self,
        path,
        subset="train",
        transform=None,
        seed=None,
        partition_ratio=(0.7, 0.15),
        split_strategy="random"
    ):
        super().__init__(subset=subset, transform=transform)
        self.path = path
        self.read_samples()
        self.partition_ratio = partition_ratio
        self.seed = seed

    def read_samples(self):
        path = Path(self.path)
        meta_path = path / "meta.csv"

        self.samples = pd.read_csv(meta_path)
        self.samples["path"] = self.samples["file"].apply(lambda n: str(path / n))
        self.samples["file"] = self.samples["file"].apply(lambda n: Path(n).stem)
        self.samples["label"] = self.samples["label"].map({"bona-fide": "bonafide", "spoof": "spoof"})
        self.samples["attack_type"] = self.samples["label"].map({"bonafide": "-", "spoof": "X"})
        self.samples.rename(columns={'file': 'sample_name', 'speaker': 'user_id'}, inplace=True)

    def split_samples_per_speaker(self, samples):
        speaker_list = pd.Series(samples["user_id"].unique())
        speaker_list = speaker_list.sort_values()
        speaker_list = speaker_list.sample(frac=1, random_state=self.seed)
        speaker_list = list(speaker_list)

        p, s = self.partition_ratio
        subsets = np.split(speaker_list, [int(p * len(speaker_list)), int((p + s) * len(speaker_list))])
        speaker_subset = dict(zip(['train', 'test', 'val'], subsets))[self.subset]
        return self.samples[self.samples["user_id"].isin(speaker_subset)]


if __name__ == "__main__":
    dataset = InTheWildDataset(
        path="../datasets/release_in_the_wild",
        subset="val",
        seed=242,
        split_strategy="per_speaker"
    )

    print(len(dataset))
    print(len(dataset.samples["user_id"].unique()))
    print(dataset.samples["user_id"].unique())

    print(dataset[0])
src/datasets/wavefake_dataset.py
ADDED
@@ -0,0 +1,85 @@
from pathlib import Path

import pandas as pd

from src.datasets.base_dataset import SimpleAudioFakeDataset

WAVEFAKE_SPLIT = {
    "train": ['multi_band_melgan', 'melgan_large', 'parallel_wavegan', 'waveglow', 'full_band_melgan', 'melgan', 'hifiGAN'],
    "test": ['multi_band_melgan', 'melgan_large', 'parallel_wavegan', 'waveglow', 'full_band_melgan', 'melgan', 'hifiGAN'],
    "val": ['multi_band_melgan', 'melgan_large', 'parallel_wavegan', 'waveglow', 'full_band_melgan', 'melgan', 'hifiGAN'],
    "partition_ratio": [0.7, 0.15],
    "seed": 45
}


class WaveFakeDataset(SimpleAudioFakeDataset):

    fake_data_path = "generated_audio"
    jsut_real_data_path = "real_audio/jsut_ver1.1/basic5000/wav"
    ljspeech_real_data_path = "real_audio/LJSpeech-1.1/wavs"

    def __init__(self, path, subset="train", transform=None):
        super().__init__(subset, transform)
        self.path = Path(path)

        self.fold_subset = subset
        self.allowed_attacks = WAVEFAKE_SPLIT[subset]
        self.partition_ratio = WAVEFAKE_SPLIT["partition_ratio"]
        self.seed = WAVEFAKE_SPLIT["seed"]

        self.samples = pd.concat([self.get_fake_samples(), self.get_real_samples()], ignore_index=True)

    def get_fake_samples(self):
        samples = {
            "user_id": [],
            "sample_name": [],
            "attack_type": [],
            "label": [],
            "path": []
        }

        samples_list = list((self.path / self.fake_data_path).glob("*/*.wav"))
        samples_list = self.filter_samples_by_attack(samples_list)
        samples_list = self.split_samples(samples_list)

        for sample in samples_list:
            samples["user_id"].append(None)
            samples["sample_name"].append("_".join(sample.stem.split("_")[:-1]))
            samples["attack_type"].append(self.get_attack_from_path(sample))
            samples["label"].append("spoof")
            samples["path"].append(sample)

        return pd.DataFrame(samples)

    def filter_samples_by_attack(self, samples_list):
        return [s for s in samples_list if self.get_attack_from_path(s) in self.allowed_attacks]

    def get_real_samples(self):
        samples = {
            "user_id": [],
            "sample_name": [],
            "attack_type": [],
            "label": [],
            "path": []
        }

        samples_list = list((self.path / self.jsut_real_data_path).glob("*.wav"))
        samples_list += list((self.path / self.ljspeech_real_data_path).glob("*.wav"))
        samples_list = self.split_samples(samples_list)

        for sample in samples_list:
            samples["user_id"].append(None)
            samples["sample_name"].append(sample.stem)
            samples["attack_type"].append("-")
            samples["label"].append("bonafide")
            samples["path"].append(sample)

        return pd.DataFrame(samples)

    @staticmethod
    def get_attack_from_path(path):
        folder_name = path.parents[0].relative_to(path.parents[1])
        return str(folder_name).split("_", maxsplit=1)[-1]
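get_attack_from_path reads the attack name from the parent folder, relying on WaveFake's "<dataset>_<attack>" directory convention; the sample path below is made up to match that pattern:

from pathlib import Path

# Hypothetical file following the generated_audio layout assumed above.
p = Path("generated_audio/ljspeech_multi_band_melgan/LJ001-0001_gen.wav")
folder_name = p.parents[0].relative_to(p.parents[1])  # ljspeech_multi_band_melgan
print(str(folder_name).split("_", maxsplit=1)[-1])    # -> multi_band_melgan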
src/frontends.py
ADDED
@@ -0,0 +1,72 @@
from typing import List, Union, Callable

import torch
import torchaudio

SAMPLING_RATE = 16_000
win_length = 400  # int((25 / 1_000) * SAMPLING_RATE)
hop_length = 160  # int((10 / 1_000) * SAMPLING_RATE)

device = "cuda" if torch.cuda.is_available() else "cpu"

MFCC_FN = torchaudio.transforms.MFCC(
    sample_rate=SAMPLING_RATE,
    n_mfcc=128,
    melkwargs={
        "n_fft": 512,
        "win_length": win_length,
        "hop_length": hop_length,
    },
).to(device)


LFCC_FN = torchaudio.transforms.LFCC(
    sample_rate=SAMPLING_RATE,
    n_lfcc=128,
    speckwargs={
        "n_fft": 512,
        "win_length": win_length,
        "hop_length": hop_length,
    },
).to(device)

MEL_SCALE_FN = torchaudio.transforms.MelScale(
    n_mels=80,
    n_stft=257,
    sample_rate=SAMPLING_RATE,
).to(device)

delta_fn = torchaudio.transforms.ComputeDeltas(
    win_length=400,
    mode="replicate",
)


def get_frontend(
    frontends: List[str],
) -> Union[torchaudio.transforms.MFCC, torchaudio.transforms.LFCC, Callable]:
    if "mfcc" in frontends:
        return prepare_mfcc_double_delta
    elif "lfcc" in frontends:
        return prepare_lfcc_double_delta
    raise ValueError(f"{frontends} frontend is not supported!")


def prepare_lfcc_double_delta(input):
    if input.ndim < 4:
        input = input.unsqueeze(1)  # (bs, 1, n_samples)
    x = LFCC_FN(input)
    delta = delta_fn(x)
    double_delta = delta_fn(delta)
    x = torch.cat((x, delta, double_delta), 2)  # -> [bs, 1, 128 * 3, frames]
    return x[:, :, :, :3000]  # (bs, 1, n_lfcc * 3, frames)


def prepare_mfcc_double_delta(input):
    if input.ndim < 4:
        input = input.unsqueeze(1)  # (bs, 1, n_samples)
    x = MFCC_FN(input)
    delta = delta_fn(x)
    double_delta = delta_fn(delta)
    x = torch.cat((x, delta, double_delta), 2)  # -> [bs, 1, 128 * 3, frames]
    return x[:, :, :, :3000]  # (bs, 1, n_mfcc * 3, frames)
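A quick shape check for these frontends; the 4-second clip is arbitrary, and the expected shape follows from n_mfcc=128 (tripled by the deltas) and hop_length=160:

import torch
from src.frontends import get_frontend, device

waveform = torch.rand(2, 64_000).to(device)  # (bs, n_samples), random audio
frontend_fn = get_frontend(["mfcc"])
features = frontend_fn(waveform)
print(features.shape)  # -> torch.Size([2, 1, 384, 401])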
src/metrics.py
ADDED
@@ -0,0 +1,15 @@
from typing import Tuple

import numpy as np
from scipy.interpolate import interp1d
from scipy.optimize import brentq
from sklearn.metrics import roc_curve


def calculate_eer(y, y_score) -> Tuple[float, float, np.ndarray, np.ndarray]:
    fpr, tpr, thresholds = roc_curve(y, -y_score)

    eer = brentq(lambda x: 1.0 - x - interp1d(fpr, tpr)(x), 0.0, 1.0)
    thresh = interp1d(fpr, thresholds)(eer)
    return thresh, eer, fpr, tpr
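A toy invocation with invented scores; because calculate_eer negates y_score internally, lower raw scores push a sample toward the positive class (label 1):

import numpy as np
from src.metrics import calculate_eer

labels = np.array([0, 1, 1, 0, 1, 0])
scores = np.array([0.05, 0.1, 0.2, 0.5, 0.6, 0.9])
thresh, eer, fpr, tpr = calculate_eer(labels, scores)
print(f"EER ~ {eer:.2f}")  # roughly 0.33 on this toy data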
src/models/__init__.py
ADDED
File without changes
src/models/assets/mel_filters.npz
ADDED
Binary file (2.05 kB).
src/models/assets/tiny_enc.en.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:206cca585e8ee06b813f958f72c548aebd489f125ef8949ad437f9fcc86e8cda
size 32853468
src/models/lcnn.py
ADDED
@@ -0,0 +1,247 @@
"""
This code is a modified version of the LCNN baseline
from the ASVspoof2021 challenge - https://github.com/asvspoof-challenge/2021/blob/main/LA/Baseline-LFCC-LCNN/project/baseline_LA/model.py
"""
import sys

import torch
import torch.nn as torch_nn

from src import frontends


NUM_COEFFICIENTS = 384


# For blstm
class BLSTMLayer(torch_nn.Module):
    """ Wrapper over a bi-directional LSTM
    Input tensor: (batchsize=1, length, dim_in)
    Output tensor: (batchsize=1, length, dim_out)
    We want to keep the length the same
    """
    def __init__(self, input_dim, output_dim):
        super().__init__()
        if output_dim % 2 != 0:
            print("Output_dim of BLSTMLayer is {:d}".format(output_dim))
            print("BLSTMLayer expects a layer size of even number")
            sys.exit(1)
        # bi-directional LSTM
        self.l_blstm = torch_nn.LSTM(
            input_dim,
            output_dim // 2,
            bidirectional=True
        )

    def forward(self, x):
        # permute to (length, batchsize=1, dim)
        blstm_data, _ = self.l_blstm(x.permute(1, 0, 2))
        # permute it back to (batchsize=1, length, dim)
        return blstm_data.permute(1, 0, 2)


class MaxFeatureMap2D(torch_nn.Module):
    """ Max feature map (along 2D)

    MaxFeatureMap2D(max_dim=1)

    l_conv2d = MaxFeatureMap2D(1)
    data_in = torch.rand([1, 4, 5, 5])
    data_out = l_conv2d(data_in)


    Input:
    ------
    data_in: tensor of shape (batch, channel, ...)

    Output:
    -------
    data_out: tensor of shape (batch, channel//2, ...)

    Note
    ----
    By default, Max-feature-map is on channel dimension,
    and maxout is used on (channel ...)
    """
    def __init__(self, max_dim=1):
        super().__init__()
        self.max_dim = max_dim

    def forward(self, inputs):
        # suppose inputs (batchsize, channel, length, dim)

        shape = list(inputs.size())

        if self.max_dim >= len(shape):
            print("MaxFeatureMap: maximize on %d dim" % (self.max_dim))
            print("But input has %d dimensions" % (len(shape)))
            sys.exit(1)
        if shape[self.max_dim] // 2 * 2 != shape[self.max_dim]:
            print("MaxFeatureMap: maximize on %d dim" % (self.max_dim))
            print("But this dimension has an odd number of data")
            sys.exit(1)
        shape[self.max_dim] = shape[self.max_dim] // 2
        shape.insert(self.max_dim, 2)

        # view to (batchsize, 2, channel//2, ...)
        # maximize on the 2nd dim
        m, i = inputs.view(*shape).max(self.max_dim)
        return m


##############
## FOR MODEL
##############

class LCNN(torch_nn.Module):
    """ Model definition
    """
    def __init__(self, **kwargs):
        super().__init__()
        input_channels = kwargs.get("input_channels", 1)
        num_coefficients = kwargs.get("num_coefficients", NUM_COEFFICIENTS)

        # Working sampling rate
        self.num_coefficients = num_coefficients

        # dimension of embedding vectors
        # here, the embedding is just the activation before sigmoid()
        self.v_emd_dim = 1

        # it can handle models with multiple front-end configurations
        # by default, only a single front-end

        self.m_transform = torch_nn.Sequential(
            torch_nn.Conv2d(input_channels, 64, (5, 5), 1, padding=(2, 2)),
            MaxFeatureMap2D(),
            torch.nn.MaxPool2d((2, 2), (2, 2)),

            torch_nn.Conv2d(32, 64, (1, 1), 1, padding=(0, 0)),
            MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(32, affine=False),
            torch_nn.Conv2d(32, 96, (3, 3), 1, padding=(1, 1)),
            MaxFeatureMap2D(),

            torch.nn.MaxPool2d((2, 2), (2, 2)),
            torch_nn.BatchNorm2d(48, affine=False),

            torch_nn.Conv2d(48, 96, (1, 1), 1, padding=(0, 0)),
            MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(48, affine=False),
            torch_nn.Conv2d(48, 128, (3, 3), 1, padding=(1, 1)),
            MaxFeatureMap2D(),

            torch.nn.MaxPool2d((2, 2), (2, 2)),

            torch_nn.Conv2d(64, 128, (1, 1), 1, padding=(0, 0)),
            MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(64, affine=False),
            torch_nn.Conv2d(64, 64, (3, 3), 1, padding=(1, 1)),
            MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(32, affine=False),

            torch_nn.Conv2d(32, 64, (1, 1), 1, padding=(0, 0)),
            MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(32, affine=False),
            torch_nn.Conv2d(32, 64, (3, 3), 1, padding=(1, 1)),
            MaxFeatureMap2D(),
            torch_nn.MaxPool2d((2, 2), (2, 2)),

            torch_nn.Dropout(0.7)
        )

        self.m_before_pooling = torch_nn.Sequential(
            BLSTMLayer((self.num_coefficients // 16) * 32, (self.num_coefficients // 16) * 32),
            BLSTMLayer((self.num_coefficients // 16) * 32, (self.num_coefficients // 16) * 32)
        )

        self.m_output_act = torch_nn.Linear((self.num_coefficients // 16) * 32, self.v_emd_dim)

    def _compute_embedding(self, x):
        """ definition of forward method
        Assume x (batchsize, length, dim)
        Output x (batchsize * number_filter, output_dim)
        """
        # resample if necessary
        # x = self.m_resampler(x.squeeze(-1)).unsqueeze(-1)

        # number of sub models
        batch_size = x.shape[0]

        # buffer to store output scores from sub-models
        output_emb = torch.zeros(
            [batch_size, self.v_emd_dim],
            device=x.device,
            dtype=x.dtype
        )

        # compute scores for each sub-model
        idx = 0

        # compute scores
        # 1. unsqueeze to (batch, 1, frame_length, fft_bin)
        # 2. compute hidden features
        x = x.permute(0, 1, 3, 2)
        hidden_features = self.m_transform(x)

        # 3. (batch, channel, frame//N, feat_dim//N) ->
        #    (batch, frame//N, channel * feat_dim//N)
        #    where N is caused by conv with stride
        hidden_features = hidden_features.permute(0, 2, 1, 3).contiguous()
        frame_num = hidden_features.shape[1]

        hidden_features = hidden_features.view(batch_size, frame_num, -1)
        # 4. pass through LSTM then sum
        hidden_features_lstm = self.m_before_pooling(hidden_features)

        # 5. pass through the output layer
        tmp_emb = self.m_output_act((hidden_features_lstm + hidden_features).mean(1))
        output_emb[idx * batch_size: (idx + 1) * batch_size] = tmp_emb

        return output_emb

    def _compute_score(self, feature_vec):
        # feature_vec is [batch * submodel, 1]
        return torch.sigmoid(feature_vec).squeeze(1)

    def forward(self, x):
        feature_vec = self._compute_embedding(x)
        return feature_vec


class FrontendLCNN(LCNN):
    """ Model definition
    """
    def __init__(self, device: str = "cuda", **kwargs):
        super().__init__(**kwargs)

        self.device = device

        frontend_name = kwargs.get("frontend_algorithm", [])
        self.frontend = frontends.get_frontend(frontend_name)
        print(f"Using {frontend_name} frontend")

    def _compute_frontend(self, x):
        frontend = self.frontend(x)
        if frontend.ndim < 4:
            return frontend.unsqueeze(1)  # (bs, 1, n_lfcc, frames)
        return frontend  # (bs, n, n_lfcc, frames)

    def forward(self, x):
        x = self._compute_frontend(x)
        feature_vec = self._compute_embedding(x)

        return feature_vec


if __name__ == "__main__":

    device = "cuda"
    print("Definition of model")
    model = FrontendLCNN(input_channels=2, num_coefficients=80, device=device, frontend_algorithm=["mel_spec"])
    model = model.to(device)
    batch_size = 12
    mock_input = torch.rand((batch_size, 64_600,), device=device)
    output = model(mock_input)
    print(output.shape)
src/models/meso_net.py
ADDED
@@ -0,0 +1,146 @@
"""
This code is a modified version of the MesoNet DeepFake detection solution
from the FakeAVCeleb repository - https://github.com/DASH-Lab/FakeAVCeleb/blob/main/models/MesoNet.py.
"""
import torch
import torch.nn as nn

from src import frontends


class MesoInception4(nn.Module):
    """
    Pytorch implementation of MesoInception4
    Author: Honggu Liu
    Date: July 7, 2019
    """
    def __init__(self, num_classes=1, **kwargs):
        super().__init__()

        self.fc1_dim = kwargs.get("fc1_dim", 1024)
        input_channels = kwargs.get("input_channels", 3)
        self.num_classes = num_classes

        # InceptionLayer1
        self.Incption1_conv1 = nn.Conv2d(input_channels, 1, 1, padding=0, bias=False)
        self.Incption1_conv2_1 = nn.Conv2d(input_channels, 4, 1, padding=0, bias=False)
        self.Incption1_conv2_2 = nn.Conv2d(4, 4, 3, padding=1, bias=False)
        self.Incption1_conv3_1 = nn.Conv2d(input_channels, 4, 1, padding=0, bias=False)
        self.Incption1_conv3_2 = nn.Conv2d(4, 4, 3, padding=2, dilation=2, bias=False)
        self.Incption1_conv4_1 = nn.Conv2d(input_channels, 2, 1, padding=0, bias=False)
        self.Incption1_conv4_2 = nn.Conv2d(2, 2, 3, padding=3, dilation=3, bias=False)
        self.Incption1_bn = nn.BatchNorm2d(11)

        # InceptionLayer2
        self.Incption2_conv1 = nn.Conv2d(11, 2, 1, padding=0, bias=False)
        self.Incption2_conv2_1 = nn.Conv2d(11, 4, 1, padding=0, bias=False)
        self.Incption2_conv2_2 = nn.Conv2d(4, 4, 3, padding=1, bias=False)
        self.Incption2_conv3_1 = nn.Conv2d(11, 4, 1, padding=0, bias=False)
        self.Incption2_conv3_2 = nn.Conv2d(4, 4, 3, padding=2, dilation=2, bias=False)
        self.Incption2_conv4_1 = nn.Conv2d(11, 2, 1, padding=0, bias=False)
        self.Incption2_conv4_2 = nn.Conv2d(2, 2, 3, padding=3, dilation=3, bias=False)
        self.Incption2_bn = nn.BatchNorm2d(12)

        # Normal Layer
        self.conv1 = nn.Conv2d(12, 16, 5, padding=2, bias=False)
        self.relu = nn.ReLU(inplace=True)
        self.leakyrelu = nn.LeakyReLU(0.1)
        self.bn1 = nn.BatchNorm2d(16)
        self.maxpooling1 = nn.MaxPool2d(kernel_size=(2, 2))

        self.conv2 = nn.Conv2d(16, 16, 5, padding=2, bias=False)
        self.maxpooling2 = nn.MaxPool2d(kernel_size=(4, 4))

        self.dropout = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(self.fc1_dim, 16)
        self.fc2 = nn.Linear(16, num_classes)

    # InceptionLayer
    def InceptionLayer1(self, input):
        x1 = self.Incption1_conv1(input)
        x2 = self.Incption1_conv2_1(input)
        x2 = self.Incption1_conv2_2(x2)
        x3 = self.Incption1_conv3_1(input)
        x3 = self.Incption1_conv3_2(x3)
        x4 = self.Incption1_conv4_1(input)
        x4 = self.Incption1_conv4_2(x4)
        y = torch.cat((x1, x2, x3, x4), 1)
        y = self.Incption1_bn(y)
        y = self.maxpooling1(y)

        return y

    def InceptionLayer2(self, input):
        x1 = self.Incption2_conv1(input)
        x2 = self.Incption2_conv2_1(input)
        x2 = self.Incption2_conv2_2(x2)
        x3 = self.Incption2_conv3_1(input)
        x3 = self.Incption2_conv3_2(x3)
        x4 = self.Incption2_conv4_1(input)
        x4 = self.Incption2_conv4_2(x4)
        y = torch.cat((x1, x2, x3, x4), 1)
        y = self.Incption2_bn(y)
        y = self.maxpooling1(y)

        return y

    def forward(self, input):
        x = self._compute_embedding(input)
        return x

    def _compute_embedding(self, input):
        x = self.InceptionLayer1(input)  # (Batch, 11, 128, 128)
        x = self.InceptionLayer2(x)      # (Batch, 12, 64, 64)

        x = self.conv1(x)  # (Batch, 16, 64, 64)
        x = self.relu(x)
        x = self.bn1(x)
        x = self.maxpooling1(x)  # (Batch, 16, 32, 32)

        x = self.conv2(x)  # (Batch, 16, 32, 32)
        x = self.relu(x)
        x = self.bn1(x)
        x = self.maxpooling2(x)  # (Batch, 16, 8, 8)

        x = x.view(x.size(0), -1)  # (Batch, 16*8*8)
        x = self.dropout(x)

        x = nn.AdaptiveAvgPool1d(self.fc1_dim)(x)
        x = self.fc1(x)  # (Batch, 16)
        x = self.leakyrelu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x


class FrontendMesoInception4(MesoInception4):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.device = kwargs['device']

        frontend_name = kwargs.get("frontend_algorithm", [])
        self.frontend = frontends.get_frontend(frontend_name)
        print(f"Using {frontend_name} frontend")

    def forward(self, x):
        x = self.frontend(x)
        x = self._compute_embedding(x)
        return x


if __name__ == "__main__":
    model = FrontendMesoInception4(
        input_channels=2,
        fc1_dim=1024,
        device='cuda',
        frontend_algorithm="lfcc"
    )

    def count_parameters(model) -> int:
        pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        return pytorch_total_params
    print(count_parameters(model))
src/models/models.py
ADDED
@@ -0,0 +1,73 @@
from typing import Dict

from src.models import (
    lcnn,
    specrnet,
    whisper_specrnet,
    rawnet3,
    whisper_lcnn,
    meso_net,
    whisper_meso_net
)


def get_model(model_name: str, config: Dict, device: str):
    if model_name == "rawnet3":
        return rawnet3.prepare_model()
    elif model_name == "lcnn":
        return lcnn.FrontendLCNN(device=device, **config)
    elif model_name == "specrnet":
        return specrnet.FrontendSpecRNet(
            device=device,
            **config,
        )
    elif model_name == "mesonet":
        return meso_net.FrontendMesoInception4(
            input_channels=config.get("input_channels", 1),
            fc1_dim=config.get("fc1_dim", 1024),
            frontend_algorithm=config.get("frontend_algorithm", "lfcc"),
            device=device,
        )
    elif model_name == "whisper_lcnn":
        return whisper_lcnn.WhisperLCNN(
            input_channels=config.get("input_channels", 1),
            freeze_encoder=config.get("freeze_encoder", False),
            device=device,
        )
    elif model_name == "whisper_specrnet":
        return whisper_specrnet.WhisperSpecRNet(
            input_channels=config.get("input_channels", 1),
            freeze_encoder=config.get("freeze_encoder", False),
            device=device,
        )
    elif model_name == "whisper_mesonet":
        return whisper_meso_net.WhisperMesoNet(
            input_channels=config.get("input_channels", 1),
            freeze_encoder=config.get("freeze_encoder", True),
            fc1_dim=config.get("fc1_dim", 1024),
            device=device,
        )
    elif model_name == "whisper_frontend_lcnn":
        return whisper_lcnn.WhisperMultiFrontLCNN(
            input_channels=config.get("input_channels", 2),
            freeze_encoder=config.get("freeze_encoder", False),
            frontend_algorithm=config.get("frontend_algorithm", "lfcc"),
            device=device,
        )
    elif model_name == "whisper_frontend_specrnet":
        return whisper_specrnet.WhisperMultiFrontSpecRNet(
            input_channels=config.get("input_channels", 2),
            freeze_encoder=config.get("freeze_encoder", False),
            frontend_algorithm=config.get("frontend_algorithm", "lfcc"),
            device=device,
        )
    elif model_name == "whisper_frontend_mesonet":
        return whisper_meso_net.WhisperMultiFrontMesoNet(
            input_channels=config.get("input_channels", 2),
            fc1_dim=config.get("fc1_dim", 1024),
            freeze_encoder=config.get("freeze_encoder", True),
            frontend_algorithm=config.get("frontend_algorithm", "lfcc"),
            device=device,
        )
    else:
        raise ValueError(f"Model '{model_name}' not supported")
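A minimal sketch of the factory; the config keys mirror the YAML files under configs/training/, and the values here are illustrative:

from src.models.models import get_model

config = {"input_channels": 1, "fc1_dim": 1024, "frontend_algorithm": ["lfcc"]}
model = get_model("mesonet", config=config, device="cpu")
print(type(model).__name__)  # -> FrontendMesoInception4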
src/models/rawnet3.py
ADDED
@@ -0,0 +1,323 @@
"""
This file contains implementation of RawNet3 architecture.
The original implementation can be found here: https://github.com/Jungjee/RawNet/tree/master/python/RawNet3
"""
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from asteroid_filterbanks import Encoder, ParamSincFB  # pip install asteroid_filterbanks


class RawNet3(nn.Module):
    def __init__(self, block, model_scale, context, summed, C=1024, **kwargs):
        super().__init__()

        nOut = kwargs["nOut"]

        self.context = context
        self.encoder_type = kwargs["encoder_type"]
        self.log_sinc = kwargs["log_sinc"]
        self.norm_sinc = kwargs["norm_sinc"]
        self.out_bn = kwargs["out_bn"]
        self.summed = summed

        self.preprocess = nn.Sequential(
            PreEmphasis(), nn.InstanceNorm1d(1, eps=1e-4, affine=True)
        )
        self.conv1 = Encoder(
            ParamSincFB(
                C // 4,
                251,
                stride=kwargs["sinc_stride"],
            )
        )
        self.relu = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(C // 4)

        self.layer1 = block(
            C // 4, C, kernel_size=3, dilation=2, scale=model_scale, pool=5
        )
        self.layer2 = block(
            C, C, kernel_size=3, dilation=3, scale=model_scale, pool=3
        )
        self.layer3 = block(C, C, kernel_size=3, dilation=4, scale=model_scale)
        self.layer4 = nn.Conv1d(3 * C, 1536, kernel_size=1)

        if self.context:
            attn_input = 1536 * 3
        else:
            attn_input = 1536
        print("self.encoder_type", self.encoder_type)
        if self.encoder_type == "ECA":
            attn_output = 1536
        elif self.encoder_type == "ASP":
            attn_output = 1
        else:
            raise ValueError("Undefined encoder")

        self.attention = nn.Sequential(
            nn.Conv1d(attn_input, 128, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Conv1d(128, attn_output, kernel_size=1),
            nn.Softmax(dim=2),
        )

        self.bn5 = nn.BatchNorm1d(3072)

        self.fc6 = nn.Linear(3072, nOut)
        self.bn6 = nn.BatchNorm1d(nOut)

        self.mp3 = nn.MaxPool1d(3)

    def forward(self, x):
        """
        :param x: input mini-batch (bs, samp)
        """

        with torch.cuda.amp.autocast(enabled=False):
            x = self.preprocess(x)
            x = torch.abs(self.conv1(x))
            if self.log_sinc:
                x = torch.log(x + 1e-6)
            if self.norm_sinc == "mean":
                x = x - torch.mean(x, dim=-1, keepdim=True)
            elif self.norm_sinc == "mean_std":
                m = torch.mean(x, dim=-1, keepdim=True)
                s = torch.std(x, dim=-1, keepdim=True)
                s[s < 0.001] = 0.001
                x = (x - m) / s

        if self.summed:
            x1 = self.layer1(x)
            x2 = self.layer2(x1)
            x3 = self.layer3(self.mp3(x1) + x2)
        else:
            x1 = self.layer1(x)
            x2 = self.layer2(x1)
            x3 = self.layer3(x2)

        x = self.layer4(torch.cat((self.mp3(x1), x2, x3), dim=1))
        x = self.relu(x)

        t = x.size()[-1]

        if self.context:
            global_x = torch.cat(
                (
                    x,
                    torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t),
                    torch.sqrt(
                        torch.var(x, dim=2, keepdim=True).clamp(
                            min=1e-4, max=1e4
                        )
                    ).repeat(1, 1, t),
                ),
                dim=1,
            )
        else:
            global_x = x

        w = self.attention(global_x)

        mu = torch.sum(x * w, dim=2)
        sg = torch.sqrt(
            (torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4)
        )

        x = torch.cat((mu, sg), 1)

        x = self.bn5(x)

        x = self.fc6(x)

        if self.out_bn:
            x = self.bn6(x)

        return x


class PreEmphasis(torch.nn.Module):
    def __init__(self, coef: float = 0.97) -> None:
        super().__init__()
        self.coef = coef
        # make kernel
        # In pytorch, the convolution operation uses cross-correlation. So, filter is flipped.
        self.register_buffer(
            "flipped_filter",
            torch.FloatTensor([-self.coef, 1.0]).unsqueeze(0).unsqueeze(0),
        )

    def forward(self, input: torch.tensor) -> torch.tensor:
        assert (
            len(input.size()) == 2
        ), "The number of dimensions of input tensor must be 2!"
        # reflect padding to match lengths of in/out
        input = input.unsqueeze(1)
        input = F.pad(input, (1, 0), "reflect")
        return F.conv1d(input, self.flipped_filter)


class AFMS(nn.Module):
    """
    Alpha-Feature map scaling, added to the output of each residual block[1,2].

    Reference:
    [1] RawNet2 : https://www.isca-speech.org/archive/Interspeech_2020/pdfs/1011.pdf
    [2] AFMS : https://www.koreascience.or.kr/article/JAKO202029757857763.page
    """

    def __init__(self, nb_dim: int) -> None:
        super().__init__()
        self.alpha = nn.Parameter(torch.ones((nb_dim, 1)))
        self.fc = nn.Linear(nb_dim, nb_dim)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        y = F.adaptive_avg_pool1d(x, 1).view(x.size(0), -1)
        y = self.sig(self.fc(y)).view(x.size(0), x.size(1), -1)

        x = x + self.alpha
        x = x * y
        return x


class Bottle2neck(nn.Module):
    def __init__(
        self,
        inplanes,
        planes,
        kernel_size=None,
        dilation=None,
        scale=4,
        pool=False,
    ):

        super().__init__()

        width = int(math.floor(planes / scale))

        self.conv1 = nn.Conv1d(inplanes, width * scale, kernel_size=1)
        self.bn1 = nn.BatchNorm1d(width * scale)

        self.nums = scale - 1

        convs = []
        bns = []

        num_pad = math.floor(kernel_size / 2) * dilation

        for i in range(self.nums):
            convs.append(
                nn.Conv1d(
                    width,
                    width,
                    kernel_size=kernel_size,
                    dilation=dilation,
                    padding=num_pad,
                )
            )
            bns.append(nn.BatchNorm1d(width))

        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)

        self.conv3 = nn.Conv1d(width * scale, planes, kernel_size=1)
        self.bn3 = nn.BatchNorm1d(planes)

        self.relu = nn.ReLU()

        self.width = width

        self.mp = nn.MaxPool1d(pool) if pool else False
        self.afms = AFMS(planes)

        if inplanes != planes:  # if change in number of filters
            self.residual = nn.Sequential(
                nn.Conv1d(inplanes, planes, kernel_size=1, stride=1, bias=False)
            )
        else:
            self.residual = nn.Identity()

    def forward(self, x):
        residual = self.residual(x)

        out = self.conv1(x)
        out = self.relu(out)
        out = self.bn1(out)

        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                sp = sp + spx[i]
            sp = self.convs[i](sp)
            sp = self.relu(sp)
            sp = self.bns[i](sp)
            if i == 0:
                out = sp
            else:
                out = torch.cat((out, sp), 1)

        out = torch.cat((out, spx[self.nums]), 1)

        out = self.conv3(out)
        out = self.relu(out)
        out = self.bn3(out)

        out += residual
        if self.mp:
            out = self.mp(out)
        out = self.afms(out)

        return out


def prepare_model():
    model = RawNet3(
        Bottle2neck,
        model_scale=8,
        context=True,
        summed=True,
        encoder_type="ECA",
        nOut=1,  # number of slices
        out_bn=False,
        sinc_stride=10,
        log_sinc=True,
        norm_sinc="mean",
        grad_mult=1,
    )
    return model


if __name__ == "__main__":
    model = RawNet3(
        Bottle2neck,
        model_scale=8,
        context=True,
        summed=True,
        encoder_type="ECA",
        nOut=1,  # number of slices
        out_bn=False,
        sinc_stride=10,
        log_sinc=True,
        norm_sinc="mean",
        grad_mult=1,
    )
    gpu = False

    model.eval()
    print("RawNet3 initialised & weights loaded!")

    if torch.cuda.is_available():
        print("Cuda available, conducting inference on GPU")
        model = model.to("cuda")
        gpu = True

    audios = torch.rand(32, 64_600)
    if gpu:
        # move the mock batch to the same device as the model
        audios = audios.to("cuda")

    out = model(audios)
    print(out.shape)
src/models/specrnet.py
ADDED
@@ -0,0 +1,226 @@
1 |
+
"""
|
2 |
+
This file contains implementation of SpecRNet architecture.
|
3 |
+
We base our codebase on the implementation of RawNet2 by Hemlata Tak (tak@eurecom.fr).
|
4 |
+
It is available here: https://github.com/asvspoof-challenge/2021/blob/main/LA/Baseline-RawNet2/model.py
|
5 |
+
"""
|
6 |
+
from typing import Dict
|
7 |
+
|
8 |
+
import torch.nn as nn
|
9 |
+
|
10 |
+
from src import frontends
|
11 |
+
|
12 |
+
|
13 |
+
def get_config(input_channels: int) -> Dict:
|
14 |
+
return {
|
15 |
+
"filts": [input_channels, [input_channels, 20], [20, 64], [64, 64]],
|
16 |
+
"nb_fc_node": 64,
|
17 |
+
"gru_node": 64,
|
18 |
+
"nb_gru_layer": 2,
|
19 |
+
"nb_classes": 1,
|
20 |
+
}
|
21 |
+
|
22 |
+
|
23 |
+
class Residual_block2D(nn.Module):
|
24 |
+
def __init__(self, nb_filts, first=False):
|
25 |
+
super().__init__()
|
26 |
+
self.first = first
|
27 |
+
|
28 |
+
if not self.first:
|
29 |
+
self.bn1 = nn.BatchNorm2d(num_features=nb_filts[0])
|
30 |
+
|
31 |
+
self.lrelu = nn.LeakyReLU(negative_slope=0.3)
|
32 |
+
|
33 |
+
self.conv1 = nn.Conv2d(
|
34 |
+
in_channels=nb_filts[0],
|
35 |
+
out_channels=nb_filts[1],
|
36 |
+
kernel_size=3,
|
37 |
+
padding=1,
|
38 |
+
stride=1,
|
39 |
+
)
|
40 |
+
|
41 |
+
self.bn2 = nn.BatchNorm2d(num_features=nb_filts[1])
|
42 |
+
self.conv2 = nn.Conv2d(
|
43 |
+
in_channels=nb_filts[1],
|
44 |
+
out_channels=nb_filts[1],
|
45 |
+
padding=1,
|
46 |
+
kernel_size=3,
|
47 |
+
stride=1,
|
48 |
+
)
|
49 |
+
|
50 |
+
if nb_filts[0] != nb_filts[1]:
|
51 |
+
self.downsample = True
|
52 |
+
self.conv_downsample = nn.Conv2d(
|
53 |
+
in_channels=nb_filts[0],
|
54 |
+
out_channels=nb_filts[1],
|
55 |
+
padding=0,
|
56 |
+
kernel_size=1,
|
57 |
+
stride=1,
|
58 |
+
)
|
59 |
+
|
60 |
+
else:
|
61 |
+
self.downsample = False
|
62 |
+
self.mp = nn.MaxPool2d(2)
|
63 |
+
|
64 |
+
def forward(self, x):
|
65 |
+
identity = x
|
66 |
+
if not self.first:
|
67 |
+
out = self.bn1(x)
|
68 |
+
out = self.lrelu(out)
|
69 |
+
else:
|
70 |
+
out = x
|
71 |
+
|
72 |
+
out = self.conv1(x)
|
73 |
+
out = self.bn2(out)
|
74 |
+
out = self.lrelu(out)
|
75 |
+
out = self.conv2(out)
|
76 |
+
|
77 |
+
if self.downsample:
|
78 |
+
identity = self.conv_downsample(identity)
|
79 |
+
|
80 |
+
out += identity
|
81 |
+
out = self.mp(out)
|
82 |
+
return out
|
83 |
+
|
84 |
+
|
85 |
+
class SpecRNet(nn.Module):
|
86 |
+
def __init__(self, input_channels, **kwargs):
|
87 |
+
super().__init__()
|
88 |
+
config = get_config(input_channels=input_channels)
|
89 |
+
|
90 |
+
self.device = kwargs.get("device", "cuda")
|
91 |
+
|
92 |
+
self.first_bn = nn.BatchNorm2d(num_features=config["filts"][0])
|
93 |
+
self.selu = nn.SELU(inplace=True)
|
94 |
+
self.block0 = nn.Sequential(
|
95 |
+
Residual_block2D(nb_filts=config["filts"][1], first=True)
|
96 |
+
)
|
97 |
+
self.block2 = nn.Sequential(Residual_block2D(nb_filts=config["filts"][2]))
|
98 |
+
config["filts"][2][0] = config["filts"][2][1]
|
99 |
+
self.block4 = nn.Sequential(Residual_block2D(nb_filts=config["filts"][2]))
|
100 |
+
self.avgpool = nn.AdaptiveAvgPool2d(1)
|
101 |
+
|
102 |
+
self.fc_attention0 = self._make_attention_fc(
|
103 |
+
in_features=config["filts"][1][-1], l_out_features=config["filts"][1][-1]
|
104 |
+
)
|
105 |
+
self.fc_attention2 = self._make_attention_fc(
|
106 |
+
in_features=config["filts"][2][-1], l_out_features=config["filts"][2][-1]
|
107 |
+
)
|
108 |
        self.fc_attention4 = self._make_attention_fc(
            in_features=config["filts"][2][-1], l_out_features=config["filts"][2][-1]
        )

        self.bn_before_gru = nn.BatchNorm2d(num_features=config["filts"][2][-1])
        self.gru = nn.GRU(
            input_size=config["filts"][2][-1],
            hidden_size=config["gru_node"],
            num_layers=config["nb_gru_layer"],
            batch_first=True,
            bidirectional=True,
        )

        self.fc1_gru = nn.Linear(
            in_features=config["gru_node"] * 2, out_features=config["nb_fc_node"] * 2
        )

        self.fc2_gru = nn.Linear(
            in_features=config["nb_fc_node"] * 2,
            out_features=config["nb_classes"],
            bias=True,
        )

        self.sig = nn.Sigmoid()

    def _compute_embedding(self, x):
        x = self.first_bn(x)
        x = self.selu(x)

        x0 = self.block0(x)
        y0 = self.avgpool(x0).view(x0.size(0), -1)
        y0 = self.fc_attention0(y0)
        y0 = self.sig(y0).view(y0.size(0), y0.size(1), -1)
        y0 = y0.unsqueeze(-1)
        x = x0 * y0 + y0

        x = nn.MaxPool2d(2)(x)

        x2 = self.block2(x)
        y2 = self.avgpool(x2).view(x2.size(0), -1)
        y2 = self.fc_attention2(y2)
        y2 = self.sig(y2).view(y2.size(0), y2.size(1), -1)
        y2 = y2.unsqueeze(-1)
        x = x2 * y2 + y2

        x = nn.MaxPool2d(2)(x)

        x4 = self.block4(x)
        y4 = self.avgpool(x4).view(x4.size(0), -1)
        y4 = self.fc_attention4(y4)
        y4 = self.sig(y4).view(y4.size(0), y4.size(1), -1)
        y4 = y4.unsqueeze(-1)
        x = x4 * y4 + y4

        x = nn.MaxPool2d(2)(x)

        x = self.bn_before_gru(x)
        x = self.selu(x)
        x = nn.AdaptiveAvgPool2d((1, None))(x)
        x = x.squeeze(-2)
        x = x.permute(0, 2, 1)
        self.gru.flatten_parameters()
        x, _ = self.gru(x)
        x = x[:, -1, :]
        x = self.fc1_gru(x)
        x = self.fc2_gru(x)
        return x

    def forward(self, x):
        x = self._compute_embedding(x)
        return x

    def _make_attention_fc(self, in_features, l_out_features):
        l_fc = []
        l_fc.append(nn.Linear(in_features=in_features, out_features=l_out_features))
        return nn.Sequential(*l_fc)


class FrontendSpecRNet(SpecRNet):
    def __init__(self, input_channels, **kwargs):
        super().__init__(input_channels, **kwargs)

        self.device = kwargs['device']

        frontend_name = kwargs.get("frontend_algorithm", [])
        self.frontend = frontends.get_frontend(frontend_name)
        print(f"Using {frontend_name} frontend")

    def _compute_frontend(self, x):
        frontend = self.frontend(x)
        if frontend.ndim < 4:
            return frontend.unsqueeze(1)  # (bs, 1, n_lfcc, frames)
        return frontend  # (bs, n, n_lfcc, frames)

    def forward(self, x):
        x = self._compute_frontend(x)
        x = self._compute_embedding(x)
        return x


if __name__ == "__main__":
    print("Definition of model")
    device = "cuda"

    input_channels = 1
    config = {
        "filts": [input_channels, [input_channels, 20], [20, 64], [64, 64]],
        "nb_fc_node": 64,
        "gru_node": 64,
        "nb_gru_layer": 2,
        "nb_classes": 1,
    }

    def count_parameters(model) -> int:
        pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        return pytorch_total_params

    model = FrontendSpecRNet(input_channels=1, device=device, frontend_algorithm=["lfcc"])
    model = model.to(device)
    print(count_parameters(model))
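The __main__ block above only instantiates the model and counts its parameters; a forward pass can be smoke-tested the same way. A minimal sketch, assuming the default LFCC settings in src.frontends and a CPU device (the exact frame count depends on the frontend configuration):

import torch

from src.models.specrnet import FrontendSpecRNet

model = FrontendSpecRNet(input_channels=1, device="cpu", frontend_algorithm=["lfcc"])

# Four seconds of fake 16 kHz audio; _compute_frontend turns each waveform
# into an LFCC map of shape (bs, 1, n_lfcc, frames) before the embedding.
waveform = torch.randn(2, 4 * 16_000)
logits = model(waveform)
print(logits.shape)  # expected: torch.Size([2, 1]), one logit per sample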
src/models/whisper_lcnn.py
ADDED
@@ -0,0 +1,89 @@
import torch

from src.models.whisper_main import ModelDimensions, Whisper, log_mel_spectrogram
from src.models.lcnn import LCNN
from src import frontends
from src.commons import WHISPER_MODEL_WEIGHTS_PATH


class WhisperLCNN(LCNN):
    def __init__(self, input_channels, freeze_encoder, **kwargs):
        super().__init__(input_channels=input_channels, **kwargs)

        self.device = kwargs['device']
        checkpoint = torch.load(WHISPER_MODEL_WEIGHTS_PATH)
        dims = ModelDimensions(**checkpoint["dims"].__dict__)
        model = Whisper(dims)
        model = model.to(self.device)
        model.load_state_dict(checkpoint["model_state_dict"])
        self.whisper_model = model
        if freeze_encoder:
            for param in self.whisper_model.parameters():
                param.requires_grad = False

    def compute_whisper_features(self, x):
        specs = []
        for sample in x:
            specs.append(log_mel_spectrogram(sample))
        x = torch.stack(specs)
        x = self.whisper_model(x)  # (bs, frames, 3 x n_lfcc)

        x = x.permute(0, 2, 1)  # (bs, 3 x n_lfcc, frames)
        x = x.unsqueeze(1)  # (bs, 1, 3 x n_lfcc, frames)
        x = x.repeat(
            (1, 1, 1, 2)
        )  # (bs, 1, 3 x n_lfcc, frames) -> (bs, 1, 3 x n_lfcc, 3000)
        return x

    def forward(self, x):
        # we assume the input is already the expected length (i.e. 30 s)
        x = self.compute_whisper_features(x)
        out = self._compute_embedding(x)
        return out


class WhisperMultiFrontLCNN(WhisperLCNN):
    def __init__(self, input_channels, freeze_encoder, **kwargs):
        super().__init__(input_channels=input_channels, freeze_encoder=freeze_encoder, **kwargs)

        self.frontend = frontends.get_frontend(kwargs['frontend_algorithm'])
        print(f"Using {self.frontend} frontend!")

    def forward(self, x):
        # Frontend computation
        frontend_x = self.frontend(x)
        x = self.compute_whisper_features(x)

        x = torch.cat([x, frontend_x], 1)
        out = self._compute_embedding(x)
        return out


if __name__ == "__main__":
    import numpy as np

    input_channels = 1
    device = "cpu"
    classifier = WhisperLCNN(
        input_channels=input_channels,
        freeze_encoder=True,
        device=device,
    )

    input_channels = 2
    classifier_2 = WhisperMultiFrontLCNN(
        input_channels=input_channels,
        freeze_encoder=True,
        device=device,
        frontend_algorithm="lfcc",
    )
    x = np.random.rand(2, 30 * 16_000).astype(np.float32)
    x = torch.from_numpy(x)

    out = classifier(x)
    print(out.shape)

    out = classifier_2(x)
    print(out.shape)
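The shape bookkeeping in compute_whisper_features is the subtle part: the encoder output is doubled along the time axis so that it lines up with a 3000-frame LFCC map and can be concatenated channel-wise in WhisperMultiFrontLCNN. A standalone sketch of just that tensor flow, assuming the tiny encoder's dimensions (n_audio_ctx = 1500, n_audio_state = 384) and an LFCC frontend that stacks deltas to 384 coefficients:

import torch

# Stand-in for the tiny Whisper encoder output on a 30 s clip.
enc_out = torch.randn(2, 1500, 384)  # (bs, frames, n_audio_state)

x = enc_out.permute(0, 2, 1)  # (2, 384, 1500)
x = x.unsqueeze(1)            # (2, 1, 384, 1500)
x = x.repeat((1, 1, 1, 2))    # (2, 1, 384, 3000), time axis doubled

lfcc = torch.randn(2, 1, 384, 3000)   # stand-in for the LFCC map with deltas
print(torch.cat([x, lfcc], 1).shape)  # torch.Size([2, 2, 384, 3000])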
src/models/whisper_main.py
ADDED
@@ -0,0 +1,323 @@
# Based on https://github.com/openai/whisper/blob/main/whisper/model.py
from dataclasses import dataclass
from functools import lru_cache
import os
from typing import Iterable, Optional, Union

import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from torch import nn


def exact_div(x, y):
    assert x % y == 0
    return x // y


# hard-coded audio hyperparameters
SAMPLE_RATE = 16000
N_FFT = 400
N_MELS = 80
HOP_LENGTH = 160
CHUNK_LENGTH = 30
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000: number of samples in a chunk
N_FRAMES = exact_div(
    N_SAMPLES, HOP_LENGTH
)  # 3000: number of frames in a mel spectrogram input


def pad_or_trim(
    array: Union[torch.Tensor, np.ndarray],
    length: int = N_SAMPLES,
    *,
    axis: int = -1,
) -> torch.Tensor:
    """
    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
    """
    if not torch.is_tensor(array):
        array = torch.from_numpy(array)

    if array.shape[axis] > length:
        array = array.index_select(
            dim=axis, index=torch.arange(length, device=array.device)
        )

    if array.shape[axis] < length:
        # pad by tiling the signal until it reaches the target length
        num_repeats = int(length / array.shape[axis]) + 1
        array = torch.tile(array, (1, num_repeats))[:, :length]
    return array


@lru_cache(maxsize=None)
def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor:
    """
    Load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling the librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
        )
    """
    assert n_mels == 80, f"Unsupported n_mels: {n_mels}"
    with np.load(
        os.path.join(os.path.dirname(__file__), "assets/mel_filters.npz")
    ) as f:
        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)


def log_mel_spectrogram(audio: torch.Tensor, n_mels: int = N_MELS):
    """
    Compute the log-Mel spectrogram of an audio waveform.

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio, or a NumPy array or Tensor containing the audio waveform sampled at 16 kHz

    n_mels: int
        The number of Mel-frequency filters; only 80 is supported

    Returns
    -------
    torch.Tensor, shape = (80, n_frames)
        A Tensor that contains the Mel spectrogram
    """
    window = torch.hann_window(N_FFT).to(audio.device)
    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
    magnitudes = stft[:, :-1].abs() ** 2

    filters = mel_filters(audio.device, n_mels)
    mel_spec = filters @ magnitudes

    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
    log_spec = (log_spec + 4.0) / 4.0
    return log_spec


@dataclass
class ModelDimensions:
    n_mels: int
    n_audio_ctx: int
    n_audio_state: int
    n_audio_head: int
    n_audio_layer: int
    n_vocab: int
    n_text_ctx: int
    n_text_state: int
    n_text_head: int
    n_text_layer: int


class LayerNorm(nn.LayerNorm):
    def forward(self, x: Tensor) -> Tensor:
        return super().forward(x.float()).type(x.dtype)


class Linear(nn.Linear):
    def forward(self, x: Tensor) -> Tensor:
        return F.linear(
            x,
            self.weight.to(x.dtype),
            None if self.bias is None else self.bias.to(x.dtype),
        )


class Conv1d(nn.Conv1d):
    def _conv_forward(
        self, x: Tensor, weight: Tensor, bias: Optional[Tensor]
    ) -> Tensor:
        return super()._conv_forward(
            x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype)
        )


def sinusoids(length, channels, max_timescale=10_000):
    """Returns sinusoids for positional embedding"""
    assert channels % 2 == 0
    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
    scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)


class MultiHeadAttention(nn.Module):
    def __init__(self, n_state: int, n_head: int):
        super().__init__()
        self.n_head = n_head
        self.query = Linear(n_state, n_state)
        self.key = Linear(n_state, n_state, bias=False)
        self.value = Linear(n_state, n_state)
        self.out = Linear(n_state, n_state)

    def forward(
        self,
        x: Tensor,
        xa: Optional[Tensor] = None,
        mask: Optional[Tensor] = None,
        kv_cache: Optional[dict] = None,
    ):
        q = self.query(x)

        if kv_cache is None or xa is None or self.key not in kv_cache:
            # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
            # otherwise, perform key/value projections for self- or cross-attention as usual.
            k = self.key(x if xa is None else xa)
            v = self.value(x if xa is None else xa)
        else:
            # for cross-attention, calculate keys and values once and reuse in subsequent calls.
            k = kv_cache[self.key]
            v = kv_cache[self.value]

        wv = self.qkv_attention(q, k, v, mask)
        return self.out(wv)

    def qkv_attention(
        self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None
    ):
        n_batch, n_ctx, n_state = q.shape
        scale = (n_state // self.n_head) ** -0.25
        q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale
        k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale
        v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)

        qk = q @ k
        if mask is not None:
            qk = qk + mask[:n_ctx, :n_ctx]

        w = F.softmax(qk.float(), dim=-1).to(q.dtype)
        return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2)


class ResidualAttentionBlock(nn.Module):
    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
        super().__init__()

        self.attn = MultiHeadAttention(n_state, n_head)
        self.attn_ln = LayerNorm(n_state)

        self.cross_attn = (
            MultiHeadAttention(n_state, n_head) if cross_attention else None
        )
        self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None

        n_mlp = n_state * 4
        self.mlp = nn.Sequential(
            Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)
        )
        self.mlp_ln = LayerNorm(n_state)

    def forward(
        self,
        x: Tensor,
        xa: Optional[Tensor] = None,
        mask: Optional[Tensor] = None,
        kv_cache: Optional[dict] = None,
    ):
        x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)
        if self.cross_attn:
            x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)
        x = x + self.mlp(self.mlp_ln(x))
        return x


class AudioEncoder(nn.Module):
    def __init__(
        self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
    ):
        super().__init__()
        self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
        self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
        self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))

        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
            [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
        )
        self.ln_post = LayerNorm(n_state)

    def forward(self, x: Tensor):
        """
        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio
        """
        x = F.gelu(self.conv1(x))
        x = F.gelu(self.conv2(x))
        x = x.permute(0, 2, 1)

        assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
        x = (x + self.positional_embedding).to(x.dtype)
        for block in self.blocks:
            x = block(x)

        x = self.ln_post(x)
        return x


class TextDecoder(nn.Module):
    def __init__(
        self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
    ):
        super().__init__()

        self.token_embedding = nn.Embedding(n_vocab, n_state)
        self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state))

        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
            [
                ResidualAttentionBlock(n_state, n_head, cross_attention=True)
                for _ in range(n_layer)
            ]
        )
        self.ln = LayerNorm(n_state)

        mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1)
        self.register_buffer("mask", mask, persistent=False)

    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
        """
        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
            the text tokens
        xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
            the encoded audio features to be attended on
        """
        offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
        x = (
            self.token_embedding(x)
            + self.positional_embedding[offset : offset + x.shape[-1]]
        )
        x = x.to(xa.dtype)

        for block in self.blocks:
            x = block(x, xa, mask=self.mask, kv_cache=kv_cache)

        x = self.ln(x)
        logits = (
            x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)
        ).float()

        return logits


class Whisper(nn.Module):
    # encoder-only variant: the decoder is not needed for feature extraction
    def __init__(self, dims: ModelDimensions):
        super().__init__()
        self.dims = dims
        self.encoder = AudioEncoder(
            self.dims.n_mels,
            self.dims.n_audio_ctx,
            self.dims.n_audio_state,
            self.dims.n_audio_head,
            self.dims.n_audio_layer,
        )

    def forward(self, mel: torch.Tensor):
        return self.encoder(mel)

    @property
    def device(self):
        return next(self.parameters()).device
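Since this stripped-down Whisper keeps only the encoder, it can be exercised in isolation. A minimal sketch with randomly initialized weights, assuming the tiny-encoder dimensions (the n_vocab and n_text_* fields must be filled in but are never used, because no decoder is instantiated):

import torch

from src.models.whisper_main import (
    ModelDimensions,
    Whisper,
    log_mel_spectrogram,
    N_SAMPLES,
)

# Tiny-encoder dimensions; the text-side values are placeholders here.
dims = ModelDimensions(
    n_mels=80, n_audio_ctx=1500, n_audio_state=384,
    n_audio_head=6, n_audio_layer=4,
    n_vocab=51864, n_text_ctx=448, n_text_state=384,
    n_text_head=6, n_text_layer=4,
)
model = Whisper(dims)

audio = torch.randn(N_SAMPLES)       # 30 s of fake 16 kHz audio
mel = log_mel_spectrogram(audio)     # (80, 3000)
features = model(mel.unsqueeze(0))   # (1, 1500, 384)
print(features.shape)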