blanchon committed
Commit e18a750
1 Parent(s): 9e822eb

first commit

__pycache__/dataloading.cpython-310.pyc ADDED
Binary file (4.55 kB)
__pycache__/gradio_utils.cpython-310.pyc ADDED
Binary file (1.5 kB)
__pycache__/preprocessing.cpython-310.pyc ADDED
Binary file (8.42 kB)
__pycache__/resnet.cpython-310.pyc ADDED
Binary file (2.13 kB)
app.py ADDED
@@ -0,0 +1,73 @@
+ import numpy as np
+
+ import skorch
+ import torch
+ import torch.nn as nn
+
+ import gradio as gr
+
+ import librosa
+
+ from joblib import dump, load
+
+ from sklearn.pipeline import Pipeline
+ from sklearn.preprocessing import LabelEncoder
+
+ from resnet import ResNet
+ from gradio_utils import load_as_librosa, predict_gradio
+ from dataloading import uniformize, to_numpy
+ from preprocessing import MfccTransformer, TorchTransform
+
+
+ SEED: int = 42
+ np.random.seed(SEED)
+ torch.manual_seed(SEED)
+
+ model = load('./model/model.joblib')
+ only_mffc_transform = load('./model/only_mffc_transform.joblib')
+ label_encoder = load('./model/label_encoder.joblib')
+ SAMPLE_RATE = load("./model/SAMPLE_RATE.joblib")
+ METHOD = load("./model/METHOD.joblib")
+ MAX_TIME = load("./model/MAX_TIME.joblib")
+ N_MFCC = load("./model/N_MFCC.joblib")
+ HOP_LENGHT = load("./model/HOP_LENGHT.joblib")
+
+ sklearn_model = Pipeline(
+     steps=[
+         ("mfcc", only_mffc_transform),
+         ("model", model)
+     ]
+ )
+
+ uniform_lambda = lambda y, sr: uniformize(y, sr, METHOD, MAX_TIME)
+
+ title = r"ResNet 9"
+
+ description = r"""
+ <center>
+ The resnet9 model was trained to classify drone speech commands.
+ <img src="http://zeus.blanchon.cc/dropshare/modia.png" width=200px>
+ </center>
+ """
+ article = r"""
+ - [Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385)
+ """
+
+ demo_men = gr.Interface(
+     title = title,
+     description = description,
+     article = article,
+     fn=lambda data: predict_gradio(
+         data=data,
+         uniform_lambda=uniform_lambda,
+         sklearn_model=sklearn_model,
+         label_transform=label_encoder,
+         target_sr=SAMPLE_RATE),
+     inputs = gr.Audio(source="microphone", type="numpy"),
+     outputs = gr.Label(),
+     # allow_flagging = "manual",
+     # flagging_options = ['recule', 'tournedroite', 'arretetoi', 'tournegauche', 'gauche', 'avance', 'droite'],
+     # flagging_dir = "./flag/men"
+ )
+
+ demo_men.launch()
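
A minimal offline sanity check of the same prediction path, without the Gradio UI, could look like the sketch below. It reuses the artifacts committed under ./model/ and the helpers in this repo; the input file name `sample.wav` is hypothetical.

    # offline_check.py -- hypothetical smoke test, not part of the commit
    import librosa
    import numpy as np
    from joblib import load
    from sklearn.pipeline import Pipeline
    from dataloading import uniformize

    sklearn_model = Pipeline(steps=[
        ("mfcc", load("./model/only_mffc_transform.joblib")),
        ("model", load("./model/model.joblib")),
    ])
    label_encoder = load("./model/label_encoder.joblib")
    SAMPLE_RATE = load("./model/SAMPLE_RATE.joblib")
    METHOD = load("./model/METHOD.joblib")
    MAX_TIME = load("./model/MAX_TIME.joblib")

    # Load one clip and uniformize it exactly like the training pipeline did.
    y, sr = librosa.load("sample.wav", sr=SAMPLE_RATE)  # hypothetical file
    y_uniform = uniformize(y, sr, METHOD, MAX_TIME).astype(np.float32)

    # The pipeline expects a (n_clips, n_samples) matrix.
    proba = sklearn_model.predict_proba(y_uniform.reshape(1, -1))
    print(label_encoder.inverse_transform([int(proba.argmax())]))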
best_model_gradio.ipynb ADDED
@@ -0,0 +1,504 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Best Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The autoreload extension is already loaded. To reload it, use:\n",
+ " %reload_ext autoreload\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import numpy as np\n",
+ "\n",
+ "import skorch\n",
+ "import torch\n",
+ "import torch.nn as nn\n",
+ "\n",
+ "import gradio as gr\n",
+ "\n",
+ "import librosa\n",
+ "\n",
+ "from joblib import dump, load\n",
+ "\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.preprocessing import LabelEncoder\n",
+ "\n",
+ "from resnet import ResNet\n",
+ "from gradio_utils import load_as_librosa, predict_gradio\n",
+ "from dataloading import uniformize, to_numpy\n",
+ "from preprocessing import MfccTransformer, TorchTransform\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Notebook params\n",
+ "SEED : int = 42\n",
+ "np.random.seed(SEED)\n",
+ "torch.manual_seed(SEED)\n",
+ "\n",
+ "# Dataloading params\n",
+ "PATHS: list[str] = [\n",
+ " \"../data/\",\n",
+ " \"../new_data/JulienNestor\",\n",
+ " \"../new_data/classroom_data\",\n",
+ " \"../new_data/class\",\n",
+ " \"../new_data/JulienRaph\",\n",
+ "]\n",
+ "REMOVE_LABEL: list[str] = [\n",
+ " \"penduleinverse\", \"pendule\", \n",
+ " \"decollage\", \"atterrissage\",\n",
+ " \"plushaut\", \"plusbas\",\n",
+ " \"etatdurgence\",\n",
+ " \"faisunflip\", \n",
+ " \"faisUnFlip\", \"arreteToi\", \"etatDurgence\",\n",
+ " # \"tournedroite\", \"arretetoi\", \"tournegauche\"\n",
+ "]\n",
+ "SAMPLE_RATE: int = 16_000\n",
+ "METHOD: str = \"time_stretch\"\n",
+ "MAX_TIME: float = 3.0\n",
+ "\n",
+ "# Features Extraction params\n",
+ "N_MFCC: int = 64\n",
+ "HOP_LENGHT = 2_048"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 1 - Dataloading"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 1-Dataloading\n",
+ "from dataloading import load_dataset, to_numpy\n",
+ "dataset, uniform_lambda = load_dataset(PATHS,\n",
+ " remove_label=REMOVE_LABEL,\n",
+ " sr=SAMPLE_RATE,\n",
+ " method=METHOD,\n",
+ " max_time=MAX_TIME\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['recule',\n",
+ " 'tournedroite',\n",
+ " 'arretetoi',\n",
+ " 'tournegauche',\n",
+ " 'gauche',\n",
+ " 'avance',\n",
+ " 'droite']"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "list(dataset[\"ground_truth\"].unique())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 2-Train and split\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "dataset_train, dataset_test = train_test_split(dataset, random_state=0)\n",
+ "\n",
+ "X_train = to_numpy(dataset_train[\"y_uniform\"])\n",
+ "y_train = to_numpy(dataset_train[\"ground_truth\"])\n",
+ "X_test = to_numpy(dataset_test[\"y_uniform\"])\n",
+ "y_test = to_numpy(dataset_test[\"ground_truth\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 2 - Preprocessing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "only_mffc_transform = Pipeline(\n",
+ " steps=[\n",
+ " (\"mfcc\", MfccTransformer(N_MFCC=N_MFCC, reshape_output=False, hop_length=HOP_LENGHT)),\n",
+ " (\"torch\", TorchTransform())\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "only_mffc_transform.fit(X_train)\n",
+ "\n",
+ "X_train_mfcc_torch = only_mffc_transform.transform(X_train)\n",
+ "X_test_mfcc_torch = only_mffc_transform.transform(X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Train a LabelEncoder (if needed)\n",
+ "label_encoder = LabelEncoder()\n",
+ "label_encoder.fit(y_train)\n",
+ "y_train_enc = label_encoder.transform(y_train)\n",
+ "y_test_enc = label_encoder.transform(y_test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 3 - ResNet"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if hasattr(torch, \"has_mps\") and torch.has_mps:\n",
+ " device = torch.device(\"mps\")\n",
+ "elif hasattr(torch, \"has_cuda\") and torch.has_cuda:\n",
+ " device = torch.device(\"cuda\")\n",
+ "else:\n",
+ " device = torch.device(\"cpu\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3.1 - nn.Module"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# from resnet import ResNet"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3.2 - Train"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " epoch train_loss dur\n",
+ "------- ------------ ------\n",
+ " 1 \u001b[36m2.8646\u001b[0m 0.4461\n",
+ " 2 \u001b[36m1.9534\u001b[0m 0.4322\n",
+ " 3 \u001b[36m1.8164\u001b[0m 0.4331\n",
+ " 4 \u001b[36m1.6889\u001b[0m 0.4318\n",
+ " 5 \u001b[36m1.5808\u001b[0m 0.4329\n",
+ " 6 \u001b[36m1.4659\u001b[0m 0.4355\n",
+ " 7 \u001b[36m1.2894\u001b[0m 0.4285\n",
+ " 8 1.3207 0.4280\n",
+ " 9 \u001b[36m1.1546\u001b[0m 0.4274\n",
+ " 10 \u001b[36m1.0586\u001b[0m 0.4287\n",
+ " 11 \u001b[36m1.0195\u001b[0m 0.4313\n",
+ " 12 \u001b[36m0.8246\u001b[0m 0.4302\n",
+ " 13 \u001b[36m0.7612\u001b[0m 0.4330\n",
+ " 14 \u001b[36m0.7296\u001b[0m 0.4315\n",
+ " 15 \u001b[36m0.6690\u001b[0m 0.4293\n",
+ " 16 \u001b[36m0.6205\u001b[0m 0.4291\n",
+ " 17 \u001b[36m0.5764\u001b[0m 0.4290\n",
+ " 18 \u001b[36m0.4839\u001b[0m 0.4284\n",
+ " 19 0.4984 0.4314\n",
+ " 20 \u001b[36m0.4666\u001b[0m 0.4324\n",
+ " 21 \u001b[36m0.4132\u001b[0m 0.4322\n",
+ " 22 0.4440 0.4300\n",
+ " 23 0.4463 0.4300\n",
+ " 24 \u001b[36m0.4075\u001b[0m 0.4287\n",
+ " 25 \u001b[36m0.3908\u001b[0m 0.4282\n",
+ " 26 \u001b[36m0.3759\u001b[0m 0.4278\n",
+ " 27 \u001b[36m0.3612\u001b[0m 0.4296\n",
+ " 28 \u001b[36m0.3189\u001b[0m 0.4281\n",
+ " 29 0.3489 0.4308\n",
+ " 30 0.3308 0.4301\n",
+ " 31 0.3353 0.4299\n",
+ " 32 \u001b[36m0.3074\u001b[0m 0.4298\n",
+ " 33 0.3339 0.4350\n",
+ " 34 \u001b[36m0.2921\u001b[0m 0.4383\n",
+ " 35 \u001b[36m0.2852\u001b[0m 0.4345\n",
+ " 36 0.3170 0.4334\n",
+ " 37 0.2853 0.4304\n",
+ " 38 0.2857 0.4307\n",
+ " 39 \u001b[36m0.2607\u001b[0m 0.4310\n",
+ " 40 0.2765 0.4292\n",
+ " 41 0.2831 0.4305\n",
+ " 42 0.2836 0.4295\n",
+ " 43 0.2742 0.4307\n",
+ " 44 0.2653 0.4302\n",
+ " 45 \u001b[36m0.2370\u001b[0m 0.4335\n",
+ " 46 0.2475 0.4292\n",
+ " 47 0.2692 0.4329\n",
+ " 48 0.2657 0.4306\n",
+ " 49 0.2875 0.4305\n",
+ " 50 0.2839 0.4315\n",
+ " 51 0.2555 0.4307\n",
+ " 52 0.2794 0.4332\n",
+ " 53 \u001b[36m0.2272\u001b[0m 0.4302\n",
+ " 54 0.2519 0.4305\n",
+ " 55 0.2388 0.4307\n",
+ " 56 0.2504 0.4314\n",
+ " 57 0.2345 0.4328\n",
+ " 58 \u001b[36m0.2252\u001b[0m 0.4316\n",
+ " 59 0.2436 0.4329\n",
+ " 60 0.2297 0.4309\n",
+ " 61 0.2594 0.4306\n",
+ " 62 0.2412 0.4300\n",
+ " 63 0.2399 0.4319\n",
+ " 64 0.2600 0.4334\n",
+ " 65 0.2599 0.4304\n",
+ " 66 0.2360 0.4317\n",
+ " 67 0.2537 0.4301\n",
+ " 68 0.2268 0.4299\n",
+ " 69 0.2436 0.4301\n",
+ " 70 \u001b[36m0.2193\u001b[0m 0.4308\n",
+ " 71 0.2284 0.4322\n",
+ " 72 0.2339 0.4317\n",
+ " 73 0.2330 0.4331\n",
+ " 74 \u001b[36m0.2063\u001b[0m 0.4327\n",
+ " 75 0.2568 0.4332\n",
+ " 76 0.2372 0.4324\n",
+ " 77 0.2249 0.4327\n",
+ " 78 0.2449 0.4314\n",
+ " 79 0.2455 0.4310\n",
+ " 80 \u001b[36m0.2003\u001b[0m 0.4321\n",
+ " 81 0.2172 0.4318\n",
+ " 82 0.2278 0.4333\n",
+ " 83 0.2178 0.4334\n",
+ " 84 0.2240 0.4312\n",
+ " 85 0.2329 0.4338\n",
+ " 86 0.2267 0.4326\n",
+ " 87 0.2479 0.4341\n",
+ " 88 0.2266 0.4355\n",
+ " 89 0.2541 0.4350\n",
+ " 90 0.2167 0.4324\n",
+ " 91 0.2282 0.4353\n",
+ " 92 0.2097 0.4367\n",
+ " 93 0.2038 0.4351\n",
+ " 94 0.2078 0.4372\n",
+ " 95 0.2437 0.4344\n",
+ " 96 0.2283 0.4333\n",
+ " 97 0.2263 0.4329\n",
+ " 98 0.2146 0.4346\n",
+ " 99 0.2238 0.4323\n",
+ " 100 0.2035 0.4348\n",
+ " 101 0.2287 0.4348\n",
+ " 102 0.2231 0.4328\n",
+ " 103 0.2171 0.4326\n",
+ " 104 0.2417 0.4329\n",
+ "Stopping since train_loss has not improved in the last 25 epochs.\n",
+ "0.941908713692946\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Define net\n",
+ "n_labels = np.unique(dataset.ground_truth).size\n",
+ "net = ResNet(in_channels=1, num_classes=n_labels)\n",
+ "\n",
+ "# Define model\n",
+ "model = skorch.NeuralNetClassifier(\n",
+ " module=net,\n",
+ " criterion=nn.CrossEntropyLoss(),\n",
+ " callbacks=[skorch.callbacks.EarlyStopping(monitor=\"train_loss\", patience=25)],\n",
+ " max_epochs=200,\n",
+ " lr=0.01,\n",
+ " batch_size=128,\n",
+ " train_split=None,\n",
+ " device=device,\n",
+ ")\n",
+ "\n",
+ "model.check_data(X_train_mfcc_torch, y_train_enc)\n",
+ "model.fit(X_train_mfcc_torch, y_train_enc)\n",
+ "\n",
+ "print(model.score(X_test_mfcc_torch, y_test_enc))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['./model/HOP_LENGHT.joblib']"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from joblib import dump, load\n",
+ "\n",
+ "dump(model, './model/model.joblib') \n",
+ "dump(only_mffc_transform, './model/only_mffc_transform.joblib') \n",
+ "dump(label_encoder, './model/label_encoder.joblib')\n",
+ "dump(SAMPLE_RATE, \"./model/SAMPLE_RATE.joblib\")\n",
+ "dump(METHOD, \"./model/METHOD.joblib\")\n",
+ "dump(MAX_TIME, \"./model/MAX_TIME.joblib\")\n",
+ "dump(N_MFCC, \"./model/N_MFCC.joblib\")\n",
+ "dump(HOP_LENGHT, \"./model/HOP_LENGHT.joblib\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = load('./model/model.joblib') \n",
+ "only_mffc_transform = load('./model/only_mffc_transform.joblib') \n",
+ "label_encoder = load('./model/label_encoder.joblib') \n",
+ "SAMPLE_RATE = load(\"./model/SAMPLE_RATE.joblib\")\n",
+ "METHOD = load(\"./model/METHOD.joblib\")\n",
+ "MAX_TIME = load(\"./model/MAX_TIME.joblib\")\n",
+ "N_MFCC = load(\"./model/N_MFCC.joblib\")\n",
+ "HOP_LENGHT = load(\"./model/HOP_LENGHT.joblib\")\n",
+ "\n",
+ "sklearn_model = Pipeline(\n",
+ " steps=[\n",
+ " (\"mfcc\", only_mffc_transform),\n",
+ " (\"model\", model)\n",
+ " ]\n",
+ " )\n",
+ "\n",
+ "uniform_lambda = lambda y, sr: uniformize(y, sr, METHOD, MAX_TIME)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
+ ]
+ }
+ ],
+ "source": [
+ "title = r\"ResNet 9\"\n",
+ "\n",
+ "description = r\"\"\"\n",
+ "<center>\n",
+ "The resnet9 model was trained to classify drone speech command.\n",
+ "<img src=\"http://zeus.blanchon.cc/dropshare/modia.png\" width=200px>\n",
+ "</center>\n",
+ "\"\"\"\n",
+ "article = r\"\"\"\n",
+ "- [Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385)\n",
+ "\"\"\"\n",
+ "\n",
+ "demo_men = gr.Interface(\n",
+ " title = title,\n",
+ " description = description,\n",
+ " article = article, \n",
+ " fn=lambda data: predict_gradio(\n",
+ " data=data, \n",
+ " uniform_lambda=uniform_lambda, \n",
+ " sklearn_model=sklearn_model,\n",
+ " label_transform=label_encoder,\n",
+ " target_sr=SAMPLE_RATE),\n",
+ " inputs = gr.Audio(source=\"microphone\", type=\"numpy\"),\n",
+ " outputs = gr.Label(),\n",
+ " # allow_flagging = \"manual\",\n",
+ " # flagging_options = ['recule', 'tournedroite', 'arretetoi', 'tournegauche', 'gauche', 'avance', 'droite'],\n",
+ " # flagging_dir = \"./flag/men\"\n",
+ ")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.10.4 ('ml')",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.4"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "f1f34988cae7bd54e626a86efbacac2b339eeffffea662e9af12f610fca26db7"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
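
As a possible follow-up to the training cell (not in the committed notebook), per-class performance on the held-out split could be inspected with scikit-learn's report; a sketch, assuming the notebook's variables are still in scope:

    # hypothetical extra cell: per-class metrics on the held-out split
    from sklearn.metrics import classification_report

    y_pred_enc = model.predict(X_test_mfcc_torch)
    print(classification_report(
        label_encoder.inverse_transform(y_test_enc),
        label_encoder.inverse_transform(y_pred_enc),
    ))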
dataloading.py ADDED
@@ -0,0 +1,107 @@
+ import numpy as np
+ import pandas as pd
+
+ import librosa
+
+ from pathlib import Path
+ from typing import Callable, Literal, Optional
+
+ def load_dataset(
+         paths: list[str],
+         remove_label: list[str] = [""],
+         sr: int = 22050,
+         method: Literal["fix_length", "time_stretch"] = "fix_length",
+         max_time: float = 4.0) -> tuple[pd.DataFrame, Callable[[np.ndarray, int], np.ndarray]]:
+     """Folder dataset in-memory loader (returns a fully loaded pandas DataFrame).
+     - For sklearn, the whole dataset is loaded in memory as numpy-compatible columns.
+     - For pytorch, the same data can be converted to tensors on the fly.
+
+     Use `to_numpy(df.y)` to extract a numpy matrix with a (n_row, ...) shape.
+
+     Expects a dataset folder structure such as: paths = [paths1, paths2, ...]
+     - paths1
+         - sub1
+             - blabla_GroundTruth1.wav
+             - blabla_GroundTruth2.wav
+         - sub2
+             - ...
+         - ...
+     - ...
+
+     Args:
+         paths (list[str]): list of dataset directories to parse.
+         remove_label (list[str], optional): list of labels to remove. Defaults to [""].
+         sr (int, optional): sample rate used to resample the audio files. Defaults to 22050.
+         method (Literal["fix_length", "time_stretch"], optional): uniformization method to apply. Defaults to "fix_length".
+         max_time (float, optional): common audio duration in seconds. Defaults to 4.0.
+
+     Returns:
+         df (pd.DataFrame): a DataFrame with the following columns:
+             - absolute_path (str): file-system absolute path of the .wav file.
+             - labels (list[str]): labels describing the sound file (i.e. subdirectories plus "_"-separated filename parts).
+             - ground_truth (str): ground-truth label, i.e. the last "_"-separated part of the filename.
+             - y_original_signal (np.ndarray): sound signal normalized as `float64` and resampled at the given sr by `librosa.load`.
+             - y_original_duration (float): duration of y_original_signal in seconds.
+             - y_uniform (np.ndarray): uniformized sound signal computed from y_original_signal with the chosen method.
+         uniform_transform (Callable[[np.ndarray, int], np.ndarray]): a lambda that uniformizes an audio signal the same way as in df.
+     """
+     data = []
+     uniform_transform = lambda y, sr: uniformize(y, sr, method, max_time)
+     for path in paths:
+         path = Path(path)
+         for wav_file in path.rglob("*.wav"):
+             wav_file_dict = dict()
+             absolute_path = wav_file.absolute()
+             *labels, label = absolute_path.relative_to(path.absolute()).parts
+             label = label.replace(".wav", "").split("_")
+             labels.extend(label)
+             ground_truth = labels[-1]
+             if ground_truth not in remove_label:
+                 y_original, sr = librosa.load(path=absolute_path, sr=sr)
+                 # WARNING: librosa.load resamples to the requested sr,
+                 # normalizes the bit depth between -1 and 1 and converts stereo to mono
+                 wav_file_dict["absolute_path"] = absolute_path
+                 wav_file_dict["labels"] = labels
+                 wav_file_dict["ground_truth"] = ground_truth
+                 ## Save original sound signal
+                 wav_file_dict["y_original_signal"] = y_original
+                 duration = librosa.get_duration(y=y_original, sr=sr)
+                 wav_file_dict["y_original_duration"] = duration
+                 ## Save uniformized sound signal
+                 wav_file_dict["y_uniform"] = uniform_transform(y_original, sr)
+                 data.append(wav_file_dict)
+     df = pd.DataFrame(data)
+     return df, uniform_transform
+
+ def uniformize(
+         audio: np.ndarray,
+         sr: int,
+         method: Literal["fix_length", "time_stretch"] = "fix_length",
+         max_time: float = 4.0
+ ):
+     if method == "fix_length":
+         return librosa.util.fix_length(audio, size=int(np.ceil(max_time*sr)))
+     elif method == "time_stretch":
+         duration = librosa.get_duration(y=audio, sr=sr)
+         return librosa.effects.time_stretch(audio, rate=duration/max_time)
+
+
+ def to_numpy(ds: pd.Series) -> np.ndarray:
+     """Transform a pd.Series (i.e. a column slice) into a numpy array of shape (n_row, flattened cell).
+
+     Args:
+         ds (pd.Series): column to transform into numpy.
+
+     Returns:
+         np.ndarray: resulting np.ndarray built from the ds pd.Series.
+     """
+     numpy_df = np.stack([*ds.to_numpy()])
+     C, *o = numpy_df.shape
+
+     if o:
+         return numpy_df.reshape(numpy_df.shape[0], np.prod(o))
+     else:
+         return numpy_df.reshape(numpy_df.shape[0])
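
A short usage sketch for this module, assuming a `../data/` folder laid out as in the `load_dataset` docstring (paths and parameters mirror the notebook, but the call itself is illustrative):

    # hypothetical usage of dataloading.py
    from dataloading import load_dataset, to_numpy

    df, uniform_transform = load_dataset(
        ["../data/"],                       # hypothetical dataset root
        remove_label=["penduleinverse"],    # labels to drop, if any
        sr=16_000,
        method="time_stretch",
        max_time=3.0,
    )
    X = to_numpy(df["y_uniform"])        # shape (n_clips, n_samples)
    y = to_numpy(df["ground_truth"])     # shape (n_clips,)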
gradio_utils.py ADDED
@@ -0,0 +1,42 @@
+
+ from typing import Callable, Optional
+
+ import numpy as np
+
+
+
+ import librosa
+
+ import gradio as gr
+
+ def predict_gradio(data: tuple[int, np.ndarray],
+                    uniform_lambda: Callable[[np.ndarray, int], np.ndarray],
+                    sklearn_model,
+                    label_transform,
+                    target_sr: int = 22_050) -> Optional[dict]:
+     if data is None:
+         return
+
+     classes = sklearn_model.classes_
+     if label_transform is not None:
+         classes = label_transform.inverse_transform(classes)
+
+
+     y, sr = data[1], data[0]
+     y_original_signal = load_as_librosa(y, sr, target_sr=target_sr)
+     y_uniform = uniform_lambda(y_original_signal, target_sr).astype(np.float32)
+     prediction = sklearn_model.predict_proba(y_uniform.reshape(1, -1))
+     result = {str(label): float(confidence) for (
+         label, confidence) in zip(classes, prediction.flatten())}
+     return result
+
+ def load_as_librosa(y: np.ndarray, sr: int, target_sr: int = 22050) -> np.ndarray:
+     data_dtype = y.dtype
+     dtype_min = np.iinfo(data_dtype).min
+     dtype_max = np.iinfo(data_dtype).max
+     dtype_range = np.abs(dtype_max-dtype_min)
+     y_normalize = (y.astype(np.float32)-dtype_min)/dtype_range
+     y_normalize_resample = librosa.resample(y=y_normalize,
+                                             orig_sr=sr,
+                                             target_sr=target_sr)
+     return y_normalize_resample
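
Gradio's microphone component with `type="numpy"` passes `predict_gradio` a `(sample_rate, samples)` tuple of integer PCM, which `load_as_librosa` rescales and resamples. A synthetic call might look like the sketch below; the random-noise input is made up, and the artifacts are assumed to exist under ./model/ as in this commit.

    # hypothetical call with fake microphone data (sketch only)
    import numpy as np
    from joblib import load
    from sklearn.pipeline import Pipeline
    from dataloading import uniformize
    from gradio_utils import predict_gradio

    sklearn_model = Pipeline(steps=[("mfcc", load("./model/only_mffc_transform.joblib")),
                                    ("model", load("./model/model.joblib"))])
    label_encoder = load("./model/label_encoder.joblib")

    fake_mic = (48_000, (np.random.randn(48_000) * 2**14).astype(np.int16))  # 1 s of int16 "audio"
    result = predict_gradio(
        data=fake_mic,
        uniform_lambda=lambda y, sr: uniformize(y, sr, "time_stretch", 3.0),
        sklearn_model=sklearn_model,
        label_transform=label_encoder,
        target_sr=16_000,
    )
    # result maps each command label to a confidence, as expected by gr.Label()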
model/HOP_LENGHT.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5ed7bcd9e9d07c9918817127d9d4d3862f00d680cf13572fd8776d611bddd7ee
+ size 15
model/MAX_TIME.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5c63e7c444792b99fe2d588a2454f6a5b45f23e4973a77e6f2e3e280d5385bd1
+ size 21
model/METHOD.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0225bfd3de4895f2472fde5df0f7f9d67b1b922e62e84395a41fefb3122a4d09
+ size 27
model/N_MFCC.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e148c4bd8680b2de4785d81d31a1e4fbbd65c87e687e64c68d68c52aa2c4004
+ size 5
model/SAMPLE_RATE.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:510a2ce6eba70c0d21f882833ca726e75e0d1a7cbae3badd55f96c0a8e909ede
+ size 15
model/label_encoder.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f350bf3ad2da734f600262b0384aa61125de535a3eff8b80640af0f06e319246
+ size 617
model/model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c88130d5500b9e58fb2bc8e5b3cce918c83fdb94c2361d991e24f79452328b00
+ size 53219183
model/only_mffc_transform.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d34fac514bbe21f95e0b62b679e86cced3a7b496c5bd12f087516d55bb9be71
+ size 255
preprocessing.py ADDED
@@ -0,0 +1,202 @@
+ import numpy as np
+ import torch
+ import librosa
+
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from typing import Callable, Optional
+
+ class ReductionTransformer(BaseEstimator, TransformerMixin):
+     def __init__(self, windows_number: int = 300, statistique: Callable[[np.ndarray], np.ndarray] = np.mean):
+         self.windows_number = windows_number
+         self.statistique = statistique
+
+     def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None):
+         return self
+
+     def fit_transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
+         self.fit(X, y)
+         return self.transform(X, y)
+
+     def transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
+         X_ = X.copy()
+         *c_, size_ = X_.shape
+         windows_size_ = size_//self.windows_number
+         metrique_clip = X_[..., :self.windows_number*windows_size_]
+         return np.apply_along_axis(self.statistique,
+                                    axis=-1,
+                                    arr=metrique_clip.reshape((*c_, self.windows_number, windows_size_)))
+
+     def inverse_transform(self, X: np.ndarray) -> np.ndarray:
+         raise NotImplementedError
+
+ class MeanTransformer(BaseEstimator, TransformerMixin):
+     def __init__(self, windows_number: int = 300):
+         self.windows_number = windows_number
+         self.windows_size = 0
+
+     def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None):
+         return self
+
+     def fit_transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
+         self.fit(X, y)
+         return self.transform(X, y)
+
+     def transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
+         X_ = X.copy()
+         *c_, size_ = X_.shape
+         windows_size_ = size_//self.windows_number
+         self.windows_size = windows_size_
+         metrique_clip = X_[..., :self.windows_number*windows_size_]
+         return np.mean(metrique_clip.reshape((*c_, self.windows_number, windows_size_)), axis=-1)
+
+     def inverse_transform(self, X: np.ndarray) -> np.ndarray:
+         original_size = self.windows_size*self.windows_number
+         X_reconstruct = np.interp(
+             x = np.arange(start=0, stop=original_size, step=1),
+             xp = np.arange(start=0, stop=original_size, step=self.windows_size),
+             fp = X
+         )
+         return X_reconstruct
+
+ class StdTransformer(BaseEstimator, TransformerMixin):
+     def __init__(self, windows_number: int = 300):
+         self.windows_number = windows_number
+
+     def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None):
+         return self
+
+     def fit_transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
+         self.fit(X, y)
+         return self.transform(X, y)
+
+     def transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
+         X_ = X.copy()
+         *c_, size_ = X_.shape
+         windows_size_ = size_//self.windows_number
+         metrique_clip = X_[..., :self.windows_number*windows_size_]
+         return np.std(metrique_clip.reshape((*c_, self.windows_number, windows_size_)), axis=-1)
+
+     def inverse_transform(self, X: np.ndarray) -> np.ndarray:
+         raise NotImplementedError
+
+ class MfccTransformer(BaseEstimator, TransformerMixin):
+     def __init__(self, sr: int = 22050, N_MFCC: int = 12, hop_length: int = 1024, reshape_output: bool = True):
+         self.sr = sr
+         self.N_MFCC = N_MFCC
+         self.hop_length = hop_length
+         self.reshape_output = reshape_output
+
+     def reshape(self, X: np.ndarray) -> np.ndarray:
+         X_ = X.copy()
+         c_, *_ = X_.shape
+         return X_.reshape(c_, -1, self.N_MFCC)
+
+     def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None):
+         return self
+
+     def fit_transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
+         self.fit(X, y)
+         return self.transform(X, y)
+
+     def transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
+         X_ = X.copy()
+         c_, *_ = X_.shape
+         mfcc = librosa.feature.mfcc(y=X_,
+                                     sr=self.sr,
+                                     hop_length=self.hop_length,
+                                     n_mfcc=self.N_MFCC
+                                     )
+         if self.reshape_output:
+             mfcc = mfcc.reshape(c_, -1)
+
+         return mfcc
+
+     def inverse_transform(self, X: np.ndarray) -> np.ndarray:
+         X_reconstruct = librosa.feature.inverse.mfcc_to_audio(
+             mfcc = X,
+             n_mels = self.N_MFCC,
+         )
+         return X_reconstruct
+
+ class MelTransformer(BaseEstimator, TransformerMixin):
+     def __init__(self, sr: int = 22050, N_MEL: int = 12, hop_length: int = 1024, reshape_output: bool = True):
+         self.sr = sr
+         self.N_MEL = N_MEL
+         self.hop_length = hop_length
+         self.reshape_output = reshape_output
+
+     def reshape(self, X: np.ndarray) -> np.ndarray:
+         X_ = X.copy()
+         c_, *_ = X_.shape
+         return X_.reshape(c_, -1, self.N_MEL)
+
+     def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None):
+         return self
+
+     def fit_transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
+         self.fit(X, y)
+         return self.transform(X, y)
+
+     def transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
+         X_ = X.copy()
+         c_, *_ = X_.shape
+         mel = librosa.feature.melspectrogram(y=X_,
+                                              sr=self.sr,
+                                              hop_length=self.hop_length,
+                                              n_mels=self.N_MEL
+                                              )
+         if self.reshape_output:
+             mel = mel.reshape(c_, -1)
+
+         return mel
+
+     def inverse_transform(self, X: np.ndarray) -> np.ndarray:
+         X_reconstruct = librosa.feature.inverse.mel_to_audio(
+             M = X,
+             sr = self.sr,
+             hop_length = self.hop_length
+         )
+         return X_reconstruct
+
+ class TorchTransform(BaseEstimator, TransformerMixin):
+     def __init__(self):
+         pass
+
+     def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None):
+         return self
+
+     def fit_transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> torch.Tensor:
+         self.fit(X, y)
+         return self.transform(X, y)
+
+     def transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> torch.Tensor:
+         return torch.tensor(X).unsqueeze(dim=1)
+
+     def inverse_transform(self, X: torch.Tensor) -> np.ndarray:
+         return np.array(X.squeeze(dim=1))
+
+ class ShuffleTransformer(BaseEstimator, TransformerMixin):
+     def __init__(self, p: float = 0.005):
+         self.p = p
+
+     def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None):
+         return self
+
+     def fit_transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
+         self.fit(X, y)
+         return self.transform(X, y)
+
+     def transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
+         will_swap = np.random.choice(X.shape[0], int(self.p*X.shape[0]))
+         will_swap_with = np.random.choice(X.shape[0], int(self.p*X.shape[0]))
+         if hasattr(X, "copy"):
+             X_ = X.copy()
+         elif hasattr(X, "clone"):
+             X_ = X.clone()
+         else:
+             X_ = X
+         X_[will_swap, ...] = X_[will_swap_with, ...]
+         return X_
+
+     def inverse_transform(self, X: np.ndarray) -> np.ndarray:
+         raise NotImplementedError
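
The shape flow these transformers implement, under the parameters used in the notebook (16 kHz, 3 s clips, 64 MFCCs, hop 2048), can be checked on random data; a small sketch with assumed shapes:

    # hypothetical shape check for MfccTransformer + TorchTransform
    import numpy as np
    from preprocessing import MfccTransformer, TorchTransform

    X = np.random.randn(4, 48_000).astype(np.float32)   # 4 clips of 3 s at 16 kHz
    mfcc = MfccTransformer(sr=16_000, N_MFCC=64, hop_length=2_048,
                           reshape_output=False).fit_transform(X)
    print(mfcc.shape)                                    # (4, 64, 24): (clip, mfcc, frame)
    batch = TorchTransform().fit_transform(mfcc)
    print(batch.shape)                                   # torch.Size([4, 1, 64, 24]): channel dim for the CNN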
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ numpy
+ matplotlib
+ torch
+ pandas
+ scikit-learn
+ skorch
+ librosa
+ gradio
resnet.py ADDED
@@ -0,0 +1,70 @@
+ import torch
+ import torch.nn as nn
+
+
+ class ResNet(nn.Module):
+     def __init__(self, in_channels: int, num_classes: int):
+         """ResNet9"""
+         super().__init__()
+
+         self.conv1 = ConvBlock(in_channels, 64)
+         self.conv2 = ConvBlock(64, 128, pool=True)
+         self.res1 = nn.Sequential(
+             ConvBlock(128, 128),
+             ConvBlock(128, 128)
+         )
+
+         self.conv3 = ConvBlock(128, 256)
+         self.conv4 = ConvBlock(256, 512, pool=True)
+         self.res2 = nn.Sequential(
+             ConvBlock(512, 512),
+             ConvBlock(512, 512)
+         )
+
+         self.classifier = nn.Sequential(
+             nn.MaxPool2d(kernel_size=(4, 4)),
+             nn.AdaptiveAvgPool2d(1),
+             nn.Flatten(),
+             nn.Linear(512, 128),
+             nn.Dropout(0.25),
+             nn.Linear(128, num_classes),
+             nn.Dropout(0.25),
+         )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.conv1(x)
+         x = self.conv2(x)
+         x = self.res1(x) + x  # skip
+         x = self.conv3(x)
+         x = self.conv4(x)
+         x = self.res2(x) + x  # skip
+         prediction = self.classifier(x)
+         return prediction
+
+ class ConvBlock(nn.Module):
+     def __init__(self, in_channels: int, out_channels: int, pool: bool = False, pool_no: int = 2):
+         super().__init__()
+         self.in_channels = in_channels
+         self.out_channels = out_channels
+         self.pool = pool
+         self.pool_no = pool_no
+
+         if self.pool:
+             self.pool_block = nn.Sequential(
+                 nn.ReLU(inplace=True),
+                 nn.MaxPool2d(self.pool_no)
+             )
+         else:
+             self.pool_block = nn.Sequential(
+                 nn.ReLU(inplace=True),
+             )
+
+         self.block = nn.Sequential(
+             nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
+             nn.BatchNorm2d(out_channels),
+             self.pool_block
+         )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.block(x)
+         return x
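
A quick forward-pass sketch for this module with a dummy MFCC batch of the size produced by the preprocessing above (the shapes are assumptions for illustration, not part of the commit):

    # hypothetical smoke test for the ResNet9 module
    import torch
    from resnet import ResNet

    net = ResNet(in_channels=1, num_classes=7)   # 7 drone commands in this commit
    dummy = torch.randn(8, 1, 64, 24)            # (batch, channel, n_mfcc, frames)
    logits = net(dummy)
    print(logits.shape)                          # torch.Size([8, 7])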