{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Audio task notebook template\n",
    "## Loading the necessary libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard library\n",
    "import json\n",
    "import random\n",
    "import sys\n",
    "from datetime import datetime\n",
    "\n",
    "# Third-party\n",
    "import librosa\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from datasets import load_dataset\n",
    "from fastapi import APIRouter\n",
    "from huggingface_hub import login\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "# The project helpers are imported from ../tasks; guard the append so that\n",
    "# re-running this cell does not keep growing sys.path.\n",
    "if '../tasks' not in sys.path:\n",
    "    sys.path.append('../tasks')\n",
    "\n",
    "from utils.evaluation import AudioEvaluationRequest\n",
    "from utils.emissions import tracker, clean_emissions_data, get_space_info\n",
    "\n",
    "\n",
    "# Define the label mapping\n",
    "LABEL_MAPPING = {\n",
    "    \"chainsaw\": 0,\n",
    "    \"environment\": 1\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Authenticate with the Hugging Face Hub.\n",
    "# NOTE(review): presumably required to access the dataset loaded below - confirm.\n",
    "login()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Loading the datasets and splitting them"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluation settings (dataset name, test size, seed) come from the request defaults\n",
    "request = AudioEvaluationRequest()\n",
    "\n",
    "# Load and prepare the dataset\n",
    "dataset = load_dataset(request.dataset_name)\n",
    "\n",
    "# Split dataset: the evaluation set is carved out of the \"train\" split\n",
    "train_test = dataset[\"train\"].train_test_split(test_size=request.test_size, seed=request.test_seed)\n",
    "test_dataset = train_test[\"test\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `test` is the dataset's own \"test\" split; it is distinct from `test_dataset`,\n",
    "# which was split off from \"train\" above.\n",
    "train = dataset[\"train\"]\n",
    "test = dataset['test']\n",
    "\n",
    "train_df = pd.DataFrame(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flatten the per-row `audio` dict into one column per field\n",
    "train_df[\"path\"] = train_df[\"audio\"].apply(lambda x: x['path'])\n",
    "train_df[\"array\"] = train_df[\"audio\"].apply(lambda x: x['array'])\n",
    "train_df[\"sampling_rate\"] = train_df[\"audio\"].apply(lambda x: x['sampling_rate'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Target sampling rate\n",
    "target_sr = 12000\n",
    "\n",
    "# Function to resample the audio array\n",
    "def resample_audio(array, orig_sr, target_sr):\n",
    "    \"\"\"Return `array` as a NumPy array, resampled to `target_sr` when needed.\n",
    "\n",
    "    Args:\n",
    "        array: Audio samples; converted with `np.array`.\n",
    "        orig_sr: Sampling rate of `array`.\n",
    "        target_sr: Desired sampling rate.\n",
    "\n",
    "    Returns:\n",
    "        The converted array, passed through `librosa.resample` only when\n",
    "        `orig_sr` differs from `target_sr`.\n",
    "    \"\"\"\n",
    "    array = np.array(array)  # Ensure it's a numpy array\n",
    "    if orig_sr != target_sr:\n",
    "        array = librosa.resample(array, orig_sr=orig_sr, target_sr=target_sr)\n",
    "    return array\n",
    "\n",
    "# Apply resampling to each row.\n",
    "# The original rate is read from the untouched `audio` dict rather than from the\n",
    "# `sampling_rate` column: that column is overwritten below, so reading it would\n",
    "# make a second run of this cell skip resampling entirely.\n",
    "train_df[\"resampled_array\"] = train_df.apply(\n",
    "    lambda row: resample_audio(row[\"array\"], row[\"audio\"][\"sampling_rate\"], target_sr), axis=1\n",
    ")\n",
    "\n",
    "# Update the sampling rate column to reflect the target rate\n",
    "train_df[\"sampling_rate\"] = target_sr\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df[\"sampling_rate\"].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Random Baseline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start tracking emissions\n",
    "tracker.start()\n",
    "tracker.start_task(\"inference\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#--------------------------------------------------------------------------------------------\n",
    "# YOUR MODEL INFERENCE CODE HERE\n",
    "# Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.\n",
    "#--------------------------------------------------------------------------------------------\n",
    "\n",
    "# Make random predictions (placeholder for actual model inference)\n",
    "true_labels = test_dataset[\"label\"]\n",
    "\n",
    "# Draw from a private generator seeded with the request's seed so the baseline\n",
    "# is reproducible across kernel restarts without touching the global `random` state.\n",
    "rng = random.Random(request.test_seed)\n",
    "predictions = [rng.randint(0, 1) for _ in range(len(true_labels))]\n",
    "\n",
    "#--------------------------------------------------------------------------------------------\n",
    "# YOUR MODEL INFERENCE STOPS HERE\n",
    "#--------------------------------------------------------------------------------------------\n",
    "\n",
    "# Preview only the first few predictions instead of dumping the whole list\n",
    "predictions[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stop tracking emissions\n",
    "emissions_data = tracker.stop_task()\n",
    "emissions_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate accuracy\n",
    "accuracy = accuracy_score(true_labels, predictions)\n",
    "accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare results dictionary\n",
    "# NOTE(review): the * 1000 factors presumably convert kWh -> Wh and kg -> g\n",
    "# to match the key names - confirm against the tracker's units.\n",
    "results = {\n",
    "    \"submission_timestamp\": datetime.now().isoformat(),\n",
    "    \"accuracy\": float(accuracy),\n",
    "    \"energy_consumed_wh\": emissions_data.energy_consumed * 1000,\n",
    "    \"emissions_gco2eq\": emissions_data.emissions * 1000,\n",
    "    \"emissions_data\": clean_emissions_data(emissions_data),\n",
    "    \"dataset_config\": {\n",
    "        \"dataset_name\": request.dataset_name,\n",
    "        \"test_size\": request.test_size,\n",
    "        \"test_seed\": request.test_seed\n",
    "    }\n",
    "}\n",
    "\n",
    "results"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}