Spaces:

zeno-ml
/

audio-transcription

Running

File size: 15,496 Bytes

c24ff9a

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"./speakers_all.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['id'] = df['filename'].apply(lambda x: x + \".wav\")\n",
    "df = df[df['file_missing?'] == False]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>age_onset</th>\n",
       "      <th>birthplace</th>\n",
       "      <th>filename</th>\n",
       "      <th>native_language</th>\n",
       "      <th>sex</th>\n",
       "      <th>speakerid</th>\n",
       "      <th>country</th>\n",
       "      <th>file_missing?</th>\n",
       "      <th>Unnamed: 9</th>\n",
       "      <th>Unnamed: 10</th>\n",
       "      <th>Unnamed: 11</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>27.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>virginia, south africa</td>\n",
       "      <td>afrikaans1</td>\n",
       "      <td>afrikaans</td>\n",
       "      <td>female</td>\n",
       "      <td>1</td>\n",
       "      <td>south africa</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>afrikaans1.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>40.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>pretoria, south africa</td>\n",
       "      <td>afrikaans2</td>\n",
       "      <td>afrikaans</td>\n",
       "      <td>male</td>\n",
       "      <td>2</td>\n",
       "      <td>south africa</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>afrikaans2.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>43.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>pretoria, transvaal, south africa</td>\n",
       "      <td>afrikaans3</td>\n",
       "      <td>afrikaans</td>\n",
       "      <td>male</td>\n",
       "      <td>418</td>\n",
       "      <td>south africa</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>afrikaans3.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>26.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>pretoria, south africa</td>\n",
       "      <td>afrikaans4</td>\n",
       "      <td>afrikaans</td>\n",
       "      <td>male</td>\n",
       "      <td>1159</td>\n",
       "      <td>south africa</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>afrikaans4.wav</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>19.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>cape town, south africa</td>\n",
       "      <td>afrikaans5</td>\n",
       "      <td>afrikaans</td>\n",
       "      <td>male</td>\n",
       "      <td>1432</td>\n",
       "      <td>south africa</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>afrikaans5.wav</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     age  age_onset                         birthplace    filename  \\\n",
       "32  27.0        9.0             virginia, south africa  afrikaans1   \n",
       "33  40.0        5.0             pretoria, south africa  afrikaans2   \n",
       "34  43.0        4.0  pretoria, transvaal, south africa  afrikaans3   \n",
       "35  26.0        8.0             pretoria, south africa  afrikaans4   \n",
       "36  19.0        6.0            cape town, south africa  afrikaans5   \n",
       "\n",
       "   native_language     sex  speakerid       country  file_missing?  \\\n",
       "32       afrikaans  female          1  south africa          False   \n",
       "33       afrikaans    male          2  south africa          False   \n",
       "34       afrikaans    male        418  south africa          False   \n",
       "35       afrikaans    male       1159  south africa          False   \n",
       "36       afrikaans    male       1432  south africa          False   \n",
       "\n",
       "    Unnamed: 9  Unnamed: 10 Unnamed: 11              id  \n",
       "32         NaN          NaN         NaN  afrikaans1.wav  \n",
       "33         NaN          NaN         NaN  afrikaans2.wav  \n",
       "34         NaN          NaN         NaN  afrikaans3.wav  \n",
       "35         NaN          NaN         NaN  afrikaans4.wav  \n",
       "36         NaN          NaN         NaN  afrikaans5.wav  "
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['label'] = \"Please call Stella.  Ask her to bring these things with her from the store:  Six spoons of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob.  We also need a small plastic snake and a big toy frog for the kids.  She can scoop these things into three red bags, and we will go meet her Wednesday at the train station.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['age', 'age_onset', 'birthplace', 'filename', 'native_language', 'sex',\n",
       "       'speakerid', 'country', 'file_missing?', 'Unnamed: 9', 'Unnamed: 10',\n",
       "       'Unnamed: 11', 'id', 'label'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.drop(\"Unnamed: 9\", axis=1)\n",
    "df = df.drop(\"Unnamed: 10\", axis=1)\n",
    "df = df.drop(\"Unnamed: 11\", axis=1)\n",
    "df = df.drop(\"file_missing?\", axis=1)\n",
    "df = df.drop(\"filename\", axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.loc[df['sex'] == 'famale', 'sex'] = 'female'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pycountry_convert as pc\n",
    "\n",
    "def country_to_continent(country_name):\n",
    "    try:\n",
    "        country_alpha2 = pc.country_name_to_country_alpha2(country_name, cn_name_format=pc.COUNTRY_NAME_FORMAT_LOWER)\n",
    "        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)\n",
    "        country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)\n",
    "        return country_continent_name\n",
    "    except:\n",
    "        return None\n",
    "\n",
    "df[\"continent\"] = df[\"country\"].map(lambda x: country_to_continent(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False    1647\n",
       "True      493\n",
       "Name: continent, dtype: int64"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"continent\"].isnull().value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.drop([1544, 1771])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv(\"metadata.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Whisper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 139M/139M [00:04<00:00, 30.3MiB/s]\n"
     ]
    }
   ],
   "source": [
    "import whisper\n",
    "model = whisper.load_model(\"base\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.8/site-packages/whisper/transcribe.py:78: UserWarning: FP16 is not supported on CPU; using FP32 instead\n",
      "  warnings.warn(\"FP16 is not supported on CPU; using FP32 instead\")\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "expected np.ndarray (got list)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m/var/folders/tq/kqg2ct9d123gd0wmshf2bd3r0000gp/T/ipykernel_157/3894641212.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtranscribe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"/Users/acabrera/dev/data/speech-accent-archive/recordings/recordings/afrikaans1.wav\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"/Users/acabrera/dev/data/speech-accent-archive/recordings/recordings/afrikaans1.wav\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/opt/anaconda3/lib/python3.8/site-packages/whisper/transcribe.py\u001b[0m in \u001b[0;36mtranscribe\u001b[0;34m(model, audio, verbose, temperature, compression_ratio_threshold, logprob_threshold, no_speech_threshold, condition_on_previous_text, **decode_options)\u001b[0m\n\u001b[1;32m     82\u001b[0m         \u001b[0mdecode_options\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"fp16\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     83\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 84\u001b[0;31m     \u001b[0mmel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlog_mel_spectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maudio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     85\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     86\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mdecode_options\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"language\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/opt/anaconda3/lib/python3.8/site-packages/whisper/audio.py\u001b[0m in \u001b[0;36mlog_mel_spectrogram\u001b[0;34m(audio, n_mels)\u001b[0m\n\u001b[1;32m    110\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maudio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    111\u001b[0m             \u001b[0maudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_audio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maudio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 112\u001b[0;31m         \u001b[0maudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_numpy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maudio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    113\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    114\u001b[0m     \u001b[0mwindow\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhann_window\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mN_FFT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maudio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mTypeError\u001b[0m: expected np.ndarray (got list)"
     ]
    }
   ],
   "source": [
    "result = model.transcribe([\"/Users/acabrera/dev/data/speech-accent-archive/recordings/recordings/afrikaans1.wav\", \"/Users/acabrera/dev/data/speech-accent-archive/recordings/recordings/afrikaans1.wav\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "' Please call Stella, ask her to bring these things with her from the store. 6 spoons of fresh snow peas, 5 thick slabs of blue cheese and maybe a snack for her brother Bob. We also need a small plastic snake and a big twig frog for the kids. She can scoop these things into free-rate bags and we will go meet a wind state train station.'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result[\"text\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.12 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "40d3a090f54c6569ab1632332b64b2c03c39dcf918b08424e98f38b5ae0af88f"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}