{ "cells": [ { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"./speakers_all.csv\")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "df['id'] = df['filename'].apply(lambda x: x + \".wav\")\n", "df = df[df['file_missing?'] == False]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ageage_onsetbirthplacefilenamenative_languagesexspeakeridcountryfile_missing?Unnamed: 9Unnamed: 10Unnamed: 11id
3227.09.0virginia, south africaafrikaans1afrikaansfemale1south africaFalseNaNNaNNaNafrikaans1.wav
3340.05.0pretoria, south africaafrikaans2afrikaansmale2south africaFalseNaNNaNNaNafrikaans2.wav
3443.04.0pretoria, transvaal, south africaafrikaans3afrikaansmale418south africaFalseNaNNaNNaNafrikaans3.wav
3526.08.0pretoria, south africaafrikaans4afrikaansmale1159south africaFalseNaNNaNNaNafrikaans4.wav
3619.06.0cape town, south africaafrikaans5afrikaansmale1432south africaFalseNaNNaNNaNafrikaans5.wav
\n", "
" ], "text/plain": [ " age age_onset birthplace filename \\\n", "32 27.0 9.0 virginia, south africa afrikaans1 \n", "33 40.0 5.0 pretoria, south africa afrikaans2 \n", "34 43.0 4.0 pretoria, transvaal, south africa afrikaans3 \n", "35 26.0 8.0 pretoria, south africa afrikaans4 \n", "36 19.0 6.0 cape town, south africa afrikaans5 \n", "\n", " native_language sex speakerid country file_missing? \\\n", "32 afrikaans female 1 south africa False \n", "33 afrikaans male 2 south africa False \n", "34 afrikaans male 418 south africa False \n", "35 afrikaans male 1159 south africa False \n", "36 afrikaans male 1432 south africa False \n", "\n", " Unnamed: 9 Unnamed: 10 Unnamed: 11 id \n", "32 NaN NaN NaN afrikaans1.wav \n", "33 NaN NaN NaN afrikaans2.wav \n", "34 NaN NaN NaN afrikaans3.wav \n", "35 NaN NaN NaN afrikaans4.wav \n", "36 NaN NaN NaN afrikaans5.wav " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "df['label'] = \"Please call Stella. Ask her to bring these things with her from the store: Six spoons of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob. We also need a small plastic snake and a big toy frog for the kids. She can scoop these things into three red bags, and we will go meet her Wednesday at the train station.\"" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['age', 'age_onset', 'birthplace', 'filename', 'native_language', 'sex',\n", " 'speakerid', 'country', 'file_missing?', 'Unnamed: 9', 'Unnamed: 10',\n", " 'Unnamed: 11', 'id', 'label'],\n", " dtype='object')" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "df = df.drop(\"Unnamed: 9\", axis=1)\n", "df = df.drop(\"Unnamed: 10\", axis=1)\n", "df = df.drop(\"Unnamed: 11\", axis=1)\n", "df = df.drop(\"file_missing?\", axis=1)\n", "df = df.drop(\"filename\", axis=1)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "df.loc[df['sex'] == 'famale', 'sex'] = 'female'" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "import pycountry_convert as pc\n", "\n", "def country_to_continent(country_name):\n", " try:\n", " country_alpha2 = pc.country_name_to_country_alpha2(country_name, cn_name_format=pc.COUNTRY_NAME_FORMAT_LOWER)\n", " country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)\n", " country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)\n", " return country_continent_name\n", " except:\n", " return None\n", "\n", "df[\"continent\"] = df[\"country\"].map(lambda x: country_to_continent(x))" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False 1647\n", "True 493\n", "Name: continent, dtype: int64" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"continent\"].isnull().value_counts()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "df = df.drop([1544, 1771])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "df.to_csv(\"metadata.csv\", index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Whisper" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|███████████████████████████████████████| 139M/139M [00:04<00:00, 30.3MiB/s]\n" ] } ], "source": [ "import whisper\n", "model = whisper.load_model(\"base\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/anaconda3/lib/python3.8/site-packages/whisper/transcribe.py:78: UserWarning: FP16 is not supported on CPU; using FP32 instead\n", " warnings.warn(\"FP16 is not supported on CPU; using FP32 instead\")\n" ] }, { "ename": "TypeError", "evalue": "expected np.ndarray (got list)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/var/folders/tq/kqg2ct9d123gd0wmshf2bd3r0000gp/T/ipykernel_157/3894641212.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtranscribe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"/Users/acabrera/dev/data/speech-accent-archive/recordings/recordings/afrikaans1.wav\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"/Users/acabrera/dev/data/speech-accent-archive/recordings/recordings/afrikaans1.wav\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/opt/anaconda3/lib/python3.8/site-packages/whisper/transcribe.py\u001b[0m in \u001b[0;36mtranscribe\u001b[0;34m(model, audio, verbose, temperature, compression_ratio_threshold, logprob_threshold, no_speech_threshold, condition_on_previous_text, **decode_options)\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0mdecode_options\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"fp16\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 84\u001b[0;31m \u001b[0mmel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlog_mel_spectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maudio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 85\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdecode_options\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"language\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/opt/anaconda3/lib/python3.8/site-packages/whisper/audio.py\u001b[0m in \u001b[0;36mlog_mel_spectrogram\u001b[0;34m(audio, n_mels)\u001b[0m\n\u001b[1;32m 110\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maudio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[0maudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_audio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maudio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 112\u001b[0;31m \u001b[0maudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_numpy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maudio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 113\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 114\u001b[0m \u001b[0mwindow\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhann_window\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mN_FFT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maudio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mTypeError\u001b[0m: expected np.ndarray (got list)" ] } ], "source": [ "result = model.transcribe([\"/Users/acabrera/dev/data/speech-accent-archive/recordings/recordings/afrikaans1.wav\", \"/Users/acabrera/dev/data/speech-accent-archive/recordings/recordings/afrikaans1.wav\"])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' Please call Stella, ask her to bring these things with her from the store. 6 spoons of fresh snow peas, 5 thick slabs of blue cheese and maybe a snack for her brother Bob. We also need a small plastic snake and a big twig frog for the kids. She can scoop these things into free-rate bags and we will go meet a wind state train station.'" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result[\"text\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3.8.12 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "40d3a090f54c6569ab1632332b64b2c03c39dcf918b08424e98f38b5ae0af88f" } } }, "nbformat": 4, "nbformat_minor": 2 }