{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TIMIT to CMUdict-style phone transcripts\n",
    "\n",
    "Filters the `timit_asr` dataset, maps TIMIT phone labels onto the symbol inventory defined below, re-splits the data 90/10 with a fixed seed, and writes `train`/`valid` manifest (`.tsv`) and transcript (`.ltr`) files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset, concatenate_datasets\n",
    "import soundfile as sf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): all four special tokens are empty strings here - confirm the intended values.\n",
    "PAD = \"\"\n",
    "UNK = \"\"\n",
    "SIL = \"\"\n",
    "SPN = \"\"\n",
    "\n",
    "MAX_TOKENS = 1120000\n",
    "manifest_path = \"manifest.tsv\"\n",
    "transcript_path = \"transcript\"\n",
    "\n",
    "# Seed for the train/valid shuffle so that Restart & Run All reproduces the same split.\n",
    "SEED = 42"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Symbol inventory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "VOCAB_ITEMS =\"\"\"\n",
    "AA\n",
    "AE\n",
    "AH\n",
    "AO\n",
    "AW\n",
    "AX\n",
    "AY\n",
    "EH\n",
    "ER\n",
    "EY\n",
    "IH\n",
    "IY\n",
    "OW\n",
    "OY\n",
    "UH\n",
    "UW\n",
    "UX\n",
    "B\n",
    "CH\n",
    "D\n",
    "DH\n",
    "DX\n",
    "EL\n",
    "EM\n",
    "EN\n",
    "F\n",
    "G\n",
    "HH\n",
    "JH\n",
    "K\n",
    "L\n",
    "M\n",
    "N\n",
    "NG\n",
    "NX\n",
    "P\n",
    "Q\n",
    "R\n",
    "S\n",
    "SH\n",
    "T\n",
    "TH\n",
    "V\n",
    "W\n",
    "WH\n",
    "Y\n",
    "Z\n",
    "ZH\n",
    " \n",
    ".\n",
    ",\n",
    "?\n",
    "!\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the empty strings produced by the literal's leading and trailing newlines.\n",
    "_VOCAB_SPLIT = VOCAB_ITEMS.split(\"\\n\")[1:-1]\n",
    "\n",
    "# Symbol -> position in the inventory above.\n",
    "VOCAB = {symbol: index for index, symbol in enumerate(_VOCAB_SPLIT)}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TIMIT phone mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TIMIT label -> replacement symbol, or list of symbols when one label expands to several.\n",
    "# NOTE(review): 'pau' and 'epi' map to an empty string, which is still emitted as an\n",
    "# (empty) symbol by map_timit_to_cmudict - confirm this is intended.\n",
    "TIMIT_MAPPING = {\n",
    "    'ax': 'AH',\n",
    "    'ax-h': 'AH',\n",
    "    'axr': 'ER',\n",
    "    'dx': 'T',\n",
    "    'el': ['AH', 'L'],\n",
    "    'em': ['AH', 'M'],\n",
    "    'en': ['AH', 'N'],\n",
    "    'eng': ['IH', 'NG'],\n",
    "    'hv': 'HH',\n",
    "    'ix': 'IH',\n",
    "    'nx': ['N', 'T'],\n",
    "    'pau': '',\n",
    "    'epi': '',\n",
    "    'ux': 'UW'\n",
    "}\n",
    "\n",
    "# Labels dropped from the transcript by map_timit_to_cmudict.\n",
    "TIMIT_IGNORE = ['bcl', 'dcl', 'gcl', 'kcl', 'pcl', 'tcl']\n",
    "\n",
    "# Utterances containing any of these labels are removed from the dataset (see is_discardable).\n",
    "TIMIT_DISCARD = ['dx', 'nx', 'q']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def map_timit_to_cmudict(timit):\n",
    "    \"\"\"Translate one utterance's TIMIT phone labels into inventory symbols.\n",
    "\n",
    "    A leading and/or trailing \"h#\" marker is stripped. Labels found in\n",
    "    TIMIT_MAPPING are replaced by their mapped symbol(s), labels in TIMIT_IGNORE\n",
    "    are dropped, and every other label is upper-cased. An upper-cased label that\n",
    "    is missing from VOCAB is reported with a printed message but still emitted.\n",
    "\n",
    "    Args:\n",
    "        timit: sequence of TIMIT phone label strings; may be empty.\n",
    "\n",
    "    Returns:\n",
    "        List of mapped symbol strings (empty for empty input).\n",
    "    \"\"\"\n",
    "    output = []\n",
    "    if len(timit) == 0:\n",
    "        # Nothing to map; indexing timit[0] below would raise IndexError.\n",
    "        return output\n",
    "\n",
    "    start = 1 if timit[0] == \"h#\" else 0\n",
    "    end = -1 if timit[-1] == \"h#\" else None\n",
    "    timit = timit[start:end]\n",
    "\n",
    "    for phone in timit:\n",
    "        if phone in TIMIT_MAPPING:\n",
    "            mapped = TIMIT_MAPPING[phone]\n",
    "            if isinstance(mapped, list):\n",
    "                output.extend(mapped)\n",
    "            else:\n",
    "                output.append(mapped)\n",
    "        elif phone in TIMIT_IGNORE:\n",
    "            continue\n",
    "        else:\n",
    "            symbol = phone.upper()\n",
    "            if symbol not in VOCAB:\n",
    "                print(\"Invalid phone\", symbol)\n",
    "            output.append(symbol)\n",
    "    return output"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and filter TIMIT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "timit_raw = load_dataset('timit_asr')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_discardable(batch):\n",
    "    \"\"\"Return True when the utterance contains none of the TIMIT_DISCARD labels.\n",
    "\n",
    "    Despite the name, a True result marks an example to KEEP: the function is\n",
    "    passed to `Dataset.filter`, which retains the examples it returns True for.\n",
    "\n",
    "    Args:\n",
    "        batch: a single example with labels under [\"phonetic_detail\"][\"utterance\"].\n",
    "    \"\"\"\n",
    "    return not any(\n",
    "        phoneme in TIMIT_DISCARD\n",
    "        for phoneme in batch[\"phonetic_detail\"][\"utterance\"]\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter both original splits and pool them; the data is re-split below.\n",
    "# Distinct names (timit_raw -> timit_all) keep this cell safe to re-run.\n",
    "timit_all = concatenate_datasets(\n",
    "    [timit_raw[split].filter(is_discardable) for split in (\"train\", \"test\")]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Re-split and export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Manifest root: the first example's path up to and including its first \"/data/\" component.\n",
    "BASE = timit_all[0][\"file\"].split(\"/data/\")[0] + \"/data/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "resplit = timit_all.train_test_split(test_size=0.1, seed=SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The held-out \"test\" part of the re-split is written out under the name \"valid\".\n",
    "OUTPUT_NAMES = {\"train\": \"train\", \"test\": \"valid\"}\n",
    "\n",
    "for split, fsplit in OUTPUT_NAMES.items():\n",
    "    with open(f\"{fsplit}.tsv\", \"w\") as manifest, open(f\"{fsplit}.ltr\", \"w\") as transcript:\n",
    "        # First manifest line is the root directory; the rest are relative paths.\n",
    "        manifest.write(BASE + \"\\n\")\n",
    "        for item in resplit[split]:\n",
    "            # Header-only read: the frame count is all the manifest needs.\n",
    "            num_frames = sf.info(item[\"file\"]).frames\n",
    "            manifest.write(f\"{item['file'].replace(BASE, '')}\\t{num_frames}\\n\")\n",
    "            mapped = map_timit_to_cmudict(item[\"phonetic_detail\"][\"utterance\"])\n",
    "            transcript.write(f\"{' '.join(mapped)}\\n\")"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "279d017b1d681737e71f35b98eaa9087df824225149f0ac59acfe151b4fa281b"
  },
  "kernelspec": {
   "display_name": "Python 3.8.12 ('psst')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}