jimregan
/

psst-partial-timit

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset, concatenate_datasets\n",
+    "import soundfile as sf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "PAD = \"<pad>\"\n",
+    "UNK = \"<unk>\"\n",
+    "SIL = \"<sil>\"\n",
+    "SPN = \"<spn>\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "VOCAB_ITEMS =\"\"\"\n",
+    "AA\n",
+    "AE\n",
+    "AH\n",
+    "AO\n",
+    "AW\n",
+    "AX\n",
+    "AY\n",
+    "EH\n",
+    "ER\n",
+    "EY\n",
+    "IH\n",
+    "IY\n",
+    "OW\n",
+    "OY\n",
+    "UH\n",
+    "UW\n",
+    "UX\n",
+    "B\n",
+    "CH\n",
+    "D\n",
+    "DH\n",
+    "DX\n",
+    "EL\n",
+    "EM\n",
+    "EN\n",
+    "F\n",
+    "G\n",
+    "HH\n",
+    "JH\n",
+    "K\n",
+    "L\n",
+    "M\n",
+    "N\n",
+    "NG\n",
+    "NX\n",
+    "P\n",
+    "Q\n",
+    "R\n",
+    "S\n",
+    "SH\n",
+    "T\n",
+    "TH\n",
+    "V\n",
+    "W\n",
+    "WH\n",
+    "Y\n",
+    "Z\n",
+    "ZH\n",
+    " \n",
+    ".\n",
+    ",\n",
+    "?\n",
+    "!\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_VOCAB_SPLIT = VOCAB_ITEMS.split(\"\\n\")[1:-1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "VOCAB = {e[1]:e[0] for e in enumerate(_VOCAB_SPLIT)}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "TIMIT_MAPPING = {\n",
+    "    'ax': 'AH',\n",
+    "    'ax-h': 'AH',\n",
+    "    'axr': 'ER',\n",
+    "    'dx': 'T',\n",
+    "    'el': ['AH', 'L'],\n",
+    "    'em': ['AH', 'M'],\n",
+    "    'en': ['AH', 'N'],\n",
+    "    'eng': ['IH', 'NG'],\n",
+    "    'hv': 'HH',\n",
+    "    'ix': 'IH',\n",
+    "    'nx': ['N', 'T'],\n",
+    "    'pau': '<sil>',\n",
+    "    'epi': '<sil>',\n",
+    "    'ux': 'UW'\n",
+    "}\n",
+    "TIMIT_IGNORE = ['bcl', 'dcl', 'gcl', 'kcl', 'pcl', 'tcl']\n",
+    "TIMIT_DISCARD = ['dx', 'nx', 'q']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def map_timit_to_cmudict(timit):\n",
+    "    output = []\n",
+    "\n",
+    "    start = 1 if timit[0] == \"h#\" else 0\n",
+    "    end = -1 if timit[-1] == \"h#\" else None\n",
+    "    timit = timit[start:end]\n",
+    "\n",
+    "    for phone in timit:\n",
+    "        if phone in TIMIT_MAPPING:\n",
+    "            if type(TIMIT_MAPPING[phone]) == list:\n",
+    "                output += TIMIT_MAPPING[phone]\n",
+    "            else:\n",
+    "                output.append(TIMIT_MAPPING[phone])\n",
+    "        elif phone in TIMIT_IGNORE:\n",
+    "            pass\n",
+    "        else:\n",
+    "            if not phone.upper() in VOCAB:\n",
+    "                print(\"Invalid phone\", phone.upper())\n",
+    "            output.append(phone.upper())\n",
+    "    return output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "timit = load_dataset('timit_asr')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def is_discardable(batch):\n",
+    "    for phoneme in batch[\"phonetic_detail\"][\"utterance\"]:\n",
+    "        if phoneme in TIMIT_DISCARD:\n",
+    "            return False\n",
+    "    return True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "timit_filt = timit[\"train\"].filter(lambda eg: is_discardable(eg))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "timit_filt2 = timit[\"test\"].filter(lambda eg: is_discardable(eg))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "timit = concatenate_datasets([timit_filt, timit_filt2])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MAX_TOKENS = 1120000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "manifest_path = \"manifest.tsv\"\n",
+    "transcript_path = \"transcript\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "BASE = timit[0][\"file\"].split(\"/data/\")[0] + \"/data/\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "resplit = timit.train_test_split(test_size=0.1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for split in [\"train\", \"test\"]:\n",
+    "    fsplit = split\n",
+    "    if fsplit == \"test\":\n",
+    "        fsplit = \"valid\"\n",
+    "    with open(f\"{fsplit}.tsv\", \"w\") as manifest, open(f\"{fsplit}.ltr\", \"w\") as transcript:\n",
+    "        manifest.write(BASE + \"\\n\")\n",
+    "        for item in resplit[split]:\n",
+    "            frames, sr = sf.read(item[\"file\"])\n",
+    "            manifest.write(f\"{item['file'].replace(BASE, '')}\\t{len(frames)}\\n\")\n",
+    "            utt = item['phonetic_detail']['utterance']\n",
+    "            mapped = map_timit_to_cmudict(utt)\n",
+    "            transcript.write(f\"{' '.join(mapped)}\\n\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "279d017b1d681737e71f35b98eaa9087df824225149f0ac59acfe151b4fa281b"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.8.12 ('psst')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.12"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}