{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset, concatenate_datasets\n",
    "import soundfile as sf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "PAD = \"<pad>\"\n",
    "UNK = \"<unk>\"\n",
    "SIL = \"<sil>\"\n",
    "SPN = \"<spn>\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "VOCAB_ITEMS =\"\"\"\n",
    "AA\n",
    "AE\n",
    "AH\n",
    "AO\n",
    "AW\n",
    "AX\n",
    "AY\n",
    "EH\n",
    "ER\n",
    "EY\n",
    "IH\n",
    "IY\n",
    "OW\n",
    "OY\n",
    "UH\n",
    "UW\n",
    "UX\n",
    "B\n",
    "CH\n",
    "D\n",
    "DH\n",
    "DX\n",
    "EL\n",
    "EM\n",
    "EN\n",
    "F\n",
    "G\n",
    "HH\n",
    "JH\n",
    "K\n",
    "L\n",
    "M\n",
    "N\n",
    "NG\n",
    "NX\n",
    "P\n",
    "Q\n",
    "R\n",
    "S\n",
    "SH\n",
    "T\n",
    "TH\n",
    "V\n",
    "W\n",
    "WH\n",
    "Y\n",
    "Z\n",
    "ZH\n",
    " \n",
    ".\n",
    ",\n",
    "?\n",
    "!\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "_VOCAB_SPLIT = VOCAB_ITEMS.split(\"\\n\")[1:-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "VOCAB = {e[1]:e[0] for e in enumerate(_VOCAB_SPLIT)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "TIMIT_MAPPING = {\n",
    "    'ax': 'AH',\n",
    "    'ax-h': 'AH',\n",
    "    'axr': 'ER',\n",
    "    'dx': 'T',\n",
    "    'el': ['AH', 'L'],\n",
    "    'em': ['AH', 'M'],\n",
    "    'en': ['AH', 'N'],\n",
    "    'eng': ['IH', 'NG'],\n",
    "    'hv': 'HH',\n",
    "    'ix': 'IH',\n",
    "    'nx': ['N', 'T'],\n",
    "    'pau': '<sil>',\n",
    "    'epi': '<sil>',\n",
    "    'ux': 'UW'\n",
    "}\n",
    "TIMIT_IGNORE = ['bcl', 'dcl', 'gcl', 'kcl', 'pcl', 'tcl']\n",
    "TIMIT_DISCARD = ['dx', 'nx', 'q']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "def map_timit_to_cmudict(timit):\n",
    "    output = []\n",
    "\n",
    "    start = 1 if timit[0] == \"h#\" else 0\n",
    "    end = -1 if timit[-1] == \"h#\" else None\n",
    "    timit = timit[start:end]\n",
    "\n",
    "    for phone in timit:\n",
    "        if phone in TIMIT_MAPPING:\n",
    "            if type(TIMIT_MAPPING[phone]) == list:\n",
    "                output += TIMIT_MAPPING[phone]\n",
    "            else:\n",
    "                output.append(TIMIT_MAPPING[phone])\n",
    "        elif phone in TIMIT_IGNORE:\n",
    "            pass\n",
    "        else:\n",
    "            if not phone.upper() in VOCAB:\n",
    "                print(\"Invalid phone\", phone.upper())\n",
    "            output.append(phone.upper())\n",
    "    return output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "timit = load_dataset('timit_asr')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_discardable(batch):\n",
    "    for phoneme in batch[\"phonetic_detail\"][\"utterance\"]:\n",
    "        if phoneme in TIMIT_DISCARD:\n",
    "            return False\n",
    "    return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "timit_filt = timit[\"train\"].filter(lambda eg: is_discardable(eg))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "timit_filt2 = timit[\"test\"].filter(lambda eg: is_discardable(eg))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "timit = concatenate_datasets([timit_filt, timit_filt2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_TOKENS = 1120000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "manifest_path = \"manifest.tsv\"\n",
    "transcript_path = \"transcript\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "BASE = timit[0][\"file\"].split(\"/data/\")[0] + \"/data/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "resplit = timit.train_test_split(test_size=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "for split in [\"train\", \"test\"]:\n",
    "    fsplit = split\n",
    "    if fsplit == \"test\":\n",
    "        fsplit = \"valid\"\n",
    "    with open(f\"{fsplit}.tsv\", \"w\") as manifest, open(f\"{fsplit}.ltr\", \"w\") as transcript:\n",
    "        manifest.write(BASE + \"\\n\")\n",
    "        for item in resplit[split]:\n",
    "            frames, sr = sf.read(item[\"file\"])\n",
    "            manifest.write(f\"{item['file'].replace(BASE, '')}\\t{len(frames)}\\n\")\n",
    "            utt = item['phonetic_detail']['utterance']\n",
    "            mapped = map_timit_to_cmudict(utt)\n",
    "            transcript.write(f\"{' '.join(mapped)}\\n\")\n"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "279d017b1d681737e71f35b98eaa9087df824225149f0ac59acfe151b4fa281b"
  },
  "kernelspec": {
   "display_name": "Python 3.8.12 ('psst')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}