{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import soundfile as sf\n", "import wave" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def smp_headers(filename: str):\n", " with open(filename, \"rb\") as f:\n", " f.seek(0)\n", " raw_headers = f.read(1024)\n", " raw_headers = raw_headers.rstrip(b'\\x00')\n", " asc_headers = raw_headers.decode(\"ascii\")\n", " asc_headers.rstrip('\\x00')\n", " tmp = [a for a in asc_headers.split(\"\\r\\n\")]\n", " back = -1\n", " while abs(back) > len(tmp) + 1:\n", " if tmp[back] == '=':\n", " break\n", " back -= 1\n", " tmp = tmp[0:back-1]\n", " return dict(a.split(\"=\") for a in tmp)\n", "\n", "\n", "def smp_read_sf(filename: str):\n", " headers = smp_headers(filename)\n", " if headers[\"msb\"] == \"last\":\n", " ENDIAN = \"LITTLE\"\n", " else:\n", " ENDIAN = \"BIG\"\n", "\n", " data, sr = sf.read(filename, channels=int(headers[\"nchans\"]),\n", " samplerate=16000, endian=ENDIAN, start=512,\n", " dtype=\"int16\", format=\"RAW\", subtype=\"PCM_16\")\n", " return (data, sr)\n", "\n", "\n", "def write_wav(filename, arr):\n", " with wave.open(filename, \"w\") as f:\n", " f.setnchannels(1)\n", " f.setsampwidth(2)\n", " f.setframerate(16000)\n", " f.writeframes(arr)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from pathlib import Path" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "WAXHOLM = \"/Users/joregan/Playing/waxholm\"\n", "OUTPUT = \"/Users/joregan/Playing/waxholm_fairseq\"" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "SCENES_PATH = Path(WAXHOLM) / \"scenes_formatted\"\n", "OUTPUT_PATH = Path(OUTPUT)\n", "if not OUTPUT_PATH.is_dir():\n", " OUTPUT_PATH.mkdir()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "TRAIN_FILES = []\n", "with open(Path(WAXHOLM) / \"alloktrainfiles\") as trainf:\n", " for line in trainf.readlines():\n", " TRAIN_FILES.append(line.strip())\n", "TEST_FILES = []\n", "with open(Path(WAXHOLM) / \"testfiles\") as testf:\n", " for line in testf.readlines():\n", " TEST_FILES.append(line.strip())" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1835 327\n" ] } ], "source": [ "print(len(TRAIN_FILES), len(TEST_FILES))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "def get_labels(mixfile):\n", " labels = \"\"\n", " saw_label = False\n", " with open(mixfile) as infile:\n", " for line in infile.readlines():\n", " if not saw_label:\n", " if line.lower().startswith(\"labels:\"):\n", " saw_label = True\n", " labels = line[7:].strip()\n", " else:\n", " if line.startswith(\"FR\"):\n", " break\n", " else:\n", " labels = \" \".join([labels, line.strip()])\n", " labels = re.sub(\" +\", \" \", labels)\n", " return labels" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'A:H\\'A: pa p: |h J\\'A:Ggv V\\'ILv pap: sm p:v S\\'E: pa H\\'U:R 2Dd\\'EM Bb\\']:TtE0NG Gg\\']:R 2Tt\\'I STt\"A:VE0#STtR`\\\\M p: \\']: p: \\']M J\\'A: Kk\\'AN F\"O#2S`[TtA Tt\\'I F\"IN#H`AM .'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_labels(\"/Users/joregan/Playing/waxholm/scenes_formatted/fp2043/fp2043.16.03.smp.mix\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def segment_label(label, skip_pause=True):\n", " phones = []\n", " i = 0\n", " while i < len(label):\n", " start_i = i\n", " end_i = i\n", " if label[i:i+2] in [\"NG\", \"E0\", \"SJ\", \"TJ\", \"kl\", \"sm\", \"kl\", \"pa\"]:\n", " phones.append(label[i:i+2])\n", " i += 2\n", " elif label[i:i+2] == \"p:\":\n", " if not skip_pause:\n", " phones.append(\"p:\")\n", " i += 2\n", " elif label[i:i+1] == \"#\":\n", " i += 1\n", " else:\n", " if label[i:i+1] in [\"'\", \"`\", \"\\\"\", \"2\", \"~\"]:\n", " i += 1\n", " end_i += 1\n", " if label[i+1:i+2] in [\":\", \"3\", \"4\"]:\n", " end_i += 1\n", " phones.append(label[start_i:end_i+1])\n", " i = end_i + 1\n", " return phones\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "assert segment_label(\"Bb\\']:TtE0NG\") == ['B', 'b', \"']:\", 'T', 't', 'E0', 'NG']\n", "assert segment_label(\"STt\\\"A:VE0#STtR`\\\\M\") == ['S', 'T', 't', '\"A:', 'V', 'E0', 'S', 'T', 't', 'R', '`\\\\', 'M']\n", "assert segment_label(\"p:v\") == ['v']" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "def proc_label(label, stress=False):\n", " def strip_stress(phone, stress):\n", " if stress:\n", " return phone\n", " if phone[0] in [\"'\", \"`\", \"\\\"\"]:\n", " return phone[1:]\n", " else:\n", " return phone\n", " words = []\n", " for word in label.split(\" \"):\n", " if word in [\"p:pa\", \"pap:\", \"p:pap:\", \"pa\"]:\n", " words.append(\"pa\")\n", " elif word == \"p:\" or word == \".\":\n", " continue\n", " elif word == \"|h\":\n", " words.append(\"hes\")\n", " elif word in [\"sm\", \"ha\", \"kl\"]:\n", " words.append(word)\n", " else:\n", " phones = [strip_stress(p, stress) for p in segment_label(word)]\n", " words.append(\" \".join(phones))\n", " return(\" | \".join(words)) + \" |\"" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A:H'A: pa p: |h J'A:Ggv V'ILv pap: sm p:v S'E: pa H'U:R 2Dd'EM Bb']:TtE0NG Gg']:R 2Tt'I STt\"A:VE0#STtR`\\M p: ']: p: ']M J'A: Kk'AN F\"O#2S`[TtA Tt'I F\"IN#H`AM .\n", "A: H A: | pa | hes | J A: G g v | V I L v | pa | sm | v | S E: | pa | H U: R | 2D d E M | B b ]: T t E0 NG | G g ]: R | 2T t I | S T t A: V E0 S T t R \\ M | ]: | ] M | J A: | K k A N | F O 2S [ T t A | T t I | F I N H A M |\n" ] } ], "source": [ "lbl = get_labels(\"/Users/joregan/Playing/waxholm/scenes_formatted/fp2043/fp2043.16.03.smp.mix\")\n", "plbl = proc_label(lbl)\n", "print(lbl)\n", "print(plbl)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "with open(OUTPUT_PATH / \"train.tsv\", \"w\") as train_tsv,\\\n", " open(OUTPUT_PATH / \"train.ltr\", \"w\") as train_ltr,\\\n", " open(OUTPUT_PATH / \"valid.tsv\", \"w\") as valid_tsv,\\\n", " open(OUTPUT_PATH / \"valid.ltr\", \"w\") as valid_ltr,\\\n", " open(OUTPUT_PATH / \"test.tsv\", \"w\") as test_tsv,\\\n", " open(OUTPUT_PATH / \"test.ltr\", \"w\") as test_ltr:\n", " train_tsv.write(str(OUTPUT_PATH) + \"\\n\")\n", " test_tsv.write(str(OUTPUT_PATH) + \"\\n\")\n", " valid_tsv.write(str(OUTPUT_PATH) + \"\\n\")\n", " valid_amount = 195\n", " for smpfile in SCENES_PATH.glob(\"fp*/*.smp\"):\n", " mixfile = f\"{smpfile}.mix\"\n", " if not Path(mixfile).exists():\n", " continue\n", " stem = smpfile.stem\n", " if f\"{stem}.smp\" in TEST_FILES:\n", " out_tsv = test_tsv\n", " out_ltr = test_ltr\n", " elif valid_amount > 0:\n", " out_tsv = valid_tsv\n", " out_ltr = valid_ltr\n", " valid_amount -= 1\n", " else:\n", " out_tsv = train_tsv\n", " out_ltr = train_ltr\n", "\n", " outwav = f\"{stem}.wav\"\n", " arr, sr = smp_read_sf(str(smpfile))\n", " out_tsv.write(f\"{outwav}\\t{len(arr)}\\n\")\n", " write_wav(outwav, arr)\n", " label = get_labels(mixfile)\n", " ltrline = proc_label(label)\n", " out_ltr.write(ltrline + \"\\n\")\n", " \n", " " ] } ], "metadata": { "kernelspec": { "display_name": "hf", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.15" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }