{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SynPaFlex dataset reorganization\n",
    "\n",
    "Copies every usable wav file found under `input_dataset_path` into a single `wavs/` directory inside `reorganized_dataset_path`, and writes a `synpaflex.txt` metadata file with one `id|text|normalized text` line per kept utterance.\n",
    "\n",
    "A wav file is kept when it is at most `maximal_duration` seconds long, a matching transcript is found, and the transcript contains no digits."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import soundfile as sf\n",
    "from pathlib import Path\n",
    "from shutil import copyfile\n",
    "from tqdm import tqdm\n",
    "\n",
    "input_dataset_path = \"[your_local_path]/synpaflex-corpus/v0.1/\"\n",
    "reorganized_dataset_path = \"../synpaflex/\"\n",
    "\n",
    "maximal_duration = 12 # maximal audio file duration in seconds\n",
    "\n",
    "# Encoding used to read the transcripts and to write the metadata file.\n",
    "# NOTE(review): assumes the corpus transcripts are UTF-8 encoded - adjust if they are not.\n",
    "text_encoding = \"utf-8\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_transcript_path(wav_path):\n",
    "    \"\"\"Locate the transcript that belongs to a wav file.\n",
    "\n",
    "    The transcript is expected to be named like the wav file (with a\n",
    "    ``.txt`` extension) and to live in a ``txt`` directory that is either\n",
    "    next to the wav file or one level above it.\n",
    "\n",
    "    Args:\n",
    "        wav_path: path of a file whose name ends with ``.wav``.\n",
    "\n",
    "    Returns:\n",
    "        The transcript ``Path`` if it exists, otherwise ``None``.\n",
    "    \"\"\"\n",
    "    wav_path = Path(wav_path).absolute()\n",
    "    txt_name = wav_path.name[: -len(\".wav\")] + \".txt\"\n",
    "    for base_dir in (wav_path.parent, wav_path.parent.parent):\n",
    "        candidate = base_dir / \"txt\" / txt_name\n",
    "        if candidate.exists():\n",
    "            return candidate\n",
    "    return None\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "wav_dir = os.path.join(reorganized_dataset_path, \"wavs/\")\n",
    "os.makedirs(wav_dir, exist_ok=True)\n",
    "data = []  # one 'id|text|normalized text' entry per kept utterance\n",
    "total_duration = 0  # cumulated duration (in seconds) of the kept wav files\n",
    "\n",
    "# Precomputing walk_count so that tqdm can display a total\n",
    "walk_count = sum(1 for _ in os.walk(input_dataset_path))\n",
    "\n",
    "# walk through dataset\n",
    "for subdir, dirs, files in tqdm(os.walk(input_dataset_path), total=walk_count, bar_format='Data Reorganization : {l_bar}{bar}|'):\n",
    "    for filename in files:\n",
    "        if not filename.endswith(\".wav\"):\n",
    "            continue\n",
    "        filepath = os.path.join(subdir, filename)\n",
    "\n",
    "        # read wav file; only the decoding step is guarded so that unrelated errors are not hidden\n",
    "        try:\n",
    "            wav, sr = sf.read(filepath)\n",
    "        except RuntimeError:\n",
    "            print(filepath + \" not recognized and ignored.\")\n",
    "            continue\n",
    "\n",
    "        # Only keep files with shorter durations than maximal_duration\n",
    "        duration = len(wav) / sr\n",
    "        if duration > maximal_duration:\n",
    "            continue\n",
    "\n",
    "        # a missing transcript skips this file only, not the remaining files of the directory\n",
    "        transcript_path = find_transcript_path(filepath)\n",
    "        if transcript_path is None:\n",
    "            continue\n",
    "\n",
    "        utterance_id = filename[: -len(\".wav\")]\n",
    "        text = transcript_path.read_text(encoding=text_encoding)\n",
    "\n",
    "        # use the normalized transcript when it exists, otherwise fall back to the raw text\n",
    "        norm_text_path = transcript_path.with_name(utterance_id + \"_norm.txt\")\n",
    "        if norm_text_path.exists():\n",
    "            norm_text = norm_text_path.read_text(encoding=text_encoding)\n",
    "        else:\n",
    "            norm_text = text\n",
    "\n",
    "        # ignore file if text contains digits\n",
    "        if any(char.isdigit() for char in text):\n",
    "            continue\n",
    "\n",
    "        # copy wav file and keep metadata in memory; the duration is only counted\n",
    "        # here so that the reported total matches the files that were actually kept\n",
    "        data.append(utterance_id + '|' + text + '|' + norm_text)\n",
    "        copyfile(filepath, os.path.join(wav_dir, filename))\n",
    "        total_duration += duration\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save metadata\n",
    "with open(os.path.join(reorganized_dataset_path, \"synpaflex.txt\"), 'w', encoding=text_encoding) as f:\n",
    "    for item in data:\n",
    "        f.write(\"%s\\n\" % item)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# display reorganized dataset total duration\n",
    "duration_hours = total_duration / 3600\n",
    "print(f\"total duration = {duration_hours:.2f} hours\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}