# %% Imports
import csv
import glob
import os
import random
import wave

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; iterate silently when tqdm is absent.
    def tqdm(iterable, **_kwargs):
        return iterable


# %% Helper: count files of one type
def count_files_by_extension(path, extension):
    """Count the files directly inside ``path`` that have the given extension.

    Args:
        path: Directory to inspect. Sub-directories are not searched.
        extension: File extension, with or without the leading dot
            (``"wav"`` and ``".wav"`` are treated the same).

    Returns:
        The number of matching files (0 when the directory is empty or missing).
    """
    # The pattern supplies its own dot, so drop any dot the caller passed in.
    pattern = os.path.join(path, f"*.{extension.lstrip('.')}")
    return len(glob.glob(pattern))


# %% Count audio and transcript files
root_path = "./vin_data/vlsp2020_train_set_02/"
num_wav_files = count_files_by_extension(root_path, "wav")
num_txt_files = count_files_by_extension(root_path, "txt")

print(f"Số lượng file WAV: {num_wav_files}")
print(f"Số lượng file text: {num_txt_files}")


# %% Helper: inspect the audio format of one random file
def get_random_wav_file_info(folder_path):
    """Read the sample rate and channel count of one randomly chosen WAV file.

    Args:
        folder_path: Directory containing ``*.wav`` files.

    Returns:
        A ``(sample_rate, channels)`` tuple, or ``(None, None)`` when the
        directory holds no WAV files.
    """
    wav_files = glob.glob(os.path.join(folder_path, "*.wav"))
    if not wav_files:
        return None, None

    with wave.open(random.choice(wav_files), "rb") as wav_file:
        return wav_file.getframerate(), wav_file.getnchannels()


# %% Inspect a random utterance
path_to_wav_folder = "./vin_data/vlsp2020_train_set_02/"

sample_rate, channels = get_random_wav_file_info(path_to_wav_folder)

if sample_rate is not None and channels is not None:
    print(f"Tần số mẫu (sample rate): {sample_rate} Hz")
    print(f"Số kênh (channels): {channels}")
else:
    print("Nothing.")


# %% Helper: build the manifest CSV
def create_csv_from_wav_folder(folder_path, output_csv_file):
    """Write a manifest CSV with one row per WAV file found in ``folder_path``.

    The CSV has the columns ``path``, ``name`` and ``sentence``. The sentence
    is read from the ``.txt`` file that shares the WAV file's stem; when that
    file does not exist the placeholder ``"Not found."`` is stored instead.

    Args:
        folder_path: Directory containing the ``*.wav`` / ``*.txt`` pairs.
        output_csv_file: Destination path of the CSV (overwritten if present).

    Returns:
        None. When the directory holds no WAV files a message is printed and
        no CSV is written.
    """
    # Sorted so that the manifest is identical from run to run.
    wav_files = sorted(glob.glob(os.path.join(folder_path, "*.wav")))

    if not wav_files:
        print("Không có file WAV nào trong thư mục.")
        return

    # UTF-8 is set explicitly: the transcripts are Vietnamese and the platform
    # default encoding is not guaranteed to be able to represent them.
    with open(output_csv_file, mode="w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["path", "name", "sentence"])

        for wav_file_path in tqdm(wav_files):
            text_file_path = os.path.splitext(wav_file_path)[0] + ".txt"
            if os.path.exists(text_file_path):
                with open(text_file_path, "r", encoding="utf-8") as txt_file:
                    # Drop the trailing newline so each sentence stays on one CSV line.
                    text_content = txt_file.read().strip()
            else:
                text_content = "Not found."

            # Every row carries exactly the three fields named in the header.
            csv_writer.writerow(
                [wav_file_path, os.path.basename(wav_file_path), text_content]
            )


# %% Build the manifest
output_csv_file = "vin.csv"
path_to_wav_folder = "./vin_data/vlsp2020_train_set_02/"
create_csv_from_wav_folder(path_to_wav_folder, output_csv_file)
# %% Imports
import csv
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torchaudio
import torchaudio.transforms as at
import whisper
from datasets import load_dataset, DatasetDict


# %% Helper: random train/test split of a CSV file
def split_csv_file(input_file, output_file1, output_file2, ratio, seed=None):
    """Shuffle the data rows of a CSV file and split them into two CSV files.

    The header row of ``input_file`` is copied to both outputs. Files are
    written with the csv module's default quoting, so they can be read back
    by ``csv.reader`` and ``pandas.read_csv`` without extra arguments.

    Args:
        input_file: Path of the UTF-8 CSV file to split.
        output_file1: Destination for the first ``ratio`` share of the rows.
        output_file2: Destination for the remaining rows.
        ratio: Fraction of data rows, between 0 and 1, sent to ``output_file1``.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            global ``random`` state is used.

    Raises:
        ValueError: If ``ratio`` is outside ``[0, 1]`` or ``input_file`` has
            no header row.
    """
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio!r}")

    with open(input_file, "r", newline="", encoding="utf-8") as csvfile:
        csvreader = csv.reader(csvfile)
        header = next(csvreader, None)
        if header is None:
            raise ValueError(f"{input_file!r} is empty; expected a header row")
        rows = list(csvreader)

    # A private generator keeps a seeded split from disturbing global state.
    rng = random.Random(seed) if seed is not None else random
    rng.shuffle(rows)

    split_at = int(len(rows) * ratio)
    parts = ((output_file1, rows[:split_at]), (output_file2, rows[split_at:]))

    for out_path, part in parts:
        with open(out_path, "w", newline="", encoding="utf-8") as out_file:
            writer = csv.writer(out_file)
            writer.writerow(header)
            writer.writerows(part)


# %% Split the manifest into train / test
input_file = 'vin.csv'
output_file1 = 'vin_train.csv'
output_file2 = 'vin_test.csv'
ratio = 0.8
seed = 42  # fixed so that the split is the same on every run

split_csv_file(input_file, output_file1, output_file2, ratio, seed=seed)

# %% Preview the held-out split (runs after the split that creates the file)
data = pd.read_csv(output_file2)
data.head(5)

# %% Dataset container
vivos = DatasetDict()


# %% Helper: load one audio file
def load_wave(wave_path, sample_rate: int = 16000) -> torch.Tensor:
    """Load an audio file and resample it to ``sample_rate`` when necessary.

    Args:
        wave_path: Path of the audio file to read.
        sample_rate: Target sampling rate in Hz.

    Returns:
        The waveform tensor produced by ``torchaudio.load`` (resampled if the
        file's native rate differs from ``sample_rate``).
        Presumably shaped ``(channels, frames)`` - confirm against the
        installed torchaudio version.
    """
    waveform, sr = torchaudio.load(wave_path, normalize=True)
    if sample_rate != sr:
        waveform = at.Resample(sr, sample_rate)(waveform)
    return waveform


# %% Helper: collect (id, path, transcript) triples for one split
def get_list_files_vin100h(phase, dataset_path='./vin_data/vlsp2020_train_set_02/', text_max_length=10000, audio_max_sample_length=1000000, sample_rate=16000):
    """List the utterances of one split whose audio file exists on disk.

    Args:
        phase: ``'train'`` reads ``vin_train.csv``; any other value reads
            ``vin_test.csv``.
        dataset_path: Currently unused; paths are taken from the CSV.
        text_max_length: Currently unused (length filtering is disabled).
        audio_max_sample_length: Currently unused (length filtering is disabled).
        sample_rate: Sampling rate in Hz that each audio file is loaded at.

    Returns:
        A tuple ``(audio, pairs)`` where ``pairs`` is a list of
        ``(row_index, path, sentence)`` tuples and ``audio`` is the first
        channel of the last waveform loaded, or ``None`` when no listed file
        exists.
    """
    csv_file = 'vin_train.csv' if phase == 'train' else 'vin_test.csv'
    df = pd.read_csv(csv_file)

    audio = None  # remains None when none of the listed files is found
    audio_transcript_pair_list = []
    for index, row in df.iterrows():
        new_path = Path(row['path'])
        if not new_path.exists():
            continue
        # Every file is decoded even though only the last waveform is returned;
        # kept so the return value stays the same for existing callers.
        audio = load_wave(new_path, sample_rate=sample_rate)[0]
        audio_transcript_pair_list.append((index, str(new_path), row['sentence']))

    return audio, audio_transcript_pair_list


# %% Collect the training utterances and show a short summary
last_audio, train_pairs = get_list_files_vin100h(phase='train')
len(train_pairs), train_pairs[:3]