{ "cells": [ { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import glob\n", "\n", "def count_files_by_extension(path, extension):\n", " \"\"\"\n", " path : root path to check ,\n", " extension : .wav , ...\n", " \"\"\"\n", "\n", " files = glob.glob(f\"{path}/*.{extension}\")\n", " return len(files)\n", "\n", "\n", "root_path = \"./vin_data/vlsp2020_train_set_02/\"\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "num_wav_files = count_files_by_extension(root_path, \"wav\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "num_txt_files = count_files_by_extension(root_path, \"txt\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Số lượng file WAV: 56427\n", "Số lượng file text: 56427\n" ] } ], "source": [ "print(f\"Số lượng file WAV: {num_wav_files}\")\n", "print(f\"Số lượng file text: {num_txt_files}\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Tần số mẫu (sample rate): 16000 Hz\n", "Số kênh (channels): 1\n" ] } ], "source": [ "import os\n", "import random\n", "import wave\n", "\n", "\n", "def get_random_wav_file_info(folder_path):\n", " wav_files = glob.glob(f\"{folder_path}/*.wav\")\n", " \n", " if not wav_files:\n", " return None, None\n", " \n", " random_wav_file = random.choice(wav_files)\n", " \n", " with wave.open(random_wav_file, 'rb') as wav_file:\n", " sample_rate = wav_file.getframerate()\n", " channels = wav_file.getnchannels()\n", " \n", " return sample_rate, channels\n", "\n", "path_to_wav_folder = \"./vin_data/vlsp2020_train_set_02/\"\n", "\n", "sample_rate, channels = get_random_wav_file_info(path_to_wav_folder)\n", "\n", "if sample_rate is not None and channels is not None:\n", " print(f\"Tần số mẫu (sample rate): {sample_rate} Hz\")\n", " print(f\"Số kênh (channels): {channels}\")\n", "else:\n", " print(\"Nothing.\")\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "import os\n", "import csv\n", "from tqdm import tqdm\n", "\n", "def create_csv_from_wav_folder(folder_path, output_csv_file):\n", " wav_files = glob.glob(f\"{folder_path}/*.wav\")\n", "\n", " if not wav_files:\n", " print(\"Không có file WAV nào trong thư mục.\")\n", " return\n", "\n", " # Mở tệp CSV đầu ra và tạo bộ đếm số lượng file WAV\n", " with open(output_csv_file, mode='w', newline='') as csv_file:\n", " csv_writer = csv.writer(csv_file)\n", " csv_writer.writerow(['path', 'name','sentence'])\n", "\n", " for wav_file_path in tqdm(wav_files):\n", "\n", " text_file_path = os.path.splitext(wav_file_path)[0] + \".txt\"\n", " if os.path.exists(text_file_path):\n", " with open(text_file_path, 'r') as txt_file:\n", " text_content = txt_file.read()\n", " else:\n", " text_content = \"Not found.\"\n", "\n", " csv_writer.writerow([wav_file_path, os.path.basename(wav_file_path), sample_rate, channels, text_content])\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 56427/56427 [00:37<00:00, 1492.44it/s]\n" ] } ], "source": [ "output_csv_file = \"vin.csv\"\n", "path_to_wav_folder = \"./vin_data/vlsp2020_train_set_02/\"\n", "create_csv_from_wav_folder(path_to_wav_folder, output_csv_file)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | path | \n", "name | \n", "sentence | \n", "
---|---|---|---|
0 | \n", "./vin_data/vlsp2020_train_set_02/spkyut-201907... | \n", "spkyut-20190730-utt000000716.wav | \n", "cây cam canh là loại cây ăn quả dễ trồng dễ ch... | \n", "
1 | \n", "./vin_data/vlsp2020_train_set_02/database_sa3_... | \n", "database_sa3_1_150h_15Jan2020_cleaned_utt_0000... | \n", "những đặc sản vùng miền nổi tiếng như miến don... | \n", "
2 | \n", "./vin_data/vlsp2020_train_set_02/speaker_544-0... | \n", "speaker_544-069450-1.wav | \n", "trước thông tin này trương nam thành chia sẻ c... | \n", "
3 | \n", "./vin_data/vlsp2020_train_set_02/database_sa1_... | \n", "database_sa1_Jan08_Mar19_cleaned_utt_000005361... | \n", "giống như những nữ hoàng á | \n", "
4 | \n", "./vin_data/vlsp2020_train_set_02/database_sa2_... | \n", "database_sa2_Jan4_Feb29_cleaned_utt_0000154206... | \n", "thay vì phun toàn bộ cánh đồng bằng hóa chất c... | \n", "