In [64]:
import os
import glob

def count_files_by_extension(path, extension):
    """
    Count the files with a given extension under ``path``, recursively.

    Parameters
    ----------
    path : str
        Root directory to scan; every sub-directory is visited.
    extension : str
        File extension, with or without the leading dot
        (``"wav"`` and ``".wav"`` are equivalent).

    Returns
    -------
    int
        Number of matching entries found in the whole tree
        (0 when ``path`` does not exist).
    """
    # Accept ".wav" as well as "wav"; otherwise the pattern becomes "*..wav"
    # and silently matches nothing.
    suffix = extension.lstrip(".")
    total_count = 0

    for foldername, _subfolders, _filenames in os.walk(path):
        # Escape the directory part so folder names containing glob
        # metacharacters ("[", "]", "*", "?") are matched literally.
        pattern = os.path.join(glob.escape(foldername), f"*.{suffix}")
        total_count += len(glob.glob(pattern))

    return total_count


# Root directory passed to the file-counting helper in the following cell.
root_path = "./Cleaned_MITI/dataset_2"

In [65]:
# Count audio files and text files separately so the two totals can be compared.
num_wav_files = count_files_by_extension(root_path, "wav")
num_txt_files = count_files_by_extension(root_path, "txt")

In [66]:
# Report both totals (messages are in Vietnamese: "number of WAV files" /
# "number of text files").
print(f"Số lượng file WAV: {num_wav_files}")
print(f"Số lượng file text: {num_txt_files}")

Số lượng file WAV: 2099
Số lượng file text: 2099


In [70]:
import os
import random
import wave


def get_random_wav_file_info(folder_path):
    """
    Pick one random ``.wav`` file under ``folder_path`` and read its header.

    The whole directory tree is searched, so the file may come from any
    sub-directory.

    Parameters
    ----------
    folder_path : str
        Root directory to search recursively.

    Returns
    -------
    tuple
        ``(sample_rate, channels)`` of the chosen file, or ``(None, None)``
        when no ``.wav`` file exists (including when ``folder_path`` itself
        does not exist).
    """
    # Collect candidates from every directory. Re-assigning the list on each
    # iteration would keep only the last directory visited, and would leave
    # the name unbound when os.walk() yields nothing.
    wav_files = []
    for foldername, _subfolders, _filenames in os.walk(folder_path):
        # Escape the directory so glob metacharacters in folder names are literal.
        wav_files.extend(glob.glob(os.path.join(glob.escape(foldername), "*.wav")))

    if not wav_files:
        return None, None

    random_wav_file = random.choice(wav_files)

    with wave.open(random_wav_file, 'rb') as wav_file:
        sample_rate = wav_file.getframerate()
        channels = wav_file.getnchannels()

    return sample_rate, channels

# Directory handed to the random-WAV probe below.
path_to_wav_folder = "./Cleaned_MITI/dataset_2/"

# Read the sample rate and channel count of one randomly chosen WAV file.
sample_rate, channels = get_random_wav_file_info(path_to_wav_folder)

# The helper returns (None, None) when it has no WAV file to inspect.
if sample_rate is not None and channels is not None:
    print(f"Tần số mẫu (sample rate): {sample_rate} Hz")
    print(f"Số kênh (channels): {channels}")
else:
    print("Nothing.")


Tần số mẫu (sample rate): 44100 Hz
Số kênh (channels): 1


In [None]:
def remove_special_characters(input_string):
    """
    Return ``input_string`` without dots, commas, hyphens, underscores and spaces.

    Every other character (including other punctuation such as ``!``) is kept
    in its original order.
    """
    unwanted = ('.', ',', '-', '_', " ")

    # Walk the string once, keeping only the characters that are not unwanted.
    kept = []
    for symbol in input_string:
        if symbol in unwanted:
            continue
        kept.append(symbol)

    return ''.join(kept)

# Example usage
input_string = "Hello, this_is_a-test.string!"
output_string = remove_special_characters(input_string)
print(output_string)  # Result: "Hellothisisateststring!" (spaces are removed too; "!" is kept)


In [86]:
import os
import csv
from tqdm import tqdm
import glob
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
normalizer = BasicTextNormalizer()
def create_csv_from_wav_folder(folder_path, output_csv_file):
    """
    Build a CSV manifest (``path``, ``name``, ``sentence``) for a speech dataset.

    Every directory under ``folder_path`` whose path contains ``"person_"`` is
    scanned for ``*.wav`` files. For each WAV file the transcript is read from
    the ``.txt`` file with the same base name in the same directory and passed
    through the module-level ``normalizer``.

    Parameters
    ----------
    folder_path : str
        Root directory of the dataset.
    output_csv_file : str
        Path of the CSV file to create (overwritten if it exists).

    Notes
    -----
    * When a transcript file is missing, the placeholder ``"Not found."`` is
      written as the sentence; filter such rows out before training.
    * Files are read and written as UTF-8 explicitly, so the result does not
      depend on the platform's default encoding and matches UTF-8 readers of
      the manifest.
    """
    with open(output_csv_file, mode='w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['path', 'name', 'sentence'])

        for person_foldername, _, _ in os.walk(folder_path):
            # The check is done on the full walked path, not only on the last component.
            if "person_" not in person_foldername:
                continue

            # Escape the directory so glob metacharacters in folder names are literal.
            wav_files = glob.glob(os.path.join(glob.escape(person_foldername), "*.wav"))

            for wav_file_path in tqdm(wav_files):
                wav_filename = os.path.basename(wav_file_path)
                text_filename = os.path.splitext(wav_filename)[0] + ".txt"
                text_file_path = os.path.join(person_foldername, text_filename)

                if os.path.exists(text_file_path):
                    with open(text_file_path, 'r', encoding='utf-8') as txt_file:
                        text_content = normalizer(txt_file.read())
                else:
                    text_content = "Not found."

                csv_writer.writerow([wav_file_path, wav_filename, text_content])

# Input dataset root and the manifest file to (re)create.
root_path = "./Cleaned_MITI/dataset_2"  
output_csv_file = "MITI.csv"  

# Build the manifest; this overwrites output_csv_file.
create_csv_from_wav_folder(root_path, output_csv_file)


 84%|████████▎ | 164/196 [00:00<00:00, 1629.92it/s]

100%|██████████| 196/196 [00:00<00:00, 1580.86it/s]
100%|██████████| 218/218 [00:00<00:00, 1440.12it/s]
100%|██████████| 216/216 [00:00<00:00, 1364.20it/s]
100%|██████████| 205/205 [00:00<00:00, 1412.14it/s]
100%|██████████| 204/204 [00:00<00:00, 1426.29it/s]
100%|██████████| 220/220 [00:00<00:00, 1511.87it/s]
100%|██████████| 225/225 [00:00<00:00, 1499.30it/s]
100%|██████████| 175/175 [00:00<00:00, 1492.85it/s]
100%|██████████| 220/220 [00:00<00:00, 1496.34it/s]
100%|██████████| 220/220 [00:00<00:00, 1480.81it/s]


In [89]:
import pandas as pd 
# Load the manifest file and display how many data rows it contains.
data = pd.read_csv('MITI.csv')
len(data)

2099

In [90]:
import csv
import random

def split_csv_file(input_file, output_file1, output_file2, ratio, seed=None):
    """
    Shuffle the rows of a CSV file and split them into two CSV files.

    The header row of ``input_file`` is copied to both outputs. The first
    ``int(n_rows * ratio)`` shuffled rows go to ``output_file1`` and the
    remaining rows to ``output_file2``.

    Parameters
    ----------
    input_file : str
        UTF-8 CSV file with a header row.
    output_file1, output_file2 : str
        Destination paths (overwritten if they exist).
    ratio : float
        Fraction of the data rows written to ``output_file1`` (e.g. ``0.8``).
    seed : int or None, optional
        When given, the shuffle uses a private ``random.Random(seed)`` so the
        split is reproducible. When ``None`` (default), the module-level
        ``random`` state is used.

    Raises
    ------
    StopIteration
        If ``input_file`` is empty (has no header row).
    """
    with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:
        csvreader = csv.reader(csvfile)
        header = next(csvreader)
        rows = list(csvreader)

    if seed is None:
        random.shuffle(rows)
    else:
        random.Random(seed).shuffle(rows)

    split_at = int(len(rows) * ratio)
    partitions = (
        (output_file1, rows[:split_at]),
        (output_file2, rows[split_at:]),
    )

    for out_path, part in partitions:
        with open(out_path, 'w', newline='', encoding='utf-8') as out_file:
            # Write with the same (default) dialect the input was read with.
            # A different quote character would make fields containing commas
            # or quotes unreadable for default CSV readers.
            writer = csv.writer(out_file)
            writer.writerow(header)
            writer.writerows(part)

# Source file and the two output files for the split.
input_file = 'MITI.csv'
output_file1 = 'MITI_train.csv'
output_file2 = 'MITI_test.csv'
# Fraction of rows written to output_file1; the remainder goes to output_file2.
ratio = 0.8  

split_csv_file(input_file, output_file1, output_file2, ratio)


In [None]:
from datasets import load_dataset, DatasetDict

# Empty DatasetDict container; nothing is added to it in this cell.
vivos = DatasetDict()

In [46]:
import os
import numpy as np

import torch
import torchaudio

import pandas as pd
import whisper
import torchaudio.transforms as at
from pathlib import Path

def load_wave(wave_path, sample_rate:int=16000) -> torch.Tensor:
    """
    Load an audio file and return its waveform at ``sample_rate``.

    The file is read with ``torchaudio.load(..., normalize=True)``. If its
    native rate differs from ``sample_rate`` the waveform is resampled with
    ``torchaudio.transforms.Resample``; otherwise it is returned unchanged.
    """
    signal, native_rate = torchaudio.load(wave_path, normalize=True)

    # Nothing to do when the file is already at the requested rate.
    if native_rate == sample_rate:
        return signal

    resampler = at.Resample(native_rate, sample_rate)
    return resampler(signal)



def get_list_files_vin100h(phase, dataset_path='./vin_data/vlsp2020_train_set_02/', text_max_length=10000, audio_max_sample_length=1000000, sample_rate=16000):
    """
    List the (id, path, transcript) triples of a split whose audio files exist.

    Reads ``vin_train.csv`` when ``phase == 'train'`` and ``vin_test.csv``
    otherwise (both relative to the current working directory). For each row
    whose ``path`` column points to an existing file, the audio is loaded with
    ``load_wave`` and ``(row_index, path, sentence)`` is appended to the list.

    Parameters
    ----------
    phase : str
        ``'train'`` selects the training CSV; any other value selects the test CSV.
    dataset_path : str
        Currently unused; kept for interface compatibility.
    text_max_length, audio_max_sample_length : int
        Currently unused: the length filter they were meant for is disabled.
    sample_rate : int
        Target sample rate passed to ``load_wave``.

    Returns
    -------
    tuple
        ``(audio, audio_transcript_pair_list)`` where ``audio`` is the first
        channel of the last file that was loaded, or ``None`` when no listed
        file exists.
    """
    csv_file = 'vin_train.csv' if phase == 'train' else 'vin_test.csv'
    df = pd.read_csv(csv_file)

    # Bound up-front so the return below is valid even when no file is found.
    audio = None
    audio_transcript_pair_list = []

    for index, row in df.iterrows():
        new_path = Path(row['path'])
        if not new_path.exists():
            continue

        # Keep only the first channel of the loaded waveform.
        audio = load_wave(new_path, sample_rate=sample_rate)[0]
        # NOTE: filtering by text_max_length / audio_max_sample_length is
        # intentionally not applied here (it was disabled in the original code).
        audio_transcript_pair_list.append((index, str(new_path), row['sentence']))

    return audio, audio_transcript_pair_list


In [None]:
# Run the loader on the training split; as the cell's last expression, the
# returned (audio, pair list) tuple is what the notebook displays.
get_list_files_vin100h(phase='train')