#!/home/haroon/python_virtual_envs/whisper_fine_tuning/bin/python
"""Build a train/test ``DatasetDict`` from a local metadata CSV and MP3 files.

The metadata file is pipe-separated with two columns (``path|sentence``) and
no header row.  Each ``path`` entry is a file stem that is resolved to
``<audio_dir>/<stem>.mp3``, decoded to a 16000 Hz mono float64 array and
stored under the same ``path`` / ``sentence`` / ``audio`` layout that the
Common Voice notes at the bottom of this file describe.
"""

# Reference: the Common Voice dataset whose layout this script mirrors was
# loaded as follows.
#
# from datasets import load_dataset, DatasetDict
# common_voice = DatasetDict()
# common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0",
#                                      "hi",
#                                      split="train+validation",
#                                      token=True)
# common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0",
#                                     "hi",
#                                     split="test",
#                                     token=True)
# common_voice = common_voice.remove_columns([
#     "accent", "age", "client_id", "down_votes", "gender", "locale", "path",
#     "segment", "up_votes"])
#
# common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
#
# def prepare_dataset(batch):
#     audio = batch["audio"]
#     audio["array"]
#     audio["sampling_rate"]

import os

import numpy as np
import pandas as pd
import soundfile as sf
# DatasetDict must be imported: it is used in load_dataset's return
# annotation, which Python evaluates when the function is defined.
from datasets import Dataset, DatasetDict
from scipy.signal import resample

# Sampling rate (Hz) that every clip is converted to.
TARGET_SAMPLING_RATE = 16000


def convert_mp3_to_numpy(mp3_path: str) -> np.ndarray:
    """Read a mono audio file and return it as a 16000 Hz float64 array.

    Args:
        mp3_path: Path of the audio file to decode with ``soundfile``.

    Returns:
        A one-dimensional float64 NumPy array of samples at 16000 Hz.

    Raises:
        ValueError: If the decoded audio has more than one channel.

    Example:
        audio_data = convert_mp3_to_numpy("your_audio.mp3")
        print(audio_data.shape)  # (audio_length,) for mono audio
        print(audio_data.dtype)  # float64
    """
    audio, sample_rate = sf.read(mp3_path)

    # Multi-channel audio is decoded as a 2-D array; only mono is accepted.
    if audio.ndim != 1:
        raise ValueError("Audio must be mono channel.")

    if sample_rate != TARGET_SAMPLING_RATE:
        # Integer arithmetic keeps the truncated target length exact; the
        # float form `n * (16000 / rate)` can land one sample short.
        num_samples = audio.shape[0] * TARGET_SAMPLING_RATE // sample_rate
        audio = resample(audio, num_samples)

    return np.array(audio, dtype=np.float64)


def load_dataset(csv_file: str, audio_dir: str) -> DatasetDict:
    """Load the metadata CSV plus its MP3 files and split into train/test.

    Args:
        csv_file: Pipe-separated file without a header row; the first column
            is the audio file stem and the second is the sentence.
        audio_dir: Directory containing ``<stem>.mp3`` for every row.

    Returns:
        The result of ``Dataset.train_test_split(test_size=0.2, seed=42)``
        on a dataset with the columns ``path``, ``sentence`` and ``audio``
        (a dict with ``path``, ``array`` and ``sampling_rate``).

    Raises:
        ValueError: Propagated from ``convert_mp3_to_numpy`` when a clip is
            not mono.
    """
    df = pd.read_csv(
        filepath_or_buffer=csv_file,
        sep='|',
        header=None,
        names=['path', 'sentence'],
    )
    # os.path.join works whether or not audio_dir ends with a separator, and
    # the f-string tolerates stems that pandas parsed as numbers.
    df['path'] = [os.path.join(audio_dir, f'{stem}.mp3') for stem in df['path']]
    print(df)

    path_list = df['path'].tolist()
    # Every clip is decoded eagerly, so the whole corpus is held in memory.
    full_dataset = Dataset.from_dict({
        'path': path_list,
        'sentence': df['sentence'].tolist(),
        'audio': [
            {
                'path': path,
                'array': convert_mp3_to_numpy(path),
                'sampling_rate': TARGET_SAMPLING_RATE,
            }
            for path in path_list
        ],
    })

    # A fixed seed keeps the 80/20 split reproducible between runs.
    return full_dataset.train_test_split(test_size=0.2, seed=42)


# The metadata files were prepared with:
# cat ../../IMS-Toucan_May_2023/Data/Fiftylangmale/metadata_base.csv | cut -d'|' -f1,2 > Data/Fiftylangmale/metadata_base.csv
# head -4 Data/Fiftylangmale/metadata_base.csv > Data/Fiftylangmale/metadata_small.csv
# Audio lives in:
# /home/haroon/git_repos/whisper_related/community-events/Data/Fiftylangmale/mp3/
base_data_dir = '/home/haroon/git_repos/whisper_related/community-events/Data'
audio_dir = f'{base_data_dir}/Fiftylangmale/mp3/'
csv_file = f'{base_data_dir}/Fiftylangmale/metadata_small.csv'

# NOTE: this runs on import as well as when executed as a script.
dataset_dict = load_dataset(csv_file=csv_file, audio_dir=audio_dir)

# Notes on the structure of the Common Voice dataset (interactive session):
#
# a = common_voice
# type(a) -> datasets.dataset_dict.DatasetDict
# a.keys() -> 'train'
# type(a['train']) -> datasets.arrow_dataset.Dataset
# a['train'] -> Dataset({ features: ['client_id', 'path', 'audio', 'sentence',
#     'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment']
# type(a['train']['path']) -> list
# type(a['train']['sentence']) -> list
# type(a['train']['audio']) -> list
# type(a['train']['path'][0]) -> str
# a['train']['path'][0] -> '/home/haroon/.cache/huggingface/datasets/downloads/extracted/19da7992f84c9f6fbb0b9f00f7d850f460c81cf35b4cf1f0c78fee7c0a9ceec8/hi_train_0/common_voice_hi_26008353.mp3'
# type(a['train']['sentence'][0]) -> str
# a['train']['sentence'][0] -> 'हमने उसका जन्मदिन मनाया।'
# audio0 = a['train']['audio'][0]
# type(audio0) -> dict
# audio0.keys() -> 'path', 'array', 'sampling_rate'
# type(audio0['path']) -> str
# audio0['path'] -> '/home/haroon/.cache/huggingface/datasets/downloads/extracted/19da7992f84c9f6fbb0b9f00f7d850f460c81cf35b4cf1f0c78fee7c0a9ceec8/hi_train_0/common_voice_hi_26008353.mp3'
# type(audio0['array']) -> numpy.ndarray
# audio0_array = audio0['array']
# type(audio0_array[0]) -> numpy.float64
# type(audio0['sampling_rate']) -> int
# audio0['sampling_rate'] -> 48000
#
# After removing the unused columns:
#
# print(common_voice["train"][0].keys())
# common_voice["train"][0] --> keys: 'audio', 'sentence'
# common_voice["train"][0]['audio'] -> keys: 'path': str, 'array': list(float), 'sampling_rate': int
# common_voice["train"][0]['sentence'] -> text