# community-events / whisper-fine-tuning-event /fine-tune-whisper-non-streaming-no_comments_data_loading_only.py
# showgan's picture
# Training in progress, step 1000
# 72621ec verified
#!/home/haroon/python_virtual_envs/whisper_fine_tuning/bin/python
# from datasets import load_dataset, DatasetDict
# common_voice = DatasetDict()
# common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0",
# "hi",
# split="train+validation",
# token=True)
# common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0",
# "hi",
# split="test",
# token=True)
# common_voice = common_voice.remove_columns([
# "accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])
#
# # common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
#
#
# # def prepare_dataset(batch):
# # audio = batch["audio"]
# # audio["array"]
# # audio["sampling_rate"]
import os

import numpy as np
import pandas as pd
import soundfile as sf
from datasets import Dataset, DatasetDict
from scipy.signal import resample
def convert_mp3_to_numpy(mp3_path: str) -> np.ndarray:
    """Load an audio file as a mono, 16000 Hz, float64 NumPy array.

    The file is decoded with ``soundfile.read``. Audio that is not already
    sampled at 16000 Hz is resampled with ``scipy.signal.resample``.

    Args:
        mp3_path: Path of the audio file to decode.

    Returns:
        A one-dimensional ``float64`` array of samples at 16000 Hz.

    Raises:
        ValueError: If the decoded audio is not one-dimensional (i.e. it has
            more than one channel).
    """
    target_rate = 16000

    audio, sample_rate = sf.read(mp3_path)

    # Only single-channel input is accepted; no down-mixing is attempted.
    if audio.ndim != 1:
        raise ValueError("Audio must be mono channel.")

    if sample_rate != target_rate:
        # Multiply before dividing and round to the nearest sample so that
        # floating-point error in the rate ratio cannot drop a sample
        # (e.g. 15999.999... being truncated to 15999).
        target_length = round(audio.shape[0] * target_rate / sample_rate)
        audio = resample(audio, target_length)

    return np.array(audio, dtype=np.float64)
def load_dataset(csv_file: str, audio_dir: str) -> "DatasetDict":
    """Build a train/test dataset from a pipe-separated metadata file.

    Each row of ``csv_file`` is ``<clip name>|<sentence>`` with no header.
    The clip name is turned into ``<audio_dir>/<clip name>.mp3``, and every
    clip is decoded eagerly with ``convert_mp3_to_numpy``.

    Args:
        csv_file: Path of the ``|``-separated metadata file.
        audio_dir: Directory containing the ``.mp3`` clips. A trailing path
            separator is optional.

    Returns:
        The result of ``Dataset.train_test_split(test_size=0.2, seed=42)``,
        whose rows carry the columns ``path``, ``sentence`` and ``audio``
        (a dict with ``path``, ``array`` and ``sampling_rate``).
    """
    # Read both columns as text: with type inference a purely numeric clip
    # name such as "0001" would become the integer 1 and no longer match the
    # file on disk.
    df = pd.read_csv(
        filepath_or_buffer=csv_file,
        sep='|',
        header=None,
        names=['path', 'sentence'],
        dtype=str,
    )

    # os.path.join gives the same result as plain concatenation when
    # audio_dir ends with a separator, and also works when it does not.
    df['path'] = [os.path.join(audio_dir, f'{name}.mp3') for name in df['path']]
    print(df)

    path_list = df['path'].tolist()
    full_dataset = Dataset.from_dict({
        'path': path_list,
        'sentence': df['sentence'].tolist(),
        # All clips are resampled to 16000 Hz by convert_mp3_to_numpy, so the
        # stored sampling rate is that constant.
        'audio': [
            {
                'path': path,
                'array': convert_mp3_to_numpy(path),
                'sampling_rate': 16000,
            }
            for path in path_list
        ],
    })

    # Fixed seed keeps the 80/20 split reproducible between runs.
    return full_dataset.train_test_split(test_size=0.2, seed=42)
# Load data from the CSV file
# cat ../../IMS-Toucan_May_2023/Data/Fiftylangmale/metadata_base.csv | cut -d'|' -f1,2 > Data/Fiftylangmale/metadata_base.csv
# head -4 Data/Fiftylangmale/metadata_base.csv > Data/Fiftylangmale/metadata_small.csv
# /home/haroon/git_repos/whisper_related/community-events/Data/Fiftylangmale/mp3/
# Root folder of the local data checkout; the clip folder and the metadata
# file below are both derived from it.
base_data_dir = '/home/haroon/git_repos/whisper_related/community-events/Data'
audio_dir = base_data_dir + '/Fiftylangmale/mp3/'
csv_file = base_data_dir + '/Fiftylangmale/metadata_small.csv'

# Build the train/test split from the small metadata file as soon as the
# script runs.
dataset_dict = load_dataset(csv_file, audio_dir)
# # Example usage
# mp3_file = "your_audio.mp3" # Replace with your actual MP3 file path
# audio_data = convert_mp3_to_numpy(mp3_file)
#
# # Now you can use the audio_data as a NumPy array
# print(audio_data.shape) # Output: (audio_length,) for mono audio
# print(audio_data.dtype) # Output: float64
'''
a = common_voice
type(a) -> datasets.dataset_dict.DatasetDict
a.keys() -> 'train'
type(a['train']) -> datasets.arrow_dataset.Dataset
a['train'] -> Dataset({
features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment']
type(a['train']['path']) -> list
type(a['train']['sentence']) -> list
type(a['train']['audio']) -> list
type(a['train']['path'][0]) -> str
a['train']['path'][0] -> '/home/haroon/.cache/huggingface/datasets/downloads/extracted/19da7992f84c9f6fbb0b9f00f7d850f460c81cf35b4cf1f0c78fee7c0a9ceec8/hi_train_0/common_voice_hi_26008353.mp3'
type(a['train']['sentence'][0]) -> str
a['train']['sentence'][0] -> 'हमने उसका जन्मदिन मनाया।'
audio0 = a['train']['audio'][0]
type(audio0) -> dict
audio0.keys() -> 'path', 'array', 'sampling_rate'
type(audio0['path']) -> str
audio0['path'] -> '/home/haroon/.cache/huggingface/datasets/downloads/extracted/19da7992f84c9f6fbb0b9f00f7d850f460c81cf35b4cf1f0c78fee7c0a9ceec8/hi_train_0/common_voice_hi_26008353.mp3'
type(audio0['array']) -> numpy.ndarray
audio0_array = audio0['array']
type(audio0_array[0]) -> numpy.float64
type(audio0['sampling_rate']) -> int
audio0['sampling_rate'] -> 48000
'''
'''
print(common_voice["train"][0].keys())
common_voice["train"][0] --> keys: 'audio', 'sentence'
common_voice["train"][0]['audio'] -> keys: 'path': str, 'array': list(float), 'sampling_rate': int
common_voice["train"][0]['sentence'] -> text
'''