"""Evaluate a fine-tuned Whisper checkpoint on several speech datasets.

TODO:
    + [ ] Data load
    + [ ] Train / Test / Dev split
    + [ ] Train / Test Phase
    + [ ] Logging with Train / Dev / Test Loss
    + [ ] Evaluation metrics
"""
import pdb
import string
from pathlib import Path

import evaluate
import librosa
import torch
import torch.nn as nn
from datasets import Dataset, concatenate_datasets, load_dataset
from transformers import AutoProcessor

# Checkpoint directory of the fine-tuned model that is loaded further down.
fine_tuning_dir = "/home/kevingeng/Disk2/laronix/laronix_automos/fine_tuned/SSD/model/Org_Tony_John_Negel_Train_557_Dev_79_Test_81/checkpoint-160"

wer = evaluate.load("wer")
torch.cuda.set_device("cuda:0")

# Data directories (each is loaded with the "audiofolder" dataset builder).
audio_dir = "./data/Patient_sil_trim_16k_normed_5_snr_40"
healthy_dir = "./data/Healthy"
Fary_PAL_30 = "./data/Fary_PAL_p326_20230110_30"
John_p326 = "./data/John_p326/output"
John_video = "./data/20230103_video"

## train
p326_300_dir = "./data/John_p326_large"
P1tony_arthur = "data/Participant1_Tony_Recording/CLEAN_SENTENCES/SCRIPTED/Arthur_the_Rat/PAL"
P1tony_rainbow = "data/Participant1_Tony_Recording/CLEAN_SENTENCES/SCRIPTED/Rainbow_Passage/Laronix"
P1tony = "data/Participant1_Tony_Recording/CLEAN_SENTENCES/CONVERSATIONAL/PAL"
P4Negel = 'data/4_negal_152_clean_all'

# Sampling rate every example is brought to by `dataclean`.
_TARGET_SAMPLING_RATE = 16000
# Built once: deletes every ASCII punctuation character from a transcription.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def dataclean(example: dict) -> dict:
    """Resample one example to 16 kHz if needed and normalise its transcription.

    Args:
        example: A dataset row with an ``"audio"`` dict (keys ``"path"``,
            ``"array"``, ``"sampling_rate"``) and a ``"transcription"`` string.

    Returns:
        A dict of columns to update. It always contains the upper-cased,
        punctuation-free ``"transcription"``; it contains a replacement
        ``"audio"`` entry only when the audio had to be resampled.
    """
    cleaned = {}
    audio = example["audio"]
    if audio["sampling_rate"] != _TARGET_SAMPLING_RATE:
        cleaned["audio"] = {
            "path": audio["path"],
            "array": librosa.resample(
                y=audio["array"],
                orig_sr=audio["sampling_rate"],
                target_sr=_TARGET_SAMPLING_RATE,
            ),
            "sampling_rate": _TARGET_SAMPLING_RATE,
        }
    cleaned["transcription"] = (
        example["transcription"].upper().translate(_PUNCTUATION_TABLE)
    )
    return cleaned


# Tony: conversational recordings plus the two scripted passages.
P1tony_dataset = load_dataset("audiofolder", data_dir=P1tony, split="train")
P1tony_dataset = P1tony_dataset.map(dataclean)

P1tony_scripted1 = load_dataset(
    "audiofolder", data_dir=P1tony_rainbow, split="train"
)
P1tony_scripted2 = load_dataset(
    "audiofolder", data_dir=P1tony_arthur, split="train"
)
P1tony_scripted1 = P1tony_scripted1.map(dataclean)
P1tony_scripted2 = P1tony_scripted2.map(dataclean)
P1tony_scripted = concatenate_datasets([P1tony_scripted1, P1tony_scripted2])


class ChangeSampleRate(nn.Module):
    """Resample a waveform by linear interpolation between neighbouring samples."""

    def __init__(self, input_rate: int, output_rate: int):
        super().__init__()
        self.output_rate = output_rate
        self.input_rate = input_rate

    def forward(self, wav: torch.Tensor) -> torch.Tensor:
        """Return `wav` resampled from `input_rate` to `output_rate`.

        The input is flattened to ``(wav.size(0), -1)``, so only 1-channel
        waveforms are supported.
        """
        # Only accepts 1-channel waveform input
        wav = wav.view(wav.size(0), -1)
        new_length = wav.size(-1) * self.output_rate // self.input_rate
        # Fractional source positions, created on the input's device so that
        # the interpolation below also works for CUDA tensors.
        indices = torch.arange(new_length, device=wav.device) * (
            self.input_rate / self.output_rate
        )
        lower = indices.long()
        # Clamp so the last position does not index past the final sample.
        upper = (lower + 1).clamp(max=wav.size(-1) - 1)
        frac = indices.fmod(1.0)
        output = wav[:, lower] * (1.0 - frac).unsqueeze(0) + wav[
            :, upper
        ] * frac.unsqueeze(0)
        return output


# NOTE: `dataclean` (resample + text clean) is defined near the top of the
# file; the identical second definition that used to sit here was removed.

# processor = AutoFeatureExtractor.from_pretrained(
#     "facebook/wav2vec2-base-960h"
# )
# NOTE(review): `processor` is rebound to a WhisperProcessor further down, so
# this wav2vec2 processor is only live until that point.
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")


def prepare_dataset(batch):
    """Encode one example with the module-level `processor`.

    Adds ``"input_length"`` (length of the first ``input_values`` entry) to
    the processor output and returns it. Only referenced from commented-out
    code in this file.
    """
    audio = batch["audio"]
    batch = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        text=batch["transcription"],
    )
    batch["input_length"] = len(batch["input_values"][0])
    return batch


src_dataset = load_dataset("audiofolder", data_dir=audio_dir, split="train")
src_dataset = src_dataset.map(dataclean)

p326_300_dataset = load_dataset(
    "audiofolder", data_dir=p326_300_dir, split="train"
)
p326_300_dataset = p326_300_dataset.map(dataclean)

P4Negel_dataset = load_dataset("audiofolder", data_dir=P4Negel, split="train")
P4Negel_dataset = P4Negel_dataset.map(dataclean)

healthy_test_dataset = load_dataset(
    "audiofolder", data_dir=healthy_dir, split="train"
)
healthy_test_dataset = healthy_test_dataset.map(dataclean)

Fary_PAL_test_dataset = load_dataset(
    "audiofolder", data_dir=Fary_PAL_30, split="train"
)
Fary_PAL_test_dataset = Fary_PAL_test_dataset.map(dataclean)

John_p326_test_dataset = load_dataset(
    "audiofolder", data_dir=John_p326, split="train"
)
John_p326_test_dataset = John_p326_test_dataset.map(dataclean)

John_video_test_dataset = load_dataset(
    "audiofolder", data_dir=John_video, split="train"
)
John_video_test_dataset = John_video_test_dataset.map(dataclean)


def train_dev_test_split(
    dataset: Dataset, dev_rate=0.1, test_rate=0.1, seed=1
):
    """Split `dataset` into train, dev and test subsets.

    Args:
        dataset: The dataset to split.
        dev_rate: Dev-set size as a fraction of the *full* dataset length.
        test_rate: Test-set fraction passed to ``train_test_split``.
        seed: Seed used for both shuffles.

    Returns:
        A ``(train, dev, test)`` tuple. When what remains after removing the
        test set is no larger than the requested dev size, all of it becomes
        the dev set and `train` is an empty dataset with ``"audio"`` and
        ``"transcription"`` columns.
    """
    train_dev_test = dataset.train_test_split(test_size=test_rate, seed=seed)
    test = train_dev_test["test"]
    train_dev = train_dev_test["train"]

    dev_size = int(len(dataset) * dev_rate)
    if len(train_dev) <= dev_size:
        # Not enough data left for a separate training portion.
        train = Dataset.from_dict({"audio": [], "transcription": []})
        dev = train_dev
    else:
        train_dev = train_dev.train_test_split(test_size=dev_size, seed=seed)
        train = train_dev["train"]
        dev = train_dev["test"]
    return train, dev, test


P1tony_train, P1tony_dev, P1tony_test = train_dev_test_split(
    P1tony_dataset, dev_rate=0.5, test_rate=0.5, seed=1
)
P1tony_train_ = concatenate_datasets([P1tony_train, P1tony_scripted])

# train_dev / test
ds = src_dataset.train_test_split(test_size=0.1, seed=1)

# dataset_libri = load_dataset(
#     "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
# )

train_dev = ds["train"]
# train / dev
train_dev = train_dev.train_test_split(
    test_size=int(len(src_dataset) * 0.1), seed=1
)

# Tony
Tony_train = P1tony_train_
Tony_dev = P1tony_dev
Tony_test = P1tony_test

# John
John_train, John_dev, John_test = train_dev_test_split(
    p326_300_dataset, dev_rate=0.1, test_rate=0.1
)
# Negel
Negel_train, Negel_dev, Negel_test = train_dev_test_split(
    P4Negel_dataset, dev_rate=0.1, test_rate=0.1
)

# train/dev/test
train = train_dev["train"]
test = ds["test"]
dev = train_dev["test"]

# combined
combine_train = concatenate_datasets([train, Tony_train, John_train, Negel_train])
conbine_dev = concatenate_datasets([dev, Tony_dev, John_dev, Negel_dev])
conbine_test = concatenate_datasets([test, Tony_test, John_test, Negel_test])

# encoded_train = combine_train.map(prepare_dataset, num_proc=4)
# encoded_dev = conbine_dev.map(prepare_dataset, num_proc=4)
# encoded_test = conbine_test.map(prepare_dataset, num_proc=4)

# # extra_test
# encoded_Fary = Fary_PAL_test_dataset.map(prepare_dataset, num_proc=4)
# encoded_healthy = healthy_test_dataset.map(prepare_dataset, num_proc=4)

# encoded_ori_test = test.map(prepare_dataset, num_proc=4)
# encoded_Tony_test = Tony_test.map(prepare_dataset, num_proc=4)
# encoded_John_test = John_test.map(prepare_dataset, num_proc=4)
# encoded_Negel_test = Negel_test.map(prepare_dataset, num_proc=4)

# encoded_train = train.map(prepare_dataset, num_proc=4)
# encoded_dev = dev.map(prepare_dataset, num_proc=4)
# p326_encoded_train = p326_300_dataset.map(prepare_dataset, num_proc=4)

# combine large p326 in to training set
# encoded_train = concatenate_datasets([encoded_train, p326_encoded_train])

# encoded_John_p326 = John_p326_test_dataset.map(prepare_dataset, num_proc=4)
# encoded_John_video = John_video_test_dataset.map(prepare_dataset, num_proc=4)

# pdb.set_trace()
import numpy as np

WER = evaluate.load("wer")

## Whisper decoding
from transformers import (Seq2SeqTrainer, Seq2SeqTrainingArguments,
                          WhisperFeatureExtractor,
                          WhisperForConditionalGeneration, WhisperModel,
                          WhisperProcessor, WhisperTokenizer)

processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
# Weights come from the fine-tuned checkpoint; processor, tokenizer and
# feature extractor come from the base "openai/whisper-medium" model.
model = WhisperForConditionalGeneration.from_pretrained(
    fine_tuning_dir,
).to("cuda:0")
# model = WhisperForConditionalGeneration.from_pretrained(
#     "openai/whisper-medium",
# ).to("cuda:0")
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-medium", language="English", task="transcribe"
)

from pathlib import Path

# NOTE(review): `id` shadows the builtin; kept because it is a module-level
# name that the commented-out push_to_hub calls refer to.
id = Path(fine_tuning_dir).stem
# NOTE(review): interactive breakpoint; the script stops here when run.
pdb.set_trace()
# tokenizer.push_to_hub("KevinGeng/%s" % id)
# import pdb
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-medium"
)


def whisper_prepare_dataset(batch):
    """Add Whisper ``input_features`` and ``labels`` to one example."""
    # The audio is used as-is here; resampling to 16 kHz is done by `dataclean`.
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["transcription"]).input_ids
    return batch


torch.cuda.empty_cache()


def my_map_to_pred(batch):
    """Transcribe one example with `model` and store normalised texts.

    Writes ``batch["reference"]`` (normalised ground-truth transcription) and
    ``batch["prediction"]`` (normalised model output), then returns `batch`.
    """
    audio = batch["audio"]
    input_features = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
    ).input_features
    batch["reference"] = processor.tokenizer._normalize(batch["transcription"])

    with torch.no_grad():
        predicted_ids = model.generate(input_features.to(model.device))[0]
    # Decoding and normalisation live on the processor/tokenizer; the model
    # object has neither a `decode` method nor a `tokenizer` attribute.
    transcription = processor.decode(predicted_ids)
    batch["prediction"] = processor.tokenizer._normalize(transcription)
    return batch


from dataclasses import dataclass
from typing import Any, Dict, List, Union

import torch


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate examples into a padded batch of input features and labels."""

    # Object exposing `.feature_extractor` and `.tokenizer`, each with `.pad`.
    processor: Any

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Pad ``input_features`` and ``labels`` of `features` into one batch."""
        # split inputs and labels since they have to be of different lengths
        # and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [
            {"input_features": feature["input_features"]}
            for feature in features
        ]
        batch = self.processor.feature_extractor.pad(
            input_features, return_tensors="pt"
        )

        # get the tokenized label sequences
        label_features = [
            {"input_ids": feature["labels"]} for feature in features
        ]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(
            label_features, return_tensors="pt"
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's appended later anyway
        if (
            (labels[:, 0] == self.processor.tokenizer.bos_token_id)
            .all()
            .cpu()
            .item()
        ):
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)


def compute_metrics(pred):
    """Return ``{"wer": <word error rate in percent>}`` for a prediction output.

    Args:
        pred: Object with ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 marking padding).
    """
    pred_ids = pred.predictions

    # replace -100 with the pad_token_id (on a copy, so the caller's array
    # is left untouched)
    label_ids = np.where(
        pred.label_ids == -100, tokenizer.pad_token_id, pred.label_ids
    )

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * WER.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


encoded_train = combine_train.map(whisper_prepare_dataset, num_proc=4)
encoded_dev = conbine_dev.map(whisper_prepare_dataset, num_proc=4)
encoded_test = conbine_test.map(whisper_prepare_dataset, num_proc=4)

# extra_test
# The trailing "a / b" notes were left by the original author from an earlier
# run (the raw per-example boolean dump for John, all False, is condensed).
encoded_ori_test = test.map(whisper_prepare_dataset, num_proc=4)  # 7 / 16
encoded_Tony_test = Tony_test.map(whisper_prepare_dataset, num_proc=4)  # 0 / 19
encoded_John_test = John_test.map(whisper_prepare_dataset, num_proc=4)  # 0 / 30
# The trailing "a / b" notes were left by the original author from an earlier
# run; in the boolean dumps that used to follow them, `a` matched the number
# of True entries (transcriptions not found in train+dev) out of `b` examples.
encoded_Negel_test = Negel_test.map(whisper_prepare_dataset, num_proc=4)  # 12 / 16
encoded_Fary = Fary_PAL_test_dataset.map(whisper_prepare_dataset, num_proc=4)  # 12 / 30
encoded_healthy = healthy_test_dataset.map(whisper_prepare_dataset, num_proc=4)  # 5 / 160

# Make sure the content variability: build the set of transcriptions already
# present in train and dev so they can be filtered out of the test sets.
train_tuple = tuple(encoded_train['transcription'])
dev_tuple = tuple(encoded_dev['transcription'])
train_dev_tuple = (train_tuple + dev_tuple)
# Same contents as `train_dev_tuple`, but with O(1) membership tests.
_train_dev_transcriptions = frozenset(train_dev_tuple)


def _select_unseen(dataset, seen_transcriptions=_train_dev_transcriptions):
    """Return the rows of `dataset` whose transcription is not in `seen_transcriptions`."""
    unseen_mask = np.array(
        [text not in seen_transcriptions for text in dataset['transcription']],
        dtype=bool,
    )
    return dataset.select(np.flatnonzero(unseen_mask))


# NOTE(review): interactive breakpoint; the script stops here when run.
pdb.set_trace()
new_encoded_test = _select_unseen(encoded_test)
new_encoded_ori_test = _select_unseen(encoded_ori_test)
new_encoded_Tony_test = _select_unseen(encoded_Tony_test)
new_encoded_John_test = _select_unseen(encoded_John_test)
new_encoded_Negel_test = _select_unseen(encoded_Negel_test)
new_encoded_Fary = _select_unseen(encoded_Fary)
new_encoded_healthy = _select_unseen(encoded_healthy)

pdb.set_trace()
torch.cuda.empty_cache()

training_args = Seq2SeqTrainingArguments(
    output_dir=fine_tuning_dir,  # change to a repo name of your choice
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=50,
    max_steps=1000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    save_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=512,
    save_steps=20,
    eval_steps=20,
    logging_steps=10,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    save_total_limit=10,
    push_to_hub=False,
)
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments

# Only `trainer.evaluate` is called below; no training is started here.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=encoded_train,
    eval_dataset=encoded_dev,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
)
# callbacks=[EvalLoggingCallback()]
pdb.set_trace()

# Metrics on the full (unfiltered) test sets.
result_dict = {
    "Ori_Test": trainer.evaluate(encoded_ori_test),
    "Tony_Test": trainer.evaluate(encoded_Tony_test),
    "John_Test": trainer.evaluate(encoded_John_test),
    "Negel_Test": trainer.evaluate(encoded_Negel_test),
    "Zeroshot_Fary_Test": trainer.evaluate(encoded_Fary),
    "Healthy_Test": trainer.evaluate(encoded_healthy),
}

# print(result_dict)
pdb.set_trace()
trainer.evaluate(encoded_test)
trainer.evaluate(new_encoded_test)

# Metrics on the test sets restricted to transcriptions unseen in train+dev.
# Trailing numbers are values noted by the original author from an earlier run.
new_result_dict = {
    "Ori_Test": trainer.evaluate(new_encoded_ori_test),  # 'eval_wer': 12.345679012345679,
    "Tony_Test": trainer.evaluate(new_encoded_Tony_test),  # 'eval_wer': 25.0,
    "John_Test": trainer.evaluate(new_encoded_John_test),
    "Negel_Test": trainer.evaluate(new_encoded_Negel_test),  # 2.08
    "Zeroshot_Fary_Test": trainer.evaluate(new_encoded_Fary),  ## 11.49
    "Healthy_Test": trainer.evaluate(new_encoded_healthy),
}
print(new_result_dict)
# pdb.set_trace()
# result_dict = {
#     "Ori_Test": trainer.evaluate(encoded_ori_test),
#     "Tony_Test": trainer.evaluate(encoded_Tony_test),
#     "John_Test": trainer.evaluate(encoded_John_test),
#     "Negel_Test": trainer.evaluate(encoded_Negel_test),
#     "Zeroshot_Fary_Test": trainer.evaluate(encoded_Fary),
#     "Healthy_Test": trainer.evaluate(encoded_healthy),
# }

# pdb.set_trace()
# # Evaluation
# model.push_to_hub("KevinGeng/%s" % id)