import os

import torch

# Resolve the project root; Hugging Face caches are kept one level above the working directory.
abs_path = os.path.abspath('.')
base_dir = os.path.dirname(abs_path)

# Point the Transformers / Datasets caches at local folders and keep online access enabled.
os.environ['TRANSFORMERS_CACHE'] = os.path.join(base_dir, 'models_cache')
os.environ['TRANSFORMERS_OFFLINE'] = '0'
os.environ['HF_DATASETS_CACHE'] = os.path.join(base_dir, 'datasets_cache')
os.environ['HF_DATASETS_OFFLINE'] = '0'

# Train on the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"\n\n Device to be used: {device} \n\n")

# Base checkpoint, target language and task.
model_name = "openai/whisper-tiny"
language = "Odia"
task = "transcribe"

print(f"\n\n Loading {model_name} for {language} ({task})...this might take a while.. \n\n")

# Training hyperparameters.
output_dir = "./"
overwrite_output_dir = True
max_steps = 16000

per_device_train_batch_size = 8
per_device_eval_batch_size = 2
gradient_accumulation_steps = 1
dataloader_num_workers = 0
gradient_checkpointing = False

evaluation_strategy = "steps"
eval_steps = 1000
save_strategy = "steps"
save_steps = 1000
save_total_limit = 5

learning_rate = 1e-5
lr_scheduler_type = "cosine"
warmup_steps = 8000
logging_steps = 25
weight_decay = 0
dropout = 0.1

load_best_model_at_end = True
metric_for_best_model = "wer"
greater_is_better = False  # lower WER is better

bf16 = True
tf32 = True

generation_max_length = 448
report_to = ["tensorboard"]
predict_with_generate = True
push_to_hub = True

freeze_feature_encoder = False
early_stopping_patience = 10
apply_spec_augment = True
torch_compile = False
optim = "adamw_hf"
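
# With these settings on a single GPU, the effective batch size is
# per_device_train_batch_size * gradient_accumulation_steps = 8 * 1 = 8 examples
# per optimizer step, so 16,000 max_steps correspond to roughly 128,000 training
# examples seen (with repetition, depending on corpus size). bf16/tf32 assume an
# Ampere-or-newer NVIDIA GPU; on older hardware, switch to fp16 or full precision.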

print("\n\n Loading Datasets...this might take a while..\n\n")

from datasets import load_dataset, DatasetDict, Audio, concatenate_datasets

openslr = DatasetDict()
my_dataset = DatasetDict()

# OpenSLR-53 Odia speech corpus hosted on the Hub; train and validation splits are merged for training.
openslr["train"] = load_dataset("Ranjit/or_in_dataset", split="train+validation", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
openslr["test"] = load_dataset("Ranjit/or_in_dataset", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)

print("\n\n Datasets Loaded \n\n")

print("\n OpenSLR-53 - Odia \n")
print(openslr)

# Resample all audio to the 16 kHz expected by the Whisper feature extractor
# (resampling happens lazily when examples are accessed), standardise the
# transcript column name, and drop every other column.
sampling_rate = 16000
openslr = openslr.cast_column("audio", Audio(sampling_rate=sampling_rate))
openslr = openslr.rename_column("transcription", "sentence")
openslr = openslr.remove_columns(
    list(set(openslr['train'].features.keys()) - {"audio", "sentence"})
)

print(f'OpenSLR: {openslr["train"][0]["audio"]["array"].dtype}')
print("\n")

# Only one corpus is used here; concatenate_datasets keeps the merge step in place
# so further corpora can be appended to the lists below.
my_dataset['train'] = concatenate_datasets([openslr['train']])
my_dataset['test'] = concatenate_datasets([openslr['test']])

my_dataset['train'] = my_dataset['train'].shuffle(seed=10)

print("\n\n FINAL DATASET after merging and shuffling: ")
print(my_dataset)
print("\n")

print("\n\n Preprocessing Datasets...this might take a while..\n\n")

from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
tokenizer = WhisperTokenizer.from_pretrained(model_name, task=task)
processor = WhisperProcessor.from_pretrained(model_name, task=task)


def prepare_dataset(batch):
    # Compute log-Mel input features from the (already resampled) audio.
    audio = batch["audio"]
    inputs = processor.feature_extractor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
    )
    batch["input_features"] = inputs.input_features[0]

    # Encode the transcript into label token ids for the decoder.
    transcription = batch["sentence"]
    batch["labels"] = tokenizer(transcription).input_ids

    return batch


my_dataset = my_dataset.map(
    prepare_dataset,
    num_proc=1,
    load_from_cache_file=True,
    cache_file_names={
        "train": os.path.join(base_dir, 'datasets_cache', 'preprocessed_train_cache_8.arrow'),
        "test": os.path.join(base_dir, 'datasets_cache', 'preprocessed_test_cache_8.arrow'),
    },
)

print("\n\n AFTER PREPROCESSING, final train and test sets are: ")
print(my_dataset)
print("\n")
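
# Optional sanity check (illustrative only, not part of the training flow): each
# preprocessed example should hold an 80 x 3000 log-Mel feature matrix
# (whisper-tiny, 30 s window) and a short list of label token ids.
# sample = my_dataset["train"][0]
# print(len(sample["input_features"]), len(sample["input_features"][0]), len(sample["labels"]))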

print("\n Removing UNUSED Cache Files: \n")
try:
    # cleanup_cache_files() returns the number of cache files it removed.
    print(f"{openslr.cleanup_cache_files()} for openslr")
    print(f"{my_dataset.cleanup_cache_files()} for my_dataset")
except Exception as e:
    print(f"\n\n UNABLE to REMOVE some Cache files. \n Error: {e} \n\n")

from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Pad the log-Mel input features to the longest example in the batch and return PyTorch tensors.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Pad the label sequences and replace padding with -100 so it is ignored by the loss.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If the tokenizer already prepended the decoder start token, cut it off here;
        # the model prepends it again when shifting the labels to the right.
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)
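
# Illustrative usage only: the collator takes a list of preprocessed examples and
# returns padded tensors, e.g.
# batch = data_collator([my_dataset["train"][i] for i in range(2)])
# batch["input_features"].shape  -> torch.Size([2, 80, 3000])
# batch["labels"].shape          -> torch.Size([2, longest_label_length])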

import evaluate

wer_metric = evaluate.load("wer", cache_dir=os.path.join(base_dir, "metrics_cache"))
cer_metric = evaluate.load("cer", cache_dir=os.path.join(base_dir, "metrics_cache"))

# Optional text normalisation before scoring (off by default). BasicTextNormalizer
# lowercases and strips punctuation, which is a reasonable choice for non-English text.
do_normalize_eval = False
if do_normalize_eval:
    from transformers.models.whisper.english_normalizer import BasicTextNormalizer
    normalizer = BasicTextNormalizer()


def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Replace the -100 loss-masking value with the pad token id so the labels can be decoded.
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    if do_normalize_eval:
        pred_str = [normalizer(pred) for pred in pred_str]
        label_str = [normalizer(label) for label in label_str]

    wer = 100 * wer_metric.compute(predictions=pred_str, references=label_str)
    cer = 100 * cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer, "wer": wer}
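
# WER = (substitutions + insertions + deletions) / number of reference words,
# reported above as a percentage. For example (illustrative values),
# wer_metric.compute(predictions=["a b c"], references=["a x c"]) is 1/3,
# i.e. 33.33 after the * 100 scaling; CER applies the same idea at character level.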

print("\n\n Loading Model to Device..\n\n")

# Model configuration for training and generation.
model.config.apply_spec_augment = apply_spec_augment
model.config.max_length = generation_max_length
model.config.dropout = dropout
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
if gradient_checkpointing:
    # The decoding cache is incompatible with gradient checkpointing.
    model.config.use_cache = False
if freeze_feature_encoder:
    model.freeze_feature_encoder()

model.generation_config.max_length = generation_max_length

from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    max_steps=max_steps,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    dataloader_num_workers=dataloader_num_workers,
    evaluation_strategy=evaluation_strategy,
    eval_steps=eval_steps,
    save_strategy=save_strategy,
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_steps=warmup_steps,
    logging_steps=logging_steps,
    weight_decay=weight_decay,
    load_best_model_at_end=load_best_model_at_end,
    metric_for_best_model=metric_for_best_model,
    greater_is_better=greater_is_better,
    bf16=bf16,
    tf32=tf32,
    torch_compile=torch_compile,
    optim=optim,
    generation_max_length=generation_max_length,
    report_to=report_to,
    predict_with_generate=predict_with_generate,
    push_to_hub=push_to_hub,
)
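
# Note: load_best_model_at_end=True requires matching evaluation and save strategies
# (both "steps" here, with save_steps a multiple of eval_steps), and the Trainer
# prefixes metric_for_best_model with "eval_" automatically, so "wer" resolves to
# the "eval_wer" value returned by compute_metrics.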

from transformers import Seq2SeqTrainer, EarlyStoppingCallback

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=my_dataset["train"],
    eval_dataset=my_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)],
)

# Save the processor (feature extractor + tokenizer) so the checkpoints are usable for inference.
processor.save_pretrained("best_model")

print("\n\n Training STARTED..\n\n")

train_result = trainer.train()

print("\n\n Training COMPLETED...\n\n")

print("\n\n Evaluating Model & Saving Metrics...\n\n")

processor.save_pretrained(save_directory=output_dir)

metrics = train_result.metrics
trainer.save_metrics("train", metrics)
trainer.save_state()

metrics = trainer.evaluate(
    metric_key_prefix="eval",
    max_length=training_args.generation_max_length,
    num_beams=training_args.generation_num_beams,
)
trainer.save_metrics("eval", metrics)

if push_to_hub:
    print("\n\n Pushing to Hub...\n\n")
    trainer.create_model_card()
    trainer.push_to_hub()

print("\n\n DONEEEEEE \n\n")