In [None]:
# This notebook is currently designed to run on a GPU using fp16. The hyperparameters, however, are barely tuned.

In [None]:
import random
import torch
from pathlib import Path

In [None]:
EXPERIMENT_NAME = '00'
DATA_PATH = Path('../data/common_voice/de')

# Per-experiment output locations: model_dir for this run, log_dir nested inside it.
model_dir = Path('decoder_only/de', EXPERIMENT_NAME)
log_dir = model_dir.joinpath('logs')
log_dir.mkdir(parents=True, exist_ok=True)

# Run configuration. Key order is kept stable because the dict is dumped to JSON as-is.
config = dict(
    # When using all samples the wav2vec-outputs take up ~275GB disk space!!(~360,000 samples)
    use_train_frac=1.0,
    use_val_frac=0.2,
    encoder_id='jonatasgrosman/wav2vec2-large-xlsr-53-german',
    decoder_id='dbmdz/german-gpt2',
    decoder_pad_token='_',
    decoder_bos_token='~',
    num_beams=1,
    batch_size=16,
    weight_decay=0.0,
    accumulate_grad=2,
    max_epochs=10,
    # len(max(tokenizer(common_voice['validation']['sentence'] + common_voice['test']['sentence']).input_ids, key=len))
    max_len=36,
)

# Feature Extraction

In [None]:
from huggingface_hub import notebook_login
from datasets import load_dataset
from datasets.features import Audio
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor

In [None]:
# Log in to the Hugging Face Hub from within the notebook; the Common Voice
# download further down is requested with use_auth_token=True.
notebook_login()

In [None]:
def extract_features_to_files(model, feature_extractor, dataset_split, batch_size, output_path):
    """Run `model` over `dataset_split` and save one feature file per audio clip.

    Args:
        model: Encoder that is called as `model(input_values, attention_mask=...)`
            and returns an object with a `last_hidden_state` attribute.
            It is switched to eval mode and moved to the GPU.
        feature_extractor: Callable that turns a list of raw audio arrays into
            an object exposing `input_values` and `attention_mask`.
        dataset_split: Sliceable dataset. A slice must yield a dict with the
            keys 'sentence' and 'audio'; every audio entry needs 'array' and 'path'.
        batch_size: Number of clips that are sent through the model at once.
        output_path: Directory for the output files (created if missing).

    Each output file is named after the last path component of the clip's
    'path' and holds a dict with the keys 'sentence' and 'wave2vec_features'
    (the clip's hidden states cast to bfloat16). Hidden states are stored as
    returned for the padded batch, i.e. they are not trimmed per clip.

    Clips longer than 300,000 samples are skipped; a batch in which every clip
    is too long is skipped entirely.
    """
    # Remove the longest examples, should be only three and these may lead to OOM- or Index-Errors.
    max_audio_samples = 300_000

    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)

    model.eval().cuda()
    for start in range(0, len(dataset_split), batch_size):
        batch = dataset_split[start:start + batch_size]

        # Build filtered lists instead of popping while iterating: popping
        # shifts the remaining items so the element right after a removed one
        # would never be checked.
        kept = [
            (sent, audio)
            for sent, audio in zip(batch['sentence'], batch['audio'])
            if len(audio['array']) <= max_audio_samples
        ]
        for _ in range(len(batch['audio']) - len(kept)):
            print('Too Long.')
        if not kept:
            # Nothing left to encode in this batch.
            continue

        features = feature_extractor([audio['array'] for _, audio in kept],
                                     sampling_rate=16_000,
                                     return_tensors='pt',
                                     padding='longest')

        with torch.no_grad():
            out = model(features.input_values.cuda(), attention_mask=features.attention_mask.cuda())

        assert len(kept) == len(out.last_hidden_state)
        for (sent, audio), hs in zip(kept, out.last_hidden_state.bfloat16().cpu()):
            file_name = audio['path'].split('/')[-1]
            torch.save(
                # .clone() is necessary: https://github.com/pytorch/pytorch/issues/1995
                {'sentence': sent, 'wave2vec_features': hs.clone()},
                output_path / file_name
            )

In [None]:
# Features are extracted only when the data directory does not exist yet.
if not DATA_PATH.exists():

    common_voice = load_dataset('mozilla-foundation/common_voice_7_0', 'de', use_auth_token=True)

    # Seeded shuffles (train first, then validation) pick the subset of each split.
    random.seed(419)
    train_inds = list(range(len(common_voice['train'])))
    random.shuffle(train_inds)
    val_inds = list(range(len(common_voice['validation'])))
    random.shuffle(val_inds)

    # Keep the configured fraction of each split and decode its audio at 16 kHz.
    n_train_used = int(config['use_train_frac'] * len(train_inds))
    train_inds = train_inds[:n_train_used]
    train = (
        common_voice['train']
        .select(train_inds)
        .cast_column('audio', Audio(sampling_rate=16_000))
    )

    n_val_used = int(config['use_val_frac'] * len(val_inds))
    val_inds = val_inds[:n_val_used]
    val = (
        common_voice['validation']
        .select(val_inds)
        .cast_column('audio', Audio(sampling_rate=16_000))
    )

    # Load Model for feature extraction.
    wave2vec_extractor = Wav2Vec2FeatureExtractor.from_pretrained(config['encoder_id'])
    wave2vec = Wav2Vec2Model.from_pretrained(config['encoder_id'])
    wave2vec.eval().cuda()

    for split_name, split in (('train', train), ('val', val)):
        extract_features_to_files(
            wave2vec,
            wave2vec_extractor,
            split,
            batch_size=8,
            output_path=DATA_PATH / split_name,
        )

    # Release the GPU memory held by the encoder.
    wave2vec.cpu()
    torch.cuda.empty_cache()

# Training

In [None]:
import json
from accelerate import Accelerator
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoTokenizer, Wav2Vec2FeatureExtractor
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2BaseModelOutput
from data_loading import make_collate_fn, S2TDataset
from wer import calculate_wer # Not what's used in eval.py.
from model import Wav2VecGPT2Model

In [None]:
# Decoder tokenizer with the configured pad and BOS symbols registered as special tokens.
tokenizer = AutoTokenizer.from_pretrained(config['decoder_id'])
tokenizer.add_special_tokens({
    'pad_token': config['decoder_pad_token'],
    'bos_token': config['decoder_bos_token'],
})

# Encoder/decoder model built from the two pretrained checkpoints named in the config.
model = Wav2VecGPT2Model.from_encoder_decoder_pretrained(
    config['encoder_id'],
    config['decoder_id'],
    max_length=config['max_len'],
    num_beams=config['num_beams'],
)

# Point the model config at the tokenizer's special token ids.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id

In [None]:
collate_fn = make_collate_fn(tokenizer)

# Options shared by both loaders; only shuffling differs between train and validation.
loader_kwargs = dict(batch_size=config['batch_size'], collate_fn=collate_fn, num_workers=4)

train_ds = S2TDataset(DATA_PATH / 'train')
train_dl = DataLoader(train_ds, shuffle=True, **loader_kwargs)

val_ds = S2TDataset(DATA_PATH / 'val')
val_dl = DataLoader(val_ds, shuffle=False, **loader_kwargs)

In [None]:
# Parameters whose name contains one of these substrings are trained with the
# high learning rate; all remaining decoder parameters get the low one.
# Parameters matched by neither group are not handed to the optimizer.
high_lr_modules = ['cross_attn', 'crossattention', 'enc_to_dec_proj', 'encoder_outputs_pos_emb']


def _is_high_lr(param_name):
    """Return True if `param_name` belongs to one of the high-learning-rate modules."""
    return any(module in param_name for module in high_lr_modules)


high_lr_params = [p for n, p in model.named_parameters() if _is_high_lr(n)]

optimizer_grouped_parameters = [
    {
        "params": high_lr_params,
        "lr": 5e-4,
    },
    {
        "params": [p for n, p in model.decoder.named_parameters() if not _is_high_lr(n)],
        "lr": 1e-6,
    },
]
# Take weight decay from the config so that the dumped config.json always
# reflects the value that was actually used.
optimizer = AdamW(optimizer_grouped_parameters, weight_decay=config['weight_decay'])

In [None]:
# Create the accelerate helper with fp16 enabled.
# NOTE(review): newer accelerate releases appear to configure this through
# `mixed_precision='fp16'` instead of `fp16=True` — confirm against the installed version.
accelerator = Accelerator(fp16=True)
print(f'Using {accelerator.device}.')

In [None]:
# Hand model, optimizer and both dataloaders to accelerate; the returned
# objects replace the originals for the rest of the notebook.
model, optimizer, train_dl, val_dl = accelerator.prepare(model, optimizer, train_dl, val_dl)

In [None]:
# Store the run configuration next to the TensorBoard logs.
with open(log_dir / 'config.json', 'w') as config_file:
    json.dump(config, config_file, indent=4)

writer = SummaryWriter(log_dir)
# Reference transcripts in dataset order; val_dl does not shuffle, so the
# predictions collected below line up with these.
val_golds = [eg['sentence'] for eg in val_ds]
# Start at infinity so that the first evaluation always produces a checkpoint.
best_val_wer = float('inf')
global_train_step = 0

for epoch in range(config['max_epochs']):

    model.train()
    model.encoder.cpu()  # Model gets moved to gpu for evaluation (see below).
    torch.cuda.empty_cache()

    # Batches whose gradients have been accumulated since the last optimizer step.
    # Counting processed batches (instead of using batch_step) keeps the
    # accumulation window intact when a batch is skipped.
    pending_batches = 0
    # NOTE(review): att_mask is not passed to the model here or in generate() —
    # confirm that the model is meant to attend over padded encoder positions.
    for batch_step, (encoder_hidden_states, att_mask, input_ids) in enumerate(train_dl):
        if encoder_hidden_states.shape[1] > 1024:
            # That's too long for the position embeddings.
            # TODO: handle this in model code.
            print(f'SKIPPED: {encoder_hidden_states.shape}')
            continue
        global_train_step += 1

        out = model(labels=input_ids, encoder_outputs=Wav2Vec2BaseModelOutput(encoder_hidden_states))
        accelerator.backward(out.loss)
        loss_value = out.loss.item()
        writer.add_scalar('train_loss', loss_value, global_train_step)

        pending_batches += 1
        if pending_batches >= config['accumulate_grad']:
            optimizer.step()
            optimizer.zero_grad()
            pending_batches = 0

        if batch_step % 300 == 0:
            print(loss_value)

    # Apply gradients of an incomplete accumulation window so they neither get
    # lost nor leak into the next epoch.
    if pending_batches:
        optimizer.step()
        optimizer.zero_grad()

    model.eval()
    model.cuda()  # Necessary for input_ids to be initialized on the correct device.
    val_preds = []
    with torch.no_grad():
        for encoder_hidden_states, att_mask, _ in val_dl:
            generated = model.generate(
                encoder_outputs=Wav2Vec2BaseModelOutput(last_hidden_state=encoder_hidden_states)
            )
            val_preds += tokenizer.batch_decode(generated)
    # Strip the BOS ('~') prefix and pad ('_') suffix from the decoded text.
    val_preds = [pred.lstrip('~').rstrip('_') for pred in val_preds]
    wer = calculate_wer(val_preds, val_golds)
    writer.add_scalar('val_wer', wer, epoch)
    print('WER: ', wer)

    # Keep only the checkpoint with the lowest validation WER so far.
    if wer < best_val_wer:
        torch.save(model.state_dict(), model_dir / 'model.pt')
        print('Saved Model.')
        best_val_wer = wer

# Flush pending events and release the log file.
writer.close()

In [None]:
# # Load saved pytorch model and save with all necessary model files.
# output_path = model_dir /'full_model'
# 
# model.load_state_dict(torch.load(model_dir / 'model.pt'))
# 
# tokenizer.save_pretrained(output_path)
# wave2vec_extractor = Wav2Vec2FeatureExtractor.from_pretrained(config['encoder_id'])
# wave2vec_extractor.save_pretrained(output_path)
# model.save_pretrained(output_path)