""" Code by Nathan Fradet https://github.com/Natooz """ """ Reorganised from his original Jupyter Notebook into a straight-forward code for quick execution on a supercomputing cluster """ from copy import deepcopy from pathlib import Path from random import shuffle from torch import Tensor, argmax from torch.utils.data import DataLoader from torch.cuda import is_available as cuda_available, is_bf16_supported from torch.backends.mps import is_available as mps_available from transformers import AutoModelForCausalLM, MistralConfig, Trainer, TrainingArguments, GenerationConfig from transformers.trainer_utils import set_seed from evaluate import load as load_metric from miditok import REMI, TokenizerConfig from miditok.pytorch_data import DatasetTok, DataCollator from tqdm import tqdm # Seed set_seed(777) # Our tokenizer's configuration PITCH_RANGE = (21, 109) BEAT_RES = {(0, 1): 8, (1, 2): 4, (2, 4): 2, (4, 8): 1} NUM_VELOCITIES = 24 SPECIAL_TOKENS = ["PAD", "MASK", "BOS", "EOS"] USE_CHORDS = False USE_RESTS = False USE_TEMPOS = True USE_TIME_SIGNATURE = False USE_PROGRAMS = False NUM_TEMPOS = 32 TEMPO_RANGE = (50, 200) # (min_tempo, max_tempo) TOKENIZER_PARAMS = { "pitch_range": PITCH_RANGE, "beat_res": BEAT_RES, "num_velocities": NUM_VELOCITIES, "special_tokens": SPECIAL_TOKENS, "use_chords": USE_CHORDS, "use_rests": USE_RESTS, "use_tempos": USE_TEMPOS, "use_time_signatures": USE_TIME_SIGNATURE, "use_programs": USE_PROGRAMS, "num_tempos": NUM_TEMPOS, "tempo_range": TEMPO_RANGE, } config = TokenizerConfig(**TOKENIZER_PARAMS) # Creates the tokenizer tokenizer = REMI(config) # Trains the tokenizer with Byte Pair Encoding (BPE) to build the vocabulary, here 10k tokens midi_paths = list(Path('Maestro').glob('**/*.mid')) + list(Path('Maestro').glob('**/*.midi')) print(midi_paths[:5]) tokenizer.learn_bpe( vocab_size=1000, files_paths=midi_paths, start_from_empty_voc=False, ) tokenizer.save_params("tokenizer.json") # Split MIDI paths in train/valid/test sets total_num_files = len(midi_paths) num_files_valid = round(total_num_files * 0.2) num_files_test = round(total_num_files * 0.1) shuffle(midi_paths) midi_paths_valid = midi_paths[:num_files_valid] midi_paths_test = midi_paths[num_files_valid:num_files_valid + num_files_test] midi_paths_train = midi_paths[num_files_valid + num_files_test:] # Loads tokens and create data collator kwargs_dataset = {"min_seq_len": 256, "max_seq_len": 1024, "tokenizer": tokenizer} dataset_train = DatasetTok(midi_paths_train, **kwargs_dataset) dataset_valid = DatasetTok(midi_paths_valid, **kwargs_dataset) dataset_test = DatasetTok(midi_paths_test, **kwargs_dataset) collator = DataCollator( tokenizer["PAD_None"], tokenizer["BOS_None"], tokenizer["EOS_None"] ) model_config = MistralConfig( vocab_size=len(tokenizer), hidden_size=512, intermediate_size=2048, num_hidden_layers=8, num_attention_heads=8, num_key_value_heads=4, sliding_window=256, max_position_embeddings=8192, pad_token_id=tokenizer['PAD_None'], bos_token_id=tokenizer['BOS_None'], eos_token_id=tokenizer['EOS_None'], ) # Creates model using the correct configuration model = AutoModelForCausalLM.from_config(model_config) metrics = {metric: load_metric(metric) for metric in ["accuracy"]} def compute_metrics(eval_pred): """ Compute metrics for pretraining. Must use preprocess_logits function that converts logits to predictions (argmax or sampling). 
def compute_metrics(eval_pred):
    """
    Compute metrics for pretraining.

    Must be used with a preprocess_logits function that converts logits to
    predictions (argmax or sampling).

    :param eval_pred: EvalPrediction containing predictions and labels
    :return: metrics
    """
    predictions, labels = eval_pred
    not_pad_mask = labels != -100
    labels, predictions = labels[not_pad_mask], predictions[not_pad_mask]
    return metrics["accuracy"].compute(predictions=predictions.flatten(), references=labels.flatten())


def preprocess_logits(logits: Tensor, _: Tensor) -> Tensor:
    """
    Preprocess the logits before accumulating them during evaluation.

    This significantly reduces memory usage and makes the training tractable.
    """
    pred_ids = argmax(logits, dim=-1)  # long dtype
    return pred_ids


# Create the configuration for the Trainer
USE_CUDA = cuda_available()
if not cuda_available():
    FP16 = FP16_EVAL = BF16 = BF16_EVAL = False
elif is_bf16_supported():
    BF16 = BF16_EVAL = True
    FP16 = FP16_EVAL = False
else:
    BF16 = BF16_EVAL = False
    FP16 = FP16_EVAL = True
USE_MPS = not USE_CUDA and mps_available()
training_config = TrainingArguments(
    output_dir="runs",
    overwrite_output_dir=False,
    do_train=True,
    do_eval=True,
    do_predict=False,
    evaluation_strategy="steps",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=48,
    gradient_accumulation_steps=3,
    eval_accumulation_steps=None,
    eval_steps=100,
    learning_rate=1e-4,
    weight_decay=0.01,
    max_grad_norm=3.0,
    max_steps=1000,
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.3,
    log_level="debug",
    logging_strategy="steps",
    logging_steps=20,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=5,
    no_cuda=not USE_CUDA,
    seed=444,
    fp16=FP16,
    fp16_full_eval=FP16_EVAL,
    bf16=BF16,
    bf16_full_eval=BF16_EVAL,
    load_best_model_at_end=True,
    label_smoothing_factor=0.,
    optim="adamw_torch",
    report_to=["tensorboard"],
    gradient_checkpointing=True,
)

trainer = Trainer(
    model=model,
    args=training_config,
    data_collator=collator,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    compute_metrics=compute_metrics,
    callbacks=None,
    preprocess_logits_for_metrics=preprocess_logits,
)

# Training
train_result = trainer.train()
trainer.save_model()  # Saves the tokenizer too
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()
trainer.push_to_hub()
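
# Optional follow-up: a minimal generation sketch, not part of the training run
# above. The sampling settings are illustrative assumptions, not tuned values,
# and the decoding calls assume the miditok 2.x-era API used in this script
# (DatasetTok / tokens_to_midi returning a miditoolkit MidiFile).
gen_config = GenerationConfig(
    max_new_tokens=512,
    do_sample=True,
    temperature=0.9,
    top_k=15,
    top_p=0.95,
    pad_token_id=tokenizer["PAD_None"],
)
model.eval()
# Prompt the model with the first test sequence (DatasetTok items expose their
# token ids under the "input_ids" key), adding a batch dimension.
prompt = dataset_test[0]["input_ids"].unsqueeze(0).to(model.device)
generated = model.generate(prompt, generation_config=gen_config)
# Decode the generated ids back to MIDI. With use_programs=False, REMI expects
# one token sequence per track, hence the wrapping list.
midi = tokenizer.tokens_to_midi([generated[0].tolist()])
midi.dump("generated.mid")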