# Source: Hugging Face Spaces -- mimbres/YourMT3 (Running on Zero)
# File size: 17,948 Bytes
# Revision: a03c9b4

# Copyright 2024 The YourMT3 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Please see the details in the LICENSE file.
""" tokenizer.py: Encodes and decodes events to/from tokens. """
import numpy as np
import warnings
from abc import ABC, abstractmethod
from utils.note_event_dataclasses import Event, EventRange, Note  #, Codec
from utils.event_codec import FastCodec as Codec
from utils.note_event_dataclasses import NoteEvent
from utils.note2event import note_event2event
from utils.event2note import event2note_event, note_event2note
from typing import List, Optional, Union, Tuple, Dict, Counter


#TODO: Too complex to be an abstract class.
class EventTokenizerBase(ABC):
    """
    A base class for encoding and decoding events to and from tokens.

    The event <-> token-index mapping is delegated to `self.codec`; subclasses
    implement the public `encode` and `decode` methods on top of `_encode` and
    `_decode`.
    """

    def __init__(
        self,
        base_codec: Union[Codec, str] = 'mt3',
        special_tokens: Optional[List[str]] = ['PAD', 'EOS', 'UNK'],
        extra_tokens: Optional[List[str]] = [],
        max_shift_steps: int = 206,  # 1001 in Gardner et al.
        program_vocabulary: Optional[Dict] = None,
        drum_vocabulary: Optional[Dict] = None,
    ) -> None:
        """
        Initializes the EventTokenizerBase object.

        When `base_codec` is a string, a new codec is built from the remaining
        arguments. When it is already a codec instance, it is stored as-is and
        the remaining arguments are not used.

        :param base_codec: Codec name (only 'mt3' is supported) or a codec instance.
        :param special_tokens: None or list of special tokens to include in the vocabulary.
        :param extra_tokens: None or list of tokens to be treated as additional special tokens.
        :param max_shift_steps: The maximum number of shift steps to use for the codec.
        :param program_vocabulary: None or a dictionary mapping program names to program indices.
        :param drum_vocabulary: None or a dictionary mapping drum names to drum indices.
        :raises ValueError: If `base_codec` is a string other than 'mt3' (case-insensitive).
        :raises TypeError: If `base_codec` is neither a string nor a codec instance.
        """
        if isinstance(base_codec, str):
            # Only the 'mt3' event layout is known by name.
            if base_codec.lower() != 'mt3':
                raise ValueError(f'Unknown codec name: {base_codec}')

            event_ranges = [
                EventRange('pitch', min_value=0, max_value=127),
                EventRange('velocity', min_value=0, max_value=1),
                EventRange('tie', min_value=0, max_value=0),
                EventRange('program', min_value=0, max_value=127),
                EventRange('drum', min_value=0, max_value=127),
            ]

            # None is documented as a valid value for both token lists, so it is
            # treated as "no tokens". Copying also guarantees that the shared
            # default list objects are never handed to (or mutated by) the codec.
            all_special_tokens = list(special_tokens or []) + list(extra_tokens or [])

            self.codec = Codec(special_tokens=all_special_tokens,
                               max_shift_steps=max_shift_steps,
                               event_ranges=event_ranges,
                               program_vocabulary=program_vocabulary,
                               drum_vocabulary=drum_vocabulary,
                               name='mt3')

        elif isinstance(base_codec, Codec):
            # A ready-made codec is stored directly; vocabularies cannot be
            # injected into it afterwards, so warn if the caller supplied any.
            self.codec = base_codec
            if program_vocabulary is not None or drum_vocabulary is not None:
                warnings.warn("Vocabulary cannot be applied when using a custom codec.")
        else:
            # Neither a codec name nor a codec instance.
            raise TypeError(f'Unknown codec type: {type(base_codec)}')

        # Vocabulary size as reported by the codec.
        self.num_tokens = self.codec._num_classes

    def _encode(self, events: List[Event]) -> List[int]:
        """ Map each event to its token index using the codec. """
        return [self.codec.encode_event(e) for e in events]

    def _decode(self, tokens: List[int]) -> List[Event]:
        """ Map each token index back to an event using the codec. """
        return [self.codec.decode_event_index(idx) for idx in tokens]

    @abstractmethod
    def encode(self):
        """ Encode your custom events to tokens. """

    @abstractmethod
    def decode(self):
        """ Decode your custom tokens to events."""


class EventTokenizer(EventTokenizerBase):
    """
    Encodes and decodes events to and from tokens.

    A thin concrete tokenizer: `encode` and `decode` expose the base class's
    per-event codec lookups without any additional processing.
    """

    def __init__(self,
                 base_codec: Union[Codec, str] = 'mt3',
                 special_tokens: List[str] = ['PAD', 'EOS', 'UNK'],
                 extra_tokens: List[str] = [],
                 max_shift_steps: int = 206,
                 program_vocabulary: Optional[Dict] = None,
                 drum_vocabulary: Optional[Dict] = None) -> None:
        """
        Initializes the EventTokenizer object.

        Every argument is forwarded unchanged to the base class initializer.

        :param base_codec: The codec (or codec name) to use for encoding and decoding.
        :param special_tokens: List of special tokens to include in the vocabulary.
        :param extra_tokens: List of tokens to be treated as additional special tokens.
        :param max_shift_steps: The maximum number of shift steps to use for the codec.
        :param program_vocabulary: None or a dictionary mapping program names to program indices.
        :param drum_vocabulary: None or a dictionary mapping drum names to drum indices.
        """
        base_kwargs = dict(
            base_codec=base_codec,
            special_tokens=special_tokens,
            extra_tokens=extra_tokens,
            max_shift_steps=max_shift_steps,
            program_vocabulary=program_vocabulary,
            drum_vocabulary=drum_vocabulary,
        )
        super().__init__(**base_kwargs)

    def encode(self, events):
        """ Encode a list of events to a list of token indices. """
        token_ids = super()._encode(events)
        return token_ids

    def decode(self, tokens):
        """ Decode a list of token indices to a list of events. """
        decoded_events = super()._decode(tokens)
        return decoded_events


class NoteEventTokenizer(EventTokenizerBase):
    """ Encodes and decodes note events to/from tokens. """

    def __init__(
            self,
            base_codec: Union[Codec, str] = 'mt3',
            max_length: int = 1024,  # max length of tokens
            tps: int = 100,
            sort_note_event: bool = True,
            special_tokens: List[str] = ['PAD', 'EOS', 'UNK'],
            extra_tokens: List[str] = [],
            max_shift_steps: int = 206,
            program_vocabulary: Optional[Dict] = None,
            drum_vocabulary: Optional[Dict] = None,
            ignore_decoding_tokens: List[str] = [],
            ignore_decoding_tokens_from_and_to: Optional[List[str]] = None,
            debug_mode: bool = False) -> None:
        """
        Initializes the NoteEventTokenizer object.

        List[NoteEvent] -> encode -> List[int]

        List[int] -> decode -> (note_events, tie_note_events, last_activity, err_cnt)

        :param base_codec: The codec (or codec name) to use for encoding and decoding.
        :param max_length: Default token sequence length used by `encode_plus`.
        :param tps: Passed as `tps` to the note-event <-> event converters.
        :param sort_note_event: Passed as `sort` to `note_event2event` when encoding.
        :param special_tokens: List of special tokens to include in the vocabulary.
            The codec's special tokens must contain 'EOS' and 'PAD'.
        :param extra_tokens: List of tokens to be treated as additional special tokens.
        :param max_shift_steps: The maximum number of shift steps to use for the codec.
        :param program_vocabulary: None or a dictionary mapping program names to program indices.
        :param drum_vocabulary: None or a dictionary mapping drum names to drum indices.

        :param ignore_decoding_tokens: List of special tokens to drop before decoding.
        :param ignore_decoding_tokens_from_and_to: List of tokens to ignore during decoding. [from, to]
            Stored on the instance only; nothing in this initializer acts on it.
        :param debug_mode: If True, `decode` prints the tokens it is about to ignore.
        :raises ValueError: If 'EOS', 'PAD' or any of `ignore_decoding_tokens` is
            missing from the codec's special tokens.
        """
        super().__init__(base_codec=base_codec,
                         special_tokens=special_tokens,
                         extra_tokens=extra_tokens,
                         max_shift_steps=max_shift_steps,
                         program_vocabulary=program_vocabulary,
                         drum_vocabulary=drum_vocabulary)
        self.max_length = max_length
        self.tps = tps
        self.sort = sort_note_event

        # Both ids are looked up unconditionally: `.index` raises ValueError
        # when the codec lacks the corresponding special token.
        codec_special_tokens = self.codec.special_tokens
        self.eos_id = codec_special_tokens.index('EOS')
        self.pad_id = codec_special_tokens.index('PAD')

        # Prepare prefix, suffix and pad tokens.
        self._prefix = []
        self._suffix = [self.eos_id]
        # Padding caches. They are built from the real pad id and the configured
        # length (previously a hard-coded `[0] * 1024`), and the task cache is
        # now always defined so that `encode_task` can read it.
        self._zero_pad = [self.pad_id] * max_length
        self._zero_pad_task = []

        self.ids_to_ignore_decoding = [codec_special_tokens.index(t) for t in (ignore_decoding_tokens or [])]
        self.ignore_tokens_from_and_to = ignore_decoding_tokens_from_and_to
        self.debug_mode = debug_mode

    def _decode(self, tokens):
        """ Decode token indices to raw events (not note events).

        Kept as an explicit method because the validation dashboard displays
        raw events.
        """
        raw_events = super()._decode(tokens)
        return raw_events

    def encode(
        self,
        note_events: List[NoteEvent],
        tie_note_events: Optional[List[NoteEvent]] = None,
        start_time: float = 0.,
    ) -> List[int]:
        """ Encodes note events and tie note events to tokens.

        The note events are first converted to events by `note_event2event`
        (using this tokenizer's `tps` and `sort` settings), then each event is
        mapped to its token index by the codec.
        """
        conversion_kwargs = dict(
            note_events=note_events,
            tie_note_events=tie_note_events,
            start_time=start_time,  # required for calculating relative time
            tps=self.tps,
            sort=self.sort,
        )
        converted_events = note_event2event(**conversion_kwargs)
        return super()._encode(converted_events)

    def encode_plus(
            self,
            note_events: List[NoteEvent],
            tie_note_events: Optional[List[NoteEvent]] = None,
            start_times: float = 0.,  # Fixing bug: start_time --> start_times
            add_special_tokens: Optional[bool] = True,
            max_length: Optional[int] = None,  #  if None, use self.max_length
            pad_to_max_length: Optional[bool] = True,
            return_attention_mask: bool = False) -> Union[List[int], Tuple[List[int], List[int]]]:
        """ Encodes note events and tie note info to padded tokens.

        Args:
            note_events: Note events to encode.
            tie_note_events: Optional tie note events, forwarded to `encode`.
            start_times: Segment start time, forwarded to `encode` as `start_time`.
            add_special_tokens: If truthy, wrap the tokens with the prefix and
                suffix token ids prepared in `__init__`.
            max_length: Target length; `self.max_length` is used when None.
            pad_to_max_length: If True, right-pad the tokens with `self.pad_id`
                up to `max_length`.
            return_attention_mask: If True, also return a mask that is 1 for
                encoded tokens and 0 for padding.

        Returns:
            The token list, or `(tokens, attention_mask)` when
            `return_attention_mask` is True.

        Note:
            Truncation happens after the suffix is appended, so a sequence that
            reaches `max_length` may lose its trailing suffix token.
        """
        encoded = self.encode(note_events, tie_note_events, start_times)

        if add_special_tokens:
            if self._prefix:
                encoded = self._prefix + encoded
            if self._suffix:
                encoded = encoded + self._suffix

        if max_length is None:
            max_length = self.max_length

        length = len(encoded)
        if length >= max_length:
            encoded = encoded[:max_length]
            length = max_length

        attention_mask = [1] * length if return_attention_mask else None

        # <PAD>
        if pad_to_max_length is True:
            num_pad = max_length - length
            # Tokens are padded with the pad id, while the mask is padded with
            # 0 regardless of what the pad id happens to be. Building the
            # padding here also avoids relying on a cached pad buffer.
            encoded = encoded + [self.pad_id] * num_pad
            if return_attention_mask:
                attention_mask = attention_mask + [0] * num_pad

        if return_attention_mask:
            return encoded, attention_mask

        return encoded

    def encode_task(self, task_events: List[Event], max_length: Optional[int] = None) -> List[int]:
        """ Encodes task events to tokens, optionally right-padded to `max_length`.

        Args:
            task_events: Events to encode.
            max_length: If given, the tokens are right-padded with `self.pad_id`
                up to this length. Longer sequences are returned unchanged
                (no truncation).

        Returns:
            The list of token indices.
        """
        # NOTE: This is an event tokenizer that generates task ids, not the list of note_event objects.
        encoded = super()._encode(task_events)

        # <PAD>
        if max_length is not None:
            # Pad directly to the requested length. The previous implementation
            # read an attribute that was never initialized and then sliced the
            # wrong (note-token) pad buffer.
            num_pad = max(0, max_length - len(encoded))
            encoded = encoded + [self.pad_id] * num_pad

        return encoded

    def decode(
        self,
        tokens: List[int],
        start_time: float = 0.,
        return_events: bool = False,
    ) -> Union[Tuple[List[NoteEvent], List[NoteEvent]], Tuple[List[NoteEvent], List[NoteEvent], List[Tuple[int]],
                                                              List[Event], int]]:
        """Decodes a sequence of tokens into note events.

        Tokens whose ids are listed in `self.ids_to_ignore_decoding` are removed
        first; the remaining tokens are decoded to events and then converted by
        `event2note_event`.

        Args:
            tokens (List[int]): The list of tokens to be decoded.
            start_time (float, optional): The starting time for the note events. Defaults to 0.
            return_events (bool, optional): Indicates whether to include the raw events in the return value.
                                            Defaults to False.

        Returns:
            If `return_events` is False, a 4-tuple
            `(note_events, tie_note_events, last_activity, err_cnt)`.
            If `return_events` is True, a 5-tuple
            `(note_events, tie_note_events, last_activity, events, err_cnt)`.
            NOTE(review): the return annotation does not list the 4-tuple form.
        """
        ignore_ids = self.ids_to_ignore_decoding

        # In debug mode, show which input tokens are about to be dropped.
        if self.debug_mode:
            print([tok for tok in tokens if tok in ignore_ids])

        if ignore_ids:
            tokens = [tok for tok in tokens if tok not in ignore_ids]

        events = super()._decode(tokens)
        converted = event2note_event(events, start_time, True, self.tps)
        note_events, tie_note_events, last_activity, err_cnt = converted

        if not return_events:
            return note_events, tie_note_events, last_activity, err_cnt
        return note_events, tie_note_events, last_activity, events, err_cnt

    def decode_batch(
        self,
        batch_tokens: Union[List[List[int]], np.ndarray],
        start_times: List[float],
        return_events: bool = False
    ) -> Union[Tuple[List[Tuple[List[NoteEvent], List[NoteEvent], List[Tuple[int]], List[float]]], Counter[str]],
               Tuple[List[Tuple[List[NoteEvent], List[NoteEvent], List[Tuple[int]], List[float]]], List[List[Event]],
                     Counter[str]]]:
        """ 
        Decodes a batch of tokens to note_events and tie_note_events.

        Args:
            batch_tokens (List[List[int]] or np.ndarray): Tokens to be decoded.
            start_times (List[float]): List of start times for each token set.
            return_events (bool, optional): Flag to determine if events should be returned. Defaults to False.

        Returns:
            zipped_note_events_and_tie: One
                `(note_events, tie_note_events, last_activity, start_time)` tuple
                per token sequence.
            (Only if `return_events` is True) list_events: The decoded events of
                each token sequence.
            total_err_cnt: Sum of the per-sequence error counters returned by
                `decode`.

        Raises:
            ValueError: If `batch_tokens` and `start_times` differ in length.
        """
        if isinstance(batch_tokens, np.ndarray):
            batch_tokens = batch_tokens.tolist()

        if len(batch_tokens) != len(start_times):
            raise ValueError('The length of batch_tokens and start_times must be same.')

        zipped_note_events_and_tie = []
        list_events = []
        # The per-sequence error count is accumulated as a Counter, matching
        # `decode_list_batches` and the return annotation; starting from the
        # int 0 cannot be combined with a Counter via `+=`.
        # NOTE(review): assumes `decode` yields a Counter as err_cnt -- confirm
        # against event2note_event.
        total_err_cnt = Counter()

        for tokens, start_time in zip(batch_tokens, start_times):
            decoded = self.decode(tokens, start_time, return_events)
            if return_events:
                note_events, tie_note_events, last_activity, events, err_cnt = decoded
                list_events.append(events)
            else:
                note_events, tie_note_events, last_activity, err_cnt = decoded

            zipped_note_events_and_tie.append((note_events, tie_note_events, last_activity, start_time))
            total_err_cnt += err_cnt

        if return_events:
            return zipped_note_events_and_tie, list_events, total_err_cnt
        else:
            return zipped_note_events_and_tie, total_err_cnt

    def decode_list_batches(
        self,
        list_batch_tokens: Union[List[List[List[int]]], List[np.ndarray]],
        list_start_times: Union[List[List[float]], List[float]],
        return_events: bool = False
    ) -> Union[Tuple[List[List[Tuple[List[NoteEvent], List[NoteEvent], List[Tuple[int]], List[float]]]], Counter[str]],
               Tuple[List[List[Tuple[List[NoteEvent], List[NoteEvent], List[Tuple[int]], List[float]]]],
                     List[List[Event]], Counter[str]]]:
        """ 
        Decodes a list of variable-size batches of token array to a list of
        zipped note_events and tie_note_events.

        Args:
            list_batch_tokens: List[np.ndarray], where array shape is (batch_size, variable_length)
            list_start_times: List[float], where the length is sum of all batch_sizes.
            return_events: bool, Defaults to False.

        Returns:
            list_list_zipped_note_events_and_tie:
                List[
                    Tuple[
                        List[NoteEvent]: A list of note events.
                        List[NoteEvent]: A list of tie note events.
                        List[Tuple[int]]: A list of last activity of segment. [(program, pitch), ...]. This is useful
                            for validating notes within a batch of segments extracted from a file.
                        List[float]: A list of segment start times.
                    ]
                ]
            (Optional) list_events:
                List[List[Event]]
            total_err_cnt:
                Counter[str]: error counter.

        Raises:
            ValueError: If the number of token sequences (summed over all
                batches) differs from the number of start times.
        """
        # Flatten the batches into one list of token sequences.
        list_tokens = [tokens for arr in list_batch_tokens for tokens in arr]

        # Explicit check instead of `assert`, so it is not stripped under -O
        # and matches the validation style of `decode_batch`.
        if len(list_tokens) != len(list_start_times):
            raise ValueError('The total number of token sequences and the length of list_start_times must be same.')

        zipped_note_events_and_tie = []
        list_events = []
        total_err_cnt = Counter()
        for tokens, start_time in zip(list_tokens, list_start_times):
            decoded = self.decode(tokens, start_time, return_events)
            # `decode` returns five values only when events are requested and
            # four otherwise, so the unpacking has to follow `return_events`.
            if return_events:
                note_events, tie_note_events, last_activity, events, err_cnt = decoded
                list_events.append(events)
            else:
                note_events, tie_note_events, last_activity, err_cnt = decoded

            zipped_note_events_and_tie.append((note_events, tie_note_events, last_activity, start_time))
            total_err_cnt += err_cnt

        if return_events:
            return zipped_note_events_and_tie, list_events, total_err_cnt
        else:
            return zipped_note_events_and_tie, total_err_cnt