File size: 6,229 Bytes
4e40fcf
 
 
 
 
 
 
 
 
7505654
 
 
4e40fcf
3c9ff56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
---
library_name: transformers
tags: []
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->

사싀 μƒκ°ν•΄λ³΄λ‹ˆ νŒŒμΈνŠœλ‹ μ•„λ‹™λ‹ˆλ‹€.
κΉ‘ν†΅νŠœλ‹μž…λ‹ˆλ‹€.
깑톡에 balanced-Lakh

# Tokenizer
```
from miditok import MMM, TokenizerConfig
from typing import Union, Optional
from pathlib import Path

class CodeplayTokenizer(MMM):
    def __init__(self, config: TokenizerConfig):
        super().__init__(config)
    
    def save_pretrained(
        self,
        save_directory: Union[str, Path],
        *,
        config: Optional[Union[dict, "DataclassInstance"]] = None,
        repo_id: Optional[str] = None,
        push_to_hub: bool = False,
        **push_to_hub_kwargs,
    ) -> Optional[str]:
        """
        Save weights in local directory.

        Args:
            save_directory (`str` or `Path`):
                Path to directory in which the model weights and configuration will be saved.
            config (`dict` or `DataclassInstance`, *optional*):
                Model configuration specified as a key/value dictionary or a dataclass instance.
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Huggingface Hub after saving it.
            repo_id (`str`, *optional*):
                ID of your repository on the Hub. Used only if `push_to_hub=True`. Will default to the folder name if
                not provided.
            kwargs:
                Additional key word arguments passed along to the [`~ModelHubMixin.push_to_hub`] method.
        """
        save_directory = Path(save_directory)
        save_directory.mkdir(parents=True, exist_ok=True)

        # save model weights/files (framework-specific)
        self._save_pretrained(save_directory)

        # push to the Hub if required
        if push_to_hub:
            kwargs = push_to_hub_kwargs.copy()  # soft-copy to avoid mutating input
            if config is not None:  # kwarg for `push_to_hub`
                kwargs["config"] = config
            if repo_id is None:
                repo_id = save_directory.name  # Defaults to `save_directory` name
            return self.push_to_hub(repo_id=repo_id, **kwargs)
        return None

GENRE_TOKEN_LIST = ['Rock', 'Pop', 'Jazz']
GENRE_TOKEN_LIST = ['Genre_Unk'] + ['Genre_'+genre for genre in GENRE_TOKEN_LIST]
GENRE_TOKEN_LIST += ['Genre_'+str(i+1) for i in range(40-len(GENRE_TOKEN_LIST))] #40
BAR2_TOKEN_LIST = ['Bar2_Unk'] + ['Bar2_'+str(i+1) for i in range(127)] # 128

def get_custom_tokenizer():
    TOKENIZER_NAME = CodeplayTokenizer
    config = TokenizerConfig(
        num_velocities=16,
        use_chord=True,
        use_pitch_intervals=True,
        use_programs=True,)
    tokenizer = TOKENIZER_NAME(config)
    
    # MMM tokenizer
    mmm = len(tokenizer)-1
    print(f'MMM Tokenizer bandwith : 0 ~ {mmm}, ({mmm+1} tokens)')
    
    # Add genre token
    for genre_tk in GENRE_TOKEN_LIST:
        tokenizer.add_to_vocab(genre_tk)
    genre = len(tokenizer)-1
    print(f'Genre Tokenizer bandwith : {mmm+1} ~ {genre}, ({genre-mmm} tokens)')
    
    # Add cut(bar4) token
    for cut_tk in BAR2_TOKEN_LIST:
        tokenizer.add_to_vocab(cut_tk)
    # Add cut Unused token
    cut = len(tokenizer)-1
    print(f'Bar2 Cut Tokenizer bandwith : {genre+1} ~ {cut}, ({cut-genre} tokens)')
    
    print(f'Total Tokenizer bandwith : 0 ~ {cut}, ({len(tokenizer)} tokens)')
    return tokenizer

def get_nnn_tokenizer(num_velocities=8):
    NNN = CodeplayTokenizer
    config = TokenizerConfig(
        num_velocities=num_velocities,
        use_programs=True
    )
    tokenizer = NNN(config)
    prev_len = len(tokenizer)
    vocabs = list(tokenizer.vocab.keys())
    
    pitches = [v for v in vocabs if v.startswith('Pitch_') ]
    velocities = [v for v in vocabs if v.startswith('Velocity_') ]
    durations = [v for v in vocabs if v.startswith('Duration_') ]
    
    for p in pitches:
        for v in velocities:
            for d in durations:
                new_tk = f'{p}+{v}+{d}'
                tokenizer.add_to_vocab(new_tk)
    
    print(f'MMM Tokenizer bandwith : 0 ~ {prev_len}, ({prev_len} tokens)')
    print(f'NNN Tokenizer bandwith : {prev_len} ~ {len(tokenizer)}, ({len(tokenizer)-prev_len} tokens)')
    return tokenizer
    
lakh_genres = ['Rock', 'Pop', 'Dance/Electronic', 'Jazz', 'R&B', 'Groove', 'Folk', 'Classical', 'World', 'Metal', "Children"]
lakh_emotions = ['nostalgia', 'excitement', 'love', 'anger', 'happiness', 'sadness','calmness', 'gratitude', 'loneliness', 'anticipation']
lakh_tempos = ['Moderato', 'Allegro', 'Presto', 'Andante']
def get_nnn_meta_tokenizer(num_velocities=4):
    NNN = CodeplayTokenizer
    config = TokenizerConfig(
        num_velocities=num_velocities,
        use_programs=True
    )
    tokenizer = NNN(config)
    mmm_len = len(tokenizer)
    vocabs = list(tokenizer.vocab.keys())
    
    pitches = [v for v in vocabs if v.startswith('Pitch_') ]
    velocities = [v for v in vocabs if v.startswith('Velocity_') ]
    durations = [v for v in vocabs if v.startswith('Duration_') ]
    
    for p in pitches:
        for v in velocities:
            for d in durations:
                new_tk = f'{p}+{v}+{d}'
                tokenizer.add_to_vocab(new_tk)
    nnn_len = len(tokenizer)
    
    # genre tokens
    for genre in lakh_genres:
        tokenizer.add_to_vocab(f'Genre_{genre}')
    genre_len = len(tokenizer)
    
    # emotion tokens
    for emotion in lakh_emotions:
        tokenizer.add_to_vocab(f'Emotion_{emotion}')
    emotion_len = len(tokenizer)
    
    for tempo in lakh_tempos:
        tokenizer.add_to_vocab(f'Tempo_{tempo}')
    tempo_len = len(tokenizer)
    
    print(f'MMM Tokenizer bandwith : 0 ~ {mmm_len}, ({mmm_len} tokens)')
    print(f'NNN Tokenizer bandwith : {mmm_len} ~ {nnn_len}, ({nnn_len-mmm_len} tokens)')
    print(f'Genre Tokenizer bandwith : {nnn_len} ~ {genre_len}, ({genre_len-nnn_len} tokens)')
    print(f'Emotion Tokenizer bandwith : {genre_len} ~ {emotion_len}, ({emotion_len-genre_len} tokens)')
    print(f'Tempo Tokenizer bandwith : {emotion_len} ~ {tempo_len}, ({tempo_len-emotion_len} tokens)')

    return tokenizer
```