# hoyoMusic / utils.py
# MuGeminorum
# refine codes
# 19b4090
# raw
# history blame
# 12.8 kB
import os
import re
import torch
import random
from config import *
from tqdm import tqdm
from unidecode import unidecode
from torch.utils.data import Dataset
from transformers import GPT2Model, GPT2LMHeadModel, PreTrainedModel
from samplings import top_p_sampling, top_k_sampling, temperature_sampling
# Module-wide torch device: CUDA when torch reports it as available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def template(region):
    """
    Build the fixed prompt text used as a starting point for a tune.

    :param region: value interpolated into the leading ``A:`` line
    :return: a multi-line string made of the ``A:`` line, a fixed set of
        header lines and the opening notes ``de |"D"`` (no trailing newline)
    """
    # The lines of the literal start at column 0 on purpose: any leading
    # whitespace would become part of the returned text.
    prompt = f'''A:{region}
S:2
B:9
E:4
B:9
L:1/8
M:3/4
K:D
de |"D"'''
    return prompt
def download(filename=WEIGHT_PATH, url=WEIGHT_URL, max_retries=3):
    """
    Download the weights file to ``filename`` while showing a progress bar.

    The first attempt uses ``url``. After a failed attempt the error is
    printed, the function sleeps for three seconds and the next attempt is
    made against ``ZH_WEIGHT_URL``. Unlike an unbounded recursive retry, the
    number of attempts is limited, so a permanently unreachable server ends
    with the real error instead of a ``RecursionError``.

    :param filename: destination path of the downloaded file
    :param url: URL used for the first attempt
    :param max_retries: how many additional attempts are made after the first
        one fails (values below zero are treated as zero)
    :raises Exception: the error of the last attempt, once every attempt failed
    """
    import time
    import requests

    chunk_size = 1024
    # Stream into a side file so an interrupted transfer never leaves a
    # truncated file at `filename`.
    partial_path = f"{filename}.part"
    total_attempts = max(int(max_retries), 0) + 1

    for attempt in range(1, total_attempts + 1):
        try:
            # The context manager releases the connection even on failure;
            # the timeout keeps a stalled server from blocking forever.
            with requests.get(url, stream=True, timeout=60) as response:
                # Without this check an HTTP error page would be saved as weights.
                response.raise_for_status()
                total_size = int(response.headers.get("content-length", 0))
                with open(partial_path, "wb") as file, tqdm(
                    desc=f"Downloading weights to '{filename}'...",
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                ) as bar:
                    for data in response.iter_content(chunk_size=chunk_size):
                        size = file.write(data)
                        bar.update(size)
            # Only a complete download is moved into place.
            os.replace(partial_path, filename)
            return
        except Exception as e:
            print(f"Error: {e}")
            if os.path.exists(partial_path):
                os.remove(partial_path)
            if attempt == total_attempts:
                raise
            time.sleep(3)
            # Every retry goes to the alternative URL.
            url = ZH_WEIGHT_URL
class Patchilizer:
    """
    Translates music text into fixed-size integer patches and back again.
    Each bar of the tune body and each header line becomes one patch.
    """

    def __init__(self):
        # Regex alternation is tried left to right, so the two-character
        # barlines listed first are matched before the bare "|".
        self.delimiters = ["|:", "::", ":|", "[|", "||", "|]", "|"]
        escaped = [re.escape(mark) for mark in self.delimiters]
        self.regexPattern = "(" + "|".join(escaped) + ")"
        self.pad_token_id = 0
        self.bos_token_id = 1
        self.eos_token_id = 2

    def split_bars(self, body):
        """
        Cut a tune body into bars, each bar ending with its barline.

        :param body: the body text (a string or an iterable of strings)
        :return: list of bar strings; a trailing piece that is not followed
            by a barline is not part of the result
        """
        pieces = [
            piece for piece in re.split(self.regexPattern, "".join(body)) if piece
        ]
        if pieces[0] in self.delimiters:
            # The body opens with a barline: glue it onto the piece after it.
            pieces[1] = pieces[0] + pieces[1]
            del pieces[0]
        # Pieces now alternate content / barline; join them pairwise.
        return [
            content + barline
            for content, barline in zip(pieces[0::2], pieces[1::2])
        ]

    def bar2patch(self, bar, patch_size=PATCH_SIZE):
        """
        Turn one bar into a list of exactly ``patch_size`` token ids.

        :param bar: the bar text
        :param patch_size: length of the returned patch
        :return: BOS id, the code point of every character, EOS id, cut to
            ``patch_size`` entries and right-padded with the pad id
        """
        codes = [self.bos_token_id, *map(ord, bar), self.eos_token_id]
        codes = codes[:patch_size]
        padding = [self.pad_token_id] * (patch_size - len(codes))
        return codes + padding

    def patch2bar(self, patch):
        """
        Turn a patch back into text.

        :param patch: iterable of token ids
        :return: the characters of every id greater than the EOS id; pad,
            BOS and EOS ids produce no output
        """
        chars = []
        for idx in patch:
            if idx > self.eos_token_id:
                chars.append(chr(idx))
        return "".join(chars)

    def encode(
        self,
        abc_code,
        patch_length=PATCH_LENGTH,
        patch_size=PATCH_SIZE,
        add_special_patches=False,
    ):
        """
        Encode a whole tune into a list of patches.

        :param abc_code: the music text; it is passed through ``unidecode``
            and empty lines are ignored
        :param patch_length: maximum number of patches returned
        :param patch_size: number of token ids per patch
        :param add_special_patches: when true, a BOS patch is put in front
            of and an EOS patch behind the encoded patches
        :return: list of patches, cut to ``patch_length`` entries
        """
        lines = [line for line in unidecode(abc_code).split("\n") if line]

        patches = []
        pending_body = ""
        for line in lines:
            is_header = len(line) > 1 and (
                line.startswith("%%score")
                or (line[0].isalpha() and line[1] == ":")
            )
            if not is_header:
                pending_body += line + "\n"
                continue

            if pending_body:
                # A header line closes the body collected so far; the last
                # bar of that body carries the line break.
                bars = self.split_bars(pending_body)
                if bars:
                    bars[-1] += "\n"
                for bar in bars:
                    patches.append(self.bar2patch(bar, patch_size))
                pending_body = ""
            patches.append(self.bar2patch(line + "\n", patch_size))

        if pending_body:
            patches += [
                self.bar2patch(bar, patch_size)
                for bar in self.split_bars(pending_body)
            ]

        if add_special_patches:
            bos, eos = self.bos_token_id, self.eos_token_id
            bos_patch = [bos] * (patch_size - 1) + [eos]
            eos_patch = [bos] + [eos] * (patch_size - 1)
            patches = [bos_patch, *patches, eos_patch]

        return patches[:patch_length]

    def decode(self, patches):
        """
        Decode a sequence of patches into one string.

        :param patches: iterable of patches
        :return: the concatenated text of all patches
        """
        return "".join(map(self.patch2bar, patches))
class PatchLevelDecoder(PreTrainedModel):
    """
    A patch-level decoder: every patch is projected to a single embedding
    and the resulting sequence is run through a GPT-2 model.
    It inherits PreTrainedModel from transformers.
    """

    def __init__(self, config):
        super().__init__(config)
        # One input feature per (position in patch, one of 128 token ids).
        flat_patch_dim = PATCH_SIZE * 128
        self.patch_embedding = torch.nn.Linear(flat_patch_dim, config.n_embd)
        torch.nn.init.normal_(self.patch_embedding.weight, std=0.02)
        self.base = GPT2Model(config)

    def forward(self, patches: torch.Tensor) -> torch.Tensor:
        """
        The forward pass of the patch-level decoder model.
        :param patches: integer token ids of the patches (ids below 128)
        :return: the output of the GPT-2 base model for the embedded patches
        """
        batch = len(patches)
        one_hot = torch.nn.functional.one_hot(patches, num_classes=128).float()
        # Flatten each patch into a single vector of PATCH_SIZE * 128 values.
        flattened = one_hot.reshape(batch, -1, PATCH_SIZE * 128)
        embedded = self.patch_embedding(flattened.to(self.device))
        return self.base(inputs_embeds=embedded)
class CharLevelDecoder(PreTrainedModel):
    """
    A char-level decoder that produces the characters of one patch at a
    time, conditioned on the encoding of that patch.
    It inherits PreTrainedModel from transformers.
    """

    def __init__(self, config):
        super().__init__(config)
        self.pad_token_id = 0
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.base = GPT2LMHeadModel(config)

    def forward(
        self,
        encoded_patches: torch.Tensor,
        target_patches: torch.Tensor,
        patch_sampling_batch_size: int,
    ):
        """
        The forward pass of the char-level decoder model.
        :param encoded_patches: the encoded patches, one row per target patch
        :param target_patches: the target patches (token ids)
        :param patch_sampling_batch_size: when non-zero and smaller than the
            number of target patches, only this many randomly chosen patches
            are used
        :return: the output of the GPT-2 LM head model, computed with labels
        """
        # Padding positions get the label -100 and an attention mask of 0.
        pad_positions = target_patches == self.pad_token_id
        labels = target_patches.masked_fill(pad_positions, -100)
        attention_mask = (labels != -100).to(labels.dtype)

        # Optionally keep only a random subset of the patches (original order kept).
        patch_count = target_patches.shape[0]
        if patch_sampling_batch_size != 0 and patch_sampling_batch_size < patch_count:
            order = list(range(len(target_patches)))
            random.shuffle(order)
            keep = sorted(order[:patch_sampling_batch_size])

            target_patches = target_patches[keep, :]
            attention_mask = attention_mask[keep, :]
            encoded_patches = encoded_patches[keep, :]
            labels = labels[keep, :]

        # Look up token embeddings in the base model's embedding table.
        token_embeds = torch.nn.functional.embedding(
            target_patches, self.base.transformer.wte.weight
        )

        # The encoded patch takes the place of the first token's embedding.
        patch_prefix = encoded_patches.unsqueeze(1)
        inputs_embeds = torch.cat((patch_prefix, token_embeds[:, 1:, :]), dim=1)

        return self.base(
            inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels
        )

    def generate(self, encoded_patch: torch.Tensor, tokens: torch.Tensor):
        """
        Compute the distribution of the next token of a patch.
        :param encoded_patch: the encoded patch
        :param tokens: already generated tokens in the patch
        :return: the probability distribution of next token
        """
        patch_prefix = encoded_patch.reshape(1, 1, -1)
        token_ids = tokens.reshape(1, -1)

        # Embed the tokens generated so far.
        token_embeds = torch.nn.functional.embedding(
            token_ids, self.base.transformer.wte.weight
        )

        # The encoded patch takes the place of the first token's embedding.
        inputs_embeds = torch.cat((patch_prefix, token_embeds[:, 1:, :]), dim=1)

        outputs = self.base(inputs_embeds=inputs_embeds)

        # Softmax over the logits of the last position.
        last_logits = outputs.logits.squeeze(0)[-1]
        return torch.nn.functional.softmax(last_logits, dim=-1)
class TunesFormer(PreTrainedModel):
    """
    TunesFormer is a hierarchical music generation model based on bar patching.
    It includes a patch-level decoder and a character-level decoder.
    It inherits PreTrainedModel from transformers.
    """

    def __init__(self, encoder_config, decoder_config, share_weights=False):
        """
        :param encoder_config: configuration of the patch-level decoder; it is
            also the configuration handed to ``PreTrainedModel``
        :param decoder_config: configuration of the char-level decoder
        :param share_weights: when true, both configurations are raised to the
            larger layer count / context size / position-embedding count of the
            two (the passed config objects are modified in place) and the
            patch-level decoder reuses the char-level decoder's transformer
        """
        super().__init__(encoder_config)
        self.pad_token_id = 0
        self.bos_token_id = 1
        self.eos_token_id = 2
        if share_weights:
            # Both decoders must have the same geometry to share one transformer.
            max_layers = max(
                encoder_config.num_hidden_layers, decoder_config.num_hidden_layers
            )
            max_context_size = max(encoder_config.max_length, decoder_config.max_length)
            max_position_embeddings = max(
                encoder_config.max_position_embeddings,
                decoder_config.max_position_embeddings,
            )
            encoder_config.num_hidden_layers = max_layers
            encoder_config.max_length = max_context_size
            encoder_config.max_position_embeddings = max_position_embeddings
            decoder_config.num_hidden_layers = max_layers
            decoder_config.max_length = max_context_size
            decoder_config.max_position_embeddings = max_position_embeddings
        self.patch_level_decoder = PatchLevelDecoder(encoder_config)
        self.char_level_decoder = CharLevelDecoder(decoder_config)
        if share_weights:
            self.patch_level_decoder.base = self.char_level_decoder.base.transformer

    def forward(
        self,
        patches: torch.Tensor,
        patch_sampling_batch_size: int = PATCH_SAMPLING_BATCH_SIZE,
    ):
        """
        The forward pass of the TunesFormer model.
        :param patches: the patches to be both encoded and decoded
        :param patch_sampling_batch_size: forwarded to the char-level decoder
        :return: the decoded patches
        """
        patches = patches.reshape(len(patches), -1, PATCH_SIZE)
        encoded_patches = self.patch_level_decoder(patches)["last_hidden_state"]
        # The encoding at position i is paired with the patch at position i + 1.
        # NOTE(review): squeeze(0) only removes the first dimension when it is
        # 1, so this looks like it expects a single sequence - confirm.
        return self.char_level_decoder(
            encoded_patches.squeeze(0)[:-1, :],
            patches.squeeze(0)[1:, :],
            patch_sampling_batch_size,
        )

    def generate(
        self,
        patches: torch.Tensor,
        tokens: torch.Tensor,
        top_p: float = 1,
        top_k: int = 0,
        temperature: float = 1,
        seed: int = None,
    ):
        """
        Generate the next patch, token by token, for the given patches.
        :param patches: the patches to be encoded
        :param tokens: tokens the new patch already starts with, or None to
            start from a single BOS token
        :param top_p: forwarded to ``top_p_sampling``
        :param top_k: forwarded to ``top_k_sampling``
        :param temperature: forwarded to ``temperature_sampling``
        :param seed: seed for Python's ``random`` module; with None no
            per-token seed is derived
        :return: a tuple of the generated tokens and the seed used for the
            last sampled token (None when ``seed`` is None)
        """
        patches = patches.reshape(len(patches), -1, PATCH_SIZE)
        encoded_patches = self.patch_level_decoder(patches)["last_hidden_state"]
        # Identity check: `tokens == None` would dispatch to Tensor.__eq__.
        if tokens is None:
            tokens = torch.tensor([self.bos_token_id], device=self.device)
        generated_patch = []
        random.seed(seed)
        while True:
            if seed is not None:
                # Draw a fresh seed for every token from the seeded generator.
                n_seed = random.randint(0, 1000000)
                random.seed(n_seed)
            else:
                n_seed = None
            # Next-token distribution given the encoding of the last patch.
            prob = (
                self.char_level_decoder.generate(encoded_patches[0][-1], tokens)
                .cpu()
                .detach()
                .numpy()
            )
            prob = top_p_sampling(prob, top_p=top_p, return_probs=True)
            prob = top_k_sampling(prob, top_k=top_k, return_probs=True)
            token = temperature_sampling(prob, temperature=temperature, seed=n_seed)
            generated_patch.append(token)
            # Stop at EOS or once the patch has reached its maximum size.
            if token == self.eos_token_id or len(tokens) >= PATCH_SIZE - 1:
                break
            tokens = torch.cat(
                (tokens, torch.tensor([token], device=self.device)), dim=0
            )
        return generated_patch, n_seed
class PatchilizedData(Dataset):
    """
    A dataset of tunes that are encoded into patch tensors on construction.
    """

    def __init__(self, items, patchilizer):
        """
        :param items: iterable of mappings with the keys ``"control code"``
            and ``"abc notation"``
        :param patchilizer: object whose ``encode`` method turns text into patches
        """
        self.texts = []
        for item in tqdm(items):
            control_code = item["control code"]
            # The first line of the notation is dropped; the control code
            # is put in its place.
            _, *remaining_lines = item["abc notation"].split("\n")
            text = control_code + "\n".join(remaining_lines)

            encoded = torch.tensor(
                patchilizer.encode(text, add_special_patches=True)
            )
            # Skip encodings whose token ids sum to zero.
            if torch.sum(encoded) == 0:
                continue
            self.texts.append(encoded)

    def __len__(self):
        """Return the number of stored tensors."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the patch tensor stored at position ``idx``."""
        return self.texts[idx]