Spaces:

MuGemSt
/

hoyoMusic

Running

App Files Files

MuGeminorum committed on Feb 25

Commit

27cf0c7

•

1 Parent(s): f9ed6a5

add copy btn

Browse files

Files changed (2) hide show

app.py +76 -52
utils.py +78 -67

app.py CHANGED Viewed

@@ -10,24 +10,48 @@ from config import *
 from convert import *
 from transformers import GPT2Config
 import warnings
-warnings.filterwarnings('ignore')
 def get_args(parser):
-    parser.add_argument('-num_tunes', type=int, default=1,
-                        help='the number of independently computed returned tunes')
-    parser.add_argument('-max_patch', type=int, default=128,
-                        help='integer to define the maximum length in tokens of each tune')
-    parser.add_argument('-top_p', type=float, default=0.8,
-                        help='float to define the tokens that are within the sample operation of text generation')
-    parser.add_argument('-top_k', type=int, default=8,
-                        help='integer to define the tokens that are within the sample operation of text generation')
-    parser.add_argument('-temperature', type=float, default=1.2,
-                        help='the temperature of the sampling operation')
-    parser.add_argument('-seed', type=int, default=None,
-                        help='seed for randomstate')
-    parser.add_argument('-show_control_code', type=bool,
-                        default=True, help='whether to show control code')
     args = parser.parse_args()
     return args
@@ -40,14 +64,14 @@ def generate_abc(args, region):
         num_hidden_layers=PATCH_NUM_LAYERS,
         max_length=PATCH_LENGTH,
         max_position_embeddings=PATCH_LENGTH,
-        vocab_size=1
     )
     char_config = GPT2Config(
         num_hidden_layers=CHAR_NUM_LAYERS,
         max_length=PATCH_SIZE,
         max_position_embeddings=PATCH_SIZE,
-        vocab_size=128
     )
     model = TunesFormer(patch_config, char_config, share_weights=SHARE_WEIGHTS)
@@ -60,8 +84,8 @@ def generate_abc(args, region):
     else:
         download()
-    checkpoint = torch.load(filename, map_location=torch.device('cpu'))
-    model.load_state_dict(checkpoint['model'])
     model = model.to(device)
     model.eval()
@@ -76,20 +100,20 @@ def generate_abc(args, region):
     seed = args.seed
     show_control_code = args.show_control_code
-    print(" HYPERPARAMETERS ".center(60, "#"), '\n')
     args = vars(args)
     for key in args.keys():
-        print(f'{key}: {str(args[key])}')
-    print('\n', " OUTPUT TUNES ".center(60, "#"))
     start_time = time.time()
     for i in range(num_tunes):
-        title_artist = f'T:{region} Fragment\nC:Generated by AI\n'
         tune = f"X:{str(i + 1)}\n{title_artist + prompt}"
-        lines = re.split(r'(\n)', tune)
         tune = ""
         skip = False
         for line in lines:
@@ -104,8 +128,7 @@ def generate_abc(args, region):
                 skip = True
         input_patches = torch.tensor(
-            [patchilizer.encode(prompt, add_special_patches=True)[:-1]],
-            device=device
         )
         if tune == "":
@@ -113,10 +136,10 @@ def generate_abc(args, region):
         else:
             prefix = patchilizer.decode(input_patches[0])
-            remaining_tokens = prompt[len(prefix):]
             tokens = torch.tensor(
-                [patchilizer.bos_token_id]+[ord(c) for c in remaining_tokens],
-                device=device
             )
         while input_patches.shape[1] < max_patch:
@@ -126,7 +149,7 @@ def generate_abc(args, region):
                 top_p=top_p,
                 top_k=top_k,
                 temperature=temperature,
-                seed=seed
             )
             tokens = None
@@ -140,17 +163,15 @@ def generate_abc(args, region):
                 if next_bar == "":
                     break
-                next_bar = remaining_tokens+next_bar
                 remaining_tokens = ""
                 predicted_patch = torch.tensor(
-                    patchilizer.bar2patch(next_bar),
-                    device=device
                 ).unsqueeze(0)
                 input_patches = torch.cat(
-                    [input_patches, predicted_patch.unsqueeze(0)],
-                    dim=1
                 )
             else:
@@ -160,11 +181,11 @@ def generate_abc(args, region):
         print("\n")
     print("Generation time: {:.2f} seconds".format(time.time() - start_time))
-    create_dir('./tmp')
     timestamp = time.strftime("%a_%d_%b_%Y_%H_%M_%S", time.localtime())
-    out_midi = abc_to_midi(tunes, f'./tmp/[{region}]{timestamp}.mid')
-    out_xml = abc_to_musicxml(tunes, f'./tmp/[{region}]{timestamp}.musicxml')
-    out_mxl = musicxml_to_mxl(f'./tmp/[{region}]{timestamp}.musicxml')
     pdf_file, jpg_file = mxl2jpg(out_mxl)
     wav_file = midi2wav(out_midi)
@@ -172,8 +193,8 @@ def generate_abc(args, region):
 def inference(region):
-    if os.path.exists('./tmp'):
-        shutil.rmtree('./tmp')
     parser = argparse.ArgumentParser()
     args = get_args(parser)
@@ -184,30 +205,33 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             region_opt = gr.Dropdown(
-                choices=[
-                    'Mondstadt', 'Liyue', 'Inazuma', 'Sumeru', 'Fontaine'
-                ],
-                value='Mondstadt',
-                label='Region genre'
             )
             gen_btn = gr.Button("Generate")
         with gr.Column():
-            wav_output = gr.Audio(label='Audio', type='filepath')
             dld_midi = gr.components.File(label="Download MIDI")
             pdf_score = gr.components.File(label="Download PDF score")
             dld_xml = gr.components.File(label="Download MusicXML")
             dld_mxl = gr.components.File(label="Download MXL")
-            abc_output = gr.TextArea(label='abc score')
-            img_score = gr.Image(label='Staff', type='filepath')
     gen_btn.click(
         inference,
         inputs=region_opt,
         outputs=[
-            abc_output, dld_midi, pdf_score,
-            dld_xml, dld_mxl, img_score, wav_output
-        ]
     )
 demo.launch(share=True)

 from convert import *
 from transformers import GPT2Config
 import warnings
+warnings.filterwarnings("ignore")
 def get_args(parser):
+    parser.add_argument(
+        "-num_tunes",
+        type=int,
+        default=1,
+        help="the number of independently computed returned tunes",
+    )
+    parser.add_argument(
+        "-max_patch",
+        type=int,
+        default=128,
+        help="integer to define the maximum length in tokens of each tune",
+    )
+    parser.add_argument(
+        "-top_p",
+        type=float,
+        default=0.8,
+        help="float to define the tokens that are within the sample operation of text generation",
+    )
+    parser.add_argument(
+        "-top_k",
+        type=int,
+        default=8,
+        help="integer to define the tokens that are within the sample operation of text generation",
+    )
+    parser.add_argument(
+        "-temperature",
+        type=float,
+        default=1.2,
+        help="the temperature of the sampling operation",
+    )
+    parser.add_argument("-seed", type=int, default=None, help="seed for randomstate")
+    parser.add_argument(
+        "-show_control_code",
+        type=bool,
+        default=True,
+        help="whether to show control code",
+    )
     args = parser.parse_args()
     return args
         num_hidden_layers=PATCH_NUM_LAYERS,
         max_length=PATCH_LENGTH,
         max_position_embeddings=PATCH_LENGTH,
+        vocab_size=1,
     )
     char_config = GPT2Config(
         num_hidden_layers=CHAR_NUM_LAYERS,
         max_length=PATCH_SIZE,
         max_position_embeddings=PATCH_SIZE,
+        vocab_size=128,
     )
     model = TunesFormer(patch_config, char_config, share_weights=SHARE_WEIGHTS)
     else:
         download()
+    checkpoint = torch.load(filename, map_location=torch.device("cpu"))
+    model.load_state_dict(checkpoint["model"])
     model = model.to(device)
     model.eval()
     seed = args.seed
     show_control_code = args.show_control_code
+    print(" HYPERPARAMETERS ".center(60, "#"), "\n")
     args = vars(args)
     for key in args.keys():
+        print(f"{key}: {str(args[key])}")
+    print("\n", " OUTPUT TUNES ".center(60, "#"))
     start_time = time.time()
     for i in range(num_tunes):
+        title_artist = f"T:{region} Fragment\nC:Generated by AI\n"
         tune = f"X:{str(i + 1)}\n{title_artist + prompt}"
+        lines = re.split(r"(\n)", tune)
         tune = ""
         skip = False
         for line in lines:
                 skip = True
         input_patches = torch.tensor(
+            [patchilizer.encode(prompt, add_special_patches=True)[:-1]], device=device
         )
         if tune == "":
         else:
             prefix = patchilizer.decode(input_patches[0])
+            remaining_tokens = prompt[len(prefix) :]
             tokens = torch.tensor(
+                [patchilizer.bos_token_id] + [ord(c) for c in remaining_tokens],
+                device=device,
             )
         while input_patches.shape[1] < max_patch:
                 top_p=top_p,
                 top_k=top_k,
                 temperature=temperature,
+                seed=seed,
             )
             tokens = None
                 if next_bar == "":
                     break
+                next_bar = remaining_tokens + next_bar
                 remaining_tokens = ""
                 predicted_patch = torch.tensor(
+                    patchilizer.bar2patch(next_bar), device=device
                 ).unsqueeze(0)
                 input_patches = torch.cat(
+                    [input_patches, predicted_patch.unsqueeze(0)], dim=1
                 )
             else:
         print("\n")
     print("Generation time: {:.2f} seconds".format(time.time() - start_time))
+    create_dir("./tmp")
     timestamp = time.strftime("%a_%d_%b_%Y_%H_%M_%S", time.localtime())
+    out_midi = abc_to_midi(tunes, f"./tmp/[{region}]{timestamp}.mid")
+    out_xml = abc_to_musicxml(tunes, f"./tmp/[{region}]{timestamp}.musicxml")
+    out_mxl = musicxml_to_mxl(f"./tmp/[{region}]{timestamp}.musicxml")
     pdf_file, jpg_file = mxl2jpg(out_mxl)
     wav_file = midi2wav(out_midi)
 def inference(region):
+    if os.path.exists("./tmp"):
+        shutil.rmtree("./tmp")
     parser = argparse.ArgumentParser()
     args = get_args(parser)
     with gr.Row():
         with gr.Column():
             region_opt = gr.Dropdown(
+                choices=["Mondstadt", "Liyue", "Inazuma", "Sumeru", "Fontaine"],
+                value="Mondstadt",
+                label="Region genre",
             )
             gen_btn = gr.Button("Generate")
         with gr.Column():
+            wav_output = gr.Audio(label="Audio", type="filepath")
             dld_midi = gr.components.File(label="Download MIDI")
             pdf_score = gr.components.File(label="Download PDF score")
             dld_xml = gr.components.File(label="Download MusicXML")
             dld_mxl = gr.components.File(label="Download MXL")
+            abc_output = gr.Textbox(label="abc score", show_copy_button=True)
+            img_score = gr.Image(label="Staff", type="filepath")
     gen_btn.click(
         inference,
         inputs=region_opt,
         outputs=[
+            abc_output,
+            dld_midi,
+            pdf_score,
+            dld_xml,
+            dld_mxl,
+            img_score,
+            wav_output,
+        ],
     )
 demo.launch(share=True)

utils.py CHANGED Viewed

@@ -35,15 +35,16 @@ def create_dir(dir_path):
 def download(filename=WEIGHT_PATH, url=WEIGHT_URL):
     import time
     import requests
     try:
         response = requests.get(url, stream=True)
-        total_size = int(response.headers.get('content-length', 0))
         chunk_size = 1024
-        with open(filename, 'wb') as file, tqdm(
             desc=f"Downloading weights to '{filename}'...",
             total=total_size,
-            unit='B',
             unit_scale=True,
             unit_divisor=1024,
         ) as bar:
@@ -51,7 +52,7 @@ def download(filename=WEIGHT_PATH, url=WEIGHT_URL):
                 size = file.write(data)
                 bar.update(size)
-    except ConnectionError as e:
         print(f"Error: {e}")
         time.sleep(3)
         download(filename, ZH_WEIGHT_URL)
@@ -59,7 +60,7 @@ def download(filename=WEIGHT_PATH, url=WEIGHT_URL):
 class Patchilizer:
     """
-    A class for converting music bars to patches and vice versa.
     """
     def __init__(self):
@@ -73,7 +74,7 @@ class Patchilizer:
         """
         Split a body of music into individual bars.
         """
-        bars = re.split(self.regexPattern, ''.join(body))
         bars = list(filter(None, bars))
         # remove empty strings
         if bars[0] in self.delimiters:
@@ -87,8 +88,7 @@ class Patchilizer:
         """
         Convert a bar into a patch of specified length.
         """
-        patch = [self.bos_token_id] + \
-            [ord(c) for c in bar] + [self.eos_token_id]
         patch = patch[:patch_size]
         patch += [self.pad_token_id] * (patch_size - len(patch))
         return patch
@@ -97,31 +97,46 @@ class Patchilizer:
         """
         Convert a patch into a bar.
         """
-        return ''.join(chr(idx) if idx > self.eos_token_id else '' for idx in patch if idx != self.eos_token_id)
-    def encode(self, abc_code, patch_length=PATCH_LENGTH, patch_size=PATCH_SIZE, add_special_patches=False):
         """
         Encode music into patches of specified length.
         """
-        lines = unidecode(abc_code).split('\n')
         lines = list(filter(None, lines))  # remove empty lines
         body = ""
         patches = []
         for line in lines:
-            if len(line) > 1 and ((line[0].isalpha() and line[1] == ':') or line.startswith('%%score')):
                 if body:
                     bars = self.split_bars(body)
                     patches.extend(
-                        self.bar2patch(bar + '\n' if idx == len(bars) - 1 else bar, patch_size) for idx, bar in enumerate(bars)
                     )
                     body = ""
-                patches.append(self.bar2patch(line + '\n', patch_size))
             else:
-                body += line + '\n'
         if body:
             patches.extend(
@@ -129,10 +144,8 @@ class Patchilizer:
             )
         if add_special_patches:
-            bos_patch = [self.bos_token_id] * \
-                (patch_size-1) + [self.eos_token_id]
-            eos_patch = [self.bos_token_id] + \
-                [self.eos_token_id] * (patch_size-1)
             patches = [bos_patch] + patches + [eos_patch]
         return patches[:patch_length]
@@ -141,12 +154,12 @@ class Patchilizer:
         """
         Decode patches into music.
         """
-        return ''.join(self.patch2bar(patch) for patch in patches)
 class PatchLevelDecoder(PreTrainedModel):
     """
-    A Patch-level Decoder model for generating patch features in an auto-regressive manner.
     It inherits PreTrainedModel from transformers.
     """
@@ -171,7 +184,7 @@ class PatchLevelDecoder(PreTrainedModel):
 class CharLevelDecoder(PreTrainedModel):
     """
-    A Char-level Decoder model for generating the characters within each bar patch sequentially.
     It inherits PreTrainedModel from transformers.
     """
@@ -182,7 +195,12 @@ class CharLevelDecoder(PreTrainedModel):
         self.eos_token_id = 2
         self.base = GPT2LMHeadModel(config)
-    def forward(self, encoded_patches: torch.Tensor, target_patches: torch.Tensor, patch_sampling_batch_size: int):
         """
         The forward pass of the char-level decoder model.
         :param encoded_patches: the encoded patches
@@ -198,7 +216,10 @@ class CharLevelDecoder(PreTrainedModel):
         target_masks = target_masks.masked_fill_(labels == -100, 0)
         # select patches
-        if patch_sampling_batch_size != 0 and patch_sampling_batch_size < target_patches.shape[0]:
             indices = list(range(len(target_patches)))
             random.shuffle(indices)
             selected_indices = sorted(indices[:patch_sampling_batch_size])
@@ -210,20 +231,16 @@ class CharLevelDecoder(PreTrainedModel):
         # get input embeddings
         inputs_embeds = torch.nn.functional.embedding(
-            target_patches,
-            self.base.transformer.wte.weight
         )
         # concatenate the encoded patches with the input embeddings
         inputs_embeds = torch.cat(
-            (encoded_patches.unsqueeze(1), inputs_embeds[:, 1:, :]),
-            dim=1
         )
         return self.base(
-            inputs_embeds=inputs_embeds,
-            attention_mask=target_masks,
-            labels=labels
         )
     def generate(self, encoded_patch: torch.Tensor, tokens: torch.Tensor):
@@ -237,10 +254,7 @@ class CharLevelDecoder(PreTrainedModel):
         tokens = tokens.reshape(1, -1)
         # Get input embeddings
-        tokens = torch.nn.functional.embedding(
-            tokens,
-            self.base.transformer.wte.weight
-        )
         # Concatenate the encoded patch with the input embeddings
         tokens = torch.cat((encoded_patch, tokens[:, 1:, :]), dim=1)
@@ -249,17 +263,14 @@ class CharLevelDecoder(PreTrainedModel):
         outputs = self.base(inputs_embeds=tokens)
         # Get probabilities of next token
-        probs = torch.nn.functional.softmax(
-            outputs.logits.squeeze(0)[-1],
-            dim=-1
-        )
         return probs
 class TunesFormer(PreTrainedModel):
     """
-    TunesFormer is a hierarchical music generation model based on bar patching.
     It includes a patch-level decoder and a character-level decoder.
     It inherits PreTrainedModel from transformers.
     """
@@ -271,18 +282,14 @@ class TunesFormer(PreTrainedModel):
         self.eos_token_id = 2
         if share_weights:
             max_layers = max(
-                encoder_config.num_hidden_layers,
-                decoder_config.num_hidden_layers
             )
-            max_context_size = max(
-                encoder_config.max_length,
-                decoder_config.max_length
-            )
             max_position_embeddings = max(
                 encoder_config.max_position_embeddings,
-                decoder_config.max_position_embeddings
             )
             encoder_config.num_hidden_layers = max_layers
@@ -298,17 +305,24 @@ class TunesFormer(PreTrainedModel):
         if share_weights:
             self.patch_level_decoder.base = self.char_level_decoder.base.transformer
-    def forward(self, patches: torch.Tensor, patch_sampling_batch_size: int = PATCH_SAMPLING_BATCH_SIZE):
         """
         The forward pass of the TunesFormer model.
         :param patches: the patches to be both encoded and decoded
         :return: the decoded patches
         """
         patches = patches.reshape(len(patches), -1, PATCH_SIZE)
-        encoded_patches = self.patch_level_decoder(
-            patches)["last_hidden_state"]
-        return self.char_level_decoder(encoded_patches.squeeze(0)[:-1, :], patches.squeeze(0)[1:, :], patch_sampling_batch_size)
     def generate(
         self,
@@ -317,7 +331,7 @@ class TunesFormer(PreTrainedModel):
         top_p: float = 1,
         top_k: int = 0,
         temperature: float = 1,
-        seed: int = None
     ):
         """
         The generate function for generating patches based on patches.
@@ -325,8 +339,7 @@ class TunesFormer(PreTrainedModel):
         :return: the generated patches
         """
         patches = patches.reshape(len(patches), -1, PATCH_SIZE)
-        encoded_patches = self.patch_level_decoder(
-            patches)["last_hidden_state"]
         if tokens == None:
             tokens = torch.tensor([self.bos_token_id], device=self.device)
@@ -342,19 +355,17 @@ class TunesFormer(PreTrainedModel):
             else:
                 n_seed = None
-            prob = self.char_level_decoder.generate(
-                encoded_patches[0][-1],
-                tokens
-            ).cpu().detach().numpy()
             prob = top_p_sampling(prob, top_p=top_p, return_probs=True)
             prob = top_k_sampling(prob, top_k=top_k, return_probs=True)
-            token = temperature_sampling(
-                prob,
-                temperature=temperature,
-                seed=n_seed
-            )
             generated_patch.append(token)
             if token == self.eos_token_id or len(tokens) >= PATCH_SIZE - 1:
@@ -362,8 +373,7 @@ class TunesFormer(PreTrainedModel):
             else:
                 tokens = torch.cat(
-                    (tokens, torch.tensor([token], device=self.device)),
-                    dim=0
                 )
         return generated_patch, n_seed
@@ -374,8 +384,9 @@ class PatchilizedData(Dataset):
         self.texts = []
         for item in tqdm(items):
-            text = item['control code'] + \
-                "\n".join(item['abc notation'].split('\n')[1:])
             input_patch = patchilizer.encode(text, add_special_patches=True)
             input_patch = torch.tensor(input_patch)
             if torch.sum(input_patch) != 0:

 def download(filename=WEIGHT_PATH, url=WEIGHT_URL):
     import time
     import requests
     try:
         response = requests.get(url, stream=True)
+        total_size = int(response.headers.get("content-length", 0))
         chunk_size = 1024
+        with open(filename, "wb") as file, tqdm(
             desc=f"Downloading weights to '{filename}'...",
             total=total_size,
+            unit="B",
             unit_scale=True,
             unit_divisor=1024,
         ) as bar:
                 size = file.write(data)
                 bar.update(size)
+    except Exception as e:
         print(f"Error: {e}")
         time.sleep(3)
         download(filename, ZH_WEIGHT_URL)
 class Patchilizer:
     """
+    A class for converting music bars to patches and vice versa.
     """
     def __init__(self):
         """
         Split a body of music into individual bars.
         """
+        bars = re.split(self.regexPattern, "".join(body))
         bars = list(filter(None, bars))
         # remove empty strings
         if bars[0] in self.delimiters:
         """
         Convert a bar into a patch of specified length.
         """
+        patch = [self.bos_token_id] + [ord(c) for c in bar] + [self.eos_token_id]
         patch = patch[:patch_size]
         patch += [self.pad_token_id] * (patch_size - len(patch))
         return patch
         """
         Convert a patch into a bar.
         """
+        return "".join(
+            chr(idx) if idx > self.eos_token_id else ""
+            for idx in patch
+            if idx != self.eos_token_id
+        )
+    def encode(
+        self,
+        abc_code,
+        patch_length=PATCH_LENGTH,
+        patch_size=PATCH_SIZE,
+        add_special_patches=False,
+    ):
         """
         Encode music into patches of specified length.
         """
+        lines = unidecode(abc_code).split("\n")
         lines = list(filter(None, lines))  # remove empty lines
         body = ""
         patches = []
         for line in lines:
+            if len(line) > 1 and (
+                (line[0].isalpha() and line[1] == ":") or line.startswith("%%score")
+            ):
                 if body:
                     bars = self.split_bars(body)
                     patches.extend(
+                        self.bar2patch(
+                            bar + "\n" if idx == len(bars) - 1 else bar, patch_size
+                        )
+                        for idx, bar in enumerate(bars)
                     )
                     body = ""
+                patches.append(self.bar2patch(line + "\n", patch_size))
             else:
+                body += line + "\n"
         if body:
             patches.extend(
             )
         if add_special_patches:
+            bos_patch = [self.bos_token_id] * (patch_size - 1) + [self.eos_token_id]
+            eos_patch = [self.bos_token_id] + [self.eos_token_id] * (patch_size - 1)
             patches = [bos_patch] + patches + [eos_patch]
         return patches[:patch_length]
         """
         Decode patches into music.
         """
+        return "".join(self.patch2bar(patch) for patch in patches)
 class PatchLevelDecoder(PreTrainedModel):
     """
+    A Patch-level Decoder model for generating patch features in an auto-regressive manner.
     It inherits PreTrainedModel from transformers.
     """
 class CharLevelDecoder(PreTrainedModel):
     """
+    A Char-level Decoder model for generating the characters within each bar patch sequentially.
     It inherits PreTrainedModel from transformers.
     """
         self.eos_token_id = 2
         self.base = GPT2LMHeadModel(config)
+    def forward(
+        self,
+        encoded_patches: torch.Tensor,
+        target_patches: torch.Tensor,
+        patch_sampling_batch_size: int,
+    ):
         """
         The forward pass of the char-level decoder model.
         :param encoded_patches: the encoded patches
         target_masks = target_masks.masked_fill_(labels == -100, 0)
         # select patches
+        if (
+            patch_sampling_batch_size != 0
+            and patch_sampling_batch_size < target_patches.shape[0]
+        ):
             indices = list(range(len(target_patches)))
             random.shuffle(indices)
             selected_indices = sorted(indices[:patch_sampling_batch_size])
         # get input embeddings
         inputs_embeds = torch.nn.functional.embedding(
+            target_patches, self.base.transformer.wte.weight
         )
         # concatenate the encoded patches with the input embeddings
         inputs_embeds = torch.cat(
+            (encoded_patches.unsqueeze(1), inputs_embeds[:, 1:, :]), dim=1
         )
         return self.base(
+            inputs_embeds=inputs_embeds, attention_mask=target_masks, labels=labels
         )
     def generate(self, encoded_patch: torch.Tensor, tokens: torch.Tensor):
         tokens = tokens.reshape(1, -1)
         # Get input embeddings
+        tokens = torch.nn.functional.embedding(tokens, self.base.transformer.wte.weight)
         # Concatenate the encoded patch with the input embeddings
         tokens = torch.cat((encoded_patch, tokens[:, 1:, :]), dim=1)
         outputs = self.base(inputs_embeds=tokens)
         # Get probabilities of next token
+        probs = torch.nn.functional.softmax(outputs.logits.squeeze(0)[-1], dim=-1)
         return probs
 class TunesFormer(PreTrainedModel):
     """
+    TunesFormer is a hierarchical music generation model based on bar patching.
     It includes a patch-level decoder and a character-level decoder.
     It inherits PreTrainedModel from transformers.
     """
         self.eos_token_id = 2
         if share_weights:
             max_layers = max(
+                encoder_config.num_hidden_layers, decoder_config.num_hidden_layers
             )
+            max_context_size = max(encoder_config.max_length, decoder_config.max_length)
             max_position_embeddings = max(
                 encoder_config.max_position_embeddings,
+                decoder_config.max_position_embeddings,
             )
             encoder_config.num_hidden_layers = max_layers
         if share_weights:
             self.patch_level_decoder.base = self.char_level_decoder.base.transformer
+    def forward(
+        self,
+        patches: torch.Tensor,
+        patch_sampling_batch_size: int = PATCH_SAMPLING_BATCH_SIZE,
+    ):
         """
         The forward pass of the TunesFormer model.
         :param patches: the patches to be both encoded and decoded
         :return: the decoded patches
         """
         patches = patches.reshape(len(patches), -1, PATCH_SIZE)
+        encoded_patches = self.patch_level_decoder(patches)["last_hidden_state"]
+        return self.char_level_decoder(
+            encoded_patches.squeeze(0)[:-1, :],
+            patches.squeeze(0)[1:, :],
+            patch_sampling_batch_size,
+        )
     def generate(
         self,
         top_p: float = 1,
         top_k: int = 0,
         temperature: float = 1,
+        seed: int = None,
     ):
         """
         The generate function for generating patches based on patches.
         :return: the generated patches
         """
         patches = patches.reshape(len(patches), -1, PATCH_SIZE)
+        encoded_patches = self.patch_level_decoder(patches)["last_hidden_state"]
         if tokens == None:
             tokens = torch.tensor([self.bos_token_id], device=self.device)
             else:
                 n_seed = None
+            prob = (
+                self.char_level_decoder.generate(encoded_patches[0][-1], tokens)
+                .cpu()
+                .detach()
+                .numpy()
+            )
             prob = top_p_sampling(prob, top_p=top_p, return_probs=True)
             prob = top_k_sampling(prob, top_k=top_k, return_probs=True)
+            token = temperature_sampling(prob, temperature=temperature, seed=n_seed)
             generated_patch.append(token)
             if token == self.eos_token_id or len(tokens) >= PATCH_SIZE - 1:
             else:
                 tokens = torch.cat(
+                    (tokens, torch.tensor([token], device=self.device)), dim=0
                 )
         return generated_patch, n_seed
         self.texts = []
         for item in tqdm(items):
+            text = item["control code"] + "\n".join(
+                item["abc notation"].split("\n")[1:]
+            )
             input_patch = patchilizer.encode(text, add_special_patches=True)
             input_patch = torch.tensor(input_patch)
             if torch.sum(input_patch) != 0: