yjhuangcd
committed on
Commit · 9965bf6
1 Parent(s): 9708aee
First commit
This view is limited to 50 files because it contains too many changes. See raw diff.
- README.md +136 -3
- compute_std.py +54 -0
- datasets/README.md +20 -0
- datasets/all_midi.csv +0 -0
- datasets/chunk_midi.py +72 -0
- datasets/filter_class.py +38 -0
- datasets/piano_roll_all.py +139 -0
- datasets/select_midi.py +74 -0
- diff_collage/README.md +3 -0
- diff_collage/__init__.py +5 -0
- diff_collage/avg_circle.py +64 -0
- diff_collage/avg_long.py +40 -0
- diff_collage/condind_circle.py +190 -0
- diff_collage/condind_long.py +147 -0
- diff_collage/generic_sampler.py +113 -0
- diff_collage/loss_helper.py +41 -0
- diff_collage/w_img.py +79 -0
- diff_collage/w_loss.py +433 -0
- environment.yml +282 -0
- guided_diffusion/__init__.py +3 -0
- guided_diffusion/condition_functions.py +174 -0
- guided_diffusion/dist_util.py +104 -0
- guided_diffusion/dit.py +983 -0
- guided_diffusion/embed_datasets.py +161 -0
- guided_diffusion/fp16_util.py +237 -0
- guided_diffusion/gaussian_diffusion.py +1400 -0
- guided_diffusion/logger.py +521 -0
- guided_diffusion/losses.py +77 -0
- guided_diffusion/midi_util.py +291 -0
- guided_diffusion/nn.py +170 -0
- guided_diffusion/pr_datasets_all.py +183 -0
- guided_diffusion/resample.py +154 -0
- guided_diffusion/respace.py +128 -0
- guided_diffusion/script_util.py +531 -0
- guided_diffusion/train_util.py +475 -0
- guided_diffusion/unet.py +906 -0
- load_utils.py +31 -0
- music_evaluation/README.md +22 -0
- music_evaluation/convert_to_wav.py +42 -0
- music_evaluation/demo.ipynb +0 -0
- music_evaluation/fad.py +38 -0
- music_evaluation/figaro/chord_recognition.py +247 -0
- music_evaluation/figaro/constants.py +47 -0
- music_evaluation/figaro/evaluate.py +268 -0
- music_evaluation/figaro/input_representation.py +655 -0
- music_evaluation/figaro/vocab.py +166 -0
- music_evaluation/mgeval/__init__.py +0 -0
- music_evaluation/mgeval/__init__.pyc +0 -0
- music_evaluation/mgeval/core.py +644 -0
- music_evaluation/mgeval/core.pyc +0 -0
README.md
CHANGED
@@ -1,3 +1,136 @@
# Symbolic Music Generation with Non-Differentiable Rule Guided Diffusion

This is the codebase for the paper: [Symbolic Music Generation with Non-Differentiable Rule Guided Diffusion](https://arxiv.org/abs/2402.14285).

We introduce a symbolic music generator with non-differentiable rule-guided diffusion models, drawing inspiration from stochastic control. For music demos, please visit our [project website](https://scg-rule-guided-music.github.io/).

<img align="center" src="rule_guided_music_gen.png" width="750">

## Set up the environment

- Put the pretrained VAE checkpoint under `taming-transformers/checkpoints`.
- Create the conda virtual environment via: `conda env create -f environment.yml`
- Activate the virtual environment: `conda activate guided`

## Download Pretrained Checkpoints
- Pretrained VAE checkpoint under `trained_models/VAE`: put it at `taming-transformers/checkpoints/all_onset/epoch_14.ckpt`.
- Pretrained diffusion model checkpoint under `trained_models/diffusion`: put it at `loggings/checkpoints/ema_0.9999_1200000.pt`.
- Pretrained classifiers for each rule under `trained_models/classifier`: put them under `loggings/classifier/`.

## Rule Guided Generation
All the rule-guidance configs are stored in `scripts/configs/`. `cond_demo` contains the configs we used to generate the demos for composer co-creation, `cond_table` contains the configs we used to create the table in the paper, and `edit` contains the configs for editing an existing excerpt.
For instance, to guide diffusion models on all of the rules simultaneously using both SCG and classifier guidance, use the config `scripts/configs/cond_table/all/scg_classifier_all.yml`. The results will be saved in `loggings/cond_table/all/scg_classifier_all`.

The config file contains the following fields:
| Field | Description |
|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `target_rules` | Specify the desired attributes here. New rules can be added by writing rule programs in `music_rule_guidance/music_rules.py` and updating `rule_maps`. |
| `guidance` | Hyper-parameters for guidance, including the classifier config for classifier guidance and when to start or stop using guidance. |
| `scg` | Hyper-parameters for stochastic control guidance (SCG, ours). |
| `sampling` | Hyper-parameters for diffusion model sampling. Options include using DDIM or sampling longer sequences with `diff_collage`. |

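For orientation, a minimal config sketch might look as follows. The four top-level fields match the table above; the specific rule key, classifier path, and the key names and values inside each field are illustrative assumptions, not copies of the released configs (see `scripts/configs/` for the real ones).

```
# Hypothetical sketch: top-level fields follow the table above;
# the keys and values inside each field are illustrative assumptions.
target_rules:
  note_density: 8.0                 # assumed rule key and target label
guidance:
  classifier_config: loggings/classifier/note_density.yml  # assumed path
  start_step: 1000                  # assumed: when to start using guidance
  stop_step: 0                      # assumed: when to stop using guidance
scg:
  num_candidates: 16                # assumed: SCG branches sampled per step
sampling:
  use_ddim: false                   # assumed flag
  diff_collage: false               # assumed flag
```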
To run the rule-guided sampling code, use the following script:
```
python sample_rule.py \
  --config_path <config_file> \
  --batch_size 4 \
  --num_samples 20 \
  --data_dir <data_dir> \
  --model DiTRotary_XL_8 \
  --model_path loggings/checkpoints/ema_0.9999_1200000.pt \
  --image_size 128 16 \
  --in_channels 4 \
  --scale_factor 1.2465 \
  --class_cond True \
  --num_classes 3 \
  --class_label 1
```
The meaning of each hyper-parameter is listed as follows:
| Hyper-parameter | Description |
|-------------------|-------------------------------------------------------------------------------------------------------------------|
| `config_path` | Path to the configuration file, e.g., `scripts/configs/cond_demo/demo1.yml`. |
| `batch_size` | Batch size for generation. Default: 4. |
| `num_samples` | How many samples to generate in total. Default: 20. |
| `data_dir` | Optional: directory where data is stored. Used to extract rule labels from existing music excerpts. Not needed if target rule labels are given (leave at the default value in that case). |
| `model` | Model backbone for the diffusion model. Default: DiTRotary_XL_8. |
| `model_path` | Path to the pretrained diffusion model. |
| `image_size` | Size of the generated piano roll in latent space (for 10.24 s, the size is 128x16). |
| `in_channels` | Number of channels in the latent space of the pretrained VAE model. Default: 4. |
| `scale_factor` | 1 / std of the latents. You can use `compute_std.py` to compute it for a pretrained VAE. Default: 1.2465 (computed for the VAE checkpoint that we provide). |
| `class_cond` | Whether to condition on music genre (datasets: Maestro, Muscore and Pop) for generation. Default: True. |
| `num_classes` | Number of classes (datasets). We trained on 3 datasets. |
| `class_label` | 0 for Maestro (classical performance), 1 for Muscore (classical sheet music), 2 for Pop. |

To guide on new rules in addition to those we considered (pitch histogram, note density and chord progression),
add the rule function to `music_rule_guidance/music_rules.py` and register it in `FUNC_DICT` in `rule_maps.py`.
In addition, you need to pick a loss function for the newly added rule and add it to `LOSS_DICT` in `rule_maps.py`.
Then you can use the key in `FUNC_DICT` for `target_rules` in the config file, as sketched below.
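As a rough sketch (the real signatures live in `rule_maps.py` and `music_rules.py`; the function shapes below are assumptions based on the description above), registering a new rule could look like:

```
# Hypothetical sketch -- check rule_maps.py for the actual conventions.
import torch

def note_range(piano_roll: torch.Tensor) -> torch.Tensor:
    """Assumed rule function: span between highest and lowest active pitch."""
    active = piano_roll.sum(dim=-1).nonzero()
    return (active.max() - active.min()).float()

def mse_loss(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """Assumed loss pairing for the new rule."""
    return (pred - target) ** 2

# In rule_maps.py (dict names from the text above; exact structure assumed):
FUNC_DICT = {"note_range": note_range}
LOSS_DICT = {"note_range": mse_loss}
```

The key `note_range` would then be usable under `target_rules` in a config file.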

This framework also supports editing an existing excerpt:
```
python scripts/edit.py \
  --config_path scripts/configs/edit_table/nd_500_num16.yml \
  --batch_size 2 \
  --num_samples 20 \
  --data_dir <data_dir> \
  --model DiTRotary_XL_8 \
  --model_path loggings/checkpoints/ema_0.9999_1200000.pt \
  --image_size 128 16 \
  --in_channels 4 \
  --scale_factor 1.2465 \
  --class_cond True \
  --num_classes 3 \
  --class_label 2
```

## Train diffusion model for music generation
To train a diffusion model for symbolic music generation, use the following script:
```
mpiexec -n 8 python scripts/train_dit.py \
  --dir <loggings/save_dir> \
  --data_dir <datasets/data_dir> \
  --model DiTRotary_XL_8 \
  --image_size 128 16 \
  --in_channels 4 \
  --batch_size 32 \
  --encode_rep 4 \
  --shift_size 4 \
  --pr_image_size 2560 \
  --microbatch_encode -1 \
  --class_cond True \
  --num_classes 3 \
  --scale_factor <scale_factor> \
  --fs 100 \
  --save_interval 10000 \
  --resume <dir to the last saved model ckpt>
```
The meaning of each hyper-parameter is listed as follows:
| Hyper-parameter | Description |
|-------------------|-------------------------------------------------------------------------------------------------------------------|
| `mpiexec -n 8` | Multi-GPU training, using 8 GPUs. |
| `embed_model_name`| VAE config; default is `kl/f8-all-onset`. |
| `embed_model_ckpt`| Directory of the VAE checkpoint. |
| `dir` | Directory to save diffusion checkpoints and generated samples. |
| `data_dir` | Where you store your piano roll data. |
| `model` | Diffusion model name (config), e.g., `DiTRotary_XL_8`: a DiT XL model with 1D patch_size=8 (seq_len=256). |
| `image_size` | Latent space size (for 10.24 s, the size is 128x16). |
| `in_channels` | Number of latent space channels (default is 4). |
| `batch_size` | Batch size on each GPU. The effective batch size is batch_size * num_GPUs; aim for an effective batch size of 256. |
| `encode_rep` | How many excerpts to create from a long sequence. `batch_size` needs to be greater than or equal to `encode_rep`. Default: 4. |
| `shift_size` | Time shift between successive music excerpts from a long sequence. Default: 4. |
| `pr_image_size` | Length of a long sequence; needs to be compatible with `encode_rep` and `shift_size`. For example, for `encode_rep=4` and `shift_size=4`, the excerpts created from a long sequence are 1-8, 5-12, 9-16 and 13-20. Therefore `pr_image_size = 20 x 128 = 2560` (see the worked example after the table). |
| `class_cond` | Train with class conditioning (score(x,y), where y is the class). |
| `num_classes` | Number of classes in your conditioning, e.g., 3: 0 for Maestro, 1 for Muscore, 2 for Pop. |
| `scale_factor` | 1 / std of the latents. You can use `compute_std.py` to compute it for a pretrained VAE. |
| `fs` | Time resolution is 1 / fs. |
| `save_interval` | Frequency of saving checkpoints, e.g., every 10k steps. |

+
## References
|
132 |
+
This repository is based on [openai/guided-diffusion](https://github.com/openai/guided-diffusion), with modifications for data representation, guidance algorithm and architecture improvements.
|
133 |
+
- The VAE architecture is modified upon [taming-transformers](https://github.com/CompVis/taming-transformers).
|
134 |
+
- The DiT architecture is modified upon [DiT](https://github.com/facebookresearch/DiT).
|
135 |
+
- Music evaluation code is adapted from [mgeval](https://github.com/RichardYang40148/mgeval) and [figaro](https://github.com/dvruette/figaro).
|
136 |
+
- MIDI to piano roll representation is adapted from [pretty_midi](https://github.com/craffel/pretty-midi).
|
compute_std.py
ADDED
@@ -0,0 +1,54 @@
import os
import torch
from load_utils import load_model
from guided_diffusion import dist_util
from guided_diffusion.gaussian_diffusion import _encode, _decode
from guided_diffusion.pr_datasets_all import load_data
from tqdm import tqdm
from guided_diffusion.midi_util import visualize_full_piano_roll, save_piano_roll_midi
from music_rule_guidance import music_rules
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
plt.rcParams["figure.figsize"] = (20, 3)
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300


MODEL_NAME = 'kl/f8-all-onset'
MODEL_CKPT = 'taming-transformers/checkpoints/all_onset/epoch_14.ckpt'

TOTAL_BATCH = 256


def main():

    data = load_data(
        data_dir='datasets/all-len-40-gap-16-no-empty_train.csv',
        batch_size=32,
        class_cond=True,
        image_size=1024,
        deterministic=False,
        fs=100,
    )
    embed_model = load_model(MODEL_NAME, MODEL_CKPT)
    del embed_model.loss
    embed_model.to(dist_util.dev())
    embed_model.eval()

    z_list = []
    with torch.no_grad():
        for _ in tqdm(range(TOTAL_BATCH)):
            batch, cond = next(data)
            batch = batch.to(dist_util.dev())
            enc = _encode(batch, embed_model, scale_factor=1.)
            z_list.append(enc.cpu())
    latents = torch.concat(z_list, dim=0)
    scale_factor = 1. / latents.flatten().std().item()
    print(f"scale_factor: {scale_factor}")
    print("done")


if __name__ == "__main__":
    main()
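A quick sanity check of the printed factor, assuming `_encode` multiplies the latents by `scale_factor` (as its use above with `scale_factor=1.` suggests): rescaling the collected latents by the result should give approximately unit standard deviation.

```
# Could be appended to main() above as a sanity check (assumption as noted):
scaled = latents * scale_factor
print(scaled.flatten().std())  # should be ~1.0
```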
datasets/README.md
ADDED
@@ -0,0 +1,20 @@
# Creating data representation for symbolic music

This directory contains instructions and scripts for creating the training dataset.
Note that you do not need to prepare a dataset if you want to generate music and already have target rule labels in mind.
You need to prepare a dataset if you want to train a model or extract rule labels from existing music excerpts.

We train our diffusion model on three datasets: [Maestro](https://magenta.tensorflow.org/datasets/maestro#v300) (classical piano performance), Muscore (crawled from the Muscore website), and Pop ([Pop1k7](https://drive.google.com/file/d/1qw_tVUntblIg4lW16vbpjLXVndkVtgDe/view) and [Pop909](https://github.com/music-x-lab/POP909-Dataset)).
You can download the data and put the MIDI files into the corresponding folder: `maestro`, `muscore` and `pop`.

Then run `piano_roll_all.py` to create piano roll excerpts from the dataset.

The above script creates piano roll excerpts of 1.28 s.
To create music of 10.24 s for training, return to the main folder and run `rearrange_pr_data.py` to concatenate shorter piano rolls into longer ones.
The processed data will be saved in `datasets/all-len-40-gap-16-no-empty` by default, along with two CSV files,
`all-len-40-gap-16-no-empty_train.csv` and `all-len-40-gap-16-no-empty_test.csv`, that list the filenames.

If you want to extract rule labels from the piano rolls and condition on a specific dataset, you need to create a CSV file for each dataset using:
```
python filter_class.py --file_path all-len-40-gap-16-no-empty_test.csv --class_label <class label>
```
datasets/all_midi.csv
ADDED
The diff for this file is too large to render.
See raw diff
datasets/chunk_midi.py
ADDED
@@ -0,0 +1,72 @@
import os
import pretty_midi
import argparse

def chunk_midi(input_path, output_dir, chunk_length=10.24):
    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for midi_file_name in os.listdir(input_path):
        if not (midi_file_name.endswith('.midi') or midi_file_name.endswith('.mid')):
            continue  # Skip non-MIDI files

        full_path = os.path.join(input_path, midi_file_name)
        try:
            midi_data = pretty_midi.PrettyMIDI(full_path)
        except Exception as e:
            print(f"Error processing {midi_file_name}: {e}")
            continue  # Skip to the next file if an error occurs

        end_time = midi_data.get_end_time()  # Get end time directly with pretty_midi
        num_chunks = int(end_time // chunk_length) + (1 if end_time % chunk_length > 0 else 0)

        base_name, file_extension = os.path.splitext(midi_file_name)

        for i in range(num_chunks):
            start_time = i * chunk_length
            segment_end_time = min((i + 1) * chunk_length, midi_data.get_end_time())

            # Create a new MIDI object for each chunk
            chunk_midi_data = pretty_midi.PrettyMIDI()

            # Merge non-drum instruments into a single instrument
            merged_instrument = pretty_midi.Instrument(program=0, is_drum=False)

            for instrument in midi_data.instruments:
                if not instrument.is_drum:
                    for note in instrument.notes:
                        if start_time <= note.start < segment_end_time:
                            # Shift the note start and end times to start at 0
                            new_note = pretty_midi.Note(
                                velocity=note.velocity,
                                pitch=note.pitch,
                                start=note.start - start_time,
                                end=note.end - start_time
                            )
                            merged_instrument.notes.append(new_note)
                else:
                    # Drum instrument: copy the notes into a new instrument so the
                    # original MIDI data is not shifted in place for later chunks
                    new_drum_instrument = pretty_midi.Instrument(program=instrument.program, is_drum=True, name=instrument.name)
                    new_drum_instrument.notes = [
                        pretty_midi.Note(velocity=note.velocity, pitch=note.pitch,
                                         start=note.start - start_time, end=note.end - start_time)
                        for note in instrument.notes if start_time <= note.start < segment_end_time
                    ]
                    chunk_midi_data.instruments.append(new_drum_instrument)

            # Add the merged instrument to the MIDI object
            chunk_midi_data.instruments.append(merged_instrument)

            # Save the chunk with the same extension as the original file
            new_midi_name = "{}_{}{}".format(base_name, i, file_extension)
            chunk_midi_data.write(os.path.join(output_dir, new_midi_name))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Chunk MIDI files into specified lengths.")
    parser.add_argument("--input_path", type=str, help="Path to the directory containing the MIDI files to chunk.")
    parser.add_argument("--output_dir", type=str, help="Path to the directory where the chunked MIDI files will be saved.")
    parser.add_argument("--chunk_length", type=float, default=10.24, help="Length to chunk the MIDI files to (s).")
    args = parser.parse_args()

    chunk_midi(args.input_path, args.output_dir, chunk_length=args.chunk_length)
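For example, to split a folder of MIDI files into 10.24 s chunks (the folder names here are illustrative; `maestro` matches the layout described in `datasets/README.md`):

```
python datasets/chunk_midi.py --input_path maestro --output_dir maestro_chunks --chunk_length 10.24
```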
datasets/filter_class.py
ADDED
@@ -0,0 +1,38 @@
import pandas as pd
import argparse

def filter_and_save_csv(file_path, class_label):
    """
    Filters a CSV file to keep only the rows where the 'classes' column equals the specified class label.
    Saves the filtered DataFrame to a new CSV file.

    :param file_path: Path to the original CSV file.
    :param class_label: The class label to filter by.
    """
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Keep only the rows where 'classes' equals the specified class_label
    filtered_df = df[df['classes'] == class_label]

    # Save the filtered DataFrame to a new CSV file.
    # The new file name is the original file name with '_cls_<class_label>' appended before the file extension.
    new_file_path = file_path.replace('.csv', f'_cls_{class_label}.csv')
    filtered_df.to_csv(new_file_path, index=False)

    print(f"Filtered CSV saved as: {new_file_path}")

def main():
    # Set up the argument parser
    parser = argparse.ArgumentParser(description="Filter a CSV file by class and save to a new file.")
    parser.add_argument("--file_path", type=str, help="Path to the original CSV file")
    parser.add_argument("--class_label", type=int, help="The class label to filter by")

    # Parse arguments
    args = parser.parse_args()

    # Call the function with the provided arguments
    filter_and_save_csv(args.file_path, args.class_label)

if __name__ == "__main__":
    main()
datasets/piano_roll_all.py
ADDED
@@ -0,0 +1,139 @@
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import pretty_midi
import pandas as pd
import numpy as np
from tqdm import tqdm
import math
from music_rule_guidance.music_rules import MAX_PIANO, MIN_PIANO

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (6, 3)
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

CC_SUSTAIN_PEDAL = 64


def split_csv(csv_path='merged_midi.csv'):
    # Separate training / validation / testing files
    df = pd.read_csv(csv_path)
    save_name = csv_path[:csv_path.rfind('.csv')]
    for split in ['train', 'validation', 'test']:
        path = os.path.join(save_name, split + '.csv')
        df_sub = df[df.split == split]
        df_sub.to_csv(path, index=False)
    return


def quantize_pedal(value, num_bins=8):
    """Quantize an integer value from 0 to 127 into num_bins bins and return the center value of the bin."""
    if value < 0 or value > 127:
        raise ValueError("Value should be between 0 and 127")
    # Determine bin size
    bin_size = 128 // num_bins  # 16
    # Quantize the value
    bin_index = value // bin_size
    bin_center = bin_size * bin_index + bin_size // 2
    # Handle edge case for the last bin
    if bin_center > 127:
        bin_center = 127
    return bin_center


def get_full_piano_roll(midi_data, fs, show=False):
    # Do not process the sustain pedal here
    piano_roll, onset_roll = midi_data.get_piano_roll(fs=fs, pedal_threshold=None, onset=True)
    # Save the pedal roll explicitly
    pedal_roll = np.zeros_like(piano_roll)
    # Process the pedal
    for instru in midi_data.instruments:
        pedal_changes = [_e for _e in instru.control_changes if _e.number == CC_SUSTAIN_PEDAL]
        for cc in pedal_changes:
            time_now = int(cc.time * fs)
            if time_now < pedal_roll.shape[-1]:
                # Need to distinguish a control_change of 0 from the background 0; with quantization, 0-16 becomes 8.
                # In Muscore files, 0 is immediately followed by 127, so shift by one column.
                if pedal_roll[MIN_PIANO, time_now] != 0. and abs(pedal_roll[MIN_PIANO, time_now] - cc.value) > 64:
                    # Use a shift of 2 here to avoid missing the change when using interpolation augmentation
                    pedal_roll[MIN_PIANO:MAX_PIANO + 1, min(time_now + 2, pedal_roll.shape[-1] - 1)] = quantize_pedal(cc.value)
                else:
                    pedal_roll[MIN_PIANO:MAX_PIANO + 1, time_now] = quantize_pedal(cc.value)
    full_roll = np.concatenate((piano_roll[None], onset_roll[None], pedal_roll[None]), axis=0)
    if show:
        plt.imshow(piano_roll[::-1, :1024], vmin=0, vmax=127)
        plt.show()
        plt.imshow(pedal_roll[::-1, :1024], vmin=0, vmax=127)
        plt.show()
    return full_roll


def preprocess_midi(target='merged', csv_path='merged_midi.csv', fs=100., image_size=128, overlap=False, show=False):
    # Get piano rolls from the MIDI files
    df = pd.read_csv(csv_path)
    total_pieces = len(df)
    if not os.path.exists(target):
        os.makedirs(target)
    for split in ['train', 'test']:
        path = os.path.join(target, split)
        if not os.path.exists(path):
            os.makedirs(path)
    for i in tqdm(range(total_pieces)):
        midi_filename = df.midi_filename[i]
        split = df.split[i]
        dataset = df.dataset[i]
        path = os.path.join(target, split)
        midi_data = pretty_midi.PrettyMIDI(os.path.join(dataset, midi_filename))
        full_roll = get_full_piano_roll(midi_data, fs=fs, show=show)
        for j in range(0, full_roll.shape[-1], image_size):
            if j + image_size <= full_roll.shape[-1]:
                full_roll_excerpt = full_roll[:, :, j:j + image_size]
            else:
                full_roll_excerpt = np.zeros((3, full_roll.shape[1], image_size))  # 3 x 128 x image_size
                full_roll_excerpt[:, :, : full_roll.shape[-1] - j] = full_roll[:, :, j:]
            empty_roll = math.isclose(full_roll_excerpt.max(), 0.)
            if not empty_roll:
                # Find the last '/' in the string
                last_slash_index = midi_filename.rfind('/')
                # Find the '.mid' in the string
                dot_mid_index = midi_filename.rfind('.mid')
                # Extract the substring between the last '/' and '.mid'
                save_name = midi_filename[last_slash_index + 1:dot_mid_index]
                full_roll_excerpt = full_roll_excerpt.astype(np.uint8)
                np.save(os.path.join(path, save_name + '_' + str(j // image_size) + '.npy'), full_roll_excerpt)
                # Save with the dataset name to avoid duplicate file names for VAE training
                # np.save(os.path.join(path, dataset + '_' + save_name + '_' + str(j // image_size) + '.npy'), full_roll_excerpt)
        if overlap:
            for j in range(image_size // 2, full_roll.shape[-1], image_size):  # overlap with image_size // 2
                if j + image_size <= full_roll.shape[-1]:
                    full_roll_excerpt = full_roll[:, :, j:j + image_size]
                else:
                    full_roll_excerpt = np.zeros((3, full_roll.shape[1], image_size))
                    full_roll_excerpt[:, :, : full_roll.shape[-1] - j] = full_roll[:, :, j:]
                empty_roll = math.isclose(full_roll_excerpt.max(), 0.)
                if not empty_roll:
                    last_slash_index = midi_filename.rfind('/')
                    dot_mid_index = midi_filename.rfind('.mid')
                    save_name = midi_filename[last_slash_index + 1:dot_mid_index]
                    full_roll_excerpt = full_roll_excerpt.astype(np.uint8)
                    np.save(os.path.join(path, 'shift_' + save_name + '_' + str(j // image_size) + '.npy'), full_roll_excerpt)
                    # Save with the dataset name to avoid duplicate file names for VAE training
                    # np.save(os.path.join(path, dataset + '_' + 'shift_' + save_name + '_' + str(j // image_size) + '.npy'), full_roll_excerpt)
    return


def main():
    # Create fs=100 1.28 s datasets without overlap (can be rearranged)
    preprocess_midi(target='all-128-fs100', csv_path='all_midi.csv', fs=100, image_size=128, overlap=False, show=False)
    # Create fs=100 2.56 s datasets with overlap (used for VAE training); when loading, select 1.28 s from 2.56 s
    # preprocess_midi(target='all-256-overlap-fs100', csv_path='all_midi.csv', fs=100, image_size=256, overlap=True,
    #                 show=False)
    # Create fs=12.5 (0.08 s) for the pixel-space diffusion model, rearrangement with length 2
    # preprocess_midi(target='all-128-fs12.5', csv_path='all_midi.csv', fs=12.5, image_size=128, overlap=False,
    #                 show=False)


if __name__ == "__main__":
    main()
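As a quick check of the pedal quantization above (num_bins=8, so bin_size=16):

```
# quantize_pedal(100): bin_index = 100 // 16 = 6, center = 6*16 + 8 = 104
assert quantize_pedal(100) == 104
# quantize_pedal(127): bin_index = 7, center = 7*16 + 8 = 120 (<= 127, no clamp)
assert quantize_pedal(127) == 120
```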
datasets/select_midi.py
ADDED
@@ -0,0 +1,74 @@
import os
import pretty_midi
import argparse
import random

def select_midi(input_path, output_dir, select_length=10.24):
    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for midi_file_name in os.listdir(input_path):
        if not (midi_file_name.endswith('.midi') or midi_file_name.endswith('.mid')):
            continue  # Skip non-MIDI files

        full_path = os.path.join(input_path, midi_file_name)
        try:
            midi_data = pretty_midi.PrettyMIDI(full_path)
        except Exception as e:
            print(f"Error processing {midi_file_name}: {e}")
            continue  # Skip to the next file if an error occurs

        end_time = midi_data.get_end_time()  # Get end time directly with pretty_midi

        if select_length > end_time:
            print("Segment length is longer than the MIDI file duration.")
            continue

        start_time = random.uniform(0, end_time - select_length)
        segment_end_time = start_time + select_length

        # Create a new MIDI object for the selected segment
        chunk_midi_data = pretty_midi.PrettyMIDI()

        # Merge non-drum instruments into a single instrument
        merged_instrument = pretty_midi.Instrument(program=0, is_drum=False)

        for instrument in midi_data.instruments:
            if not instrument.is_drum:
                for note in instrument.notes:
                    if start_time <= note.start < segment_end_time:
                        # Shift the note start and end times to start at 0
                        new_note = pretty_midi.Note(
                            velocity=note.velocity,
                            pitch=note.pitch,
                            start=note.start - start_time,
                            end=note.end - start_time
                        )
                        merged_instrument.notes.append(new_note)
            else:
                # If it's a drum instrument, just adjust the note times and append it
                new_drum_instrument = pretty_midi.Instrument(program=instrument.program, is_drum=True,
                                                             name=instrument.name)
                new_drum_instrument.notes = [note for note in instrument.notes if
                                             start_time <= note.start < segment_end_time]
                for note in new_drum_instrument.notes:
                    note.start -= start_time
                    note.end -= start_time
                chunk_midi_data.instruments.append(new_drum_instrument)

        # Add the merged instrument to the MIDI object
        chunk_midi_data.instruments.append(merged_instrument)

        # Save the segment with the same name as the original file
        chunk_midi_data.write(os.path.join(output_dir, midi_file_name))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Select a random fixed-length segment from each MIDI file.")
    parser.add_argument("--input_path", type=str, help="Path to the directory containing the MIDI files.")
    parser.add_argument("--output_dir", type=str, help="Path to the directory where the selected segments will be saved.")
    parser.add_argument("--select_length", type=float, default=10.24, help="Length of the segment to select (s).")
    args = parser.parse_args()

    select_midi(args.input_path, args.output_dir, select_length=args.select_length)
diff_collage/README.md
ADDED
@@ -0,0 +1,3 @@
# Diff Collage

This is an implementation of the [DiffCollage](https://arxiv.org/abs/2303.17076) paper. We use DiffCollage to generate long sequences that follow a certain structure (e.g., a loop).
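A minimal usage sketch, assuming a CUDA device and a wrapped diffusion model that predicts epsilon given `(x, t, y)`; the class and sampler signatures follow the files in this directory, while the dummy `eps_fn` and all shapes are illustrative stand-ins:

```
import torch as th
from diff_collage import CondIndSimple, generic_sampler

def eps_fn(x, scalar_t, y=None):
    # Stand-in for a wrapped diffusion model; returns dummy noise here.
    return th.randn_like(x)

# Four 128-wide segments with half-width (64) overlap -> total width 320.
work = CondIndSimple(shape=(4, 16, 128), eps_scalar_t_fn=eps_fn, num_img=4, overlap_size=64)
traj, losses = generic_sampler(
    work.generate_xT(1),                 # start from sigma = 80 noise
    rev_ts=work.rev_ts(50, ts_order=7),  # decreasing EDM-style time schedule
    noise_fn=work.noise,
    x0_pred_fn=work.x0_fn,
)
long_sample = traj["xt"][-1]             # (1, 4, 16, 320)
```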
diff_collage/__init__.py
ADDED
@@ -0,0 +1,5 @@
from .w_loss import *
from .generic_sampler import *
from .condind_long import CondIndSimple
from .condind_circle import CondIndCircle
from .avg_long import AvgLong
diff_collage/avg_circle.py
ADDED
@@ -0,0 +1,64 @@
import numpy as np
import torch as th
from einops import rearrange

from .generic_sampler import SimpleWork
from .w_img import split_wimg, avg_merge_wimg

class AvgCircle(SimpleWork):
    def __init__(self, shape, eps_scalar_t_fn, num_img, overlap_size=32):
        c, h, w = shape
        self.base_img_w = w
        self.overlap_size = overlap_size
        self.num_img = num_img
        final_img_w = w * num_img - self.overlap_size * num_img
        super().__init__((c, h, final_img_w), self.get_eps_t_fn(eps_scalar_t_fn))

    def get_eps_t_fn(self, eps_scalar_t_fn):
        def eps_t_fn(long_x, scalar_t, enable_grad=False):
            # Randomly rotate the circle so the seam location varies between calls
            shift = np.random.randint(self.base_img_w)
            long_x = th.cat(
                [
                    long_x[:, :, :, shift:],
                    long_x[:, :, :, :shift]
                ],
                dim=-1
            )

            # Wrap the first overlap around the end to close the circle
            x = th.cat(
                [
                    long_x,
                    long_x[:, :, :, :self.overlap_size]
                ],
                dim=-1,
            )
            xs, _overlap = split_wimg(x, self.num_img, rtn_overlap=True)
            assert _overlap == self.overlap_size
            full_eps = eps_scalar_t_fn(xs, scalar_t, enable_grad)  # ((b n), c, h, w)

            eps = avg_merge_wimg(full_eps, self.overlap_size, n=self.num_img)
            eps = th.cat(
                [
                    (eps[:, :, :, :self.overlap_size] + eps[:, :, :, -self.overlap_size:]) / 2.0,
                    eps[:, :, :, self.overlap_size:-self.overlap_size]
                ],
                dim=-1
            )
            assert eps.shape == long_x.shape
            # Undo the random rotation
            return th.cat(
                [
                    eps[:, :, :, -shift:],
                    eps[:, :, :, :-shift],
                ],
                dim=-1
            )

        return eps_t_fn

    def x0_fn(self, xt, scalar_t, enable_grad=False):
        cur_eps = self.eps_scalar_t_fn(xt, scalar_t, enable_grad)
        x0 = xt - scalar_t * cur_eps
        return x0, {}, {
            "x0": x0.cpu()
        }
diff_collage/avg_long.py
ADDED
@@ -0,0 +1,40 @@
import torch as th
from einops import rearrange

from .generic_sampler import SimpleWork
from .w_img import split_wimg, avg_merge_wimg

class AvgLong(SimpleWork):
    def __init__(self, shape, eps_scalar_t_fn, num_img, overlap_size=32):
        c, h, w = shape
        assert overlap_size == w // 2
        self.overlap_size = overlap_size
        self.num_img = num_img
        final_img_w = w * num_img - self.overlap_size * (num_img - 1)
        super().__init__((c, h, final_img_w), self.get_eps_t_fn(eps_scalar_t_fn))

    def loss(self, x):
        x1, x2 = x[:-1], x[1:]
        return th.sum(
            (th.abs(x1[:, :, :, -self.overlap_size:] - x2[:, :, :, :self.overlap_size])) ** 2,
            dim=(1, 2, 3),
        )

    def get_eps_t_fn(self, eps_scalar_t_fn):
        def eps_t_fn(long_x, scalar_t, y=None):
            xs = split_wimg(long_x, self.num_img, rtn_overlap=False)
            if y is not None:
                y = y.repeat_interleave(self.num_img)
            scalar_t = scalar_t.repeat_interleave(self.num_img)
            full_eps = eps_scalar_t_fn(xs, scalar_t, y=y)  # ((b n), c, h, w)
            full_eps = rearrange(
                full_eps,
                "(b n) c h w -> n b c h w", n=self.num_img
            )

            whole_eps = rearrange(
                full_eps,
                "n b c h w -> (b n) c h w"
            )
            return avg_merge_wimg(whole_eps, self.overlap_size, n=self.num_img, is_avg=False)
        return eps_t_fn
diff_collage/condind_circle.py
ADDED
@@ -0,0 +1,190 @@
import torch as th
from einops import rearrange

from .generic_sampler import SimpleWork
from .w_img import split_wimg, avg_merge_wimg

class CondIndCircle(SimpleWork):
    def __init__(self, shape, eps_scalar_t_fn, num_img, overlap_size=32):
        c, h, w = shape
        assert overlap_size == w // 2
        self.overlap_size = overlap_size
        self.num_img = num_img
        final_img_w = w * num_img - self.overlap_size * num_img
        super().__init__((c, h, final_img_w), self.get_eps_t_fn(eps_scalar_t_fn))

    def circle_split(self, in_x):
        long_x = th.cat(
            [
                in_x,
                in_x[:, :, :, :self.overlap_size],
            ],
            dim=-1
        )
        xs = split_wimg(long_x, self.num_img, rtn_overlap=False)
        return xs

    def circle_merge(self, xs, overlap_size=None):
        if overlap_size is None:
            overlap_size = self.overlap_size
        long_xs = avg_merge_wimg(xs, overlap_size, n=self.num_img, is_avg=True)
        return th.cat(
            [
                (
                    long_xs[:, :, :, :overlap_size] + long_xs[:, :, :, -overlap_size:]
                ) / 2.0,
                long_xs[:, :, :, overlap_size:-overlap_size]
            ],
            dim=-1
        )

    def get_eps_t_fn(self, eps_scalar_t_fn):
        def eps_t_fn(in_x, scalar_t, y=None):
            long_x = th.cat(
                [
                    in_x,
                    in_x[:, :, :, :self.overlap_size],
                ],
                dim=-1
            )
            xs = split_wimg(long_x, self.num_img, rtn_overlap=False)
            if y is not None:
                y = y.repeat_interleave(self.num_img)
            scalar_t = scalar_t.repeat_interleave(self.num_img)
            full_eps = eps_scalar_t_fn(xs, scalar_t, y=y)  # ((b n), c, h, w)
            full_eps = rearrange(
                full_eps,
                "(b n) c h w -> n b c h w", n=self.num_img
            )

            # calculate eps on the shared half windows
            half_eps = eps_scalar_t_fn(xs[:, :, :, -self.overlap_size:], scalar_t, y=y)  # ((b n), c, h, w//2)
            half_eps = rearrange(
                half_eps,
                "(b n) c h w -> n b c h w", n=self.num_img
            )

            half_eps[-1] = 0

            # subtract the overlap eps, following the DiffCollage factorization
            full_eps[:, :, :, :, -self.overlap_size:] = full_eps[:, :, :, :, -self.overlap_size:] - half_eps
            whole_eps = rearrange(
                full_eps,
                "n b c h w -> (b n) c h w"
            )
            long_eps = avg_merge_wimg(whole_eps, self.overlap_size, n=self.num_img, is_avg=False)
            return th.cat(
                [
                    (
                        long_eps[:, :, :, :self.overlap_size] + long_eps[:, :, :, -self.overlap_size:]
                    ) / 2.0,
                    long_eps[:, :, :, self.overlap_size:-self.overlap_size]
                ],
                dim=-1
            )
        return eps_t_fn


class CondIndCircleSR(SimpleWork):
    def __init__(self, shape, eps_scalar_t_fn, num_img, low_res, overlap_size=32):
        c, h, w = shape
        assert overlap_size == w // 2
        self.overlap_size = overlap_size
        self.low_overlap_size = low_res.shape[-2] // 2
        self.num_img = num_img
        final_img_w = w * num_img - self.overlap_size * num_img
        assert low_res.shape[-1] == self.low_overlap_size * num_img

        self.square_fn = self.get_square_sr_fn(eps_scalar_t_fn, low_res)
        self.half_fn = self.get_half_sr_fn(eps_scalar_t_fn, low_res)

        super().__init__((c, h, final_img_w), self.get_eps_t_fn())

    def circle_split(self, in_x, overlap_size=None):
        if overlap_size is None:
            overlap_size = self.overlap_size
        long_x = th.cat(
            [
                in_x,
                in_x[:, :, :, :overlap_size],
            ],
            dim=-1
        )
        xs = split_wimg(long_x, self.num_img, rtn_overlap=False)
        return xs

    def circle_merge(self, xs, overlap_size=None):
        if overlap_size is None:
            overlap_size = self.overlap_size
        long_xs = avg_merge_wimg(xs, overlap_size, n=self.num_img, is_avg=True)
        return th.cat(
            [
                (
                    long_xs[:, :, :, :overlap_size] + long_xs[:, :, :, -overlap_size:]
                ) / 2.0,
                long_xs[:, :, :, overlap_size:-overlap_size]
            ],
            dim=-1
        )

    def get_square_sr_fn(self, eps_fn, low_res):
        low_res = self.circle_split(low_res, self.low_overlap_size)
        def _fn(_x, _t, enable_grad):
            context = th.enable_grad if enable_grad else th.no_grad
            with context():
                vec_t = th.ones(_x.shape[0]).cuda() * _t
                rtn = eps_fn(_x, vec_t, low_res)
                rtn = rearrange(
                    rtn,
                    "(b n) c h w -> n b c h w", n=self.num_img
                )
            return rtn
        return _fn

    def get_half_sr_fn(self, eps_fn, low_res):
        low_res = self.circle_split(low_res, self.low_overlap_size)
        def _fn(_x, _t, enable_grad):
            context = th.enable_grad if enable_grad else th.no_grad
            with context():
                vec_t = th.ones(_x.shape[0]).cuda() * _t
                half_eps = eps_fn(_x[:, :, :, -self.overlap_size:], vec_t, low_res[:, :, :, -self.low_overlap_size:])
                half_eps = rearrange(
                    half_eps,
                    "(b n) c h w -> n b c h w", n=self.num_img
                )

                half_eps[-1] = 0
            return half_eps
        return _fn

    def get_eps_t_fn(self):
        def eps_t_fn(in_x, scalar_t, enable_grad=False):
            long_x = th.cat(
                [
                    in_x,
                    in_x[:, :, :, :self.overlap_size],
                ],
                dim=-1
            )
            xs = split_wimg(long_x, self.num_img, rtn_overlap=False)

            # full eps
            full_eps = self.square_fn(xs, scalar_t, enable_grad)
            # calculate half eps
            half_eps = self.half_fn(xs, scalar_t, enable_grad)

            full_eps[:, :, :, :, -self.overlap_size:] = full_eps[:, :, :, :, -self.overlap_size:] - half_eps
            whole_eps = rearrange(
                full_eps,
                "n b c h w -> (b n) c h w"
            )
            long_eps = avg_merge_wimg(whole_eps, self.overlap_size, n=self.num_img, is_avg=False)
            return th.cat(
                [
                    (
                        long_eps[:, :, :, :self.overlap_size] + long_eps[:, :, :, -self.overlap_size:]
                    ) / 2.0,
                    long_eps[:, :, :, self.overlap_size:-self.overlap_size]
                ],
                dim=-1
            )
        return eps_t_fn
diff_collage/condind_long.py
ADDED
@@ -0,0 +1,147 @@
import torch
import torch as th
from einops import rearrange

from .generic_sampler import SimpleWork
from .w_img import split_wimg, avg_merge_wimg

class CondIndSimple(SimpleWork):
    def __init__(self, shape, eps_scalar_t_fn, num_img, overlap_size=32):
        c, h, w = shape
        assert overlap_size == w // 2
        self.overlap_size = overlap_size
        self.num_img = num_img
        final_img_w = w * num_img - self.overlap_size * (num_img - 1)
        super().__init__((c, h, final_img_w), self.get_eps_t_fn(eps_scalar_t_fn))

    def loss(self, x):
        x1, x2 = x[:-1], x[1:]
        return th.sum(
            (th.abs(x1[:, :, :, -self.overlap_size:] - x2[:, :, :, :self.overlap_size])) ** 2,
            dim=(1, 2, 3),
        )

    def get_eps_t_fn(self, eps_scalar_t_fn):
        def eps_t_fn(long_x, scalar_t, y=None):
            xs = split_wimg(long_x, self.num_img, rtn_overlap=False)
            if y is not None:
                y = y.repeat_interleave(self.num_img)
            scalar_t = scalar_t.repeat_interleave(self.num_img)
            full_eps = eps_scalar_t_fn(xs, scalar_t, y=y)  # ((b n), c, h, w)
            full_eps = rearrange(
                full_eps,
                "(b n) c h w -> n b c h w", n=self.num_img
            )

            # calculate eps on the shared half windows
            half_eps = eps_scalar_t_fn(xs[:, :, :, -self.overlap_size:], scalar_t, y=y)  # ((b n), c, h, w//2)
            half_eps = rearrange(
                half_eps,
                "(b n) c h w -> n b c h w", n=self.num_img
            )

            half_eps[-1] = 0

            # subtract the overlap eps, following the DiffCollage factorization
            full_eps[:, :, :, :, -self.overlap_size:] = full_eps[:, :, :, :, -self.overlap_size:] - half_eps
            whole_eps = rearrange(
                full_eps,
                "n b c h w -> (b n) c h w"
            )
            return avg_merge_wimg(whole_eps, self.overlap_size, n=self.num_img, is_avg=False)
        return eps_t_fn


class CondIndSR(SimpleWork):
    def __init__(self, shape, eps_scalar_t_fn, num_img, low_res, overlap_size=128):
        c, h, w = shape
        assert overlap_size == w // 2
        self.overlap_size = overlap_size
        self.low_overlap_size = low_res.shape[-2] // 2
        self.num_img = num_img
        final_img_w = w * num_img - self.overlap_size * (num_img - 1)
        assert low_res.shape[-1] == self.low_overlap_size * (num_img + 1)

        self.square_fn = self.get_square_sr_fn(eps_scalar_t_fn, low_res)
        self.half_fn = self.get_half_sr_fn(eps_scalar_t_fn, low_res)

        super().__init__((c, h, final_img_w), self.get_eps_t_fn())

    def get_square_sr_fn(self, eps_fn, low_res):
        low_res = split_wimg(low_res, self.num_img, False)
        def _fn(_x, _t, enable_grad):
            context = th.enable_grad if enable_grad else th.no_grad
            with context():
                vec_t = th.ones(_x.shape[0]).cuda() * _t
                rtn = eps_fn(_x, vec_t, low_res)
                rtn = rearrange(
                    rtn,
                    "(b n) c h w -> n b c h w", n=self.num_img
                )
            return rtn
        return _fn

    def get_half_sr_fn(self, eps_fn, low_res):
        low_res = split_wimg(low_res, self.num_img, False)
        def _fn(_x, _t, enable_grad):
            context = th.enable_grad if enable_grad else th.no_grad
            with context():
                vec_t = th.ones(_x.shape[0]).cuda() * _t
                half_eps = eps_fn(_x[:, :, :, -self.overlap_size:], vec_t, low_res[:, :, :, -self.low_overlap_size:])
                half_eps = rearrange(
                    half_eps,
                    "(b n) c h w -> n b c h w", n=self.num_img
                )

                half_eps[-1] = 0
            return half_eps
        return _fn

    def get_eps_t_fn(self):
        def eps_t_fn(in_x, scalar_t, enable_grad=False):
            xs = split_wimg(in_x, self.num_img, rtn_overlap=False)

            # full eps
            full_eps = self.square_fn(xs, scalar_t, enable_grad)
            # calculate half eps
            half_eps = self.half_fn(xs, scalar_t, enable_grad)

            full_eps[:, :, :, :, -self.overlap_size:] = full_eps[:, :, :, :, -self.overlap_size:] - half_eps
            whole_eps = rearrange(
                full_eps,
                "n b c h w -> (b n) c h w"
            )
            out_eps = avg_merge_wimg(whole_eps, self.overlap_size, n=self.num_img, is_avg=False)
            return out_eps
        return eps_t_fn


# class CondIndLong(SimpleWork):
#     def __init__(self, shape, eps_scalar_t_fn, overlap_size=32):
#         super().__init__(shape, eps_scalar_t_fn)
#         self.overlap_size = overlap_size

#     def loss(self, x):
#         x1, x2 = x[:-1], x[1:]
#         return th.sum(
#             (th.abs(x1[:, :, :, -self.overlap_size:] - x2[:, :, :, :self.overlap_size])) ** 2,
#             dim=(1, 2, 3),
#         )

#     def generate_xT(self, n):
#         white_noise = th.randn((n, *self.shape)).cuda()
#         return self.noise(white_noise, None) * 80.0

#     def noise(self, xt, scalar_t):
#         del scalar_t
#         noise = th.randn_like(xt)
#         b, _, _, w = xt.shape
#         final_img_w = w * b - self.overlap_size * (b - 1)
#         noise = rearrange(noise, "(t n) c h w -> t c h (n w)", t=1)[:, :, :, :final_img_w]
#         noise = split_wimg(noise, b, rtn_overlap=False)
#         return noise

#     def merge(self, xs):
#         return avg_merge_wimg(xs, self.overlap_size)
diff_collage/generic_sampler.py
ADDED
@@ -0,0 +1,113 @@
from collections import defaultdict

import math
import numpy as np
import torch as th
from tqdm import tqdm

__all__ = [
    "generic_sampler",
    "SimpleWork",
]


def batch_mul(a, b):  # pylint: disable=invalid-name
    return th.einsum("a...,a...->a...", a, b)

class SimpleWork:
    def __init__(self, shape, eps_scalar_t_fn):
        self.shape = shape
        self.eps_scalar_t_fn = eps_scalar_t_fn

    def generate_xT(self, n):
        return 80.0 * th.randn((n, *self.shape)).cuda()

    def x0_fn(self, xt, scalar_t, y=None):
        cur_eps = self.eps_scalar_t_fn(xt, scalar_t, y=y)
        x0 = xt - scalar_t * cur_eps
        x0 = th.clip(x0, -1, 1)
        return x0, {}, {"x0": x0.cpu()}

    def noise(self, xt, scalar_t):
        del scalar_t
        return th.randn_like(xt)

    def rev_ts(self, n_step, ts_order):
        _rev_ts = th.pow(
            th.linspace(
                np.power(80.0, 1.0 / ts_order),
                np.power(1e-3, 1.0 / ts_order),
                n_step + 1
            ),
            ts_order
        )
        return _rev_ts.cuda()

def generic_sampler(  # pylint: disable=too-many-locals
    x,
    rev_ts,
    noise_fn,
    x0_pred_fn,
    xt_lgv_fn=None,
    s_churn=0.0,
    before_step_fn=None,
    end_fn=None,  # TODO
    is_tqdm=True,
    is_traj=True,
):
    measure_loss = defaultdict(list)
    traj = defaultdict(list)
    if callable(x):
        x = x()
    if is_traj:  # was `if traj:`, which is always False for an empty defaultdict
        traj["xt"].append(x.cpu())

    s_t_min = 0.05
    s_t_max = 50.0
    s_noise = 1.003
    eta = min(s_churn / len(rev_ts), math.sqrt(2.0) - 1)

    loop = zip(rev_ts[:-1], rev_ts[1:])
    if is_tqdm:
        loop = tqdm(loop)

    running_x = x
    for cur_t, next_t in loop:
        cur_x = running_x
        # Stochastic churn: temporarily raise the noise level (EDM-style)
        if cur_t < s_t_max and cur_t > s_t_min:
            hat_cur_t = cur_t + eta * cur_t
            cur_noise = noise_fn(cur_x, cur_t)
            cur_x = cur_x + s_noise * cur_noise * th.sqrt(hat_cur_t ** 2 - cur_t ** 2)
            cur_t = hat_cur_t

        if before_step_fn is not None:
            cur_x = before_step_fn(cur_x, cur_t)

        # Heun's second-order update: average the slopes at cur_t and next_t
        x0, loss_info, traj_info = x0_pred_fn(cur_x, cur_t)
        epsilon_1 = (cur_x - x0) / cur_t

        xt_next = x0 + next_t * epsilon_1

        x0, loss_info, traj_info = x0_pred_fn(xt_next, next_t)
        epsilon_2 = (xt_next - x0) / next_t

        xt_next = cur_x + (next_t - cur_t) * (epsilon_1 + epsilon_2) / 2

        running_x = xt_next

        if is_traj:
            for key, value in loss_info.items():
                measure_loss[key].append(value)

            for key, value in traj_info.items():
                traj[key].append(value)
            traj["xt"].append(running_x.to("cpu").detach())

    if xt_lgv_fn:
        raise RuntimeError("Not implemented")

    if is_traj:
        return traj, measure_loss
    return running_x
diff_collage/loss_helper.py
ADDED
@@ -0,0 +1,41 @@
import torch as th
from .generic_sampler import batch_mul

def get_x0_grad_pred_fn(raw_net_model, cond_loss_fn, weight_fn, x0_update, thres_t):
    def fn(xt, scalar_t):
        xt = xt.requires_grad_(True)
        x0_pred = raw_net_model(xt, scalar_t)

        loss_info = {
            "raw_x0": cond_loss_fn(x0_pred.detach()).cpu(),
        }
        traj_info = {
            "t": scalar_t,
        }
        if scalar_t < thres_t:
            x0_cor = x0_pred.detach()
        else:
            pred_loss = cond_loss_fn(x0_pred)
            grad_term = th.autograd.grad(pred_loss.sum(), xt)[0]
            weights = weight_fn(x0_pred, grad_term, cond_loss_fn)
            x0_cor = (x0_pred - batch_mul(weights, grad_term)).detach()
            loss_info["weight"] = weights.detach().cpu()
            traj_info["grad"] = grad_term.detach().cpu()

        if x0_update:
            x0 = x0_update(x0_cor, scalar_t)
        else:
            x0 = x0_cor

        loss_info["cor_x0"] = cond_loss_fn(x0_cor.detach()).cpu()
        loss_info["x0"] = cond_loss_fn(x0.detach()).cpu()
        traj_info.update({
            "raw_x0": x0_pred.detach().cpu(),
            "cor_x0": x0_cor.detach().cpu(),
            "x0": x0.detach().cpu(),
        })
        return x0_cor, loss_info, traj_info

    return fn
diff_collage/w_img.py
ADDED
@@ -0,0 +1,79 @@
import torch as th
from einops import rearrange

__all__ = [
    "split_wimg",
]


def split_wimg(wimg, n_img, rtn_overlap=True):
    """Split a wide image into n_img overlapping patches of width base_len."""
    if wimg.ndim == 3:
        wimg = wimg[None]
    _, _, h, w = wimg.shape
    base_len = 128  # TODO: hard-coded 128 (the length of the latents)
    overlap_size = (n_img * base_len - w) // (n_img - 1)
    assert n_img * base_len - overlap_size * (n_img - 1) == w

    img = th.nn.functional.unfold(
        wimg, kernel_size=(h, base_len), stride=base_len - overlap_size
    )  # (B, block, n_img)
    img = rearrange(img, "b (c h w) n -> (b n) c h w", h=h, w=base_len)

    if rtn_overlap:
        return img, overlap_size
    return img


def avg_merge_wimg(imgs, overlap_size, n=None, is_avg=True):
    """Merge overlapping patches back into a wide image, averaging the overlaps."""
    b, _, h, w = imgs.shape
    if n is None:
        n = b
    unfold_img = rearrange(imgs, "(b n) c h w -> b (c h w) n", n=n)
    img = th.nn.functional.fold(
        unfold_img,
        (h, n * w - (n - 1) * overlap_size),
        kernel_size=(h, w),
        stride=w - overlap_size,
    )
    if is_avg:
        # count how many patches contribute to each pixel, then normalize
        counter = th.nn.functional.fold(
            th.ones_like(unfold_img),
            (h, n * w - (n - 1) * overlap_size),
            kernel_size=(h, w),
            stride=w - overlap_size,
        )
        return img / counter
    return img


# legacy code using a naive implementation

def split_wimg_legacy(himg, n_img, rtn_overlap=True):
    if himg.ndim == 3:
        himg = himg[None]
    _, _, h, w = himg.shape
    overlap_size = (n_img * h - w) // (n_img - 1)
    assert n_img * h - overlap_size * (n_img - 1) == w
    himg = himg[0]
    rtn_img = [himg[:, :, :h]]
    for i in range(n_img - 1):
        rtn_img.append(himg[:, :, (h - overlap_size) * (i + 1) : h + (h - overlap_size) * (i + 1)])
    if rtn_overlap:
        return th.stack(rtn_img), overlap_size
    return th.stack(rtn_img)


def avg_merge_wimg_legacy(imgs, overlap_size):
    _, _, _, w = imgs.shape
    # first variant: keep patch 0 whole, append the non-overlapping tails
    rtn_img = [imgs[0]]
    for cur_img in imgs[1:]:
        rtn_img.append(cur_img[:, :, overlap_size:])
    first_img = th.cat(rtn_img, dim=-1)

    # second variant: keep the last patch whole, prepend the non-overlapping heads
    rtn_img = []
    for cur_img in imgs[:-1]:
        rtn_img.append(cur_img[:, :, : w - overlap_size])
    rtn_img.append(imgs[-1])
    second_img = th.cat(rtn_img, dim=-1)

    return (first_img + second_img) / 2.0
diff_collage/w_loss.py
ADDED
@@ -0,0 +1,433 @@
import math
import torch as th
from einops import rearrange
import numpy as np

from .generic_sampler import batch_mul


def split_wimg(himg, n_img, rtn_overlap=True):
    if himg.ndim == 3:
        himg = himg[None]
    _, _, h, w = himg.shape
    overlap_size = (n_img * h - w) // (n_img - 1)
    assert n_img * h - overlap_size * (n_img - 1) == w
    himg = himg[0]
    rtn_img = [himg[:, :, :h]]
    for i in range(n_img - 1):
        rtn_img.append(himg[:, :, (h - overlap_size) * (i + 1) : h + (h - overlap_size) * (i + 1)])
    if rtn_overlap:
        return th.stack(rtn_img), overlap_size
    return th.stack(rtn_img)


def merge_wimg(imgs, overlap_size):
    _, _, _, w = imgs.shape
    rtn_img = [imgs[0]]
    for cur_img in imgs[1:]:
        rtn_img.append(cur_img[:, :, overlap_size:])
    first_img = th.cat(rtn_img, dim=-1)

    rtn_img = []
    for cur_img in imgs[:-1]:
        rtn_img.append(cur_img[:, :, : w - overlap_size])
    rtn_img.append(imgs[-1])
    second_img = th.cat(rtn_img, dim=-1)

    return (first_img + second_img) / 2.0


def get_x0_pred_fn(raw_net_model, cond_loss_fn, weight_fn, x0_fn, thres_t, init_fn=None):
    def fn(xt, scalar_t):
        if init_fn is not None:
            xt = init_fn(xt, scalar_t)
        xt = xt.requires_grad_(True)
        x0_pred = raw_net_model(xt, scalar_t)

        loss_info = {
            "raw_x0": cond_loss_fn(x0_pred.detach()).cpu(),
        }
        traj_info = {
            "t": scalar_t,
        }
        if scalar_t < thres_t:
            x0_cor = x0_pred.detach()
        else:
            pred_loss = cond_loss_fn(x0_pred)
            grad_term = th.autograd.grad(pred_loss.sum(), xt)[0]
            weights = weight_fn(x0_pred, grad_term, cond_loss_fn)
            x0_cor = (x0_pred - batch_mul(weights, grad_term)).detach()
            loss_info["weight"] = weights.detach().cpu()
            traj_info["grad"] = grad_term.detach().cpu()

        if x0_fn:
            x0 = x0_fn(x0_cor, scalar_t)
        else:
            x0 = x0_cor

        loss_info["cor_x0"] = cond_loss_fn(x0_cor.detach()).cpu()
        loss_info["x0"] = cond_loss_fn(x0.detach()).cpu()
        traj_info.update({
            "raw_x0": x0_pred.detach().cpu(),
            "cor_x0": x0_cor.detach().cpu(),
            "x0": x0.detach().cpu(),
        })
        return x0_cor, loss_info, traj_info

    return fn


def simple_noise(cur_t, xt):
    del cur_t
    return th.randn_like(xt)


def get_fix_weight_fn(fix_weight):
    def weight_fn(xs, grads, *args):
        del grads, args
        return th.ones(xs.shape[0]).to(xs) * fix_weight

    return weight_fn


class SeqWorker:
    def __init__(self, overlap_size=10, src_img=None):
        self.overlap_size = overlap_size
        self.src_img = src_img

    def loss(self, x):
        # squared error between the tail of the source image and the head of x
        return th.sum(
            (th.abs(self.src_img[:, :, :, -self.overlap_size :] - x[:, :, :, : self.overlap_size]))
            ** 2,
            dim=(1, 2, 3),
        )

    def x0_replace(self, x0):
        rtn_x0 = x0.clone()
        rtn_x0[:, :, :, : self.overlap_size] = self.src_img[:, :, :, -self.overlap_size :]
        return rtn_x0  # bug fix: previously returned the unmodified x0

    def optimal_weight_fn(self, x0, grads, *args, ratio=1.0):
        del args
        overlap_size = self.overlap_size
        # argmin_{w} (delta_pixel - w * delta_grads)^2
        delta_pixel = x0[:, :, :, :overlap_size] - self.src_img[:, :, :, -overlap_size:]
        delta_grads = grads[:, :, :, :overlap_size]
        num = th.sum(delta_pixel * delta_grads).item()
        denum = th.sum(delta_grads * delta_grads).item()
        _optimal_weight = num / denum
        if math.isnan(_optimal_weight):
            print(denum)
            raise RuntimeError("nan for weights")

        return ratio * _optimal_weight * th.ones(x0.shape[0]).to(x0)

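# --- editor's aside (not part of w_loss.py): a numeric sanity check of the
# closed form above. optimal_weight_fn solves argmin_w ||dp - w * dg||^2,
# whose minimizer is w* = <dp, dg> / <dg, dg>; toy tensors stand in for
# delta_pixel and delta_grads.
import torch as th

dp = th.randn(2, 4, 16, 10)   # stands in for delta_pixel
dg = th.randn(2, 4, 16, 10)   # stands in for delta_grads
w_star = th.sum(dp * dg) / th.sum(dg * dg)
f = lambda w: th.sum((dp - w * dg) ** 2)
assert f(w_star) <= f(w_star + 1e-3) and f(w_star) <= f(w_star - 1e-3)
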
class CircleWorker:
    def __init__(self, overlap_size=10, adam_num_iter=100):
        self.overlap_size = overlap_size
        self.adam_num_iter = adam_num_iter

    def get_match_patch(self, x):
        tail = x[:, :, :, -self.overlap_size :]
        head = x[:, :, :, : self.overlap_size]
        tail = th.roll(tail, 1, 0)  # match each tail with the next segment's head, cyclically
        return tail, head

    def loss(self, x):
        tail, head = self.get_match_patch(x)
        return th.sum(
            (tail - head) ** 2,
            dim=(1, 2, 3),
        )

    def split_noise(self, cur_t, xt):
        noise = simple_noise(cur_t, xt)
        b, _, _, w = xt.shape
        final_img_w = w * b - self.overlap_size * b
        noise = rearrange(noise, "(t n) c h w -> t c h (n w)", t=1)[:, :, :, :final_img_w]
        noise = th.cat([noise, noise[:, :, :, : self.overlap_size]], dim=-1)
        noise, _ = split_wimg(noise, b)
        return noise

    def merge_circle_image(self, xt):
        merged_long_img = merge_wimg(xt, self.overlap_size)
        return th.cat(
            [
                (merged_long_img[:, :, : self.overlap_size] + merged_long_img[:, :, -self.overlap_size :]) / 2.0,
                merged_long_img[:, :, self.overlap_size : -self.overlap_size],
            ],
            dim=-1,
        )

    def split_circle_image(self, merged_long_img, n):
        imgs, _ = split_wimg(
            th.cat(
                [
                    merged_long_img,
                    merged_long_img[:, :, : self.overlap_size],
                ],
                dim=-1,
            ),
            n,
        )
        return imgs

    def optimal_weight_fn(self, xs, grads, *args):
        del args
        # argmin_{w} (delta_pixel - w * delta_grads)^2
        tail, head = self.get_match_patch(xs)
        delta_pixel = tail - head
        tail, head = self.get_match_patch(grads)
        delta_grads = tail - head

        num = th.sum(delta_pixel * delta_grads).item()
        denum = th.sum(delta_grads * delta_grads).item()
        _optimal_weight = num / denum
        return _optimal_weight * th.ones(xs.shape[0]).to(xs)

    def adam_grad_weight(self, x0, grad_term, cond_loss_fn):
        init_weight = self.optimal_weight_fn(x0, grad_term)
        grad_term = grad_term.detach()
        x0 = x0.detach()
        with th.enable_grad():
            weights = init_weight.requires_grad_()
            optimizer = th.optim.Adam(
                [
                    weights,
                ],
                lr=1e-2,
            )

            def _loss(w):
                cor_x0 = x0 - batch_mul(w, grad_term)
                return cond_loss_fn(cor_x0).sum()

            for _ in range(self.adam_num_iter):
                optimizer.zero_grad()
                _cur_loss = _loss(weights)
                _cur_loss.backward()
                optimizer.step()
        return weights

    # TODO:
    def x0_replace(self, x0, scalar_t, thres_t):
        if scalar_t > thres_t:
            merge_x0 = merge_wimg(x0, self.overlap_size)
            return split_wimg(merge_x0, x0.shape[0])[0]
        else:
            return x0


class ParaWorker:
    def __init__(self, overlap_size=10, adam_num_iter=100):
        self.overlap_size = overlap_size
        self.adam_num_iter = adam_num_iter

    def loss(self, x):
        x1, x2 = x[:-1], x[1:]
        return th.sum(
            (th.abs(x1[:, :, :, -self.overlap_size :] - x2[:, :, :, : self.overlap_size])) ** 2,
            dim=(1, 2, 3),
        )

    def split_noise(self, xt, cur_t):
        noise = simple_noise(cur_t, xt)
        b, _, _, w = xt.shape
        final_img_w = w * b - self.overlap_size * (b - 1)
        noise = rearrange(noise, "(t n) c h w -> t c h (n w)", t=1)[:, :, :, :final_img_w]
        noise, _ = split_wimg(noise, b)
        return noise

    def optimal_weight_fn(self, xs, grads, *args):
        del args
        overlap_size = self.overlap_size
        # argmin_{w} (delta_pixel - w * delta_grads)^2
        delta_pixel = xs[:-1, :, :, -overlap_size:] - xs[1:, :, :, :overlap_size]
        delta_grads = grads[:-1, :, :, -overlap_size:] - grads[1:, :, :, :overlap_size]
        num = th.sum(delta_pixel * delta_grads).item()
        denum = th.sum(delta_grads * delta_grads).item()
        _optimal_weight = num / denum
        return _optimal_weight * th.ones(xs.shape[0]).to(xs)

    def adam_grad_weight(self, x0, grad_term, cond_loss_fn):
        init_weight = self.optimal_weight_fn(x0, grad_term)
        grad_term = grad_term.detach()
        x0 = x0.detach()
        with th.enable_grad():
            weights = init_weight.requires_grad_()
            optimizer = th.optim.Adam(
                [
                    weights,
                ],
                lr=1e-2,
            )

            def _loss(w):
                cor_x0 = x0 - batch_mul(w, grad_term)
                return cond_loss_fn(cor_x0).sum()

            for _ in range(self.adam_num_iter):
                optimizer.zero_grad()
                _cur_loss = _loss(weights)
                _cur_loss.backward()
                optimizer.step()
        return weights

    def x0_replace(self, x0, scalar_t, thres_t):
        if scalar_t > thres_t:
            merge_x0 = merge_wimg(x0, self.overlap_size)
            return split_wimg(merge_x0, x0.shape[0])[0]
        else:
            return x0


class ParaWorkerC(ParaWorker):
    def __init__(self, src_img, mask_img, inpaint_w=1.0, overlap_size=10, adam_num_iter=100):
        self.src_img = src_img
        self.inpaint_w = inpaint_w
        self.mask_img = mask_img  # 1 indicates masked given pixels
        super().__init__(overlap_size, adam_num_iter)

    def loss(self, x):
        if x.shape[0] == 1:
            return th.sum(
                th.sum(
                    th.square(self.src_img[:, :, :, : x.shape[-1]] - x), dim=(0, 1)
                ) * self.mask_img[:, : x.shape[-1]]
            )
        else:
            consistent_loss = super().loss(x)
            # merge image
            merge_x = merge_wimg(x, self.overlap_size)

            inpainting_loss = th.sum(
                th.sum(
                    th.square(self.src_img[:, :, :, : merge_x.shape[-1]] - merge_x), dim=(0, 1)
                ) * self.mask_img[:, : merge_x.shape[-1]]
            )

            return consistent_loss + inpainting_loss / (x.shape[-1] - 1)

    def x0_replace(self, x0, scalar_t, thres_t):
        if scalar_t > thres_t:
            merge_x = merge_wimg(x0, self.overlap_size)
            src_img = self.src_img[:, :, :, : merge_x.shape[-1]]
            mask_img = self.mask_img[:, : merge_x.shape[-1]]
            merge_x = th.where(mask_img[None, None], src_img, merge_x)
            return split_wimg(merge_x, x0.shape[0])[0]
        else:
            return x0


class SplitMergeOp:
    def __init__(self, avg_overlap=32):
        self.avg_overlap = avg_overlap
        self.cur_overlap_int = None

    def sample(self, n):
        # sample n overlap sizes summing to n * avg_overlap, each at least avg_overlap - 6
        _lower_bound = self.avg_overlap - 6
        base_overlap = np.ones(n) * _lower_bound

        total_ball = (self.avg_overlap - _lower_bound) * n
        random_number = np.random.randint(0, total_ball - n, n - 1)
        random_number = np.sort(random_number)
        balls = np.append(random_number, total_ball - n) - np.insert(random_number, 0, 0) + np.ones(n) + base_overlap

        assert np.sum(balls) == n * self.avg_overlap

        # TODO: FIXME randomized overlaps are currently disabled; fall back to the fixed average
        balls = np.ones(n) * self.avg_overlap

        return balls.astype(int)  # np.int was removed in NumPy 1.24; use the builtin

    def reset(self, n):
        self.cur_overlap_int = self.sample(n)

    def split(self, img, n, img_w=64):
        assert img.ndim == 3
        # assert img.shape[-1] > (n-1) * self.avg_overlap
        assert (n - 1) == self.cur_overlap_int.shape[0]

        assert (n - 1) * self.avg_overlap + img.shape[-1] == n * img_w

        cur_idx = 0
        imgs = []
        for cur_overlap in self.cur_overlap_int:
            imgs.append(img[:, :, cur_idx : cur_idx + img_w])
            cur_idx = cur_idx + img_w - cur_overlap
        imgs.append(img[:, :, cur_idx:])
        return th.stack(imgs)

    def merge(self, imgs):
        b = imgs.shape[0]
        img_size = imgs.shape[-1]
        assert b - 1 == self.cur_overlap_int.shape[0]
        img_width = b * imgs.shape[-1] - np.sum(self.cur_overlap_int)
        wimg = th.zeros((3, imgs.shape[-2], img_width)).to(imgs)
        ncnt = th.zeros(img_width).to(imgs)
        cur_idx = 0
        for i_th, cur_img in enumerate(imgs):
            wimg[:, :, cur_idx : cur_idx + img_size] += cur_img
            ncnt[cur_idx : cur_idx + img_size] += 1.0
            if i_th < b - 1:
                cur_idx = cur_idx + img_size - self.cur_overlap_int[i_th]
        return wimg / ncnt[None, None, :]


class ParaWorkerFix:
    def __init__(self, overlap_size=10, adam_num_iter=100):
        self.overlap_size = overlap_size
        self.adam_num_iter = adam_num_iter
        self.op = SplitMergeOp(overlap_size)

    def loss(self, x):
        avg_x = self.op.split(
            self.op.merge(x), x.shape[0], x.shape[-1]
        )
        return th.sum(
            (x - avg_x) ** 2,
            dim=(1, 2, 3),
        )

    def split_noise(self, cur_t, xt):
        noise = simple_noise(cur_t, xt)
        b, _, _, w = xt.shape
        final_img_w = w * b - self.overlap_size * (b - 1)
        noise = rearrange(noise, "(t n) c h w -> t c h (n w)", t=1)[:, :, :, :final_img_w][0]
        noise = self.op.split(noise, b, w)
        return noise

    def adam_grad_weight(self, x0, grad_term, cond_loss_fn):
        init_weight = th.ones(x0.shape[0]).to(x0)
        grad_term = grad_term.detach()
        x0 = x0.detach()
        with th.enable_grad():
            weights = init_weight.requires_grad_()
            optimizer = th.optim.Adam(
                [
                    weights,
                ],
                lr=1e-2,
            )

            def _loss(w):
                cor_x0 = x0 - batch_mul(w, grad_term)
                return cond_loss_fn(cor_x0).sum()

            for _ in range(self.adam_num_iter):
                optimizer.zero_grad()
                _cur_loss = _loss(weights)
                _cur_loss.backward()
                optimizer.step()
        return weights

    def x0_replace(self, x0, scalar_t, thres_t):
        if scalar_t > thres_t:
            merge_x0 = self.op.merge(x0)
            return self.op.split(merge_x0, x0.shape[0], x0.shape[-1])
        else:
            return x0
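To make the consistency objective concrete, here is a hedged toy example of ParaWorker.loss on random stand-in segments; it returns one mismatch value per adjacent pair.

import torch as th
from diff_collage.w_loss import ParaWorker

worker = ParaWorker(overlap_size=4)
segs = th.randn(3, 4, 16, 32)              # 3 segments to be stitched
segs[1, :, :, :4] = segs[0, :, :, -4:]     # make pair (0, 1) perfectly consistent
mismatch = worker.loss(segs)               # shape (2,): one value per adjacent pair
assert mismatch[0].abs() < 1e-6            # the aligned pair has ~zero loss
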
environment.yml
ADDED
@@ -0,0 +1,282 @@
name: guided
channels:
  - pytorch
  - nvidia
  - conda-forge
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - blas=1.0=mkl
  - brotlipy=0.7.0=py39h27cfd23_1003
  - bzip2=1.0.8=h7b6447c_0
  - ca-certificates=2023.08.22=h06a4308_0
  - certifi=2023.7.22=py39h06a4308_0
  - cffi=1.15.1=py39h5eee18b_3
  - charset-normalizer=2.0.4=pyhd3eb1b0_0
  - cryptography=39.0.1=py39h9ce1e76_2
  - cuda-cudart=11.7.99=0
  - cuda-cupti=11.7.101=0
  - cuda-libraries=11.7.1=0
  - cuda-nvrtc=11.7.99=0
  - cuda-nvtx=11.7.91=0
  - cuda-runtime=11.7.1=0
  - ffmpeg=4.3=hf484d3e_0
  - filelock=3.9.0=py39h06a4308_0
  - freetype=2.12.1=h4a9f257_0
  - giflib=5.2.1=h5eee18b_3
  - gmp=6.2.1=h295c915_3
  - gmpy2=2.1.2=py39heeb90bb_0
  - gnutls=3.6.15=he1e5248_0
  - idna=3.4=py39h06a4308_0
  - intel-openmp=2023.1.0=hdb19cb5_46305
  - jinja2=3.1.2=py39h06a4308_0
  - jpeg=9e=h5eee18b_1
  - lame=3.100=h7b6447c_0
  - lcms2=2.12=h3be6417_0
  - ld_impl_linux-64=2.38=h1181459_1
  - lerc=3.0=h295c915_0
  - libcublas=11.10.3.66=0
  - libcufft=10.7.2.124=h4fbf590_0
  - libcufile=1.6.1.9=0
  - libcurand=10.3.2.106=0
  - libcusolver=11.4.0.1=0
  - libcusparse=11.7.4.91=0
  - libdeflate=1.17=h5eee18b_0
  - libffi=3.4.4=h6a678d5_0
  - libgcc-ng=11.2.0=h1234567_1
  - libgfortran-ng=7.5.0=h14aa051_20
  - libgfortran4=7.5.0=h14aa051_20
  - libgomp=11.2.0=h1234567_1
  - libiconv=1.16=h7f8727e_2
  - libidn2=2.3.4=h5eee18b_0
  - libnpp=11.7.4.75=0
  - libnvjpeg=11.8.0.2=0
  - libpng=1.6.39=h5eee18b_0
  - libstdcxx-ng=11.2.0=h1234567_1
  - libtasn1=4.19.0=h5eee18b_0
  - libtiff=4.5.0=h6a678d5_2
  - libunistring=0.9.10=h27cfd23_0
  - libwebp=1.2.4=h11a3e52_1
  - libwebp-base=1.2.4=h5eee18b_1
  - lz4-c=1.9.4=h6a678d5_0
  - markupsafe=2.1.1=py39h7f8727e_0
  - mkl=2023.1.0=h6d00ec8_46342
  - mkl-service=2.4.0=py39h5eee18b_1
  - mkl_fft=1.3.6=py39h417a72b_1
  - mkl_random=1.2.2=py39h417a72b_1
  - mpc=1.1.0=h10f8cd9_1
  - mpfr=4.0.2=hb69a4c5_1
  - mpi=1.0=openmpi
  - mpi4py=3.1.4=py39h3e5f7c9_0
  - mpmath=1.2.1=py39h06a4308_0
  - ncurses=6.4=h6a678d5_0
  - nettle=3.7.3=hbbd107a_1
  - networkx=2.8.4=py39h06a4308_1
  - numpy=1.24.3=py39hf6e8229_1
  - numpy-base=1.24.3=py39h060ed82_1
  - openh264=2.1.1=h4ff587b_0
  - openmpi=4.0.4=hdf1f1ad_0
  - openssl=3.0.12=h7f8727e_0
  - pillow=9.4.0=py39h6a678d5_0
  - pip=23.1.2=py39h06a4308_0
  - pycparser=2.21=pyhd3eb1b0_0
  - pyopenssl=23.0.0=py39h06a4308_0
  - pysocks=1.7.1=py39h06a4308_0
  - python=3.9.16=h955ad1f_3
  - pytorch=2.0.1=py3.9_cuda11.7_cudnn8.5.0_0
  - pytorch-cuda=11.7=h778d358_5
  - pytorch-mutex=1.0=cuda
  - readline=8.2=h5eee18b_0
  - requests=2.31.0=pyhd8ed1ab_0
  - setuptools=67.8.0=py39h06a4308_0
  - six=1.16.0=pyhd3eb1b0_1
  - sqlite=3.41.2=h5eee18b_0
  - sympy=1.11.1=py39h06a4308_0
  - tbb=2021.8.0=hdb19cb5_0
  - tk=8.6.12=h1ccaba5_0
  - torchaudio=2.0.2=py39_cu117
  - torchtext=0.6.0=py_1
  - torchtriton=2.0.0=py39
  - torchvision=0.15.2=py39_cu117
  - tqdm=4.65.0=py39hb070fc8_0
  - typing_extensions=4.6.3=py39h06a4308_0
  - urllib3=1.26.16=py39h06a4308_0
  - wheel=0.38.4=py39h06a4308_0
  - xz=5.4.2=h5eee18b_0
  - zlib=1.2.13=h5eee18b_0
  - zstd=1.5.5=hc292b87_0
  - pip:
    - absl-py==1.4.0
    - accelerate==0.21.0
    - anyio==4.0.0
    - appdirs==1.4.4
    - argon2-cffi==23.1.0
    - argon2-cffi-bindings==21.2.0
    - arrow==1.3.0
    - asttokens==2.4.1
    - async-lru==2.0.4
    - attrs==23.1.0
    - awscli==1.29.84
    - babel==2.13.1
    - beautifulsoup4==4.12.2
    - bleach==6.1.0
    - blobfile==2.0.2
    - boltons==23.0.0
    - botocore==1.31.84
    - cachetools==5.3.1
    - chardet==5.1.0
    - clean-fid==0.1.35
    - click==8.1.3
    - clip-anytorch==2.5.2
    - colorama==0.4.4
    - comm==0.1.4
    - contextlib2==21.6.0
    - contourpy==1.1.0
    - cycler==0.11.0
    - debugpy==1.8.0
    - decorator==5.1.1
    - defusedxml==0.7.1
    - docker-pycreds==0.4.0
    - docutils==0.16
    - einops==0.6.1
    - exceptiongroup==1.1.3
    - executing==2.0.1
    - fastjsonschema==2.18.1
    - fonttools==4.40.0
    - fqdn==1.5.1
    - fsspec==2023.6.0
    - ftfy==6.1.1
    - future==0.18.3
    - gitdb==4.0.10
    - gitpython==3.1.31
    - google-auth==2.21.0
    - google-auth-oauthlib==1.0.0
    - grpcio==1.56.0
    - huggingface-hub==0.15.1
    - imageio==2.31.1
    - importlib-metadata==6.7.0
    - importlib-resources==5.12.0
    - ipykernel==6.26.0
    - ipython==8.17.2
    - ipython-genutils==0.2.0
    - ipywidgets==8.1.1
    - isoduration==20.11.0
    - jedi==0.19.1
    - jmespath==1.0.1
    - joblib==1.3.1
    - json5==0.9.14
    - jsonmerge==1.9.2
    - jsonpickle==3.0.1
    - jsonpointer==2.4
    - jsonschema==4.19.0
    - jsonschema-specifications==2023.7.1
    - jupyter==1.0.0
    - jupyter-client==8.5.0
    - jupyter-console==6.6.3
    - jupyter-core==5.5.0
    - jupyter-events==0.8.0
    - jupyter-lsp==2.2.0
    - jupyter-server==2.9.1
    - jupyter-server-terminals==0.4.4
    - jupyterlab==4.0.8
    - jupyterlab-pygments==0.2.2
    - jupyterlab-server==2.25.0
    - jupyterlab-widgets==3.0.9
    - k-diffusion==0.0.16
    - kiwisolver==1.4.4
    - kornia==0.7.0
    - lazy-loader==0.3
    - lmdb==1.4.1
    - lxml==4.9.2
    - markdown==3.4.3
    - matplotlib==3.7.1
    - matplotlib-inline==0.1.6
    - mido==1.2.10
    - mistune==3.0.2
    - ml-collections==0.1.1
    - more-itertools==10.0.0
    - music21==8.3.0
    - nbclient==0.8.0
    - nbconvert==7.10.0
    - nbformat==5.9.2
    - nest-asyncio==1.5.8
    - notebook==7.0.6
    - notebook-shim==0.2.3
    - oauthlib==3.2.2
    - omegaconf==2.0.0
    - overrides==7.4.0
    - packaging==23.1
    - pandas==2.0.2
    - pandocfilters==1.5.0
    - parso==0.8.3
    - pathtools==0.1.2
    - pexpect==4.8.0
    - platformdirs==3.11.0
    - prometheus-client==0.18.0
    - prompt-toolkit==3.0.39
    - protobuf==4.23.3
    - psutil==5.9.5
    - ptyprocess==0.7.0
    - pure-eval==0.2.2
    - pyasn1==0.5.0
    - pyasn1-modules==0.3.0
    - pycryptodomex==3.18.0
    - pygments==2.16.1
    - pyparsing==3.1.0
    - python-dateutil==2.8.2
    - python-json-logger==2.0.7
    - pytorch-lightning==1.0.8
    - pytz==2023.3
    - pywavelets==1.4.1
    - pyyaml==6.0
    - pyzmq==25.1.1
    - qtconsole==5.4.4
    - qtpy==2.4.1
    - referencing==0.30.2
    - regex==2023.8.8
    - requests-oauthlib==1.3.1
    - resize-right==0.0.2
    - rfc3339-validator==0.1.4
    - rfc3986-validator==0.1.1
    - rotary-embedding-torch==0.3.2
    - rpds-py==0.9.2
    - rsa==4.7.2
    - s3transfer==0.7.0
    - safetensors==0.3.1
    - scikit-image==0.21.0
    - scikit-learn==1.3.2
    - scipy==1.11.2
    - seaborn==0.13.0
    - send2trash==1.8.2
    - sentry-sdk==1.25.1
    - setproctitle==1.3.2
    - smmap==5.0.0
    - sniffio==1.3.0
    - soupsieve==2.5
    - stack-data==0.6.3
    - tensorboard==2.13.0
    - tensorboard-data-server==0.7.1
    - terminado==0.17.1
    - threadpoolctl==3.2.0
    - tifffile==2023.8.12
    - timm==0.9.2
    - tinycss2==1.2.1
    - tomli==2.0.1
    - torchdata==0.6.1
    - torchdiffeq==0.2.3
    - torchsde==0.2.5
    - tornado==6.3.3
    - traitlets==5.13.0
    - trampoline==0.1.2
    - types-python-dateutil==2.8.19.14
    - tzdata==2023.3
    - uri-template==1.3.0
    - wandb==0.15.4
    - wcwidth==0.2.6
    - webcolors==1.13
    - webencodings==0.5.1
    - websocket-client==1.6.4
    - werkzeug==2.3.6
    - widgetsnbextension==4.0.9
    - zipp==3.15.0
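After `conda env create -f environment.yml` and `conda activate guided`, an optional sanity check that the pinned PyTorch/CUDA stack resolved; this snippet is illustrative, not part of the repo.

import torch
print(torch.__version__)          # expect 2.0.1
print(torch.version.cuda)         # expect 11.7
print(torch.cuda.is_available())  # True on a CUDA-capable machine
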
guided_diffusion/__init__.py
ADDED
@@ -0,0 +1,3 @@
"""
Codebase for "Improved Denoising Diffusion Probabilistic Models".
"""
guided_diffusion/condition_functions.py
ADDED
@@ -0,0 +1,174 @@
import argparse
import os

import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .pr_datasets_all import FUNC_DICT
import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (20, 3)
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300


def model_fn(x, t, y=None, rule=None,
             model=nn.Identity(), num_classes=3, class_cond=True, cfg=False, w=0.):
    # y has to be the composer; rule is a dummy input
    y_null = th.tensor([num_classes] * x.shape[0], device=x.device)
    if class_cond:
        if cfg:
            # classifier-free guidance: mix conditional and unconditional predictions
            return (1 + w) * model(x, t, y) - w * model(x, t, y_null)
        else:
            return model(x, t, y)
    else:
        return model(x, t, y_null)


def dc_model_fn(x, t, y=None, rule=None,
                model=nn.Identity(), num_classes=3, class_cond=True, cfg=False, w=0.):
    # the DiffCollage score function takes in 4 x pitch x time
    x = x.permute(0, 1, 3, 2)
    y_null = th.tensor([num_classes] * x.shape[0], device=x.device)
    if class_cond:
        if cfg:
            eps = (1 + w) * model(x, t, y) - w * model(x, t, y_null)
            return eps.permute(0, 1, 3, 2)  # need to return 4 x time x pitch
        else:
            return model(x, t, y).permute(0, 1, 3, 2)
    else:
        return model(x, t, y_null).permute(0, 1, 3, 2)


# y is a dummy input for cond_fn; rule is the real input
def grad_nn_zt_xentropy(x, y=None, rule=None, classifier=nn.Identity()):
    # cross-entropy cond_fn
    assert rule is not None
    t = th.zeros(x.shape[0], device=x.device)
    with th.enable_grad():
        x_in = x.detach().requires_grad_(True)
        logits = classifier(x_in, t)
        log_probs = F.log_softmax(logits, dim=-1)
        selected = log_probs[range(len(logits)), rule.view(-1)]
        return th.autograd.grad(selected.sum(), x_in)[0]


def grad_nn_zt_mse(x, t, y=None, rule=None, classifier_scale=10., classifier=nn.Identity()):
    assert rule is not None
    with th.enable_grad():
        x_in = x.detach().requires_grad_(True)
        logits = classifier(x_in, t)
        log_probs = - F.mse_loss(logits, rule, reduction="none").sum(dim=-1)
        return th.autograd.grad(log_probs.sum(), x_in)[0] * classifier_scale


def grad_nn_zt_chord(x, t, y=None, rule=None, classifier_scale=10., classifier=nn.Identity(), both=False):
    assert rule is not None
    with th.enable_grad():
        x_in = x.detach().requires_grad_(True)
        key_logits, chord_logits = classifier(x_in, t)
        if both:
            rule_key = rule[:, :1]
            rule_chord = rule[:, 1:]
            rule_chord = rule_chord.reshape(-1)
            chord_logits = chord_logits.reshape(-1, chord_logits.shape[-1])
            key_log_probs = - F.cross_entropy(key_logits, rule_key, reduction="none")
            chord_log_probs = - F.cross_entropy(chord_logits, rule_chord, reduction="none")
            chord_log_probs = chord_log_probs.reshape(x_in.shape[0], -1).mean(dim=-1)
            log_probs = key_log_probs + chord_log_probs
        else:
            rule = rule.reshape(-1)
            chord_logits = chord_logits.reshape(-1, chord_logits.shape[-1])
            log_probs = - F.cross_entropy(chord_logits, rule, reduction="none")
        return th.autograd.grad(log_probs.sum(), x_in)[0] * classifier_scale


def nn_z0_chord_dummy(x, t, y=None, rule=None, classifier_scale=0.1, classifier=nn.Identity(), both=False):
    # classifier_scale is equivalent to step_size
    t = th.zeros(x.shape[0], device=x.device)
    key_logits, chord_logits = classifier(x, t)
    if both:
        rule_key = rule[:, :1]
        rule_chord = rule[:, 1:]
        rule_chord = rule_chord.reshape(-1)
        chord_logits = chord_logits.reshape(-1, chord_logits.shape[-1])
        key_log_probs = - F.cross_entropy(key_logits, rule_key, reduction="none")
        chord_log_probs = - F.cross_entropy(chord_logits, rule_chord, reduction="none")
        chord_log_probs = chord_log_probs.reshape(x.shape[0], -1).mean(dim=-1)
        log_probs = key_log_probs + chord_log_probs
    else:
        rule = rule.reshape(-1)
        chord_logits = chord_logits.reshape(-1, chord_logits.shape[-1])
        log_probs = - F.cross_entropy(chord_logits, rule, reduction="none")
        log_probs = log_probs.reshape(x.shape[0], -1).mean(dim=-1)
    return log_probs * classifier_scale


def nn_z0_mse_dummy(x, t, y=None, rule=None, classifier_scale=0.1, classifier=nn.Identity()):
    # MSE cond_fn; t is a dummy variable because of wrap_model in respace
    assert rule is not None
    t = th.zeros(x.shape[0], device=x.device)
    logits = classifier(x, t)
    log_probs = - F.mse_loss(logits, rule, reduction="none").sum(dim=-1)
    return log_probs * classifier_scale


def nn_z0_mse(x, rule=None, classifier=nn.Identity()):
    # MSE cond_fn evaluated at t = 0
    t = th.zeros(x.shape[0], device=x.device)
    logits = classifier(x, t)
    log_probs = - F.mse_loss(logits, rule, reduction="none").sum(dim=-1)
    return log_probs


def rule_x0_mse_dummy(x, t, y=None, rule=None, rule_name='pitch_hist'):
    # use a differentiable rule to differentiate through rule(x_0); t is a dummy variable because of wrap_model in respace
    logits = FUNC_DICT[rule_name](x)
    log_probs = - F.mse_loss(logits, rule, reduction="none").sum(dim=-1)
    return log_probs


def rule_x0_mse(x, rule=None, rule_name='pitch_hist', soft=False):
    # soften a non-differentiable rule to differentiate through rule(x_0);
    # softening did not work well in practice, so soft is always left False
    logits = FUNC_DICT[rule_name](x, soft=soft)
    log_probs = - F.mse_loss(logits, rule, reduction="none").sum(dim=-1)
    return log_probs


class _WrappedFn:
    def __init__(self, fn):
        self.fn = fn

    def __call__(self, x, t, y=None, rule=None):
        return self.fn(x, t, y, rule)


function_map = {
    "grad_nn_zt_xentropy": grad_nn_zt_xentropy,
    "grad_nn_zt_mse": grad_nn_zt_mse,
    "grad_nn_zt_chord": grad_nn_zt_chord,
    "nn_z0_chord_dummy": nn_z0_chord_dummy,
    "nn_z0_mse_dummy": nn_z0_mse_dummy,
    "nn_z0_mse": nn_z0_mse,
    "rule_x0_mse_dummy": rule_x0_mse_dummy,
    "rule_x0_mse": rule_x0_mse,
}


def composite_nn_zt(x, t, y=None, rule=None, fns=None, classifier_scales=None, classifiers=None, rule_names=None):
    num_classifiers = len(classifiers)
    out = 0
    for i in range(num_classifiers):
        out += function_map[fns[i]](x, t, y=y, rule=rule[rule_names[i]],
                                    classifier_scale=classifier_scales[i], classifier=classifiers[i])
    return out


def composite_rule(x, t, y=None, rule=None, fns=None, classifier_scales=None, rule_names=None):
    out = 0
    for i in range(len(fns)):
        out += function_map[fns[i]](x, t, y=y, rule=rule[rule_names[i]], rule_name=rule_names[i]) * classifier_scales[i]
    return out
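A small standalone sketch of the classifier-free guidance combination used in model_fn above, eps = (1 + w) * eps_cond - w * eps_uncond; the score model here is a placeholder.

import torch as th

def fake_score(x, t, y):
    # placeholder for the class-conditional noise-prediction model
    shift = 0.0 if y is None else 0.01 * y.view(-1, 1, 1, 1).float()
    return 0.1 * x + shift

x = th.randn(2, 4, 128, 16)
t = th.full((2,), 10.0)
y = th.tensor([0, 1])
w = 4.0
eps_guided = (1 + w) * fake_score(x, t, y) - w * fake_score(x, t, None)
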
guided_diffusion/dist_util.py
ADDED
@@ -0,0 +1,104 @@
"""
Helpers for distributed training.
"""

import io
import os
import socket

import blobfile as bf
from mpi4py import MPI
import torch as th
import torch.distributed as dist

# Change this to reflect your cluster layout.
# The GPU for a given rank is (rank % GPUS_PER_NODE).
GPUS_PER_NODE = 2

SETUP_RETRY_COUNT = 3


def setup_dist(port=None):
    """
    Set up a distributed process group.
    For NGC, set port = "8023".
    """
    if dist.is_initialized():
        return
    if not os.environ.get("CUDA_VISIBLE_DEVICES"):
        os.environ["CUDA_VISIBLE_DEVICES"] = f"{MPI.COMM_WORLD.Get_rank() % GPUS_PER_NODE}"

    comm = MPI.COMM_WORLD
    backend = "gloo" if not th.cuda.is_available() else "nccl"

    if backend == "gloo":
        hostname = "localhost"
    else:
        hostname = socket.gethostbyname(socket.getfqdn())
    if port is not None:
        os.environ["MASTER_ADDR"] = "127.0.0.1"
    else:
        os.environ["MASTER_ADDR"] = comm.bcast(hostname, root=0)
    os.environ["RANK"] = str(comm.rank)
    os.environ["WORLD_SIZE"] = str(comm.size)

    if port is not None:
        os.environ["MASTER_PORT"] = port
    else:
        port = comm.bcast(_find_free_port(), root=0)
        os.environ["MASTER_PORT"] = str(port)
    dist.init_process_group(backend=backend, init_method="env://")
    th.cuda.set_device(comm.rank)  # needed to run on HPC

    return comm


def dev():
    """
    Get the device to use for torch.distributed.
    """
    if th.cuda.is_available():
        return th.device("cuda")
    return th.device("cpu")


def load_state_dict(path, **kwargs):
    """
    Load a PyTorch file without redundant fetches across MPI ranks.
    """
    chunk_size = 2 ** 30  # MPI has a relatively small size limit
    if MPI.COMM_WORLD.Get_rank() == 0:
        with bf.BlobFile(path, "rb") as f:
            data = f.read()
        num_chunks = len(data) // chunk_size
        if len(data) % chunk_size:
            num_chunks += 1
        MPI.COMM_WORLD.bcast(num_chunks)
        for i in range(0, len(data), chunk_size):
            MPI.COMM_WORLD.bcast(data[i : i + chunk_size])
    else:
        num_chunks = MPI.COMM_WORLD.bcast(None)
        data = bytes()
        for _ in range(num_chunks):
            data += MPI.COMM_WORLD.bcast(None)

    return th.load(io.BytesIO(data), **kwargs)


def sync_params(params):
    """
    Synchronize a sequence of Tensors across ranks from rank 0.
    """
    for p in params:
        with th.no_grad():
            dist.broadcast(p, 0)


def _find_free_port():
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.bind(("", 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return s.getsockname()[1]
    finally:
        s.close()
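Typical usage of dist_util in a training entry point (a hedged sketch with a placeholder checkpoint path; launch with e.g. `mpiexec -n 2 python train.py`).

from guided_diffusion import dist_util

dist_util.setup_dist()
device = dist_util.dev()
state = dist_util.load_state_dict("path/to/checkpoint.pt", map_location="cpu")
# model.load_state_dict(state); model.to(device)
# dist_util.sync_params(model.parameters())
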
guided_diffusion/dit.py
ADDED
@@ -0,0 +1,983 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------

import torch
import torch.nn as nn
import torch.nn.functional as F
from rotary_embedding_torch import RotaryEmbedding
from torch.jit import Final
import numpy as np
import math
from timm.models.vision_transformer import Attention, Mlp
from timm.models.vision_transformer_relpos import RelPosAttention
from timm.layers import Format, nchw_to, to_2tuple, _assert, RelPosBias, use_fused_attn
from typing import Callable, List, Optional, Tuple, Union
from functools import partial


def modulate(x, shift, scale):
    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)


#################################################################################
#               Embedding Layers for Timesteps and Class Labels                 #
#################################################################################

class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.
    """
    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                  These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        freqs = torch.exp(
            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
        ).to(device=t.device)
        args = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding

    def forward(self, t):
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
        t_emb = self.mlp(t_freq)
        return t_emb

73 |
+
class LabelEmbedder(nn.Module):
|
74 |
+
"""
|
75 |
+
Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
|
76 |
+
"""
|
77 |
+
def __init__(self, num_classes, hidden_size, dropout_prob):
|
78 |
+
super().__init__()
|
79 |
+
use_cfg_embedding = dropout_prob > 0
|
80 |
+
self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
|
81 |
+
self.num_classes = num_classes
|
82 |
+
self.dropout_prob = dropout_prob
|
83 |
+
|
84 |
+
def token_drop(self, labels, force_drop_ids=None):
|
85 |
+
"""
|
86 |
+
Drops labels to enable classifier-free guidance.
|
87 |
+
"""
|
88 |
+
if force_drop_ids is None:
|
89 |
+
drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
|
90 |
+
else:
|
91 |
+
drop_ids = force_drop_ids == 1
|
92 |
+
labels = torch.where(drop_ids, self.num_classes, labels)
|
93 |
+
return labels
|
94 |
+
|
95 |
+
def forward(self, labels, train, force_drop_ids=None):
|
96 |
+
use_dropout = self.dropout_prob > 0
|
97 |
+
if (train and use_dropout) or (force_drop_ids is not None):
|
98 |
+
labels = self.token_drop(labels, force_drop_ids)
|
99 |
+
embeddings = self.embedding_table(labels)
|
100 |
+
return embeddings
|
101 |
+
|
102 |
+
|
103 |
+
#################################################################################
|
104 |
+
# Embedding Layers for Patches that Support H != W #
|
105 |
+
#################################################################################
|
106 |
+
|
107 |
+
class PatchEmbed(nn.Module):
|
108 |
+
""" 2D Image to Patch Embedding
|
109 |
+
"""
|
110 |
+
output_fmt: Format
|
111 |
+
|
112 |
+
def __init__(
|
113 |
+
self,
|
114 |
+
img_size: Optional[Union[int, tuple, list]] = 224,
|
115 |
+
patch_size: Union[int, tuple, list] = 16,
|
116 |
+
in_chans: int = 3,
|
117 |
+
embed_dim: int = 768,
|
118 |
+
norm_layer: Optional[Callable] = None,
|
119 |
+
flatten: bool = True,
|
120 |
+
output_fmt: Optional[str] = None,
|
121 |
+
bias: bool = True,
|
122 |
+
strict_img_size: bool = True,
|
123 |
+
):
|
124 |
+
super().__init__()
|
125 |
+
self.patch_size = to_2tuple(patch_size)
|
126 |
+
if img_size is not None:
|
127 |
+
if isinstance(img_size, int):
|
128 |
+
self.img_size = to_2tuple(img_size)
|
129 |
+
elif len(img_size) == 1:
|
130 |
+
self.img_size = to_2tuple(img_size[0])
|
131 |
+
else:
|
132 |
+
self.img_size = img_size
|
133 |
+
self.grid_size = tuple([s // p for s, p in zip(self.img_size, self.patch_size)])
|
134 |
+
self.num_patches = self.grid_size[0] * self.grid_size[1]
|
135 |
+
else:
|
136 |
+
self.img_size = None
|
137 |
+
self.grid_size = None
|
138 |
+
self.num_patches = None
|
139 |
+
|
140 |
+
if output_fmt is not None:
|
141 |
+
self.flatten = False
|
142 |
+
self.output_fmt = Format(output_fmt)
|
143 |
+
else:
|
144 |
+
# flatten spatial dim and transpose to channels last, kept for bwd compat
|
145 |
+
self.flatten = flatten
|
146 |
+
self.output_fmt = Format.NCHW
|
147 |
+
self.strict_img_size = strict_img_size
|
148 |
+
|
149 |
+
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
|
150 |
+
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
|
151 |
+
|
152 |
+
def forward(self, x):
|
153 |
+
B, C, H, W = x.shape
|
154 |
+
if self.img_size is not None:
|
155 |
+
if self.strict_img_size:
|
156 |
+
_assert(H == self.img_size[0], f"Input height ({H}) doesn't match model ({self.img_size[0]}).")
|
157 |
+
_assert(W == self.img_size[1], f"Input width ({W}) doesn't match model ({self.img_size[1]}).")
|
158 |
+
else:
|
159 |
+
_assert(
|
160 |
+
H % self.patch_size[0] == 0,
|
161 |
+
f"Input height ({H}) should be divisible by patch size ({self.patch_size[0]})."
|
162 |
+
)
|
163 |
+
_assert(
|
164 |
+
W % self.patch_size[1] == 0,
|
165 |
+
f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]})."
|
166 |
+
)
|
167 |
+
|
168 |
+
x = self.proj(x)
|
169 |
+
if self.flatten:
|
170 |
+
x = x.flatten(2).transpose(1, 2) # NCHW -> NLC
|
171 |
+
elif self.output_fmt != Format.NCHW:
|
172 |
+
x = nchw_to(x, self.output_fmt)
|
173 |
+
x = self.norm(x)
|
174 |
+
return x
|
175 |
+
|
176 |
+
|
177 |
+
class FlattenNorm(nn.Module):
|
178 |
+
""" Flatten 2D Image to a vector
|
179 |
+
"""
|
180 |
+
|
181 |
+
def __init__(
|
182 |
+
self,
|
183 |
+
img_size: Optional[Union[int, tuple, list]] = 224,
|
184 |
+
embed_dim: int = 768,
|
185 |
+
norm_layer: Optional[Callable] = None,
|
186 |
+
):
|
187 |
+
super().__init__()
|
188 |
+
self.num_patches = max(img_size)
|
189 |
+
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
|
190 |
+
# todo: hard code 64 and hidden_dim for now
|
191 |
+
self.MLP = nn.Sequential(nn.Linear(64, 256), nn.SiLU(), nn.Linear(256, embed_dim))
|
192 |
+
|
193 |
+
def forward(self, x):
|
194 |
+
x = x.permute(0, 2, 1, 3).flatten(2) # B x 4 x 128 x 16 -> B x 128 x 4 x 16 - > B x 128 x 64
|
195 |
+
x = self.MLP(x) # B x 128 x 768
|
196 |
+
x = self.norm(x)
|
197 |
+
return x
|
198 |
+
|
199 |
+
|
200 |
+
class FlattenPatchify1D(nn.Module):
|
201 |
+
""" Flatten 2D Image to a vector with pitch per token
|
202 |
+
"""
|
203 |
+
|
204 |
+
def __init__(
|
205 |
+
self,
|
206 |
+
in_channels: int = 4,
|
207 |
+
img_size: Optional[Union[int, tuple, list]] = 224,
|
208 |
+
embed_dim: int = 768,
|
209 |
+
patch_size: int = 8,
|
210 |
+
norm_layer: Optional[Callable] = None,
|
211 |
+
):
|
212 |
+
super().__init__()
|
213 |
+
# dummy, is not needed by the rotary model, but needed for REL and DiT
|
214 |
+
self.num_patches = img_size[0] * img_size[1] // patch_size # img_size: 128x16
|
215 |
+
self.patch_size = patch_size
|
216 |
+
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
|
217 |
+
self.MLP = nn.Sequential(nn.Linear(in_channels * patch_size, 256), nn.SiLU(), nn.Linear(256, embed_dim))
|
218 |
+
|
219 |
+
def forward(self, x):
|
220 |
+
x = x.permute(0, 2, 3, 1) # B x c x 128 x 16 -> B x 128 x 16 x c
|
221 |
+
b, n_time, n_pitch, c = x.shape
|
222 |
+
num_patches = n_time * n_pitch // self.patch_size
|
223 |
+
# B x 128 x 16 x 4 -> B x (128 x 16 / 8) x (4 * 8)
|
224 |
+
x = x.reshape(b, num_patches, -1)
|
225 |
+
x = self.MLP(x) # B x 256 x 768
|
226 |
+
x = self.norm(x)
|
227 |
+
return x
|
228 |
+
|
229 |
+
|
230 |
+
#################################################################################
|
231 |
+
# Core DiT Model #
|
232 |
+
#################################################################################
|
233 |
+
|
234 |
+
class RotaryAttention(nn.Module):
|
235 |
+
fused_attn: Final[bool]
|
236 |
+
|
237 |
+
def __init__(
|
238 |
+
self,
|
239 |
+
dim,
|
240 |
+
num_heads=8,
|
241 |
+
qkv_bias=False,
|
242 |
+
qk_norm=False,
|
243 |
+
attn_drop=0.,
|
244 |
+
proj_drop=0.,
|
245 |
+
norm_layer=nn.LayerNorm,
|
246 |
+
rotary_emb=None,
|
247 |
+
):
|
248 |
+
super().__init__()
|
249 |
+
assert dim % num_heads == 0, 'dim should be divisible by num_heads'
|
250 |
+
self.num_heads = num_heads
|
251 |
+
self.head_dim = dim // num_heads
|
252 |
+
self.scale = self.head_dim ** -0.5
|
253 |
+
self.fused_attn = use_fused_attn()
|
254 |
+
self.rotary_emb = rotary_emb
|
255 |
+
|
256 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
257 |
+
self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
|
258 |
+
self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
|
259 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
260 |
+
self.proj = nn.Linear(dim, dim)
|
261 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
262 |
+
|
263 |
+
def forward(self, x):
|
264 |
+
B, N, C = x.shape
|
265 |
+
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
|
266 |
+
q, k, v = qkv.unbind(0)
|
267 |
+
q, k = self.q_norm(q), self.k_norm(k)
|
268 |
+
|
269 |
+
if self.rotary_emb is not None:
|
270 |
+
q = self.rotary_emb.rotate_queries_or_keys(q)
|
271 |
+
k = self.rotary_emb.rotate_queries_or_keys(k)
|
272 |
+
|
273 |
+
if self.fused_attn:
|
274 |
+
x = F.scaled_dot_product_attention(
|
275 |
+
q, k, v,
|
276 |
+
dropout_p=self.attn_drop.p,
|
277 |
+
)
|
278 |
+
else:
|
279 |
+
q = q * self.scale
|
280 |
+
attn = q @ k.transpose(-2, -1)
|
281 |
+
attn = attn.softmax(dim=-1)
|
282 |
+
attn = self.attn_drop(attn)
|
283 |
+
x = attn @ v
|
284 |
+
|
285 |
+
x = x.transpose(1, 2).reshape(B, N, C)
|
286 |
+
x = self.proj(x)
|
287 |
+
x = self.proj_drop(x)
|
288 |
+
return x
|
289 |
+
|
290 |
+
|
291 |
+
class DiTBlock(nn.Module):
|
292 |
+
"""
|
293 |
+
A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.
|
294 |
+
"""
|
295 |
+
def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
|
296 |
+
super().__init__()
|
297 |
+
self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
298 |
+
self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
|
299 |
+
self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
300 |
+
mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
301 |
+
approx_gelu = lambda: nn.GELU(approximate="tanh")
|
302 |
+
self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
|
303 |
+
self.adaLN_modulation = nn.Sequential(
|
304 |
+
nn.SiLU(),
|
305 |
+
nn.Linear(hidden_size, 6 * hidden_size, bias=True)
|
306 |
+
)
|
307 |
+
|
308 |
+
def forward(self, x, c):
|
309 |
+
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
|
310 |
+
x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
|
311 |
+
x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
|
312 |
+
return x
|
313 |
+
|
314 |
+
|
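# A note on the adaLN-Zero conditioning above (a sketch, not part of the original file).
# It assumes the usual DiT helper defined earlier in this module:
#
#   modulate(x, shift, scale) = x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
#
# Because the final linear layer of adaLN_modulation is zero-initialized (see
# initialize_weights below), gate_msa and gate_mlp start at 0, so every block is the
# identity map at initialization and the conditioned residual branches fade in as
# training proceeds.
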
class DiTBlockRotary(nn.Module):
    """
    A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning & rotary attention.
    """
    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, rotary_emb=None, **block_kwargs):
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = RotaryAttention(hidden_size, num_heads=num_heads, qkv_bias=True, rotary_emb=rotary_emb, **block_kwargs)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )

    def forward(self, x, c):
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
        x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
        x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
        return x


class FinalLayer(nn.Module):
    """
    The final layer of DiT.
    """
    def __init__(self, hidden_size, patch_size, out_channels):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def forward(self, x, c):
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
        x = modulate(self.norm_final(x), shift, scale)
        x = self.linear(x)
        return x


class FinalLayerPatch1D(nn.Module):
    """
    The final layer of DiT with 1D patchify.
    """
    def __init__(self, hidden_size, out_channels, patch_size_1d=16):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_size_1d * out_channels, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def forward(self, x, c):
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
        x = modulate(self.norm_final(x), shift, scale)
        x = self.linear(x)
        return x


class DiT(nn.Module):
    """
    Diffusion model with a Transformer backbone.
    """
    def __init__(
        self,
        input_size=32,
        patch_size=2,
        in_channels=3,
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        num_classes=9,  # cluster composers into 9 groups
        learn_sigma=True,
        patchify=True,
    ):
        super().__init__()
        self.learn_sigma = learn_sigma
        self.in_channels = in_channels
        self.out_channels = in_channels * 2 if learn_sigma else in_channels
        self.patch_size = patch_size
        self.num_heads = num_heads
        self.input_size = input_size
        self.patchify = patchify

        if patchify:
            self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
        else:
            self.x_embedder = FlattenNorm(input_size, hidden_size)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.num_classes = num_classes
        if self.num_classes:
            self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
        num_patches = self.x_embedder.num_patches
        # Will use fixed sin-cos embedding:
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, hidden_size), requires_grad=False)

        self.blocks = nn.ModuleList([
            DiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)
        ])
        if patchify:
            self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
        else:
            self.final_layer = FinalLayerPatch1D(hidden_size, self.out_channels, patch_size)
        self.initialize_weights()

    def initialize_weights(self):
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

        # Initialize (and freeze) pos_embed by sin-cos embedding:
        if self.patchify:
            if isinstance(self.input_size, int) or len(self.input_size) == 1:
                pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5), int(self.x_embedder.num_patches ** 0.5))
            else:
                pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], self.x_embedder.grid_size[0], self.x_embedder.grid_size[1])
        else:
            # 1D position encoding
            pos_embed = get_1d_sincos_pos_embed_from_grid(self.pos_embed.shape[-1],
                                                          np.arange(self.x_embedder.num_patches, dtype=np.float32))
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        if self.patchify:
            w = self.x_embedder.proj.weight.data
            nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
            nn.init.constant_(self.x_embedder.proj.bias, 0)

        # Initialize label embedding table:
        if self.num_classes:
            nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

    def unpatchify(self, x):
        """
        x: (N, T, patch_size**2 * C)
        imgs: (N, H, W, C)
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        if isinstance(self.input_size, int) or len(self.input_size) == 1:
            h = w = int(x.shape[1] ** 0.5)
            assert h * w == x.shape[1]
        else:
            h = self.input_size[0] // self.patch_size
            w = self.input_size[1] // self.patch_size

        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
        return imgs

    def unflatten(self, x):
        c = self.out_channels
        x = x.reshape(shape=(x.shape[0], x.shape[1], c, -1))
        imgs = x.permute(0, 2, 1, 3)
        return imgs

    def forward(self, x, t, y=None):
        """
        Forward pass of DiT.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        t: (N,) tensor of diffusion timesteps
        y: (N,) tensor of class labels
        """
        x = self.x_embedder(x) + self.pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        c = self.t_embedder(t)                   # (N, D)
        if self.num_classes and y is not None:
            y = self.y_embedder(y, self.training)  # (N, D)
            c = c + y                              # (N, D)
        for block in self.blocks:
            x = block(x, c)                      # (N, T, D)
        x = self.final_layer(x, c)               # (N, T, patch_size ** 2 * out_channels)
        if self.patchify:
            x = self.unpatchify(x)               # (N, out_channels, H, W)
        else:
            x = self.unflatten(x)
        return x

    def forward_with_cfg(self, x, t, y, cfg_scale):
        """
        Forward pass of DiT, but also batches the unconditional forward pass for classifier-free guidance.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        model_out = self.forward(combined, t, y)
        # For exact reproducibility reasons, we apply classifier-free guidance on only
        # three channels by default. The standard approach to cfg applies it to all channels.
        # This can be done by uncommenting the following line and commenting-out the line following that.
        # eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
        eps, rest = model_out[:, :3], model_out[:, 3:]
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        eps = torch.cat([half_eps, half_eps], dim=0)
        return torch.cat([eps, rest], dim=1)

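# A minimal usage sketch (not part of the original file; the sizes are illustrative for
# 128x16 piano-roll latents with 4 channels):
#
#   model = DiT(input_size=(128, 16), patch_size=2, in_channels=4,
#               hidden_size=384, depth=12, num_heads=6, num_classes=9)
#   x = torch.randn(2, 4, 128, 16)    # noisy latents
#   t = torch.randint(0, 1000, (2,))  # diffusion timesteps
#   y = torch.randint(0, 9, (2,))     # composer-cluster labels
#   out = model(x, t, y)              # (2, 8, 128, 16): predicted eps plus variance channels
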
class DiTRotary(nn.Module):
    """
    Diffusion model with a Transformer backbone and rotary position embedding.
    Uses 1D position encoding; patchify is set to False.
    """

    def __init__(
        self,
        input_size=32,
        patch_size=8,  # patch size for 1D patchify
        in_channels=3,
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        num_classes=9,  # cluster composers into 9 groups
        learn_sigma=True,
    ):
        super().__init__()
        self.learn_sigma = learn_sigma
        self.in_channels = in_channels
        self.out_channels = in_channels * 2 if learn_sigma else in_channels
        self.patch_size = patch_size
        self.num_heads = num_heads
        self.input_size = input_size

        self.x_embedder = FlattenPatchify1D(in_channels, input_size, hidden_size, patch_size)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.num_classes = num_classes
        if self.num_classes:
            self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)

        rotary_dim = int(hidden_size // num_heads * 0.5)  # 0.5 is the rotary percentage in multi-head RoPE, 0.5 by default
        self.rotary_emb = RotaryEmbedding(rotary_dim)
        self.blocks = nn.ModuleList([
            DiTBlockRotary(hidden_size, num_heads, mlp_ratio=mlp_ratio, rotary_emb=self.rotary_emb) for _ in range(depth)
        ])
        self.final_layer = FinalLayerPatch1D(hidden_size, self.out_channels, patch_size_1d=self.patch_size)
        self.initialize_weights()

    def initialize_weights(self):
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize label embedding table:
        if self.num_classes:
            nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

    def unpatchify(self, x):
        """
        x: (N, T, img_size[1] / patch_size * C)
        imgs: (N, H, W, C)
        """
        # input_size[1] is the pitch dimension, should always be the same
        x = x.reshape(shape=(x.shape[0], -1, self.input_size[1], self.out_channels))
        imgs = x.permute(0, 3, 1, 2)
        return imgs

    def forward(self, x, t, y=None):
        """
        Forward pass of DiTRotary.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        t: (N,) tensor of diffusion timesteps
        y: (N,) tensor of class labels
        """
        x = self.x_embedder(x)      # (N, T, D), where T = H * W / patch_size
        c = self.t_embedder(t)      # (N, D)
        if self.num_classes and y is not None:
            y = self.y_embedder(y, self.training)  # (N, D)
            c = c + y                              # (N, D)
        for block in self.blocks:
            x = block(x, c)         # (N, T, D)
        x = self.final_layer(x, c)  # (N, T, patch_size * out_channels)
        x = self.unpatchify(x)
        return x


class DiT_classifier(nn.Module):
    """
    Classifier used in classifier guidance.
    """
    def __init__(
        self,
        input_size=32,
        patch_size=2,
        in_channels=3,
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        num_classes=9,
        patchify=True,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.num_heads = num_heads
        self.input_size = input_size
        self.patchify = patchify

        if patchify:
            self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
        else:
            self.x_embedder = FlattenNorm(input_size, hidden_size)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.num_classes = num_classes
        num_patches = self.x_embedder.num_patches
        # Will use fixed sin-cos embedding:
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, hidden_size), requires_grad=False)

        self.blocks = nn.ModuleList([
            DiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)
        ])
        self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size), requires_grad=True)
        self.norm = nn.LayerNorm(hidden_size)
        self.classifier_head = nn.Sequential(nn.Linear(hidden_size, hidden_size // 4),
                                             nn.SiLU(), nn.Linear(hidden_size // 4, self.num_classes))
        self.initialize_weights()

    def initialize_weights(self):
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

        if self.patchify:
            if isinstance(self.input_size, int) or len(self.input_size) == 1:
                pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5), int(self.x_embedder.num_patches ** 0.5))
            else:
                pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], self.x_embedder.grid_size[0], self.x_embedder.grid_size[1])
        else:
            # 1D position encoding
            pos_embed = get_1d_sincos_pos_embed_from_grid(self.pos_embed.shape[-1],
                                                          np.arange(self.x_embedder.num_patches, dtype=np.float32))
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        # Initialize class token
        nn.init.normal_(self.cls_token, std=1e-6)

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        if self.patchify:
            w = self.x_embedder.proj.weight.data
            nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
            nn.init.constant_(self.x_embedder.proj.bias, 0)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

    def forward(self, x, t):
        """
        Forward pass of the DiT classifier.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        t: (N,) tensor of diffusion timesteps
        """
        x = self.x_embedder(x) + self.pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        c = self.t_embedder(t)        # (N, D)
        for block in self.blocks:
            x = block(x, c)           # (N, T, D)
        x = x[:, 0, :]                # (N, D)
        x = self.norm(x)
        x = self.classifier_head(x)   # (N, num_classes)
        return x


class DiTRotaryClassifier(nn.Module):
    """
    Classifier with a Transformer backbone and rotary position embedding.
    Uses 1D position encoding; patchify is set to False.
    """

    def __init__(
        self,
        input_size=32,
        patch_size=8,  # patch size for 1D patchify
        in_channels=3,
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        num_classes=9,  # cluster composers into 9 groups
        chord=False,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.num_heads = num_heads
        self.input_size = input_size
        self.chord = chord
        self.hidden_size = hidden_size

        self.x_embedder = FlattenPatchify1D(in_channels, input_size, hidden_size, patch_size)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.num_classes = num_classes

        rotary_dim = int(hidden_size // num_heads * 0.5)  # 0.5 is the rotary percentage in multi-head RoPE, 0.5 by default
        self.rotary_emb = RotaryEmbedding(rotary_dim)
        self.blocks = nn.ModuleList([
            DiTBlockRotary(hidden_size, num_heads, mlp_ratio=mlp_ratio, rotary_emb=self.rotary_emb) for _ in range(depth)
        ])
        self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size), requires_grad=True)
        self.norm = nn.LayerNorm(hidden_size)
        self.classifier_head = nn.Sequential(nn.Linear(hidden_size, hidden_size // 4),
                                             nn.SiLU(), nn.Linear(hidden_size // 4, self.num_classes))
        if self.chord:
            self.norm_key = nn.LayerNorm(hidden_size)
            # predict key also: 24 major and minor keys + null
            self.classifier_head_key = nn.Sequential(nn.Linear(hidden_size, hidden_size // 4),
                                                     nn.SiLU(), nn.Linear(hidden_size // 4, 25))
        self.initialize_weights()

    def initialize_weights(self):
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize class token
        nn.init.normal_(self.cls_token, std=1e-6)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

    def forward(self, x, t, y=None):
        """
        Forward pass of the rotary DiT classifier.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        t: (N,) tensor of diffusion timesteps
        y: (N,) tensor of class labels
        """
        if self.chord:
            n_token = x.shape[2] // x.shape[3]
        x = self.x_embedder(x)  # (N, T, D), where T = H * W / patch_size
        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        c = self.t_embedder(t)  # (N, D)
        for block in self.blocks:
            x = block(x, c)     # (N, T, D)
        if self.chord:
            x_key = x[:, 0, :]
            x_key = self.norm_key(x_key)
            key = self.classifier_head_key(x_key)
            x_chord = x[:, 1:, :]
            x_chord = x_chord.reshape(shape=[x.shape[0], n_token, -1, self.hidden_size])
            x_chord = x_chord.mean(dim=-2)
            x_chord = self.norm(x_chord)
            chord = self.classifier_head(x_chord)
            return key, chord
        else:
            x = x[:, 0, :]               # (N, D)
            x = self.norm(x)
            x = self.classifier_head(x)  # (N, num_classes)
            return x


#################################################################################
#                   Sine/Cosine Positional Embedding Functions                  #
#################################################################################
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py

def get_2d_sincos_pos_embed(embed_dim, grid_size_h, grid_size_w, cls_token=False, extra_tokens=0):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    grid_h = np.arange(grid_size_h, dtype=np.float32)
    grid_w = np.arange(grid_size_w, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_size_h, grid_size_w])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token and extra_tokens > 0:
        pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.
    omega = 1. / 10000**omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb

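# A quick shape check for the helpers above (a sketch, not part of the original file).
# For a (128, 16) input with patch_size=2 the patch grid is 64 x 8, so:
#
#   pe = get_2d_sincos_pos_embed(embed_dim=768, grid_size_h=64, grid_size_w=8)
#   # pe.shape == (512, 768); half of each vector encodes one grid axis, half the other
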
#################################################################################
#                                  DiT Configs                                  #
#################################################################################

def DiT_XL_2(**kwargs):
    return DiT(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs)

def DiT_XL_4(**kwargs):
    return DiT(depth=28, hidden_size=1152, patch_size=4, num_heads=16, **kwargs)

def DiTRotary_XL_16(**kwargs):
    return DiTRotary(depth=28, hidden_size=1152, patch_size=16, num_heads=16, **kwargs)

def DiTRotary_XL_8(**kwargs):
    return DiTRotary(depth=28, hidden_size=1152, patch_size=8, num_heads=16, **kwargs)

def DiT_XL_8(**kwargs):
    return DiT(depth=28, hidden_size=1152, patch_size=8, num_heads=16, **kwargs)

def DiT_L_2(**kwargs):
    return DiT(depth=24, hidden_size=1024, patch_size=2, num_heads=16, **kwargs)

def DiT_L_4(**kwargs):
    return DiT(depth=24, hidden_size=1024, patch_size=4, num_heads=16, **kwargs)

def DiT_L_8(**kwargs):
    return DiT(depth=24, hidden_size=1024, patch_size=8, num_heads=16, **kwargs)

def DiT_B_2(**kwargs):
    return DiT(depth=12, hidden_size=768, patch_size=2, num_heads=12, **kwargs)

def DiT_B_4(**kwargs):
    return DiT(depth=12, hidden_size=768, patch_size=4, num_heads=12, **kwargs)

def DiTRotary_B_16(**kwargs):  # seq_len = 128 = 128 * 16 / 16
    return DiTRotary(depth=12, hidden_size=768, patch_size=16, num_heads=12, **kwargs)

def DiTRotary_B_8(**kwargs):  # seq_len = 256 = 128 * 16 / 8
    return DiTRotary(depth=12, hidden_size=768, patch_size=8, num_heads=12, **kwargs)

def DiT_B_8(**kwargs):
    return DiT(depth=12, hidden_size=768, patch_size=8, num_heads=12, **kwargs)

def DiT_B_4_classifier(**kwargs):
    return DiT_classifier(depth=12, hidden_size=768, patch_size=4, num_heads=12, **kwargs)

def DiT_B_8_classifier(**kwargs):
    return DiT_classifier(depth=12, hidden_size=768, patch_size=8, num_heads=12, **kwargs)

def DiTRotary_B_8_classifier(**kwargs):
    return DiTRotaryClassifier(depth=12, hidden_size=768, patch_size=8, num_heads=12, **kwargs)

def DiT_S_2(**kwargs):
    return DiT(depth=12, hidden_size=384, patch_size=2, num_heads=6, **kwargs)

def DiT_S_2_classifier(**kwargs):
    return DiT_classifier(depth=12, hidden_size=384, patch_size=2, num_heads=6, **kwargs)

def DiTRotary_S_8_classifier(**kwargs):
    return DiTRotaryClassifier(depth=12, hidden_size=384, patch_size=8, num_heads=6, **kwargs)

def DiTRotary_S_8_chord_classifier(**kwargs):
    return DiTRotaryClassifier(depth=12, hidden_size=384, patch_size=8, num_heads=6, chord=True, **kwargs)

def DiT_XS_2_classifier(**kwargs):
    return DiT_classifier(depth=4, hidden_size=384, patch_size=2, num_heads=6, **kwargs)

def DiTRotary_XS_8_classifier(**kwargs):
    return DiTRotaryClassifier(depth=4, hidden_size=384, patch_size=8, num_heads=6, **kwargs)

def DiT_S_4(**kwargs):
    return DiT(depth=12, hidden_size=384, patch_size=4, num_heads=6, **kwargs)

def DiT_S_4_classifier(**kwargs):
    return DiT_classifier(depth=12, hidden_size=384, patch_size=4, num_heads=6, **kwargs)

def DiT_S_8(**kwargs):
    return DiT(depth=12, hidden_size=384, patch_size=8, num_heads=6, **kwargs)


DiT_models = {
    'DiT-XL/2': DiT_XL_2, 'DiT-XL/4': DiT_XL_4, 'DiT-XL/8': DiT_XL_8,
    'DiT-L/2': DiT_L_2, 'DiT-L/4': DiT_L_4, 'DiT-L/8': DiT_L_8,
    'DiT-B/2': DiT_B_2, 'DiT-B/4': DiT_B_4, 'DiT-B/8': DiT_B_8,
    'DiT-S/2': DiT_S_2, 'DiT-S/4': DiT_S_4, 'DiT-S/8': DiT_S_8,
    'DiTRotary_B_16': DiTRotary_B_16, 'DiTRotary_B_8': DiTRotary_B_8,
    'DiTRotary_XL_16': DiTRotary_XL_16, 'DiTRotary_XL_8': DiTRotary_XL_8,
    'DiT-B/4-cls': DiT_B_4_classifier, 'DiT-B/8-cls': DiT_B_8_classifier,
    'DiT-S/4-cls': DiT_S_4_classifier, 'DiT-S/2-cls': DiT_S_2_classifier,
    'DiT-XS/2-cls': DiT_XS_2_classifier,
    'DiTRotary-XS/8-cls': DiTRotary_XS_8_classifier,
    'DiTRotary-S/8-cls': DiTRotary_S_8_classifier,
    'DiTRotary-S/8-chord-cls': DiTRotary_S_8_chord_classifier,
    'DiTRotary-B/8-cls': DiTRotary_B_8_classifier,
}
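# A lookup sketch (not part of the original file; the keyword values are illustrative).
# DiT_models maps config-string names to constructors, so scripts can pick an architecture
# by name and pass only the data-dependent arguments:
#
#   model_fn = DiT_models['DiTRotary_B_8']
#   model = model_fn(input_size=(128, 16), in_channels=4, num_classes=9)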
guided_diffusion/embed_datasets.py ADDED @@ -0,0 +1,161 @@
import math
import random
import os
import pandas as pd
import re
from PIL import Image
import blobfile as bf
from mpi4py import MPI
import numpy as np
from torch.utils.data import DataLoader, Dataset

CLUSTERS = {'Balakirev': 0,
            'Bartholdy': 0,
            'Bizet': 0,
            'Brahms': 0,
            'Busoni': 0,
            'Chopin': 0,
            'Grieg': 0,
            'Horowitz': 0,
            'Liszt': 0,
            'Mendelssohn': 0,
            'Moszkowski': 0,
            'Paganini': 0,
            'Saint-Saens': 0,
            'Schubert': 0,
            'Schumann': 0,
            'Strauss': 0,
            'Tchaikovsky': 0,
            'Wagner': 0,
            'Beethoven': 1,
            'Bach': 2,
            'Handel': 2,
            'Purcell': 2,
            'Barber': 3,
            'Bartok': 3,
            'Hindemith': 3,
            'Ligeti': 3,
            'Messiaen': 3,
            'Mussorgsky': 3,
            'Myaskovsky': 3,
            'Prokofiev': 3,
            'Schnittke': 3,
            'Schonberg': 3,
            'Shostakovich': 3,
            'Stravinsky': 3,
            'Debussy': 4,
            'Ravel': 4,
            'Clementi': 5,
            'Haydn': 5,
            'Mozart': 5,
            'Pachelbel': 5,
            'Scarlatti': 5,
            'Rachmaninoff': 6,
            'Scriabin': 6,
            'Gershwin': 7,
            'Kapustin': 7
            }


def extract_string(file_name):
    if 'loc' not in file_name:
        ind = [i.start() for i in re.finditer('_', file_name)][-1]
        name = file_name[:ind]
    else:
        name = file_name.split('loc')[0][:-1]
    return name


def find_composer(name, df):
    compound_composer = df.loc[df['simple_midi_name'] == name]['canonical_composer'].item()
    composer = compound_composer.split(' / ')[0].split(' ')[-1]  # take the last name of the first composer
    result = CLUSTERS.setdefault(composer, 8)  # default cluster is everyone else (8)
    return result


def load_data(
    *,
    data_dir,
    batch_size,
    class_cond=False,
    deterministic=False,
):
    """
    For a dataset, create a generator over (images, kwargs) pairs.

    Each image is an NCHW float tensor, and the kwargs dict contains zero or
    more keys, each of which maps to a batched Tensor of its own.
    The kwargs dict can be used for class labels, in which case the key is "y"
    and the values are integer tensors of class labels.

    :param data_dir: a dataset directory.
    :param batch_size: the batch size of each returned pair.
    :param class_cond: if True, include a "y" key in returned dicts for class
                       labels. If classes are not available and this is true, an
                       exception will be raised.
    :param deterministic: if True, yield results in a deterministic order.
    """
    if not data_dir:
        raise ValueError("unspecified data directory")
    all_files = _list_image_files(data_dir)
    classes = None
    if class_cond:
        # find the composer
        parent_dir = os.path.join(*data_dir.split('/')[:-1])
        if data_dir[0] == '/':
            parent_dir = '/' + parent_dir
        df = pd.read_csv(os.path.join(parent_dir, 'maestro-v3.0.0.csv'))
        df['simple_midi_name'] = [midi_name[5:-5] for midi_name in df['midi_filename']]
        all_file_names = bf.listdir(data_dir)
        extracted_names = [extract_string(file_name) for file_name in all_file_names]
        classes = [find_composer(name, df) for name in extracted_names]

    dataset = ImageDataset(
        all_files,
        classes=classes,
        shard=MPI.COMM_WORLD.Get_rank(),
        num_shards=MPI.COMM_WORLD.Get_size(),
    )
    if deterministic:
        loader = DataLoader(
            dataset, batch_size=batch_size, shuffle=False, num_workers=1, drop_last=True
        )
    else:
        loader = DataLoader(
            dataset, batch_size=batch_size, shuffle=True, num_workers=1, drop_last=True
        )
    while True:
        yield from loader


def _list_image_files(data_dir):
    dirs = bf.listdir(data_dir)
    return [data_dir + '/' + d for d in dirs]


class ImageDataset(Dataset):
    def __init__(
        self,
        image_paths,
        classes=None,
        shard=0,
        num_shards=1,
    ):
        super().__init__()
        self.local_images = image_paths[shard:][::num_shards]
        self.local_classes = None if classes is None else classes[shard:][::num_shards]

    def __len__(self):
        return len(self.local_images)

    def __getitem__(self, idx):
        path = self.local_images[idx]
        arr = np.load(path)
        out_dict = {}
        if self.local_classes is not None:
            out_dict["y"] = np.array(self.local_classes[idx], dtype=np.int64)
        return arr, out_dict
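# A minimal usage sketch (not part of the original file; the directory name is illustrative).
# load_data returns an infinite generator, so training code pulls batches with next()
# rather than iterating over epochs:
#
#   data = load_data(data_dir="datasets/train_latents", batch_size=16, class_cond=True)
#   batch, cond = next(data)  # batch: (16, C, H, W) tensor, cond["y"]: (16,) int64 labels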
guided_diffusion/fp16_util.py ADDED @@ -0,0 +1,237 @@
"""
Helpers to train with 16-bit precision.
"""

import numpy as np
import torch as th
import torch.nn as nn
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

from . import logger

INITIAL_LOG_LOSS_SCALE = 20.0


def convert_module_to_f16(l):
    """
    Convert primitive modules to float16.
    """
    if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
        l.weight.data = l.weight.data.half()
        if l.bias is not None:
            l.bias.data = l.bias.data.half()


def convert_module_to_f32(l):
    """
    Convert primitive modules to float32, undoing convert_module_to_f16().
    """
    if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
        l.weight.data = l.weight.data.float()
        if l.bias is not None:
            l.bias.data = l.bias.data.float()


def make_master_params(param_groups_and_shapes):
    """
    Copy model parameters into a (differently-shaped) list of full-precision
    parameters.
    """
    master_params = []
    for param_group, shape in param_groups_and_shapes:
        master_param = nn.Parameter(
            _flatten_dense_tensors(
                [param.detach().float() for (_, param) in param_group]
            ).view(shape)
        )
        master_param.requires_grad = True
        master_params.append(master_param)
    return master_params


def model_grads_to_master_grads(param_groups_and_shapes, master_params):
    """
    Copy the gradients from the model parameters into the master parameters
    from make_master_params().
    """
    for master_param, (param_group, shape) in zip(
        master_params, param_groups_and_shapes
    ):
        master_param.grad = _flatten_dense_tensors(
            [param_grad_or_zeros(param) for (_, param) in param_group]
        ).view(shape)


def master_params_to_model_params(param_groups_and_shapes, master_params):
    """
    Copy the master parameter data back into the model parameters.
    """
    # Without copying to a list, if a generator is passed, this will
    # silently not copy any parameters.
    for master_param, (param_group, _) in zip(master_params, param_groups_and_shapes):
        for (_, param), unflat_master_param in zip(
            param_group, unflatten_master_params(param_group, master_param.view(-1))
        ):
            param.detach().copy_(unflat_master_param)


def unflatten_master_params(param_group, master_param):
    return _unflatten_dense_tensors(master_param, [param for (_, param) in param_group])


def get_param_groups_and_shapes(named_model_params):
    named_model_params = list(named_model_params)
    scalar_vector_named_params = (
        [(n, p) for (n, p) in named_model_params if p.ndim <= 1],
        (-1),
    )
    matrix_named_params = (
        [(n, p) for (n, p) in named_model_params if p.ndim > 1],
        (1, -1),
    )
    return [scalar_vector_named_params, matrix_named_params]


def master_params_to_state_dict(
    model, param_groups_and_shapes, master_params, use_fp16
):
    if use_fp16:
        state_dict = model.state_dict()
        for master_param, (param_group, _) in zip(
            master_params, param_groups_and_shapes
        ):
            for (name, _), unflat_master_param in zip(
                param_group, unflatten_master_params(param_group, master_param.view(-1))
            ):
                assert name in state_dict
                state_dict[name] = unflat_master_param
    else:
        state_dict = model.state_dict()
        for i, (name, _value) in enumerate(model.named_parameters()):
            assert name in state_dict
            state_dict[name] = master_params[i]
    return state_dict


def state_dict_to_master_params(model, state_dict, use_fp16):
    if use_fp16:
        named_model_params = [
            (name, state_dict[name]) for name, _ in model.named_parameters()
        ]
        param_groups_and_shapes = get_param_groups_and_shapes(named_model_params)
        master_params = make_master_params(param_groups_and_shapes)
    else:
        master_params = [state_dict[name] for name, _ in model.named_parameters()]
    return master_params


def zero_master_grads(master_params):
    for param in master_params:
        param.grad = None


def zero_grad(model_params):
    for param in model_params:
        # Taken from https://pytorch.org/docs/stable/_modules/torch/optim/optimizer.html#Optimizer.add_param_group
        if param.grad is not None:
            param.grad.detach_()
            param.grad.zero_()


def param_grad_or_zeros(param):
    if param.grad is not None:
        return param.grad.data.detach()
    else:
        return th.zeros_like(param)


class MixedPrecisionTrainer:
    def __init__(
        self,
        *,
        model,
        use_fp16=False,
        fp16_scale_growth=1e-3,
        initial_lg_loss_scale=INITIAL_LOG_LOSS_SCALE,
    ):
        self.model = model
        self.use_fp16 = use_fp16
        self.fp16_scale_growth = fp16_scale_growth

        self.model_params = list(self.model.parameters())
        self.master_params = self.model_params
        self.param_groups_and_shapes = None
        self.lg_loss_scale = initial_lg_loss_scale

        if self.use_fp16:
            self.param_groups_and_shapes = get_param_groups_and_shapes(
                self.model.named_parameters()
            )
            self.master_params = make_master_params(self.param_groups_and_shapes)
            self.model.convert_to_fp16()

    def zero_grad(self):
        zero_grad(self.model_params)

    def backward(self, loss: th.Tensor):
        if self.use_fp16:
            loss_scale = 2 ** self.lg_loss_scale
            (loss * loss_scale).backward()
        else:
            loss.backward()

    def optimize(self, opt: th.optim.Optimizer):
        if self.use_fp16:
            return self._optimize_fp16(opt)
        else:
            return self._optimize_normal(opt)

    def _optimize_fp16(self, opt: th.optim.Optimizer):
        logger.logkv_mean("lg_loss_scale", self.lg_loss_scale)
        model_grads_to_master_grads(self.param_groups_and_shapes, self.master_params)
        grad_norm, param_norm = self._compute_norms(grad_scale=2 ** self.lg_loss_scale)
        if check_overflow(grad_norm):
            self.lg_loss_scale -= 1
            logger.log(f"Found NaN, decreased lg_loss_scale to {self.lg_loss_scale}")
            zero_master_grads(self.master_params)
            return False

        logger.logkv_mean("grad_norm", grad_norm)
        logger.logkv_mean("param_norm", param_norm)

        for p in self.master_params:
            p.grad.mul_(1.0 / (2 ** self.lg_loss_scale))
        opt.step()
        zero_master_grads(self.master_params)
        master_params_to_model_params(self.param_groups_and_shapes, self.master_params)
        self.lg_loss_scale += self.fp16_scale_growth
        return True

    def _optimize_normal(self, opt: th.optim.Optimizer):
        grad_norm, param_norm = self._compute_norms()
        logger.logkv_mean("grad_norm", grad_norm)
        logger.logkv_mean("param_norm", param_norm)
        opt.step()
        return True

    def _compute_norms(self, grad_scale=1.0):
        grad_norm = 0.0
        param_norm = 0.0
        for p in self.master_params:
            with th.no_grad():
                param_norm += th.norm(p, p=2, dtype=th.float32).item() ** 2
                if p.grad is not None:
                    grad_norm += th.norm(p.grad, p=2, dtype=th.float32).item() ** 2
        return np.sqrt(grad_norm) / grad_scale, np.sqrt(param_norm)

    def master_params_to_state_dict(self, master_params):
        return master_params_to_state_dict(
            self.model, self.param_groups_and_shapes, master_params, self.use_fp16
        )

    def state_dict_to_master_params(self, state_dict):
        return state_dict_to_master_params(self.model, state_dict, self.use_fp16)


def check_overflow(value):
    return (value == float("inf")) or (value == -float("inf")) or (value != value)
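# A sketch of the intended training-step pattern (not part of the original file; the model
# and loss are placeholders). With use_fp16=True, backward() scales the loss by
# 2 ** lg_loss_scale, and optimize() unscales the gradients, skips the step and shrinks
# the scale on overflow, and otherwise grows the scale by fp16_scale_growth:
#
#   trainer = MixedPrecisionTrainer(model=model, use_fp16=True)
#   opt = th.optim.AdamW(trainer.master_params, lr=1e-4)
#   trainer.zero_grad()
#   trainer.backward(loss)
#   took_step = trainer.optimize(opt)  # False if the scaled gradients overflowed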
guided_diffusion/gaussian_diffusion.py ADDED @@ -0,0 +1,1400 @@
"""
This code started out as a PyTorch port of Ho et al's diffusion models:
https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py

Docstrings have been added, as well as DDIM sampling and a new collection of beta schedules.
"""

import enum
import os

import math

import numpy as np
import torch as th

from .nn import mean_flat
from .losses import normal_kl, discretized_gaussian_log_likelihood
from .midi_util import save_piano_roll_midi
from music_rule_guidance.rule_maps import FUNC_DICT, LOSS_DICT
from collections import defaultdict
import torch.nn.functional as F
import multiprocessing
from functools import partial

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20, 3)
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300


def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Get a pre-defined beta schedule for the given name.

    The beta schedule library consists of beta schedules which remain similar
    in the limit of num_diffusion_timesteps.
    Beta schedules may be added, but should not be removed or changed once
    they are committed to maintain backwards compatibility.
    """
    if schedule_name == "linear":
        # Linear schedule from Ho et al, extended to work for any number of
        # diffusion steps.
        scale = 1000 / num_diffusion_timesteps
        beta_start = scale * 0.0001
        beta_end = scale * 0.02
        return np.linspace(
            beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64
        )
    elif schedule_name == "cosine":
        return betas_for_alpha_bar(
            num_diffusion_timesteps,
            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
        )
    elif schedule_name == 'stable-diffusion':
        scale = 1000 / num_diffusion_timesteps
        beta_start = scale * math.sqrt(0.00085)
        beta_end = scale * math.sqrt(0.012)
        return np.linspace(
            beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64
        ) ** 2
    else:
        raise NotImplementedError(f"unknown beta schedule: {schedule_name}")


def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Create a beta schedule that discretizes the given alpha_t_bar function,
    which defines the cumulative product of (1-beta) over time from t = [0,1].

    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
                      produces the cumulative product of (1-beta) up to that
                      part of the diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    """
    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
    return np.array(betas)
83 |
+
|
84 |
+
|
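
# ---------------------------------------------------------------------------
# Editorial sketch (not part of the upstream file): a minimal check of how the
# two schedule helpers above fit together. `get_named_beta_schedule` returns a
# length-T array of betas, and `betas_for_alpha_bar` derives each beta from a
# continuous alpha_bar(t) via beta_i = 1 - alpha_bar(t2) / alpha_bar(t1).
def _example_beta_schedules(num_steps=1000):
    betas = get_named_beta_schedule("cosine", num_steps)
    assert betas.shape == (num_steps,)
    assert (betas > 0).all() and (betas <= 0.999).all()
    # The running product of (1 - beta) recovers a discretization of alpha_bar.
    alpha_bar = np.cumprod(1.0 - betas)
    return betas, alpha_bar
# ---------------------------------------------------------------------------
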
class ModelMeanType(enum.Enum):
    """
    Which type of output the model predicts.
    """

    PREVIOUS_X = enum.auto()  # the model predicts x_{t-1}
    START_X = enum.auto()  # the model predicts x_0
    EPSILON = enum.auto()  # the model predicts epsilon


class ModelVarType(enum.Enum):
    """
    What is used as the model's output variance.

    The LEARNED_RANGE option has been added to allow the model to predict
    values between FIXED_SMALL and FIXED_LARGE, making its job easier.
    """

    LEARNED = enum.auto()
    FIXED_SMALL = enum.auto()
    FIXED_LARGE = enum.auto()
    LEARNED_RANGE = enum.auto()


class LossType(enum.Enum):
    MSE = enum.auto()  # use raw MSE loss (and KL when learning variances)
    RESCALED_MSE = (
        enum.auto()
    )  # use raw MSE loss (with RESCALED_KL when learning variances)
    KL = enum.auto()  # use the variational lower-bound
    RESCALED_KL = enum.auto()  # like KL, but rescale to estimate the full VLB

    def is_vb(self):
        return self == LossType.KL or self == LossType.RESCALED_KL
class GaussianDiffusion:
    """
    Utilities for training and sampling diffusion models.

    Ported directly from here, and then adapted over time to further experimentation.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42

    :param betas: a 1-D numpy array of betas for each diffusion timestep,
                  starting at T and going to 1.
    :param model_mean_type: a ModelMeanType determining what the model outputs.
    :param model_var_type: a ModelVarType determining how variance is output.
    :param loss_type: a LossType determining the loss function to use.
    :param rescale_timesteps: if True, pass floating point timesteps into the
                              model so that they are always scaled like in the
                              original paper (0 to 1000).
    """

    def __init__(
        self,
        *,
        betas,
        model_mean_type,
        model_var_type,
        loss_type,
        rescale_timesteps=False,
    ):
        self.model_mean_type = model_mean_type
        self.model_var_type = model_var_type
        self.loss_type = loss_type
        self.rescale_timesteps = rescale_timesteps

        # Use float64 for accuracy.
        betas = np.array(betas, dtype=np.float64)
        self.betas = betas
        assert len(betas.shape) == 1, "betas must be 1-D"
        assert (betas > 0).all() and (betas <= 1).all()

        self.num_timesteps = int(betas.shape[0])

        alphas = 1.0 - betas
        self.alphas_cumprod = np.cumprod(alphas, axis=0)
        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)

        # calculations for posterior q(x_{t-1} | x_t, x_0)
        self.posterior_variance = (
            betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        # log calculation clipped because the posterior variance is 0 at the
        # beginning of the diffusion chain.
        self.posterior_log_variance_clipped = np.log(
            np.append(self.posterior_variance[1], self.posterior_variance[1:])
        )
        self.posterior_mean_coef1 = (
            betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        self.posterior_mean_coef2 = (
            (1.0 - self.alphas_cumprod_prev)
            * np.sqrt(alphas)
            / (1.0 - self.alphas_cumprod)
        )
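    # Editorial note (not in the upstream file): with the buffers above, the
    # Gaussian posterior q(x_{t-1} | x_t, x_0) has mean
    #     posterior_mean_coef1[t] * x_0 + posterior_mean_coef2[t] * x_t
    # and variance posterior_variance[t]. A minimal construction sketch:
    #     diffusion = GaussianDiffusion(
    #         betas=get_named_beta_schedule("linear", 1000),
    #         model_mean_type=ModelMeanType.EPSILON,
    #         model_var_type=ModelVarType.FIXED_SMALL,
    #         loss_type=LossType.MSE,
    #     )
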
    def q_mean_variance(self, x_start, t):
        """
        Get the distribution q(x_t | x_0).

        :param x_start: the [N x C x ...] tensor of noiseless inputs.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
        """
        mean = (
            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
        )
        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
        log_variance = _extract_into_tensor(
            self.log_one_minus_alphas_cumprod, t, x_start.shape
        )
        return mean, variance, log_variance

    def q_sample(self, x_start, t, noise=None):
        """
        Diffuse the data for a given number of diffusion steps.

        In other words, sample from q(x_t | x_0).

        :param x_start: the initial data batch.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :param noise: if specified, the split-out normal noise.
        :return: A noisy version of x_start.
        """
        if noise is None:
            noise = th.randn_like(x_start)
        assert noise.shape == x_start.shape
        return (
            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
            + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
            * noise
        )

    def q_posterior_mean_variance(self, x_start, x_t, t):
        """
        Compute the mean and variance of the diffusion posterior:

            q(x_{t-1} | x_t, x_0)

        """
        assert x_start.shape == x_t.shape
        posterior_mean = (
            _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
            + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
        )
        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
        posterior_log_variance_clipped = _extract_into_tensor(
            self.posterior_log_variance_clipped, t, x_t.shape
        )
        assert (
            posterior_mean.shape[0]
            == posterior_variance.shape[0]
            == posterior_log_variance_clipped.shape[0]
            == x_start.shape[0]
        )
        return posterior_mean, posterior_variance, posterior_log_variance_clipped
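    # Editorial sketch (not in the upstream file): q_sample implements the
    # closed form x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps.
    # For a batch of latents z0 of shape [N, C, H, W], forward noising is:
    #     t = th.randint(0, diffusion.num_timesteps, (z0.shape[0],), device=z0.device)
    #     eps = th.randn_like(z0)
    #     zt = diffusion.q_sample(z0, t, noise=eps)
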
    def p_mean_variance(
        self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None,
        cond_fn=None, embed_model=None, edit_kwargs=None,
    ):
        """
        Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
        the initial x, x_0.

        :param model: the model, which takes a signal and a batch of timesteps
                      as input.
        :param x: the [N x C x ...] tensor at time t.
        :param t: a 1-D Tensor of timesteps.
        :param clip_denoised: if True, clip the denoised signal into [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample. Applies before
            clip_denoised.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :param cond_fn: log p(y|x), to maximize
        :param embed_model: contains encoder and decoder
        :param edit_kwargs: replacement-based conditioning
        :return: a dict with the following keys:
                 - 'mean': the model mean output.
                 - 'variance': the model variance output.
                 - 'log_variance': the log of 'variance'.
                 - 'pred_xstart': the prediction for x_0.
        """
        def process_xstart(x):
            if denoised_fn is not None:
                x = denoised_fn(x)
            if clip_denoised:
                return x.clamp(-1, 1)
            return x

        if model_kwargs is None:
            model_kwargs = {}

        B, C = x.shape[:2]
        assert t.shape == (B,)
        model_output = model(x, self._scale_timesteps(t), **model_kwargs)

        if edit_kwargs is not None:
            pred_xstart = process_xstart(
                self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
            )
            replaced_x0 = edit_kwargs["mask"] * edit_kwargs["gt"] + (1 - edit_kwargs["mask"]) * pred_xstart
            model_output = self._predict_eps_from_xstart(x_t=x, t=t, pred_xstart=replaced_x0)

        if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
            assert model_output.shape == (B, C * 2, *x.shape[2:])
            model_output, model_var_values = th.split(model_output, C, dim=1)
            if self.model_var_type == ModelVarType.LEARNED:
                model_log_variance = model_var_values
                model_variance = th.exp(model_log_variance)
            else:
                min_log = _extract_into_tensor(
                    self.posterior_log_variance_clipped, t, x.shape
                )
                max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
                # The model_var_values is [-1, 1] for [min_var, max_var].
                frac = (model_var_values + 1) / 2
                model_log_variance = frac * max_log + (1 - frac) * min_log
                model_variance = th.exp(model_log_variance)
        else:
            model_variance, model_log_variance = {
                # for fixedlarge, we set the initial (log-)variance like so
                # to get a better decoder log likelihood.
                ModelVarType.FIXED_LARGE: (
                    np.append(self.posterior_variance[1], self.betas[1:]),
                    np.log(np.append(self.posterior_variance[1], self.betas[1:])),
                ),
                ModelVarType.FIXED_SMALL: (
                    self.posterior_variance,
                    self.posterior_log_variance_clipped,
                ),
            }[self.model_var_type]
            model_variance = _extract_into_tensor(model_variance, t, x.shape)
            model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)

        if self.model_mean_type == ModelMeanType.PREVIOUS_X:
            pred_xstart = process_xstart(
                self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output)
            )
            model_mean = model_output
        elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]:
            if self.model_mean_type == ModelMeanType.START_X:
                pred_xstart = process_xstart(model_output)
            else:
                pred_xstart = process_xstart(
                    self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
                )
            model_mean, _, _ = self.q_posterior_mean_variance(
                x_start=pred_xstart, x_t=x, t=t
            )
        else:
            raise NotImplementedError(self.model_mean_type)

        assert (
            model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
        )
        return {
            "mean": model_mean,
            "variance": model_variance,
            "log_variance": model_log_variance,
            "pred_xstart": pred_xstart,
        }
    def _predict_xstart_from_eps(self, x_t, t, eps):
        assert x_t.shape == eps.shape
        return (
            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
            - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
        )

    def _predict_xstart_from_xprev(self, x_t, t, xprev):
        assert x_t.shape == xprev.shape
        return (  # (xprev - coef2*x_t) / coef1
            _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev
            - _extract_into_tensor(
                self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape
            )
            * x_t
        )

    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
        return (
            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
            - pred_xstart
        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)

    def _scale_timesteps(self, t):
        if self.rescale_timesteps:
            return t.float() * (1000.0 / self.num_timesteps)
        return t
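    # Editorial note (not in the upstream file): the _predict_* helpers above
    # are inverses of the same identity
    #     x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps,
    # rewritten with the precomputed buffers as
    #     x_0 = sqrt(1 / alpha_bar_t) * x_t - sqrt(1 / alpha_bar_t - 1) * eps.
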
    def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None, guidance_kwargs=None,
                       model=None, embed_model=None, edit_kwargs=None, scale_factor=1.,
                       record=False):
        """
        Compute the mean for the previous step, given a function cond_fn that
        computes the gradient of a conditional log probability with respect to
        x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
        condition on y.

        If dps=True, use diffusion posterior sampling; cond_fn is then log p(y|x_0)
        instead of its gradient. This path needs the model (eps) and embed_model.

        This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
        """
        dps = True if guidance_kwargs.method == 'dps' else False
        if not dps:
            if edit_kwargs is None:
                gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs)
                new_mean = (
                    p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
                )
            else:
                # only compute the gradient on editable latents, since the rule only covers the editable length
                x = x[:, :, edit_kwargs["l_start"]:edit_kwargs["l_end"], :]
                gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs)
                new_mean = p_mean_var["mean"].float()
                new_mean[:, :, edit_kwargs["l_start"]:edit_kwargs["l_end"], :] += (
                    p_mean_var["variance"] * gradient.float())
        else:
            assert model is not None
            step_size = guidance_kwargs.step_size
            with th.enable_grad():
                xt = x.detach().requires_grad_(True)
                eps = model(xt, self._scale_timesteps(t), **model_kwargs)
                pred_xstart = self._predict_xstart_from_eps(xt, t, eps)
                # If the vae is not None, and not dps_nn, i.e. using a dps rule
                if embed_model is not None and not guidance_kwargs.nn:
                    pred_xstart = _decode(pred_xstart, embed_model, scale_factor=scale_factor)
                if record:
                    pred_xstart.retain_grad()
                if edit_kwargs is not None:
                    # only check the condition on the editable part
                    pred_xstart = pred_xstart[:, :, edit_kwargs["l_start"]:edit_kwargs["l_end"], :]
                log_probs = cond_fn(pred_xstart, self._scale_timesteps(t), **model_kwargs)
                gradient = th.autograd.grad(log_probs.sum(), xt)[0]

            # check if the x_0 space works
            if record:
                pred_xstart_up = pred_xstart + pred_xstart.grad
                log_probs_up = cond_fn(pred_xstart_up, self._scale_timesteps(t), **model_kwargs)
                # record gradient difference
                cur_grad_diff = (self.prev_gradient_single - gradient).reshape(x.shape[0], -1).norm(dim=-1)
                prev_gradient_norm = self.prev_gradient_single.reshape(x.shape[0], -1).norm(dim=-1)
                if prev_gradient_norm.mean() > 1e-5:
                    self.grad_norm.append(prev_gradient_norm.mean().item())
                    cur_grad_diff = cur_grad_diff / prev_gradient_norm
                    self.gradient_diff.append(cur_grad_diff.mean().item())
                self.prev_gradient_single = gradient
                self.log_probs.append((log_probs.mean().item()))

            gradient = gradient / th.sqrt(-log_probs.view(x.shape[0], 1, 1, 1) + 1e-12)
            # gradient = gradient / (-log_probs.view(x.shape[0], 1, 1, 1) + 1e-12)

            if edit_kwargs is None:
                new_mean = (
                    p_mean_var["mean"].float() + step_size * gradient.float()
                )
            else:
                new_mean = p_mean_var["mean"].float()
                new_mean[:, :, edit_kwargs["l_start"]:edit_kwargs["l_end"], :] += step_size * gradient.float()

            # check whether we moved in a good direction in z space
            if record:
                eps = model(xt + step_size * gradient.float(), self._scale_timesteps(t), **model_kwargs)
                pred_xstart_2 = self._predict_xstart_from_eps(xt, t, eps)
                pred_xstart_2 = _decode(pred_xstart_2, embed_model, scale_factor=scale_factor)
                log_probs_2 = cond_fn(pred_xstart_2, self._scale_timesteps(t), **model_kwargs)

        return new_mean
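    # Editorial sketch (not in the upstream file): the cond_fn contract used
    # above. In the non-DPS branch it must return grad_x log p(y | x_t); in the
    # DPS branch it returns log p(y | x_0) itself and this method differentiates
    # through the decoded x_0 prediction. A hypothetical rule-based example:
    #     def cond_fn(x0, t, **kwargs):
    #         rule_name = "note_density"   # hypothetical choice of rule key
    #         gen_rule = FUNC_DICT[rule_name](x0)
    #         return -LOSS_DICT[rule_name](gen_rule, kwargs["rule"][rule_name])
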
    def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
        """
        Compute what the p_mean_variance output would have been, should the
        model's score function be conditioned by cond_fn.

        See condition_mean() for details on cond_fn.

        Unlike condition_mean(), this instead uses the conditioning strategy
        from Song et al (2020).
        """
        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)

        eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
        eps = eps - (1 - alpha_bar).sqrt() * cond_fn(
            x, self._scale_timesteps(t), **model_kwargs
        )

        out = p_mean_var.copy()
        out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
        out["mean"], _, _ = self.q_posterior_mean_variance(
            x_start=out["pred_xstart"], x_t=x, t=t
        )
        return out
    def scg_sample(self,
                   model,
                   t,
                   mean_pred,
                   g_coeff,
                   embed_model,
                   scale_factor,
                   model_kwargs=None,
                   scg_kwargs=None,
                   edit_kwargs=None,
                   dc_kwargs=None,
                   record=False,
                   record_freq=100):
        """
        Sample N x_{t-1} from x_t and select the best one.
        """
        # mean_pred = p_mean_var["mean"]
        # g_coeff = th.exp(0.5 * p_mean_var["log_variance"])
        num_samples = scg_kwargs["num_samples"]
        sample = mean_pred.unsqueeze(dim=0)
        sample = sample.expand(num_samples, *mean_pred.shape).contiguous()
        noise = th.randn_like(sample)
        sample = sample + g_coeff * noise
        sample = sample.view(-1, *mean_pred.shape[1:])
        t = t.repeat(num_samples)
        # it's fine to use a different target per sample: expand and repeat interleave consistently (012012)
        cloned_model_kwargs = {"y": model_kwargs["y"].repeat(num_samples)}
        eps = model(sample, self._scale_timesteps(t), **cloned_model_kwargs)
        pred_xstart = self._predict_xstart_from_eps(sample, t, eps)
        if edit_kwargs is not None:
            # only decode the editable part
            pred_xstart = pred_xstart[:, :, edit_kwargs["l_start"]:edit_kwargs["l_end"], :]
        if embed_model is not None:
            pred_xstart = _decode(pred_xstart, embed_model, scale_factor=scale_factor)

        if dc_kwargs is None or dc_kwargs.base <= 0:
            if record:
                # create a dictionary to record the loss for each rule
                each_loss = {}
            # work with multiple rules, model_kwargs["rule"] is a dict that contains rule_name: target
            total_log_prob = 0
            for rule_name, rule_target in model_kwargs["rule"].items():
                gen_rule = _extract_rule(rule_name, pred_xstart)
                y_ = rule_target.repeat(num_samples, 1)
                log_prob = - LOSS_DICT[rule_name](gen_rule, y_)
                if record:
                    each_loss[rule_name] = -log_prob.view(num_samples, -1)
                total_log_prob += log_prob * scg_kwargs.get(rule_name, 1.)
            total_log_prob = total_log_prob.view(num_samples, -1)
            max_ind = total_log_prob.argmax(dim=0)

            # softmax (need to reweight to get unit var, otherwise goes to empty rolls)
            # weight = F.softmax(total_log_prob * 1., dim=0)
            # var = (weight ** 2).sum(dim=0)
            # avg_noise = (noise * weight[..., None, None, None]).sum(dim=0) / th.sqrt(var)[..., None, None, None]
            # # not adding dw
            # sample = mean_pred + g_coeff * avg_noise
            # # add dw
            # dw = th.randn_like(p_mean_var["mean"])
            # sample = mean_pred + g_coeff * (avg_noise + dw)

            # take argmax
            sample = sample.view(num_samples, *mean_pred.shape)
            sample = sample[max_ind, th.arange(mean_pred.shape[0])]

            # take argmax, and add dw
            # noise = noise.view(num_samples, *p_mean_var["mean"].shape)
            # best_noise = noise[max_ind, th.arange(p_mean_var["mean"].shape[0])]
            # dw = th.randn_like(p_mean_var["mean"])
            # sample = p_mean_var["mean"] + th.exp(0.5 * p_mean_var["log_variance"]) * (best_noise + dw)

        else:
            # Assuming base length in x0 is only controlled by the corresponding location in xt
            # (doesn't hold, but maybe can approximate because of cond ind)
            sample = sample.view(num_samples, *mean_pred.shape)
            sub_samples = []
            total_length = pred_xstart.shape[-1]
            start_inds = th.arange(0, total_length, dc_kwargs.base*8)
            rule_base = dc_kwargs.base // 16  # number of rules under the base length
            for i, start_ind in enumerate(start_inds):
                end_ind = min(start_ind+dc_kwargs.base*8, total_length)
                pred_xstart_cur = pred_xstart[:, :, :, start_ind: end_ind]
                total_log_prob = 0
                for rule_name, rule_target in model_kwargs["rule"].items():
                    gen_rule = _extract_rule(rule_name, pred_xstart_cur)
                    if rule_name == 'note_density':
                        half = rule_target.shape[-1] // 2
                        vt_nd_target = rule_target[:, :half][:, i*rule_base: min((i+1)*rule_base, half)]
                        hr_nd_target = rule_target[:, half:][:, i*rule_base: min((i+1)*rule_base, half)]
                        rule_target = th.concat((vt_nd_target, hr_nd_target), dim=-1)
                    elif 'chord' in rule_name:
                        rule_length = rule_target.shape[-1]
                        rule_target = rule_target[:, i*rule_base: min((i+1)*rule_base, rule_length)]
                    y_ = rule_target.repeat(num_samples, 1)
                    log_prob = - LOSS_DICT[rule_name](gen_rule, y_)
                    total_log_prob += log_prob * scg_kwargs.get(rule_name, 1.)
                total_log_prob = total_log_prob.view(num_samples, -1)
                max_ind = total_log_prob.argmax(dim=0)
                # take argmax on num_sample x batch_size x 4 x 256 x 16
                sub_sample = sample[max_ind, th.arange(mean_pred.shape[0]), :, start_ind//8: end_ind//8]
                sub_samples.append(sub_sample)
            sample = th.concat(sub_samples, dim=-2)

        if record:
            for rule_name, loss in each_loss.items():
                current_loss = loss[max_ind, th.arange(mean_pred.shape[0])][0].item()
                self.each_loss[rule_name].append((t[0].item(), current_loss))
            max_log_prob = total_log_prob[max_ind, th.arange(mean_pred.shape[0])][0].item()
            # record log_prob
            self.log_probs.append((t[0].item(), max_log_prob))
            # record loss std
            self.loss_std.append((t[0].item(), total_log_prob.std().item()))
            # record loss range
            self.loss_range.append((t[0].item(), (max_log_prob - total_log_prob.min()).abs().item()))
            # record gradient difference
            noise = noise.view(num_samples, *mean_pred.shape)
            gradient = noise[max_ind, th.arange(mean_pred.shape[0])]
            cur_grad_diff = (self.prev_gradient_single - gradient).reshape(sample.shape[0], -1).norm(dim=-1)
            prev_gradient_norm = self.prev_gradient_single.reshape(sample.shape[0], -1).norm(dim=-1)
            if prev_gradient_norm.mean() > 1e-5:
                self.grad_norm.append(prev_gradient_norm.mean().item())
                cur_grad_diff = cur_grad_diff / prev_gradient_norm
                self.gradient_diff.append(cur_grad_diff.mean().item())
            self.prev_gradient_single = gradient
            if (t[0] + 1) % record_freq == 0:
                pred_xstart = pred_xstart.view(num_samples, -1, *pred_xstart.shape[1:])
                pred_xstart = pred_xstart[max_ind, th.arange(mean_pred.shape[0])]
                pred_xstart[pred_xstart <= -0.95] = -1.  # heuristically threshold the background
                pred_xstart = ((pred_xstart + 1) * 63.5).clamp(0, 127).to(th.uint8)
                self.inter_piano_rolls.append(pred_xstart.cpu())

            # plot the loss distribution
            if len(model_kwargs["rule"].keys()) <= 1:
                plt.figure(figsize=(4, 3))
                total_log_prob = total_log_prob.view(-1).cpu()
                plt.bar(range(len(total_log_prob)), -total_log_prob)
                plt.xlabel('choice')
                plt.ylabel('loss')
                plt.title(f't={t[0]+1}')
                plt.tight_layout()
                plt.savefig(f'loggings/debug/t={t[0]+1}.png')
                plt.show()
        return sample
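    # Editorial note (not in the upstream file): scg_sample is best-of-N search
    # over the stochastic reverse transition. Schematically, per step t:
    #     candidates  x_{t-1}^(i) = mean_pred + g_coeff * eps_i,  i = 1..num_samples
    #     score_i     = sum_r  w_r * log p_r( y_r | x0_hat(x_{t-1}^(i)) )
    #     keep        argmax_i score_i   (per batch element)
    # where each rule r scores the decoded x_0 prediction against its target.
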
    def p_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        embed_model=None,
        scale_factor=1.,
        guidance_kwargs=None,
        scg_kwargs=None,
        edit_kwargs=None,
        record=False,
    ):
        """
        Sample x_{t-1} from the model at the given timestep.

        :param model: the model to sample from.
        :param x: the current tensor at x_{t-1}.
        :param t: the value of t, starting at 0 for the first diffusion step.
        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample.
        :param cond_fn: if not None, this is a gradient function that acts
                        similarly to the model.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict containing the following keys:
                 - 'sample': a random sample from the model.
                 - 'pred_xstart': a prediction of x_0.
        """
        if guidance_kwargs is not None:
            if guidance_kwargs.schedule:
                t_start = guidance_kwargs.t_start
                t_end = guidance_kwargs.t_end
                interval = guidance_kwargs.interval
                use_guidance = guide_schedule(t, t_start, t_end, interval)
            else:
                use_guidance = True
        else:
            use_guidance = False
        out = self.p_mean_variance(
            model,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
            cond_fn=cond_fn,
            embed_model=embed_model,
            edit_kwargs=edit_kwargs,
        )

        # if scg guidance is used, the schedule only applies to scg sampling
        if cond_fn is not None and (use_guidance or scg_kwargs is not None):
            out["mean"] = self.condition_mean(
                cond_fn, out, x, t, model_kwargs=model_kwargs,
                guidance_kwargs=guidance_kwargs, model=model, embed_model=embed_model,
                edit_kwargs=edit_kwargs, scale_factor=scale_factor
            )

        if scg_kwargs is None:
            noise = th.randn_like(x)
            nonzero_mask = (
                (t > self.t_end).float().view(-1, *([1] * (len(x.shape) - 1)))
            )  # no noise when t == t_end (0 if not early stopping)
            sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise

        else:  # scg search (greedy)
            if t[0] > self.t_end:
                mean_pred = out["mean"]
                g_coeff = th.exp(0.5 * out["log_variance"])
                if use_guidance:
                    dc_kwargs = getattr(guidance_kwargs, 'dc', None)
                    sample = self.scg_sample(model, t, mean_pred, g_coeff, embed_model, scale_factor,
                                             model_kwargs=model_kwargs, scg_kwargs=scg_kwargs,
                                             edit_kwargs=edit_kwargs, dc_kwargs=dc_kwargs, record=record)
                else:
                    sample = mean_pred + g_coeff * th.randn_like(x)
                    if record:
                        eps = model(sample, self._scale_timesteps(t), **model_kwargs)
                        pred_xstart = self._predict_xstart_from_eps(sample, t, eps)
                        pred_xstart = _decode(pred_xstart, embed_model, scale_factor=scale_factor)
                        if len(model_kwargs["rule"].keys()) <= 1:
                            # only record for individual rules to save time
                            total_log_prob = 0
                            for rule_name, rule_target in model_kwargs["rule"].items():
                                gen_rule = _extract_rule(rule_name, pred_xstart)
                                log_prob = - LOSS_DICT[rule_name](gen_rule, rule_target)
                                total_log_prob += log_prob.mean().item() * scg_kwargs.get(rule_name, 1.)
                            self.log_probs.append((t[0].item(), total_log_prob))
                            if (t[0] + 1) % 100 == 0:
                                pred_xstart[pred_xstart <= -0.95] = -1.  # heuristically threshold the background
                                pred_xstart = ((pred_xstart + 1) * 63.5).clamp(0, 127).to(th.uint8)
                                self.inter_piano_rolls.append(pred_xstart.cpu())
            else:
                sample = out["mean"]

        return {"sample": sample, "pred_xstart": out["pred_xstart"]}
    def p_sample_loop(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        t_end=0,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
        embed_model=None,
        scale_factor=1.,
        guidance_kwargs=None,
        scg_kwargs=None,
        edit_kwargs=None,
        record=False,
    ):
        """
        Generate samples from the model.

        :param model: the model module.
        :param shape: the shape of the samples, (N, C, H, W).
        :param noise: if specified, the noise from the encoder to sample.
                      Should be of the same shape as `shape`.
        :param clip_denoised: if True, clip x_start predictions to [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample.
        :param t_end: early stopping for the sampling process
        :param cond_fn: if not None, this is a gradient function that acts
                        similarly to the model.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :param device: if specified, the device to create the samples on.
                       If not specified, use a model parameter's device.
        :param progress: if True, show a tqdm progress bar.
        :return: a non-differentiable batch of samples.
        """
        final = None
        self.t_end = t_end
        if record:
            self.prev_gradient_single = th.zeros(shape, device=device)
            self.gradient_diff = []
            self.grad_norm = []
            self.log_probs = []
            # record loss for each rule
            self.each_loss = defaultdict(list)
            self.inter_piano_rolls = []
            self.loss_std = []
            self.loss_range = []
        for sample in self.p_sample_loop_progressive(
            model,
            shape,
            noise=noise,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            t_end=t_end,
            cond_fn=cond_fn,
            model_kwargs=model_kwargs,
            device=device,
            progress=progress,
            embed_model=embed_model,
            scale_factor=scale_factor,
            guidance_kwargs=guidance_kwargs,
            scg_kwargs=scg_kwargs,
            edit_kwargs=edit_kwargs,
            record=record,
        ):
            final = sample
        return final["sample"]
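    # Editorial sketch (not in the upstream file): a minimal ancestral-sampling
    # call, assuming an epsilon-prediction model on 4 x 256 x 16 latents:
    #     samples = diffusion.p_sample_loop(
    #         model,
    #         (batch_size, 4, 256, 16),
    #         model_kwargs={"y": class_labels},
    #         progress=True,
    #     )
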
    def p_sample_loop_progressive(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        t_end=0,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
        embed_model=None,
        scale_factor=1.,
        guidance_kwargs=None,
        scg_kwargs=None,
        edit_kwargs=None,
        record=False,
    ):
        """
        Generate samples from the model and yield intermediate samples from
        each timestep of diffusion.

        Arguments are the same as p_sample_loop().
        Returns a generator over dicts, where each dict is the return value of
        p_sample().
        """
        if device is None:
            device = next(model.parameters()).device
        assert isinstance(shape, (tuple, list))
        if noise is not None:
            img = noise
        elif edit_kwargs is not None:
            t = th.tensor([edit_kwargs["noise_level"]-1] * shape[0], device=device)
            alpha_cumprod = _extract_into_tensor(self.alphas_cumprod, t, shape)
            img = th.sqrt(alpha_cumprod) * edit_kwargs["gt"] + th.sqrt((1 - alpha_cumprod)) * th.randn(*shape, device=device)
        else:
            img = th.randn(*shape, device=device)
        indices = list(range(self.num_timesteps))[::-1]
        if t_end:
            indices = indices[:-t_end]
        if edit_kwargs is not None:
            t_start = self.num_timesteps - edit_kwargs["noise_level"]
            indices = indices[t_start:]

        if progress:
            # Lazy import so that we don't depend on tqdm.
            from tqdm.auto import tqdm

            indices = tqdm(indices)

        for i in indices:
            t = th.tensor([i] * shape[0], device=device)
            with th.no_grad():
                out = self.p_sample(
                    model,
                    img,
                    t,
                    clip_denoised=clip_denoised,
                    denoised_fn=denoised_fn,
                    cond_fn=cond_fn,
                    model_kwargs=model_kwargs,
                    embed_model=embed_model,
                    scale_factor=scale_factor,
                    guidance_kwargs=guidance_kwargs,
                    scg_kwargs=scg_kwargs,
                    edit_kwargs=edit_kwargs,
                    record=record,
                )
                yield out
                img = out["sample"]
    def ddim_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        eta=0.0,
        embed_model=None,
        scale_factor=1.,
        guidance_kwargs=None,
        edit_kwargs=None,
        scg_kwargs=None,
        record=False,
    ):
        """
        Sample x_{t-1} from the model using DDIM.

        Same usage as p_sample().
        """
        if guidance_kwargs is not None:
            if guidance_kwargs.schedule:
                t_start = guidance_kwargs.t_start
                t_end = guidance_kwargs.t_end
                interval = guidance_kwargs.interval
                use_guidance = guide_schedule(t, t_start, t_end, interval)
            else:
                use_guidance = True
        else:
            use_guidance = False
        out = self.p_mean_variance(
            model,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
            cond_fn=cond_fn,
            embed_model=embed_model,
            edit_kwargs=edit_kwargs,
        )
        if cond_fn is not None and use_guidance:
            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)

        # Usually our model outputs epsilon, but we re-derive it
        # in case we used x_start or x_prev prediction.
        eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])

        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
        alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
        sigma = (
            eta
            * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
            * th.sqrt(1 - alpha_bar / alpha_bar_prev)
        )
        # Equation 12.
        mean_pred = (
            out["pred_xstart"] * th.sqrt(alpha_bar_prev)
            + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
        )
        if scg_kwargs is None:
            noise = th.randn_like(x)
            nonzero_mask = (
                (t != self.t_end).float().view(-1, *([1] * (len(x.shape) - 1)))
            )  # no noise when t == t_end (0 if not early stopping)
            sample = mean_pred + nonzero_mask * sigma * noise
        else:
            if t[0] > self.t_end:
                g_coeff = sigma
                if use_guidance:  # tune according to ddim steps
                    dc_kwargs = getattr(guidance_kwargs, 'dc', None)
                    sample = self.scg_sample(self._wrap_model(model), t, mean_pred, g_coeff, embed_model, scale_factor,
                                             model_kwargs=model_kwargs, scg_kwargs=scg_kwargs, edit_kwargs=edit_kwargs,
                                             dc_kwargs=dc_kwargs, record=record, record_freq=10)
                else:
                    sample = mean_pred + g_coeff * th.randn_like(x)
                    if record:
                        eps = self._wrap_model(model)(sample, self._scale_timesteps(t), **model_kwargs)
                        pred_xstart = self._predict_xstart_from_eps(sample, t, eps)
                        pred_xstart = _decode(pred_xstart, embed_model, scale_factor=scale_factor)
                        total_log_prob = 0
                        for rule_name, rule_target in model_kwargs["rule"].items():
                            gen_rule = _extract_rule(rule_name, pred_xstart)
                            log_prob = - LOSS_DICT[rule_name](gen_rule, rule_target)
                            total_log_prob += log_prob.mean().item() * scg_kwargs.get(rule_name, 1.)
                        self.log_probs.append((t[0].item(), total_log_prob))

                        if (t[0] + 1) % 10 == 0:
                            pred_xstart[pred_xstart <= -0.95] = -1.  # heuristically threshold the background
                            pred_xstart = ((pred_xstart + 1) * 63.5).clamp(0, 127).to(th.uint8)
                            self.inter_piano_rolls.append(pred_xstart.cpu())
            else:
                sample = mean_pred
        return {"sample": sample, "pred_xstart": out["pred_xstart"]}
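    # Editorial note (not in the upstream file): in the update above, eta = 0
    # gives the deterministic DDIM sampler, while eta = 1 makes sigma equal to
    # the ancestral posterior std, recovering DDPM-like stochastic sampling
    # (see the "Equation 12" reference from Song et al (2020) in the comment).
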
    def ddim_reverse_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        denoised_fn=None,
        model_kwargs=None,
        eta=0.0,
    ):
        """
        Sample x_{t+1} from the model using DDIM reverse ODE.
        """
        assert eta == 0.0, "Reverse ODE only for deterministic path"
        out = self.p_mean_variance(
            model,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        # Usually our model outputs epsilon, but we re-derive it
        # in case we used x_start or x_prev prediction.
        eps = (
            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
            - out["pred_xstart"]
        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
        alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)

        # Equation 12. reversed
        mean_pred = (
            out["pred_xstart"] * th.sqrt(alpha_bar_next)
            + th.sqrt(1 - alpha_bar_next) * eps
        )

        return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
    def ddim_sample_loop(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        t_end=0,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
        eta=0.0,
        embed_model=None,
        scale_factor=1.,
        guidance_kwargs=None,
        scg_kwargs=None,
        edit_kwargs=None,
        record=False,
    ):
        """
        Generate samples from the model using DDIM.

        Same usage as p_sample_loop().
        """
        final = None
        self.t_end = t_end
        if record:
            self.prev_gradient_single = th.zeros(shape, device=device)
            self.gradient_diff = []
            self.grad_norm = []
            self.log_probs = []
            self.inter_piano_rolls = []
            self.loss_std = []
            self.loss_range = []
        for sample in self.ddim_sample_loop_progressive(
            model,
            shape,
            noise=noise,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            t_end=t_end,
            cond_fn=cond_fn,
            model_kwargs=model_kwargs,
            device=device,
            progress=progress,
            eta=eta,
            embed_model=embed_model,
            scale_factor=scale_factor,
            guidance_kwargs=guidance_kwargs,
            scg_kwargs=scg_kwargs,
            edit_kwargs=edit_kwargs,
            record=record,
        ):
            final = sample
        return final["sample"]
    def ddim_sample_loop_progressive(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        t_end=0,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
        eta=0.0,
        embed_model=None,
        scale_factor=1.,
        guidance_kwargs=None,
        scg_kwargs=None,
        edit_kwargs=None,
        record=False,
    ):
        """
        Use DDIM to sample from the model and yield intermediate samples from
        each timestep of DDIM.

        Same usage as p_sample_loop_progressive().
        """
        if device is None:
            device = next(model.parameters()).device
        assert isinstance(shape, (tuple, list))
        if noise is not None:
            img = noise
        elif edit_kwargs is not None:
            t = th.tensor([edit_kwargs["noise_level"]-1] * shape[0], device=device)
            alpha_cumprod = _extract_into_tensor(self.alphas_cumprod, t, shape)
            img = th.sqrt(alpha_cumprod) * edit_kwargs["gt"] + th.sqrt((1 - alpha_cumprod)) * th.randn(*shape, device=device)
        else:
            img = th.randn(*shape, device=device)
        indices = list(range(self.num_timesteps))[::-1]
        if t_end:
            indices = indices[:-t_end]
        if edit_kwargs is not None:
            t_start = self.num_timesteps - edit_kwargs["noise_level"]
            indices = indices[t_start:]

        if progress:
            # Lazy import so that we don't depend on tqdm.
            from tqdm.auto import tqdm

            indices = tqdm(indices)

        for i in indices:
            t = th.tensor([i] * shape[0], device=device)
            with th.no_grad():
                out = self.ddim_sample(
                    model,
                    img,
                    t,
                    clip_denoised=clip_denoised,
                    denoised_fn=denoised_fn,
                    cond_fn=cond_fn,
                    model_kwargs=model_kwargs,
                    eta=eta,
                    embed_model=embed_model,
                    scale_factor=scale_factor,
                    guidance_kwargs=guidance_kwargs,
                    scg_kwargs=scg_kwargs,
                    edit_kwargs=edit_kwargs,
                    record=record,
                )
                yield out
                img = out["sample"]
    def _vb_terms_bpd(
        self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
    ):
        """
        Get a term for the variational lower-bound.

        The resulting units are bits (rather than nats, as one might expect).
        This allows for comparison to other papers.

        :return: a dict with the following keys:
                 - 'output': a shape [N] tensor of NLLs or KLs.
                 - 'pred_xstart': the x_0 predictions.
        """
        true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
            x_start=x_start, x_t=x_t, t=t
        )
        out = self.p_mean_variance(
            model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
        )
        kl = normal_kl(
            true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
        )
        kl = mean_flat(kl) / np.log(2.0)

        decoder_nll = -discretized_gaussian_log_likelihood(
            x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
        )
        assert decoder_nll.shape == x_start.shape
        decoder_nll = mean_flat(decoder_nll) / np.log(2.0)

        # At the first timestep return the decoder NLL,
        # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
        output = th.where((t == 0), decoder_nll, kl)
        return {"output": output, "pred_xstart": out["pred_xstart"]}

    def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
        """
        Compute training losses for a single timestep.

        :param model: the model to evaluate loss on.
        :param x_start: the [N x C x ...] tensor of inputs.
        :param t: a batch of timestep indices.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :param noise: if specified, the specific Gaussian noise to try to remove.
        :return: a dict with the key "loss" containing a tensor of shape [N].
                 Some mean or variance settings may also have other keys.
        """
        if model_kwargs is None:
            model_kwargs = {}
        if noise is None:
            noise = th.randn_like(x_start)
        x_t = self.q_sample(x_start, t, noise=noise)

        terms = {}

        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
            terms["loss"] = self._vb_terms_bpd(
                model=model,
                x_start=x_start,
                x_t=x_t,
                t=t,
                clip_denoised=False,
                model_kwargs=model_kwargs,
            )["output"]
            if self.loss_type == LossType.RESCALED_KL:
                terms["loss"] *= self.num_timesteps
        elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
            model_output = model(x_t, self._scale_timesteps(t), **model_kwargs)

            if self.model_var_type in [
                ModelVarType.LEARNED,
                ModelVarType.LEARNED_RANGE,
            ]:
                B, C = x_t.shape[:2]
                assert model_output.shape == (B, C * 2, *x_t.shape[2:])
                model_output, model_var_values = th.split(model_output, C, dim=1)
                # Learn the variance using the variational bound, but don't let
                # it affect our mean prediction.
                frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
                terms["vb"] = self._vb_terms_bpd(
                    model=lambda *args, r=frozen_out: r,
                    x_start=x_start,
                    x_t=x_t,
                    t=t,
                    clip_denoised=False,
                )["output"]
                if self.loss_type == LossType.RESCALED_MSE:
                    # Divide by 1000 for equivalence with initial implementation.
                    # Without a factor of 1/1000, the VB term hurts the MSE term.
                    terms["vb"] *= self.num_timesteps / 1000.0

            target = {
                ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
                    x_start=x_start, x_t=x_t, t=t
                )[0],
                ModelMeanType.START_X: x_start,
                ModelMeanType.EPSILON: noise,
            }[self.model_mean_type]
            assert model_output.shape == target.shape == x_start.shape
            terms["mse"] = mean_flat((target - model_output) ** 2)
            if "vb" in terms:
                terms["loss"] = terms["mse"] + terms["vb"]
            else:
                terms["loss"] = terms["mse"]
        else:
            raise NotImplementedError(self.loss_type)

        return terms
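    # Editorial sketch (not in the upstream file): one training step with the
    # MSE objective, assuming an optimizer and an epsilon-prediction model:
    #     t = th.randint(0, diffusion.num_timesteps, (x_start.shape[0],), device=x_start.device)
    #     losses = diffusion.training_losses(model, x_start, t)
    #     losses["loss"].mean().backward()
    #     optimizer.step()
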
    def _prior_bpd(self, x_start):
        """
        Get the prior KL term for the variational lower-bound, measured in
        bits-per-dim.

        This term can't be optimized, as it only depends on the encoder.

        :param x_start: the [N x C x ...] tensor of inputs.
        :return: a batch of [N] KL values (in bits), one per batch element.
        """
        batch_size = x_start.shape[0]
        t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
        qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
        kl_prior = normal_kl(
            mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
        )
        return mean_flat(kl_prior) / np.log(2.0)

    def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
        """
        Compute the entire variational lower-bound, measured in bits-per-dim,
        as well as other related quantities.

        :param model: the model to evaluate loss on.
        :param x_start: the [N x C x ...] tensor of inputs.
        :param clip_denoised: if True, clip denoised samples.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.

        :return: a dict containing the following keys:
                 - total_bpd: the total variational lower-bound, per batch element.
                 - prior_bpd: the prior term in the lower-bound.
                 - vb: an [N x T] tensor of terms in the lower-bound.
                 - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
                 - mse: an [N x T] tensor of epsilon MSEs for each timestep.
        """
        device = x_start.device
        batch_size = x_start.shape[0]

        vb = []
        xstart_mse = []
        mse = []
        for t in list(range(self.num_timesteps))[::-1]:
            t_batch = th.tensor([t] * batch_size, device=device)
            noise = th.randn_like(x_start)
            x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
            # Calculate VLB term at the current timestep
            with th.no_grad():
                out = self._vb_terms_bpd(
                    model,
                    x_start=x_start,
                    x_t=x_t,
                    t=t_batch,
                    clip_denoised=clip_denoised,
                    model_kwargs=model_kwargs,
                )
            vb.append(out["output"])
            xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
            eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
            mse.append(mean_flat((eps - noise) ** 2))

        vb = th.stack(vb, dim=1)
        xstart_mse = th.stack(xstart_mse, dim=1)
        mse = th.stack(mse, dim=1)

        prior_bpd = self._prior_bpd(x_start)
        total_bpd = vb.sum(dim=1) + prior_bpd
        return {
            "total_bpd": total_bpd,
            "prior_bpd": prior_bpd,
            "vb": vb,
            "xstart_mse": xstart_mse,
            "mse": mse,
        }
def _extract_into_tensor(arr, timesteps, broadcast_shape):
    """
    Extract values from a 1-D numpy array for a batch of indices.

    :param arr: the 1-D numpy array.
    :param timesteps: a tensor of indices into the array to extract.
    :param broadcast_shape: a larger shape of K dimensions with the batch
                            dimension equal to the length of timesteps.
    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
    """
    res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
    while len(res.shape) < len(broadcast_shape):
        res = res[..., None]
    return res.expand(broadcast_shape)
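# Editorial sketch (not part of the upstream file): _extract_into_tensor picks
# one scalar per batch element and reshapes it for broadcasting, e.g.:
#     arr = np.linspace(0.0, 1.0, 1000)
#     t = th.tensor([0, 499, 999])
#     out = _extract_into_tensor(arr, t, (3, 4, 256, 16))  # shape [3, 4, 256, 16]
#     # out[i] is filled with arr[t[i]] everywhere.
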
def _decode(pred_zstart, embed_model, scale_factor=1., threshold=False):
    image_size_h = pred_zstart.shape[-2]
    image_size_w = pred_zstart.shape[-1]
    pred_zstart = pred_zstart / scale_factor
    sample = pred_zstart.permute(0, 1, 3, 2)
    sample = th.chunk(sample, image_size_h // image_size_w, dim=-1)  # B x C x H x W
    sample = th.concat(sample, dim=0)  # 1st second for all batch, 2nd second for all batch, ...
    sample = embed_model.decode(sample)
    pred_xstart = th.concat(th.chunk(sample, image_size_h // image_size_w, dim=0), dim=-1)
    if threshold:
        pred_xstart[pred_xstart <= -0.95] = -1.  # heuristically threshold the background
    return pred_xstart


def _extract_rule(rule_name, pred_xstart):
    device = pred_xstart.device
    if 'chord' in rule_name:
        # Split the tensor batch into smaller batches
        num_processes = 4
        pred_xstart = pred_xstart.cpu()
        pred_xstart_split = th.chunk(pred_xstart, num_processes)
        # rule_func = partial(FUNC_DICT[rule_name], given_key="C major")  # todo: hard code key here
        rule_func = FUNC_DICT[rule_name]
        with multiprocessing.Pool(processes=num_processes) as pool:
            gen_rule = pool.map(rule_func, pred_xstart_split)
        # Combine results
        if len(gen_rule[0].shape) == 1:  # batch_size * branching_factor < 4
            gen_rule = [item.unsqueeze(dim=0) for item in gen_rule]
        gen_rule = th.concat(gen_rule, dim=0).to(device)

    else:
        gen_rule = FUNC_DICT[rule_name](pred_xstart)
    return gen_rule


def _encode(pred_xstart, embed_model, scale_factor=1.):
    image_size_h = pred_xstart.shape[-2]
    image_size_w = pred_xstart.shape[-1]
    seq_len = image_size_w // image_size_h
    micro = th.chunk(pred_xstart, seq_len, dim=-1)  # B x C x H x W
    micro = th.concat(micro, dim=0)  # 1st second for all batch, 2nd second for all batch, ...
    micro = embed_model.encode_save(micro, range_fix=False)
    if micro.shape[1] == 8:
        z, _ = th.chunk(micro, 2, dim=1)
    else:
        z = micro
    z = th.concat(th.chunk(z, seq_len, dim=0), dim=-1)
    z = z.permute(0, 1, 3, 2)
    return z * scale_factor


def guide_schedule(t, t_start=750, t_end=0, interval=1):
    flag = t_start > t[0] >= t_end and (t[0] + 1) % interval == 0
    return flag
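# ---------------------------------------------------------------------------
# Editorial end-of-file sketch (not part of the upstream file): how the pieces
# above compose for rule-guided generation, under assumed shapes and a
# hypothetical rule target:
#     diffusion = GaussianDiffusion(
#         betas=get_named_beta_schedule("linear", 1000),
#         model_mean_type=ModelMeanType.EPSILON,
#         model_var_type=ModelVarType.LEARNED_RANGE,
#         loss_type=LossType.RESCALED_MSE,
#     )
#     samples = diffusion.p_sample_loop(
#         model, (1, 4, 256, 16),
#         model_kwargs={"y": y, "rule": {"note_density": target}},
#         embed_model=vae, scale_factor=scale_factor,
#         scg_kwargs={"num_samples": 16},
#     )
# ---------------------------------------------------------------------------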
guided_diffusion/logger.py
ADDED
@@ -0,0 +1,521 @@
"""
Logger copied from OpenAI baselines to avoid extra RL-based dependencies:
https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/logger.py
"""

import os
import sys
import shutil
import os.path as osp
import json
import time
import datetime
import tempfile
import warnings
from collections import defaultdict
from contextlib import contextmanager

import wandb

DEBUG = 10
INFO = 20
WARN = 30
ERROR = 40

DISABLED = 50


class KVWriter(object):
    def writekvs(self, kvs):
        raise NotImplementedError


class SeqWriter(object):
    def writeseq(self, seq):
        raise NotImplementedError


class HumanOutputFormat(KVWriter, SeqWriter):
    def __init__(self, filename_or_file):
        if isinstance(filename_or_file, str):
            self.file = open(filename_or_file, "wt")
            self.own_file = True
        else:
            assert hasattr(filename_or_file, "read"), (
                "expected file or str, got %s" % filename_or_file
            )
            self.file = filename_or_file
            self.own_file = False

    def writekvs(self, kvs):
        # Create strings for printing
        key2str = {}
        for (key, val) in sorted(kvs.items()):
            if hasattr(val, "__float__"):
                valstr = "%-8.3g" % val
            else:
                valstr = str(val)
            key2str[self._truncate(key)] = self._truncate(valstr)

        # Find max widths
        if len(key2str) == 0:
            print("WARNING: tried to write empty key-value dict")
            return
        else:
            keywidth = max(map(len, key2str.keys()))
            valwidth = max(map(len, key2str.values()))

        # Write out the data
        dashes = "-" * (keywidth + valwidth + 7)
        lines = [dashes]
        for (key, val) in sorted(key2str.items(), key=lambda kv: kv[0].lower()):
            lines.append(
                "| %s%s | %s%s |"
                % (key, " " * (keywidth - len(key)), val, " " * (valwidth - len(val)))
            )
        lines.append(dashes)
        self.file.write("\n".join(lines) + "\n")

        # Flush the output to the file
        self.file.flush()

    def _truncate(self, s):
        maxlen = 30
        return s[: maxlen - 3] + "..." if len(s) > maxlen else s

    def writeseq(self, seq):
        seq = list(seq)
        for (i, elem) in enumerate(seq):
            self.file.write(elem)
            if i < len(seq) - 1:  # add space unless this is the last one
                self.file.write(" ")
        self.file.write("\n")
        self.file.flush()

    def close(self):
        if self.own_file:
            self.file.close()


class JSONOutputFormat(KVWriter):
    def __init__(self, filename):
        self.file = open(filename, "wt")

    def writekvs(self, kvs):
        for k, v in sorted(kvs.items()):
            if hasattr(v, "dtype"):
                kvs[k] = float(v)
        self.file.write(json.dumps(kvs) + "\n")
        self.file.flush()

    def close(self):
        self.file.close()


class CSVOutputFormat(KVWriter):
    def __init__(self, filename):
        self.file = open(filename, "w+t")
        self.keys = []
        self.sep = ","

    def writekvs(self, kvs):
        # Add our current row to the history
        extra_keys = list(kvs.keys() - self.keys)
        extra_keys.sort()
        if extra_keys:
            self.keys.extend(extra_keys)
            self.file.seek(0)
            lines = self.file.readlines()
            self.file.seek(0)
            for (i, k) in enumerate(self.keys):
                if i > 0:
                    self.file.write(",")
                self.file.write(k)
            self.file.write("\n")
            for line in lines[1:]:
                self.file.write(line[:-1])
                self.file.write(self.sep * len(extra_keys))
                self.file.write("\n")
        for (i, k) in enumerate(self.keys):
            if i > 0:
                self.file.write(",")
            v = kvs.get(k)
            if v is not None:
                self.file.write(str(v))
        self.file.write("\n")
        self.file.flush()

    def close(self):
        self.file.close()


class TensorBoardOutputFormat(KVWriter):
    """
    Dumps key/value pairs into TensorBoard's numeric format.
    """

    def __init__(self, dir):
        os.makedirs(dir, exist_ok=True)
        self.dir = dir
        self.step = 1
        prefix = "events"
        path = osp.join(osp.abspath(dir), prefix)
        import tensorflow as tf
        from tensorflow.python import pywrap_tensorflow
        from tensorflow.core.util import event_pb2
        from tensorflow.python.util import compat

        self.tf = tf
        self.event_pb2 = event_pb2
        self.pywrap_tensorflow = pywrap_tensorflow
        self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path))

    def writekvs(self, kvs):
        def summary_val(k, v):
            kwargs = {"tag": k, "simple_value": float(v)}
            return self.tf.Summary.Value(**kwargs)

        summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()])
        event = self.event_pb2.Event(wall_time=time.time(), summary=summary)
        event.step = (
            self.step
        )  # is there any reason why you'd want to specify the step?
        self.writer.WriteEvent(event)
        self.writer.Flush()
        self.step += 1

    def close(self):
        if self.writer:
            self.writer.Close()
            self.writer = None


class WandbOutputFormat(KVWriter):
    def __init__(self, args):
        wandb.init(project=args.project, config=vars(args))

    def writekvs(self, kvs):
        step = int(kvs["step"])
        wandb.log(kvs, step=step)

    def close(self):
        pass


def make_output_format(format, ev_dir, args, log_suffix=""):
    os.makedirs(ev_dir, exist_ok=True)
    if format == "stdout":
        return HumanOutputFormat(sys.stdout)
    elif format == "log":
        return HumanOutputFormat(osp.join(ev_dir, "log%s.txt" % log_suffix))
    elif format == "json":
        return JSONOutputFormat(osp.join(ev_dir, "progress%s.json" % log_suffix))
    elif format == "csv":
        return CSVOutputFormat(osp.join(ev_dir, "progress%s.csv" % log_suffix))
    elif format == "tensorboard":
        return TensorBoardOutputFormat(osp.join(ev_dir, "tb%s" % log_suffix))
    elif format == "wandb":
        return WandbOutputFormat(args)
    else:
        raise ValueError("Unknown format specified: %s" % (format,))


# ================================================================
# API
# ================================================================


def logkv(key, val):
    """
    Log a value of some diagnostic
    Call this once for each diagnostic quantity, each iteration
    If called many times, last value will be used.
    """
    get_current().logkv(key, val)


def logkv_mean(key, val):
    """
    The same as logkv(), but if called many times, values averaged.
    """
    get_current().logkv_mean(key, val)


def logkvs(d):
    """
    Log a dictionary of key-value pairs
    """
    for (k, v) in d.items():
        logkv(k, v)


def dumpkvs():
    """
    Write all of the diagnostics from the current iteration
    """
    return get_current().dumpkvs()


def getkvs():
    return get_current().name2val


def log(*args, level=INFO):
    """
    Write the sequence of args, with no separators, to the console and output files (if you've configured an output file).
    """
    get_current().log(*args, level=level)


def debug(*args):
    log(*args, level=DEBUG)


def info(*args):
    log(*args, level=INFO)


def warn(*args):
    log(*args, level=WARN)


def error(*args):
    log(*args, level=ERROR)


def set_level(level):
    """
    Set logging threshold on current logger.
    """
    get_current().set_level(level)


def set_comm(comm):
    get_current().set_comm(comm)


def get_dir():
    """
    Get directory that log files are being written to.
    will be None if there is no output directory (i.e., if you didn't call start)
    """
    return get_current().get_dir()


record_tabular = logkv
dump_tabular = dumpkvs


@contextmanager
def profile_kv(scopename):
    logkey = "wait_" + scopename
    tstart = time.time()
    try:
        yield
    finally:
        get_current().name2val[logkey] += time.time() - tstart


def profile(n):
    """
    Usage:
    @profile("my_func")
    def my_func(): code
    """

    def decorator_with_name(func):
        def func_wrapper(*args, **kwargs):
            with profile_kv(n):
                return func(*args, **kwargs)

        return func_wrapper

    return decorator_with_name


# ================================================================
# Backend
# ================================================================


def get_current():
    if Logger.CURRENT is None:
        _configure_default_logger()

    return Logger.CURRENT


class Logger(object):
    DEFAULT = None  # A logger with no output files. (See right below class definition)
    # So that you can still log to the terminal without setting up any output files
    CURRENT = None  # Current logger being used by the free functions above

    def __init__(self, dir, output_formats, comm=None):
        self.name2val = defaultdict(float)  # values this iteration
        self.name2cnt = defaultdict(int)
        self.level = INFO
        self.dir = dir
        self.output_formats = output_formats
        self.comm = comm

    # Logging API, forwarded
    # ----------------------------------------
    def logkv(self, key, val):
        self.name2val[key] = val

    def logkv_mean(self, key, val):
        oldval, cnt = self.name2val[key], self.name2cnt[key]
        self.name2val[key] = oldval * cnt / (cnt + 1) + val / (cnt + 1)
        self.name2cnt[key] = cnt + 1

    def dumpkvs(self):
        if self.comm is None:
            d = self.name2val
        else:
            d = mpi_weighted_mean(
                self.comm,
                {
                    name: (val, self.name2cnt.get(name, 1))
                    for (name, val) in self.name2val.items()
                },
            )
            if self.comm.rank != 0:
                d["dummy"] = 1  # so we don't get a warning about empty dict
        out = d.copy()  # Return the dict for unit testing purposes
        for fmt in self.output_formats:
            if isinstance(fmt, KVWriter):
                fmt.writekvs(d)
        self.name2val.clear()
        self.name2cnt.clear()
        return out

    def log(self, *args, level=INFO):
        if self.level <= level:
            self._do_log(args)

    # Configuration
    # ----------------------------------------
    def set_level(self, level):
        self.level = level

    def set_comm(self, comm):
        self.comm = comm

    def get_dir(self):
        return self.dir

    def close(self):
        for fmt in self.output_formats:
            fmt.close()

    # Misc
    # ----------------------------------------
    def _do_log(self, args):
        for fmt in self.output_formats:
            if isinstance(fmt, SeqWriter):
                fmt.writeseq(map(str, args))


def get_rank_without_mpi_import():
    # check environment variables here instead of importing mpi4py
    # to avoid calling MPI_Init() when this module is imported
    for varname in ["PMI_RANK", "OMPI_COMM_WORLD_RANK"]:
        if varname in os.environ:
            return int(os.environ[varname])
    return 0


def mpi_weighted_mean(comm, local_name2valcount):
    """
    Copied from: https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/common/mpi_util.py#L110
    Perform a weighted average over dicts that are each on a different node
    Input: local_name2valcount: dict mapping key -> (value, count)
    Returns: key -> mean
    """
    all_name2valcount = comm.gather(local_name2valcount)
    if comm.rank == 0:
        name2sum = defaultdict(float)
        name2count = defaultdict(float)
        for n2vc in all_name2valcount:
            for (name, (val, count)) in n2vc.items():
                try:
                    val = float(val)
                except ValueError:
                    if comm.rank == 0:
                        warnings.warn(
                            "WARNING: tried to compute mean on non-float {}={}".format(
                                name, val
                            )
                        )
                else:
                    name2sum[name] += val * count
                    name2count[name] += count
        return {name: name2sum[name] / name2count[name] for name in name2sum}
    else:
        return {}


def configure(args=None, format_strs=None, comm=None, log_suffix=""):
    """
    If comm is provided, average all numerical stats across that comm
    """
    dir = args.dir
    if dir is not None:
        if "loggings" not in dir:  # save under cur dir
            dir = osp.join("loggings", dir)
    else:
        if dir is None:
            dir = os.getenv("OPENAI_LOGDIR")
        if dir is None:
            dir = osp.join(
                # tempfile.gettempdir(),
                "loggings",
                datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"),
            )
    assert isinstance(dir, str)
    dir = os.path.expanduser(dir)
    os.makedirs(os.path.expanduser(dir), exist_ok=True)
    if args.training:
        # make dir for samples and checkpoints if training the model
        os.makedirs(os.path.expanduser(osp.join(dir, "samples")), exist_ok=True)
        os.makedirs(os.path.expanduser(osp.join(dir, "checkpoints")), exist_ok=True)

    rank = get_rank_without_mpi_import()
    if rank > 0:
        log_suffix = log_suffix + "-rank%03i" % rank

    if format_strs is None:
        if rank == 0:
            format_strs = os.getenv("OPENAI_LOG_FORMAT", "wandb,stdout,log,csv").split(",")
        else:
            format_strs = os.getenv("OPENAI_LOG_FORMAT_MPI", "log").split(",")
    format_strs = filter(None, format_strs)
    output_formats = [make_output_format(f, dir, args, log_suffix) for f in format_strs]

    Logger.CURRENT = Logger(dir=dir, output_formats=output_formats, comm=comm)
    if output_formats:
        log("Logging to %s" % dir)


def _configure_default_logger():
    configure()
    Logger.DEFAULT = Logger.CURRENT


def reset():
    if Logger.CURRENT is not Logger.DEFAULT:
        Logger.CURRENT.close()
        Logger.CURRENT = Logger.DEFAULT
        log("Reset logger")


@contextmanager
def scoped_configure(dir=None, format_strs=None, comm=None):
    # NOTE: stale relative to configure() above, which now takes `args` rather
    # than `dir`; calling this helper as-is would raise a TypeError.
    prevlogger = Logger.CURRENT
    configure(dir=dir, format_strs=format_strs, comm=comm)
    try:
        yield
    finally:
        Logger.CURRENT.close()
        Logger.CURRENT = prevlogger
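A minimal usage sketch for this logger, assuming an `args` namespace carrying the fields `configure` reads (`dir` and `training`; `project` is only needed for the `wandb` format, which is skipped here):

```python
from types import SimpleNamespace
from guided_diffusion import logger

args = SimpleNamespace(dir="demo_run", training=False)  # assumed fields read by configure
logger.configure(args=args, format_strs=["stdout", "csv"])
for step in range(3):
    logger.logkv("step", step)
    logger.logkv_mean("loss", 0.1 * step)  # averaged if logged several times per iteration
    logger.dumpkvs()  # flush this iteration's key/values to every writer
```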
guided_diffusion/losses.py
ADDED
@@ -0,0 +1,77 @@
"""
Helpers for various likelihood-based losses. These are ported from the original
Ho et al. diffusion models codebase:
https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py
"""

import numpy as np

import torch as th


def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    Compute the KL divergence between two gaussians.

    Shapes are automatically broadcasted, so batches can be compared to
    scalars, among other use cases.
    """
    tensor = None
    for obj in (mean1, logvar1, mean2, logvar2):
        if isinstance(obj, th.Tensor):
            tensor = obj
            break
    assert tensor is not None, "at least one argument must be a Tensor"

    # Force variances to be Tensors. Broadcasting helps convert scalars to
    # Tensors, but it does not work for th.exp().
    logvar1, logvar2 = [
        x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor)
        for x in (logvar1, logvar2)
    ]

    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + th.exp(logvar1 - logvar2)
        + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
    )


def approx_standard_normal_cdf(x):
    """
    A fast approximation of the cumulative distribution function of the
    standard normal.
    """
    return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))


def discretized_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Compute the log-likelihood of a Gaussian distribution discretizing to a
    given image.

    :param x: the target images. It is assumed that this was uint8 values,
              rescaled to the range [-1, 1].
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    assert x.shape == means.shape == log_scales.shape
    centered_x = x - means
    inv_stdv = th.exp(-log_scales)
    plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
    cdf_plus = approx_standard_normal_cdf(plus_in)
    min_in = inv_stdv * (centered_x - 1.0 / 255.0)
    cdf_min = approx_standard_normal_cdf(min_in)
    log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
    log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
    cdf_delta = cdf_plus - cdf_min
    log_probs = th.where(
        x < -0.999,
        log_cdf_plus,
        th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
    )
    assert log_probs.shape == x.shape
    return log_probs
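A quick numerical check of `normal_kl`: for two identical standard normals the divergence is zero, and shifting one mean by 1 gives 0.5 nats, matching the closed form 0.5 * (−1 + logσ₂² − logσ₁² + σ₁²/σ₂² + (μ₁ − μ₂)²/σ₂²):

```python
import torch as th

print(normal_kl(th.tensor(0.0), th.tensor(0.0), 0.0, 0.0))  # tensor(0.)
print(normal_kl(th.tensor(0.0), th.tensor(0.0), 1.0, 0.0))  # tensor(0.5000)
```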
guided_diffusion/midi_util.py
ADDED
@@ -0,0 +1,291 @@
import os
import math
import torch
import numpy as np
import pandas as pd
import pretty_midi
import matplotlib as mpl
import matplotlib.pyplot as plt
from . import dist_util
import yaml
from types import SimpleNamespace
from music_rule_guidance.piano_roll_to_chord import piano_roll_to_pretty_midi, KEY_DICT, IND2KEY
from music_rule_guidance.rule_maps import FUNC_DICT, LOSS_DICT
from music_rule_guidance.music_rules import MAX_PIANO, MIN_PIANO

plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

# MIDI control-change number for the sustain pedal (CC 64); referenced by
# get_full_piano_roll below but not defined elsewhere in this file
CC_SUSTAIN_PEDAL = 64

# bounds to compute classes for nd editing
VERTICAL_ND_BOUNDS = [1.29, 2.7578125, 3.61, 4.4921875, 5.28125, 6.1171875, 7.22]
VERTICAL_ND_CENTER = [0.56, 2.0239, 3.1839, 4.0511, 4.8867, 5.6992, 6.6686, 7.77]
HORIZONTAL_ND_BOUNDS = [1.8, 2.6, 3.2, 3.6, 4.4, 4.8, 5.8]
HORIZONTAL_ND_CENTER = [1.4, 2.2000, 2.9, 3.4, 4.0, 4.6, 5.3, 6.3]


def dict_to_obj(d):
    if isinstance(d, list):
        d = [dict_to_obj(x) if isinstance(x, dict) else x for x in d]
    if not isinstance(d, dict):
        return d
    return SimpleNamespace(**{k: dict_to_obj(v) for k, v in d.items()})


def load_config(filename):
    with open(filename, 'r') as file:
        data = yaml.safe_load(file)
    # Convert the dictionary to an object
    data_obj = dict_to_obj(data)
    return data_obj


@torch.no_grad()
def decode_sample_for_midi(sample, embed_model=None, scale_factor=1., threshold=-0.95):
    # decode latent samples to a long piano roll of [0, 127]
    sample = sample / scale_factor

    if embed_model is not None:
        image_size_h = sample.shape[-2]
        image_size_w = sample.shape[-1]
        if image_size_h > image_size_w:  # transposed for raster col, don't need to permute for pixel space
            sample = sample.permute(0, 1, 3, 2)  # vertical axis means pitch after transpose
        num_latents = sample.shape[-1] // sample.shape[-2]
        if image_size_h >= image_size_w:
            sample = torch.chunk(sample, num_latents, dim=-1)  # B x C x H x W
            sample = torch.concat(sample, dim=0)  # 1st second for all batch, 2nd second for all batch, ...
        sample = embed_model.decode(sample)
        if image_size_h >= image_size_w:
            sample = torch.concat(torch.chunk(sample, num_latents, dim=0), dim=-1)

    sample[sample <= threshold] = -1.  # heuristic thresholding the background
    sample = ((sample + 1) * 63.5).clamp(0, 127).to(torch.uint8)
    sample = sample.permute(0, 2, 3, 1)
    sample = sample.contiguous()
    return sample


def save_piano_roll_midi(sample, save_dir, fs=100, y=None, save_piano_roll=False, save_ind=0):
    # input shape: B x 128 (pitch) x time (no pedal) or B x 2 (pedal) x 128 x time (with pedal)
    fig_size = sample.shape[-1] // 128 * 3
    plt.rcParams["figure.figsize"] = (fig_size, 3)
    pedal = True if len(sample.shape) == 4 else False
    onset = True if sample.shape[1] == 3 else False
    for i in range(sample.shape[0]):
        cur_sample = sample[i]
        if cur_sample.shape[-1] < 5000 and save_piano_roll:  # do not save piano rolls that are too long
            if pedal:
                plt.imshow(cur_sample[0, ::-1], vmin=0, vmax=127)
            else:
                plt.imshow(cur_sample[::-1], vmin=0, vmax=127)
            plt.savefig(os.path.join(save_dir, "prsample_" + str(i) + ".png"))
        if onset:
            # add onset for first column
            first_column = cur_sample[0, :, 0]
            first_onset_pitch = first_column.nonzero()[0]
            cur_sample[1, first_onset_pitch, 0] = 127
        cur_sample = cur_sample.astype(np.float32)
        pm = piano_roll_to_pretty_midi(cur_sample, fs=fs)
        if y is not None:
            save_name = 'sample_' + str(i + save_ind) + '_y_' + str(y[i].item()) + '.midi'
        else:
            save_name = 'sample_' + str(i + save_ind) + '.midi'
        pm.write(os.path.join(save_dir, save_name))
    return


def eval_rule_loss(generated_samples, target_rules):
    results = {}
    batch_size = generated_samples.shape[0]
    for rule_name, rule_target in target_rules.items():
        rule_target_list = rule_target.tolist()
        if batch_size == 1:
            rule_target_list = [rule_target_list]
        results[rule_name + '.target_rule'] = rule_target_list
        rule_target = rule_target.to(generated_samples.device)
        if 'chord' in rule_name:
            gen_rule, key, corr = FUNC_DICT[rule_name](generated_samples, return_key=True)
            key_strings = [IND2KEY[key_ind] for key_ind in key]
            loss = LOSS_DICT[rule_name](gen_rule, rule_target)
            mean_loss, std_loss, gen_rule, loss = loss.mean(), loss.std(), gen_rule.tolist(), loss.tolist()
            if batch_size == 1:
                gen_rule = [gen_rule]
            results[rule_name + '.gen_rule'] = gen_rule
            results[rule_name + '.key_str'] = key_strings
            results[rule_name + '.key_corr'] = corr
            results[rule_name + '.loss'] = loss
        else:
            gen_rule = FUNC_DICT[rule_name](generated_samples)
            loss = LOSS_DICT[rule_name](gen_rule, rule_target)
            mean_loss, std_loss, gen_rule, loss = loss.mean(), loss.std(), gen_rule.tolist(), loss.tolist()
            if batch_size == 1:
                gen_rule = [gen_rule]
            results[rule_name + '.gen_rule'] = gen_rule
            results[rule_name + '.loss'] = loss
    return pd.DataFrame(results)


def compute_rule(generated_samples, orig_samples, target_rules):
    results = {}
    batch_size = generated_samples.shape[0]
    for rule_name in target_rules:
        rule_target = FUNC_DICT[rule_name](orig_samples)
        rule_target_list = rule_target.tolist()
        if batch_size == 1:
            rule_target_list = [rule_target_list]
        results[rule_name + '.target_rule'] = rule_target_list
        rule_target = rule_target.to(generated_samples.device)
        if rule_name == 'chord_progression':
            gen_rule, key, corr = FUNC_DICT[rule_name](generated_samples, return_key=True)
            key_strings = [IND2KEY[key_ind] for key_ind in key]
            loss = LOSS_DICT[rule_name](gen_rule, rule_target)
            mean_loss, std_loss, gen_rule, loss = loss.mean(), loss.std(), gen_rule.tolist(), loss.tolist()
            if batch_size == 1:
                gen_rule = [gen_rule]
            results[rule_name + '.gen_rule'] = gen_rule
            results[rule_name + '.key_str'] = key_strings
            results[rule_name + '.key_corr'] = corr
            results[rule_name + '.loss'] = loss
        else:
            gen_rule = FUNC_DICT[rule_name](generated_samples)
            loss = LOSS_DICT[rule_name](gen_rule, rule_target)
            mean_loss, std_loss, gen_rule, loss = loss.mean(), loss.std(), gen_rule.tolist(), loss.tolist()
            if batch_size == 1:
                gen_rule = [gen_rule]
            results[rule_name + '.gen_rule'] = gen_rule
            results[rule_name + '.loss'] = loss
    return pd.DataFrame(results)


def visualize_piano_roll(piano_roll):
    """
    Assuming piano roll has shape Bx1x128x1024, and the values are between [-1, 1], on gpu.
    Visualize with some gap in between the first 256 and the last 256 columns.
    """
    piano_roll = torch.flip(piano_roll, [2])
    piano_roll = (piano_roll + 1) / 2.
    vis_length = 256
    gap = 80
    plt.rcParams["figure.figsize"] = (12, 3)
    data = torch.zeros(128, vis_length * 2 + gap)
    data[:, :vis_length] = piano_roll[0, 0, :, :vis_length]
    data[:, -vis_length:] = piano_roll[0, 0, :, -vis_length:]
    data_clone = data.clone()
    # make it look thicker
    data[1:, :] = data[1:, :] + data_clone[:-1, :]
    data[2:, :] = data[2:, :] + data_clone[:-2, :]
    data = data.cpu().numpy()
    plt.imshow(data, cmap=mpl.colormaps['Blues'])
    ax = plt.gca()  # gca stands for 'get current axis'
    for edge, spine in ax.spines.items():
        spine.set_linewidth(2)  # Adjust the value as per your requirement
    plt.grid(color='gray', linestyle='-', linewidth=2., alpha=0.5, which='both', axis='x')
    plt.xticks(
        np.concatenate((np.arange(0, vis_length + 1, 128), np.arange(vis_length + gap, vis_length * 2 + gap, 128))))
    # plt.savefig('piano_roll_example.png', bbox_inches='tight', pad_inches=0.1, dpi=300)
    plt.tick_params(axis='both', which='both', length=0, labelbottom=False, labelleft=False)
    plt.tight_layout()
    plt.show()

    plt.rcParams["figure.figsize"] = (3, 3)
    for i in range(2):
        plt.imshow(data[:, i*128: (i+1)*128], cmap=mpl.colormaps['Blues'])
        ax = plt.gca()
        for edge, spine in ax.spines.items():
            spine.set_linewidth(2)
        plt.tick_params(axis='both', which='both', length=0, labelbottom=False, labelleft=False)
        plt.tight_layout()
        plt.show()

    for i in range(-2, 0):
        if (i+1)*128 < 0:
            plt.imshow(data[:, i*128: (i+1)*128], cmap=mpl.colormaps['Blues'])
        else:
            plt.imshow(data[:, i*128:], cmap=mpl.colormaps['Blues'])
        ax = plt.gca()
        for edge, spine in ax.spines.items():
            spine.set_linewidth(2)
        plt.tick_params(axis='both', which='both', length=0, labelbottom=False, labelleft=False)
        plt.tight_layout()
        plt.show()

    return


def visualize_full_piano_roll(midi_file_name, fs=100):
    """
    Visualize full piano roll from midi file
    """
    midi_data = pretty_midi.PrettyMIDI(midi_file_name)
    # do not process sustain pedal
    piano_roll = torch.tensor(midi_data.get_piano_roll(fs=fs, pedal_threshold=None))
    data = torch.flip(piano_roll, [0])
    plt.rcParams["figure.figsize"] = (12, 3)
    # data_clone = data.clone()
    # # make it look thicker
    # data[1:, :] = data[1:, :] + data_clone[:-1, :]
    # data[2:, :] = data[2:, :] + data_clone[:-2, :]
    data = data.cpu().numpy()
    plt.imshow(data, cmap=mpl.colormaps['Blues'])
    ax = plt.gca()  # gca stands for 'get current axis'
    for edge, spine in ax.spines.items():
        spine.set_linewidth(2)  # Adjust the value as per your requirement
    plt.grid(color='gray', linestyle='-', linewidth=2., alpha=0.5, which='both', axis='x')
    plt.xticks(np.arange(0, piano_roll.shape[1], 128))
    # plt.savefig('piano_roll_example.png', bbox_inches='tight', pad_inches=0.1, dpi=300)
    plt.tick_params(axis='both', which='both', length=0, labelbottom=False, labelleft=False)
    plt.tight_layout()
    plt.show()
    return


def plot_record(vals, title, save_dir):
    ts = [item[0] for item in vals]
    log_probs = [item[1] for item in vals]
    plt.plot(ts, log_probs)
    plt.gca().invert_xaxis()
    plt.title(title)
    plt.savefig(save_dir + '/' + title + '.png')
    plt.show()
    return


def quantize_pedal(value, num_bins=8):
    """Quantize an integer value from 0 to 127 into 8 bins and return the center value of the bin."""
    if value < 0 or value > 127:
        raise ValueError("Value should be between 0 and 127")
    # Determine bin size
    bin_size = 128 // num_bins  # 16
    # Quantize the value
    bin_index = value // bin_size
    bin_center = bin_size * bin_index + bin_size // 2
    # Handle edge case for the last bin
    if bin_center > 127:
        bin_center = 127
    return bin_center


def get_full_piano_roll(midi_data, fs, show=False):
    # do not process sustain pedal
    piano_roll, onset_roll = midi_data.get_piano_roll(fs=fs, pedal_threshold=None, onset=True)
    # save pedal roll explicitly
    pedal_roll = np.zeros_like(piano_roll)
    # process pedal
    for instru in midi_data.instruments:
        pedal_changes = [_e for _e in instru.control_changes if _e.number == CC_SUSTAIN_PEDAL]
        for cc in pedal_changes:
            time_now = int(cc.time * fs)
            if time_now < pedal_roll.shape[-1]:
                # need to distinguish control_change 0 and background 0, with quantize 0-16 will be 8
                # in MuseScore files, 0 immediately followed by 127, need to shift by one column
                if pedal_roll[MIN_PIANO, time_now] != 0. and abs(pedal_roll[MIN_PIANO, time_now] - cc.value) > 64:
                    # use shift 2 here to prevent missing change when using interpolation augmentation
                    pedal_roll[MIN_PIANO:MAX_PIANO + 1, min(time_now + 2, pedal_roll.shape[-1] - 1)] = quantize_pedal(cc.value)
                else:
                    pedal_roll[MIN_PIANO:MAX_PIANO + 1, time_now] = quantize_pedal(cc.value)
    full_roll = np.concatenate((piano_roll[None], onset_roll[None], pedal_roll[None]), axis=0)
    if show:
        plt.imshow(piano_roll[::-1, :1024], vmin=0, vmax=127)
        plt.show()
        plt.imshow(pedal_roll[::-1, :1024], vmin=0, vmax=127)
        plt.show()
    return full_roll
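For concreteness, `quantize_pedal` with the default `num_bins=8` maps CC values into bins of width 16 and returns the bin centers 8, 24, ..., 120:

```python
for v in (0, 15, 16, 64, 127):
    print(v, "->", quantize_pedal(v))
# 0 -> 8, 15 -> 8, 16 -> 24, 64 -> 72, 127 -> 120
```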
guided_diffusion/nn.py
ADDED
@@ -0,0 +1,170 @@
"""
Various utilities for neural networks.
"""

import math

import torch as th
import torch.nn as nn


# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
class SiLU(nn.Module):
    def forward(self, x):
        return x * th.sigmoid(x)


class GroupNorm32(nn.GroupNorm):
    def forward(self, x):
        return super().forward(x.float()).type(x.dtype)


def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module.
    """
    if dims == 1:
        return nn.Conv1d(*args, **kwargs)
    elif dims == 2:
        return nn.Conv2d(*args, **kwargs)
    elif dims == 3:
        return nn.Conv3d(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")


def linear(*args, **kwargs):
    """
    Create a linear module.
    """
    return nn.Linear(*args, **kwargs)


def avg_pool_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D average pooling module.
    """
    if dims == 1:
        return nn.AvgPool1d(*args, **kwargs)
    elif dims == 2:
        return nn.AvgPool2d(*args, **kwargs)
    elif dims == 3:
        return nn.AvgPool3d(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")


def update_ema(target_params, source_params, rate=0.99):
    """
    Update target parameters to be closer to those of source parameters using
    an exponential moving average.

    :param target_params: the target parameter sequence.
    :param source_params: the source parameter sequence.
    :param rate: the EMA rate (closer to 1 means slower).
    """
    for targ, src in zip(target_params, source_params):
        targ.detach().mul_(rate).add_(src, alpha=1 - rate)


def zero_module(module):
    """
    Zero out the parameters of a module and return it.
    """
    for p in module.parameters():
        p.detach().zero_()
    return module


def scale_module(module, scale):
    """
    Scale the parameters of a module and return it.
    """
    for p in module.parameters():
        p.detach().mul_(scale)
    return module


def mean_flat(tensor):
    """
    Take the mean over all non-batch dimensions.
    """
    return tensor.mean(dim=list(range(1, len(tensor.shape))))


def normalization(channels):
    """
    Make a standard normalization layer.

    :param channels: number of input channels.
    :return: an nn.Module for normalization.
    """
    return GroupNorm32(32, channels)


def timestep_embedding(timesteps, dim, max_period=10000):
    """
    Create sinusoidal timestep embeddings.

    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    half = dim // 2
    freqs = th.exp(
        -math.log(max_period) * th.arange(start=0, end=half, dtype=th.float32) / half
    ).to(device=timesteps.device)
    args = timesteps[:, None].float() * freqs[None]
    embedding = th.cat([th.cos(args), th.sin(args)], dim=-1)
    if dim % 2:
        embedding = th.cat([embedding, th.zeros_like(embedding[:, :1])], dim=-1)
    return embedding


def checkpoint(func, inputs, params, flag):
    """
    Evaluate a function without caching intermediate activations, allowing for
    reduced memory at the expense of extra compute in the backward pass.

    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    """
    if flag:
        args = tuple(inputs) + tuple(params)
        return CheckpointFunction.apply(func, len(inputs), *args)
    else:
        return func(*inputs)


class CheckpointFunction(th.autograd.Function):
    @staticmethod
    def forward(ctx, run_function, length, *args):
        ctx.run_function = run_function
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])
        with th.no_grad():
            output_tensors = ctx.run_function(*ctx.input_tensors)
        return output_tensors

    @staticmethod
    def backward(ctx, *output_grads):
        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
        with th.enable_grad():
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
            # Tensors.
            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
            output_tensors = ctx.run_function(*shallow_copies)
        input_grads = th.autograd.grad(
            output_tensors,
            ctx.input_tensors + ctx.input_params,
            output_grads,
            allow_unused=True,
        )
        del ctx.input_tensors
        del ctx.input_params
        del output_tensors
        return (None, None) + input_grads
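A small shape-and-value check for `timestep_embedding`: with `dim=4` there are two frequencies, cosines first, so `t = 0` maps to `[1, 1, 0, 0]`:

```python
import torch as th

emb = timestep_embedding(th.tensor([0, 10]), dim=4)
print(emb.shape)  # torch.Size([2, 4])
print(emb[0])     # tensor([1., 1., 0., 0.]) -- cos, cos, sin, sin at t = 0
```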
guided_diffusion/pr_datasets_all.py
ADDED
@@ -0,0 +1,183 @@
import math
import random
import os
import pandas as pd
import csv
import re
from PIL import Image
import blobfile as bf
from mpi4py import MPI
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from music_rule_guidance import music_rules
from music_rule_guidance.rule_maps import FUNC_DICT

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (6,3)
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

# This file loads the merged dataset, with y carrying its dataset info


def load_data(
    *,
    data_dir,
    batch_size,
    class_cond=False,
    deterministic=False,
    image_size=1024,
    rule=None,
):
    """
    For a dataset, create a generator over (images, kwargs) pairs.

    Each images is an NCHW float tensor, and the kwargs dict contains zero or
    more keys, each of which map to a batched Tensor of their own.
    The kwargs dict can be used for class labels, in which case the key is "y"
    and the values are integer tensors of class labels.

    :param data_dir: the csv file that contains all the data paths and classes.
    :param batch_size: the batch size of each returned pair.
    :param image_size: the size to which images are resized.
    :param class_cond: if True, include a "y" key in returned dicts for class
                       label. If classes are not available and this is true, an
                       exception will be raised.
    :param deterministic: if True, yield results in a deterministic order.
    :param rule: a str that contains the name of the rule
    """

    df = pd.read_csv(data_dir)
    all_files = df['midi_filename'].tolist()
    classes = None
    if class_cond:
        classes = df['classes'].tolist()
    if deterministic:
        dataset = ImageDataset(
            all_files,
            classes=classes,
            shard=MPI.COMM_WORLD.Get_rank(),
            num_shards=MPI.COMM_WORLD.Get_size(),
            image_size=image_size,
            rule=rule,
            pitch_shift=False,
            time_stretch=False,
        )
    else:
        dataset = ImageDataset(
            all_files,
            classes=classes,
            shard=MPI.COMM_WORLD.Get_rank(),
            num_shards=MPI.COMM_WORLD.Get_size(),
            image_size=image_size,
            rule=rule,
        )
    if deterministic:
        loader = DataLoader(
            dataset, batch_size=batch_size, shuffle=False, num_workers=1, drop_last=True
        )
    else:
        loader = DataLoader(
            dataset, batch_size=batch_size, shuffle=True, num_workers=1, drop_last=True
        )
    while True:
        yield from loader


def key_shift(x, k):
    # apply shift on both notes and onset
    # x sample (batch x 3 x pitch x time)
    # k number of pitches to shift
    # only apply on (batch x 2 x pitch x time) because no key shift on pedal

    pitches_and_onsets = x[:, :2, :, :]
    pedals = x[:, 2:, :, :]

    if k > 0:
        pitches_and_onsets = torch.cat((pitches_and_onsets[:, :, k:, :], pitches_and_onsets[:, :, 0:k, :]), dim=2)
    elif k < 0:
        pitches_and_onsets = torch.cat((pitches_and_onsets[:, :, -k:, :], pitches_and_onsets[:, :, 0:-k, :]), dim=2)

    x = torch.cat((pitches_and_onsets, pedals), dim=1)
    return music_rules.piano_like(x)


class ImageDataset(Dataset):
    def __init__(
        self,
        image_paths,
        classes=None,
        rule=None,
        shard=0,
        num_shards=1,
        image_size=1024,
        pitch_shift=True,
        time_stretch=True,
    ):
        super().__init__()
        self.local_images = image_paths[shard:][::num_shards]
        self.local_classes = None if classes is None else classes[shard:][::num_shards]
        self.rule = rule
        self.pitch_shift = pitch_shift
        self.time_stretch = time_stretch
        self.image_size = image_size

    def __len__(self):
        return len(self.local_images)

    def __getitem__(self, idx):
        path = self.local_images[idx]
        arr = np.load(path)[np.newaxis]  # 1 x 2 x 128 x time
        arr = arr.astype(np.float32) / 63.5 - 1
        arr = torch.from_numpy(arr)

        if self.time_stretch:  # apply for both notes and pedal
            pr_len = int(np.random.uniform(0.95, 1.05) * self.image_size)
            start = np.random.randint(arr.shape[-1] - pr_len)
            arr = arr[:, :, :, start:start+pr_len]
            if pr_len < self.image_size:  # stretching, prevent duplicating onsets
                piano_pedal = arr[:, [0, 2], :, :]
                piano_pedal = F.interpolate(piano_pedal, size=(128, self.image_size), mode='nearest')
                onset_raw = arr[:, 1:2, :, :]
                ind_a2b = (torch.arange(self.image_size)/self.image_size*pr_len).int()
                ind = ind_a2b.diff().nonzero().squeeze() + 1
                zero_tensor = torch.tensor([0])
                ind = torch.concat((zero_tensor, ind))
                onset = -torch.ones(1, 1, 128, self.image_size)
                onset[:, :, :, ind] = onset_raw
                arr = torch.concat((piano_pedal[:, :1, :, :], onset, piano_pedal[:, 1:, :, :]), dim=1)
            if pr_len > self.image_size:  # compressing, add onset if happen to drop onsets and keep durations
                arr = F.interpolate(arr, size=(128, self.image_size), mode='nearest')
                piano = arr[:, :1, :, :]
                first_column = piano[:, :, :, :1]
                padded_piano = torch.concat((first_column, piano), dim=-1)
                onset_online = torch.diff(padded_piano, dim=-1)
                mask = onset_online > 0
                arr[:, 1:2, :, :][mask] = 1
        else:
            arr = arr[:, :, :, :self.image_size]
        if self.pitch_shift:  # only apply for notes
            k = np.random.randint(-6, 7)  # generate randint from -6 to +6
            arr = key_shift(arr, k)

        arr = music_rules.piano_like(arr)  # also set pedal roll to be 0 for non-piano pitches (match VAE training)

        out_dict = {}
        if self.rule is not None:
            if 'chord' in self.rule:  # predict chord and key jointly
                chord, key, _ = FUNC_DICT[self.rule](arr, return_key=True)
                out_dict["chord"] = chord
                out_dict["key"] = np.array(key, dtype=np.int64)
            else:
                out_dict[self.rule] = FUNC_DICT[self.rule](arr)
        if self.local_classes is not None:
            out_dict["y"] = np.array(self.local_classes[idx], dtype=np.int64)
        # debug
        # out_dict["path"] = path
        # Remove the extra dimensions to get back a 3D tensor: 2x128x128
        arr = arr.squeeze(0)
        return arr, out_dict
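A usage sketch for the loader; the CSV path and rule name below are assumptions (the CSV must provide a `midi_filename` column pointing at the preprocessed `.npy` rolls, a `classes` column when `class_cond` is set, and the rule must be a key of `FUNC_DICT`):

```python
data = load_data(
    data_dir="datasets/all_midi.csv",  # assumed to hold midi_filename/classes columns
    batch_size=4,
    class_cond=True,
    image_size=1024,
    rule="note_density",  # assumed rule name; must exist in FUNC_DICT
)
batch, cond = next(data)  # piano-roll tensors in [-1, 1] plus "y" and the rule targets
```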
guided_diffusion/resample.py
ADDED
@@ -0,0 +1,154 @@
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import torch as th
|
5 |
+
import torch.distributed as dist
|
6 |
+
|
7 |
+
|
8 |
+
def create_named_schedule_sampler(name, diffusion):
|
9 |
+
"""
|
10 |
+
Create a ScheduleSampler from a library of pre-defined samplers.
|
11 |
+
|
12 |
+
:param name: the name of the sampler.
|
13 |
+
:param diffusion: the diffusion object to sample for.
|
14 |
+
"""
|
15 |
+
if name == "uniform":
|
16 |
+
return UniformSampler(diffusion)
|
17 |
+
elif name == "loss-second-moment":
|
18 |
+
return LossSecondMomentResampler(diffusion)
|
19 |
+
else:
|
20 |
+
raise NotImplementedError(f"unknown schedule sampler: {name}")
|
21 |
+
|
22 |
+
|
23 |
+
class ScheduleSampler(ABC):
|
24 |
+
"""
|
25 |
+
A distribution over timesteps in the diffusion process, intended to reduce
|
26 |
+
variance of the objective.
|
27 |
+
|
28 |
+
By default, samplers perform unbiased importance sampling, in which the
|
29 |
+
objective's mean is unchanged.
|
30 |
+
However, subclasses may override sample() to change how the resampled
|
31 |
+
terms are reweighted, allowing for actual changes in the objective.
|
32 |
+
"""
|
33 |
+
|
34 |
+
@abstractmethod
|
35 |
+
def weights(self):
|
36 |
+
"""
|
37 |
+
Get a numpy array of weights, one per diffusion step.
|
38 |
+
|
39 |
+
The weights needn't be normalized, but must be positive.
|
40 |
+
"""
|
41 |
+
|
42 |
+
def sample(self, batch_size, device):
|
43 |
+
"""
|
44 |
+
Importance-sample timesteps for a batch.
|
45 |
+
|
46 |
+
:param batch_size: the number of timesteps.
|
47 |
+
        :param device: the torch device to save to.
        :return: a tuple (timesteps, weights):
                 - timesteps: a tensor of timestep indices.
                 - weights: a tensor of weights to scale the resulting losses.
        """
        w = self.weights()
        p = w / np.sum(w)
        indices_np = np.random.choice(len(p), size=(batch_size,), p=p)
        indices = th.from_numpy(indices_np).long().to(device)
        weights_np = 1 / (len(p) * p[indices_np])
        weights = th.from_numpy(weights_np).float().to(device)
        return indices, weights


class UniformSampler(ScheduleSampler):
    def __init__(self, diffusion):
        self.diffusion = diffusion
        self._weights = np.ones([diffusion.num_timesteps])

    def weights(self):
        return self._weights


class LossAwareSampler(ScheduleSampler):
    def update_with_local_losses(self, local_ts, local_losses):
        """
        Update the reweighting using losses from a model.

        Call this method from each rank with a batch of timesteps and the
        corresponding losses for each of those timesteps.
        This method will perform synchronization to make sure all of the ranks
        maintain the exact same reweighting.

        :param local_ts: an integer Tensor of timesteps.
        :param local_losses: a 1D Tensor of losses.
        """
        batch_sizes = [
            th.tensor([0], dtype=th.int32, device=local_ts.device)
            for _ in range(dist.get_world_size())
        ]
        dist.all_gather(
            batch_sizes,
            th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
        )

        # Pad all_gather batches to be the maximum batch size.
        batch_sizes = [x.item() for x in batch_sizes]
        max_bs = max(batch_sizes)

        timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes]
        loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes]
        dist.all_gather(timestep_batches, local_ts)
        dist.all_gather(loss_batches, local_losses)
        timesteps = [
            x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
        ]
        losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
        self.update_with_all_losses(timesteps, losses)

    @abstractmethod
    def update_with_all_losses(self, ts, losses):
        """
        Update the reweighting using losses from a model.

        Sub-classes should override this method to update the reweighting
        using losses from the model.

        This method directly updates the reweighting without synchronizing
        between workers. It is called by update_with_local_losses from all
        ranks with identical arguments. Thus, it should have deterministic
        behavior to maintain state across workers.

        :param ts: a list of int timesteps.
        :param losses: a list of float losses, one per timestep.
        """


class LossSecondMomentResampler(LossAwareSampler):
    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
        self.diffusion = diffusion
        self.history_per_term = history_per_term
        self.uniform_prob = uniform_prob
        self._loss_history = np.zeros(
            [diffusion.num_timesteps, history_per_term], dtype=np.float64
        )
        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=int)

    def weights(self):
        if not self._warmed_up():
            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
        weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
        weights /= np.sum(weights)
        weights *= 1 - self.uniform_prob
        weights += self.uniform_prob / len(weights)
        return weights

    def update_with_all_losses(self, ts, losses):
        for t, loss in zip(ts, losses):
            if self._loss_counts[t] == self.history_per_term:
                # Shift out the oldest loss term.
                self._loss_history[t, :-1] = self._loss_history[t, 1:]
                self._loss_history[t, -1] = loss
            else:
                self._loss_history[t, self._loss_counts[t]] = loss
                self._loss_counts[t] += 1

    def _warmed_up(self):
        return (self._loss_counts == self.history_per_term).all()
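Note: a minimal usage sketch for the samplers above (illustrative, not part of this commit; `diffusion` is a placeholder for an object exposing `num_timesteps`, as built elsewhere in this repo):

# Hypothetical sketch: draw importance-sampled timesteps, then feed per-example
# losses back so the resampler can upweight timesteps with large loss second moments.
sampler = LossSecondMomentResampler(diffusion)
t, weights = sampler.sample(batch_size=8, device=th.device("cpu"))
fake_losses = th.rand(8)  # stand-in for real per-example diffusion losses
sampler.update_with_all_losses(t.tolist(), fake_losses.tolist())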
guided_diffusion/respace.py
ADDED
@@ -0,0 +1,128 @@
import numpy as np
import torch as th

from .gaussian_diffusion import GaussianDiffusion


def space_timesteps(num_timesteps, section_counts):
    """
    Create a list of timesteps to use from an original diffusion process,
    given the number of timesteps we want to take from equally-sized portions
    of the original process.

    For example, if there's 300 timesteps and the section counts are [10,15,20]
    then the first 100 timesteps are strided to be 10 timesteps, the second 100
    are strided to be 15 timesteps, and the final 100 are strided to be 20.

    If the stride is a string starting with "ddim", then the fixed striding
    from the DDIM paper is used, and only one section is allowed.

    :param num_timesteps: the number of diffusion steps in the original
                          process to divide up.
    :param section_counts: either a list of numbers, or a string containing
                           comma-separated numbers, indicating the step count
                           per section. As a special case, use "ddimN" where N
                           is a number of steps to use the striding from the
                           DDIM paper.
    :return: a set of diffusion steps from the original process to use.
    """
    if isinstance(section_counts, str):
        if section_counts.startswith("ddim"):
            desired_count = int(section_counts[len("ddim") :])
            for i in range(1, num_timesteps):
                if len(range(0, num_timesteps, i)) == desired_count:
                    return set(range(0, num_timesteps, i))
            raise ValueError(
                f"cannot create exactly {num_timesteps} steps with an integer stride"
            )
        section_counts = [int(x) for x in section_counts.split(",")]
    size_per = num_timesteps // len(section_counts)
    extra = num_timesteps % len(section_counts)
    start_idx = 0
    all_steps = []
    for i, section_count in enumerate(section_counts):
        size = size_per + (1 if i < extra else 0)
        if size < section_count:
            raise ValueError(
                f"cannot divide section of {size} steps into {section_count}"
            )
        if section_count <= 1:
            frac_stride = 1
        else:
            frac_stride = (size - 1) / (section_count - 1)
        cur_idx = 0.0
        taken_steps = []
        for _ in range(section_count):
            taken_steps.append(start_idx + round(cur_idx))
            cur_idx += frac_stride
        all_steps += taken_steps
        start_idx += size
    return set(all_steps)


class SpacedDiffusion(GaussianDiffusion):
    """
    A diffusion process which can skip steps in a base diffusion process.

    :param use_timesteps: a collection (sequence or set) of timesteps from the
                          original diffusion process to retain.
    :param kwargs: the kwargs to create the base diffusion process.
    """

    def __init__(self, use_timesteps, **kwargs):
        self.use_timesteps = set(use_timesteps)
        self.timestep_map = []
        self.original_num_steps = len(kwargs["betas"])

        base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
        last_alpha_cumprod = 1.0
        new_betas = []
        for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
            if i in self.use_timesteps:
                new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
                last_alpha_cumprod = alpha_cumprod
                self.timestep_map.append(i)
        kwargs["betas"] = np.array(new_betas)
        super().__init__(**kwargs)

    def p_mean_variance(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)

    def training_losses(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        return super().training_losses(self._wrap_model(model), *args, **kwargs)

    def condition_mean(self, cond_fn, *args, **kwargs):
        return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)

    def condition_score(self, cond_fn, *args, **kwargs):
        return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)

    def _wrap_model(self, model):
        if isinstance(model, _WrappedModel):
            return model
        return _WrappedModel(
            model, self.timestep_map, self.rescale_timesteps, self.original_num_steps
        )

    def _scale_timesteps(self, t):
        # Scaling is done by the wrapped model.
        return t


class _WrappedModel:
    def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps):
        self.model = model
        self.timestep_map = timestep_map
        self.rescale_timesteps = rescale_timesteps
        self.original_num_steps = original_num_steps

    def __call__(self, x, ts, **kwargs):
        map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
        new_ts = map_tensor[ts]
        if self.rescale_timesteps:
            new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
        return self.model(x, new_ts, **kwargs)
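Note: a small standalone sanity sketch of how `space_timesteps` behaves (illustrative, not part of this commit):

# Splitting 300 original steps into sections that retain 10, 15 and 20 steps.
steps = space_timesteps(300, [10, 15, 20])
assert len(steps) == 45
# DDIM-style fixed striding: keep 50 evenly strided steps out of 1000.
assert space_timesteps(1000, "ddim50") == set(range(0, 1000, 20))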
guided_diffusion/script_util.py
ADDED
@@ -0,0 +1,531 @@
import argparse
import inspect
import torch.nn.functional as F

from music_rule_guidance import music_rules
from . import gaussian_diffusion as gd
from .respace import SpacedDiffusion, space_timesteps
from .unet import SuperResModel, UNetModel, EncoderUNetModel

NUM_CLASSES = 3  # number of datasets


def diffusion_defaults():
    """
    Defaults for image and classifier training.
    """
    return dict(
        learn_sigma=False,
        diffusion_steps=1000,
        noise_schedule="linear",
        timestep_respacing="",
        use_kl=False,
        predict_xstart=False,
        rescale_timesteps=False,
        rescale_learned_sigmas=False,
    )


def classifier_defaults():
    """
    Defaults for classifier models.
    """
    return dict(
        image_size=64,
        in_channels=3,
        classifier_use_fp16=False,
        classifier_width=128,
        classifier_depth=2,
        classifier_attention_resolutions="32,16,8",  # 16
        classifier_use_scale_shift_norm=True,  # False
        classifier_resblock_updown=True,  # False
        classifier_pool="attention",
        num_classes=3,
        chord=False,
    )


def model_and_diffusion_image_defaults():
    """
    Defaults for image training.
    """
    res = dict(
        image_size=64,
        in_channels=3,
        num_channels=128,
        num_res_blocks=2,
        num_heads=4,
        num_heads_upsample=-1,
        num_head_channels=-1,
        attention_resolutions="32,16,8",
        channel_mult="",
        dropout=0.0,
        class_cond=False,
        use_checkpoint=False,
        use_scale_shift_norm=True,
        resblock_updown=False,
        use_fp16=False,
        use_new_attention_order=False,
    )
    res.update(diffusion_defaults())
    return res


def model_and_diffusion_defaults():
    """
    Defaults for piano roll training.
    """
    res = dict(
        image_size=128,
        in_channels=1,
        num_channels=128,
        num_res_blocks=2,
        num_heads=4,
        num_heads_upsample=-1,
        num_head_channels=-1,
        attention_resolutions="32,16,8",
        channel_mult="",
        dropout=0.0,
        class_cond=False,
        use_checkpoint=False,
        use_scale_shift_norm=True,
        resblock_updown=False,
        use_fp16=False,
        use_new_attention_order=False,
    )
    res.update(diffusion_defaults())
    return res


def classifier_and_diffusion_defaults():
    res = classifier_defaults()
    res.update(diffusion_defaults())
    return res


def create_diffusion(
    learn_sigma,
    diffusion_steps,
    noise_schedule,
    timestep_respacing,
    use_kl,
    predict_xstart,
    rescale_timesteps,
    rescale_learned_sigmas,
):
    diffusion = create_gaussian_diffusion(
        steps=diffusion_steps,
        learn_sigma=learn_sigma,
        noise_schedule=noise_schedule,
        use_kl=use_kl,
        predict_xstart=predict_xstart,
        rescale_timesteps=rescale_timesteps,
        rescale_learned_sigmas=rescale_learned_sigmas,
        timestep_respacing=timestep_respacing,
    )
    return diffusion


def create_model_and_diffusion(
    image_size,
    in_channels,
    class_cond,
    learn_sigma,
    num_channels,
    num_res_blocks,
    channel_mult,
    num_heads,
    num_head_channels,
    num_heads_upsample,
    attention_resolutions,
    dropout,
    diffusion_steps,
    noise_schedule,
    timestep_respacing,
    use_kl,
    predict_xstart,
    rescale_timesteps,
    rescale_learned_sigmas,
    use_checkpoint,
    use_scale_shift_norm,
    resblock_updown,
    use_fp16,
    use_new_attention_order,
):
    model = create_model(
        image_size,
        num_channels,
        num_res_blocks,
        in_channels,
        channel_mult=channel_mult,
        learn_sigma=learn_sigma,
        class_cond=class_cond,
        use_checkpoint=use_checkpoint,
        attention_resolutions=attention_resolutions,
        num_heads=num_heads,
        num_head_channels=num_head_channels,
        num_heads_upsample=num_heads_upsample,
        use_scale_shift_norm=use_scale_shift_norm,
        dropout=dropout,
        resblock_updown=resblock_updown,
        use_fp16=use_fp16,
        use_new_attention_order=use_new_attention_order,
    )
    diffusion = create_gaussian_diffusion(
        steps=diffusion_steps,
        learn_sigma=learn_sigma,
        noise_schedule=noise_schedule,
        use_kl=use_kl,
        predict_xstart=predict_xstart,
        rescale_timesteps=rescale_timesteps,
        rescale_learned_sigmas=rescale_learned_sigmas,
        timestep_respacing=timestep_respacing,
    )
    return model, diffusion


def create_model(
    image_size,
    num_channels,
    num_res_blocks,
    in_channels=3,
    channel_mult="",
    learn_sigma=False,
    class_cond=False,
    use_checkpoint=False,
    attention_resolutions="16",
    num_heads=4,
    num_head_channels=-1,
    num_heads_upsample=-1,
    use_scale_shift_norm=False,
    dropout=0,
    resblock_updown=False,
    use_fp16=False,
    use_new_attention_order=False,
):
    image_size = image_size[-1]  # if H != W, use W as image_size
    if channel_mult == "":
        if image_size == 512:
            channel_mult = (0.5, 1, 1, 2, 2, 4, 4)
        elif image_size == 256:
            channel_mult = (1, 1, 2, 2, 4, 4)
        elif image_size == 128:
            channel_mult = (1, 1, 2, 3, 4)
        elif image_size == 64:
            channel_mult = (1, 2, 3, 4)
        elif image_size == 32:
            channel_mult = (1, 2, 2, 2)
        elif image_size == 16:
            channel_mult = (1, 2, 2)
        else:
            raise ValueError(f"unsupported image size: {image_size}")
    else:
        channel_mult = tuple(int(ch_mult) for ch_mult in channel_mult.split(","))

    attention_ds = []
    for res in attention_resolutions.split(","):
        attention_ds.append(image_size // int(res))

    return UNetModel(
        image_size=image_size,
        in_channels=in_channels,
        model_channels=num_channels,
        out_channels=(in_channels if not learn_sigma else 2 * in_channels),
        num_res_blocks=num_res_blocks,
        attention_resolutions=tuple(attention_ds),
        dropout=dropout,
        channel_mult=channel_mult,
        num_classes=(NUM_CLASSES if class_cond else None),
        use_checkpoint=use_checkpoint,
        use_fp16=use_fp16,
        num_heads=num_heads,
        num_head_channels=num_head_channels,
        num_heads_upsample=num_heads_upsample,
        use_scale_shift_norm=use_scale_shift_norm,
        resblock_updown=resblock_updown,
        use_new_attention_order=use_new_attention_order,
    )


def create_classifier_and_diffusion(
    image_size,
    in_channels,
    classifier_use_fp16,
    classifier_width,
    classifier_depth,
    classifier_attention_resolutions,
    classifier_use_scale_shift_norm,
    classifier_resblock_updown,
    classifier_pool,
    learn_sigma,
    diffusion_steps,
    noise_schedule,
    timestep_respacing,
    use_kl,
    predict_xstart,
    rescale_timesteps,
    rescale_learned_sigmas,
    num_classes,
    chord,
):
    classifier = create_classifier(
        image_size,
        in_channels,
        classifier_use_fp16,
        classifier_width,
        classifier_depth,
        classifier_attention_resolutions,
        classifier_use_scale_shift_norm,
        classifier_resblock_updown,
        classifier_pool,
        num_classes,
        chord,
    )
    diffusion = create_gaussian_diffusion(
        steps=diffusion_steps,
        learn_sigma=learn_sigma,
        noise_schedule=noise_schedule,
        use_kl=use_kl,
        predict_xstart=predict_xstart,
        rescale_timesteps=rescale_timesteps,
        rescale_learned_sigmas=rescale_learned_sigmas,
        timestep_respacing=timestep_respacing,
    )
    return classifier, diffusion


def create_classifier(
    image_size,
    in_channels,
    classifier_use_fp16,
    classifier_width,
    classifier_depth,
    classifier_attention_resolutions,
    classifier_use_scale_shift_norm,
    classifier_resblock_updown,
    classifier_pool,
    num_classes,
    chord,
):
    image_size = image_size[-1]  # if H != W, use W as image_size
    if image_size == 512:
        channel_mult = (0.5, 1, 1, 2, 2, 4, 4)
    elif image_size == 256:
        channel_mult = (1, 1, 2, 2, 4, 4)
    elif image_size == 128:
        channel_mult = (1, 1, 2, 3, 4)
    elif image_size == 64:
        channel_mult = (1, 2, 3, 4)
    elif image_size == 16:  # debug data load in
        channel_mult = (1, 2, 2)
    else:
        raise ValueError(f"unsupported image size: {image_size}")

    attention_ds = []
    for res in classifier_attention_resolutions.split(","):
        attention_ds.append(image_size // int(res))

    return EncoderUNetModel(
        image_size=image_size,
        in_channels=in_channels,
        model_channels=classifier_width,
        out_channels=num_classes,
        num_res_blocks=classifier_depth,
        attention_resolutions=tuple(attention_ds),
        channel_mult=channel_mult,
        use_fp16=classifier_use_fp16,
        num_head_channels=64,
        use_scale_shift_norm=classifier_use_scale_shift_norm,
        resblock_updown=classifier_resblock_updown,
        pool=classifier_pool,
        chord=chord,
    )


def sr_model_and_diffusion_defaults():
    res = model_and_diffusion_defaults()
    res["large_size"] = 256
    res["small_size"] = 64
    arg_names = inspect.getfullargspec(sr_create_model_and_diffusion)[0]
    for k in res.copy().keys():
        if k not in arg_names:
            del res[k]
    return res


def sr_create_model_and_diffusion(
    large_size,
    small_size,
    class_cond,
    learn_sigma,
    num_channels,
    num_res_blocks,
    num_heads,
    num_head_channels,
    num_heads_upsample,
    attention_resolutions,
    dropout,
    diffusion_steps,
    noise_schedule,
    timestep_respacing,
    use_kl,
    predict_xstart,
    rescale_timesteps,
    rescale_learned_sigmas,
    use_checkpoint,
    use_scale_shift_norm,
    resblock_updown,
    use_fp16,
):
    model = sr_create_model(
        large_size,
        small_size,
        num_channels,
        num_res_blocks,
        learn_sigma=learn_sigma,
        class_cond=class_cond,
        use_checkpoint=use_checkpoint,
        attention_resolutions=attention_resolutions,
        num_heads=num_heads,
        num_head_channels=num_head_channels,
        num_heads_upsample=num_heads_upsample,
        use_scale_shift_norm=use_scale_shift_norm,
        dropout=dropout,
        resblock_updown=resblock_updown,
        use_fp16=use_fp16,
    )
    diffusion = create_gaussian_diffusion(
        steps=diffusion_steps,
        learn_sigma=learn_sigma,
        noise_schedule=noise_schedule,
        use_kl=use_kl,
        predict_xstart=predict_xstart,
        rescale_timesteps=rescale_timesteps,
        rescale_learned_sigmas=rescale_learned_sigmas,
        timestep_respacing=timestep_respacing,
    )
    return model, diffusion


def sr_create_model(
    large_size,
    small_size,
    num_channels,
    num_res_blocks,
    learn_sigma,
    class_cond,
    use_checkpoint,
    attention_resolutions,
    num_heads,
    num_head_channels,
    num_heads_upsample,
    use_scale_shift_norm,
    dropout,
    resblock_updown,
    use_fp16,
):
    _ = small_size  # hack to prevent unused variable

    if large_size == 512:
        channel_mult = (1, 1, 2, 2, 4, 4)
    elif large_size == 256:
        channel_mult = (1, 1, 2, 2, 4, 4)
    elif large_size == 64:
        channel_mult = (1, 2, 3, 4)
    else:
        raise ValueError(f"unsupported large size: {large_size}")

    attention_ds = []
    for res in attention_resolutions.split(","):
        attention_ds.append(large_size // int(res))

    return SuperResModel(
        image_size=large_size,
        in_channels=3,
        model_channels=num_channels,
        out_channels=(3 if not learn_sigma else 6),
        num_res_blocks=num_res_blocks,
        attention_resolutions=tuple(attention_ds),
        dropout=dropout,
        channel_mult=channel_mult,
        num_classes=(NUM_CLASSES if class_cond else None),
        use_checkpoint=use_checkpoint,
        num_heads=num_heads,
        num_head_channels=num_head_channels,
        num_heads_upsample=num_heads_upsample,
        use_scale_shift_norm=use_scale_shift_norm,
        resblock_updown=resblock_updown,
        use_fp16=use_fp16,
    )


def create_gaussian_diffusion(
    *,
    steps=1000,
    learn_sigma=False,
    sigma_small=False,
    noise_schedule="linear",
    use_kl=False,
    predict_xstart=False,
    rescale_timesteps=False,
    rescale_learned_sigmas=False,
    timestep_respacing="",
):
    betas = gd.get_named_beta_schedule(noise_schedule, steps)
    if use_kl:
        loss_type = gd.LossType.RESCALED_KL
    elif rescale_learned_sigmas:
        loss_type = gd.LossType.RESCALED_MSE
    else:
        loss_type = gd.LossType.MSE
    if not timestep_respacing:
        timestep_respacing = [steps]
    return SpacedDiffusion(
        use_timesteps=space_timesteps(steps, timestep_respacing),
        betas=betas,
        model_mean_type=(
            gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X
        ),
        model_var_type=(
            (
                gd.ModelVarType.FIXED_LARGE
                if not sigma_small
                else gd.ModelVarType.FIXED_SMALL
            )
            if not learn_sigma
            else gd.ModelVarType.LEARNED_RANGE
        ),
        loss_type=loss_type,
        rescale_timesteps=rescale_timesteps,
    )


def add_dict_to_argparser(parser, default_dict):
    for k, v in default_dict.items():
        v_type = type(v)
        if v is None:
            v_type = str
        elif isinstance(v, bool):
            v_type = str2bool
        if k == 'image_size':
            parser.add_argument(f"--{k}", nargs='+', default=v, type=v_type)
        else:
            parser.add_argument(f"--{k}", default=v, type=v_type)


def args_to_dict(args, keys):
    return {k: getattr(args, k) for k in keys}


def str2bool(v):
    """
    https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
    """
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    elif v.lower() in ("no", "false", "f", "n", "0"):
        return False
    else:
        raise argparse.ArgumentTypeError("boolean value expected")
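Note: a hedged wiring sketch (not part of this commit) showing how the defaults above are intended to flow through argparse into `create_model_and_diffusion`; it assumes the module-level imports (e.g. `music_rule_guidance`) resolve in your environment:

parser = argparse.ArgumentParser()
add_dict_to_argparser(parser, model_and_diffusion_defaults())
# image_size is registered with nargs='+' so height and width can differ on the CLI;
# create_model then uses the last value (the width) to pick channel_mult.
args = parser.parse_args(["--image_size", "128", "128"])
model, diffusion = create_model_and_diffusion(
    **args_to_dict(args, model_and_diffusion_defaults().keys())
)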
guided_diffusion/train_util.py
ADDED
@@ -0,0 +1,475 @@
import copy
import functools
import os
import os.path as osp
import numpy as np
import math

import blobfile as bf
import torch as th
import torch.nn.functional as F
import torch.distributed as dist
from torch.nn.parallel.distributed import DistributedDataParallel as DDP
from torch.optim import AdamW

from . import dist_util, midi_util, logger
from .fp16_util import MixedPrecisionTrainer
from .nn import update_ema
from .resample import LossAwareSampler, UniformSampler
from taming.modules.distributions.distributions import DiagonalGaussianDistribution

# For ImageNet experiments, this was a good default value.
# We found that the lg_loss_scale quickly climbed to
# 20-21 within the first ~1K steps of training.
INITIAL_LOG_LOSS_SCALE = 20.0


class TrainLoop:
    def __init__(
        self,
        *,
        model,
        eval_model,
        diffusion,
        data,
        batch_size,
        microbatch,
        lr,
        ema_rate,
        log_interval,
        save_interval,
        resume_checkpoint,
        embed_model=None,
        use_fp16=False,
        fp16_scale_growth=1e-3,
        schedule_sampler=None,
        weight_decay=0.0,
        lr_anneal_steps=0,
        eval_data=None,
        eval_interval=-1,
        eval_sample_batch_size=16,
        total_num_gpus=1,  # number of GPUs training runs on; used to distribute classes across GPUs
        eval_sample_use_ddim=True,
        eval_sample_clip_denoised=True,
        in_channels=1,
        fs=100,
        pedal=False,  # whether to decode with pedal as the second channel
        scale_factor=1.,
        num_classes=0,  # >0 enables class-conditional sampling
        microbatch_encode=-1,
        encode_rep=4,
        shift_size=4,  # shift size when generating time-shifted samples from an encoding
    ):
        self.model = model
        self.eval_model = eval_model
        self.embed_model = embed_model
        self.scale_factor = scale_factor
        self.diffusion = diffusion
        self.data = data
        self.batch_size = batch_size
        self.microbatch = microbatch if microbatch > 0 else batch_size
        self.microbatch_encode = microbatch_encode
        self.encode_rep = encode_rep
        self.batch_size = self.batch_size // self.encode_rep  # effective batch size
        self.microbatch = self.microbatch // self.encode_rep
        self.shift_size = shift_size  # needs to be compatible with encode_rep
        self.lr = lr
        self.ema_rate = (
            [ema_rate]
            if isinstance(ema_rate, float)
            else [float(x) for x in ema_rate.split(",")]
        )
        self.log_interval = log_interval
        self.save_interval = save_interval
        self.resume_checkpoint = resume_checkpoint
        self.use_fp16 = use_fp16
        self.fp16_scale_growth = fp16_scale_growth
        self.schedule_sampler = schedule_sampler or UniformSampler(diffusion)
        self.weight_decay = weight_decay
        self.lr_anneal_steps = lr_anneal_steps
        # eval
        self.eval_data = eval_data
        self.eval_interval = eval_interval
        self.total_num_gpus = total_num_gpus
        self.eval_sample_batch_size = eval_sample_batch_size // self.total_num_gpus
        self.eval_sample_use_ddim = eval_sample_use_ddim
        self.eval_sample_clip_denoised = eval_sample_clip_denoised
        self.in_channels = in_channels
        self.fs = fs
        self.pedal = pedal
        self.num_classes = num_classes

        self.step = 0
        self.resume_step = 0
        self.global_batch = self.batch_size * dist.get_world_size()

        self.sync_cuda = th.cuda.is_available()

        self._load_and_sync_parameters()
        self.mp_trainer = MixedPrecisionTrainer(
            model=self.model,
            use_fp16=self.use_fp16,
            fp16_scale_growth=fp16_scale_growth,
        )

        self.opt = AdamW(
            self.mp_trainer.master_params, lr=self.lr, weight_decay=self.weight_decay
        )
        if self.resume_step:
            self._load_optimizer_state()
            # Model was resumed, either due to a restart or a checkpoint
            # being specified at the command line.
            self.ema_params = [
                self._load_ema_parameters(rate) for rate in self.ema_rate
            ]
        else:
            self.ema_params = [
                copy.deepcopy(self.mp_trainer.master_params)
                for _ in range(len(self.ema_rate))
            ]

        if th.cuda.is_available():
            self.use_ddp = True
            self.ddp_model = DDP(
                self.model,
                device_ids=[dist_util.dev()],
                output_device=dist_util.dev(),
                broadcast_buffers=False,
                bucket_cap_mb=128,
                find_unused_parameters=False,
            )
        else:
            if dist.get_world_size() > 1:
                logger.warn(
                    "Distributed training requires CUDA. "
                    "Gradients will not be synchronized properly!"
                )
            self.use_ddp = False
            self.ddp_model = self.model

    def _load_and_sync_parameters(self):
        resume_checkpoint = find_resume_checkpoint() or self.resume_checkpoint

        if resume_checkpoint:
            self.resume_step = parse_resume_step_from_filename(resume_checkpoint)
            logger.log(f"loading model from checkpoint: {resume_checkpoint}...")
            self.model.load_state_dict(
                dist_util.load_state_dict(
                    resume_checkpoint, map_location=dist_util.dev()
                )
            )

        dist_util.sync_params(self.model.parameters())

    def _load_ema_parameters(self, rate):
        ema_params = copy.deepcopy(self.mp_trainer.master_params)

        main_checkpoint = find_resume_checkpoint() or self.resume_checkpoint
        ema_checkpoint = find_ema_checkpoint(main_checkpoint, self.resume_step, rate)
        if ema_checkpoint:
            logger.log(f"loading EMA from checkpoint: {ema_checkpoint}...")
            state_dict = dist_util.load_state_dict(
                ema_checkpoint, map_location=dist_util.dev()
            )
            ema_params = self.mp_trainer.state_dict_to_master_params(state_dict)

        dist_util.sync_params(ema_params)
        return ema_params

    def _load_optimizer_state(self):
        main_checkpoint = find_resume_checkpoint() or self.resume_checkpoint
        opt_checkpoint = bf.join(
            bf.dirname(main_checkpoint), f"opt{self.resume_step:06}.pt"
        )
        if bf.exists(opt_checkpoint):
            logger.log(f"loading optimizer state from checkpoint: {opt_checkpoint}")
            state_dict = dist_util.load_state_dict(
                opt_checkpoint, map_location=dist_util.dev()
            )
            self.opt.load_state_dict(state_dict)

    def run_loop(self):
        while (
            not self.lr_anneal_steps
            or self.step + self.resume_step < self.lr_anneal_steps
        ):
            batch, cond = next(self.data)
            dist.barrier()
            self.run_step(batch, cond)
            if self.eval_data is not None and self.step % self.eval_interval == 0:
                batch_eval, cond_eval = next(self.eval_data)
                self.run_step_eval(batch_eval, cond_eval)
            if self.step % self.log_interval == 0:
                logger.dumpkvs()
            if self.step % self.save_interval == 0 and self.step != 0:
                self.save()
                # Run for a finite amount of time in integration tests.
                if os.environ.get("DIFFUSION_TRAINING_TEST", "") and self.step > 0:
                    return
            self.step += 1
        # Save the last checkpoint if it wasn't already saved.
        if (self.step - 1) % self.save_interval != 0:
            self.save()

    def run_step(self, batch, cond):
        self.forward_backward(batch, cond)
        took_step = self.mp_trainer.optimize(self.opt)
        if took_step:
            self._update_ema()
        self._anneal_lr()
        self.log_step()

    def run_step_eval(self, batch, cond):
        with th.no_grad():
            # Load EMA params into eval_model; only the first EMA rate is used
            # if there are multiple rates.
            ema_state_dict = self.mp_trainer.master_params_to_state_dict(self.ema_params[0])
            self.eval_model.load_state_dict(ema_state_dict)
            if self.use_fp16:
                self.eval_model.convert_to_fp16()
            self.eval_model.eval()
            for i in range(0, batch.shape[0], self.microbatch):
                micro = batch[i: i + self.microbatch].to(dist_util.dev())
                if self.embed_model is not None:
                    micro = get_kl_input(micro, microbatch=self.microbatch_encode,
                                         model=self.embed_model, scale_factor=self.scale_factor,
                                         shift_size=self.shift_size)
                micro_cond = {
                    k: v[i: i + self.microbatch].repeat_interleave(self.encode_rep).to(dist_util.dev())
                    for k, v in cond.items()
                }
                t, weights = self.schedule_sampler.sample(micro.shape[0], dist_util.dev())

                compute_losses = functools.partial(
                    self.diffusion.training_losses,
                    self.eval_model,
                    micro,
                    t,
                    model_kwargs=micro_cond,
                )
                losses = compute_losses()
                log_loss_dict(
                    self.diffusion, t, {'eval_' + k: v * weights for k, v in losses.items()}
                )
            if self.eval_sample_batch_size > 0 and self.step != 0:
                model_kwargs = {}
                if self.num_classes > 0:
                    # Balance generated classes across ranks.
                    rank = dist.get_rank()
                    samples_per_class = math.ceil(self.eval_sample_batch_size * self.total_num_gpus / self.num_classes)
                    label_start = rank * self.eval_sample_batch_size // samples_per_class
                    label_end = math.ceil((rank + 1) * self.eval_sample_batch_size / samples_per_class)
                    classes = th.arange(label_start, label_end, dtype=th.int, device=dist_util.dev()).repeat_interleave(samples_per_class)
                    model_kwargs["y"] = classes[:self.eval_sample_batch_size]
                all_images = []
                all_labels = []
                image_size_h = micro.shape[-2]
                image_size_w = micro.shape[-1]
                sample_fn = (
                    self.diffusion.p_sample_loop if not self.eval_sample_use_ddim else self.diffusion.ddim_sample_loop
                )
                sample = sample_fn(
                    self.eval_model,
                    (self.eval_sample_batch_size, self.in_channels, image_size_h, image_size_w),
                    clip_denoised=self.eval_sample_clip_denoised,
                    model_kwargs=model_kwargs,
                    progress=True,
                )
                sample = midi_util.decode_sample_for_midi(sample, embed_model=self.embed_model,
                                                          scale_factor=self.scale_factor, threshold=-0.95)

                gathered_samples = [th.zeros_like(sample) for _ in range(dist.get_world_size())]
                dist.all_gather(gathered_samples, sample)  # gather not supported with NCCL
                all_images.extend([sample.cpu().numpy() for sample in gathered_samples])
                if self.num_classes > 0:
                    gathered_labels = [
                        th.zeros_like(model_kwargs["y"]) for _ in range(dist.get_world_size())
                    ]
                    dist.all_gather(gathered_labels, model_kwargs["y"])
                    all_labels.extend([labels.cpu().numpy() for labels in gathered_labels])

                arr = np.concatenate(all_images, axis=0)
                if arr.shape[-1] == 1:  # no pedal, need shape B x 128 x 1024
                    arr = arr.squeeze(axis=-1)
                else:  # with pedal, need shape B x 2 x 128 x 1024
                    arr = arr.transpose(0, 3, 1, 2)
                if self.num_classes > 0:
                    label_arr = np.concatenate(all_labels, axis=0)
                save_dir = osp.join(get_blob_logdir(), "samples", "iter_" + str(self.step + self.resume_step))
                os.makedirs(os.path.expanduser(save_dir), exist_ok=True)
                if dist.get_rank() == 0:
                    if self.num_classes > 0:
                        midi_util.save_piano_roll_midi(arr, save_dir, self.fs, y=label_arr)
                    else:
                        midi_util.save_piano_roll_midi(arr, save_dir, self.fs)
                dist.barrier()

    def forward_backward(self, batch, cond):
        self.mp_trainer.zero_grad()
        for i in range(0, batch.shape[0], self.microbatch):
            micro = batch[i : i + self.microbatch].to(dist_util.dev())
            if self.embed_model is not None:
                micro = get_kl_input(micro, microbatch=self.microbatch_encode,
                                     model=self.embed_model, scale_factor=self.scale_factor,
                                     shift_size=self.shift_size)
            micro_cond = {
                k: v[i : i + self.microbatch].repeat_interleave(self.encode_rep).to(dist_util.dev())
                for k, v in cond.items()
            }
            last_batch = (i + self.microbatch) >= self.batch_size
            t, weights = self.schedule_sampler.sample(micro.shape[0], dist_util.dev())

            compute_losses = functools.partial(
                self.diffusion.training_losses,
                self.ddp_model,
                micro,
                t,
                model_kwargs=micro_cond,
            )

            if last_batch or not self.use_ddp:
                losses = compute_losses()
            else:
                with self.ddp_model.no_sync():
                    losses = compute_losses()

            if isinstance(self.schedule_sampler, LossAwareSampler):
                self.schedule_sampler.update_with_local_losses(
                    t, losses["loss"].detach()
                )

            loss = (losses["loss"] * weights).mean()
            log_loss_dict(
                self.diffusion, t, {k: v * weights for k, v in losses.items()}
            )
            self.mp_trainer.backward(loss)

    def _update_ema(self):
        for rate, params in zip(self.ema_rate, self.ema_params):
            update_ema(params, self.mp_trainer.master_params, rate=rate)

    def _anneal_lr(self):
        if not self.lr_anneal_steps:
            return
        frac_done = (self.step + self.resume_step) / self.lr_anneal_steps
        lr = self.lr * (1 - frac_done)
        for param_group in self.opt.param_groups:
            param_group["lr"] = lr

    def log_step(self):
        logger.logkv("step", self.step + self.resume_step)
        logger.logkv("samples", (self.step + self.resume_step + 1) * self.global_batch)

    def save(self):
        def save_checkpoint(rate, params):
            state_dict = self.mp_trainer.master_params_to_state_dict(params)
            if dist.get_rank() == 0:
                logger.log(f"saving model {rate}...")
                if not rate:
                    filename = f"model{(self.step+self.resume_step):06d}.pt"
                else:
                    filename = f"ema_{rate}_{(self.step+self.resume_step):06d}.pt"
                with bf.BlobFile(bf.join(get_blob_logdir(), "checkpoints", filename), "wb") as f:
                    th.save(state_dict, f)

        save_checkpoint(0, self.mp_trainer.master_params)
        for rate, params in zip(self.ema_rate, self.ema_params):
            save_checkpoint(rate, params)

        if dist.get_rank() == 0:
            with bf.BlobFile(
                bf.join(get_blob_logdir(), "checkpoints", f"opt{(self.step+self.resume_step):06d}.pt"),
                "wb",
            ) as f:
                th.save(self.opt.state_dict(), f)

        dist.barrier()


@th.no_grad()
def get_kl_input(batch, microbatch=-1, model=None, scale_factor=1., recombine=True, shift_size=4):
    # Here microbatch should be the outer microbatch // encode_rep.
    if microbatch < 0:
        microbatch = batch.shape[0]
    full_z = []
    image_size_h = batch.shape[-2]
    image_size_w = batch.shape[-1]
    seq_len = image_size_w // image_size_h
    for i in range(0, batch.shape[0], microbatch):
        micro = batch[i : i + microbatch].to(dist_util.dev())
        # Encode each 1-second chunk and concatenate.
        micro = th.chunk(micro, seq_len, dim=-1)  # B x C x H x W
        micro = th.concat(micro, dim=0)  # 1st second for all batch, 2nd second for all batch, ...
        micro = model.encode_save(micro, range_fix=False)
        posterior = DiagonalGaussianDistribution(micro)
        z = posterior.mode()
        z = th.concat(th.chunk(z, seq_len, dim=0), dim=-1)
        z = z.permute(0, 1, 3, 2)
        full_z.append(z)
    full_z = th.concat(full_z, dim=0)  # B x 4 x (15x16) x 16
    if recombine:  # if not using microbatch, then need to use recombination of tokens
        # unfold args: dimension, size, step
        full_z = full_z.unfold(2, 8 * 16, 16 * shift_size).permute(0, 2, 1, 4, 3)  # (B x encode_rep) x 4 x 128 x 16
        full_z = full_z.contiguous().view(-1, 4, 8 * 16, 16)  # B x 4 x 128 x 16
    return (full_z * scale_factor).detach()


def parse_resume_step_from_filename(filename):
    """
    Parse filenames of the form path/to/modelNNNNNN.pt, where NNNNNN is the
    checkpoint's number of steps.
    """
    split = filename.split("model")
    if len(split) < 2:
        return 0
    split1 = split[-1].split(".")[0]
    try:
        return int(split1)
    except ValueError:
        return 0


def get_blob_logdir():
    # You can change this to be a separate path to save checkpoints to
    # a blobstore or some external drive.
    return logger.get_dir()


def find_resume_checkpoint():
    # On your infrastructure, you may want to override this to automatically
    # discover the latest checkpoint on your blob storage, etc.
    return None


def find_ema_checkpoint(main_checkpoint, step, rate):
    if main_checkpoint is None:
        return None
    filename = f"ema_{rate}_{(step):06d}.pt"
    path = bf.join(bf.dirname(main_checkpoint), filename)
    if bf.exists(path):
        return path
    return None


def log_loss_dict(diffusion, ts, losses):
    for key, values in losses.items():
        logger.logkv_mean(key, values.mean().item())
        # Log the quantiles (four quartiles, in particular).
        for sub_t, sub_loss in zip(ts.cpu().numpy(), values.detach().cpu().numpy()):
            quartile = int(4 * sub_t / diffusion.num_timesteps)
            logger.logkv_mean(f"{key}_q{quartile}", sub_loss)
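Note: a quick sanity sketch (not part of this commit) for the checkpoint-name parser and the linear LR anneal used by `TrainLoop._anneal_lr`:

assert parse_resume_step_from_filename("path/to/model012345.pt") == 12345
assert parse_resume_step_from_filename("path/to/opt012345.pt") == 0  # no "model" prefix, so no step
lr0, step, anneal_steps = 1e-4, 2500, 10000
lr = lr0 * (1 - step / anneal_steps)  # 25% through training -> 0.75 * lr0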
guided_diffusion/unet.py
ADDED
@@ -0,0 +1,906 @@
from abc import abstractmethod

import math

import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .fp16_util import convert_module_to_f16, convert_module_to_f32
from .nn import (
    checkpoint,
    conv_nd,
    linear,
    avg_pool_nd,
    zero_module,
    normalization,
    timestep_embedding,
)


class AttentionPool2d(nn.Module):
    """
    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
    """

    def __init__(
        self,
        spacial_dim: int,
        embed_dim: int,
        num_heads_channels: int,
        output_dim: int = None,
        chord: bool = False,
    ):
        super().__init__()
        self.positional_embedding = nn.Parameter(
            th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5
        )
        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
        self.chord = chord
        if chord:
            self.c_proj_key = conv_nd(1, embed_dim, 25, 1)
        self.num_heads = embed_dim // num_heads_channels
        self.attention = QKVAttention(self.num_heads)

    def forward(self, x):
        b, c, *_spatial = x.shape
        x = x.reshape(b, c, -1)  # NC(HW)
        x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1)  # NC(HW+1)
        x = x + self.positional_embedding[None, :, :].to(x.dtype)  # NC(HW+1)
        x = self.qkv_proj(x)
        x = self.attention(x)
        if self.chord:
            x_key = self.c_proj_key(x)
            key = x_key[:, :, 0]
            x_chord = self.c_proj(x)[:, :, 1:]
            chord = x_chord.reshape(b, -1, *_spatial).mean(dim=2).permute(0, 2, 1)
            return key, chord
        else:
            x = self.c_proj(x)
            return x[:, :, 0]


class TimestepBlock(nn.Module):
    """
    Any module where forward() takes timestep embeddings as a second argument.
    """

    @abstractmethod
    def forward(self, x, emb):
        """
        Apply the module to `x` given `emb` timestep embeddings.
        """


class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
    """
    A sequential module that passes timestep embeddings to the children that
    support it as an extra input.
    """

    def forward(self, x, emb):
        for layer in self:
            if isinstance(layer, TimestepBlock):
                x = layer(x, emb)
            else:
                x = layer(x)
        return x


class Upsample(nn.Module):
    """
    An upsampling layer with an optional convolution.

    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        if use_conv:
            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1)

    def forward(self, x):
        assert x.shape[1] == self.channels
        if self.dims == 3:
            x = F.interpolate(
                x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
            )
        else:
            x = F.interpolate(x, scale_factor=2, mode="nearest")
        if self.use_conv:
            x = self.conv(x)
        return x


class Downsample(nn.Module):
    """
    A downsampling layer with an optional convolution.

    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        stride = 2 if dims != 3 else (1, 2, 2)
        if use_conv:
            self.op = conv_nd(
                dims, self.channels, self.out_channels, 3, stride=stride, padding=1
            )
        else:
            assert self.channels == self.out_channels
            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)

    def forward(self, x):
        assert x.shape[1] == self.channels
        return self.op(x)


class ResBlock(TimestepBlock):
    """
    A residual block that can optionally change the number of channels.

    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: if True, use gradient checkpointing on this module.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    """

    def __init__(
        self,
        channels,
        emb_channels,
        dropout,
        out_channels=None,
        use_conv=False,
        use_scale_shift_norm=False,
        dims=2,
        use_checkpoint=False,
        up=False,
        down=False,
    ):
        super().__init__()
        self.channels = channels
        self.emb_channels = emb_channels
        self.dropout = dropout
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.use_checkpoint = use_checkpoint
        self.use_scale_shift_norm = use_scale_shift_norm

        self.in_layers = nn.Sequential(
            normalization(channels),
            nn.SiLU(),
            conv_nd(dims, channels, self.out_channels, 3, padding=1),
        )

        self.updown = up or down

        if up:
            self.h_upd = Upsample(channels, False, dims)
            self.x_upd = Upsample(channels, False, dims)
        elif down:
            self.h_upd = Downsample(channels, False, dims)
            self.x_upd = Downsample(channels, False, dims)
        else:
            self.h_upd = self.x_upd = nn.Identity()

        self.emb_layers = nn.Sequential(
            nn.SiLU(),
            linear(
                emb_channels,
                2 * self.out_channels if use_scale_shift_norm else self.out_channels,
            ),
        )
        self.out_layers = nn.Sequential(
            normalization(self.out_channels),
            nn.SiLU(),
            nn.Dropout(p=dropout),
            zero_module(
                conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
            ),
        )

        if self.out_channels == channels:
            self.skip_connection = nn.Identity()
        elif use_conv:
            self.skip_connection = conv_nd(
                dims, channels, self.out_channels, 3, padding=1
            )
        else:
            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)

    def forward(self, x, emb):
        """
        Apply the block to a Tensor, conditioned on a timestep embedding.

        :param x: an [N x C x ...] Tensor of features.
        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
        :return: an [N x C x ...] Tensor of outputs.
        """
        return checkpoint(
            self._forward, (x, emb), self.parameters(), self.use_checkpoint
        )

    def _forward(self, x, emb):
        if self.updown:
            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
            h = in_rest(x)
            h = self.h_upd(h)
            x = self.x_upd(x)
            h = in_conv(h)
        else:
            h = self.in_layers(x)
        emb_out = self.emb_layers(emb).type(h.dtype)
        while len(emb_out.shape) < len(h.shape):
            emb_out = emb_out[..., None]
        if self.use_scale_shift_norm:
            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
            scale, shift = th.chunk(emb_out, 2, dim=1)
            h = out_norm(h) * (1 + scale) + shift
            h = out_rest(h)
        else:
            h = h + emb_out
|
266 |
+
h = self.out_layers(h)
|
267 |
+
return self.skip_connection(x) + h
|
268 |
+
|
269 |
+
|
270 |
+
class AttentionBlock(nn.Module):
|
271 |
+
"""
|
272 |
+
An attention block that allows spatial positions to attend to each other.
|
273 |
+
|
274 |
+
Originally ported from here, but adapted to the N-d case.
|
275 |
+
https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
|
276 |
+
"""
|
277 |
+
|
278 |
+
def __init__(
|
279 |
+
self,
|
280 |
+
channels,
|
281 |
+
num_heads=1,
|
282 |
+
num_head_channels=-1,
|
283 |
+
use_checkpoint=False,
|
284 |
+
use_new_attention_order=False,
|
285 |
+
):
|
286 |
+
super().__init__()
|
287 |
+
self.channels = channels
|
288 |
+
if num_head_channels == -1:
|
289 |
+
self.num_heads = num_heads
|
290 |
+
else:
|
291 |
+
assert (
|
292 |
+
channels % num_head_channels == 0
|
293 |
+
), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
|
294 |
+
self.num_heads = channels // num_head_channels
|
295 |
+
self.use_checkpoint = use_checkpoint
|
296 |
+
self.norm = normalization(channels)
|
297 |
+
self.qkv = conv_nd(1, channels, channels * 3, 1)
|
298 |
+
if use_new_attention_order:
|
299 |
+
# split qkv before split heads
|
300 |
+
self.attention = QKVAttention(self.num_heads)
|
301 |
+
else:
|
302 |
+
# split heads before split qkv
|
303 |
+
self.attention = QKVAttentionLegacy(self.num_heads)
|
304 |
+
|
305 |
+
self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
|
306 |
+
|
307 |
+
def forward(self, x):
|
308 |
+
return checkpoint(self._forward, (x,), self.parameters(), True)
|
309 |
+
|
310 |
+
def _forward(self, x):
|
311 |
+
b, c, *spatial = x.shape
|
312 |
+
x = x.reshape(b, c, -1)
|
313 |
+
qkv = self.qkv(self.norm(x))
|
314 |
+
h = self.attention(qkv)
|
315 |
+
h = self.proj_out(h)
|
316 |
+
return (x + h).reshape(b, c, *spatial)
|
317 |
+
|
318 |
+
|
319 |
+
def count_flops_attn(model, _x, y):
|
320 |
+
"""
|
321 |
+
A counter for the `thop` package to count the operations in an
|
322 |
+
attention operation.
|
323 |
+
Meant to be used like:
|
324 |
+
macs, params = thop.profile(
|
325 |
+
model,
|
326 |
+
inputs=(inputs, timestamps),
|
327 |
+
custom_ops={QKVAttention: QKVAttention.count_flops},
|
328 |
+
)
|
329 |
+
"""
|
330 |
+
b, c, *spatial = y[0].shape
|
331 |
+
num_spatial = int(np.prod(spatial))
|
332 |
+
# We perform two matmuls with the same number of ops.
|
333 |
+
# The first computes the weight matrix, the second computes
|
334 |
+
# the combination of the value vectors.
|
335 |
+
matmul_ops = 2 * b * (num_spatial ** 2) * c
|
336 |
+
model.total_ops += th.DoubleTensor([matmul_ops])
|
337 |
+
|
338 |
+
|
339 |
+
class QKVAttentionLegacy(nn.Module):
|
340 |
+
"""
|
341 |
+
A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping
|
342 |
+
"""
|
343 |
+
|
344 |
+
def __init__(self, n_heads):
|
345 |
+
super().__init__()
|
346 |
+
self.n_heads = n_heads
|
347 |
+
|
348 |
+
def forward(self, qkv):
|
349 |
+
"""
|
350 |
+
Apply QKV attention.
|
351 |
+
|
352 |
+
:param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
|
353 |
+
:return: an [N x (H * C) x T] tensor after attention.
|
354 |
+
"""
|
355 |
+
bs, width, length = qkv.shape
|
356 |
+
assert width % (3 * self.n_heads) == 0
|
357 |
+
ch = width // (3 * self.n_heads)
|
358 |
+
q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
|
359 |
+
scale = 1 / math.sqrt(math.sqrt(ch))
|
360 |
+
weight = th.einsum(
|
361 |
+
"bct,bcs->bts", q * scale, k * scale
|
362 |
+
) # More stable with f16 than dividing afterwards
|
363 |
+
weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
|
364 |
+
a = th.einsum("bts,bcs->bct", weight, v)
|
365 |
+
return a.reshape(bs, -1, length)
|
366 |
+
|
367 |
+
@staticmethod
|
368 |
+
def count_flops(model, _x, y):
|
369 |
+
return count_flops_attn(model, _x, y)
|
370 |
+
|
371 |
+
|
372 |
+
class QKVAttention(nn.Module):
|
373 |
+
"""
|
374 |
+
A module which performs QKV attention and splits in a different order.
|
375 |
+
"""
|
376 |
+
|
377 |
+
def __init__(self, n_heads):
|
378 |
+
super().__init__()
|
379 |
+
self.n_heads = n_heads
|
380 |
+
|
381 |
+
def forward(self, qkv):
|
382 |
+
"""
|
383 |
+
Apply QKV attention.
|
384 |
+
|
385 |
+
:param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
|
386 |
+
:return: an [N x (H * C) x T] tensor after attention.
|
387 |
+
"""
|
388 |
+
bs, width, length = qkv.shape
|
389 |
+
assert width % (3 * self.n_heads) == 0
|
390 |
+
ch = width // (3 * self.n_heads)
|
391 |
+
q, k, v = qkv.chunk(3, dim=1)
|
392 |
+
scale = 1 / math.sqrt(math.sqrt(ch))
|
393 |
+
weight = th.einsum(
|
394 |
+
"bct,bcs->bts",
|
395 |
+
(q * scale).view(bs * self.n_heads, ch, length),
|
396 |
+
(k * scale).view(bs * self.n_heads, ch, length),
|
397 |
+
) # More stable with f16 than dividing afterwards
|
398 |
+
weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
|
399 |
+
a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
|
400 |
+
return a.reshape(bs, -1, length)
|
401 |
+
|
402 |
+
@staticmethod
|
403 |
+
def count_flops(model, _x, y):
|
404 |
+
return count_flops_attn(model, _x, y)
|
405 |
+
|
406 |
+
|
407 |
+
class UNetModel(nn.Module):
|
408 |
+
"""
|
409 |
+
The full UNet model with attention and timestep embedding.
|
410 |
+
|
411 |
+
:param in_channels: channels in the input Tensor.
|
412 |
+
:param model_channels: base channel count for the model.
|
413 |
+
:param out_channels: channels in the output Tensor.
|
414 |
+
:param num_res_blocks: number of residual blocks per downsample.
|
415 |
+
:param attention_resolutions: a collection of downsample rates at which
|
416 |
+
attention will take place. May be a set, list, or tuple.
|
417 |
+
For example, if this contains 4, then at 4x downsampling, attention
|
418 |
+
will be used.
|
419 |
+
:param dropout: the dropout probability.
|
420 |
+
:param channel_mult: channel multiplier for each level of the UNet.
|
421 |
+
:param conv_resample: if True, use learned convolutions for upsampling and
|
422 |
+
downsampling.
|
423 |
+
:param dims: determines if the signal is 1D, 2D, or 3D.
|
424 |
+
:param num_classes: if specified (as an int), then this model will be
|
425 |
+
class-conditional with `num_classes` classes.
|
426 |
+
:param use_checkpoint: use gradient checkpointing to reduce memory usage.
|
427 |
+
:param num_heads: the number of attention heads in each attention layer.
|
428 |
+
:param num_heads_channels: if specified, ignore num_heads and instead use
|
429 |
+
a fixed channel width per attention head.
|
430 |
+
:param num_heads_upsample: works with num_heads to set a different number
|
431 |
+
of heads for upsampling. Deprecated.
|
432 |
+
:param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
|
433 |
+
:param resblock_updown: use residual blocks for up/downsampling.
|
434 |
+
:param use_new_attention_order: use a different attention pattern for potentially
|
435 |
+
increased efficiency.
|
436 |
+
"""
|
437 |
+
|
438 |
+
def __init__(
|
439 |
+
self,
|
440 |
+
image_size,
|
441 |
+
in_channels,
|
442 |
+
model_channels,
|
443 |
+
out_channels,
|
444 |
+
num_res_blocks,
|
445 |
+
attention_resolutions,
|
446 |
+
dropout=0,
|
447 |
+
channel_mult=(1, 2, 4, 8),
|
448 |
+
conv_resample=True,
|
449 |
+
dims=2,
|
450 |
+
num_classes=None,
|
451 |
+
use_checkpoint=False,
|
452 |
+
use_fp16=False,
|
453 |
+
num_heads=1,
|
454 |
+
num_head_channels=-1,
|
455 |
+
num_heads_upsample=-1,
|
456 |
+
use_scale_shift_norm=False,
|
457 |
+
resblock_updown=False,
|
458 |
+
use_new_attention_order=False,
|
459 |
+
):
|
460 |
+
super().__init__()
|
461 |
+
|
462 |
+
if num_heads_upsample == -1:
|
463 |
+
num_heads_upsample = num_heads
|
464 |
+
|
465 |
+
self.image_size = image_size
|
466 |
+
self.in_channels = in_channels
|
467 |
+
self.model_channels = model_channels
|
468 |
+
self.out_channels = out_channels
|
469 |
+
self.num_res_blocks = num_res_blocks
|
470 |
+
self.attention_resolutions = attention_resolutions
|
471 |
+
self.dropout = dropout
|
472 |
+
self.channel_mult = channel_mult
|
473 |
+
self.conv_resample = conv_resample
|
474 |
+
self.num_classes = num_classes
|
475 |
+
self.use_checkpoint = use_checkpoint
|
476 |
+
self.dtype = th.float16 if use_fp16 else th.float32
|
477 |
+
self.num_heads = num_heads
|
478 |
+
self.num_head_channels = num_head_channels
|
479 |
+
self.num_heads_upsample = num_heads_upsample
|
480 |
+
|
481 |
+
time_embed_dim = model_channels * 4
|
482 |
+
self.time_embed = nn.Sequential(
|
483 |
+
linear(model_channels, time_embed_dim),
|
484 |
+
nn.SiLU(),
|
485 |
+
linear(time_embed_dim, time_embed_dim),
|
486 |
+
)
|
487 |
+
|
488 |
+
if self.num_classes is not None:
|
489 |
+
self.label_emb = nn.Embedding(num_classes, time_embed_dim)
|
490 |
+
|
491 |
+
ch = input_ch = int(channel_mult[0] * model_channels)
|
492 |
+
self.input_blocks = nn.ModuleList(
|
493 |
+
[TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))]
|
494 |
+
)
|
495 |
+
self._feature_size = ch
|
496 |
+
input_block_chans = [ch]
|
497 |
+
ds = 1
|
498 |
+
for level, mult in enumerate(channel_mult):
|
499 |
+
for _ in range(num_res_blocks):
|
500 |
+
layers = [
|
501 |
+
ResBlock(
|
502 |
+
ch,
|
503 |
+
time_embed_dim,
|
504 |
+
dropout,
|
505 |
+
out_channels=int(mult * model_channels),
|
506 |
+
dims=dims,
|
507 |
+
use_checkpoint=use_checkpoint,
|
508 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
509 |
+
)
|
510 |
+
]
|
511 |
+
ch = int(mult * model_channels)
|
512 |
+
if ds in attention_resolutions:
|
513 |
+
layers.append(
|
514 |
+
AttentionBlock(
|
515 |
+
ch,
|
516 |
+
use_checkpoint=use_checkpoint,
|
517 |
+
num_heads=num_heads,
|
518 |
+
num_head_channels=num_head_channels,
|
519 |
+
use_new_attention_order=use_new_attention_order,
|
520 |
+
)
|
521 |
+
)
|
522 |
+
self.input_blocks.append(TimestepEmbedSequential(*layers))
|
523 |
+
self._feature_size += ch
|
524 |
+
input_block_chans.append(ch)
|
525 |
+
if level != len(channel_mult) - 1:
|
526 |
+
out_ch = ch
|
527 |
+
self.input_blocks.append(
|
528 |
+
TimestepEmbedSequential(
|
529 |
+
ResBlock(
|
530 |
+
ch,
|
531 |
+
time_embed_dim,
|
532 |
+
dropout,
|
533 |
+
out_channels=out_ch,
|
534 |
+
dims=dims,
|
535 |
+
use_checkpoint=use_checkpoint,
|
536 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
537 |
+
down=True,
|
538 |
+
)
|
539 |
+
if resblock_updown
|
540 |
+
else Downsample(
|
541 |
+
ch, conv_resample, dims=dims, out_channels=out_ch
|
542 |
+
)
|
543 |
+
)
|
544 |
+
)
|
545 |
+
ch = out_ch
|
546 |
+
input_block_chans.append(ch)
|
547 |
+
ds *= 2
|
548 |
+
self._feature_size += ch
|
549 |
+
|
550 |
+
self.middle_block = TimestepEmbedSequential(
|
551 |
+
ResBlock(
|
552 |
+
ch,
|
553 |
+
time_embed_dim,
|
554 |
+
dropout,
|
555 |
+
dims=dims,
|
556 |
+
use_checkpoint=use_checkpoint,
|
557 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
558 |
+
),
|
559 |
+
AttentionBlock(
|
560 |
+
ch,
|
561 |
+
use_checkpoint=use_checkpoint,
|
562 |
+
num_heads=num_heads,
|
563 |
+
num_head_channels=num_head_channels,
|
564 |
+
use_new_attention_order=use_new_attention_order,
|
565 |
+
),
|
566 |
+
ResBlock(
|
567 |
+
ch,
|
568 |
+
time_embed_dim,
|
569 |
+
dropout,
|
570 |
+
dims=dims,
|
571 |
+
use_checkpoint=use_checkpoint,
|
572 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
573 |
+
),
|
574 |
+
)
|
575 |
+
self._feature_size += ch
|
576 |
+
|
577 |
+
self.output_blocks = nn.ModuleList([])
|
578 |
+
for level, mult in list(enumerate(channel_mult))[::-1]:
|
579 |
+
for i in range(num_res_blocks + 1):
|
580 |
+
ich = input_block_chans.pop()
|
581 |
+
layers = [
|
582 |
+
ResBlock(
|
583 |
+
ch + ich,
|
584 |
+
time_embed_dim,
|
585 |
+
dropout,
|
586 |
+
out_channels=int(model_channels * mult),
|
587 |
+
dims=dims,
|
588 |
+
use_checkpoint=use_checkpoint,
|
589 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
590 |
+
)
|
591 |
+
]
|
592 |
+
ch = int(model_channels * mult)
|
593 |
+
if ds in attention_resolutions:
|
594 |
+
layers.append(
|
595 |
+
AttentionBlock(
|
596 |
+
ch,
|
597 |
+
use_checkpoint=use_checkpoint,
|
598 |
+
num_heads=num_heads_upsample,
|
599 |
+
num_head_channels=num_head_channels,
|
600 |
+
use_new_attention_order=use_new_attention_order,
|
601 |
+
)
|
602 |
+
)
|
603 |
+
if level and i == num_res_blocks:
|
604 |
+
out_ch = ch
|
605 |
+
layers.append(
|
606 |
+
ResBlock(
|
607 |
+
ch,
|
608 |
+
time_embed_dim,
|
609 |
+
dropout,
|
610 |
+
out_channels=out_ch,
|
611 |
+
dims=dims,
|
612 |
+
use_checkpoint=use_checkpoint,
|
613 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
614 |
+
up=True,
|
615 |
+
)
|
616 |
+
if resblock_updown
|
617 |
+
else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
|
618 |
+
)
|
619 |
+
ds //= 2
|
620 |
+
self.output_blocks.append(TimestepEmbedSequential(*layers))
|
621 |
+
self._feature_size += ch
|
622 |
+
|
623 |
+
self.out = nn.Sequential(
|
624 |
+
normalization(ch),
|
625 |
+
nn.SiLU(),
|
626 |
+
zero_module(conv_nd(dims, input_ch, out_channels, 3, padding=1)),
|
627 |
+
)
|
628 |
+
|
629 |
+
def convert_to_fp16(self):
|
630 |
+
"""
|
631 |
+
Convert the torso of the model to float16.
|
632 |
+
"""
|
633 |
+
self.input_blocks.apply(convert_module_to_f16)
|
634 |
+
self.middle_block.apply(convert_module_to_f16)
|
635 |
+
self.output_blocks.apply(convert_module_to_f16)
|
636 |
+
|
637 |
+
def convert_to_fp32(self):
|
638 |
+
"""
|
639 |
+
Convert the torso of the model to float32.
|
640 |
+
"""
|
641 |
+
self.input_blocks.apply(convert_module_to_f32)
|
642 |
+
self.middle_block.apply(convert_module_to_f32)
|
643 |
+
self.output_blocks.apply(convert_module_to_f32)
|
644 |
+
|
645 |
+
def forward(self, x, timesteps, y=None):
|
646 |
+
"""
|
647 |
+
Apply the model to an input batch.
|
648 |
+
|
649 |
+
:param x: an [N x C x ...] Tensor of inputs.
|
650 |
+
:param timesteps: a 1-D batch of timesteps.
|
651 |
+
:param y: an [N] Tensor of labels, if class-conditional.
|
652 |
+
:return: an [N x C x ...] Tensor of outputs.
|
653 |
+
"""
|
654 |
+
assert (y is not None) == (
|
655 |
+
self.num_classes is not None
|
656 |
+
), "must specify y if and only if the model is class-conditional"
|
657 |
+
|
658 |
+
hs = []
|
659 |
+
emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
|
660 |
+
|
661 |
+
if self.num_classes is not None:
|
662 |
+
assert y.shape == (x.shape[0],)
|
663 |
+
emb = emb + self.label_emb(y)
|
664 |
+
|
665 |
+
h = x.type(self.dtype)
|
666 |
+
for module in self.input_blocks:
|
667 |
+
h = module(h, emb)
|
668 |
+
hs.append(h)
|
669 |
+
h = self.middle_block(h, emb)
|
670 |
+
for module in self.output_blocks:
|
671 |
+
h = th.cat([h, hs.pop()], dim=1)
|
672 |
+
h = module(h, emb)
|
673 |
+
h = h.type(x.dtype)
|
674 |
+
return self.out(h)
|
675 |
+
|
676 |
+
|
677 |
+
class SuperResModel(UNetModel):
|
678 |
+
"""
|
679 |
+
A UNetModel that performs super-resolution.
|
680 |
+
|
681 |
+
Expects an extra kwarg `low_res` to condition on a low-resolution image.
|
682 |
+
"""
|
683 |
+
|
684 |
+
def __init__(self, image_size, in_channels, *args, **kwargs):
|
685 |
+
super().__init__(image_size, in_channels * 2, *args, **kwargs)
|
686 |
+
|
687 |
+
def forward(self, x, timesteps, low_res=None, **kwargs):
|
688 |
+
_, _, new_height, new_width = x.shape
|
689 |
+
upsampled = F.interpolate(low_res, (new_height, new_width), mode="bilinear")
|
690 |
+
x = th.cat([x, upsampled], dim=1)
|
691 |
+
return super().forward(x, timesteps, **kwargs)
|
692 |
+
|
693 |
+
|
694 |
+
class EncoderUNetModel(nn.Module):
|
695 |
+
"""
|
696 |
+
The half UNet model with attention and timestep embedding.
|
697 |
+
|
698 |
+
For usage, see UNet.
|
699 |
+
"""
|
700 |
+
|
701 |
+
def __init__(
|
702 |
+
self,
|
703 |
+
image_size,
|
704 |
+
in_channels,
|
705 |
+
model_channels,
|
706 |
+
out_channels,
|
707 |
+
num_res_blocks,
|
708 |
+
attention_resolutions,
|
709 |
+
dropout=0,
|
710 |
+
channel_mult=(1, 2, 4, 8),
|
711 |
+
conv_resample=True,
|
712 |
+
dims=2,
|
713 |
+
use_checkpoint=False,
|
714 |
+
use_fp16=False,
|
715 |
+
num_heads=1,
|
716 |
+
num_head_channels=-1,
|
717 |
+
num_heads_upsample=-1,
|
718 |
+
use_scale_shift_norm=False,
|
719 |
+
resblock_updown=False,
|
720 |
+
use_new_attention_order=False,
|
721 |
+
pool="adaptive",
|
722 |
+
chord=False,
|
723 |
+
):
|
724 |
+
super().__init__()
|
725 |
+
|
726 |
+
if num_heads_upsample == -1:
|
727 |
+
num_heads_upsample = num_heads
|
728 |
+
|
729 |
+
self.in_channels = in_channels
|
730 |
+
self.model_channels = model_channels
|
731 |
+
self.out_channels = out_channels
|
732 |
+
self.num_res_blocks = num_res_blocks
|
733 |
+
self.attention_resolutions = attention_resolutions
|
734 |
+
self.dropout = dropout
|
735 |
+
self.channel_mult = channel_mult
|
736 |
+
self.conv_resample = conv_resample
|
737 |
+
self.use_checkpoint = use_checkpoint
|
738 |
+
self.dtype = th.float16 if use_fp16 else th.float32
|
739 |
+
self.num_heads = num_heads
|
740 |
+
self.num_head_channels = num_head_channels
|
741 |
+
self.num_heads_upsample = num_heads_upsample
|
742 |
+
|
743 |
+
time_embed_dim = model_channels * 4
|
744 |
+
self.time_embed = nn.Sequential(
|
745 |
+
linear(model_channels, time_embed_dim),
|
746 |
+
nn.SiLU(),
|
747 |
+
linear(time_embed_dim, time_embed_dim),
|
748 |
+
)
|
749 |
+
|
750 |
+
ch = int(channel_mult[0] * model_channels)
|
751 |
+
self.input_blocks = nn.ModuleList(
|
752 |
+
[TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))]
|
753 |
+
)
|
754 |
+
self._feature_size = ch
|
755 |
+
input_block_chans = [ch]
|
756 |
+
ds = 1
|
757 |
+
for level, mult in enumerate(channel_mult):
|
758 |
+
for _ in range(num_res_blocks):
|
759 |
+
layers = [
|
760 |
+
ResBlock(
|
761 |
+
ch,
|
762 |
+
time_embed_dim,
|
763 |
+
dropout,
|
764 |
+
out_channels=int(mult * model_channels),
|
765 |
+
dims=dims,
|
766 |
+
use_checkpoint=use_checkpoint,
|
767 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
768 |
+
)
|
769 |
+
]
|
770 |
+
ch = int(mult * model_channels)
|
771 |
+
if ds in attention_resolutions:
|
772 |
+
layers.append(
|
773 |
+
AttentionBlock(
|
774 |
+
ch,
|
775 |
+
use_checkpoint=use_checkpoint,
|
776 |
+
num_heads=num_heads,
|
777 |
+
num_head_channels=num_head_channels,
|
778 |
+
use_new_attention_order=use_new_attention_order,
|
779 |
+
)
|
780 |
+
)
|
781 |
+
self.input_blocks.append(TimestepEmbedSequential(*layers))
|
782 |
+
self._feature_size += ch
|
783 |
+
input_block_chans.append(ch)
|
784 |
+
if level != len(channel_mult) - 1:
|
785 |
+
out_ch = ch
|
786 |
+
self.input_blocks.append(
|
787 |
+
TimestepEmbedSequential(
|
788 |
+
ResBlock(
|
789 |
+
ch,
|
790 |
+
time_embed_dim,
|
791 |
+
dropout,
|
792 |
+
out_channels=out_ch,
|
793 |
+
dims=dims,
|
794 |
+
use_checkpoint=use_checkpoint,
|
795 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
796 |
+
down=True,
|
797 |
+
)
|
798 |
+
if resblock_updown
|
799 |
+
else Downsample(
|
800 |
+
ch, conv_resample, dims=dims, out_channels=out_ch
|
801 |
+
)
|
802 |
+
)
|
803 |
+
)
|
804 |
+
ch = out_ch
|
805 |
+
input_block_chans.append(ch)
|
806 |
+
ds *= 2
|
807 |
+
self._feature_size += ch
|
808 |
+
|
809 |
+
self.middle_block = TimestepEmbedSequential(
|
810 |
+
ResBlock(
|
811 |
+
ch,
|
812 |
+
time_embed_dim,
|
813 |
+
dropout,
|
814 |
+
dims=dims,
|
815 |
+
use_checkpoint=use_checkpoint,
|
816 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
817 |
+
),
|
818 |
+
AttentionBlock(
|
819 |
+
ch,
|
820 |
+
use_checkpoint=use_checkpoint,
|
821 |
+
num_heads=num_heads,
|
822 |
+
num_head_channels=num_head_channels,
|
823 |
+
use_new_attention_order=use_new_attention_order,
|
824 |
+
),
|
825 |
+
ResBlock(
|
826 |
+
ch,
|
827 |
+
time_embed_dim,
|
828 |
+
dropout,
|
829 |
+
dims=dims,
|
830 |
+
use_checkpoint=use_checkpoint,
|
831 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
832 |
+
),
|
833 |
+
)
|
834 |
+
self._feature_size += ch
|
835 |
+
self.pool = pool
|
836 |
+
if pool == "adaptive":
|
837 |
+
self.out = nn.Sequential(
|
838 |
+
normalization(ch),
|
839 |
+
nn.SiLU(),
|
840 |
+
nn.AdaptiveAvgPool2d((1, 1)),
|
841 |
+
zero_module(conv_nd(dims, ch, out_channels, 1)),
|
842 |
+
nn.Flatten(),
|
843 |
+
)
|
844 |
+
elif pool == "attention":
|
845 |
+
assert num_head_channels != -1
|
846 |
+
self.out = nn.Sequential(
|
847 |
+
normalization(ch),
|
848 |
+
nn.SiLU(),
|
849 |
+
AttentionPool2d(
|
850 |
+
(image_size // ds), ch, num_head_channels, out_channels, chord
|
851 |
+
),
|
852 |
+
)
|
853 |
+
elif pool == "spatial":
|
854 |
+
self.out = nn.Sequential(
|
855 |
+
nn.Linear(self._feature_size, 2048),
|
856 |
+
nn.ReLU(),
|
857 |
+
nn.Linear(2048, self.out_channels),
|
858 |
+
)
|
859 |
+
elif pool == "spatial_v2":
|
860 |
+
self.out = nn.Sequential(
|
861 |
+
nn.Linear(self._feature_size, 2048),
|
862 |
+
normalization(2048),
|
863 |
+
nn.SiLU(),
|
864 |
+
nn.Linear(2048, self.out_channels),
|
865 |
+
)
|
866 |
+
else:
|
867 |
+
raise NotImplementedError(f"Unexpected {pool} pooling")
|
868 |
+
|
869 |
+
def convert_to_fp16(self):
|
870 |
+
"""
|
871 |
+
Convert the torso of the model to float16.
|
872 |
+
"""
|
873 |
+
self.input_blocks.apply(convert_module_to_f16)
|
874 |
+
self.middle_block.apply(convert_module_to_f16)
|
875 |
+
|
876 |
+
def convert_to_fp32(self):
|
877 |
+
"""
|
878 |
+
Convert the torso of the model to float32.
|
879 |
+
"""
|
880 |
+
self.input_blocks.apply(convert_module_to_f32)
|
881 |
+
self.middle_block.apply(convert_module_to_f32)
|
882 |
+
|
883 |
+
def forward(self, x, timesteps):
|
884 |
+
"""
|
885 |
+
Apply the model to an input batch.
|
886 |
+
|
887 |
+
:param x: an [N x C x ...] Tensor of inputs.
|
888 |
+
:param timesteps: a 1-D batch of timesteps.
|
889 |
+
:return: an [N x K] Tensor of outputs.
|
890 |
+
"""
|
891 |
+
emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
|
892 |
+
|
893 |
+
results = []
|
894 |
+
h = x.type(self.dtype)
|
895 |
+
for module in self.input_blocks:
|
896 |
+
h = module(h, emb)
|
897 |
+
if self.pool.startswith("spatial"):
|
898 |
+
results.append(h.type(x.dtype).mean(dim=(2, 3)))
|
899 |
+
h = self.middle_block(h, emb)
|
900 |
+
if self.pool.startswith("spatial"):
|
901 |
+
results.append(h.type(x.dtype).mean(dim=(2, 3)))
|
902 |
+
h = th.cat(results, axis=-1)
|
903 |
+
return self.out(h)
|
904 |
+
else:
|
905 |
+
h = h.type(x.dtype)
|
906 |
+
return self.out(h)
|
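
To make the `UNetModel` interface above concrete, here is a minimal shape-check sketch. The hyperparameters are illustrative assumptions for a quick smoke test, not the configuration this repo's checkpoints were trained with:

```python
# Minimal smoke test for UNetModel; hyperparameters below are assumptions.
import torch as th
from guided_diffusion.unet import UNetModel

model = UNetModel(
    image_size=32,
    in_channels=4,
    model_channels=64,
    out_channels=4,
    num_res_blocks=2,
    attention_resolutions=(4,),  # attend at 4x downsampling
    channel_mult=(1, 2, 4),
)
x = th.randn(2, 4, 32, 32)       # [N x C x H x W] batch (e.g. VAE latents)
t = th.randint(0, 1000, (2,))    # one diffusion timestep per sample
eps = model(x, t)                # model output has the same shape as x
assert eps.shape == x.shape
```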
load_utils.py
ADDED
@@ -0,0 +1,31 @@
import importlib
import torch
from omegaconf import OmegaConf


def get_obj_from_str(string, reload=False):
    module, cls = string.rsplit(".", 1)
    if reload:
        module_imp = importlib.import_module(module)
        importlib.reload(module_imp)
    return getattr(importlib.import_module(module, package=None), cls)


def instantiate_from_config(config):
    if "target" not in config:
        raise KeyError("Expected key `target` to instantiate.")
    return get_obj_from_str(config["target"])(**config.get("params", dict()))


def load_model(name, ckpt):
    config = OmegaConf.load(f'taming-transformers/configs/pr/{name}.yaml')
    model = instantiate_from_config(config.model)
    model.init_from_ckpt(ckpt)  # load_state_dict(mc['state_dict'])
    model.eval()
    return model


def load_data(name):
    config = OmegaConf.load(f'taming-transformers/configs/pr/{name}.yaml')
    data = instantiate_from_config(config.data)
    return data
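
For reference, a sketch of how these helpers are meant to be called; the config name `pr_vae` and checkpoint path below are hypothetical placeholders, so substitute the actual YAML name under `taming-transformers/configs/pr/` and your checkpoint:

```python
# Hypothetical usage of load_utils; "pr_vae" and the .ckpt path are placeholders.
from load_utils import load_model, load_data

vae = load_model('pr_vae', 'taming-transformers/checkpoints/pr_vae.ckpt')  # eval-mode model
data = load_data('pr_vae')  # data module built from the same config's `data` section
```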
music_evaluation/README.md
ADDED
@@ -0,0 +1,22 @@
# Music Evaluation

Adapted from the GitHub repository [mgeval](https://github.com/RichardYang40148/mgeval).

All mgeval components that required Python 2 have been removed.

# Packages
scipy, numpy, seaborn, pretty_midi, scikit-learn, python 3

# Usage

```
python music_evaluator.py --set1dir /path/to/your/ground-truth/data/ --set2dir /path/to/your/generated-sample/ --outdir output-dir --num_sample number-of-samples-to-evaluate
```

# Output
All outputs are written to the output-dir directory in the current folder, including plots and statistics.txt.

Check out the result folder for an example.

You can run either music_evaluator.py or demo.ipynb for evaluation.
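
For example, a concrete invocation might look like the following; the directory names and sample count are placeholders, not paths shipped with this repo:

```
python music_evaluator.py --set1dir data/ground_truth/ --set2dir samples/generated/ --outdir results/ --num_sample 100
```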
music_evaluation/convert_to_wav.py
ADDED
@@ -0,0 +1,42 @@
from midi2audio import FluidSynth
import os
import sys


# This program converts a folder of .midi files to a folder of .wav files
# Need to download the FluidSynth and midi2audio packages
#
# Usage:
# python convert_to_wav.py midi_dir wav_dir
# More info about FluidSynth: https://github.com/FluidSynth/fluidsynth
# More info about midi2audio: https://github.com/bzamecnik/midi2audio
# Need sound fonts to run this program: https://sites.google.com/site/soundfonts4u/
# The sound font used in this program: https://drive.google.com/file/d/1nvTy62-wHGnZ6CKYuPNAiGlKLtWg9Ir9/view?usp=sharing

def convert_midi_to_audio(input_dir, output_dir, fs):
    # sound_font_path = os.path.join(os.getcwd(), "Dore Mark's NY S&S Model B-v5.2.sf2")
    # fs = FluidSynth(sound_font_path)
    os.chdir(input_dir)
    filenames = os.listdir(input_dir)
    for midi_file in filenames:
        filename = midi_file[:-5]
        filename = filename + ".wav"
        output_file = os.path.join(output_dir, filename)
        fs.midi_to_audio(midi_file, output_file)

    return


if __name__ == '__main__':
    sound_font_path = os.path.join(os.getcwd(), "Dore Mark's NY S&S Model B-v5.2.sf2")
    fs = FluidSynth(sound_font_path)
    # fs.midi_to_audio('MIDI-Unprocessed_01_R1_2006_01-09_ORIG_MID--AUDIO_01_R1_2006_01_Track01_wav_0.midi', 'output.wav')

    output_dir = sys.argv[2]
    os.makedirs(output_dir, exist_ok=True)
    current_path = os.getcwd()
    output_dir = os.path.join(current_path, output_dir)

    input_dir = sys.argv[1]
    input_dir = os.path.join(current_path, input_dir)
    convert_midi_to_audio(input_dir, output_dir, fs)
music_evaluation/demo.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
music_evaluation/fad.py
ADDED
@@ -0,0 +1,38 @@
from frechet_audio_distance import FrechetAudioDistance
import sys

# Compute the FAD distance between the ground truth dataset and the sample dataset
# Pretty slow, depending on hardware speed
# Saves embeddings.npy for fast reuse later
#
# Usage: python fad.py background_dir_path eval_dir_path
# Feel free to change the embedding paths in the code
# More info about FrechetAudioDistance: https://github.com/gudgud96/frechet-audio-distance

if __name__ == "__main__":
    # to use `vggish`
    frechet = FrechetAudioDistance(
        model_name="vggish",
        use_pca=False,
        use_activation=False,
        verbose=False
    )
    # # to use `PANN`
    # frechet = FrechetAudioDistance(
    #     model_name="pann",
    #     use_pca=False,
    #     use_activation=False,
    #     verbose=False
    # )

    background_dir = sys.argv[1]
    eval_dir = sys.argv[2]

    background_embds_path = "./ground_truth_embeddings.npy"
    eval_embds_path = "./eval_embeddings.npy"

    fad_score = frechet.score(background_dir, eval_dir,
                              background_embds_path=background_embds_path,
                              eval_embds_path=eval_embds_path, dtype="float32")

    print(fad_score)
music_evaluation/figaro/chord_recognition.py
ADDED
@@ -0,0 +1,247 @@
import numpy as np

class MIDIChord(object):
    def __init__(self, pm):
        self.pm = pm
        # define pitch classes
        self.PITCH_CLASSES = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
        # define chord maps (required)
        self.CHORD_MAPS = {'maj': [0, 4],
                           'min': [0, 3],
                           'dim': [0, 3, 6],
                           'aug': [0, 4, 8],
                           'dom7': [0, 4, 10],
                           'maj7': [0, 4, 11],
                           'min7': [0, 3, 10]}
        # define chord insiders (+10)
        self.CHORD_INSIDERS = {'maj': [7],
                               'min': [7],
                               'dim': [9],
                               'aug': [],
                               'dom7': [7],
                               'maj7': [7],
                               'min7': [7]}
        # define chord outsiders (-1)
        self.CHORD_OUTSIDERS_1 = {'maj': [2, 5, 9],
                                  'min': [2, 5, 8],
                                  'dim': [2, 5, 10],
                                  'aug': [2, 5, 9],
                                  'dom7': [2, 5, 9],
                                  'maj7': [2, 5, 9],
                                  'min7': [2, 5, 8]}
        # define chord outsiders (-2)
        self.CHORD_OUTSIDERS_2 = {'maj': [1, 3, 6, 8, 10, 11],
                                  'min': [1, 4, 6, 9, 11],
                                  'dim': [1, 4, 7, 8, 11],
                                  'aug': [1, 3, 6, 7, 10],
                                  'dom7': [1, 3, 6, 8, 11],
                                  'maj7': [1, 3, 6, 8, 10],
                                  'min7': [1, 4, 6, 9, 11]}

    def sequencing(self, chroma):
        candidates = {}
        for index in range(len(chroma)):
            if chroma[index]:
                root_note = index
                _chroma = np.roll(chroma, -root_note)
                sequence = np.where(_chroma == 1)[0]
                candidates[root_note] = list(sequence)
        return candidates

    def scoring(self, candidates):
        scores = {}
        qualities = {}
        for root_note, sequence in candidates.items():
            if 3 not in sequence and 4 not in sequence:
                scores[root_note] = -100
                qualities[root_note] = 'None'
            elif 3 in sequence and 4 in sequence:
                scores[root_note] = -100
                qualities[root_note] = 'None'
            else:
                # decide quality
                if 3 in sequence:
                    if 6 in sequence:
                        quality = 'dim'
                    else:
                        if 10 in sequence:
                            quality = 'min7'
                        else:
                            quality = 'min'
                elif 4 in sequence:
                    if 8 in sequence:
                        quality = 'aug'
                    else:
                        if 10 in sequence:
                            quality = 'dom7'
                        elif 11 in sequence:
                            quality = 'maj7'
                        else:
                            quality = 'maj'
                # decide score
                maps = self.CHORD_MAPS.get(quality)
                _notes = [n for n in sequence if n not in maps]
                score = 0
                for n in _notes:
                    if n in self.CHORD_OUTSIDERS_1.get(quality):
                        score -= 1
                    elif n in self.CHORD_OUTSIDERS_2.get(quality):
                        score -= 2
                    elif n in self.CHORD_INSIDERS.get(quality):
                        score += 10
                scores[root_note] = score
                qualities[root_note] = quality
        return scores, qualities

    def find_chord(self, chroma, threshold=10):
        chroma = np.sum(chroma, axis=1)
        chroma = np.array([1 if c > threshold else 0 for c in chroma])
        if np.sum(chroma) == 0:
            return 'N', 'N', 'N', 10
        else:
            candidates = self.sequencing(chroma=chroma)
            scores, qualities = self.scoring(candidates=candidates)
            # bass note
            sorted_notes = []
            for i, v in enumerate(chroma):
                if v > 0:
                    sorted_notes.append(int(i % 12))
            bass_note = sorted_notes[0]
            # root note
            __root_note = []
            _max = max(scores.values())
            for _root_note, score in scores.items():
                if score == _max:
                    __root_note.append(_root_note)
            if len(__root_note) == 1:
                root_note = __root_note[0]
            else:
                # TODO: what should i do
                for n in sorted_notes:
                    if n in __root_note:
                        root_note = n
                        break
            # quality
            quality = qualities.get(root_note)
            sequence = candidates.get(root_note)
            # score
            score = scores.get(root_note)
            return self.PITCH_CLASSES[root_note], quality, self.PITCH_CLASSES[bass_note], score

    def greedy(self, candidates, max_tick, min_length):
        chords = []
        # start from 0
        start_tick = 0
        while start_tick < max_tick:
            _candidates = candidates.get(start_tick)
            _candidates = sorted(_candidates.items(), key=lambda x: (x[1][-1], x[0]))
            # choose
            end_tick, (root_note, quality, bass_note, _) = _candidates[-1]
            if root_note == bass_note:
                chord = '{}:{}'.format(root_note, quality)
            else:
                chord = '{}:{}/{}'.format(root_note, quality, bass_note)
            chords.append([start_tick, end_tick, chord])
            start_tick = end_tick
        # remove :None
        temp = chords
        while ':None' in temp[0][-1]:
            try:
                temp[1][0] = temp[0][0]
                del temp[0]
            except:
                print('NO CHORD')
                return []
        temp2 = []
        for chord in temp:
            if ':None' not in chord[-1]:
                temp2.append(chord)
            else:
                temp2[-1][1] = chord[1]
        return temp2

    def dynamic(self, candidates, max_tick, min_length):
        # store index of best chord at each position
        chords = [None for i in range(max_tick + 1)]
        # store score of best chords at each position
        scores = np.zeros(max_tick + 1)
        scores[1:].fill(np.NINF)

        start_tick = 0
        while start_tick < max_tick:
            if start_tick in candidates:
                for i, (end_tick, candidate) in enumerate(candidates.get(start_tick).items()):
                    root_note, quality, bass_note, score = candidate
                    # if this candidate is best yet, update scores and chords
                    if scores[end_tick] < scores[start_tick] + score:
                        scores[end_tick] = scores[start_tick] + score
                        if root_note == bass_note:
                            chord = '{}:{}'.format(root_note, quality)
                        else:
                            chord = '{}:{}/{}'.format(root_note, quality, bass_note)
                        chords[end_tick] = (start_tick, end_tick, chord)
            start_tick += 1
        # Read the best path
        start_tick = len(chords) - 1
        results = []
        while start_tick > 0:
            chord = chords[start_tick]
            start_tick = chord[0]
            results.append(chord)

        return list(reversed(results))

    def dedupe(self, chords):
        if len(chords) == 0:
            return []
        deduped = []
        start, end, chord = chords[0]
        for (curr, next) in zip(chords[:-1], chords[1:]):
            if chord == next[2]:
                end = next[1]
            else:
                deduped.append([start, end, chord])
                start, end, chord = next
        deduped.append([start, end, chord])
        return deduped

    def get_candidates(self, chroma, max_tick, intervals=[1, 2, 3, 4]):
        candidates = {}
        for interval in intervals:
            for start_beat in range(max_tick):
                # set target pianoroll
                end_beat = start_beat + interval
                if end_beat > max_tick:
                    end_beat = max_tick
                _chroma = chroma[:, start_beat:end_beat]
                # find chord
                root_note, quality, bass_note, score = self.find_chord(chroma=_chroma)
                # save
                if start_beat not in candidates:
                    candidates[start_beat] = {}
                    candidates[start_beat][end_beat] = (root_note, quality, bass_note, score)
                else:
                    if end_beat not in candidates[start_beat]:
                        candidates[start_beat][end_beat] = (root_note, quality, bass_note, score)
        return candidates

    def extract(self):
        # read
        beats = self.pm.get_beats()
        chroma = self.pm.get_chroma(times=beats)
        # get lots of candidates
        candidates = self.get_candidates(chroma, max_tick=len(beats))

        # dynamic programming over chord candidates (the greedy variant above is kept for reference)
        chords = self.dynamic(candidates=candidates,
                              max_tick=len(beats),
                              min_length=1)
        chords = self.dedupe(chords)
        for chord in chords:
            chord[0] = beats[chord[0]]
            if chord[1] >= len(beats):
                chord[1] = self.pm.get_end_time()
            else:
                chord[1] = beats[chord[1]]
        return chords
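
A small worked example of the scoring logic above: a chroma window whose active pitch classes are {C, E, G} yields root C with quality `maj`, because root 0 gives intervals [0, 4, 7] (7 is a "chord insider", +10), which outscores every other candidate root:

```python
# Toy check of MIDIChord.find_chord; find_chord() never touches self.pm,
# so passing pm=None is safe here.
import numpy as np
from chord_recognition import MIDIChord

chroma = np.zeros((12, 12))
chroma[[0, 4, 7], :] = 1          # C, E, G active on all 12 beats (row sums exceed threshold=10)
rec = MIDIChord(pm=None)
print(rec.find_chord(chroma))     # ('C', 'maj', 'C', 10)
```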
music_evaluation/figaro/constants.py
ADDED
@@ -0,0 +1,47 @@
import numpy as np

# parameters for input representation
DEFAULT_POS_PER_QUARTER = 12
DEFAULT_VELOCITY_BINS = np.linspace(0, 128, 32+1, dtype=np.int32)
DEFAULT_DURATION_BINS = np.sort(np.concatenate([
    np.arange(1, 13),             # smallest possible units up to 1 quarter
    np.arange(12, 24, 3)[1:],     # 16th notes up to 1 bar
    np.arange(13, 24, 4)[1:],     # triplets up to 1 bar
    np.arange(24, 48, 6),         # 8th notes up to 2 bars
    np.arange(48, 4*48, 12),      # quarter notes up to 8 bars
    np.arange(4*48, 16*48+1, 24)  # half notes up to 16 bars
]))
DEFAULT_TEMPO_BINS = np.linspace(0, 240, 32+1, dtype=np.int32)
DEFAULT_NOTE_DENSITY_BINS = np.linspace(0, 12, 32+1)
DEFAULT_MEAN_VELOCITY_BINS = np.linspace(0, 128, 32+1)
DEFAULT_MEAN_PITCH_BINS = np.linspace(0, 128, 32+1)
DEFAULT_MEAN_DURATION_BINS = np.logspace(0, 7, 32+1, base=2)  # log space between 1 and 128 positions (~2.5 bars)

# parameters for output
DEFAULT_RESOLUTION = 480

# maximum length of a single bar is 3*4 = 12 beats
MAX_BAR_LENGTH = 3
# maximum number of bars in a piece is 512 (this covers almost all sequences)
MAX_N_BARS = 512

PAD_TOKEN = '<pad>'
UNK_TOKEN = '<unk>'
BOS_TOKEN = '<bos>'
EOS_TOKEN = '<eos>'
MASK_TOKEN = '<mask>'

TIME_SIGNATURE_KEY = 'Time Signature'
BAR_KEY = 'Bar'
POSITION_KEY = 'Position'
INSTRUMENT_KEY = 'Instrument'
PITCH_KEY = 'Pitch'
VELOCITY_KEY = 'Velocity'
DURATION_KEY = 'Duration'
TEMPO_KEY = 'Tempo'
CHORD_KEY = 'Chord'

NOTE_DENSITY_KEY = 'Note Density'
MEAN_PITCH_KEY = 'Mean Pitch'
MEAN_VELOCITY_KEY = 'Mean Velocity'
MEAN_DURATION_KEY = 'Mean Duration'
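
One plausible way these bin tables get consumed (an illustration under that assumption, not a quote from this repo's tokenizer) is nearest-bin quantization of raw values into token indices:

```python
# Hypothetical sketch: snap a raw value to its nearest bin index.
import numpy as np
from constants import DEFAULT_DURATION_BINS, DEFAULT_TEMPO_BINS

def to_bin(value, bins):
    # index of the closest bin center
    return int(np.argmin(np.abs(bins - value)))

print(to_bin(17, DEFAULT_DURATION_BINS))  # nearest duration bin index
print(to_bin(121, DEFAULT_TEMPO_BINS))    # nearest tempo bin index (~120 BPM)
```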
music_evaluation/figaro/evaluate.py
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os, glob
|
2 |
+
from statistics import NormalDist
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
import input_representation as ir
|
7 |
+
|
8 |
+
SAMPLE_DIR = os.getenv('SAMPLE_DIR', './samples')
|
9 |
+
OUT_FILE = os.getenv('OUT_FILE', './metrics.csv')
|
10 |
+
MAX_SAMPLES = int(os.getenv('MAX_SAMPLES', 1024))
|
11 |
+
# use to find base file name when generate multiple files for 1 gt file
|
12 |
+
SPLIT_STR = os.getenv('SPLIT_STR', None)
|
13 |
+
POST_STR = os.getenv('POST_STR', None)
|
14 |
+
|
15 |
+
METRICS = [
|
16 |
+
'inst_prec', 'inst_rec', 'inst_f1',
|
17 |
+
'chord_prec', 'chord_rec', 'chord_f1',
|
18 |
+
'time_sig_acc',
|
19 |
+
'note_dens_oa', 'pitch_oa', 'velocity_oa', 'duration_oa',
|
20 |
+
'chroma_crossent', 'chroma_kldiv', 'chroma_sim',
|
21 |
+
'groove_crossent', 'groove_kldiv', 'groove_sim',
|
22 |
+
]
|
23 |
+
|
24 |
+
DF_KEYS = ['id', 'original', 'sample'] + METRICS
|
25 |
+
|
26 |
+
keys = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
|
27 |
+
qualities = ['maj', 'min', 'dim', 'aug', 'dom7', 'maj7', 'min7', 'None']
|
28 |
+
CHORDS = [f"{k}:{q}" for k in keys for q in qualities] + ['N:N']
|
29 |
+
|
30 |
+
def get_group_id(file):
|
31 |
+
# change this depending on name of generated samples
|
32 |
+
name = os.path.basename(file)
|
33 |
+
return name.split('.')[0]
|
34 |
+
|
35 |
+
def get_base_name(file):
|
36 |
+
base_file_name = os.path.basename(file)
|
37 |
+
base_name = base_file_name.split(SPLIT_STR)[0]
|
38 |
+
if POST_STR is not None:
|
39 |
+
gt_name = base_name + POST_STR
|
40 |
+
else:
|
41 |
+
gt_name = base_name
|
42 |
+
return gt_name
|
43 |
+
|
44 |
+
def get_file_groups(path, max_samples=MAX_SAMPLES):
|
45 |
+
# change this depending on file structure of generated samples
|
46 |
+
files = glob.glob(os.path.join(path, '*.mid'), recursive=True) + glob.glob(os.path.join(path, '*.midi'), recursive=True)
|
47 |
+
assert len(files), f"provided directory was empty: {path}"
|
48 |
+
|
49 |
+
samples = sorted(files)
|
50 |
+
origs = sorted([os.path.join(path, 'gt', get_base_name(file)) for file in files])
|
51 |
+
pairs = list(zip(origs, samples))
|
52 |
+
|
53 |
+
pairs = list(filter(lambda pair: os.path.exists(pair[0]), pairs))
|
54 |
+
if max_samples > 0:
|
55 |
+
pairs = pairs[:max_samples]
|
56 |
+
|
57 |
+
groups = dict()
|
58 |
+
for orig, sample in pairs:
|
59 |
+
sample_id = get_group_id(sample)
|
60 |
+
orig_id = get_group_id(orig)
|
61 |
+
if orig_id not in groups:
|
62 |
+
groups[orig_id] = list()
|
63 |
+
groups[orig_id].append((orig, sample))
|
64 |
+
|
65 |
+
return list(groups.values())
|
66 |
+
|
67 |
+
def read_file(file):
|
68 |
+
with open(file, 'r') as f:
|
69 |
+
events = f.read().split('\n')
|
70 |
+
events = [e for e in events if e]
|
71 |
+
return events
|
72 |
+
|
73 |
+
def get_chord_groups(desc):
|
74 |
+
bars = [1 if 'Bar_' in item else 0 for item in desc]
|
75 |
+
bar_ids = np.cumsum(bars) - 1
|
76 |
+
groups = [[] for _ in range(bar_ids[-1] + 1)]
|
77 |
+
for i, item in enumerate(desc):
|
78 |
+
if 'Chord_' in item:
|
79 |
+
chord = item.split('_')[-1]
|
80 |
+
groups[bar_ids[i]].append(chord)
|
81 |
+
return groups
|
82 |
+
|
83 |
+
def instruments(events):
|
84 |
+
insts = [128 if item.instrument == 'drum' else int(item.instrument) for item in events[1:-1] if item.name == 'Note']
|
85 |
+
insts = np.bincount(insts, minlength=129)
|
86 |
+
return (insts > 0).astype(int)
|
87 |
+
|
88 |
+
def chords(events):
|
89 |
+
chords = [CHORDS.index(item) for item in events]
|
90 |
+
chords = np.bincount(chords, minlength=129)
|
91 |
+
return (chords > 0).astype(int)
|
92 |
+
|
93 |
+
def chroma(events):
|
94 |
+
pitch_classes = [item.pitch % 12 for item in events[1:-1] if item.name == 'Note' and item.instrument != 'drum']
|
95 |
+
if len(pitch_classes):
|
96 |
+
count = np.bincount(pitch_classes, minlength=12)
|
97 |
+
count = count / np.sqrt(np.sum(count ** 2))
|
98 |
+
else:
|
99 |
+
count = np.array([1/12] * 12)
|
100 |
+
return count
|
101 |
+
|
102 |
+
def groove(events, start=0, pos_per_bar=48, ticks_per_bar=1920):
|
103 |
+
flags = np.linspace(start, start + ticks_per_bar, pos_per_bar, endpoint=False)
|
104 |
+
onsets = [item.start for item in events[1:-1] if item.name == 'Note']
|
105 |
+
positions = [np.argmin(np.abs(flags - beat)) for beat in onsets]
|
106 |
+
if len(positions):
|
107 |
+
count = np.bincount(positions, minlength=pos_per_bar)
|
108 |
+
count = np.convolve(count, [1, 4, 1], 'same')
|
109 |
+
count = count / np.sqrt(np.sum(count ** 2))
|
110 |
+
else:
|
111 |
+
count = np.array([1/pos_per_bar] * pos_per_bar)
|
112 |
+
return count
|
113 |
+
|
114 |
+
def multi_class_accuracy(y_true, y_pred):
|
115 |
+
tp = ((y_true == 1) & (y_pred == 1)).sum()
|
116 |
+
p = tp / y_pred.sum()
|
117 |
+
r = tp / y_true.sum()
|
118 |
+
if p + r > 0:
|
119 |
+
f1 = 2*p*r / (p + r)
|
120 |
+
else:
|
121 |
+
f1 = 0
|
122 |
+
return p, r, f1
|
123 |
+
|
124 |
+
def cross_entropy(p_true, p_pred, eps=1e-8):
|
125 |
+
return -np.sum(p_true * np.log(p_pred + eps)) / len(p_true)
|
126 |
+
|
127 |
+
def kl_divergence(p_true, p_pred, eps=1e-8):
|
128 |
+
return np.sum(p_true * (np.log(p_true + eps) - np.log(p_pred + eps))) / len(p_true)
|
129 |
+
|
130 |
+
def cosine_sim(p_true, p_pred):
|
131 |
+
return np.sum(p_true * p_pred)
|
132 |
+
|
133 |
+
def sliding_window_metrics(items, start, end, window=1920, step=480, ticks_per_beat=480):
|
134 |
+
glob_start, glob_end = start, end
|
135 |
+
notes = [item for item in items if item.name == 'Note']
|
136 |
+
starts = np.arange(glob_start, glob_end - window, step=step)
|
137 |
+
|
138 |
+
groups = []
|
139 |
+
start_idx, end_idx = 0, 0
|
140 |
+
for start in starts:
|
141 |
+
while notes[start_idx].start < start:
|
142 |
+
start_idx += 1
|
143 |
+
while end_idx < len(notes) and notes[end_idx].start < start + window:
|
144 |
+
end_idx += 1
|
145 |
+
|
146 |
+
groups.append([start] + notes[start_idx:end_idx] + [start + window])
|
147 |
+
return groups
|
148 |
+
|
149 |
+
def meta_stats(group, ticks_per_beat=480):
|
150 |
+
start, end = group[0], group[-1]
|
151 |
+
ns = [item for item in group[1:-1] if item.name == 'Note']
|
152 |
+
ns_ = [note for note in ns if note.instrument != 'drum']
|
153 |
+
pitches = [note.pitch for note in ns_]
|
154 |
+
vels = [note.velocity for note in ns_]
|
155 |
+
durs = [(note.end - note.start) / ticks_per_beat for note in ns_]
|
156 |
+
|
157 |
+
return {
|
158 |
+
'note_density': len(ns) / ((end - start) / ticks_per_beat),
|
159 |
+
'pitch_mean': np.mean(pitches) if len(pitches) else np.nan,
|
160 |
+
'velocity_mean': np.mean(vels) if len(vels) else np.nan,
|
161 |
+
'duration_mean': np.mean(durs) if len(durs) else np.nan,
|
162 |
+
'pitch_std': np.std(pitches) if len(pitches) else np.nan,
|
163 |
+
'velocity_std': np.std(vels) if len(vels) else np.nan,
|
164 |
+
'duration_std': np.std(durs) if len(durs) else np.nan,
|
165 |
+
}
|
166 |
+
|
167 |
+
def overlapping_area(mu1, sigma1, mu2, sigma2, eps=0.01):
|
168 |
+
sigma1, sigma2 = max(eps, sigma1), max(eps, sigma2)
|
169 |
+
return NormalDist(mu=mu1, sigma=sigma1).overlap(NormalDist(mu=mu2, sigma=sigma2))
|
170 |
+
|
171 |
+
|
172 |
+
|
173 |
+
def main():
|
174 |
+
file_groups = get_file_groups(SAMPLE_DIR)
|
175 |
+
|
176 |
+
metrics = pd.DataFrame()
|
177 |
+
for sample_id, group in enumerate(file_groups):
|
178 |
+
print(f"[info] Group {sample_id + 1}/{len(file_groups)}")
|
179 |
+
micro_metrics = pd.DataFrame()
|
180 |
+
for orig_file, sample_file in group:
|
181 |
+
print(f"original: {orig_file.split('/')[-1]} | sample: {sample_file.split('/')[-1]}")
|
182 |
+
orig = ir.InputRepresentation(orig_file)
|
183 |
+
sample = ir.InputRepresentation(sample_file)
|
184 |
+
|
185 |
+
orig_desc, sample_desc = orig.get_description(), sample.get_description()
|
186 |
+
if len(orig_desc) == 0 or len(sample_desc) == 0:
|
187 |
+
print("[warning] empty sample! skipping")
|
188 |
+
continue
|
189 |
+
|
190 |
+
chord_groups1 = get_chord_groups(orig_desc)
|
191 |
+
chord_groups2 = get_chord_groups(sample_desc)
|
192 |
+
|
193 |
+
note_density_gt = []
|
194 |
+
|
195 |
+
for g1, g2, cg1, cg2 in zip(orig.groups, sample.groups, chord_groups1, chord_groups2):
|
196 |
+
            row = pd.DataFrame([{'id': sample_id, 'original': orig_file.split('/')[-1], 'sample': sample_file.split('/')[-1]}])

            meta1, meta2 = meta_stats(g1, ticks_per_beat=orig.pm.resolution), meta_stats(g2, ticks_per_beat=sample.pm.resolution)
            row['pitch_oa'] = overlapping_area(meta1['pitch_mean'], meta1['pitch_std'], meta2['pitch_mean'], meta2['pitch_std'])
            row['velocity_oa'] = overlapping_area(meta1['velocity_mean'], meta1['velocity_std'], meta2['velocity_mean'], meta2['velocity_std'])
            row['duration_oa'] = overlapping_area(meta1['duration_mean'], meta1['duration_std'], meta2['duration_mean'], meta2['duration_std'])
            row['note_density_abs_err'] = np.abs(meta1['note_density'] - meta2['note_density'])
            row['mean_pitch_abs_err'] = np.abs(meta1['pitch_mean'] - meta2['pitch_mean'])
            row['mean_velocity_abs_err'] = np.abs(meta1['velocity_mean'] - meta2['velocity_mean'])
            row['mean_duration_abs_err'] = np.abs(meta1['duration_mean'] - meta2['duration_mean'])
            note_density_gt.append(meta1['note_density'])

            ts1, ts2 = orig._get_time_signature(g1[0]), sample._get_time_signature(g2[0])
            ts1, ts2 = f"{ts1.numerator}/{ts1.denominator}", f"{ts2.numerator}/{ts2.denominator}"
            row['time_sig_acc'] = 1 if ts1 == ts2 else 0

            inst1, inst2 = instruments(g1), instruments(g2)
            prec, rec, f1 = multi_class_accuracy(inst1, inst2)
            row['inst_prec'] = prec
            row['inst_rec'] = rec
            row['inst_f1'] = f1

            chords1, chords2 = chords(cg1), chords(cg2)
            prec, rec, f1 = multi_class_accuracy(chords1, chords2)
            row['chord_prec'] = prec
            row['chord_rec'] = rec
            row['chord_f1'] = f1

            c1, c2 = chroma(g1), chroma(g2)
            row['chroma_crossent'] = cross_entropy(c1, c2)
            row['chroma_kldiv'] = kl_divergence(c1, c2)
            row['chroma_sim'] = cosine_sim(c1, c2)

            ppb = max(orig._get_positions_per_bar(g1[0]), sample._get_positions_per_bar(g2[0]))
            tpb = max(orig._get_ticks_per_bar(g1[0]), sample._get_ticks_per_bar(g2[0]))
            r1 = groove(g1, start=g1[0], pos_per_bar=ppb, ticks_per_bar=tpb)
            r2 = groove(g2, start=g2[0], pos_per_bar=ppb, ticks_per_bar=tpb)
            row['groove_crossent'] = cross_entropy(r1, r2)
            row['groove_kldiv'] = kl_divergence(r1, r2)
            row['groove_sim'] = cosine_sim(r1, r2)

            micro_metrics = pd.concat([micro_metrics, row], ignore_index=True)
        if len(micro_metrics) == 0:
            continue

        nd_mean = np.mean(note_density_gt)
        micro_metrics['note_density_nsq_err'] = micro_metrics['note_density_abs_err']**2 / nd_mean**2

        metrics = pd.concat([metrics, micro_metrics], ignore_index=True)

        micro_avg = micro_metrics.mean(numeric_only=True)
        print("[info] Group {}: inst_f1={:.2f} | chord_f1={:.2f} | pitch_oa={:.2f} | vel_oa={:.2f} | dur_oa={:.2f} | chroma_sim={:.2f} | groove_sim={:.2f}".format(
            sample_id + 1, micro_avg['inst_f1'], micro_avg['chord_f1'], micro_avg['pitch_oa'], micro_avg['velocity_oa'], micro_avg['duration_oa'], micro_avg['chroma_sim'], micro_avg['groove_sim']
        ))

    os.makedirs(os.path.dirname(OUT_FILE), exist_ok=True)
    # metrics.to_csv(OUT_FILE, index=False)

    summary_keys = ['inst_f1', 'chord_f1', 'time_sig_acc', 'pitch_oa', 'velocity_oa', 'duration_oa', 'chroma_sim', 'groove_sim']
    summary = metrics[summary_keys + ['id']].groupby('id').mean().mean()

    nsq_err = metrics.groupby('id')['note_density_nsq_err'].mean()
    summary['note_density_nrmse'] = np.sqrt(nsq_err).mean()

    print('***** SUMMARY *****')
    print(summary)

    summary.to_frame().T.to_csv(OUT_FILE, index=False)

    print("done")

if __name__ == '__main__':
    main()
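For context, `overlapping_area` (defined earlier in `evaluate.py`, outside this hunk) scores how well two per-bar feature distributions match. A minimal sketch of the usual definition, the shared area under two Gaussians fitted to the ground-truth and generated features, is given below; the closed form is an assumption for illustration, not necessarily the exact implementation in this file.

```python
import numpy as np
from scipy.stats import norm

def overlapping_area_sketch(mu1, std1, mu2, std2):
    """Hypothetical stand-in for overlapping_area(): the area shared by
    N(mu1, std1^2) and N(mu2, std2^2)."""
    if mu2 < mu1:  # order the two Gaussians by mean
        mu1, std1, mu2, std2 = mu2, std2, mu1, std1
    std1, std2 = max(std1, 1e-6), max(std2, 1e-6)  # guard against zero spread
    if std1 == std2:
        c = (mu1 + mu2) / 2  # equal spreads: densities cross at the midpoint
    else:
        # solve for the crossing point of the two density functions
        a = 1 / std1**2 - 1 / std2**2
        b = 2 * (mu2 / std2**2 - mu1 / std1**2)
        d = (mu1 / std1)**2 - (mu2 / std2)**2 - 2 * np.log(std2 / std1)
        roots = np.roots([a, b, d])
        c = roots[np.argmin(np.abs(roots - (mu1 + mu2) / 2))].real
    # overlap = right tail of the lower Gaussian + left tail of the upper one
    return norm.sf(c, mu1, std1) + norm.cdf(c, mu2, std2)

print(overlapping_area_sketch(60, 5, 60, 5))  # 1.0: identical distributions
print(overlapping_area_sketch(60, 5, 72, 5))  # well below 1: little overlap
```

Identical distributions give an overlap of 1 and the score decays toward 0 as they drift apart, which is why higher `pitch_oa`, `velocity_oa`, and `duration_oa` values indicate a closer match.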
music_evaluation/figaro/input_representation.py
ADDED
@@ -0,0 +1,655 @@
from chord_recognition import MIDIChord
import numpy as np
import pretty_midi

from vocab import RemiVocab

from constants import (
    EOS_TOKEN,
    # vocab keys
    TIME_SIGNATURE_KEY,
    BAR_KEY,
    POSITION_KEY,
    INSTRUMENT_KEY,
    PITCH_KEY,
    VELOCITY_KEY,
    DURATION_KEY,
    TEMPO_KEY,
    CHORD_KEY,
    NOTE_DENSITY_KEY,
    MEAN_PITCH_KEY,
    MEAN_VELOCITY_KEY,
    MEAN_DURATION_KEY,
    # discretization parameters
    DEFAULT_POS_PER_QUARTER,
    DEFAULT_VELOCITY_BINS,
    DEFAULT_DURATION_BINS,
    DEFAULT_TEMPO_BINS,
    DEFAULT_NOTE_DENSITY_BINS,
    DEFAULT_MEAN_VELOCITY_BINS,
    DEFAULT_MEAN_PITCH_BINS,
    DEFAULT_MEAN_DURATION_BINS,
    DEFAULT_RESOLUTION
)

# define "Item" for general storage
class Item(object):
    def __init__(self, name, start, end, velocity=None, pitch=None, instrument=None):
        self.name = name
        self.start = start
        self.end = end
        self.velocity = velocity
        self.pitch = pitch
        self.instrument = instrument

    def __repr__(self):
        return 'Item(name={}, start={}, end={}, velocity={}, pitch={}, instrument={})'.format(
            self.name, self.start, self.end, self.velocity, self.pitch, self.instrument)

# define "Event" for event storage
class Event(object):
    def __init__(self, name, time, value, text):
        self.name = name
        self.time = time
        self.value = value
        self.text = text

    def __repr__(self):
        return 'Event(name={}, time={}, value={}, text={})'.format(
            self.name, self.time, self.value, self.text)

class InputRepresentation():
    @staticmethod
    def version():
        return 'v4'

    def __init__(self, file, do_extract_chords=True, strict=False):
        if isinstance(file, pretty_midi.PrettyMIDI):
            self.pm = file
        else:
            self.pm = pretty_midi.PrettyMIDI(file)

        if strict and len(self.pm.time_signature_changes) == 0:
            raise ValueError("Invalid MIDI file: No time signature defined")

        self.resolution = self.pm.resolution

        self.note_items = None
        self.tempo_items = None
        self.chords = None
        self.groups = None

        self._read_items()
        self._quantize_items()
        if do_extract_chords:
            self.extract_chords()
        self._group_items()

        if strict and len(self.note_items) == 0:
            raise ValueError("Invalid MIDI file: No notes found, empty file.")

    # read notes and tempo changes from midi (assume there is only one track)
    def _read_items(self):
        # notes
        self.note_items = []
        for instrument in self.pm.instruments:
            pedal_events = [event for event in instrument.control_changes if event.number == 64]
            pedal_pressed = False
            start = None
            pedals = []
            for e in pedal_events:
                if e.value >= 64 and not pedal_pressed:
                    pedal_pressed = True
                    start = e.time
                elif e.value < 64 and pedal_pressed:
                    pedal_pressed = False
                    pedals.append(Item(name='Pedal', start=start, end=e.time))
                    start = e.time

            notes = instrument.notes
            notes.sort(key=lambda x: (x.start, x.pitch))

            if instrument.is_drum:
                instrument_name = 'drum'
            else:
                instrument_name = instrument.program

            pedal_idx = 0
            for note in notes:
                pedal_candidates = [(i + pedal_idx, pedal) for i, pedal in enumerate(pedals[pedal_idx:]) if note.end >= pedal.start and note.start < pedal.end]
                if len(pedal_candidates) > 0:
                    pedal_idx = pedal_candidates[0][0]
                    pedal = pedal_candidates[-1][1]
                else:
                    pedal = Item(name='Pedal', start=0, end=0)

                self.note_items.append(Item(
                    name='Note',
                    start=self.pm.time_to_tick(note.start),
                    end=self.pm.time_to_tick(max(note.end, pedal.end)),
                    velocity=note.velocity,
                    pitch=note.pitch,
                    instrument=instrument_name))
        self.note_items.sort(key=lambda x: (x.start, x.pitch))
        # tempo
        self.tempo_items = []
        times, tempi = self.pm.get_tempo_changes()
        for time, tempo in zip(times, tempi):
            self.tempo_items.append(Item(
                name='Tempo',
                start=time,
                end=None,
                velocity=None,
                pitch=int(tempo)))
        self.tempo_items.sort(key=lambda x: x.start)
        # expand to all beats
        max_tick = self.pm.time_to_tick(self.pm.get_end_time())
        existing_ticks = {item.start: item.pitch for item in self.tempo_items}
        wanted_ticks = np.arange(0, max_tick + 1, DEFAULT_RESOLUTION)
        output = []
        for tick in wanted_ticks:
            if tick in existing_ticks:
                output.append(Item(
                    name='Tempo',
                    start=self.pm.time_to_tick(tick),
                    end=None,
                    velocity=None,
                    pitch=existing_ticks[tick]))
            else:
                output.append(Item(
                    name='Tempo',
                    start=self.pm.time_to_tick(tick),
                    end=None,
                    velocity=None,
                    pitch=output[-1].pitch))
        self.tempo_items = output

    # quantize items
    def _quantize_items(self):
        ticks = self.resolution / DEFAULT_POS_PER_QUARTER
        # grid
        end_tick = self.pm.time_to_tick(self.pm.get_end_time())
        grids = np.arange(0, max(self.resolution, end_tick), ticks)
        # process
        for item in self.note_items:
            index = np.searchsorted(grids, item.start, side='right')
            if index > 0:
                index -= 1
            shift = round(grids[index]) - item.start
            item.start += shift
            item.end += shift
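As an aside, the grid arithmetic in `_quantize_items` above can be checked in isolation. A minimal sketch with invented numbers (480 ticks per quarter note, `DEFAULT_POS_PER_QUARTER = 12`, so a 40-tick grid): each note start is snapped to the grid point at or before it, and the whole note is shifted rigidly.

```python
import numpy as np

# Illustrative values only: 480 ticks per quarter, 12 grid positions per quarter.
resolution, pos_per_quarter = 480, 12
step = resolution / pos_per_quarter          # 40 ticks between grid points
grids = np.arange(0, 4 * resolution, step)   # grid covering one 4/4 bar

start = 1735                                 # raw note-on tick
index = np.searchsorted(grids, start, side='right') - 1
shift = round(grids[index]) - start          # grids[index] == 1720, so shift == -15
print(start + shift)                         # 1720: snapped to the preceding grid point
```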
    def get_end_tick(self):
        return self.pm.time_to_tick(self.pm.get_end_time())

    # extract chords
    def extract_chords(self):
        end_tick = self.pm.time_to_tick(self.pm.get_end_time())
        if end_tick < self.resolution:
            # If the sequence is shorter than a quarter note, it's probably empty
            self.chords = []
            return self.chords
        method = MIDIChord(self.pm)
        chords = method.extract()
        output = []
        for chord in chords:
            output.append(Item(
                name='Chord',
                start=self.pm.time_to_tick(chord[0]),
                end=self.pm.time_to_tick(chord[1]),
                velocity=None,
                pitch=chord[2].split('/')[0]))
        if len(output) == 0 or output[0].start > 0:
            if len(output) == 0:
                end = self.pm.time_to_tick(self.pm.get_end_time())
            else:
                end = output[0].start
            output.append(Item(
                name='Chord',
                start=0,
                end=end,
                velocity=None,
                pitch='N:N'
            ))
        self.chords = output
        return self.chords

    # group items
    def _group_items(self):
        if self.chords:
            items = self.chords + self.tempo_items + self.note_items
        else:
            items = self.tempo_items + self.note_items

        def _get_key(item):
            type_priority = {
                'Chord': 0,
                'Tempo': 1,
                'Note': 2
            }
            return (
                item.start,  # order by time
                type_priority[item.name],  # chord events first, then tempo events, then note events
                -1 if item.instrument == 'drum' else item.instrument,  # order by instrument
                item.pitch  # order by note pitch
            )

        items.sort(key=_get_key)
        downbeats = self.pm.get_downbeats()
        downbeats = np.concatenate([downbeats, [self.pm.get_end_time()]])
        self.groups = []
        for db1, db2 in zip(downbeats[:-1], downbeats[1:]):
            db1, db2 = self.pm.time_to_tick(db1), self.pm.time_to_tick(db2)
            insiders = []
            for item in items:
                if (item.start >= db1) and (item.start < db2):
                    insiders.append(item)
            overall = [db1] + insiders + [db2]
            self.groups.append(overall)

        # Trim empty groups from the beginning and end
        for idx in [0, -1]:
            while len(self.groups) > 0:
                group = self.groups[idx]
                notes = [item for item in group[1:-1] if item.name == 'Note']
                if len(notes) == 0:
                    self.groups.pop(idx)
                else:
                    break

        return self.groups

    def _get_time_signature(self, start):
        # This method assumes that time signature changes don't happen within a bar,
        # which is a convention that commonly holds
        time_sig = None
        for curr_sig, next_sig in zip(self.pm.time_signature_changes[:-1], self.pm.time_signature_changes[1:]):
            if self.pm.time_to_tick(curr_sig.time) <= start and self.pm.time_to_tick(next_sig.time) > start:
                time_sig = curr_sig
                break
        if time_sig is None:
            time_sig = self.pm.time_signature_changes[-1]
        return time_sig

    def _get_ticks_per_bar(self, start):
        time_sig = self._get_time_signature(start)
        quarters_per_bar = 4 * time_sig.numerator / time_sig.denominator
        return self.pm.resolution * quarters_per_bar

    def _get_positions_per_bar(self, start=None, time_sig=None):
        if time_sig is None:
            time_sig = self._get_time_signature(start)
        quarters_per_bar = 4 * time_sig.numerator / time_sig.denominator
        positions_per_bar = int(DEFAULT_POS_PER_QUARTER * quarters_per_bar)
        return positions_per_bar

    def tick_to_position(self, tick):
        return round(tick / self.pm.resolution * DEFAULT_POS_PER_QUARTER)

    # item to event
    def get_remi_events(self):
        events = []
        n_downbeat = 0
        current_chord = None
        current_tempo = None
        for i in range(len(self.groups)):
            bar_st, bar_et = self.groups[i][0], self.groups[i][-1]
            n_downbeat += 1
            positions_per_bar = self._get_positions_per_bar(bar_st)
            if positions_per_bar <= 0:
                raise ValueError('Invalid REMI file: There must be at least 1 position per bar.')

            events.append(Event(
                name=BAR_KEY,
                time=None,
                value='{}'.format(n_downbeat),
                text='{}'.format(n_downbeat)))

            time_sig = self._get_time_signature(bar_st)
            events.append(Event(
                name=TIME_SIGNATURE_KEY,
                time=None,
                value='{}/{}'.format(time_sig.numerator, time_sig.denominator),
                text='{}/{}'.format(time_sig.numerator, time_sig.denominator)
            ))

            if current_chord is not None:
                events.append(Event(
                    name=POSITION_KEY,
                    time=0,
                    value='{}'.format(0),
                    text='{}/{}'.format(1, positions_per_bar)))
                events.append(Event(
                    name=CHORD_KEY,
                    time=current_chord.start,
                    value=current_chord.pitch,
                    text='{}'.format(current_chord.pitch)))

            if current_tempo is not None:
                events.append(Event(
                    name=POSITION_KEY,
                    time=0,
                    value='{}'.format(0),
                    text='{}/{}'.format(1, positions_per_bar)))
                tempo = current_tempo.pitch
                index = np.argmin(abs(DEFAULT_TEMPO_BINS - tempo))
                events.append(Event(
                    name=TEMPO_KEY,
                    time=current_tempo.start,
                    value=index,
                    text='{}/{}'.format(tempo, DEFAULT_TEMPO_BINS[index])))

            quarters_per_bar = 4 * time_sig.numerator / time_sig.denominator
            ticks_per_bar = self.pm.resolution * quarters_per_bar
            flags = np.linspace(bar_st, bar_st + ticks_per_bar, positions_per_bar, endpoint=False)
            for item in self.groups[i][1:-1]:
                # position
                index = np.argmin(abs(flags - item.start))
                pos_event = Event(
                    name=POSITION_KEY,
                    time=item.start,
                    value='{}'.format(index),
                    text='{}/{}'.format(index + 1, positions_per_bar))

                if item.name == 'Note':
                    events.append(pos_event)
                    # instrument
                    if item.instrument == 'drum':
                        name = 'drum'
                    else:
                        name = pretty_midi.program_to_instrument_name(item.instrument)
                    events.append(Event(
                        name=INSTRUMENT_KEY,
                        time=item.start,
                        value=name,
                        text='{}'.format(name)))
                    # pitch
                    events.append(Event(
                        name=PITCH_KEY,
                        time=item.start,
                        value='drum_{}'.format(item.pitch) if name == 'drum' else item.pitch,
                        text='{}'.format(pretty_midi.note_number_to_name(item.pitch))))
                    # velocity
                    velocity_index = np.argmin(abs(DEFAULT_VELOCITY_BINS - item.velocity))
                    events.append(Event(
                        name=VELOCITY_KEY,
                        time=item.start,
                        value=velocity_index,
                        text='{}/{}'.format(item.velocity, DEFAULT_VELOCITY_BINS[velocity_index])))
                    # duration
                    duration = self.tick_to_position(item.end - item.start)
                    index = np.argmin(abs(DEFAULT_DURATION_BINS - duration))
                    events.append(Event(
                        name=DURATION_KEY,
                        time=item.start,
                        value=index,
                        text='{}/{}'.format(duration, DEFAULT_DURATION_BINS[index])))
                elif item.name == 'Chord':
                    if current_chord is None or item.pitch != current_chord.pitch:
                        events.append(pos_event)
                        events.append(Event(
                            name=CHORD_KEY,
                            time=item.start,
                            value=item.pitch,
                            text='{}'.format(item.pitch)))
                        current_chord = item
                elif item.name == 'Tempo':
                    if current_tempo is None or item.pitch != current_tempo.pitch:
                        events.append(pos_event)
                        tempo = item.pitch
                        index = np.argmin(abs(DEFAULT_TEMPO_BINS - tempo))
                        events.append(Event(
                            name=TEMPO_KEY,
                            time=item.start,
                            value=index,
                            text='{}/{}'.format(tempo, DEFAULT_TEMPO_BINS[index])))
                        current_tempo = item

        return [f'{e.name}_{e.value}' for e in events]

    def get_description(self,
                        omit_time_sig=False,
                        omit_instruments=False,
                        omit_chords=False,
                        omit_meta=False):
        events = []
        n_downbeat = 0
        current_chord = None

        for i in range(len(self.groups)):
            bar_st, bar_et = self.groups[i][0], self.groups[i][-1]
            n_downbeat += 1
            time_sig = self._get_time_signature(bar_st)
            positions_per_bar = self._get_positions_per_bar(time_sig=time_sig)
            if positions_per_bar <= 0:
                raise ValueError('Invalid REMI file: There must be at least 1 position in each bar.')

            events.append(Event(
                name=BAR_KEY,
                time=None,
                value='{}'.format(n_downbeat),
                text='{}'.format(n_downbeat)))

            if not omit_time_sig:
                events.append(Event(
                    name=TIME_SIGNATURE_KEY,
                    time=None,
                    value='{}/{}'.format(time_sig.numerator, time_sig.denominator),
                    text='{}/{}'.format(time_sig.numerator, time_sig.denominator),
                ))

            # hoisted out of the meta block so the instrument block below
            # also works when omit_meta is True
            notes = [item for item in self.groups[i][1:-1] if item.name == 'Note']

            if not omit_meta:
                n_notes = len(notes)
                velocities = np.array([item.velocity for item in notes])
                pitches = np.array([item.pitch for item in notes])
                durations = np.array([item.end - item.start for item in notes])

                note_density = n_notes / positions_per_bar
                index = np.argmin(abs(DEFAULT_NOTE_DENSITY_BINS - note_density))
                events.append(Event(
                    name=NOTE_DENSITY_KEY,
                    time=None,
                    value=index,
                    text='{:.2f}/{:.2f}'.format(note_density, DEFAULT_NOTE_DENSITY_BINS[index])
                ))

                # will be NaN if there are no notes
                mean_velocity = velocities.mean() if len(velocities) > 0 else np.nan
                index = np.argmin(abs(DEFAULT_MEAN_VELOCITY_BINS - mean_velocity))
                events.append(Event(
                    name=MEAN_VELOCITY_KEY,
                    time=None,
                    value=index if not np.isnan(mean_velocity) else 'NaN',
                    text='{:.2f}/{:.2f}'.format(mean_velocity, DEFAULT_MEAN_VELOCITY_BINS[index])
                ))

                # will be NaN if there are no notes
                mean_pitch = pitches.mean() if len(pitches) > 0 else np.nan
                index = np.argmin(abs(DEFAULT_MEAN_PITCH_BINS - mean_pitch))
                events.append(Event(
                    name=MEAN_PITCH_KEY,
                    time=None,
                    value=index if not np.isnan(mean_pitch) else 'NaN',
                    text='{:.2f}/{:.2f}'.format(mean_pitch, DEFAULT_MEAN_PITCH_BINS[index])
                ))

                # will be NaN if there are no notes
                mean_duration = durations.mean() if len(durations) > 0 else np.nan
                index = np.argmin(abs(DEFAULT_MEAN_DURATION_BINS - mean_duration))
                events.append(Event(
                    name=MEAN_DURATION_KEY,
                    time=None,
                    value=index if not np.isnan(mean_duration) else 'NaN',
                    text='{:.2f}/{:.2f}'.format(mean_duration, DEFAULT_MEAN_DURATION_BINS[index])
                ))

            if not omit_instruments:
                instruments = set([item.instrument for item in notes])
                for instrument in instruments:
                    instrument = pretty_midi.program_to_instrument_name(instrument) if instrument != 'drum' else 'drum'
                    events.append(Event(
                        name=INSTRUMENT_KEY,
                        time=None,
                        value=instrument,
                        text=instrument
                    ))

            if not omit_chords:
                chords = [item for item in self.groups[i][1:-1] if item.name == 'Chord']
                if len(chords) == 0 and current_chord is not None:
                    chords = [current_chord]
                elif len(chords) > 0:
                    if chords[0].start > bar_st and current_chord is not None:
                        chords.insert(0, current_chord)
                    current_chord = chords[-1]

                for chord in chords:
                    events.append(Event(
                        name=CHORD_KEY,
                        time=None,
                        value=chord.pitch,
                        text='{}'.format(chord.pitch)
                    ))

        return [f'{e.name}_{e.value}' for e in events]

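Before the MIDI-writing half of the file below, a minimal usage sketch of the class above; the input path is a placeholder and the exact token spellings depend on the key constants in `constants.py`.

```python
# Minimal usage sketch; 'song.mid' is a hypothetical input path.
rep = InputRepresentation('song.mid', do_extract_chords=True)
remi_events = rep.get_remi_events()  # flat REMI token strings for the whole piece
description = rep.get_description()  # bar-level description tokens (meta stats, chords)
print(remi_events[:10])
print(description[:10])
```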
#############################################################################################
# WRITE MIDI
#############################################################################################

def remi2midi(events, bpm=120, time_signature=(4, 4), polyphony_limit=16):
    vocab = RemiVocab()

    # unused earlier version of _get_time, shadowed by the definition below:
    # def _get_time(bar, position, bpm=120, positions_per_bar=48):
    #     abs_position = bar * positions_per_bar + position
    #     beat = abs_position / DEFAULT_POS_PER_QUARTER
    #     return beat / bpm * 60

    def _get_time(reference, bar, pos):
        time_sig = reference['time_sig']
        num, denom = time_sig.numerator, time_sig.denominator
        # Quarters per bar, assuming 4 quarters per whole note
        qpb = 4 * num / denom
        ref_pos = reference['pos']
        d_bars = bar - ref_pos[0]
        d_pos = (pos - ref_pos[1]) + d_bars * qpb * DEFAULT_POS_PER_QUARTER
        d_quarters = d_pos / DEFAULT_POS_PER_QUARTER
        # Convert quarters to seconds
        dt = d_quarters / reference['tempo'] * 60
        return reference['time'] + dt

    # time_sigs = [event.split('_')[-1].split('/') for event in events if f"{TIME_SIGNATURE_KEY}_" in event]
    # time_sigs = [(int(num), int(denom)) for num, denom in time_sigs]

    tempo_changes = [event for event in events if f"{TEMPO_KEY}_" in event]
    if len(tempo_changes) > 0:
        bpm = DEFAULT_TEMPO_BINS[int(tempo_changes[0].split('_')[-1])]

    pm = pretty_midi.PrettyMIDI(initial_tempo=bpm)
    num, denom = time_signature
    pm.time_signature_changes.append(pretty_midi.TimeSignature(num, denom, 0))
    current_time_sig = pm.time_signature_changes[0]

    instruments = {}

    # Use an implicit timeline: keep track of the last tempo/time signature change event
    # and calculate time differences relative to that
    last_tl_event = {
        'time': 0,
        'pos': (0, 0),
        'time_sig': current_time_sig,
        'tempo': bpm
    }

    bar = -1
    n_notes = 0
    polyphony_control = {}
    for i, event in enumerate(events):
        if event == EOS_TOKEN:
            break

        if bar not in polyphony_control:
            polyphony_control[bar] = {}

        if f"{BAR_KEY}_" in events[i]:
            # Next bar is starting
            bar += 1
            polyphony_control[bar] = {}

            if i + 1 < len(events) and f"{TIME_SIGNATURE_KEY}_" in events[i + 1]:
                num, denom = events[i + 1].split('_')[-1].split('/')
                num, denom = int(num), int(denom)
                current_time_sig = last_tl_event['time_sig']
                if num != current_time_sig.numerator or denom != current_time_sig.denominator:
                    time = _get_time(last_tl_event, bar, 0)
                    time_sig = pretty_midi.TimeSignature(num, denom, time)
                    pm.time_signature_changes.append(time_sig)
                    last_tl_event['time'] = time
                    last_tl_event['pos'] = (bar, 0)
                    last_tl_event['time_sig'] = time_sig

        elif i + 1 < len(events) and \
                f"{POSITION_KEY}_" in events[i] and \
                f"{TEMPO_KEY}_" in events[i + 1]:
            position = int(events[i].split('_')[-1])
            tempo_idx = int(events[i + 1].split('_')[-1])
            tempo = DEFAULT_TEMPO_BINS[tempo_idx]

            if tempo != last_tl_event['tempo']:
                time = _get_time(last_tl_event, bar, position)
                last_tl_event['time'] = time
                last_tl_event['pos'] = (bar, position)
                last_tl_event['tempo'] = tempo

        elif i + 4 < len(events) and \
                f"{POSITION_KEY}_" in events[i] and \
                f"{INSTRUMENT_KEY}_" in events[i + 1] and \
                f"{PITCH_KEY}_" in events[i + 2] and \
                f"{VELOCITY_KEY}_" in events[i + 3] and \
                f"{DURATION_KEY}_" in events[i + 4]:
            # get position
            position = int(events[i].split('_')[-1])
            if position not in polyphony_control[bar]:
                polyphony_control[bar][position] = {}

            # get instrument
            instrument_name = events[i + 1].split('_')[-1]
            if instrument_name not in polyphony_control[bar][position]:
                polyphony_control[bar][position][instrument_name] = 0
            elif polyphony_control[bar][position][instrument_name] >= polyphony_limit:
                # If the number of notes exceeds the polyphony limit, omit this note
                continue

            if instrument_name not in instruments:
                if instrument_name == 'drum':
                    instrument = pretty_midi.Instrument(0, is_drum=True)
                else:
                    program = pretty_midi.instrument_name_to_program(instrument_name)
                    instrument = pretty_midi.Instrument(program)
                instruments[instrument_name] = instrument
            else:
                instrument = instruments[instrument_name]

            # get pitch
            pitch = int(events[i + 2].split('_')[-1])
            # get velocity
            velocity_index = int(events[i + 3].split('_')[-1])
            velocity = min(127, DEFAULT_VELOCITY_BINS[velocity_index])
            # get duration
            duration_index = int(events[i + 4].split('_')[-1])
            duration = DEFAULT_DURATION_BINS[duration_index]
            # create the note and add it to its instrument
            start = _get_time(last_tl_event, bar, position)
            end = _get_time(last_tl_event, bar, position + duration)
            note = pretty_midi.Note(velocity=velocity,
                                    pitch=pitch,
                                    start=start,
                                    end=end)
            instrument.notes.append(note)
            n_notes += 1
            polyphony_control[bar][position][instrument_name] += 1

    for instrument in instruments.values():
        pm.instruments.append(instrument)
    return pm
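A round-trip sanity check built from the two halves of this file might look like the following sketch (paths are placeholders).

```python
# Hypothetical round trip: MIDI -> REMI events -> MIDI.
rep = InputRepresentation('song.mid')       # placeholder input path
events = rep.get_remi_events()
pm = remi2midi(events, polyphony_limit=16)  # rebuild a pretty_midi.PrettyMIDI
pm.write('song_roundtrip.mid')              # write a standard MIDI file
```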
music_evaluation/figaro/vocab.py
ADDED
@@ -0,0 +1,166 @@
import pretty_midi
from collections import Counter
import torchtext
from torch import Tensor

from constants import (
    DEFAULT_VELOCITY_BINS,
    DEFAULT_DURATION_BINS,
    DEFAULT_TEMPO_BINS,
    DEFAULT_POS_PER_QUARTER,
    DEFAULT_NOTE_DENSITY_BINS,
    DEFAULT_MEAN_VELOCITY_BINS,
    DEFAULT_MEAN_PITCH_BINS,
    DEFAULT_MEAN_DURATION_BINS
)

from constants import (
    MAX_BAR_LENGTH,
    MAX_N_BARS,

    PAD_TOKEN,
    UNK_TOKEN,
    BOS_TOKEN,
    EOS_TOKEN,
    MASK_TOKEN,

    TIME_SIGNATURE_KEY,
    BAR_KEY,
    POSITION_KEY,
    INSTRUMENT_KEY,
    PITCH_KEY,
    VELOCITY_KEY,
    DURATION_KEY,
    TEMPO_KEY,
    CHORD_KEY,

    NOTE_DENSITY_KEY,
    MEAN_PITCH_KEY,
    MEAN_VELOCITY_KEY,
    MEAN_DURATION_KEY,
)


class Tokens:
    @staticmethod
    def get_instrument_tokens(key=INSTRUMENT_KEY):
        tokens = [f'{key}_{pretty_midi.program_to_instrument_name(i)}' for i in range(128)]
        tokens.append(f'{key}_drum')
        return tokens

    @staticmethod
    def get_chord_tokens(key=CHORD_KEY, qualities=['maj', 'min', 'dim', 'aug', 'dom7', 'maj7', 'min7', 'None']):
        pitch_classes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']

        chords = [f'{root}:{quality}' for root in pitch_classes for quality in qualities]
        chords.append('N:N')

        tokens = [f'{key}_{chord}' for chord in chords]
        return tokens

    @staticmethod
    def get_time_signature_tokens(key=TIME_SIGNATURE_KEY):
        denominators = [2, 4, 8, 16]
        time_sigs = [f'{p}/{q}' for q in denominators for p in range(1, MAX_BAR_LENGTH * q + 1)]
        tokens = [f'{key}_{time_sig}' for time_sig in time_sigs]
        return tokens

    @staticmethod
    def get_midi_tokens(
        instrument_key=INSTRUMENT_KEY,
        time_signature_key=TIME_SIGNATURE_KEY,
        pitch_key=PITCH_KEY,
        velocity_key=VELOCITY_KEY,
        duration_key=DURATION_KEY,
        tempo_key=TEMPO_KEY,
        bar_key=BAR_KEY,
        position_key=POSITION_KEY
    ):
        instrument_tokens = Tokens.get_instrument_tokens(instrument_key)

        pitch_tokens = [f'{pitch_key}_{i}' for i in range(128)] + [f'{pitch_key}_drum_{i}' for i in range(128)]
        velocity_tokens = [f'{velocity_key}_{i}' for i in range(len(DEFAULT_VELOCITY_BINS))]
        duration_tokens = [f'{duration_key}_{i}' for i in range(len(DEFAULT_DURATION_BINS))]
        tempo_tokens = [f'{tempo_key}_{i}' for i in range(len(DEFAULT_TEMPO_BINS))]
        bar_tokens = [f'{bar_key}_{i}' for i in range(MAX_N_BARS)]
        position_tokens = [f'{position_key}_{i}' for i in range(MAX_BAR_LENGTH * 4 * DEFAULT_POS_PER_QUARTER)]

        time_sig_tokens = Tokens.get_time_signature_tokens(time_signature_key)

        return (
            time_sig_tokens +
            tempo_tokens +
            instrument_tokens +
            pitch_tokens +
            velocity_tokens +
            duration_tokens +
            bar_tokens +
            position_tokens
        )

class Vocab:
    def __init__(self, counter, specials=[PAD_TOKEN, UNK_TOKEN, BOS_TOKEN, EOS_TOKEN, MASK_TOKEN], unk_token=UNK_TOKEN):
        self.vocab = torchtext.vocab.vocab(counter)

        self.specials = specials
        for i, token in enumerate(self.specials):
            self.vocab.insert_token(token, i)

        if unk_token in specials:
            self.vocab.set_default_index(self.vocab.get_stoi()[unk_token])

    def to_i(self, token):
        return self.vocab.get_stoi()[token]

    def to_s(self, idx):
        if idx >= len(self.vocab):
            return UNK_TOKEN
        else:
            return self.vocab.get_itos()[idx]

    def __len__(self):
        return len(self.vocab)

    def encode(self, seq):
        return self.vocab(seq)

    def decode(self, seq):
        if isinstance(seq, Tensor):
            seq = seq.numpy()
        return self.vocab.lookup_tokens(seq)


class RemiVocab(Vocab):
    def __init__(self):
        midi_tokens = Tokens.get_midi_tokens()
        chord_tokens = Tokens.get_chord_tokens()

        self.tokens = midi_tokens + chord_tokens

        counter = Counter(self.tokens)
        super().__init__(counter)


class DescriptionVocab(Vocab):
    def __init__(self):
        time_sig_tokens = Tokens.get_time_signature_tokens()
        instrument_tokens = Tokens.get_instrument_tokens()
        chord_tokens = Tokens.get_chord_tokens()

        bar_tokens = [f'Bar_{i}' for i in range(MAX_N_BARS)]
        density_tokens = [f'{NOTE_DENSITY_KEY}_{i}' for i in range(len(DEFAULT_NOTE_DENSITY_BINS))]
        velocity_tokens = [f'{MEAN_VELOCITY_KEY}_{i}' for i in range(len(DEFAULT_MEAN_VELOCITY_BINS))]
        pitch_tokens = [f'{MEAN_PITCH_KEY}_{i}' for i in range(len(DEFAULT_MEAN_PITCH_BINS))]
        duration_tokens = [f'{MEAN_DURATION_KEY}_{i}' for i in range(len(DEFAULT_MEAN_DURATION_BINS))]

        self.tokens = (
            time_sig_tokens +
            instrument_tokens +
            chord_tokens +
            density_tokens +
            velocity_tokens +
            pitch_tokens +
            duration_tokens +
            bar_tokens
        )

        counter = Counter(self.tokens)
        super().__init__(counter)
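A quick usage sketch for the vocabularies above; the token strings shown assume `BAR_KEY == 'Bar'` in `constants.py`, so treat them as illustrative.

```python
vocab = RemiVocab()
ids = vocab.encode(['Bar_1', 'Bar_2'])  # token strings -> integer ids
print(ids)
print(vocab.decode(ids))                # back to ['Bar_1', 'Bar_2']
print(len(vocab))                       # vocabulary size, incl. special tokens
```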
music_evaluation/mgeval/__init__.py
ADDED
File without changes
music_evaluation/mgeval/__init__.pyc
ADDED
Binary file (105 Bytes)
music_evaluation/mgeval/core.py
ADDED
@@ -0,0 +1,644 @@
# coding:utf-8
"""core.py
Includes the feature extractor and musically informed objective measures.
"""
import pretty_midi
import numpy as np
import sys
import os
import statistics
# import midi
import glob
import math


# feature extractor
def extract_feature(_file):
    """
    This function extracts two midi features:
    pretty_midi object: https://github.com/craffel/pretty-midi
    midi_pattern: https://github.com/vishnubob/python-midi

    Returns:
        dict(pretty_midi: pretty_midi object,
             midi_pattern: midi pattern containing a list of tracks)
    """
    feature = {'pretty_midi': pretty_midi.PrettyMIDI(_file)}
    # 'midi_pattern': midi.read_midifile(_file)}
    return feature


# musically informed objective measures
class metrics(object):
    def total_used_pitch(self, feature):
        """
        total_used_pitch (Pitch count): The number of different pitches within a sample.

        Returns:
        'used_pitch': pitch count, one scalar for each sample.
        """
        try:
            instrument = feature['pretty_midi'].instruments[0]
            piano_roll = instrument.get_piano_roll(fs=100)
            sum_notes = np.sum(piano_roll, axis=1)
            used_pitch = np.sum(sum_notes > 0)
            return used_pitch
        except Exception:
            return 0  # empty piano roll

    def mean_note_velocity(self, feature):
        """
        mean_note_velocity: The average velocity of the notes within a sample.

        Returns:
        'mean_velocity': the average note velocity, one scalar for each sample.
        """
        try:
            instrument = feature['pretty_midi'].instruments[0]
            velocity = []
            for note in instrument.notes:
                velocity.append(note.velocity)

            mean_velocity = statistics.mean(velocity)
            # variance is computed for completeness but not returned
            if len(velocity) > 1:
                variance_velocity = statistics.variance(velocity)
            else:
                variance_velocity = 0

            return mean_velocity
        except Exception:
            return 0  # empty piano roll

    def mean_note_duration(self, feature):
        """
        mean_note_duration: The average duration of the notes within a sample.

        Returns:
        'mean_duration': the average note duration, one scalar for each sample.
        """
        try:
            instrument = feature['pretty_midi'].instruments[0]
            duration = []
            for note in instrument.notes:
                d = note.end - note.start
                duration.append(d)

            mean_duration = statistics.mean(duration)
            # variance is computed for completeness but not returned
            if len(duration) > 1:
                variance_duration = statistics.variance(duration)
            else:
                variance_duration = 0

            return mean_duration
        except Exception:
            return 0  # empty piano roll

    def note_density(self, feature):
        """
        note_density: The density of notes within a sample.

        Returns:
        'note_density': notes per second over the whole file, one scalar for each sample.
        """

        # instrument = feature['pretty_midi'].instruments[0]
        total_notes = sum(len(instrument.notes) for instrument in feature['pretty_midi'].instruments)
        total_duration = feature['pretty_midi'].get_end_time()

        # Calculate the note density
        if total_duration > 0:
            note_density = total_notes / total_duration
        else:
            note_density = 0

        return note_density

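The metrics above are driven through the feature dict returned by `extract_feature`; a minimal sketch (the MIDI path is a placeholder):

```python
# Minimal usage sketch; 'sample.mid' is a hypothetical path.
evaluator = metrics()
feat = extract_feature('sample.mid')
print(evaluator.total_used_pitch(feat))  # distinct pitches in the first instrument
print(evaluator.note_density(feat))      # notes per second over the whole file
```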
    # def bar_used_pitch(self, feature, track_num=1, num_bar=None):
    #     """
    #     bar_used_pitch (Pitch count per bar)
    #
    #     Args:
    #     'track_num' : specify the track number in the midi pattern, default is 1 (the second track).
    #     'num_bar': specify the number of bars in the midi pattern; if set as None, round to the number of complete bars.
    #
    #     Returns:
    #     'used_pitch': with shape of [num_bar, 1]
    #     """
    #     pattern = feature['midi_pattern']
    #     pattern.make_ticks_abs()
    #     resolution = pattern.resolution
    #     for i in range(0, len(pattern[track_num])):
    #         if type(pattern[track_num][i]) == midi.events.TimeSignatureEvent:
    #             time_sig = pattern[track_num][i].data
    #             bar_length = time_sig[0] * resolution * 4 / 2**(time_sig[1])
    #             if num_bar is None:
    #                 num_bar = int(round(float(pattern[track_num][-1].tick) / bar_length))
    #                 used_notes = np.zeros((num_bar, 1))
    #             else:
    #                 used_notes = np.zeros((num_bar, 1))
    #
    #         elif type(pattern[track_num][i]) == midi.events.NoteOnEvent and pattern[track_num][i].data[1] != 0:
    #             if 'time_sig' not in locals():  # set default bar length as 4 beats
    #                 bar_length = 4 * resolution
    #                 time_sig = [4, 2, 24, 8]
    #
    #             if num_bar is None:
    #                 num_bar = int(round(float(pattern[track_num][-1].tick) / bar_length))
    #                 used_notes = np.zeros((num_bar, 1))
    #                 used_notes[pattern[track_num][i].tick / bar_length] += 1
    #             else:
    #                 used_notes = np.zeros((num_bar, 1))
    #                 used_notes[pattern[track_num][i].tick / bar_length] += 1
    #                 note_list = []
    #                 note_list.append(pattern[track_num][i].data[0])
    #
    #         else:
    #             for j in range(0, num_bar):
    #                 if 'note_list' in locals():
    #                     pass
    #                 else:
    #                     note_list = []
    #             note_list.append(pattern[track_num][i].data[0])
    #             idx = pattern[track_num][i].tick / bar_length
    #             if idx >= num_bar:
    #                 continue
    #             used_notes[idx] += 1
    #             # used_notes[pattern[track_num][i].tick / bar_length] += 1
    #
    #     used_pitch = np.zeros((num_bar, 1))
    #     current_note = 0
    #     for i in range(0, num_bar):
    #         used_pitch[i] = len(set(note_list[current_note:current_note + int(used_notes[i][0])]))
    #         current_note += int(used_notes[i][0])
    #
    #     return used_pitch

    # def total_used_note(self, feature, track_num=1):
    #     """
    #     total_used_note (Note count): The number of used notes.
    #     As opposed to the pitch count, the note count does not contain pitch information but is a rhythm-related feature.
    #
    #     Args:
    #     'track_num' : specify the track number in the midi pattern, default is 1 (the second track).
    #
    #     Returns:
    #     'used_notes': a scalar for each sample.
    #     """
    #     pattern = feature['midi_pattern']
    #     used_notes = 0
    #     for i in range(0, len(pattern[track_num])):
    #         if type(pattern[track_num][i]) == midi.events.NoteOnEvent and pattern[track_num][i].data[1] != 0:
    #             used_notes += 1
    #     return used_notes

    # def bar_used_note(self, feature, track_num=1, num_bar=None):
    #     """
    #     bar_used_note (Note count per bar).
    #
    #     Args:
    #     'track_num' : specify the track number in the midi pattern, default is 1 (the second track).
    #     'num_bar': specify the number of bars in the midi pattern; if set as None, round to the number of complete bars.
    #
    #     Returns:
    #     'used_notes': with shape of [num_bar, 1]
    #     """
    #     pattern = feature['midi_pattern']
    #     pattern.make_ticks_abs()
    #     resolution = pattern.resolution
    #     for i in range(0, len(pattern[track_num])):
    #         if type(pattern[track_num][i]) == midi.events.TimeSignatureEvent:
    #             time_sig = pattern[track_num][i].data
    #             bar_length = time_sig[track_num] * resolution * 4 / 2**(time_sig[1])
    #             if num_bar is None:
    #                 num_bar = int(round(float(pattern[track_num][-1].tick) / bar_length))
    #                 used_notes = np.zeros((num_bar, 1))
    #             else:
    #                 used_notes = np.zeros((num_bar, 1))
    #
    #         elif type(pattern[track_num][i]) == midi.events.NoteOnEvent and pattern[track_num][i].data[1] != 0:
    #             if 'time_sig' not in locals():  # set default bar length as 4 beats
    #                 bar_length = 4 * resolution
    #                 time_sig = [4, 2, 24, 8]
    #
    #             if num_bar is None:
    #                 num_bar = int(round(float(pattern[track_num][-1].tick) / bar_length))
    #                 used_notes = np.zeros((num_bar, 1))
    #                 used_notes[pattern[track_num][i].tick / bar_length] += 1
    #             else:
    #                 used_notes = np.zeros((num_bar, 1))
    #                 used_notes[pattern[track_num][i].tick / bar_length] += 1
    #
    #         else:
    #             idx = pattern[track_num][i].tick / bar_length
    #             if idx >= num_bar:
    #                 continue
    #             used_notes[idx] += 1
    #     return used_notes

    def total_pitch_class_histogram(self, feature):
        """
        total_pitch_class_histogram (Pitch class histogram):
        The pitch class histogram is an octave-independent representation of the pitch content with a dimensionality of 12 for a chromatic scale.
        In our case, it represents the octave-independent chromatic quantization of the frequency continuum.

        Returns:
        'histogram': histogram of the 12 pitch classes, weighted by duration, shape 12.
        """
        # print(feature['pretty_midi'].instruments)
        histogram = np.zeros(12)
        try:
            piano_roll = feature['pretty_midi'].instruments[0].get_piano_roll(fs=100)
            # piano_roll = feature['pretty_midi'].get_piano_roll(fs=100)

            for i in range(0, 128):
                pitch_class = i % 12
                histogram[pitch_class] += np.sum(piano_roll, axis=1)[i]
            histogram = histogram / sum(histogram)
            return histogram
        except Exception:
            return histogram

    def bar_pitch_class_histogram(self, feature, track_num=1, num_bar=None, bpm=120):
        """
        bar_pitch_class_histogram (Pitch class histogram per bar):

        Args:
        'bpm' : specify the assigned speed in bpm, default is 120 bpm.
        'num_bar': specify the number of bars in the midi pattern; if set as None, round to the number of complete bars.
        'track_num' : specify the track number in the midi pattern, default is 1 (the second track).

        Returns:
        'histogram': with shape of [num_bar, 12]
        """

        # todo: deal with cases that have more than one time signature
        pm_object = feature['pretty_midi']
        if num_bar is None:
            numer = pm_object.time_signature_changes[-1].numerator
            deno = pm_object.time_signature_changes[-1].denominator
            bar_length = 60. / bpm * numer * 4 / deno * 100
            piano_roll = pm_object.instruments[track_num].get_piano_roll(fs=100)
            piano_roll = np.transpose(piano_roll, (1, 0))
            actual_bar = len(piano_roll) / bar_length
            num_bar = int(round(actual_bar))
            bar_length = int(round(bar_length))
        else:
            numer = pm_object.time_signature_changes[-1].numerator
            deno = pm_object.time_signature_changes[-1].denominator
            bar_length = 60. / bpm * numer * 4 / deno * 100
            piano_roll = pm_object.instruments[track_num].get_piano_roll(fs=100)
            piano_roll = np.transpose(piano_roll, (1, 0))
            actual_bar = len(piano_roll) / bar_length
            bar_length = int(math.ceil(bar_length))

        if actual_bar > num_bar:
            mod = np.mod(len(piano_roll), bar_length * 128)  # (unused)
            piano_roll = piano_roll[:-np.mod(len(piano_roll), bar_length)].reshape((num_bar, -1, 128))  # make exact bars
        elif actual_bar == num_bar:
            piano_roll = piano_roll.reshape((num_bar, -1, 128))
        else:
            piano_roll = np.pad(piano_roll, ((0, int(num_bar * bar_length - len(piano_roll))), (0, 0)), mode='constant',
                                constant_values=0)
            piano_roll = piano_roll.reshape((num_bar, -1, 128))

        bar_histogram = np.zeros((num_bar, 12))
        for i in range(0, num_bar):
            histogram = np.zeros(12)
            for j in range(0, 128):
                pitch_class = j % 12
                histogram[pitch_class] += np.sum(piano_roll[i], axis=0)[j]
            if sum(histogram) != 0:
                bar_histogram[i] = histogram / sum(histogram)
            else:
                bar_histogram[i] = np.zeros(12)
        return bar_histogram

    def pitch_class_transition_matrix(self, feature, normalize=0):
        """
        pitch_class_transition_matrix (Pitch class transition matrix):
        The transition of pitch classes contains useful information for tasks such as key detection, chord recognition, or genre pattern recognition.
        The two-dimensional pitch class transition matrix is a histogram-like representation computed by counting the pitch transitions for each (ordered) pair of notes.

        Args:
        'normalize' : If set to 0, return the transition matrix without normalization.
                      If set to 1, normalize by row.
                      If set to 2, normalize by the entire matrix sum.
        Returns:
        'transition_matrix': shape of [12, 12], a 12 x 12 transition matrix.
        """
        pm_object = feature['pretty_midi']
        transition_matrix = pm_object.get_pitch_class_transition_matrix()

        if normalize == 0:
            return transition_matrix

        elif normalize == 1:
            sums = np.sum(transition_matrix, axis=1)
            sums[sums == 0] = 1
            return transition_matrix / sums.reshape(-1, 1)

        elif normalize == 2:
            return transition_matrix / sum(sum(transition_matrix))

        else:
            print("invalid normalization mode, return unnormalized matrix")
            return transition_matrix

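As an aside, the three `normalize` modes of `pitch_class_transition_matrix` reduce to the following toy computation (counts invented):

```python
import numpy as np

tm = np.array([[2., 2.],
               [0., 4.]])            # toy "transition count" matrix
row_sums = tm.sum(axis=1, keepdims=True)
row_sums[row_sums == 0] = 1          # guard empty rows, as in mode 1
print(tm / row_sums)                 # mode 1: each row sums to 1
print(tm / tm.sum())                 # mode 2: the whole matrix sums to 1
```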
def pitch_range(self, feature):
|
348 |
+
"""
|
349 |
+
pitch_range (Pitch range):
|
350 |
+
The pitch range is calculated by subtraction of the highest and lowest used pitch in semitones.
|
351 |
+
|
352 |
+
Returns:
|
353 |
+
'p_range': a scalar for each sample.
|
354 |
+
"""
|
355 |
+
try:
|
356 |
+
piano_roll = feature['pretty_midi'].instruments[0].get_piano_roll(fs=100)
|
357 |
+
pitch_index = np.where(np.sum(piano_roll, axis=1) > 0)
|
358 |
+
p_range = np.max(pitch_index) - np.min(pitch_index)
|
359 |
+
return p_range
|
360 |
+
except:
|
361 |
+
return 0 # empty piano roll
|
362 |
+
|
363 |
+
# def avg_pitch_shift(self, feature, track_num=1):
|
364 |
+
# """
|
365 |
+
# avg_pitch_shift (Average pitch interval):
|
366 |
+
# Average value of the interval between two consecutive pitches in semitones.
|
367 |
+
|
368 |
+
# Args:
|
369 |
+
# 'track_num' : specify the track number in the midi pattern, default is 1 (the second track).
|
370 |
+
|
371 |
+
# Returns:
|
372 |
+
# 'pitch_shift': a scalar for each sample.
|
373 |
+
# """
|
374 |
+
# pattern = feature['midi_pattern']
|
375 |
+
# pattern.make_ticks_abs()
|
376 |
+
# resolution = pattern.resolution
|
377 |
+
# total_used_note = self.total_used_note(feature, track_num=track_num)
|
378 |
+
# d_note = np.zeros((max(total_used_note - 1, 0)))
|
379 |
+
# # if total_used_note == 0:
|
380 |
+
# # return 0
|
381 |
+
# # d_note = np.zeros((total_used_note - 1))
|
382 |
+
# current_note = 0
|
383 |
+
# counter = 0
|
384 |
+
# for i in range(0, len(pattern[track_num])):
|
385 |
+
# if type(pattern[track_num][i]) == midi.events.NoteOnEvent and pattern[track_num][i].data[1] != 0:
|
386 |
+
# if counter != 0:
|
387 |
+
# d_note[counter - 1] = current_note - pattern[track_num][i].data[0]
|
388 |
+
# current_note = pattern[track_num][i].data[0]
|
389 |
+
# counter += 1
|
390 |
+
# else:
|
391 |
+
# current_note = pattern[track_num][i].data[0]
|
392 |
+
# counter += 1
|
393 |
+
# pitch_shift = np.mean(abs(d_note))
|
394 |
+
# return pitch_shift
|
395 |
+
|
396 |
+
def avg_IOI(self, feature):
|
397 |
+
"""
|
398 |
+
avg_IOI (Average inter-onset-interval):
|
399 |
+
To calculate the inter-onset-interval in the symbolic music domain, we find the time between two consecutive notes.
|
400 |
+
|
401 |
+
Returns:
|
402 |
+
'avg_ioi': a scalar for each sample.
|
403 |
+
"""
|
404 |
+
try:
|
405 |
+
tmp = feature['pretty_midi'].instruments[0]
|
406 |
+
pm_object = feature['pretty_midi']
|
407 |
+
onset = pm_object.get_onsets()
|
408 |
+
ioi = np.diff(onset)
|
409 |
+
avg_ioi = np.mean(ioi)
|
410 |
+
return avg_ioi
|
411 |
+
except:
|
412 |
+
return 0 # empty piano roll
|
413 |
+
|
414 |
+
    # def note_length_hist(self, feature, track_num=1, normalize=True, pause_event=False):
    #     """
    #     note_length_hist (Note length histogram):
    #     To extract the note length histogram, we first define a set of allowable beat length classes:
    #     [full, half, quarter, 8th, 16th, dot half, dot quarter, dot 8th, dot 16th, half note triplet, quarter note triplet, 8th note triplet].
    #     The pause_event option, when activated, will double the vector size to represent the same lengths for rests.
    #     Each event is classified with a basic unit of (bar length)/96, and each note length is quantized to the closest length class.
    #
    #     Args:
    #     'track_num' : specify the track number in the midi pattern, default is 1 (the second track).
    #     'normalize' : if True, normalize by the vector sum.
    #     'pause_event' : when activated, will double the vector size to represent the same lengths for rests.
    #
    #     Returns:
    #     'note_length_hist': a vector of length 12 (or 24 when pause_event is True).
    #     """
    #     pattern = feature['midi_pattern']
    #     if pause_event is False:
    #         note_length_hist = np.zeros((12))
    #         pattern.make_ticks_abs()
    #         resolution = pattern.resolution
    #         # basic unit: bar_length/96
    #         for i in range(0, len(pattern[track_num])):
    #             if type(pattern[track_num][i]) == midi.events.TimeSignatureEvent:
    #                 time_sig = pattern[track_num][i].data
    #                 # time_sig[0] is the numerator of the time signature
    #                 bar_length = time_sig[0] * resolution * 4 / 2**(time_sig[1])
    #             elif type(pattern[track_num][i]) == midi.events.NoteOnEvent and pattern[track_num][i].data[1] != 0:
    #                 if 'time_sig' not in locals():  # set default bar length as 4 beats
    #                     bar_length = 4 * resolution
    #                     time_sig = [4, 2, 24, 8]
    #                 unit = bar_length / 96.
    #                 hist_list = [unit * 96, unit * 48, unit * 24, unit * 12, unit * 6, unit * 72, unit * 36, unit * 18, unit * 9, unit * 32, unit * 16, unit * 8]
    #                 current_tick = pattern[track_num][i].tick
    #                 current_note = pattern[track_num][i].data[0]
    #                 # find next note off
    #                 for j in range(i, len(pattern[track_num])):
    #                     if type(pattern[track_num][j]) == midi.events.NoteOffEvent or (type(pattern[track_num][j]) == midi.events.NoteOnEvent and pattern[track_num][j].data[1] == 0):
    #                         if pattern[track_num][j].data[0] == current_note:
    #                             note_length = pattern[track_num][j].tick - current_tick
    #                             distance = np.abs(np.array(hist_list) - note_length)
    #                             idx = distance.argmin()
    #                             note_length_hist[idx] += 1
    #                             break
    #     else:
    #         note_length_hist = np.zeros((24))
    #         pattern.make_ticks_abs()
    #         resolution = pattern.resolution
    #         # basic unit: bar_length/96
    #         for i in range(0, len(pattern[track_num])):
    #             if type(pattern[track_num][i]) == midi.events.TimeSignatureEvent:
    #                 time_sig = pattern[track_num][i].data
    #                 bar_length = time_sig[0] * resolution * 4 / 2**(time_sig[1])
    #             elif type(pattern[track_num][i]) == midi.events.NoteOnEvent and pattern[track_num][i].data[1] != 0:
    #                 check_previous_off = True
    #                 if 'time_sig' not in locals():  # set default bar length as 4 beats
    #                     bar_length = 4 * resolution
    #                     time_sig = [4, 2, 24, 8]
    #                 unit = bar_length / 96.
    #                 tol = 3. * unit
    #                 hist_list = [unit * 96, unit * 48, unit * 24, unit * 12, unit * 6, unit * 72, unit * 36, unit * 18, unit * 9, unit * 32, unit * 16, unit * 8]
    #                 current_tick = pattern[track_num][i].tick
    #                 current_note = pattern[track_num][i].data[0]
    #                 # find next note off
    #                 for j in range(i, len(pattern[track_num])):
    #                     if type(pattern[track_num][j]) == midi.events.NoteOffEvent or (type(pattern[track_num][j]) == midi.events.NoteOnEvent and pattern[track_num][j].data[1] == 0):
    #                         if pattern[track_num][j].data[0] == current_note:
    #                             note_length = pattern[track_num][j].tick - current_tick
    #                             distance = np.abs(np.array(hist_list) - note_length)
    #                             idx = distance.argmin()
    #                             note_length_hist[idx] += 1
    #                             break
    #                     else:
    #                         if pattern[track_num][j].tick == current_tick:
    #                             check_previous_off = False
    #
    #                 # find previous note off/on
    #                 if check_previous_off is True:
    #                     for j in range(i - 1, 0, -1):
    #                         if type(pattern[track_num][j]) == midi.events.NoteOnEvent and pattern[track_num][j].data[1] != 0:
    #                             break
    #                         elif type(pattern[track_num][j]) == midi.events.NoteOffEvent or (type(pattern[track_num][j]) == midi.events.NoteOnEvent and pattern[track_num][j].data[1] == 0):
    #                             note_length = current_tick - pattern[track_num][j].tick
    #                             distance = np.abs(np.array(hist_list) - note_length)
    #                             idx = distance.argmin()
    #                             if distance[idx] < tol:
    #                                 note_length_hist[idx + 12] += 1
    #                             break
    #
    #     if normalize is False:
    #         return note_length_hist
    #     elif normalize is True:
    #         return note_length_hist / np.sum(note_length_hist)

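    # A minimal Python 3 sketch of the note-length histogram commented out
    # above, using pretty_midi instead of the Python-2-only `midi` package.
    # It assumes a single known tempo and expresses the same 12 length
    # classes in beats (full, half, quarter, 8th, 16th, the dotted variants,
    # and the triplets); the method name `note_length_hist_pm` is our own
    # addition, not part of upstream mgeval.
    def note_length_hist_pm(self, feature, tempo=120.0, normalize=True):
        classes = np.array([4., 2., 1., 0.5, 0.25,    # plain lengths in beats
                            3., 1.5, 0.75, 0.375,     # dotted lengths
                            4. / 3, 2. / 3, 1. / 3])  # triplets
        beat = 60.0 / tempo  # seconds per beat
        hist = np.zeros(len(classes))
        for inst in feature['pretty_midi'].instruments:
            for note in inst.notes:
                length_in_beats = (note.end - note.start) / beat
                # quantize each note length to the closest length class
                hist[np.abs(classes - length_in_beats).argmin()] += 1
        if normalize and hist.sum() > 0:
            return hist / hist.sum()
        return hist
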
    # def note_length_transition_matrix(self, feature, track_num=1, normalize=0, pause_event=False):
    #     """
    #     note_length_transition_matrix (Note length transition matrix):
    #     Similar to the pitch class transition matrix, the note length transition matrix provides useful information for rhythm description.
    #
    #     Args:
    #     'track_num' : specify the track number in the midi pattern, default is 1 (the second track).
    #     'pause_event' : when activated, will double the vector size to represent the same lengths for rests.
    #     'normalize' : If set to 0, return the transition matrix without normalization.
    #                   If set to 1, normalize by row.
    #                   If set to 2, normalize by the entire matrix sum.
    #
    #     Returns:
    #     'transition_matrix': a 12 x 12 matrix (or 24 x 24 when pause_event is True).
    #     """
    #     pattern = feature['midi_pattern']
    #     if pause_event is False:
    #         transition_matrix = np.zeros((12, 12))
    #         pattern.make_ticks_abs()
    #         resolution = pattern.resolution
    #         idx = None
    #         # basic unit: bar_length/96
    #         for i in range(0, len(pattern[track_num])):
    #             if type(pattern[track_num][i]) == midi.events.TimeSignatureEvent:
    #                 time_sig = pattern[track_num][i].data
    #                 # time_sig[0] is the numerator of the time signature
    #                 bar_length = time_sig[0] * resolution * 4 / 2**(time_sig[1])
    #             elif type(pattern[track_num][i]) == midi.events.NoteOnEvent and pattern[track_num][i].data[1] != 0:
    #                 if 'time_sig' not in locals():  # set default bar length as 4 beats
    #                     bar_length = 4 * resolution
    #                     time_sig = [4, 2, 24, 8]
    #                 unit = bar_length / 96.
    #                 hist_list = [unit * 96, unit * 48, unit * 24, unit * 12, unit * 6, unit * 72, unit * 36, unit * 18, unit * 9, unit * 32, unit * 16, unit * 8]
    #                 current_tick = pattern[track_num][i].tick
    #                 current_note = pattern[track_num][i].data[0]
    #                 # find note off
    #                 for j in range(i, len(pattern[track_num])):
    #                     if type(pattern[track_num][j]) == midi.events.NoteOffEvent or (type(pattern[track_num][j]) == midi.events.NoteOnEvent and pattern[track_num][j].data[1] == 0):
    #                         if pattern[track_num][j].data[0] == current_note:
    #                             note_length = pattern[track_num][j].tick - current_tick
    #                             distance = np.abs(np.array(hist_list) - note_length)
    #                             last_idx = idx
    #                             idx = distance.argmin()
    #                             if last_idx is not None:
    #                                 transition_matrix[last_idx][idx] += 1
    #                             break
    #     else:
    #         transition_matrix = np.zeros((24, 24))
    #         pattern.make_ticks_abs()
    #         resolution = pattern.resolution
    #         idx = None
    #         # basic unit: bar_length/96
    #         for i in range(0, len(pattern[track_num])):
    #             if type(pattern[track_num][i]) == midi.events.TimeSignatureEvent:
    #                 time_sig = pattern[track_num][i].data
    #                 bar_length = time_sig[0] * resolution * 4 / 2**(time_sig[1])
    #             elif type(pattern[track_num][i]) == midi.events.NoteOnEvent and pattern[track_num][i].data[1] != 0:
    #                 check_previous_off = True
    #                 if 'time_sig' not in locals():  # set default bar length as 4 beats
    #                     bar_length = 4 * resolution
    #                     time_sig = [4, 2, 24, 8]
    #                 unit = bar_length / 96.
    #                 tol = 3. * unit
    #                 hist_list = [unit * 96, unit * 48, unit * 24, unit * 12, unit * 6, unit * 72, unit * 36, unit * 18, unit * 9, unit * 32, unit * 16, unit * 8]
    #                 current_tick = pattern[track_num][i].tick
    #                 current_note = pattern[track_num][i].data[0]
    #                 # find next note off
    #                 for j in range(i, len(pattern[track_num])):
    #                     if type(pattern[track_num][j]) == midi.events.NoteOffEvent or (type(pattern[track_num][j]) == midi.events.NoteOnEvent and pattern[track_num][j].data[1] == 0):
    #                         if pattern[track_num][j].data[0] == current_note:
    #                             note_length = pattern[track_num][j].tick - current_tick
    #                             distance = np.abs(np.array(hist_list) - note_length)
    #                             last_idx = idx
    #                             idx = distance.argmin()
    #                             if last_idx is not None:
    #                                 transition_matrix[last_idx][idx] += 1
    #                             break
    #                     else:
    #                         if pattern[track_num][j].tick == current_tick:
    #                             check_previous_off = False
    #
    #                 # find previous note off/on
    #                 if check_previous_off is True:
    #                     for j in range(i - 1, 0, -1):
    #                         if type(pattern[track_num][j]) == midi.events.NoteOnEvent and pattern[track_num][j].data[1] != 0:
    #                             break
    #                         elif type(pattern[track_num][j]) == midi.events.NoteOffEvent or (type(pattern[track_num][j]) == midi.events.NoteOnEvent and pattern[track_num][j].data[1] == 0):
    #                             note_length = current_tick - pattern[track_num][j].tick
    #                             distance = np.abs(np.array(hist_list) - note_length)
    #                             last_idx = idx
    #                             idx = distance.argmin()
    #                             if last_idx is not None:
    #                                 if distance[idx] < tol:
    #                                     idx = last_idx
    #                                     transition_matrix[last_idx][idx + 12] += 1
    #                             break
    #
    #     if normalize == 0:
    #         return transition_matrix
    #     elif normalize == 1:
    #         sums = np.sum(transition_matrix, axis=1)
    #         sums[sums == 0] = 1
    #         return transition_matrix / sums.reshape(-1, 1)
    #     elif normalize == 2:
    #         return transition_matrix / sum(sum(transition_matrix))
    #     else:
    #         print("invalid normalization mode, return unnormalized matrix")
    #         return transition_matrix

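    # The three normalization modes used by the commented-out transition
    # matrix above, as a standalone sketch (the helper name
    # `normalize_transition` is our own addition, not part of upstream mgeval):
    @staticmethod
    def normalize_transition(transition_matrix, normalize=0):
        if normalize == 0:  # raw counts
            return transition_matrix
        if normalize == 1:  # each row sums to 1
            sums = np.sum(transition_matrix, axis=1)
            sums[sums == 0] = 1  # avoid division by zero on empty rows
            return transition_matrix / sums.reshape(-1, 1)
        if normalize == 2:  # the whole matrix sums to 1
            total = np.sum(transition_matrix)
            return transition_matrix / total if total > 0 else transition_matrix
        print("invalid normalization mode, returning unnormalized matrix")
        return transition_matrix
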
    # def chord_dependency(self, feature, bar_chord, bpm=120, num_bar=None, track_num=1):
    #     pm_object = feature['pretty_midi']
    #     # compare bar chroma with chord chroma and calculate the euclidean distance
    #     bar_pitch_class_histogram = self.bar_pitch_class_histogram(pm_object, bpm=bpm, num_bar=num_bar, track_num=track_num)
    #     dist = np.zeros((len(bar_pitch_class_histogram)))
    #     for i in range(len(bar_pitch_class_histogram)):
    #         dist[i] = np.linalg.norm(bar_pitch_class_histogram[i] - bar_chord[i])
    #     average_dist = np.mean(dist)
    #     return average_dist
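    # chord_dependency above reduces to the mean euclidean distance between
    # two aligned lists of 12-dimensional chroma vectors; a standalone
    # sketch (the helper name `mean_chroma_distance` is our own addition):
    @staticmethod
    def mean_chroma_distance(bar_chromas, chord_chromas):
        return float(np.mean([np.linalg.norm(np.asarray(a) - np.asarray(b))
                              for a, b in zip(bar_chromas, chord_chromas)]))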
music_evaluation/mgeval/core.pyc
ADDED
Binary file (18.2 kB)