teticio committed
Commit
d76bdef
1 Parent(s): 3e8b723
Files changed (1)
  1. train_vae.py +30 -122
train_vae.py CHANGED
@@ -4,7 +4,6 @@
 
 # TODO
 # grayscale
-# log audio
 # convert to huggingface / train huggingface
 
 import os
@@ -57,134 +56,46 @@ class AudioDiffusionDataModule(pl.LightningDataModule):
                           num_workers=self.num_workers)
 
 
-# from https://github.com/CompVis/stable-diffusion/blob/main/main.py
 class ImageLogger(Callback):
 
-    def __init__(self,
-                 batch_frequency,
-                 max_images,
-                 clamp=True,
-                 increase_log_steps=True,
-                 rescale=True,
-                 disabled=False,
-                 log_on_batch_idx=False,
-                 log_first_step=False,
-                 log_images_kwargs=None,
-                 resolution=256,
-                 hop_length=512):
+    def __init__(self, every=1000, resolution=256, hop_length=512):
         super().__init__()
         self.mel = Mel(x_res=resolution,
                        y_res=resolution,
                        hop_length=hop_length)
-        self.rescale = rescale
-        self.batch_freq = batch_frequency
-        self.max_images = max_images
-        self.logger_log_images = {
-            pl.loggers.TensorBoardLogger: self._testtube,
-        }
-        self.log_steps = [
-            2**n for n in range(int(np.log2(self.batch_freq)) + 1)
-        ]
-        if not increase_log_steps:
-            self.log_steps = [self.batch_freq]
-        self.clamp = clamp
-        self.disabled = disabled
-        self.log_on_batch_idx = log_on_batch_idx
-        self.log_images_kwargs = log_images_kwargs if log_images_kwargs else {}
-        self.log_first_step = log_first_step
-
-    #@rank_zero_only
-    def _testtube(self, pl_module, images, batch_idx, split):
+        self.every = every
+
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch,
+                           batch_idx):
+        if (batch_idx + 1) % self.every != 0:
+            return
+
+        pl_module.eval()
+        with torch.no_grad():
+            images = pl_module.log_images(batch, split='train')
+        pl_module.train()
+
         for k in images:
-            images_ = (images[k] + 1.0) / 2.0  # -1,1 -> 0,1; c,h,w
-            grid = torchvision.utils.make_grid(images_)
+            images[k] = images[k].detach().cpu()
+            images[k] = torch.clamp(images[k], -1., 1.)
+            images[k] = (images[k] + 1.0) / 2.0  # -1,1 -> 0,1; c,h,w
+            grid = torchvision.utils.make_grid(images[k])
 
-            tag = f"{split}/{k}"
+            tag = f"train/{k}"
             pl_module.logger.experiment.add_image(
                 tag, grid, global_step=pl_module.global_step)
 
-            for _, image in enumerate(images_):
-                image = (images_.numpy() *
+            images[k] = (images[k].numpy() *
                          255).round().astype("uint8").transpose(0, 2, 3, 1)
+            for _, image in enumerate(images[k]):
                 audio = self.mel.image_to_audio(
-                    Image.fromarray(image[0], mode='RGB').convert('L'))
+                    Image.fromarray(image, mode='RGB').convert('L'))
                 pl_module.logger.experiment.add_audio(
                     tag + f"/{_}",
                     normalize(audio),
                     global_step=pl_module.global_step,
                     sample_rate=self.mel.get_sample_rate())
 
-    #@rank_zero_only
-    def log_local(self, save_dir, split, images, global_step, current_epoch,
-                  batch_idx):
-        root = os.path.join(save_dir, "images", split)
-        for k in images:
-            grid = torchvision.utils.make_grid(images[k], nrow=4)
-            if self.rescale:
-                grid = (grid + 1.0) / 2.0  # -1,1 -> 0,1; c,h,w
-            grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1)
-            grid = grid.numpy()
-            grid = (grid * 255).astype(np.uint8)
-            filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format(
-                k, global_step, current_epoch, batch_idx)
-            path = os.path.join(root, filename)
-            os.makedirs(os.path.split(path)[0], exist_ok=True)
-            Image.fromarray(grid).save(path)
-
-    def log_img(self, pl_module, batch, batch_idx, split="train"):
-        check_idx = batch_idx if self.log_on_batch_idx else pl_module.global_step
-        if (self.check_frequency(check_idx)
-                and  # batch_idx % self.batch_freq == 0
-                hasattr(pl_module, "log_images") and
-                callable(pl_module.log_images) and self.max_images > 0):
-            logger = type(pl_module.logger)
-
-            is_train = pl_module.training
-            if is_train:
-                pl_module.eval()
-
-            with torch.no_grad():
-                images = pl_module.log_images(batch,
-                                              split=split,
-                                              **self.log_images_kwargs)
-
-            for k in images:
-                N = min(images[k].shape[0], self.max_images)
-                images[k] = images[k][:N]
-                if isinstance(images[k], torch.Tensor):
-                    images[k] = images[k].detach().cpu()
-                if self.clamp:
-                    images[k] = torch.clamp(images[k], -1., 1.)
-
-            #self.log_local(pl_module.logger.save_dir, split, images,
-            #               pl_module.global_step, pl_module.current_epoch,
-            #               batch_idx)
-
-            logger_log_images = self.logger_log_images.get(
-                logger, lambda *args, **kwargs: None)
-            logger_log_images(pl_module, images, pl_module.global_step, split)
-
-            if is_train:
-                pl_module.train()
-
-    def check_frequency(self, check_idx):
-        if ((check_idx % self.batch_freq) == 0 or
-            (check_idx in self.log_steps)) and (check_idx > 0
-                                                or self.log_first_step):
-            try:
-                self.log_steps.pop(0)
-            except IndexError as e:
-                #print(e)
-                pass
-            return True
-        return False
-
-    def on_train_batch_end(self, trainer, pl_module, outputs, batch,
-                           batch_idx):
-        if not self.disabled and (pl_module.global_step > 0
-                                  or self.log_first_step):
-            self.log_img(pl_module, batch, batch_idx, split="train")
-
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Train VAE using ldm.")
@@ -195,18 +106,15 @@ if __name__ == "__main__":
     lightning_config = config.pop("lightning", OmegaConf.create())
    trainer_config = lightning_config.get("trainer", OmegaConf.create())
    trainer_opt = argparse.Namespace(**trainer_config)
-    trainer = Trainer.from_argparse_args(
-        trainer_opt,
-        callbacks=[
-            ImageLogger(batch_frequency=1000,
-                        max_images=8,
-                        increase_log_steps=False,
-                        log_on_batch_idx=True),
-            ModelCheckpoint(dirpath='checkpoints',
-                            filename='{epoch:06}',
-                            verbose=True,
-                            save_last=True)
-        ])
+    trainer = Trainer.from_argparse_args(trainer_opt,
+                                         callbacks=[
+                                             ImageLogger(),
+                                             ModelCheckpoint(
+                                                 dirpath='checkpoints',
+                                                 filename='{epoch:06}',
+                                                 verbose=True,
+                                                 save_last=True)
+                                         ])
    model = instantiate_from_config(config.model)
    model.learning_rate = config.model.base_learning_rate
    data = AudioDiffusionDataModule('teticio/audio-diffusion-256',
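
The net effect of the change is a much smaller ImageLogger: every `every` batches it asks the model for reconstructions via log_images, writes them to TensorBoard as an image grid, and then inverts each spectrogram image back to audio and logs that as well (which is why the "# log audio" TODO is dropped). The following is a minimal, self-contained sketch of that round trip, not the repo's code: the audiodiffusion.mel import path, the image_to_audio()/get_sample_rate() methods, and the normalize() helper are assumptions based only on how they are used in the diff.

# Sketch of the logging round trip implemented by the new ImageLogger.
# Assumptions (not verified against the repo): Mel lives at audiodiffusion.mel,
# exposes image_to_audio() and get_sample_rate(), and normalize() is simple
# peak normalization as used in train_vae.py.
import numpy as np
import torch
import torchvision
from PIL import Image
from torch.utils.tensorboard import SummaryWriter

from audiodiffusion.mel import Mel  # assumed import path


def normalize(audio: np.ndarray) -> np.ndarray:
    # assumed equivalent of the script's helper: scale peaks to [-1, 1]
    return audio / np.abs(audio).max()


def log_reconstructions(writer: SummaryWriter, images: torch.Tensor,
                        mel: Mel, tag: str, step: int) -> None:
    # images: (N, C, H, W) in [-1, 1], e.g. one entry of pl_module.log_images()
    images = torch.clamp(images.detach().cpu(), -1., 1.)
    images = (images + 1.0) / 2.0  # -1,1 -> 0,1
    writer.add_image(tag,
                     torchvision.utils.make_grid(images),
                     global_step=step)

    # uint8 HWC for PIL, then invert each mel spectrogram back to a waveform
    arrays = (images.numpy() * 255).round().astype("uint8").transpose(
        0, 2, 3, 1)
    for i, array in enumerate(arrays):
        audio = mel.image_to_audio(
            Image.fromarray(array, mode='RGB').convert('L'))
        writer.add_audio(f"{tag}/{i}",
                         normalize(audio),
                         global_step=step,
                         sample_rate=mel.get_sample_rate())

With Lightning's default TensorBoardLogger the logged grids and audio clips should then be browsable with "tensorboard --logdir lightning_logs" (assuming no custom logger or save_dir is configured).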