tetrisd committed on
Commit
b4cfcd3
•
1 Parent(s): e9e7dc5

Add V2 and some paper code

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. README.md +2 -2
  2. app.py +195 -144
  3. diffusers/__init__.py +0 -60
  4. diffusers/__pycache__/__init__.cpython-310.pyc +0 -0
  5. diffusers/__pycache__/configuration_utils.cpython-310.pyc +0 -0
  6. diffusers/__pycache__/dependency_versions_check.cpython-310.pyc +0 -0
  7. diffusers/__pycache__/dependency_versions_table.cpython-310.pyc +0 -0
  8. diffusers/__pycache__/dynamic_modules_utils.cpython-310.pyc +0 -0
  9. diffusers/__pycache__/hub_utils.cpython-310.pyc +0 -0
  10. diffusers/__pycache__/modeling_utils.cpython-310.pyc +0 -0
  11. diffusers/__pycache__/onnx_utils.cpython-310.pyc +0 -0
  12. diffusers/__pycache__/optimization.cpython-310.pyc +0 -0
  13. diffusers/__pycache__/pipeline_utils.cpython-310.pyc +0 -0
  14. diffusers/__pycache__/testing_utils.cpython-310.pyc +0 -0
  15. diffusers/__pycache__/training_utils.cpython-310.pyc +0 -0
  16. diffusers/commands/__init__.py +0 -27
  17. diffusers/commands/__pycache__/__init__.cpython-310.pyc +0 -0
  18. diffusers/commands/__pycache__/diffusers_cli.cpython-310.pyc +0 -0
  19. diffusers/commands/__pycache__/env.cpython-310.pyc +0 -0
  20. diffusers/commands/diffusers_cli.py +0 -41
  21. diffusers/commands/env.py +0 -70
  22. diffusers/configuration_utils.py +0 -403
  23. diffusers/dependency_versions_check.py +0 -47
  24. diffusers/dependency_versions_table.py +0 -26
  25. diffusers/dynamic_modules_utils.py +0 -335
  26. diffusers/hub_utils.py +0 -197
  27. diffusers/modeling_utils.py +0 -542
  28. diffusers/models/__init__.py +0 -17
  29. diffusers/models/__pycache__/__init__.cpython-310.pyc +0 -0
  30. diffusers/models/__pycache__/attention.cpython-310.pyc +0 -0
  31. diffusers/models/__pycache__/embeddings.cpython-310.pyc +0 -0
  32. diffusers/models/__pycache__/resnet.cpython-310.pyc +0 -0
  33. diffusers/models/__pycache__/unet_2d.cpython-310.pyc +0 -0
  34. diffusers/models/__pycache__/unet_2d_condition.cpython-310.pyc +0 -0
  35. diffusers/models/__pycache__/unet_blocks.cpython-310.pyc +0 -0
  36. diffusers/models/__pycache__/vae.cpython-310.pyc +0 -0
  37. diffusers/models/attention.py +0 -409
  38. diffusers/models/embeddings.py +0 -115
  39. diffusers/models/resnet.py +0 -483
  40. diffusers/models/unet_2d.py +0 -246
  41. diffusers/models/unet_2d_condition.py +0 -272
  42. diffusers/models/unet_blocks.py +0 -1484
  43. diffusers/models/vae.py +0 -585
  44. diffusers/onnx_utils.py +0 -189
  45. diffusers/optimization.py +0 -275
  46. diffusers/pipeline_utils.py +0 -417
  47. diffusers/pipelines/__init__.py +0 -19
  48. diffusers/pipelines/__pycache__/__init__.cpython-310.pyc +0 -0
  49. diffusers/pipelines/ddim/__init__.py +0 -2
  50. diffusers/pipelines/ddim/__pycache__/__init__.cpython-310.pyc +0 -0
README.md CHANGED
@@ -1,12 +1,12 @@
1
  ---
2
- title: Stable Diffusion Attentive Attribution Maps
3
  emoji: 👀
4
  colorFrom: blue
5
  colorTo: pink
6
  sdk: gradio
7
  sdk_version: 3.4.1
8
  app_file: app.py
9
- pinned: false
10
  license: mit
11
  ---
12
 
 
1
  ---
2
+ title: Stable Diffusion V2 Attentive Attribution Maps
3
  emoji: 👀
4
  colorFrom: blue
5
  colorTo: pink
6
  sdk: gradio
7
  sdk_version: 3.4.1
8
  app_file: app.py
9
+ pinned: true
10
  license: mit
11
  ---
12
 
app.py CHANGED
@@ -1,170 +1,221 @@
1
- from huggingface_hub import HfApi, HfFolder
2
- import os
3
-
4
- api = HfApi()
5
- api.set_access_token(os.environ['HF_SECRET'])
6
- folder = HfFolder()
7
- folder.save_token(os.environ['HF_SECRET'])
8
-
9
- from threading import Lock
10
  import math
11
- import os
12
- import random
 
 
13
 
 
14
  from diffusers import StableDiffusionPipeline
15
- from diffusers.models.attention import get_global_heat_map, clear_heat_maps
16
  from matplotlib import pyplot as plt
17
  import gradio as gr
18
  import torch
19
- import torch.nn.functional as F
20
- import spacy
21
 
 
 
22
 
23
- if not os.environ.get('NO_DOWNLOAD_SPACY'):
24
- spacy.cli.download('en_core_web_sm')
25
 
 
 
 
26
 
27
- model_id = "runwayml/stable-diffusion-v1-5"
28
- device = "cuda"
29
 
30
- gen = torch.Generator(device='cuda')
31
- gen.manual_seed(12758672)
32
- orig_state = gen.get_state()
33
- pipe = StableDiffusionPipeline.from_pretrained(model_id, use_auth_token=True).to(device)
34
- lock = Lock()
35
- nlp = spacy.load('en_core_web_sm')
36
 
 
 
 
 
 
 
37
 
38
- def expand_m(m, n: int = 1, o=512, mode='bicubic'):
39
- m = m.unsqueeze(0).unsqueeze(0) / n
40
- m = F.interpolate(m.float().detach(), size=(o, o), mode='bicubic', align_corners=False)
41
- m = (m - m.min()) / (m.max() - m.min() + 1e-8)
42
- m = m.cpu().detach()
 
 
 
43
 
44
- return m
45
 
46
 
47
- @torch.no_grad()
48
- def predict(prompt, inf_steps, threshold):
49
- global lock
50
- with torch.cuda.amp.autocast(), lock:
51
- try:
52
- plt.close('all')
53
- except:
54
- pass
55
 
56
- gen.set_state(orig_state.clone())
57
- clear_heat_maps()
 
 
 
 
 
 
58
 
59
- out = pipe(prompt, guidance_scale=7.5, height=512, width=512, do_intermediates=False, generator=gen, num_inference_steps=int(inf_steps))
60
- heat_maps = get_global_heat_map()
61
 
62
- with torch.cuda.amp.autocast(dtype=torch.float32):
63
- m = 0
64
- n = 0
65
- w = ''
66
- w_idx = 0
67
 
68
- fig, ax = plt.subplots()
69
- ax.imshow(out.images[0].cpu().float().detach().permute(1, 2, 0).numpy())
70
- ax.set_xticks([])
71
- ax.set_yticks([])
72
 
73
- fig1, axs1 = plt.subplots(math.ceil(len(out.words) / 4), 4)#, figsize=(20, 20))
74
- fig2, axs2 = plt.subplots(math.ceil(len(out.words) / 4), 4) # , figsize=(20, 20))
 
 
75
 
76
- for idx in range(len(out.words) + 1):
77
- if idx == 0:
78
- continue
79
 
80
- word = out.words[idx - 1]
81
- m += heat_maps[idx]
82
- n += 1
83
- w += word
84
 
85
- if '</w>' not in word:
86
- continue
87
- else:
88
- mplot = expand_m(m, n)
89
- spotlit_im = out.images[0].cpu().float().detach()
90
- w = w.replace('</w>', '')
91
- spotlit_im2 = torch.cat((spotlit_im, (1 - mplot.squeeze(0)).pow(1)), dim=0)
92
-
93
- if len(out.words) <= 4:
94
- a1 = axs1[w_idx % 4]
95
- a2 = axs2[w_idx % 4]
 
96
  else:
97
- a1 = axs1[w_idx // 4, w_idx % 4]
98
- a2 = axs2[w_idx // 4, w_idx % 4]
99
-
100
- a1.set_xticks([])
101
- a1.set_yticks([])
102
- a1.imshow(mplot.squeeze().numpy(), cmap='jet')
103
- a1.imshow(spotlit_im2.permute(1, 2, 0).numpy())
104
- a1.set_title(w)
105
-
106
- mask = torch.ones_like(mplot)
107
- mask[mplot < threshold * mplot.max()] = 0
108
- im2 = spotlit_im * mask.squeeze(0)
109
- a2.set_xticks([])
110
- a2.set_yticks([])
111
- a2.imshow(im2.permute(1, 2, 0).numpy())
112
- a2.set_title(w)
113
- m = 0
114
- n = 0
115
- w_idx += 1
116
- w = ''
117
-
118
- for idx in range(w_idx, len(axs1.flatten())):
119
- fig1.delaxes(axs1.flatten()[idx])
120
- fig2.delaxes(axs2.flatten()[idx])
121
-
122
- return fig, fig1, fig2
123
-
124
-
125
- def set_prompt(prompt):
126
- return prompt
127
-
128
-
129
- with gr.Blocks() as demo:
130
- md = '''# DAAM: Attention Maps for Interpreting Stable Diffusion
131
- Check out the paper: [What the DAAM: Interpreting Stable Diffusion Using Cross Attention](http://arxiv.org/abs/2210.04885).
132
- See our (much cleaner) [DAAM codebase](https://github.com/castorini/daam) on GitHub.
133
-
134
- **Update**: We got a community grant! I'll continue running and updating the space, with a major release planned in December.
135
- '''
136
- gr.Markdown(md)
137
-
138
- with gr.Row():
139
- with gr.Column():
140
- dropdown = gr.Dropdown([
141
- 'An angry, bald man doing research',
142
- 'Doing research at Comcast Applied AI labs',
143
- 'Professor Jimmy Lin from the University of Waterloo',
144
- 'Yann Lecun teaching machine learning on a chalkboard',
145
- 'A cat eating cake for her birthday',
146
- 'Steak and dollars on a plate',
147
- 'A fox, a dog, and a wolf in a field'
148
- ], label='Examples', value='An angry, bald man doing research')
149
-
150
- text = gr.Textbox(label='Prompt', value='An angry, bald man doing research')
151
- slider1 = gr.Slider(15, 35, value=25, interactive=True, step=1, label='Inference steps')
152
- slider2 = gr.Slider(0, 1.0, value=0.4, interactive=True, step=0.05, label='Threshold (tau)')
153
- submit_btn = gr.Button('Submit')
154
-
155
- with gr.Tab('Original Image'):
156
- p0 = gr.Plot()
157
-
158
- with gr.Tab('Soft DAAM Maps'):
159
- p1 = gr.Plot()
160
-
161
- with gr.Tab('Hard DAAM Maps'):
162
- p2 = gr.Plot()
163
-
164
- submit_btn.click(fn=predict, inputs=[text, slider1, slider2], outputs=[p0, p1, p2])
165
- dropdown.change(set_prompt, dropdown, text)
166
- dropdown.update()
167
-
168
-
169
- demo.launch()
1
  import math
2
+ import time
3
+ from threading import Lock
4
+ from typing import Any, List
5
+ import argparse
6
 
7
+ import numpy as np
8
  from diffusers import StableDiffusionPipeline
 
9
  from matplotlib import pyplot as plt
10
  import gradio as gr
11
  import torch
12
+ from spacy import displacy
 
13
 
14
+ from daam import trace
15
+ from daam.utils import set_seed, cached_nlp
16
 
 
 
17
 
18
+ def dependency(text):
19
+ doc = cached_nlp(text)
20
+ svg = displacy.render(doc, style='dep', options={'compact': True, 'distance': 100})
21
 
22
+ return svg
 
23
 
 
 
 
 
 
 
24
 
25
+ def get_tokenizing_mapping(prompt: str, tokenizer: Any) -> List[List[int]]:
26
+ tokens = tokenizer.tokenize(prompt)
27
+ merge_idxs = []
28
+ words = []
29
+ curr_idxs = []
30
+ curr_word = ''
31
 
32
+ for i, token in enumerate(tokens):
33
+ curr_idxs.append(i + 1) # because of the [CLS] token
34
+ curr_word += token
35
+ if '</w>' in token:
36
+ merge_idxs.append(curr_idxs)
37
+ curr_idxs = []
38
+ words.append(curr_word[:-4])
39
+ curr_word = ''
40
 
41
+ return merge_idxs, words
42
 
43
 
44
+ def get_args():
45
+ model_id_map = {
46
+ 'v1': 'runwayml/stable-diffusion-v1-5',
47
+ 'v2-base': 'stabilityai/stable-diffusion-2-base',
48
+ 'v2-large': 'stabilityai/stable-diffusion-2'
49
+ }
 
 
50
 
51
+ parser = argparse.ArgumentParser()
52
+ parser.add_argument('--model', '-m', type=str, default='v2-base', choices=list(model_id_map.keys()), help="which diffusion model to use")
53
+ parser.add_argument('--seed', '-s', type=int, default=0, help="the random seed")
54
+ parser.add_argument('--port', '-p', type=int, default=8080, help="the port to launch the demo")
55
+ parser.add_argument('--no-cuda', action='store_true', help="Use CPUs instead of GPUs")
56
+ args = parser.parse_args()
57
+ args.model = model_id_map[args.model]
58
+ return args
59
 
 
 
60
 
61
+ def main():
62
+ args = get_args()
63
+ plt.switch_backend('agg')
 
 
64
 
65
+ device = "cpu" if args.no_cuda else "cuda"
66
+ pipe = StableDiffusionPipeline.from_pretrained(args.model, use_auth_token=True).to(device)
67
+ lock = Lock()
 
68
 
69
+ @torch.no_grad()
70
+ def update_dropdown(prompt):
71
+ tokens = [''] + [x.text for x in cached_nlp(prompt) if x.pos_ == 'ADJ']
72
+ return gr.Dropdown.update(choices=tokens), dependency(prompt)
73
 
74
+ @torch.no_grad()
75
+ def plot(prompt, choice, replaced_word, inf_steps, is_random_seed):
76
+ new_prompt = prompt.replace(',', ', ').replace('.', '. ')
77
 
78
+ if choice:
79
+ if not replaced_word:
80
+ replaced_word = '.'
 
81
 
82
+ new_prompt = [replaced_word if tok.text == choice else tok.text for tok in cached_nlp(prompt)]
83
+ new_prompt = ' '.join(new_prompt)
84
+
85
+ merge_idxs, words = get_tokenizing_mapping(prompt, pipe.tokenizer)
86
+ with torch.cuda.amp.autocast(dtype=torch.float16), lock:
87
+ try:
88
+ plt.close('all')
89
+ plt.clf()
90
+ except:
91
+ pass
92
+
93
+ seed = int(time.time()) if is_random_seed else args.seed
94
+ gen = set_seed(seed)
95
+ prompt = prompt.replace(',', ', ').replace('.', '. ') # hacky fix to address later
96
+
97
+ if choice:
98
+ new_prompt = new_prompt.replace(',', ', ').replace('.', '. ') # hacky fix to address later
99
+
100
+ with trace(pipe, save_heads=new_prompt != prompt) as tc:
101
+ out = pipe(prompt, num_inference_steps=inf_steps, generator=gen)
102
+ image = np.array(out.images[0]) / 255
103
+ heat_map = tc.compute_global_heat_map()
104
+
105
+ if new_prompt == prompt:
106
+ image2 = image
107
  else:
108
+ gen = set_seed(seed)
109
+
110
+ with trace(pipe, load_heads=True) as tc:
111
+ out2 = pipe(new_prompt, num_inference_steps=inf_steps, generator=gen)
112
+ image2 = np.array(out2.images[0]) / 255
113
+ else:
114
+ with trace(pipe) as tc:
115
+ out = pipe(prompt, num_inference_steps=inf_steps, generator=gen)
116
+ image = np.array(out.images[0]) / 255
117
+ heat_map = tc.compute_global_heat_map()
118
+
119
+ # the main image
120
+ if new_prompt == prompt:
121
+ fig, ax = plt.subplots()
122
+ ax.imshow(image)
123
+ ax.set_xticks([])
124
+ ax.set_yticks([])
125
+ else:
126
+ fig, ax = plt.subplots(1, 2)
127
+ ax[0].imshow(image)
128
+
129
+ if choice:
130
+ ax[1].imshow(image2)
131
+
132
+ ax[0].set_title(choice)
133
+ ax[0].set_xticks([])
134
+ ax[0].set_yticks([])
135
+ ax[1].set_title(replaced_word)
136
+ ax[1].set_xticks([])
137
+ ax[1].set_yticks([])
138
+
139
+ # the heat maps
140
+ num_cells = 4
141
+ w = int(num_cells * 3.5)
142
+ h = math.ceil(len(words) / num_cells * 4.5)
143
+ fig_soft, axs_soft = plt.subplots(math.ceil(len(words) / num_cells), num_cells, figsize=(w, h))
144
+ axs_soft = axs_soft.flatten()
145
+ with torch.cuda.amp.autocast(dtype=torch.float32):
146
+ for idx, parsed_map in enumerate(heat_map.parsed_heat_maps()):
147
+ word_ax_soft = axs_soft[idx]
148
+ word_ax_soft.set_xticks([])
149
+ word_ax_soft.set_yticks([])
150
+ parsed_map.word_heat_map.plot_overlay(out.images[0], ax=word_ax_soft)
151
+ word_ax_soft.set_title(parsed_map.word_heat_map.word, fontsize=12)
152
+
153
+ for idx in range(len(words), len(axs_soft)):
154
+ fig_soft.delaxes(axs_soft[idx])
155
+
156
+ return fig, fig_soft
157
+
158
+ with gr.Blocks(css='scrollbar.css') as demo:
159
+ md = '''# DAAM: Attention Maps for Interpreting Stable Diffusion
160
+ Check out the **new** paper (2022/12/7): [What the DAAM: Interpreting Stable Diffusion Using Cross Attention](http://arxiv.org/abs/2210.04885).
161
+ See our (much cleaner) [DAAM codebase](https://github.com/castorini/daam) on GitHub.
162
+ '''
163
+ gr.Markdown(md)
164
+
165
+ with gr.Row():
166
+ with gr.Column():
167
+ dropdown = gr.Dropdown([
168
+ 'An angry, bald man doing research',
169
+ 'A bear and a moose',
170
+ 'A blue car driving through the city',
171
+ 'Monkey walking with hat',
172
+ 'Doing research at Comcast Applied AI labs',
173
+ 'Professor Jimmy Lin from the modern University of Waterloo',
174
+ 'Yann Lecun teaching machine learning on a green chalkboard',
175
+ 'A brown cat eating yummy cake for her birthday',
176
+ 'A brown fox, a white dog, and a blue wolf in a green field',
177
+ ], label='Examples', value='An angry, bald man doing research')
178
+
179
+ text = gr.Textbox(label='Prompt', value='An angry, bald man doing research')
180
+
181
+ with gr.Row():
182
+ doc = cached_nlp('An angry, bald man doing research')
183
+ tokens = [''] + [x.text for x in doc if x.pos_ == 'ADJ']
184
+ dropdown2 = gr.Dropdown(tokens, label='Adjective to replace', interactive=True)
185
+ text2 = gr.Textbox(label='New adjective', value='')
186
+
187
+ checkbox = gr.Checkbox(value=False, label='Random seed')
188
+ slider1 = gr.Slider(15, 30, value=25, interactive=True, step=1, label='Inference steps')
189
+
190
+ submit_btn = gr.Button('Submit', elem_id='submit-btn')
191
+ viz = gr.HTML(dependency('An angry, bald man doing research'), elem_id='viz')
192
+
193
+ with gr.Column():
194
+ with gr.Tab('Images'):
195
+ p0 = gr.Plot()
196
+
197
+ with gr.Tab('DAAM Maps'):
198
+ p1 = gr.Plot()
199
+
200
+ text.change(fn=update_dropdown, inputs=[text], outputs=[dropdown2, viz])
201
+
202
+ submit_btn.click(
203
+ fn=plot,
204
+ inputs=[text, dropdown2, text2, slider1, checkbox],
205
+ outputs=[p0, p1])
206
+ dropdown.change(lambda prompt: prompt, dropdown, text)
207
+ dropdown.update()
208
+
209
+ while True:
210
+ try:
211
+ demo.launch()
212
+ except OSError:
213
+ gr.close_all()
214
+ except KeyboardInterrupt:
215
+ gr.close_all()
216
+ break
217
+
218
+
219
+ if __name__ == '__main__':
220
+ main()
221
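
The rewritten app.py above swaps the patched, vendored `diffusers` attention hooks (`get_global_heat_map` / `clear_heat_maps`) for the standalone `daam` tracing API. As a rough sketch of that new flow outside Gradio — using only calls that appear in the diff, with the model id, prompt, and step count borrowed from the defaults above (illustrative, not part of the commit):

```python
# Hedged sketch of the DAAM flow used by the new app.py (not part of this commit).
# Assumes the `daam` package, `diffusers`, and a CUDA-capable GPU are available.
import torch
from diffusers import StableDiffusionPipeline
from matplotlib import pyplot as plt

from daam import trace
from daam.utils import set_seed

model_id = 'stabilityai/stable-diffusion-2-base'  # one of the ids in model_id_map above
pipe = StableDiffusionPipeline.from_pretrained(model_id, use_auth_token=True).to('cuda')

prompt = 'An angry, bald man doing research'      # default example prompt from the demo
gen = set_seed(0)                                 # fixed seed, as with --seed 0

with torch.no_grad(), trace(pipe) as tc:
    out = pipe(prompt, num_inference_steps=25, generator=gen)
    heat_map = tc.compute_global_heat_map()

# Overlay each word's heat map on the generated image, as the 'DAAM Maps' tab does.
for parsed_map in heat_map.parsed_heat_maps():
    fig, ax = plt.subplots()
    parsed_map.word_heat_map.plot_overlay(out.images[0], ax=ax)
    ax.set_title(parsed_map.word_heat_map.word)
    fig.savefig(f'daam_{parsed_map.word_heat_map.word}.png')
    plt.close(fig)
```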
 
diffusers/__init__.py DELETED
@@ -1,60 +0,0 @@
1
- from .utils import (
2
- is_inflect_available,
3
- is_onnx_available,
4
- is_scipy_available,
5
- is_transformers_available,
6
- is_unidecode_available,
7
- )
8
-
9
-
10
- __version__ = "0.3.0"
11
-
12
- from .configuration_utils import ConfigMixin
13
- from .modeling_utils import ModelMixin
14
- from .models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel
15
- from .onnx_utils import OnnxRuntimeModel
16
- from .optimization import (
17
- get_constant_schedule,
18
- get_constant_schedule_with_warmup,
19
- get_cosine_schedule_with_warmup,
20
- get_cosine_with_hard_restarts_schedule_with_warmup,
21
- get_linear_schedule_with_warmup,
22
- get_polynomial_decay_schedule_with_warmup,
23
- get_scheduler,
24
- )
25
- from .pipeline_utils import DiffusionPipeline
26
- from .pipelines import DDIMPipeline, DDPMPipeline, KarrasVePipeline, LDMPipeline, PNDMPipeline, ScoreSdeVePipeline
27
- from .schedulers import (
28
- DDIMScheduler,
29
- DDPMScheduler,
30
- KarrasVeScheduler,
31
- PNDMScheduler,
32
- SchedulerMixin,
33
- ScoreSdeVeScheduler,
34
- )
35
- from .utils import logging
36
-
37
-
38
- if is_scipy_available():
39
- from .schedulers import LMSDiscreteScheduler
40
- else:
41
- from .utils.dummy_scipy_objects import * # noqa F403
42
-
43
- from .training_utils import EMAModel
44
-
45
-
46
- if is_transformers_available():
47
- from .pipelines import (
48
- LDMTextToImagePipeline,
49
- StableDiffusionImg2ImgPipeline,
50
- StableDiffusionInpaintPipeline,
51
- StableDiffusionPipeline,
52
- )
53
- else:
54
- from .utils.dummy_transformers_objects import * # noqa F403
55
-
56
-
57
- if is_transformers_available() and is_onnx_available():
58
- from .pipelines import StableDiffusionOnnxPipeline
59
- else:
60
- from .utils.dummy_transformers_and_onnx_objects import * # noqa F403
 
diffusers/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (1.85 kB)
 
diffusers/__pycache__/configuration_utils.cpython-310.pyc DELETED
Binary file (15.4 kB)
 
diffusers/__pycache__/dependency_versions_check.cpython-310.pyc DELETED
Binary file (967 Bytes)
 
diffusers/__pycache__/dependency_versions_table.cpython-310.pyc DELETED
Binary file (819 Bytes)
 
diffusers/__pycache__/dynamic_modules_utils.cpython-310.pyc DELETED
Binary file (11.6 kB)
 
diffusers/__pycache__/hub_utils.cpython-310.pyc DELETED
Binary file (5.46 kB)
 
diffusers/__pycache__/modeling_utils.cpython-310.pyc DELETED
Binary file (18.7 kB)
 
diffusers/__pycache__/onnx_utils.cpython-310.pyc DELETED
Binary file (6.3 kB)
 
diffusers/__pycache__/optimization.cpython-310.pyc DELETED
Binary file (10.1 kB)
 
diffusers/__pycache__/pipeline_utils.cpython-310.pyc DELETED
Binary file (14 kB)
 
diffusers/__pycache__/testing_utils.cpython-310.pyc DELETED
Binary file (1.66 kB)
 
diffusers/__pycache__/training_utils.cpython-310.pyc DELETED
Binary file (3.64 kB)
 
diffusers/commands/__init__.py DELETED
@@ -1,27 +0,0 @@
1
- # Copyright 2022 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from abc import ABC, abstractmethod
16
- from argparse import ArgumentParser
17
-
18
-
19
- class BaseDiffusersCLICommand(ABC):
20
- @staticmethod
21
- @abstractmethod
22
- def register_subcommand(parser: ArgumentParser):
23
- raise NotImplementedError()
24
-
25
- @abstractmethod
26
- def run(self):
27
- raise NotImplementedError()
 
diffusers/commands/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (817 Bytes)
 
diffusers/commands/__pycache__/diffusers_cli.cpython-310.pyc DELETED
Binary file (778 Bytes)
 
diffusers/commands/__pycache__/env.cpython-310.pyc DELETED
Binary file (2.17 kB)
 
diffusers/commands/diffusers_cli.py DELETED
@@ -1,41 +0,0 @@
1
- #!/usr/bin/env python
2
- # Copyright 2022 The HuggingFace Team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- from argparse import ArgumentParser
17
-
18
- from .env import EnvironmentCommand
19
-
20
-
21
- def main():
22
- parser = ArgumentParser("Diffusers CLI tool", usage="diffusers-cli <command> [<args>]")
23
- commands_parser = parser.add_subparsers(help="diffusers-cli command helpers")
24
-
25
- # Register commands
26
- EnvironmentCommand.register_subcommand(commands_parser)
27
-
28
- # Let's go
29
- args = parser.parse_args()
30
-
31
- if not hasattr(args, "func"):
32
- parser.print_help()
33
- exit(1)
34
-
35
- # Run
36
- service = args.func(args)
37
- service.run()
38
-
39
-
40
- if __name__ == "__main__":
41
- main()
 
diffusers/commands/env.py DELETED
@@ -1,70 +0,0 @@
1
- # Copyright 2022 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import platform
16
- from argparse import ArgumentParser
17
-
18
- import huggingface_hub
19
-
20
- from .. import __version__ as version
21
- from ..utils import is_torch_available, is_transformers_available
22
- from . import BaseDiffusersCLICommand
23
-
24
-
25
- def info_command_factory(_):
26
- return EnvironmentCommand()
27
-
28
-
29
- class EnvironmentCommand(BaseDiffusersCLICommand):
30
- @staticmethod
31
- def register_subcommand(parser: ArgumentParser):
32
- download_parser = parser.add_parser("env")
33
- download_parser.set_defaults(func=info_command_factory)
34
-
35
- def run(self):
36
- hub_version = huggingface_hub.__version__
37
-
38
- pt_version = "not installed"
39
- pt_cuda_available = "NA"
40
- if is_torch_available():
41
- import torch
42
-
43
- pt_version = torch.__version__
44
- pt_cuda_available = torch.cuda.is_available()
45
-
46
- transformers_version = "not installed"
47
- if is_transformers_available:
48
- import transformers
49
-
50
- transformers_version = transformers.__version__
51
-
52
- info = {
53
- "`diffusers` version": version,
54
- "Platform": platform.platform(),
55
- "Python version": platform.python_version(),
56
- "PyTorch version (GPU?)": f"{pt_version} ({pt_cuda_available})",
57
- "Huggingface_hub version": hub_version,
58
- "Transformers version": transformers_version,
59
- "Using GPU in script?": "<fill in>",
60
- "Using distributed or parallel set-up in script?": "<fill in>",
61
- }
62
-
63
- print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n")
64
- print(self.format_dict(info))
65
-
66
- return info
67
-
68
- @staticmethod
69
- def format_dict(d):
70
- return "\n".join([f"- {prop}: {val}" for prop, val in d.items()]) + "\n"
 
diffusers/configuration_utils.py DELETED
@@ -1,403 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2022 The HuggingFace Inc. team.
3
- # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- """ ConfigMixinuration base class and utilities."""
17
- import functools
18
- import inspect
19
- import json
20
- import os
21
- import re
22
- from collections import OrderedDict
23
- from typing import Any, Dict, Tuple, Union
24
-
25
- from huggingface_hub import hf_hub_download
26
- from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
27
- from requests import HTTPError
28
-
29
- from . import __version__
30
- from .utils import DIFFUSERS_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT, logging
31
-
32
-
33
- logger = logging.get_logger(__name__)
34
-
35
- _re_configuration_file = re.compile(r"config\.(.*)\.json")
36
-
37
-
38
- class ConfigMixin:
39
- r"""
40
- Base class for all configuration classes. Stores all configuration parameters under `self.config` Also handles all
41
- methods for loading/downloading/saving classes inheriting from [`ConfigMixin`] with
42
- - [`~ConfigMixin.from_config`]
43
- - [`~ConfigMixin.save_config`]
44
-
45
- Class attributes:
46
- - **config_name** (`str`) -- A filename under which the config should stored when calling
47
- [`~ConfigMixin.save_config`] (should be overriden by parent class).
48
- - **ignore_for_config** (`List[str]`) -- A list of attributes that should not be saved in the config (should be
49
- overriden by parent class).
50
- """
51
- config_name = None
52
- ignore_for_config = []
53
-
54
- def register_to_config(self, **kwargs):
55
- if self.config_name is None:
56
- raise NotImplementedError(f"Make sure that {self.__class__} has defined a class name `config_name`")
57
- kwargs["_class_name"] = self.__class__.__name__
58
- kwargs["_diffusers_version"] = __version__
59
-
60
- for key, value in kwargs.items():
61
- try:
62
- setattr(self, key, value)
63
- except AttributeError as err:
64
- logger.error(f"Can't set {key} with value {value} for {self}")
65
- raise err
66
-
67
- if not hasattr(self, "_internal_dict"):
68
- internal_dict = kwargs
69
- else:
70
- previous_dict = dict(self._internal_dict)
71
- internal_dict = {**self._internal_dict, **kwargs}
72
- logger.debug(f"Updating config from {previous_dict} to {internal_dict}")
73
-
74
- self._internal_dict = FrozenDict(internal_dict)
75
-
76
- def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
77
- """
78
- Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the
79
- [`~ConfigMixin.from_config`] class method.
80
-
81
- Args:
82
- save_directory (`str` or `os.PathLike`):
83
- Directory where the configuration JSON file will be saved (will be created if it does not exist).
84
- """
85
- if os.path.isfile(save_directory):
86
- raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
87
-
88
- os.makedirs(save_directory, exist_ok=True)
89
-
90
- # If we save using the predefined names, we can load using `from_config`
91
- output_config_file = os.path.join(save_directory, self.config_name)
92
-
93
- self.to_json_file(output_config_file)
94
- logger.info(f"ConfigMixinuration saved in {output_config_file}")
95
-
96
- @classmethod
97
- def from_config(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return_unused_kwargs=False, **kwargs):
98
- r"""
99
- Instantiate a Python class from a pre-defined JSON-file.
100
-
101
- Parameters:
102
- pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
103
- Can be either:
104
-
105
- - A string, the *model id* of a model repo on huggingface.co. Valid model ids should have an
106
- organization name, like `google/ddpm-celebahq-256`.
107
- - A path to a *directory* containing model weights saved using [`~ConfigMixin.save_config`], e.g.,
108
- `./my_model_directory/`.
109
-
110
- cache_dir (`Union[str, os.PathLike]`, *optional*):
111
- Path to a directory in which a downloaded pretrained model configuration should be cached if the
112
- standard cache should not be used.
113
- ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
114
- Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
115
- as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
116
- checkpoint with 3 labels).
117
- force_download (`bool`, *optional*, defaults to `False`):
118
- Whether or not to force the (re-)download of the model weights and configuration files, overriding the
119
- cached versions if they exist.
120
- resume_download (`bool`, *optional*, defaults to `False`):
121
- Whether or not to delete incompletely received files. Will attempt to resume the download if such a
122
- file exists.
123
- proxies (`Dict[str, str]`, *optional*):
124
- A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
125
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
126
- output_loading_info(`bool`, *optional*, defaults to `False`):
127
- Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
128
- local_files_only(`bool`, *optional*, defaults to `False`):
129
- Whether or not to only look at local files (i.e., do not try to download the model).
130
- use_auth_token (`str` or *bool*, *optional*):
131
- The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
132
- when running `transformers-cli login` (stored in `~/.huggingface`).
133
- revision (`str`, *optional*, defaults to `"main"`):
134
- The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
135
- git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
136
- identifier allowed by git.
137
- mirror (`str`, *optional*):
138
- Mirror source to accelerate downloads in China. If you are from China and have an accessibility
139
- problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
140
- Please refer to the mirror site for more information.
141
-
142
- <Tip>
143
-
144
- Passing `use_auth_token=True`` is required when you want to use a private model.
145
-
146
- </Tip>
147
-
148
- <Tip>
149
-
150
- Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
151
- use this method in a firewalled environment.
152
-
153
- </Tip>
154
-
155
- """
156
- config_dict = cls.get_config_dict(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs)
157
-
158
- init_dict, unused_kwargs = cls.extract_init_dict(config_dict, **kwargs)
159
-
160
- model = cls(**init_dict)
161
-
162
- if return_unused_kwargs:
163
- return model, unused_kwargs
164
- else:
165
- return model
166
-
167
- @classmethod
168
- def get_config_dict(
169
- cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
170
- ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
171
- cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
172
- force_download = kwargs.pop("force_download", False)
173
- resume_download = kwargs.pop("resume_download", False)
174
- proxies = kwargs.pop("proxies", None)
175
- use_auth_token = kwargs.pop("use_auth_token", None)
176
- local_files_only = kwargs.pop("local_files_only", False)
177
- revision = kwargs.pop("revision", None)
178
- subfolder = kwargs.pop("subfolder", None)
179
-
180
- user_agent = {"file_type": "config"}
181
-
182
- pretrained_model_name_or_path = str(pretrained_model_name_or_path)
183
-
184
- if cls.config_name is None:
185
- raise ValueError(
186
- "`self.config_name` is not defined. Note that one should not load a config from "
187
- "`ConfigMixin`. Please make sure to define `config_name` in a class inheriting from `ConfigMixin`"
188
- )
189
-
190
- if os.path.isfile(pretrained_model_name_or_path):
191
- config_file = pretrained_model_name_or_path
192
- elif os.path.isdir(pretrained_model_name_or_path):
193
- if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)):
194
- # Load from a PyTorch checkpoint
195
- config_file = os.path.join(pretrained_model_name_or_path, cls.config_name)
196
- elif subfolder is not None and os.path.isfile(
197
- os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name)
198
- ):
199
- config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name)
200
- else:
201
- raise EnvironmentError(
202
- f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}."
203
- )
204
- else:
205
- try:
206
- # Load from URL or cache if already cached
207
- config_file = hf_hub_download(
208
- pretrained_model_name_or_path,
209
- filename=cls.config_name,
210
- cache_dir=cache_dir,
211
- force_download=force_download,
212
- proxies=proxies,
213
- resume_download=resume_download,
214
- local_files_only=local_files_only,
215
- use_auth_token=use_auth_token,
216
- user_agent=user_agent,
217
- subfolder=subfolder,
218
- revision=revision,
219
- )
220
-
221
- except RepositoryNotFoundError:
222
- raise EnvironmentError(
223
- f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier"
224
- " listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a"
225
- " token having permission to this repo with `use_auth_token` or log in with `huggingface-cli"
226
- " login` and pass `use_auth_token=True`."
227
- )
228
- except RevisionNotFoundError:
229
- raise EnvironmentError(
230
- f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for"
231
- " this model name. Check the model page at"
232
- f" 'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
233
- )
234
- except EntryNotFoundError:
235
- raise EnvironmentError(
236
- f"{pretrained_model_name_or_path} does not appear to have a file named {cls.config_name}."
237
- )
238
- except HTTPError as err:
239
- raise EnvironmentError(
240
- "There was a specific connection error when trying to load"
241
- f" {pretrained_model_name_or_path}:\n{err}"
242
- )
243
- except ValueError:
244
- raise EnvironmentError(
245
- f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
246
- f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
247
- f" directory containing a {cls.config_name} file.\nCheckout your internet connection or see how to"
248
- " run the library in offline mode at"
249
- " 'https://huggingface.co/docs/diffusers/installation#offline-mode'."
250
- )
251
- except EnvironmentError:
252
- raise EnvironmentError(
253
- f"Can't load config for '{pretrained_model_name_or_path}'. If you were trying to load it from "
254
- "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
255
- f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
256
- f"containing a {cls.config_name} file"
257
- )
258
-
259
- try:
260
- # Load config dict
261
- config_dict = cls._dict_from_json_file(config_file)
262
- except (json.JSONDecodeError, UnicodeDecodeError):
263
- raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.")
264
-
265
- return config_dict
266
-
267
- @classmethod
268
- def extract_init_dict(cls, config_dict, **kwargs):
269
- expected_keys = set(dict(inspect.signature(cls.__init__).parameters).keys())
270
- expected_keys.remove("self")
271
- # remove general kwargs if present in dict
272
- if "kwargs" in expected_keys:
273
- expected_keys.remove("kwargs")
274
- # remove keys to be ignored
275
- if len(cls.ignore_for_config) > 0:
276
- expected_keys = expected_keys - set(cls.ignore_for_config)
277
- init_dict = {}
278
- for key in expected_keys:
279
- if key in kwargs:
280
- # overwrite key
281
- init_dict[key] = kwargs.pop(key)
282
- elif key in config_dict:
283
- # use value from config dict
284
- init_dict[key] = config_dict.pop(key)
285
-
286
- unused_kwargs = config_dict.update(kwargs)
287
-
288
- passed_keys = set(init_dict.keys())
289
- if len(expected_keys - passed_keys) > 0:
290
- logger.warning(
291
- f"{expected_keys - passed_keys} was not found in config. Values will be initialized to default values."
292
- )
293
-
294
- return init_dict, unused_kwargs
295
-
296
- @classmethod
297
- def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
298
- with open(json_file, "r", encoding="utf-8") as reader:
299
- text = reader.read()
300
- return json.loads(text)
301
-
302
- def __repr__(self):
303
- return f"{self.__class__.__name__} {self.to_json_string()}"
304
-
305
- @property
306
- def config(self) -> Dict[str, Any]:
307
- return self._internal_dict
308
-
309
- def to_json_string(self) -> str:
310
- """
311
- Serializes this instance to a JSON string.
312
-
313
- Returns:
314
- `str`: String containing all the attributes that make up this configuration instance in JSON format.
315
- """
316
- config_dict = self._internal_dict if hasattr(self, "_internal_dict") else {}
317
- return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
318
-
319
- def to_json_file(self, json_file_path: Union[str, os.PathLike]):
320
- """
321
- Save this instance to a JSON file.
322
-
323
- Args:
324
- json_file_path (`str` or `os.PathLike`):
325
- Path to the JSON file in which this configuration instance's parameters will be saved.
326
- """
327
- with open(json_file_path, "w", encoding="utf-8") as writer:
328
- writer.write(self.to_json_string())
329
-
330
-
331
- class FrozenDict(OrderedDict):
332
- def __init__(self, *args, **kwargs):
333
- super().__init__(*args, **kwargs)
334
-
335
- for key, value in self.items():
336
- setattr(self, key, value)
337
-
338
- self.__frozen = True
339
-
340
- def __delitem__(self, *args, **kwargs):
341
- raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
342
-
343
- def setdefault(self, *args, **kwargs):
344
- raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
345
-
346
- def pop(self, *args, **kwargs):
347
- raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
348
-
349
- def update(self, *args, **kwargs):
350
- raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
351
-
352
- def __setattr__(self, name, value):
353
- if hasattr(self, "__frozen") and self.__frozen:
354
- raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
355
- super().__setattr__(name, value)
356
-
357
- def __setitem__(self, name, value):
358
- if hasattr(self, "__frozen") and self.__frozen:
359
- raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
360
- super().__setitem__(name, value)
361
-
362
-
363
- def register_to_config(init):
364
- r"""
365
- Decorator to apply on the init of classes inheriting from [`ConfigMixin`] so that all the arguments are
366
- automatically sent to `self.register_for_config`. To ignore a specific argument accepted by the init but that
367
- shouldn't be registered in the config, use the `ignore_for_config` class variable
368
-
369
- Warning: Once decorated, all private arguments (beginning with an underscore) are trashed and not sent to the init!
370
- """
371
-
372
- @functools.wraps(init)
373
- def inner_init(self, *args, **kwargs):
374
- # Ignore private kwargs in the init.
375
- init_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")}
376
- init(self, *args, **init_kwargs)
377
- if not isinstance(self, ConfigMixin):
378
- raise RuntimeError(
379
- f"`@register_for_config` was applied to {self.__class__.__name__} init method, but this class does "
380
- "not inherit from `ConfigMixin`."
381
- )
382
-
383
- ignore = getattr(self, "ignore_for_config", [])
384
- # Get positional arguments aligned with kwargs
385
- new_kwargs = {}
386
- signature = inspect.signature(init)
387
- parameters = {
388
- name: p.default for i, (name, p) in enumerate(signature.parameters.items()) if i > 0 and name not in ignore
389
- }
390
- for arg, name in zip(args, parameters.keys()):
391
- new_kwargs[name] = arg
392
-
393
- # Then add all kwargs
394
- new_kwargs.update(
395
- {
396
- k: init_kwargs.get(k, default)
397
- for k, default in parameters.items()
398
- if k not in ignore and k not in new_kwargs
399
- }
400
- )
401
- getattr(self, "register_to_config")(**new_kwargs)
402
-
403
- return inner_init
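
Since the vendored configuration_utils.py is deleted here in favor of the upstream package, a quick hedged sketch of the `ConfigMixin` / `@register_to_config` pattern it implemented (the class name is made up; the import assumes the pip-installed `diffusers`, which exposes the same module):

```python
# Illustrative only: ToyScheduler is a made-up class, not part of this commit.
# Assumes the upstream `diffusers` package, which provides the same
# ConfigMixin / register_to_config pattern as the vendored copy deleted above.
from diffusers.configuration_utils import ConfigMixin, register_to_config


class ToyScheduler(ConfigMixin):
    config_name = 'toy_scheduler_config.json'  # filename written by save_config()

    @register_to_config
    def __init__(self, num_train_timesteps: int = 1000, beta_start: float = 1e-4):
        # The decorator captures the init arguments and, after __init__ returns,
        # records them in self.config (a FrozenDict, plus _class_name and version).
        self.num_train_timesteps = num_train_timesteps
        self.beta_start = beta_start


scheduler = ToyScheduler(num_train_timesteps=500)
print(scheduler.config)                    # frozen dict of the registered arguments
scheduler.save_config('./toy_scheduler')   # writes ./toy_scheduler/toy_scheduler_config.json
```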
 
diffusers/dependency_versions_check.py DELETED
@@ -1,47 +0,0 @@
1
- # Copyright 2020 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- import sys
15
-
16
- from .dependency_versions_table import deps
17
- from .utils.versions import require_version, require_version_core
18
-
19
-
20
- # define which module versions we always want to check at run time
21
- # (usually the ones defined in `install_requires` in setup.py)
22
- #
23
- # order specific notes:
24
- # - tqdm must be checked before tokenizers
25
-
26
- pkgs_to_check_at_runtime = "python tqdm regex requests packaging filelock numpy tokenizers".split()
27
- if sys.version_info < (3, 7):
28
- pkgs_to_check_at_runtime.append("dataclasses")
29
- if sys.version_info < (3, 8):
30
- pkgs_to_check_at_runtime.append("importlib_metadata")
31
-
32
- for pkg in pkgs_to_check_at_runtime:
33
- if pkg in deps:
34
- if pkg == "tokenizers":
35
- # must be loaded here, or else tqdm check may fail
36
- from .utils import is_tokenizers_available
37
-
38
- if not is_tokenizers_available():
39
- continue # not required, check version only if installed
40
-
41
- require_version_core(deps[pkg])
42
- else:
43
- raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py")
44
-
45
-
46
- def dep_version_check(pkg, hint=None):
47
- require_version(deps[pkg], hint)
 
diffusers/dependency_versions_table.py DELETED
@@ -1,26 +0,0 @@
1
- # THIS FILE HAS BEEN AUTOGENERATED. To update:
2
- # 1. modify the `_deps` dict in setup.py
3
- # 2. run `make deps_table_update``
4
- deps = {
5
- "Pillow": "Pillow",
6
- "accelerate": "accelerate>=0.11.0",
7
- "black": "black==22.3",
8
- "datasets": "datasets",
9
- "filelock": "filelock",
10
- "flake8": "flake8>=3.8.3",
11
- "hf-doc-builder": "hf-doc-builder>=0.3.0",
12
- "huggingface-hub": "huggingface-hub>=0.8.1",
13
- "importlib_metadata": "importlib_metadata",
14
- "isort": "isort>=5.5.4",
15
- "modelcards": "modelcards==0.1.4",
16
- "numpy": "numpy",
17
- "pytest": "pytest",
18
- "pytest-timeout": "pytest-timeout",
19
- "pytest-xdist": "pytest-xdist",
20
- "scipy": "scipy",
21
- "regex": "regex!=2019.12.17",
22
- "requests": "requests",
23
- "tensorboard": "tensorboard",
24
- "torch": "torch>=1.4",
25
- "transformers": "transformers>=4.21.0",
26
- }
 
diffusers/dynamic_modules_utils.py DELETED
@@ -1,335 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2021 The HuggingFace Inc. team.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """Utilities to dynamically load objects from the Hub."""
16
-
17
- import importlib
18
- import os
19
- import re
20
- import shutil
21
- import sys
22
- from pathlib import Path
23
- from typing import Dict, Optional, Union
24
-
25
- from huggingface_hub import cached_download
26
-
27
- from .utils import DIFFUSERS_DYNAMIC_MODULE_NAME, HF_MODULES_CACHE, logging
28
-
29
-
30
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
31
-
32
-
33
- def init_hf_modules():
34
- """
35
- Creates the cache directory for modules with an init, and adds it to the Python path.
36
- """
37
- # This function has already been executed if HF_MODULES_CACHE already is in the Python path.
38
- if HF_MODULES_CACHE in sys.path:
39
- return
40
-
41
- sys.path.append(HF_MODULES_CACHE)
42
- os.makedirs(HF_MODULES_CACHE, exist_ok=True)
43
- init_path = Path(HF_MODULES_CACHE) / "__init__.py"
44
- if not init_path.exists():
45
- init_path.touch()
46
-
47
-
48
- def create_dynamic_module(name: Union[str, os.PathLike]):
49
- """
50
- Creates a dynamic module in the cache directory for modules.
51
- """
52
- init_hf_modules()
53
- dynamic_module_path = Path(HF_MODULES_CACHE) / name
54
- # If the parent module does not exist yet, recursively create it.
55
- if not dynamic_module_path.parent.exists():
56
- create_dynamic_module(dynamic_module_path.parent)
57
- os.makedirs(dynamic_module_path, exist_ok=True)
58
- init_path = dynamic_module_path / "__init__.py"
59
- if not init_path.exists():
60
- init_path.touch()
61
-
62
-
63
- def get_relative_imports(module_file):
64
- """
65
- Get the list of modules that are relatively imported in a module file.
66
-
67
- Args:
68
- module_file (`str` or `os.PathLike`): The module file to inspect.
69
- """
70
- with open(module_file, "r", encoding="utf-8") as f:
71
- content = f.read()
72
-
73
- # Imports of the form `import .xxx`
74
- relative_imports = re.findall("^\s*import\s+\.(\S+)\s*$", content, flags=re.MULTILINE)
75
- # Imports of the form `from .xxx import yyy`
76
- relative_imports += re.findall("^\s*from\s+\.(\S+)\s+import", content, flags=re.MULTILINE)
77
- # Unique-ify
78
- return list(set(relative_imports))
79
-
80
-
81
- def get_relative_import_files(module_file):
82
- """
83
- Get the list of all files that are needed for a given module. Note that this function recurses through the relative
84
- imports (if a imports b and b imports c, it will return module files for b and c).
85
-
86
- Args:
87
- module_file (`str` or `os.PathLike`): The module file to inspect.
88
- """
89
- no_change = False
90
- files_to_check = [module_file]
91
- all_relative_imports = []
92
-
93
- # Let's recurse through all relative imports
94
- while not no_change:
95
- new_imports = []
96
- for f in files_to_check:
97
- new_imports.extend(get_relative_imports(f))
98
-
99
- module_path = Path(module_file).parent
100
- new_import_files = [str(module_path / m) for m in new_imports]
101
- new_import_files = [f for f in new_import_files if f not in all_relative_imports]
102
- files_to_check = [f"{f}.py" for f in new_import_files]
103
-
104
- no_change = len(new_import_files) == 0
105
- all_relative_imports.extend(files_to_check)
106
-
107
- return all_relative_imports
108
-
109
-
110
- def check_imports(filename):
111
- """
112
- Check if the current Python environment contains all the libraries that are imported in a file.
113
- """
114
- with open(filename, "r", encoding="utf-8") as f:
115
- content = f.read()
116
-
117
- # Imports of the form `import xxx`
118
- imports = re.findall("^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE)
119
- # Imports of the form `from xxx import yyy`
120
- imports += re.findall("^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE)
121
- # Only keep the top-level module
122
- imports = [imp.split(".")[0] for imp in imports if not imp.startswith(".")]
123
-
124
- # Unique-ify and test we got them all
125
- imports = list(set(imports))
126
- missing_packages = []
127
- for imp in imports:
128
- try:
129
- importlib.import_module(imp)
130
- except ImportError:
131
- missing_packages.append(imp)
132
-
133
- if len(missing_packages) > 0:
134
- raise ImportError(
135
- "This modeling file requires the following packages that were not found in your environment: "
136
- f"{', '.join(missing_packages)}. Run `pip install {' '.join(missing_packages)}`"
137
- )
138
-
139
- return get_relative_imports(filename)
140
-
141
-
142
- def get_class_in_module(class_name, module_path):
143
- """
144
- Import a module on the cache directory for modules and extract a class from it.
145
- """
146
- module_path = module_path.replace(os.path.sep, ".")
147
- module = importlib.import_module(module_path)
148
- return getattr(module, class_name)
149
-
150
-
151
- def get_cached_module_file(
152
- pretrained_model_name_or_path: Union[str, os.PathLike],
153
- module_file: str,
154
- cache_dir: Optional[Union[str, os.PathLike]] = None,
155
- force_download: bool = False,
156
- resume_download: bool = False,
157
- proxies: Optional[Dict[str, str]] = None,
158
- use_auth_token: Optional[Union[bool, str]] = None,
159
- revision: Optional[str] = None,
160
- local_files_only: bool = False,
161
- ):
162
- """
163
- Prepares Downloads a module from a local folder or a distant repo and returns its path inside the cached
164
- Transformers module.
165
-
166
- Args:
167
- pretrained_model_name_or_path (`str` or `os.PathLike`):
168
- This can be either:
169
-
170
- - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
171
- huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
172
- under a user or organization name, like `dbmdz/bert-base-german-cased`.
173
- - a path to a *directory* containing a configuration file saved using the
174
- [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
175
-
176
- module_file (`str`):
177
- The name of the module file containing the class to look for.
178
- cache_dir (`str` or `os.PathLike`, *optional*):
179
- Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
180
- cache should not be used.
181
- force_download (`bool`, *optional*, defaults to `False`):
182
- Whether or not to force to (re-)download the configuration files and override the cached versions if they
183
- exist.
184
- resume_download (`bool`, *optional*, defaults to `False`):
185
- Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
186
- proxies (`Dict[str, str]`, *optional*):
187
- A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
188
- 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
189
- use_auth_token (`str` or *bool*, *optional*):
190
- The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
191
- when running `transformers-cli login` (stored in `~/.huggingface`).
192
- revision (`str`, *optional*, defaults to `"main"`):
193
- The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
194
- git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
195
- identifier allowed by git.
196
- local_files_only (`bool`, *optional*, defaults to `False`):
197
- If `True`, will only try to load the tokenizer configuration from local files.
198
-
199
- <Tip>
200
-
201
- Passing `use_auth_token=True` is required when you want to use a private model.
202
-
203
- </Tip>
204
-
205
- Returns:
206
- `str`: The path to the module inside the cache.
207
- """
208
- # Download and cache module_file from the repo `pretrained_model_name_or_path` of grab it if it's a local file.
209
- pretrained_model_name_or_path = str(pretrained_model_name_or_path)
210
- module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file)
211
- submodule = "local"
212
-
213
- if os.path.isfile(module_file_or_url):
214
- resolved_module_file = module_file_or_url
215
- else:
216
- try:
217
- # Load from URL or cache if already cached
218
- resolved_module_file = cached_download(
219
- module_file_or_url,
220
- cache_dir=cache_dir,
221
- force_download=force_download,
222
- proxies=proxies,
223
- resume_download=resume_download,
224
- local_files_only=local_files_only,
225
- use_auth_token=use_auth_token,
226
- )
227
-
228
- except EnvironmentError:
229
- logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.")
230
- raise
231
-
232
- # Check we have all the requirements in our environment
233
- modules_needed = check_imports(resolved_module_file)
234
-
235
- # Now we move the module inside our cached dynamic modules.
236
- full_submodule = DIFFUSERS_DYNAMIC_MODULE_NAME + os.path.sep + submodule
237
- create_dynamic_module(full_submodule)
238
- submodule_path = Path(HF_MODULES_CACHE) / full_submodule
239
- # We always copy local files (we could hash the file to see if there was a change, and give them the name of
240
- # that hash, to only copy when there is a modification but it seems overkill for now).
241
- # The only reason we do the copy is to avoid putting too many folders in sys.path.
242
- shutil.copy(resolved_module_file, submodule_path / module_file)
243
- for module_needed in modules_needed:
244
- module_needed = f"{module_needed}.py"
245
- shutil.copy(os.path.join(pretrained_model_name_or_path, module_needed), submodule_path / module_needed)
246
- return os.path.join(full_submodule, module_file)
247
-
248
-
249
- def get_class_from_dynamic_module(
250
- pretrained_model_name_or_path: Union[str, os.PathLike],
251
- module_file: str,
252
- class_name: str,
253
- cache_dir: Optional[Union[str, os.PathLike]] = None,
254
- force_download: bool = False,
255
- resume_download: bool = False,
256
- proxies: Optional[Dict[str, str]] = None,
257
- use_auth_token: Optional[Union[bool, str]] = None,
258
- revision: Optional[str] = None,
259
- local_files_only: bool = False,
260
- **kwargs,
261
- ):
262
- """
263
- Extracts a class from a module file, present in the local folder or repository of a model.
264
-
265
- <Tip warning={true}>
266
-
267
- Calling this function will execute the code in the module file found locally or downloaded from the Hub. It should
268
- therefore only be called on trusted repos.
269
-
270
- </Tip>
271
-
272
- Args:
273
- pretrained_model_name_or_path (`str` or `os.PathLike`):
274
- This can be either:
275
-
276
- - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
277
- huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
278
- under a user or organization name, like `dbmdz/bert-base-german-cased`.
279
- - a path to a *directory* containing a configuration file saved using the
280
- [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
281
-
282
- module_file (`str`):
283
- The name of the module file containing the class to look for.
284
- class_name (`str`):
285
- The name of the class to import in the module.
286
- cache_dir (`str` or `os.PathLike`, *optional*):
287
- Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
288
- cache should not be used.
289
- force_download (`bool`, *optional*, defaults to `False`):
290
- Whether or not to force to (re-)download the configuration files and override the cached versions if they
291
- exist.
292
- resume_download (`bool`, *optional*, defaults to `False`):
293
- Whether or not to delete incompletely received files. Will attempt to resume the download if such a file exists.
294
- proxies (`Dict[str, str]`, *optional*):
295
- A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
296
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
297
- use_auth_token (`str` or `bool`, *optional*):
298
- The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
299
- when running `transformers-cli login` (stored in `~/.huggingface`).
300
- revision (`str`, *optional*, defaults to `"main"`):
301
- The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
302
- git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
303
- identifier allowed by git.
304
- local_files_only (`bool`, *optional*, defaults to `False`):
305
- If `True`, will only try to load the tokenizer configuration from local files.
306
-
307
- <Tip>
308
-
309
- Passing `use_auth_token=True` is required when you want to use a private model.
310
-
311
- </Tip>
312
-
313
- Returns:
314
- `type`: The class, dynamically imported from the module.
315
-
316
- Examples:
317
-
318
- ```python
319
- # Download module `modeling.py` from huggingface.co and cache then extract the class `MyBertModel` from this
320
- # module.
321
- cls = get_class_from_dynamic_module("sgugger/my-bert-model", "modeling.py", "MyBertModel")
322
- ```"""
323
- # And lastly we get the class inside our newly created module
324
- final_module = get_cached_module_file(
325
- pretrained_model_name_or_path,
326
- module_file,
327
- cache_dir=cache_dir,
328
- force_download=force_download,
329
- resume_download=resume_download,
330
- proxies=proxies,
331
- use_auth_token=use_auth_token,
332
- revision=revision,
333
- local_files_only=local_files_only,
334
- )
335
- return get_class_in_module(class_name, final_module.replace(".py", ""))
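The example in the docstring above covers the Hub path; the helper resolves a plain local checkout the same way. A minimal sketch of that, with the directory, file, and class names all being placeholders:

```python
# Minimal sketch of dynamic class loading from a local directory, assuming
# ./my_model_directory/ contains a pipeline.py that defines MyPipeline.
# All three names are hypothetical; any relative imports of pipeline.py are
# copied into the dynamic-module cache alongside it, as implemented above.
from diffusers.dynamic_modules_utils import get_class_from_dynamic_module

pipeline_cls = get_class_from_dynamic_module(
    "./my_model_directory",      # local folder (or a Hub model id)
    module_file="pipeline.py",   # file whose code is executed from the cache
    class_name="MyPipeline",     # class object returned to the caller
)
```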
diffusers/hub_utils.py DELETED
@@ -1,197 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2022 The HuggingFace Inc. team.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
-
17
- import os
18
- import shutil
19
- from pathlib import Path
20
- from typing import Optional
21
-
22
- from huggingface_hub import HfFolder, Repository, whoami
23
-
24
- from .pipeline_utils import DiffusionPipeline
25
- from .utils import is_modelcards_available, logging
26
-
27
-
28
- if is_modelcards_available():
29
- from modelcards import CardData, ModelCard
30
-
31
-
32
- logger = logging.get_logger(__name__)
33
-
34
-
35
- MODEL_CARD_TEMPLATE_PATH = Path(__file__).parent / "utils" / "model_card_template.md"
36
-
37
-
38
- def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
39
- if token is None:
40
- token = HfFolder.get_token()
41
- if organization is None:
42
- username = whoami(token)["name"]
43
- return f"{username}/{model_id}"
44
- else:
45
- return f"{organization}/{model_id}"
46
-
47
-
48
- def init_git_repo(args, at_init: bool = False):
49
- """
50
- Initializes a git repo in `args.hub_model_id`.
51
- Args:
52
- at_init (`bool`, *optional*, defaults to `False`):
53
- Whether this function is called before any training or not. If `self.args.overwrite_output_dir` is `True`
54
- and `at_init` is `True`, the path to the repo (which is `self.args.output_dir`) might be wiped out.
55
- """
56
- if hasattr(args, "local_rank") and args.local_rank not in [-1, 0]:
57
- return
58
- hub_token = args.hub_token if hasattr(args, "hub_token") else None
59
- use_auth_token = True if hub_token is None else hub_token
60
- if not hasattr(args, "hub_model_id") or args.hub_model_id is None:
61
- repo_name = Path(args.output_dir).absolute().name
62
- else:
63
- repo_name = args.hub_model_id
64
- if "/" not in repo_name:
65
- repo_name = get_full_repo_name(repo_name, token=hub_token)
66
-
67
- try:
68
- repo = Repository(
69
- args.output_dir,
70
- clone_from=repo_name,
71
- use_auth_token=use_auth_token,
72
- private=args.hub_private_repo,
73
- )
74
- except EnvironmentError:
75
- if args.overwrite_output_dir and at_init:
76
- # Try again after wiping output_dir
77
- shutil.rmtree(args.output_dir)
78
- repo = Repository(
79
- args.output_dir,
80
- clone_from=repo_name,
81
- use_auth_token=use_auth_token,
82
- )
83
- else:
84
- raise
85
-
86
- repo.git_pull()
87
-
88
- # By default, ignore the checkpoint folders
89
- if not os.path.exists(os.path.join(args.output_dir, ".gitignore")):
90
- with open(os.path.join(args.output_dir, ".gitignore"), "w", encoding="utf-8") as writer:
91
- writer.writelines(["checkpoint-*/"])
92
-
93
- return repo
94
-
95
-
96
- def push_to_hub(
97
- args,
98
- pipeline: DiffusionPipeline,
99
- repo: Repository,
100
- commit_message: Optional[str] = "End of training",
101
- blocking: bool = True,
102
- **kwargs,
103
- ) -> str:
104
- """
105
- Upload the `pipeline` to the 🤗 model hub on the repo `args.hub_model_id`.
106
- Parameters:
107
- commit_message (`str`, *optional*, defaults to `"End of training"`):
108
- Message to commit while pushing.
109
- blocking (`bool`, *optional*, defaults to `True`):
110
- Whether the function should return only when the `git push` has finished.
111
- kwargs:
112
- Additional keyword arguments passed along to [`create_model_card`].
113
- Returns:
114
- The url of the commit of your model in the given repository if `blocking=False`, a tuple with the url of the
115
- commit and an object to track the progress of the commit if `blocking=True`
116
- """
117
-
118
- if not hasattr(args, "hub_model_id") or args.hub_model_id is None:
119
- model_name = Path(args.output_dir).name
120
- else:
121
- model_name = args.hub_model_id.split("/")[-1]
122
-
123
- output_dir = args.output_dir
124
- os.makedirs(output_dir, exist_ok=True)
125
- logger.info(f"Saving pipeline checkpoint to {output_dir}")
126
- pipeline.save_pretrained(output_dir)
127
-
128
- # Only push from one node.
129
- if hasattr(args, "local_rank") and args.local_rank not in [-1, 0]:
130
- return
131
-
132
- # Cancel any async push in progress if blocking=True. The commits will all be pushed together.
133
- if (
134
- blocking
135
- and len(repo.command_queue) > 0
136
- and repo.command_queue[-1] is not None
137
- and not repo.command_queue[-1].is_done
138
- ):
139
- repo.command_queue[-1]._process.kill()
140
-
141
- git_head_commit_url = repo.push_to_hub(commit_message=commit_message, blocking=blocking, auto_lfs_prune=True)
142
- # push separately the model card to be independent from the rest of the model
143
- create_model_card(args, model_name=model_name)
144
- try:
145
- repo.push_to_hub(commit_message="update model card README.md", blocking=blocking, auto_lfs_prune=True)
146
- except EnvironmentError as exc:
147
- logger.error(f"Error pushing update to the model card. Please read logs and retry.\n${exc}")
148
-
149
- return git_head_commit_url
150
-
151
-
152
- def create_model_card(args, model_name):
153
- if not is_modelcards_available():
154
- raise ValueError(
155
- "Please make sure to have `modelcards` installed when using the `create_model_card` function. You can"
156
- " install the package with `pip install modelcards`."
157
- )
158
-
159
- if hasattr(args, "local_rank") and args.local_rank not in [-1, 0]:
160
- return
161
-
162
- hub_token = args.hub_token if hasattr(args, "hub_token") else None
163
- repo_name = get_full_repo_name(model_name, token=hub_token)
164
-
165
- model_card = ModelCard.from_template(
166
- card_data=CardData( # Card metadata object that will be converted to YAML block
167
- language="en",
168
- license="apache-2.0",
169
- library_name="diffusers",
170
- tags=[],
171
- datasets=args.dataset_name,
172
- metrics=[],
173
- ),
174
- template_path=MODEL_CARD_TEMPLATE_PATH,
175
- model_name=model_name,
176
- repo_name=repo_name,
177
- dataset_name=args.dataset_name if hasattr(args, "dataset_name") else None,
178
- learning_rate=args.learning_rate,
179
- train_batch_size=args.train_batch_size,
180
- eval_batch_size=args.eval_batch_size,
181
- gradient_accumulation_steps=args.gradient_accumulation_steps
182
- if hasattr(args, "gradient_accumulation_steps")
183
- else None,
184
- adam_beta1=args.adam_beta1 if hasattr(args, "adam_beta1") else None,
185
- adam_beta2=args.adam_beta2 if hasattr(args, "adam_beta2") else None,
186
- adam_weight_decay=args.adam_weight_decay if hasattr(args, "adam_weight_decay") else None,
187
- adam_epsilon=args.adam_epsilon if hasattr(args, "adam_epsilon") else None,
188
- lr_scheduler=args.lr_scheduler if hasattr(args, "lr_scheduler") else None,
189
- lr_warmup_steps=args.lr_warmup_steps if hasattr(args, "lr_warmup_steps") else None,
190
- ema_inv_gamma=args.ema_inv_gamma if hasattr(args, "ema_inv_gamma") else None,
191
- ema_power=args.ema_power if hasattr(args, "ema_power") else None,
192
- ema_max_decay=args.ema_max_decay if hasattr(args, "ema_max_decay") else None,
193
- mixed_precision=args.mixed_precision,
194
- )
195
-
196
- card_path = os.path.join(args.output_dir, "README.md")
197
- model_card.save(card_path)
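Taken together, these helpers are meant to be driven by a training script's argument namespace; a minimal sketch of that wiring, where every attribute value below is a placeholder:

```python
# Hypothetical wiring of the hub helpers above from a training script.
# `args` is a stand-in for the script's argparse namespace; only the attributes
# that hub_utils actually reads are filled in, and all values are placeholders.
from types import SimpleNamespace

from diffusers.hub_utils import init_git_repo, push_to_hub

args = SimpleNamespace(
    output_dir="sd-finetune-output",
    hub_model_id=None,             # None -> repo name falls back to output_dir
    hub_token=None,                # None -> use the cached `huggingface-cli login` token
    hub_private_repo=False,
    overwrite_output_dir=True,
    dataset_name="my-dataset",     # consumed by create_model_card
    learning_rate=1e-4,
    train_batch_size=4,
    eval_batch_size=4,
    mixed_precision="no",
)

repo = init_git_repo(args, at_init=True)   # clone (or create) the Hub repo into output_dir
# ... training loop that produces `pipeline`, a DiffusionPipeline ...
# push_to_hub(args, pipeline, repo)        # saves the pipeline, pushes it, and updates the model card
```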
diffusers/modeling_utils.py DELETED
@@ -1,542 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2022 The HuggingFace Inc. team.
3
- # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
- import os
18
- from typing import Callable, List, Optional, Tuple, Union
19
-
20
- import torch
21
- from torch import Tensor, device
22
-
23
- from huggingface_hub import hf_hub_download
24
- from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
25
- from requests import HTTPError
26
-
27
- from .utils import CONFIG_NAME, DIFFUSERS_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT, logging
28
-
29
-
30
- WEIGHTS_NAME = "diffusion_pytorch_model.bin"
31
-
32
-
33
- logger = logging.get_logger(__name__)
34
-
35
-
36
- def get_parameter_device(parameter: torch.nn.Module):
37
- try:
38
- return next(parameter.parameters()).device
39
- except StopIteration:
40
- # For torch.nn.DataParallel compatibility in PyTorch 1.5
41
-
42
- def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
43
- tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
44
- return tuples
45
-
46
- gen = parameter._named_members(get_members_fn=find_tensor_attributes)
47
- first_tuple = next(gen)
48
- return first_tuple[1].device
49
-
50
-
51
- def get_parameter_dtype(parameter: torch.nn.Module):
52
- try:
53
- return next(parameter.parameters()).dtype
54
- except StopIteration:
55
- # For torch.nn.DataParallel compatibility in PyTorch 1.5
56
-
57
- def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
58
- tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
59
- return tuples
60
-
61
- gen = parameter._named_members(get_members_fn=find_tensor_attributes)
62
- first_tuple = next(gen)
63
- return first_tuple[1].dtype
64
-
65
-
66
- def load_state_dict(checkpoint_file: Union[str, os.PathLike]):
67
- """
68
- Reads a PyTorch checkpoint file, returning properly formatted errors if they arise.
69
- """
70
- try:
71
- return torch.load(checkpoint_file, map_location="cpu")
72
- except Exception as e:
73
- try:
74
- with open(checkpoint_file) as f:
75
- if f.read().startswith("version"):
76
- raise OSError(
77
- "You seem to have cloned a repository without having git-lfs installed. Please install "
78
- "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
79
- "you cloned."
80
- )
81
- else:
82
- raise ValueError(
83
- f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained "
84
- "model. Make sure you have saved the model properly."
85
- ) from e
86
- except (UnicodeDecodeError, ValueError):
87
- raise OSError(
88
- f"Unable to load weights from pytorch checkpoint file for '{checkpoint_file}' "
89
- f"at '{checkpoint_file}'. "
90
- "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True."
91
- )
92
-
93
-
94
- def _load_state_dict_into_model(model_to_load, state_dict):
95
- # Convert old format to new format if needed from a PyTorch state_dict
96
- # copy state_dict so _load_from_state_dict can modify it
97
- state_dict = state_dict.copy()
98
- error_msgs = []
99
-
100
- # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
101
- # so we need to apply the function recursively.
102
- def load(module: torch.nn.Module, prefix=""):
103
- args = (state_dict, prefix, {}, True, [], [], error_msgs)
104
- module._load_from_state_dict(*args)
105
-
106
- for name, child in module._modules.items():
107
- if child is not None:
108
- load(child, prefix + name + ".")
109
-
110
- load(model_to_load)
111
-
112
- return error_msgs
113
-
114
-
115
- class ModelMixin(torch.nn.Module):
116
- r"""
117
- Base class for all models.
118
-
119
- [`ModelMixin`] takes care of storing the configuration of the models and handles methods for loading, downloading
120
- and saving models.
121
-
122
- - **config_name** ([`str`]) -- A filename under which the model should be stored when calling
123
- [`~modeling_utils.ModelMixin.save_pretrained`].
124
- """
125
- config_name = CONFIG_NAME
126
- _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"]
127
-
128
- def __init__(self):
129
- super().__init__()
130
-
131
- def save_pretrained(
132
- self,
133
- save_directory: Union[str, os.PathLike],
134
- is_main_process: bool = True,
135
- save_function: Callable = torch.save,
136
- ):
137
- """
138
- Save a model and its configuration file to a directory, so that it can be re-loaded using the
139
- [`~modeling_utils.ModelMixin.from_pretrained`] class method.
140
-
141
- Arguments:
142
- save_directory (`str` or `os.PathLike`):
143
- Directory to which to save. Will be created if it doesn't exist.
144
- is_main_process (`bool`, *optional*, defaults to `True`):
145
- Whether the process calling this is the main process or not. Useful when in distributed training like
146
- TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
147
- the main process to avoid race conditions.
148
- save_function (`Callable`):
149
- The function to use to save the state dictionary. Useful on distributed training like TPUs when one
150
- need to replace `torch.save` by another method.
151
- """
152
- if os.path.isfile(save_directory):
153
- logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
154
- return
155
-
156
- os.makedirs(save_directory, exist_ok=True)
157
-
158
- model_to_save = self
159
-
160
- # Attach architecture to the config
161
- # Save the config
162
- if is_main_process:
163
- model_to_save.save_config(save_directory)
164
-
165
- # Save the model
166
- state_dict = model_to_save.state_dict()
167
-
168
- # Clean the folder from a previous save
169
- for filename in os.listdir(save_directory):
170
- full_filename = os.path.join(save_directory, filename)
171
- # If we have a shard file that is not going to be replaced, we delete it, but only from the main process
172
- # in distributed settings to avoid race conditions.
173
- if filename.startswith(WEIGHTS_NAME[:-4]) and os.path.isfile(full_filename) and is_main_process:
174
- os.remove(full_filename)
175
-
176
- # Save the model
177
- save_function(state_dict, os.path.join(save_directory, WEIGHTS_NAME))
178
-
179
- logger.info(f"Model weights saved in {os.path.join(save_directory, WEIGHTS_NAME)}")
180
-
181
- @classmethod
182
- def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
183
- r"""
184
- Instantiate a pretrained pytorch model from a pre-trained model configuration.
185
-
186
- The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
187
- the model, you should first set it back in training mode with `model.train()`.
188
-
189
- The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
190
- pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
191
- task.
192
-
193
- The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
194
- weights are discarded.
195
-
196
- Parameters:
197
- pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
198
- Can be either:
199
-
200
- - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
201
- Valid model ids should have an organization name, like `google/ddpm-celebahq-256`.
202
- - A path to a *directory* containing model weights saved using [`~ModelMixin.save_config`], e.g.,
203
- `./my_model_directory/`.
204
-
205
- cache_dir (`Union[str, os.PathLike]`, *optional*):
206
- Path to a directory in which a downloaded pretrained model configuration should be cached if the
207
- standard cache should not be used.
208
- torch_dtype (`str` or `torch.dtype`, *optional*):
209
- Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
210
- will be automatically derived from the model's weights.
211
- force_download (`bool`, *optional*, defaults to `False`):
212
- Whether or not to force the (re-)download of the model weights and configuration files, overriding the
213
- cached versions if they exist.
214
- resume_download (`bool`, *optional*, defaults to `False`):
215
- Whether or not to delete incompletely received files. Will attempt to resume the download if such a
216
- file exists.
217
- proxies (`Dict[str, str]`, *optional*):
218
- A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
219
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
220
- output_loading_info(`bool`, *optional*, defaults to `False`):
221
- Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
222
- local_files_only(`bool`, *optional*, defaults to `False`):
223
- Whether or not to only look at local files (i.e., do not try to download the model).
224
- use_auth_token (`str` or *bool*, *optional*):
225
- The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
226
- when running `diffusers-cli login` (stored in `~/.huggingface`).
227
- revision (`str`, *optional*, defaults to `"main"`):
228
- The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
229
- git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
230
- identifier allowed by git.
231
- mirror (`str`, *optional*):
232
- Mirror source to accelerate downloads in China. If you are from China and have an accessibility
233
- problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
234
- Please refer to the mirror site for more information.
235
-
236
- <Tip>
237
-
238
- Passing `use_auth_token=True` is required when you want to use a private model.
239
-
240
- </Tip>
241
-
242
- <Tip>
243
-
244
- Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use
245
- this method in a firewalled environment.
246
-
247
- </Tip>
248
-
249
- """
250
- cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
251
- ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
252
- force_download = kwargs.pop("force_download", False)
253
- resume_download = kwargs.pop("resume_download", False)
254
- proxies = kwargs.pop("proxies", None)
255
- output_loading_info = kwargs.pop("output_loading_info", False)
256
- local_files_only = kwargs.pop("local_files_only", False)
257
- use_auth_token = kwargs.pop("use_auth_token", None)
258
- revision = kwargs.pop("revision", None)
259
- from_auto_class = kwargs.pop("_from_auto", False)
260
- torch_dtype = kwargs.pop("torch_dtype", None)
261
- subfolder = kwargs.pop("subfolder", None)
262
-
263
- user_agent = {"file_type": "model", "framework": "pytorch", "from_auto_class": from_auto_class}
264
-
265
- # Load config if we don't provide a configuration
266
- config_path = pretrained_model_name_or_path
267
- model, unused_kwargs = cls.from_config(
268
- config_path,
269
- cache_dir=cache_dir,
270
- return_unused_kwargs=True,
271
- force_download=force_download,
272
- resume_download=resume_download,
273
- proxies=proxies,
274
- local_files_only=local_files_only,
275
- use_auth_token=use_auth_token,
276
- revision=revision,
277
- subfolder=subfolder,
278
- **kwargs,
279
- )
280
-
281
- if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype):
282
- raise ValueError(
283
- f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}."
284
- )
285
- elif torch_dtype is not None:
286
- model = model.to(torch_dtype)
287
-
288
- model.register_to_config(_name_or_path=pretrained_model_name_or_path)
289
- # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the
290
- # Load model
291
- pretrained_model_name_or_path = str(pretrained_model_name_or_path)
292
- if os.path.isdir(pretrained_model_name_or_path):
293
- if os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
294
- # Load from a PyTorch checkpoint
295
- model_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
296
- elif subfolder is not None and os.path.isfile(
297
- os.path.join(pretrained_model_name_or_path, subfolder, WEIGHTS_NAME)
298
- ):
299
- model_file = os.path.join(pretrained_model_name_or_path, subfolder, WEIGHTS_NAME)
300
- else:
301
- raise EnvironmentError(
302
- f"Error no file named {WEIGHTS_NAME} found in directory {pretrained_model_name_or_path}."
303
- )
304
- else:
305
- try:
306
- # Load from URL or cache if already cached
307
- model_file = hf_hub_download(
308
- pretrained_model_name_or_path,
309
- filename=WEIGHTS_NAME,
310
- cache_dir=cache_dir,
311
- force_download=force_download,
312
- proxies=proxies,
313
- resume_download=resume_download,
314
- local_files_only=local_files_only,
315
- use_auth_token=use_auth_token,
316
- user_agent=user_agent,
317
- subfolder=subfolder,
318
- revision=revision,
319
- )
320
-
321
- except RepositoryNotFoundError:
322
- raise EnvironmentError(
323
- f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier "
324
- "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a "
325
- "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli "
326
- "login` and pass `use_auth_token=True`."
327
- )
328
- except RevisionNotFoundError:
329
- raise EnvironmentError(
330
- f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for "
331
- "this model name. Check the model page at "
332
- f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
333
- )
334
- except EntryNotFoundError:
335
- raise EnvironmentError(
336
- f"{pretrained_model_name_or_path} does not appear to have a file named {WEIGHTS_NAME}."
337
- )
338
- except HTTPError as err:
339
- raise EnvironmentError(
340
- "There was a specific connection error when trying to load"
341
- f" {pretrained_model_name_or_path}:\n{err}"
342
- )
343
- except ValueError:
344
- raise EnvironmentError(
345
- f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
346
- f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
347
- f" directory containing a file named {WEIGHTS_NAME} or"
348
- " \nCheckout your internet connection or see how to run the library in"
349
- " offline mode at 'https://huggingface.co/docs/diffusers/installation#offline-mode'."
350
- )
351
- except EnvironmentError:
352
- raise EnvironmentError(
353
- f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from "
354
- "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
355
- f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
356
- f"containing a file named {WEIGHTS_NAME}"
357
- )
358
-
359
- # restore default dtype
360
- state_dict = load_state_dict(model_file)
361
- model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
362
- model,
363
- state_dict,
364
- model_file,
365
- pretrained_model_name_or_path,
366
- ignore_mismatched_sizes=ignore_mismatched_sizes,
367
- )
368
-
369
- # Set model in evaluation mode to deactivate DropOut modules by default
370
- model.eval()
371
-
372
- if output_loading_info:
373
- loading_info = {
374
- "missing_keys": missing_keys,
375
- "unexpected_keys": unexpected_keys,
376
- "mismatched_keys": mismatched_keys,
377
- "error_msgs": error_msgs,
378
- }
379
- return model, loading_info
380
-
381
- return model
382
-
383
- @classmethod
384
- def _load_pretrained_model(
385
- cls,
386
- model,
387
- state_dict,
388
- resolved_archive_file,
389
- pretrained_model_name_or_path,
390
- ignore_mismatched_sizes=False,
391
- ):
392
- # Retrieve missing & unexpected_keys
393
- model_state_dict = model.state_dict()
394
- loaded_keys = [k for k in state_dict.keys()]
395
-
396
- expected_keys = list(model_state_dict.keys())
397
-
398
- original_loaded_keys = loaded_keys
399
-
400
- missing_keys = list(set(expected_keys) - set(loaded_keys))
401
- unexpected_keys = list(set(loaded_keys) - set(expected_keys))
402
-
403
- # Make sure we are able to load base models as well as derived models (with heads)
404
- model_to_load = model
405
-
406
- def _find_mismatched_keys(
407
- state_dict,
408
- model_state_dict,
409
- loaded_keys,
410
- ignore_mismatched_sizes,
411
- ):
412
- mismatched_keys = []
413
- if ignore_mismatched_sizes:
414
- for checkpoint_key in loaded_keys:
415
- model_key = checkpoint_key
416
-
417
- if (
418
- model_key in model_state_dict
419
- and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape
420
- ):
421
- mismatched_keys.append(
422
- (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)
423
- )
424
- del state_dict[checkpoint_key]
425
- return mismatched_keys
426
-
427
- if state_dict is not None:
428
- # Whole checkpoint
429
- mismatched_keys = _find_mismatched_keys(
430
- state_dict,
431
- model_state_dict,
432
- original_loaded_keys,
433
- ignore_mismatched_sizes,
434
- )
435
- error_msgs = _load_state_dict_into_model(model_to_load, state_dict)
436
-
437
- if len(error_msgs) > 0:
438
- error_msg = "\n\t".join(error_msgs)
439
- if "size mismatch" in error_msg:
440
- error_msg += (
441
- "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
442
- )
443
- raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
444
-
445
- if len(unexpected_keys) > 0:
446
- logger.warning(
447
- f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
448
- f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
449
- f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task"
450
- " or with another architecture (e.g. initializing a BertForSequenceClassification model from a"
451
- " BertForPreTraining model).\n- This IS NOT expected if you are initializing"
452
- f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly"
453
- " identical (initializing a BertForSequenceClassification model from a"
454
- " BertForSequenceClassification model)."
455
- )
456
- else:
457
- logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
458
- if len(missing_keys) > 0:
459
- logger.warning(
460
- f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
461
- f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
462
- " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
463
- )
464
- elif len(mismatched_keys) == 0:
465
- logger.info(
466
- f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
467
- f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the"
468
- f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions"
469
- " without further training."
470
- )
471
- if len(mismatched_keys) > 0:
472
- mismatched_warning = "\n".join(
473
- [
474
- f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
475
- for key, shape1, shape2 in mismatched_keys
476
- ]
477
- )
478
- logger.warning(
479
- f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
480
- f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not"
481
- f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be"
482
- " able to use it for predictions and inference."
483
- )
484
-
485
- return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs
486
-
487
- @property
488
- def device(self) -> device:
489
- """
490
- `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
491
- device).
492
- """
493
- return get_parameter_device(self)
494
-
495
- @property
496
- def dtype(self) -> torch.dtype:
497
- """
498
- `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
499
- """
500
- return get_parameter_dtype(self)
501
-
502
- def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int:
503
- """
504
- Get number of (optionally, trainable or non-embeddings) parameters in the module.
505
-
506
- Args:
507
- only_trainable (`bool`, *optional*, defaults to `False`):
508
- Whether or not to return only the number of trainable parameters
509
-
510
- exclude_embeddings (`bool`, *optional*, defaults to `False`):
511
- Whether or not to return only the number of non-embeddings parameters
512
-
513
- Returns:
514
- `int`: The number of parameters.
515
- """
516
-
517
- if exclude_embeddings:
518
- embedding_param_names = [
519
- f"{name}.weight"
520
- for name, module_type in self.named_modules()
521
- if isinstance(module_type, torch.nn.Embedding)
522
- ]
523
- non_embedding_parameters = [
524
- parameter for name, parameter in self.named_parameters() if name not in embedding_param_names
525
- ]
526
- return sum(p.numel() for p in non_embedding_parameters if p.requires_grad or not only_trainable)
527
- else:
528
- return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable)
529
-
530
-
531
- def unwrap_model(model: torch.nn.Module) -> torch.nn.Module:
532
- """
533
- Recursively unwraps a model from potential containers (as used in distributed training).
534
-
535
- Args:
536
- model (`torch.nn.Module`): The model to unwrap.
537
- """
538
- # since there could be multiple levels of wrapping, unwrap recursively
539
- if hasattr(model, "module"):
540
- return unwrap_model(model.module)
541
- else:
542
- return model
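For reference, this loading path is how individual model components are typically pulled out of a checkpoint; a hedged example, where the repo id and `subfolder="unet"` layout are assumptions about the checkpoint rather than guarantees of this file:

```python
# Hedged example of ModelMixin.from_pretrained as implemented above: load the
# UNet of a Stable Diffusion checkpoint. Repo id and subfolder are assumptions.
import torch

from diffusers.models import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    subfolder="unet",           # diffusion_pytorch_model.bin lives under unet/
    torch_dtype=torch.float16,  # cast after loading, as handled above
    use_auth_token=True,        # needed for gated or private checkpoints
)
print(f"{unet.num_parameters() / 1e6:.0f}M parameters, dtype={unet.dtype}, device={unet.device}")
```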
diffusers/models/__init__.py DELETED
@@ -1,17 +0,0 @@
1
- # Copyright 2022 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from .unet_2d import UNet2DModel
16
- from .unet_2d_condition import UNet2DConditionModel
17
- from .vae import AutoencoderKL, VQModel
diffusers/models/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (313 Bytes)
 
diffusers/models/__pycache__/attention.cpython-310.pyc DELETED
Binary file (14.3 kB)
 
diffusers/models/__pycache__/embeddings.cpython-310.pyc DELETED
Binary file (3.72 kB)
 
diffusers/models/__pycache__/resnet.cpython-310.pyc DELETED
Binary file (14.5 kB)
 
diffusers/models/__pycache__/unet_2d.cpython-310.pyc DELETED
Binary file (7.94 kB)
 
diffusers/models/__pycache__/unet_2d_condition.cpython-310.pyc DELETED
Binary file (8.73 kB)
 
diffusers/models/__pycache__/unet_blocks.cpython-310.pyc DELETED
Binary file (23.7 kB)
 
diffusers/models/__pycache__/vae.cpython-310.pyc DELETED
Binary file (16.5 kB)
 
diffusers/models/attention.py DELETED
@@ -1,409 +0,0 @@
1
- import math
2
- from collections import defaultdict
3
- from typing import Optional
4
-
5
- import torch
6
- import torch.nn.functional as F
7
- from torch import nn
8
-
9
-
10
- class AttentionBlock(nn.Module):
11
- """
12
- An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
13
- to the N-d case.
14
- https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
15
- Uses three q, k, v linear layers to compute attention.
16
-
17
- Parameters:
18
- channels (:obj:`int`): The number of channels in the input and output.
19
- num_head_channels (:obj:`int`, *optional*):
20
- The number of channels in each head. If None, then `num_heads` = 1.
21
- num_groups (:obj:`int`, *optional*, defaults to 32): The number of groups to use for group norm.
22
- rescale_output_factor (:obj:`float`, *optional*, defaults to 1.0): The factor to rescale the output by.
23
- eps (:obj:`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm.
24
- """
25
-
26
- def __init__(
27
- self,
28
- channels: int,
29
- num_head_channels: Optional[int] = None,
30
- num_groups: int = 32,
31
- rescale_output_factor: float = 1.0,
32
- eps: float = 1e-5,
33
- ):
34
- super().__init__()
35
- self.channels = channels
36
-
37
- self.num_heads = channels // num_head_channels if num_head_channels is not None else 1
38
- self.num_head_size = num_head_channels
39
- self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=num_groups, eps=eps, affine=True)
40
-
41
- # define q,k,v as linear layers
42
- self.query = nn.Linear(channels, channels)
43
- self.key = nn.Linear(channels, channels)
44
- self.value = nn.Linear(channels, channels)
45
-
46
- self.rescale_output_factor = rescale_output_factor
47
- self.proj_attn = nn.Linear(channels, channels, 1)
48
-
49
- def transpose_for_scores(self, projection: torch.Tensor) -> torch.Tensor:
50
- new_projection_shape = projection.size()[:-1] + (self.num_heads, -1)
51
- # move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
52
- new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
53
- return new_projection
54
-
55
- def forward(self, hidden_states):
56
- residual = hidden_states
57
- batch, channel, height, width = hidden_states.shape
58
-
59
- # norm
60
- hidden_states = self.group_norm(hidden_states)
61
-
62
- hidden_states = hidden_states.view(batch, channel, height * width).transpose(1, 2)
63
-
64
- # proj to q, k, v
65
- query_proj = self.query(hidden_states)
66
- key_proj = self.key(hidden_states)
67
- value_proj = self.value(hidden_states)
68
-
69
- # transpose
70
- query_states = self.transpose_for_scores(query_proj)
71
- key_states = self.transpose_for_scores(key_proj)
72
- value_states = self.transpose_for_scores(value_proj)
73
-
74
- # get scores
75
- scale = 1 / math.sqrt(math.sqrt(self.channels / self.num_heads))
76
-
77
- attention_scores = torch.matmul(query_states * scale, key_states.transpose(-1, -2) * scale)
78
- attention_probs = torch.softmax(attention_scores.float(), dim=-1).type(attention_scores.dtype)
79
-
80
- # compute attention output
81
- hidden_states = torch.matmul(attention_probs, value_states)
82
-
83
- hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
84
- new_hidden_states_shape = hidden_states.size()[:-2] + (self.channels,)
85
- hidden_states = hidden_states.view(new_hidden_states_shape)
86
-
87
- # compute next hidden_states
88
- hidden_states = self.proj_attn(hidden_states)
89
- hidden_states = hidden_states.transpose(-1, -2).reshape(batch, channel, height, width)
90
-
91
- # res connect and rescale
92
- hidden_states = (hidden_states + residual) / self.rescale_output_factor
93
- return hidden_states
94
-
95
-
96
- class SpatialTransformer(nn.Module):
97
- """
98
- Transformer block for image-like data. First, project the input (aka embedding) and reshape to b, t, d. Then apply
99
- standard transformer action. Finally, reshape to image.
100
-
101
- Parameters:
102
- in_channels (:obj:`int`): The number of channels in the input and output.
103
- n_heads (:obj:`int`): The number of heads to use for multi-head attention.
104
- d_head (:obj:`int`): The number of channels in each head.
105
- depth (:obj:`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
106
- dropout (:obj:`float`, *optional*, defaults to 0.0): The dropout probability to use.
107
- context_dim (:obj:`int`, *optional*): The number of context dimensions to use.
108
- """
109
-
110
- def __init__(
111
- self,
112
- in_channels: int,
113
- n_heads: int,
114
- d_head: int,
115
- depth: int = 1,
116
- dropout: float = 0.0,
117
- context_dim: Optional[int] = None,
118
- ):
119
- super().__init__()
120
- self.n_heads = n_heads
121
- self.d_head = d_head
122
- self.in_channels = in_channels
123
- inner_dim = n_heads * d_head
124
- self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
125
-
126
- self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
127
-
128
- self.transformer_blocks = nn.ModuleList(
129
- [
130
- BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim)
131
- for d in range(depth)
132
- ]
133
- )
134
-
135
- self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
136
-
137
- def _set_attention_slice(self, slice_size):
138
- for block in self.transformer_blocks:
139
- block._set_attention_slice(slice_size)
140
-
141
- def forward(self, x, context=None):
142
- # note: if no context is given, cross-attention defaults to self-attention
143
- b, c, h, w = x.shape
144
- x_in = x
145
- x = self.norm(x)
146
- x = self.proj_in(x)
147
- x = x.permute(0, 2, 3, 1).reshape(b, h * w, c)
148
- for block in self.transformer_blocks:
149
- x = block(x, context=context)
150
- x = x.reshape(b, h, w, c).permute(0, 3, 1, 2)
151
- x = self.proj_out(x)
152
- return x + x_in
153
-
154
-
155
- class BasicTransformerBlock(nn.Module):
156
- r"""
157
- A basic Transformer block.
158
-
159
- Parameters:
160
- dim (:obj:`int`): The number of channels in the input and output.
161
- n_heads (:obj:`int`): The number of heads to use for multi-head attention.
162
- d_head (:obj:`int`): The number of channels in each head.
163
- dropout (:obj:`float`, *optional*, defaults to 0.0): The dropout probability to use.
164
- context_dim (:obj:`int`, *optional*): The size of the context vector for cross attention.
165
- gated_ff (:obj:`bool`, *optional*, defaults to :obj:`True`): Whether to use a gated feed-forward network.
166
- checkpoint (:obj:`bool`, *optional*, defaults to :obj:`True`): Whether to use checkpointing.
167
- """
168
-
169
- def __init__(
170
- self,
171
- dim: int,
172
- n_heads: int,
173
- d_head: int,
174
- dropout=0.0,
175
- context_dim: Optional[int] = None,
176
- gated_ff: bool = True,
177
- checkpoint: bool = True,
178
- ):
179
- super().__init__()
180
- self.attn1 = CrossAttention(
181
- query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout
182
- ) # is a self-attention
183
- self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
184
- self.attn2 = CrossAttention(
185
- query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, dropout=dropout
186
- ) # is self-attn if context is none
187
- self.norm1 = nn.LayerNorm(dim)
188
- self.norm2 = nn.LayerNorm(dim)
189
- self.norm3 = nn.LayerNorm(dim)
190
- self.checkpoint = checkpoint
191
-
192
- def _set_attention_slice(self, slice_size):
193
- self.attn1._slice_size = slice_size
194
- self.attn2._slice_size = slice_size
195
-
196
- def forward(self, x, context=None):
197
- x = x.contiguous() if x.device.type == "mps" else x
198
- x = self.attn1(self.norm1(x)) + x
199
- x = self.attn2(self.norm2(x), context=context) + x
200
- x = self.ff(self.norm3(x)) + x
201
- return x
202
-
203
-
204
- heat_maps = defaultdict(list)
205
- all_heat_maps = []
206
-
207
-
208
- def clear_heat_maps():
209
- global heat_maps, all_heat_maps
210
- heat_maps = defaultdict(list)
211
- all_heat_maps = []
212
-
213
-
214
- def next_heat_map():
215
- global heat_maps, all_heat_maps
216
- all_heat_maps.append(heat_maps)
217
- heat_maps = defaultdict(list)
218
-
219
-
220
- def get_global_heat_map(last_n: int = None, idx: int = None, factors=None):
221
- global heat_maps, all_heat_maps
222
-
223
- if idx is not None:
224
- heat_maps2 = [all_heat_maps[idx]]
225
- else:
226
- heat_maps2 = all_heat_maps[-last_n:] if last_n is not None else all_heat_maps
227
-
228
- if factors is None:
229
- factors = {1, 2, 4, 8, 16, 32}
230
-
231
- all_merges = []
232
-
233
- for heat_map_map in heat_maps2:
234
- merge_list = []
235
-
236
- for k, v in heat_map_map.items():
237
- if k in factors:
238
- merge_list.append(torch.stack(v, 0).mean(0))
239
-
240
- all_merges.append(merge_list)
241
-
242
- maps = torch.stack([torch.stack(x, 0) for x in all_merges], dim=0)
243
- return maps.sum(0).cuda().sum(2).sum(0)
244
-
245
-
246
- class CrossAttention(nn.Module):
247
- r"""
248
- A cross attention layer.
249
-
250
- Parameters:
251
- query_dim (:obj:`int`): The number of channels in the query.
252
- context_dim (:obj:`int`, *optional*):
253
- The number of channels in the context. If not given, defaults to `query_dim`.
254
- heads (:obj:`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
255
- dim_head (:obj:`int`, *optional*, defaults to 64): The number of channels in each head.
256
- dropout (:obj:`float`, *optional*, defaults to 0.0): The dropout probability to use.
257
- """
258
-
259
- def __init__(
260
- self, query_dim: int, context_dim: Optional[int] = None, heads: int = 8, dim_head: int = 64, dropout: float = 0.0
261
- ):
262
- super().__init__()
263
- inner_dim = dim_head * heads
264
- context_dim = context_dim if context_dim is not None else query_dim
265
-
266
- self.scale = dim_head**-0.5
267
- self.heads = heads
268
- # for slice_size > 0 the attention score computation
269
- # is split across the batch axis to save memory
270
- # You can set slice_size with `set_attention_slice`
271
- self._slice_size = None
272
-
273
- self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
274
- self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
275
- self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
276
-
277
- self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))
278
-
279
- def reshape_heads_to_batch_dim(self, tensor):
280
- batch_size, seq_len, dim = tensor.shape
281
- head_size = self.heads
282
- tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
283
- tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size)
284
- return tensor
285
-
286
- def reshape_batch_dim_to_heads(self, tensor):
287
- batch_size, seq_len, dim = tensor.shape
288
- head_size = self.heads
289
- tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
290
- tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
291
- return tensor
292
-
293
- def forward(self, x, context=None, mask=None):
294
- batch_size, sequence_length, dim = x.shape
295
-
296
- use_context = context is not None
297
-
298
- q = self.to_q(x)
299
- context = context if context is not None else x
300
- k = self.to_k(context)
301
- v = self.to_v(context)
302
-
303
- q = self.reshape_heads_to_batch_dim(q)
304
- k = self.reshape_heads_to_batch_dim(k)
305
- v = self.reshape_heads_to_batch_dim(v)
306
-
307
- # TODO(PVP) - mask is currently never used. Remember to re-implement when used
308
-
309
- # attention, what we cannot get enough of
310
- hidden_states = self._attention(q, k, v, sequence_length, dim, use_context=use_context)
311
-
312
- return self.to_out(hidden_states)
313
-
314
- @torch.no_grad()
315
- def _up_sample_attn(self, x, factor, method: str = 'bicubic'):
316
- weight = torch.full((factor, factor), 1 / factor**2, device=x.device)
317
- weight = weight.view(1, 1, factor, factor)
318
-
319
- h = w = int(math.sqrt(x.size(1)))
320
- maps = []
321
- x = x.permute(2, 0, 1)
322
-
323
- with torch.cuda.amp.autocast(dtype=torch.float32):
324
- for map_ in x:
325
- map_ = map_.unsqueeze(1).view(map_.size(0), 1, h, w)
326
- if method == 'bicubic':
327
- map_ = F.interpolate(map_, size=(64, 64), mode="bicubic", align_corners=False)
328
- maps.append(map_.squeeze(1))
329
- else:
330
- maps.append(F.conv_transpose2d(map_, weight, stride=factor).squeeze(1).cpu())
331
-
332
- maps = torch.stack(maps, 0).sum(1, keepdim=True).cpu()
333
- return maps
334
-
335
- def _attention(self, query, key, value, sequence_length, dim, use_context: bool = True):
336
- batch_size_attention = query.shape[0]
337
- hidden_states = torch.zeros(
338
- (batch_size_attention, sequence_length, dim // self.heads), device=query.device, dtype=query.dtype
339
- )
340
- slice_size = self._slice_size if self._slice_size is not None else hidden_states.shape[0]
341
- for i in range(hidden_states.shape[0] // slice_size):
342
- start_idx = i * slice_size
343
- end_idx = (i + 1) * slice_size
344
- attn_slice = (
345
- torch.einsum("b i d, b j d -> b i j", query[start_idx:end_idx], key[start_idx:end_idx]) * self.scale
346
- )
347
- factor = int(math.sqrt(4096 // attn_slice.shape[1]))
348
- attn_slice = attn_slice.softmax(-1)
349
-
350
- if use_context and attn_slice.shape[-1] == 77:
351
- if factor >= 1:
352
- factor //= 1
353
- maps = self._up_sample_attn(attn_slice, factor)
354
- global heat_maps
355
- heat_maps[factor].append(maps)
356
- # print(attn_slice.size(), query.size(), key.size(), value.size())
357
-
358
- attn_slice = torch.einsum("b i j, b j d -> b i d", attn_slice, value[start_idx:end_idx])
359
-
360
- hidden_states[start_idx:end_idx] = attn_slice
361
-
362
- # reshape hidden_states
363
- hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
364
- return hidden_states
365
-
366
-
367
- class FeedForward(nn.Module):
368
- r"""
369
- A feed-forward layer.
370
-
371
- Parameters:
372
- dim (:obj:`int`): The number of channels in the input.
373
- dim_out (:obj:`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
374
- mult (:obj:`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
375
- glu (:obj:`bool`, *optional*, defaults to :obj:`False`): Whether to use GLU activation.
376
- dropout (:obj:`float`, *optional*, defaults to 0.0): The dropout probability to use.
377
- """
378
-
379
- def __init__(
380
- self, dim: int, dim_out: Optional[int] = None, mult: int = 4, glu: bool = False, dropout: float = 0.0
381
- ):
382
- super().__init__()
383
- inner_dim = int(dim * mult)
384
- dim_out = dim_out if dim_out is not None else dim
385
- project_in = GEGLU(dim, inner_dim)
386
-
387
- self.net = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out))
388
-
389
- def forward(self, x):
390
- return self.net(x)
391
-
392
-
393
- # feedforward
394
- class GEGLU(nn.Module):
395
- r"""
396
- A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
397
-
398
- Parameters:
399
- dim_in (:obj:`int`): The number of channels in the input.
400
- dim_out (:obj:`int`): The number of channels in the output.
401
- """
402
-
403
- def __init__(self, dim_in: int, dim_out: int):
404
- super().__init__()
405
- self.proj = nn.Linear(dim_in, dim_out * 2)
406
-
407
- def forward(self, x):
408
- x, gate = self.proj(x).chunk(2, dim=-1)
409
- return x * F.gelu(gate)
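This module carries the attribution-map bookkeeping: whenever a cross-attention slice attends over the 77 CLIP text tokens, `_up_sample_attn` resizes the attention scores to a 64 x 64 grid and appends them to `heat_maps`, keyed by the down-sampling factor; `next_heat_map` snapshots those maps per denoising step, and `get_global_heat_map` merges everything into one map per token position. A rough usage sketch, assuming the hooked pipeline has already generated an image and called `next_heat_map()` each step (the token index is a placeholder):

```python
# Hedged sketch of reading a word-level attribution map once generation is done.
# Assumes the surrounding pipeline called next_heat_map() after every denoising
# step so that all_heat_maps is populated; the token index is a placeholder.
# get_global_heat_map() moves the merged maps to CUDA, so a GPU is required.
import torch.nn.functional as F

from diffusers.models.attention import get_global_heat_map

token_index = 2                      # position of the word in the CLIP-tokenized prompt
global_maps = get_global_heat_map()  # (77, 64, 64): one map per token position
word_map = global_maps[token_index]  # (64, 64) attribution map for that word
word_map = F.interpolate(word_map[None, None], size=(512, 512), mode="bicubic", align_corners=False)[0, 0]
word_map = (word_map - word_map.min()) / (word_map.max() - word_map.min() + 1e-8)  # normalize for overlay
```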
diffusers/models/embeddings.py DELETED
@@ -1,115 +0,0 @@
1
- # Copyright 2022 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- import math
15
-
16
- import numpy as np
17
- import torch
18
- from torch import nn
19
-
20
-
21
- def get_timestep_embedding(
22
- timesteps: torch.Tensor,
23
- embedding_dim: int,
24
- flip_sin_to_cos: bool = False,
25
- downscale_freq_shift: float = 1,
26
- scale: float = 1,
27
- max_period: int = 10000,
28
- ):
29
- """
30
- This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
31
-
32
- :param timesteps: a 1-D Tensor of N indices, one per batch element.
33
- These may be fractional.
34
- :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
35
- embeddings. :return: an [N x dim] Tensor of positional embeddings.
36
- """
37
- assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
38
-
39
- half_dim = embedding_dim // 2
40
- exponent = -math.log(max_period) * torch.arange(start=0, end=half_dim, dtype=torch.float32)
41
- exponent = exponent / (half_dim - downscale_freq_shift)
42
-
43
- emb = torch.exp(exponent).to(device=timesteps.device)
44
- emb = timesteps[:, None].float() * emb[None, :]
45
-
46
- # scale embeddings
47
- emb = scale * emb
48
-
49
- # concat sine and cosine embeddings
50
- emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
51
-
52
- # flip sine and cosine embeddings
53
- if flip_sin_to_cos:
54
- emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
55
-
56
- # zero pad
57
- if embedding_dim % 2 == 1:
58
- emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
59
- return emb
60
-
61
-
62
- class TimestepEmbedding(nn.Module):
63
- def __init__(self, channel: int, time_embed_dim: int, act_fn: str = "silu"):
64
- super().__init__()
65
-
66
- self.linear_1 = nn.Linear(channel, time_embed_dim)
67
- self.act = None
68
- if act_fn == "silu":
69
- self.act = nn.SiLU()
70
- self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim)
71
-
72
- def forward(self, sample):
73
- sample = self.linear_1(sample)
74
-
75
- if self.act is not None:
76
- sample = self.act(sample)
77
-
78
- sample = self.linear_2(sample)
79
- return sample
80
-
81
-
82
- class Timesteps(nn.Module):
83
- def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float):
84
- super().__init__()
85
- self.num_channels = num_channels
86
- self.flip_sin_to_cos = flip_sin_to_cos
87
- self.downscale_freq_shift = downscale_freq_shift
88
-
89
- def forward(self, timesteps):
90
- t_emb = get_timestep_embedding(
91
- timesteps,
92
- self.num_channels,
93
- flip_sin_to_cos=self.flip_sin_to_cos,
94
- downscale_freq_shift=self.downscale_freq_shift,
95
- )
96
- return t_emb
97
-
98
-
99
- class GaussianFourierProjection(nn.Module):
100
- """Gaussian Fourier embeddings for noise levels."""
101
-
102
- def __init__(self, embedding_size: int = 256, scale: float = 1.0):
103
- super().__init__()
104
- self.weight = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
105
-
106
- # to delete later
107
- self.W = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
108
-
109
- self.weight = self.W
110
-
111
- def forward(self, x):
112
- x = torch.log(x)
113
- x_proj = x[:, None] * self.weight[None, :] * 2 * np.pi
114
- out = torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
115
- return out
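A minimal sketch of the sinusoidal embedding that `get_timestep_embedding` above produces with its default arguments (scale 1, `downscale_freq_shift=1`, no sin/cos flip); the timestep values and dimensions below are illustrative.

```python
import math
import torch

timesteps = torch.tensor([0, 10, 500])    # (N,) diffusion timesteps, illustrative
embedding_dim, max_period = 8, 10000
half_dim = embedding_dim // 2

# Geometric frequency ladder, matching the exponent computed above.
exponent = -math.log(max_period) * torch.arange(half_dim, dtype=torch.float32)
freqs = torch.exp(exponent / (half_dim - 1))

angles = timesteps[:, None].float() * freqs[None, :]
emb = torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)  # (N, embedding_dim)
```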
diffusers/models/resnet.py DELETED
@@ -1,483 +0,0 @@
1
- from functools import partial
2
-
3
- import numpy as np
4
- import torch
5
- import torch.nn as nn
6
- import torch.nn.functional as F
7
-
8
-
9
- class Upsample2D(nn.Module):
10
- """
11
- An upsampling layer with an optional convolution.
12
-
13
- :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
14
- applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
15
- upsampling occurs in the inner-two dimensions.
16
- """
17
-
18
- def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
19
- super().__init__()
20
- self.channels = channels
21
- self.out_channels = out_channels or channels
22
- self.use_conv = use_conv
23
- self.use_conv_transpose = use_conv_transpose
24
- self.name = name
25
-
26
- conv = None
27
- if use_conv_transpose:
28
- conv = nn.ConvTranspose2d(channels, self.out_channels, 4, 2, 1)
29
- elif use_conv:
30
- conv = nn.Conv2d(self.channels, self.out_channels, 3, padding=1)
31
-
32
- # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
33
- if name == "conv":
34
- self.conv = conv
35
- else:
36
- self.Conv2d_0 = conv
37
-
38
- def forward(self, x):
39
- assert x.shape[1] == self.channels
40
- if self.use_conv_transpose:
41
- return self.conv(x)
42
-
43
- x = F.interpolate(x, scale_factor=2.0, mode="nearest")
44
-
45
- # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
46
- if self.use_conv:
47
- if self.name == "conv":
48
- x = self.conv(x)
49
- else:
50
- x = self.Conv2d_0(x)
51
-
52
- return x
53
-
54
-
55
- class Downsample2D(nn.Module):
56
- """
57
- A downsampling layer with an optional convolution.
58
-
59
- :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
60
- applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
61
- downsampling occurs in the inner-two dimensions.
62
- """
63
-
64
- def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
65
- super().__init__()
66
- self.channels = channels
67
- self.out_channels = out_channels or channels
68
- self.use_conv = use_conv
69
- self.padding = padding
70
- stride = 2
71
- self.name = name
72
-
73
- if use_conv:
74
- conv = nn.Conv2d(self.channels, self.out_channels, 3, stride=stride, padding=padding)
75
- else:
76
- assert self.channels == self.out_channels
77
- conv = nn.AvgPool2d(kernel_size=stride, stride=stride)
78
-
79
- # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
80
- if name == "conv":
81
- self.Conv2d_0 = conv
82
- self.conv = conv
83
- elif name == "Conv2d_0":
84
- self.conv = conv
85
- else:
86
- self.conv = conv
87
-
88
- def forward(self, x):
89
- assert x.shape[1] == self.channels
90
- if self.use_conv and self.padding == 0:
91
- pad = (0, 1, 0, 1)
92
- x = F.pad(x, pad, mode="constant", value=0)
93
-
94
- assert x.shape[1] == self.channels
95
- x = self.conv(x)
96
-
97
- return x
98
-
99
-
100
- class FirUpsample2D(nn.Module):
101
- def __init__(self, channels=None, out_channels=None, use_conv=False, fir_kernel=(1, 3, 3, 1)):
102
- super().__init__()
103
- out_channels = out_channels if out_channels else channels
104
- if use_conv:
105
- self.Conv2d_0 = nn.Conv2d(channels, out_channels, kernel_size=3, stride=1, padding=1)
106
- self.use_conv = use_conv
107
- self.fir_kernel = fir_kernel
108
- self.out_channels = out_channels
109
-
110
- def _upsample_2d(self, x, weight=None, kernel=None, factor=2, gain=1):
111
- """Fused `upsample_2d()` followed by `Conv2d()`.
112
-
113
- Args:
114
- Padding is performed only once at the beginning, not between the operations. The fused op is considerably more
115
- efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of arbitrary:
116
- order.
117
- x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W,
118
- C]`.
119
- weight: Weight tensor of the shape `[filterH, filterW, inChannels,
120
- outChannels]`. Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`.
121
- kernel: FIR filter of the shape `[firH, firW]` or `[firN]`
122
- (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling.
123
- factor: Integer upsampling factor (default: 2). gain: Scaling factor for signal magnitude (default: 1.0).
124
-
125
- Returns:
126
- Tensor of the shape `[N, C, H * factor, W * factor]` or `[N, H * factor, W * factor, C]`, and same datatype as
127
- `x`.
128
- """
129
-
130
- assert isinstance(factor, int) and factor >= 1
131
-
132
- # Setup filter kernel.
133
- if kernel is None:
134
- kernel = [1] * factor
135
-
136
- # setup kernel
137
- kernel = np.asarray(kernel, dtype=np.float32)
138
- if kernel.ndim == 1:
139
- kernel = np.outer(kernel, kernel)
140
- kernel /= np.sum(kernel)
141
-
142
- kernel = kernel * (gain * (factor**2))
143
-
144
- if self.use_conv:
145
- convH = weight.shape[2]
146
- convW = weight.shape[3]
147
- inC = weight.shape[1]
148
-
149
- p = (kernel.shape[0] - factor) - (convW - 1)
150
-
151
- stride = (factor, factor)
152
- # Determine data dimensions.
153
- stride = [1, 1, factor, factor]
154
- output_shape = ((x.shape[2] - 1) * factor + convH, (x.shape[3] - 1) * factor + convW)
155
- output_padding = (
156
- output_shape[0] - (x.shape[2] - 1) * stride[0] - convH,
157
- output_shape[1] - (x.shape[3] - 1) * stride[1] - convW,
158
- )
159
- assert output_padding[0] >= 0 and output_padding[1] >= 0
160
- inC = weight.shape[1]
161
- num_groups = x.shape[1] // inC
162
-
163
- # Transpose weights.
164
- weight = torch.reshape(weight, (num_groups, -1, inC, convH, convW))
165
- weight = weight[..., ::-1, ::-1].permute(0, 2, 1, 3, 4)
166
- weight = torch.reshape(weight, (num_groups * inC, -1, convH, convW))
167
-
168
- x = F.conv_transpose2d(x, weight, stride=stride, output_padding=output_padding, padding=0)
169
-
170
- x = upfirdn2d_native(x, torch.tensor(kernel, device=x.device), pad=((p + 1) // 2 + factor - 1, p // 2 + 1))
171
- else:
172
- p = kernel.shape[0] - factor
173
- x = upfirdn2d_native(
174
- x, torch.tensor(kernel, device=x.device), up=factor, pad=((p + 1) // 2 + factor - 1, p // 2)
175
- )
176
-
177
- return x
178
-
179
- def forward(self, x):
180
- if self.use_conv:
181
- height = self._upsample_2d(x, self.Conv2d_0.weight, kernel=self.fir_kernel)
182
- height = height + self.Conv2d_0.bias.reshape(1, -1, 1, 1)
183
- else:
184
- height = self._upsample_2d(x, kernel=self.fir_kernel, factor=2)
185
-
186
- return height
187
-
188
-
189
- class FirDownsample2D(nn.Module):
190
- def __init__(self, channels=None, out_channels=None, use_conv=False, fir_kernel=(1, 3, 3, 1)):
191
- super().__init__()
192
- out_channels = out_channels if out_channels else channels
193
- if use_conv:
194
- self.Conv2d_0 = nn.Conv2d(channels, out_channels, kernel_size=3, stride=1, padding=1)
195
- self.fir_kernel = fir_kernel
196
- self.use_conv = use_conv
197
- self.out_channels = out_channels
198
-
199
- def _downsample_2d(self, x, weight=None, kernel=None, factor=2, gain=1):
200
- """Fused `Conv2d()` followed by `downsample_2d()`.
201
-
202
- Args:
203
- Padding is performed only once at the beginning, not between the operations. The fused op is considerably more
204
- efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of arbitrary:
205
- order.
206
- x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. w: Weight tensor of the shape `[filterH,
207
- filterW, inChannels, outChannels]`. Grouped convolution can be performed by `inChannels = x.shape[0] //
208
- numGroups`. k: FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] *
209
- factor`, which corresponds to average pooling. factor: Integer downsampling factor (default: 2). gain:
210
- Scaling factor for signal magnitude (default: 1.0).
211
-
212
- Returns:
213
- Tensor of the shape `[N, C, H // factor, W // factor]` or `[N, H // factor, W // factor, C]`, and same
214
- datatype as `x`.
215
- """
216
-
217
- assert isinstance(factor, int) and factor >= 1
218
- if kernel is None:
219
- kernel = [1] * factor
220
-
221
- # setup kernel
222
- kernel = np.asarray(kernel, dtype=np.float32)
223
- if kernel.ndim == 1:
224
- kernel = np.outer(kernel, kernel)
225
- kernel /= np.sum(kernel)
226
-
227
- kernel = kernel * gain
228
-
229
- if self.use_conv:
230
- _, _, convH, convW = weight.shape
231
- p = (kernel.shape[0] - factor) + (convW - 1)
232
- s = [factor, factor]
233
- x = upfirdn2d_native(x, torch.tensor(kernel, device=x.device), pad=((p + 1) // 2, p // 2))
234
- x = F.conv2d(x, weight, stride=s, padding=0)
235
- else:
236
- p = kernel.shape[0] - factor
237
- x = upfirdn2d_native(x, torch.tensor(kernel, device=x.device), down=factor, pad=((p + 1) // 2, p // 2))
238
-
239
- return x
240
-
241
- def forward(self, x):
242
- if self.use_conv:
243
- x = self._downsample_2d(x, weight=self.Conv2d_0.weight, kernel=self.fir_kernel)
244
- x = x + self.Conv2d_0.bias.reshape(1, -1, 1, 1)
245
- else:
246
- x = self._downsample_2d(x, kernel=self.fir_kernel, factor=2)
247
-
248
- return x
249
-
250
-
251
- class ResnetBlock2D(nn.Module):
252
- def __init__(
253
- self,
254
- *,
255
- in_channels,
256
- out_channels=None,
257
- conv_shortcut=False,
258
- dropout=0.0,
259
- temb_channels=512,
260
- groups=32,
261
- groups_out=None,
262
- pre_norm=True,
263
- eps=1e-6,
264
- non_linearity="swish",
265
- time_embedding_norm="default",
266
- kernel=None,
267
- output_scale_factor=1.0,
268
- use_nin_shortcut=None,
269
- up=False,
270
- down=False,
271
- ):
272
- super().__init__()
273
- self.pre_norm = pre_norm
274
- self.pre_norm = True
275
- self.in_channels = in_channels
276
- out_channels = in_channels if out_channels is None else out_channels
277
- self.out_channels = out_channels
278
- self.use_conv_shortcut = conv_shortcut
279
- self.time_embedding_norm = time_embedding_norm
280
- self.up = up
281
- self.down = down
282
- self.output_scale_factor = output_scale_factor
283
-
284
- if groups_out is None:
285
- groups_out = groups
286
-
287
- self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
288
-
289
- self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
290
-
291
- if temb_channels is not None:
292
- self.time_emb_proj = torch.nn.Linear(temb_channels, out_channels)
293
- else:
294
- self.time_emb_proj = None
295
-
296
- self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
297
- self.dropout = torch.nn.Dropout(dropout)
298
- self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
299
-
300
- if non_linearity == "swish":
301
- self.nonlinearity = lambda x: F.silu(x)
302
- elif non_linearity == "mish":
303
- self.nonlinearity = Mish()
304
- elif non_linearity == "silu":
305
- self.nonlinearity = nn.SiLU()
306
-
307
- self.upsample = self.downsample = None
308
- if self.up:
309
- if kernel == "fir":
310
- fir_kernel = (1, 3, 3, 1)
311
- self.upsample = lambda x: upsample_2d(x, kernel=fir_kernel)
312
- elif kernel == "sde_vp":
313
- self.upsample = partial(F.interpolate, scale_factor=2.0, mode="nearest")
314
- else:
315
- self.upsample = Upsample2D(in_channels, use_conv=False)
316
- elif self.down:
317
- if kernel == "fir":
318
- fir_kernel = (1, 3, 3, 1)
319
- self.downsample = lambda x: downsample_2d(x, kernel=fir_kernel)
320
- elif kernel == "sde_vp":
321
- self.downsample = partial(F.avg_pool2d, kernel_size=2, stride=2)
322
- else:
323
- self.downsample = Downsample2D(in_channels, use_conv=False, padding=1, name="op")
324
-
325
- self.use_nin_shortcut = self.in_channels != self.out_channels if use_nin_shortcut is None else use_nin_shortcut
326
-
327
- self.conv_shortcut = None
328
- if self.use_nin_shortcut:
329
- self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
330
-
331
- def forward(self, x, temb):
332
- hidden_states = x
333
-
334
- # make sure hidden states is in float32
335
- # when running in half-precision
336
- hidden_states = self.norm1(hidden_states.float()).type(hidden_states.dtype)
337
- hidden_states = self.nonlinearity(hidden_states)
338
-
339
- if self.upsample is not None:
340
- x = self.upsample(x)
341
- hidden_states = self.upsample(hidden_states)
342
- elif self.downsample is not None:
343
- x = self.downsample(x)
344
- hidden_states = self.downsample(hidden_states)
345
-
346
- hidden_states = self.conv1(hidden_states)
347
-
348
- if temb is not None:
349
- temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None]
350
- hidden_states = hidden_states + temb
351
-
352
- # make sure hidden states is in float32
353
- # when running in half-precision
354
- hidden_states = self.norm2(hidden_states.float()).type(hidden_states.dtype)
355
- hidden_states = self.nonlinearity(hidden_states)
356
-
357
- hidden_states = self.dropout(hidden_states)
358
- hidden_states = self.conv2(hidden_states)
359
-
360
- if self.conv_shortcut is not None:
361
- x = self.conv_shortcut(x)
362
-
363
- out = (x + hidden_states) / self.output_scale_factor
364
-
365
- return out
366
-
367
-
368
- class Mish(torch.nn.Module):
369
- def forward(self, x):
370
- return x * torch.tanh(torch.nn.functional.softplus(x))
371
-
372
-
373
- def upsample_2d(x, kernel=None, factor=2, gain=1):
374
- r"""Upsample2D a batch of 2D images with the given filter.
375
-
376
- Args:
377
- Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and upsamples each image with the given
378
- filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the specified
379
- `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its shape is a:
380
- multiple of the upsampling factor.
381
- x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W,
382
- C]`.
383
- k: FIR filter of the shape `[firH, firW]` or `[firN]`
384
- (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling.
385
- factor: Integer upsampling factor (default: 2). gain: Scaling factor for signal magnitude (default: 1.0).
386
-
387
- Returns:
388
- Tensor of the shape `[N, C, H * factor, W * factor]`
389
- """
390
- assert isinstance(factor, int) and factor >= 1
391
- if kernel is None:
392
- kernel = [1] * factor
393
-
394
- kernel = np.asarray(kernel, dtype=np.float32)
395
- if kernel.ndim == 1:
396
- kernel = np.outer(kernel, kernel)
397
- kernel /= np.sum(kernel)
398
-
399
- kernel = kernel * (gain * (factor**2))
400
- p = kernel.shape[0] - factor
401
- return upfirdn2d_native(
402
- x, torch.tensor(kernel, device=x.device), up=factor, pad=((p + 1) // 2 + factor - 1, p // 2)
403
- )
404
-
405
-
406
- def downsample_2d(x, kernel=None, factor=2, gain=1):
407
- r"""Downsample2D a batch of 2D images with the given filter.
408
-
409
- Args:
410
- Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and downsamples each image with the
411
- given filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the
412
- specified `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its
413
- shape is a multiple of the downsampling factor.
414
- x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W,
415
- C]`.
416
- kernel: FIR filter of the shape `[firH, firW]` or `[firN]`
417
- (separable). The default is `[1] * factor`, which corresponds to average pooling.
418
- factor: Integer downsampling factor (default: 2). gain: Scaling factor for signal magnitude (default: 1.0).
419
-
420
- Returns:
421
- Tensor of the shape `[N, C, H // factor, W // factor]`
422
- """
423
-
424
- assert isinstance(factor, int) and factor >= 1
425
- if kernel is None:
426
- kernel = [1] * factor
427
-
428
- kernel = np.asarray(kernel, dtype=np.float32)
429
- if kernel.ndim == 1:
430
- kernel = np.outer(kernel, kernel)
431
- kernel /= np.sum(kernel)
432
-
433
- kernel = kernel * gain
434
- p = kernel.shape[0] - factor
435
- return upfirdn2d_native(x, torch.tensor(kernel, device=x.device), down=factor, pad=((p + 1) // 2, p // 2))
436
-
437
-
438
- def upfirdn2d_native(input, kernel, up=1, down=1, pad=(0, 0)):
439
- up_x = up_y = up
440
- down_x = down_y = down
441
- pad_x0 = pad_y0 = pad[0]
442
- pad_x1 = pad_y1 = pad[1]
443
-
444
- _, channel, in_h, in_w = input.shape
445
- input = input.reshape(-1, in_h, in_w, 1)
446
-
447
- _, in_h, in_w, minor = input.shape
448
- kernel_h, kernel_w = kernel.shape
449
-
450
- out = input.view(-1, in_h, 1, in_w, 1, minor)
451
-
452
- # Temporary workaround for mps specific issue: https://github.com/pytorch/pytorch/issues/84535
453
- if input.device.type == "mps":
454
- out = out.to("cpu")
455
- out = F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1])
456
- out = out.view(-1, in_h * up_y, in_w * up_x, minor)
457
-
458
- out = F.pad(out, [0, 0, max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)])
459
- out = out.to(input.device) # Move back to mps if necessary
460
- out = out[
461
- :,
462
- max(-pad_y0, 0) : out.shape[1] - max(-pad_y1, 0),
463
- max(-pad_x0, 0) : out.shape[2] - max(-pad_x1, 0),
464
- :,
465
- ]
466
-
467
- out = out.permute(0, 3, 1, 2)
468
- out = out.reshape([-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1])
469
- w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w)
470
- out = F.conv2d(out, w)
471
- out = out.reshape(
472
- -1,
473
- minor,
474
- in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1,
475
- in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1,
476
- )
477
- out = out.permute(0, 2, 3, 1)
478
- out = out[:, ::down_y, ::down_x, :]
479
-
480
- out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1
481
- out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1
482
-
483
- return out.view(-1, channel, out_h, out_w)
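As a usage illustration only (assuming the `ResnetBlock2D` class above is importable), the block mixes a feature map with a projected timestep embedding and applies a 1x1 shortcut convolution when the channel count changes:

```python
import torch

# Illustrative sizes; groups=32 (the default) must divide both channel counts.
block = ResnetBlock2D(in_channels=64, out_channels=128, temb_channels=512)

x = torch.randn(2, 64, 32, 32)    # (batch, channels, height, width) feature map
temb = torch.randn(2, 512)        # timestep embedding, e.g. from TimestepEmbedding
out = block(x, temb)              # -> (2, 128, 32, 32); conv_shortcut maps 64 -> 128
```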
diffusers/models/unet_2d.py DELETED
@@ -1,246 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import Optional, Tuple, Union
3
-
4
- import torch
5
- import torch.nn as nn
6
-
7
- from ..configuration_utils import ConfigMixin, register_to_config
8
- from ..modeling_utils import ModelMixin
9
- from ..utils import BaseOutput
10
- from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps
11
- from .unet_blocks import UNetMidBlock2D, get_down_block, get_up_block
12
-
13
-
14
- @dataclass
15
- class UNet2DOutput(BaseOutput):
16
- """
17
- Args:
18
- sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
19
- Hidden states output. Output of last layer of model.
20
- """
21
-
22
- sample: torch.FloatTensor
23
-
24
-
25
- class UNet2DModel(ModelMixin, ConfigMixin):
26
- r"""
27
- UNet2DModel is a 2D UNet model that takes in a noisy sample and a timestep and returns sample shaped output.
28
-
29
- This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
30
- implements for all the model (such as downloading or saving, etc.)
31
-
32
- Parameters:
33
- sample_size (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
34
- Input sample size.
35
- in_channels (`int`, *optional*, defaults to 3): Number of channels in the input image.
36
- out_channels (`int`, *optional*, defaults to 3): Number of channels in the output.
37
- center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
38
- time_embedding_type (`str`, *optional*, defaults to `"positional"`): Type of time embedding to use.
39
- freq_shift (`int`, *optional*, defaults to 0): Frequency shift for fourier time embedding.
40
- flip_sin_to_cos (`bool`, *optional*, defaults to :
41
- obj:`False`): Whether to flip sin to cos for fourier time embedding.
42
- down_block_types (`Tuple[str]`, *optional*, defaults to :
43
- obj:`("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D")`): Tuple of downsample block
44
- types.
45
- up_block_types (`Tuple[str]`, *optional*, defaults to :
46
- obj:`("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D")`): Tuple of upsample block types.
47
- block_out_channels (`Tuple[int]`, *optional*, defaults to :
48
- obj:`(224, 448, 672, 896)`): Tuple of block output channels.
49
- layers_per_block (`int`, *optional*, defaults to `2`): The number of layers per block.
50
- mid_block_scale_factor (`float`, *optional*, defaults to `1`): The scale factor for the mid block.
51
- downsample_padding (`int`, *optional*, defaults to `1`): The padding for the downsample convolution.
52
- act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
53
- attention_head_dim (`int`, *optional*, defaults to `8`): The attention head dimension.
54
- norm_num_groups (`int`, *optional*, defaults to `32`): The number of groups for the normalization.
55
- norm_eps (`float`, *optional*, defaults to `1e-5`): The epsilon for the normalization.
56
- """
57
-
58
- @register_to_config
59
- def __init__(
60
- self,
61
- sample_size: Optional[int] = None,
62
- in_channels: int = 3,
63
- out_channels: int = 3,
64
- center_input_sample: bool = False,
65
- time_embedding_type: str = "positional",
66
- freq_shift: int = 0,
67
- flip_sin_to_cos: bool = True,
68
- down_block_types: Tuple[str] = ("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"),
69
- up_block_types: Tuple[str] = ("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"),
70
- block_out_channels: Tuple[int] = (224, 448, 672, 896),
71
- layers_per_block: int = 2,
72
- mid_block_scale_factor: float = 1,
73
- downsample_padding: int = 1,
74
- act_fn: str = "silu",
75
- attention_head_dim: int = 8,
76
- norm_num_groups: int = 32,
77
- norm_eps: float = 1e-5,
78
- ):
79
- super().__init__()
80
-
81
- self.sample_size = sample_size
82
- time_embed_dim = block_out_channels[0] * 4
83
-
84
- # input
85
- self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
86
-
87
- # time
88
- if time_embedding_type == "fourier":
89
- self.time_proj = GaussianFourierProjection(embedding_size=block_out_channels[0], scale=16)
90
- timestep_input_dim = 2 * block_out_channels[0]
91
- elif time_embedding_type == "positional":
92
- self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
93
- timestep_input_dim = block_out_channels[0]
94
-
95
- self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
96
-
97
- self.down_blocks = nn.ModuleList([])
98
- self.mid_block = None
99
- self.up_blocks = nn.ModuleList([])
100
-
101
- # down
102
- output_channel = block_out_channels[0]
103
- for i, down_block_type in enumerate(down_block_types):
104
- input_channel = output_channel
105
- output_channel = block_out_channels[i]
106
- is_final_block = i == len(block_out_channels) - 1
107
-
108
- down_block = get_down_block(
109
- down_block_type,
110
- num_layers=layers_per_block,
111
- in_channels=input_channel,
112
- out_channels=output_channel,
113
- temb_channels=time_embed_dim,
114
- add_downsample=not is_final_block,
115
- resnet_eps=norm_eps,
116
- resnet_act_fn=act_fn,
117
- attn_num_head_channels=attention_head_dim,
118
- downsample_padding=downsample_padding,
119
- )
120
- self.down_blocks.append(down_block)
121
-
122
- # mid
123
- self.mid_block = UNetMidBlock2D(
124
- in_channels=block_out_channels[-1],
125
- temb_channels=time_embed_dim,
126
- resnet_eps=norm_eps,
127
- resnet_act_fn=act_fn,
128
- output_scale_factor=mid_block_scale_factor,
129
- resnet_time_scale_shift="default",
130
- attn_num_head_channels=attention_head_dim,
131
- resnet_groups=norm_num_groups,
132
- )
133
-
134
- # up
135
- reversed_block_out_channels = list(reversed(block_out_channels))
136
- output_channel = reversed_block_out_channels[0]
137
- for i, up_block_type in enumerate(up_block_types):
138
- prev_output_channel = output_channel
139
- output_channel = reversed_block_out_channels[i]
140
- input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
141
-
142
- is_final_block = i == len(block_out_channels) - 1
143
-
144
- up_block = get_up_block(
145
- up_block_type,
146
- num_layers=layers_per_block + 1,
147
- in_channels=input_channel,
148
- out_channels=output_channel,
149
- prev_output_channel=prev_output_channel,
150
- temb_channels=time_embed_dim,
151
- add_upsample=not is_final_block,
152
- resnet_eps=norm_eps,
153
- resnet_act_fn=act_fn,
154
- attn_num_head_channels=attention_head_dim,
155
- )
156
- self.up_blocks.append(up_block)
157
- prev_output_channel = output_channel
158
-
159
- # out
160
- num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32)
161
- self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=num_groups_out, eps=norm_eps)
162
- self.conv_act = nn.SiLU()
163
- self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)
164
-
165
- def forward(
166
- self,
167
- sample: torch.FloatTensor,
168
- timestep: Union[torch.Tensor, float, int],
169
- return_dict: bool = True,
170
- ) -> Union[UNet2DOutput, Tuple]:
171
- """r
172
- Args:
173
- sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
174
- timestep (`torch.FloatTensor` or `float` or `int): (batch) timesteps
175
- return_dict (`bool`, *optional*, defaults to `True`):
176
- Whether or not to return a [`~models.unet_2d.UNet2DOutput`] instead of a plain tuple.
177
-
178
- Returns:
179
- [`~models.unet_2d.UNet2DOutput`] or `tuple`: [`~models.unet_2d.UNet2DOutput`] if `return_dict` is True,
180
- otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
181
- """
182
- # 0. center input if necessary
183
- if self.config.center_input_sample:
184
- sample = 2 * sample - 1.0
185
-
186
- # 1. time
187
- timesteps = timestep
188
- if not torch.is_tensor(timesteps):
189
- timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
190
- elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
191
- timesteps = timesteps[None].to(sample.device)
192
-
193
- # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
194
- timesteps = timesteps * torch.ones(sample.shape[0], dtype=timesteps.dtype, device=timesteps.device)
195
-
196
- t_emb = self.time_proj(timesteps)
197
- emb = self.time_embedding(t_emb)
198
-
199
- # 2. pre-process
200
- skip_sample = sample
201
- sample = self.conv_in(sample)
202
-
203
- # 3. down
204
- down_block_res_samples = (sample,)
205
- for downsample_block in self.down_blocks:
206
- if hasattr(downsample_block, "skip_conv"):
207
- sample, res_samples, skip_sample = downsample_block(
208
- hidden_states=sample, temb=emb, skip_sample=skip_sample
209
- )
210
- else:
211
- sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
212
-
213
- down_block_res_samples += res_samples
214
-
215
- # 4. mid
216
- sample = self.mid_block(sample, emb)
217
-
218
- # 5. up
219
- skip_sample = None
220
- for upsample_block in self.up_blocks:
221
- res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
222
- down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
223
-
224
- if hasattr(upsample_block, "skip_conv"):
225
- sample, skip_sample = upsample_block(sample, res_samples, emb, skip_sample)
226
- else:
227
- sample = upsample_block(sample, res_samples, emb)
228
-
229
- # 6. post-process
230
- # make sure hidden states is in float32
231
- # when running in half-precision
232
- sample = self.conv_norm_out(sample.float()).type(sample.dtype)
233
- sample = self.conv_act(sample)
234
- sample = self.conv_out(sample)
235
-
236
- if skip_sample is not None:
237
- sample += skip_sample
238
-
239
- if self.config.time_embedding_type == "fourier":
240
- timesteps = timesteps.reshape((sample.shape[0], *([1] * len(sample.shape[1:]))))
241
- sample = sample / timesteps
242
-
243
- if not return_dict:
244
- return (sample,)
245
-
246
- return UNet2DOutput(sample=sample)
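A minimal, hedged usage sketch of the unconditional `UNet2DModel` above; the `block_out_channels` below are deliberately smaller than the defaults and purely illustrative.

```python
import torch

model = UNet2DModel(
    sample_size=64,
    in_channels=3,
    out_channels=3,
    block_out_channels=(64, 128, 128, 256),  # illustrative, not the defaults
)

noisy = torch.randn(1, 3, 64, 64)   # noisy image batch
pred = model(noisy, timestep=10)    # int timesteps are converted to a tensor internally
print(pred.sample.shape)            # UNet2DOutput.sample: torch.Size([1, 3, 64, 64])
```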
diffusers/models/unet_2d_condition.py DELETED
@@ -1,272 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import Optional, Tuple, Union
3
-
4
- import torch
5
- import torch.nn as nn
6
-
7
- from ..configuration_utils import ConfigMixin, register_to_config
8
- from ..modeling_utils import ModelMixin
9
- from ..utils import BaseOutput
10
- from .embeddings import TimestepEmbedding, Timesteps
11
- from .unet_blocks import UNetMidBlock2DCrossAttn, get_down_block, get_up_block
12
-
13
-
14
- @dataclass
15
- class UNet2DConditionOutput(BaseOutput):
16
- """
17
- Args:
18
- sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
19
- Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model.
20
- """
21
-
22
- sample: torch.FloatTensor
23
-
24
-
25
- class UNet2DConditionModel(ModelMixin, ConfigMixin):
26
- r"""
27
- UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
28
- and returns sample shaped output.
29
-
30
- This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
31
- implements for all the model (such as downloading or saving, etc.)
32
-
33
- Parameters:
34
- sample_size (`int`, *optional*): The size of the input sample.
35
- in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
36
- out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
37
- center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
38
- flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
39
- Whether to flip the sin to cos in the time embedding.
40
- freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
41
- down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
42
- The tuple of downsample blocks to use.
43
- up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
44
- The tuple of upsample blocks to use.
45
- block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
46
- The tuple of output channels for each block.
47
- layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
48
- downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
49
- mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
50
- act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
51
- norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
52
- norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
53
- cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
54
- attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
55
- """
56
-
57
- @register_to_config
58
- def __init__(
59
- self,
60
- sample_size: Optional[int] = None,
61
- in_channels: int = 4,
62
- out_channels: int = 4,
63
- center_input_sample: bool = False,
64
- flip_sin_to_cos: bool = True,
65
- freq_shift: int = 0,
66
- down_block_types: Tuple[str] = (
67
- "CrossAttnDownBlock2D",
68
- "CrossAttnDownBlock2D",
69
- "CrossAttnDownBlock2D",
70
- "DownBlock2D",
71
- ),
72
- up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
73
- block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
74
- layers_per_block: int = 2,
75
- downsample_padding: int = 1,
76
- mid_block_scale_factor: float = 1,
77
- act_fn: str = "silu",
78
- norm_num_groups: int = 32,
79
- norm_eps: float = 1e-5,
80
- cross_attention_dim: int = 1280,
81
- attention_head_dim: int = 8,
82
- ):
83
- super().__init__()
84
-
85
- self.sample_size = sample_size
86
- time_embed_dim = block_out_channels[0] * 4
87
-
88
- # input
89
- self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
90
-
91
- # time
92
- self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
93
- timestep_input_dim = block_out_channels[0]
94
-
95
- self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
96
-
97
- self.down_blocks = nn.ModuleList([])
98
- self.mid_block = None
99
- self.up_blocks = nn.ModuleList([])
100
-
101
- # down
102
- output_channel = block_out_channels[0]
103
- for i, down_block_type in enumerate(down_block_types):
104
- input_channel = output_channel
105
- output_channel = block_out_channels[i]
106
- is_final_block = i == len(block_out_channels) - 1
107
-
108
- down_block = get_down_block(
109
- down_block_type,
110
- num_layers=layers_per_block,
111
- in_channels=input_channel,
112
- out_channels=output_channel,
113
- temb_channels=time_embed_dim,
114
- add_downsample=not is_final_block,
115
- resnet_eps=norm_eps,
116
- resnet_act_fn=act_fn,
117
- cross_attention_dim=cross_attention_dim,
118
- attn_num_head_channels=attention_head_dim,
119
- downsample_padding=downsample_padding,
120
- )
121
- self.down_blocks.append(down_block)
122
-
123
- # mid
124
- self.mid_block = UNetMidBlock2DCrossAttn(
125
- in_channels=block_out_channels[-1],
126
- temb_channels=time_embed_dim,
127
- resnet_eps=norm_eps,
128
- resnet_act_fn=act_fn,
129
- output_scale_factor=mid_block_scale_factor,
130
- resnet_time_scale_shift="default",
131
- cross_attention_dim=cross_attention_dim,
132
- attn_num_head_channels=attention_head_dim,
133
- resnet_groups=norm_num_groups,
134
- )
135
-
136
- # up
137
- reversed_block_out_channels = list(reversed(block_out_channels))
138
- output_channel = reversed_block_out_channels[0]
139
- for i, up_block_type in enumerate(up_block_types):
140
- prev_output_channel = output_channel
141
- output_channel = reversed_block_out_channels[i]
142
- input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
143
-
144
- is_final_block = i == len(block_out_channels) - 1
145
-
146
- up_block = get_up_block(
147
- up_block_type,
148
- num_layers=layers_per_block + 1,
149
- in_channels=input_channel,
150
- out_channels=output_channel,
151
- prev_output_channel=prev_output_channel,
152
- temb_channels=time_embed_dim,
153
- add_upsample=not is_final_block,
154
- resnet_eps=norm_eps,
155
- resnet_act_fn=act_fn,
156
- cross_attention_dim=cross_attention_dim,
157
- attn_num_head_channels=attention_head_dim,
158
- )
159
- self.up_blocks.append(up_block)
160
- prev_output_channel = output_channel
161
-
162
- # out
163
- self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps)
164
- self.conv_act = nn.SiLU()
165
- self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)
166
-
167
- def set_attention_slice(self, slice_size):
168
- if slice_size is not None and self.config.attention_head_dim % slice_size != 0:
169
- raise ValueError(
170
- f"Make sure slice_size {slice_size} is a divisor of "
171
- f"the number of heads used in cross_attention {self.config.attention_head_dim}"
172
- )
173
- if slice_size is not None and slice_size > self.config.attention_head_dim:
174
- raise ValueError(
175
- f"Chunk_size {slice_size} has to be smaller or equal to "
176
- f"the number of heads used in cross_attention {self.config.attention_head_dim}"
177
- )
178
-
179
- for block in self.down_blocks:
180
- if hasattr(block, "attentions") and block.attentions is not None:
181
- block.set_attention_slice(slice_size)
182
-
183
- self.mid_block.set_attention_slice(slice_size)
184
-
185
- for block in self.up_blocks:
186
- if hasattr(block, "attentions") and block.attentions is not None:
187
- block.set_attention_slice(slice_size)
188
-
189
- def forward(
190
- self,
191
- sample: torch.FloatTensor,
192
- timestep: Union[torch.Tensor, float, int],
193
- encoder_hidden_states: torch.Tensor,
194
- return_dict: bool = True,
195
- ) -> Union[UNet2DConditionOutput, Tuple]:
196
- """r
197
- Args:
198
- sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
199
- timestep (`torch.FloatTensor` or `float` or `int): (batch) timesteps
200
- encoder_hidden_states (`torch.FloatTensor`): (batch, channel, height, width) encoder hidden states
201
- return_dict (`bool`, *optional*, defaults to `True`):
202
- Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
203
-
204
- Returns:
205
- [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
206
- [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
207
- returning a tuple, the first element is the sample tensor.
208
- """
209
- # 0. center input if necessary
210
- if self.config.center_input_sample:
211
- sample = 2 * sample - 1.0
212
-
213
- # 1. time
214
- timesteps = timestep
215
- if not torch.is_tensor(timesteps):
216
- timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
217
- elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
218
- timesteps = timesteps.to(dtype=torch.float32)
219
- timesteps = timesteps[None].to(device=sample.device)
220
-
221
- # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
222
- timesteps = timesteps.expand(sample.shape[0])
223
-
224
- t_emb = self.time_proj(timesteps)
225
- emb = self.time_embedding(t_emb)
226
-
227
- # 2. pre-process
228
- sample = self.conv_in(sample)
229
-
230
- # 3. down
231
- down_block_res_samples = (sample,)
232
- for downsample_block in self.down_blocks:
233
- if hasattr(downsample_block, "attentions") and downsample_block.attentions is not None:
234
- sample, res_samples = downsample_block(
235
- hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states
236
- )
237
- else:
238
- sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
239
-
240
- down_block_res_samples += res_samples
241
-
242
- # 4. mid
243
- sample = self.mid_block(sample, emb, encoder_hidden_states=encoder_hidden_states)
244
-
245
- # 5. up
246
- for upsample_block in self.up_blocks:
247
- res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
248
- down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
249
-
250
- if hasattr(upsample_block, "attentions") and upsample_block.attentions is not None:
251
- sample = upsample_block(
252
- hidden_states=sample,
253
- temb=emb,
254
- res_hidden_states_tuple=res_samples,
255
- encoder_hidden_states=encoder_hidden_states,
256
- )
257
- else:
258
- sample = upsample_block(hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples)
259
-
260
- # 6. post-process
261
- # make sure hidden states is in float32
262
- # when running in half-precision
263
- sample = self.conv_norm_out(sample.float()).type(sample.dtype)
264
- sample = self.conv_act(sample)
265
- sample = self.conv_out(sample)
266
-
267
- return sample
268
-
269
- if not return_dict:
270
- return (sample,)
271
-
272
- return UNet2DConditionOutput(sample=sample)
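A minimal usage sketch for the text-conditioned `UNet2DConditionModel` above, with illustrative sizes (the defaults are larger). Note that in this vendored copy, `forward` returns the raw tensor rather than `UNet2DConditionOutput`, because of the early `return sample` before the `return_dict` branch.

```python
import torch

unet = UNet2DConditionModel(
    sample_size=32,
    cross_attention_dim=768,                  # e.g. CLIP text-embedding width
    block_out_channels=(64, 128, 256, 256),   # illustrative, smaller than default
    attention_head_dim=8,
)

latents = torch.randn(1, 4, 32, 32)           # noisy latent sample
text_emb = torch.randn(1, 77, 768)            # encoder hidden states (batch, tokens, dim)

# Returns the raw predicted-noise tensor here, not a UNet2DConditionOutput,
# because of the early `return sample` in the forward pass above.
noise_pred = unet(latents, torch.tensor([50]), encoder_hidden_states=text_emb)
print(noise_pred.shape)                        # torch.Size([1, 4, 32, 32])
```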
diffusers/models/unet_blocks.py DELETED
@@ -1,1484 +0,0 @@
1
- # Copyright 2022 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
-
14
- import numpy as np
15
-
16
- # limitations under the License.
17
- import torch
18
- from torch import nn
19
-
20
- from .attention import AttentionBlock, SpatialTransformer
21
- from .resnet import Downsample2D, FirDownsample2D, FirUpsample2D, ResnetBlock2D, Upsample2D
22
-
23
-
24
- def get_down_block(
25
- down_block_type,
26
- num_layers,
27
- in_channels,
28
- out_channels,
29
- temb_channels,
30
- add_downsample,
31
- resnet_eps,
32
- resnet_act_fn,
33
- attn_num_head_channels,
34
- cross_attention_dim=None,
35
- downsample_padding=None,
36
- ):
37
- down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
38
- print(down_block_type)
39
- if down_block_type == "DownBlock2D":
40
- return DownBlock2D(
41
- num_layers=num_layers,
42
- in_channels=in_channels,
43
- out_channels=out_channels,
44
- temb_channels=temb_channels,
45
- add_downsample=add_downsample,
46
- resnet_eps=resnet_eps,
47
- resnet_act_fn=resnet_act_fn,
48
- downsample_padding=downsample_padding,
49
- )
50
- elif down_block_type == "AttnDownBlock2D":
51
- return AttnDownBlock2D(
52
- num_layers=num_layers,
53
- in_channels=in_channels,
54
- out_channels=out_channels,
55
- temb_channels=temb_channels,
56
- add_downsample=add_downsample,
57
- resnet_eps=resnet_eps,
58
- resnet_act_fn=resnet_act_fn,
59
- downsample_padding=downsample_padding,
60
- attn_num_head_channels=attn_num_head_channels,
61
- )
62
- elif down_block_type == "CrossAttnDownBlock2D":
63
- if cross_attention_dim is None:
64
- raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D")
65
- return CrossAttnDownBlock2D(
66
- num_layers=num_layers,
67
- in_channels=in_channels,
68
- out_channels=out_channels,
69
- temb_channels=temb_channels,
70
- add_downsample=add_downsample,
71
- resnet_eps=resnet_eps,
72
- resnet_act_fn=resnet_act_fn,
73
- downsample_padding=downsample_padding,
74
- cross_attention_dim=cross_attention_dim,
75
- attn_num_head_channels=attn_num_head_channels,
76
- )
77
- elif down_block_type == "SkipDownBlock2D":
78
- return SkipDownBlock2D(
79
- num_layers=num_layers,
80
- in_channels=in_channels,
81
- out_channels=out_channels,
82
- temb_channels=temb_channels,
83
- add_downsample=add_downsample,
84
- resnet_eps=resnet_eps,
85
- resnet_act_fn=resnet_act_fn,
86
- downsample_padding=downsample_padding,
87
- )
88
- elif down_block_type == "AttnSkipDownBlock2D":
89
- return AttnSkipDownBlock2D(
90
- num_layers=num_layers,
91
- in_channels=in_channels,
92
- out_channels=out_channels,
93
- temb_channels=temb_channels,
94
- add_downsample=add_downsample,
95
- resnet_eps=resnet_eps,
96
- resnet_act_fn=resnet_act_fn,
97
- downsample_padding=downsample_padding,
98
- attn_num_head_channels=attn_num_head_channels,
99
- )
100
- elif down_block_type == "DownEncoderBlock2D":
101
- return DownEncoderBlock2D(
102
- num_layers=num_layers,
103
- in_channels=in_channels,
104
- out_channels=out_channels,
105
- add_downsample=add_downsample,
106
- resnet_eps=resnet_eps,
107
- resnet_act_fn=resnet_act_fn,
108
- downsample_padding=downsample_padding,
109
- )
110
-
111
-
112
- def get_up_block(
113
- up_block_type,
114
- num_layers,
115
- in_channels,
116
- out_channels,
117
- prev_output_channel,
118
- temb_channels,
119
- add_upsample,
120
- resnet_eps,
121
- resnet_act_fn,
122
- attn_num_head_channels,
123
- cross_attention_dim=None,
124
- ):
125
- up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
126
- print(up_block_type)
127
- if up_block_type == "UpBlock2D":
128
- return UpBlock2D(
129
- num_layers=num_layers,
130
- in_channels=in_channels,
131
- out_channels=out_channels,
132
- prev_output_channel=prev_output_channel,
133
- temb_channels=temb_channels,
134
- add_upsample=add_upsample,
135
- resnet_eps=resnet_eps,
136
- resnet_act_fn=resnet_act_fn,
137
- )
138
- elif up_block_type == "CrossAttnUpBlock2D":
139
- if cross_attention_dim is None:
140
- raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D")
141
- return CrossAttnUpBlock2D(
142
- num_layers=num_layers,
143
- in_channels=in_channels,
144
- out_channels=out_channels,
145
- prev_output_channel=prev_output_channel,
146
- temb_channels=temb_channels,
147
- add_upsample=add_upsample,
148
- resnet_eps=resnet_eps,
149
- resnet_act_fn=resnet_act_fn,
150
- cross_attention_dim=cross_attention_dim,
151
- attn_num_head_channels=attn_num_head_channels,
152
- )
153
- elif up_block_type == "AttnUpBlock2D":
154
- return AttnUpBlock2D(
155
- num_layers=num_layers,
156
- in_channels=in_channels,
157
- out_channels=out_channels,
158
- prev_output_channel=prev_output_channel,
159
- temb_channels=temb_channels,
160
- add_upsample=add_upsample,
161
- resnet_eps=resnet_eps,
162
- resnet_act_fn=resnet_act_fn,
163
- attn_num_head_channels=attn_num_head_channels,
164
- )
165
- elif up_block_type == "SkipUpBlock2D":
166
- return SkipUpBlock2D(
167
- num_layers=num_layers,
168
- in_channels=in_channels,
169
- out_channels=out_channels,
170
- prev_output_channel=prev_output_channel,
171
- temb_channels=temb_channels,
172
- add_upsample=add_upsample,
173
- resnet_eps=resnet_eps,
174
- resnet_act_fn=resnet_act_fn,
175
- )
176
- elif up_block_type == "AttnSkipUpBlock2D":
177
- return AttnSkipUpBlock2D(
178
- num_layers=num_layers,
179
- in_channels=in_channels,
180
- out_channels=out_channels,
181
- prev_output_channel=prev_output_channel,
182
- temb_channels=temb_channels,
183
- add_upsample=add_upsample,
184
- resnet_eps=resnet_eps,
185
- resnet_act_fn=resnet_act_fn,
186
- attn_num_head_channels=attn_num_head_channels,
187
- )
188
- elif up_block_type == "UpDecoderBlock2D":
189
- return UpDecoderBlock2D(
190
- num_layers=num_layers,
191
- in_channels=in_channels,
192
- out_channels=out_channels,
193
- add_upsample=add_upsample,
194
- resnet_eps=resnet_eps,
195
- resnet_act_fn=resnet_act_fn,
196
- )
197
- raise ValueError(f"{up_block_type} does not exist.")
198
-
199
-
200
- class UNetMidBlock2D(nn.Module):
201
- def __init__(
202
- self,
203
- in_channels: int,
204
- temb_channels: int,
205
- dropout: float = 0.0,
206
- num_layers: int = 1,
207
- resnet_eps: float = 1e-6,
208
- resnet_time_scale_shift: str = "default",
209
- resnet_act_fn: str = "swish",
210
- resnet_groups: int = 32,
211
- resnet_pre_norm: bool = True,
212
- attn_num_head_channels=1,
213
- attention_type="default",
214
- output_scale_factor=1.0,
215
- **kwargs,
216
- ):
217
- super().__init__()
218
-
219
- self.attention_type = attention_type
220
- resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
221
-
222
- # there is always at least one resnet
223
- resnets = [
224
- ResnetBlock2D(
225
- in_channels=in_channels,
226
- out_channels=in_channels,
227
- temb_channels=temb_channels,
228
- eps=resnet_eps,
229
- groups=resnet_groups,
230
- dropout=dropout,
231
- time_embedding_norm=resnet_time_scale_shift,
232
- non_linearity=resnet_act_fn,
233
- output_scale_factor=output_scale_factor,
234
- pre_norm=resnet_pre_norm,
235
- )
236
- ]
237
- attentions = []
238
-
239
- for _ in range(num_layers):
240
- attentions.append(
241
- AttentionBlock(
242
- in_channels,
243
- num_head_channels=attn_num_head_channels,
244
- rescale_output_factor=output_scale_factor,
245
- eps=resnet_eps,
246
- num_groups=resnet_groups,
247
- )
248
- )
249
- resnets.append(
250
- ResnetBlock2D(
251
- in_channels=in_channels,
252
- out_channels=in_channels,
253
- temb_channels=temb_channels,
254
- eps=resnet_eps,
255
- groups=resnet_groups,
256
- dropout=dropout,
257
- time_embedding_norm=resnet_time_scale_shift,
258
- non_linearity=resnet_act_fn,
259
- output_scale_factor=output_scale_factor,
260
- pre_norm=resnet_pre_norm,
261
- )
262
- )
263
-
264
- self.attentions = nn.ModuleList(attentions)
265
- self.resnets = nn.ModuleList(resnets)
266
-
267
- def forward(self, hidden_states, temb=None, encoder_states=None):
268
- hidden_states = self.resnets[0](hidden_states, temb)
269
- print(self.attention_type)
270
- for attn, resnet in zip(self.attentions, self.resnets[1:]):
271
- if self.attention_type == "default":
272
- hidden_states = attn(hidden_states)
273
- else:
274
- hidden_states = attn(hidden_states, encoder_states)
275
- hidden_states = resnet(hidden_states, temb)
276
-
277
- return hidden_states
278
-
279
-
280
- class UNetMidBlock2DCrossAttn(nn.Module):
281
- def __init__(
282
- self,
283
- in_channels: int,
284
- temb_channels: int,
285
- dropout: float = 0.0,
286
- num_layers: int = 1,
287
- resnet_eps: float = 1e-6,
288
- resnet_time_scale_shift: str = "default",
289
- resnet_act_fn: str = "swish",
290
- resnet_groups: int = 32,
291
- resnet_pre_norm: bool = True,
292
- attn_num_head_channels=1,
293
- attention_type="default",
294
- output_scale_factor=1.0,
295
- cross_attention_dim=1280,
296
- **kwargs,
297
- ):
298
- super().__init__()
299
-
300
- self.attention_type = attention_type
301
- self.attn_num_head_channels = attn_num_head_channels
302
- resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
303
-
304
- # there is always at least one resnet
305
- resnets = [
306
- ResnetBlock2D(
307
- in_channels=in_channels,
308
- out_channels=in_channels,
309
- temb_channels=temb_channels,
310
- eps=resnet_eps,
311
- groups=resnet_groups,
312
- dropout=dropout,
313
- time_embedding_norm=resnet_time_scale_shift,
314
- non_linearity=resnet_act_fn,
315
- output_scale_factor=output_scale_factor,
316
- pre_norm=resnet_pre_norm,
317
- )
318
- ]
319
- attentions = []
320
-
321
- for _ in range(num_layers):
322
- attentions.append(
323
- SpatialTransformer(
324
- in_channels,
325
- attn_num_head_channels,
326
- in_channels // attn_num_head_channels,
327
- depth=1,
328
- context_dim=cross_attention_dim,
329
- )
330
- )
331
- resnets.append(
332
- ResnetBlock2D(
333
- in_channels=in_channels,
334
- out_channels=in_channels,
335
- temb_channels=temb_channels,
336
- eps=resnet_eps,
337
- groups=resnet_groups,
338
- dropout=dropout,
339
- time_embedding_norm=resnet_time_scale_shift,
340
- non_linearity=resnet_act_fn,
341
- output_scale_factor=output_scale_factor,
342
- pre_norm=resnet_pre_norm,
343
- )
344
- )
345
-
346
- self.attentions = nn.ModuleList(attentions)
347
- self.resnets = nn.ModuleList(resnets)
348
-
349
- def set_attention_slice(self, slice_size):
350
- if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
351
- raise ValueError(
352
- f"Make sure slice_size {slice_size} is a divisor of "
353
- f"the number of heads used in cross_attention {self.attn_num_head_channels}"
354
- )
355
- if slice_size is not None and slice_size > self.attn_num_head_channels:
356
- raise ValueError(
357
- f"Chunk_size {slice_size} has to be smaller or equal to "
358
- f"the number of heads used in cross_attention {self.attn_num_head_channels}"
359
- )
360
-
361
- for attn in self.attentions:
362
- attn._set_attention_slice(slice_size)
363
-
364
- def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
365
- hidden_states = self.resnets[0](hidden_states, temb)
366
- for attn, resnet in zip(self.attentions, self.resnets[1:]):
367
- hidden_states = attn(hidden_states, encoder_hidden_states)
368
- hidden_states = resnet(hidden_states, temb)
369
-
370
- return hidden_states
371
-
372
-
373
- class AttnDownBlock2D(nn.Module):
374
- def __init__(
375
- self,
376
- in_channels: int,
377
- out_channels: int,
378
- temb_channels: int,
379
- dropout: float = 0.0,
380
- num_layers: int = 1,
381
- resnet_eps: float = 1e-6,
382
- resnet_time_scale_shift: str = "default",
383
- resnet_act_fn: str = "swish",
384
- resnet_groups: int = 32,
385
- resnet_pre_norm: bool = True,
386
- attn_num_head_channels=1,
387
- attention_type="default",
388
- output_scale_factor=1.0,
389
- downsample_padding=1,
390
- add_downsample=True,
391
- ):
392
- super().__init__()
393
- resnets = []
394
- attentions = []
395
-
396
- self.attention_type = attention_type
397
-
398
- for i in range(num_layers):
399
- in_channels = in_channels if i == 0 else out_channels
400
- resnets.append(
401
- ResnetBlock2D(
402
- in_channels=in_channels,
403
- out_channels=out_channels,
404
- temb_channels=temb_channels,
405
- eps=resnet_eps,
406
- groups=resnet_groups,
407
- dropout=dropout,
408
- time_embedding_norm=resnet_time_scale_shift,
409
- non_linearity=resnet_act_fn,
410
- output_scale_factor=output_scale_factor,
411
- pre_norm=resnet_pre_norm,
412
- )
413
- )
414
- attentions.append(
415
- AttentionBlock(
416
- out_channels,
417
- num_head_channels=attn_num_head_channels,
418
- rescale_output_factor=output_scale_factor,
419
- eps=resnet_eps,
420
- )
421
- )
422
-
423
- self.attentions = nn.ModuleList(attentions)
424
- self.resnets = nn.ModuleList(resnets)
425
-
426
- if add_downsample:
427
- self.downsamplers = nn.ModuleList(
428
- [
429
- Downsample2D(
430
- in_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
431
- )
432
- ]
433
- )
434
- else:
435
- self.downsamplers = None
436
-
437
- def forward(self, hidden_states, temb=None):
438
- output_states = ()
439
-
440
- for resnet, attn in zip(self.resnets, self.attentions):
441
- hidden_states = resnet(hidden_states, temb)
442
- hidden_states = attn(hidden_states)
443
- output_states += (hidden_states,)
444
-
445
- if self.downsamplers is not None:
446
- for downsampler in self.downsamplers:
447
- hidden_states = downsampler(hidden_states)
448
-
449
- output_states += (hidden_states,)
450
-
451
- return hidden_states, output_states
452
-
453
-
454
- class CrossAttnDownBlock2D(nn.Module):
455
- def __init__(
456
- self,
457
- in_channels: int,
458
- out_channels: int,
459
- temb_channels: int,
460
- dropout: float = 0.0,
461
- num_layers: int = 1,
462
- resnet_eps: float = 1e-6,
463
- resnet_time_scale_shift: str = "default",
464
- resnet_act_fn: str = "swish",
465
- resnet_groups: int = 32,
466
- resnet_pre_norm: bool = True,
467
- attn_num_head_channels=1,
468
- cross_attention_dim=1280,
469
- attention_type="default",
470
- output_scale_factor=1.0,
471
- downsample_padding=1,
472
- add_downsample=True,
473
- ):
474
- super().__init__()
475
- resnets = []
476
- attentions = []
477
-
478
- self.attention_type = attention_type
479
- self.attn_num_head_channels = attn_num_head_channels
480
-
481
- for i in range(num_layers):
482
- in_channels = in_channels if i == 0 else out_channels
483
- resnets.append(
484
- ResnetBlock2D(
485
- in_channels=in_channels,
486
- out_channels=out_channels,
487
- temb_channels=temb_channels,
488
- eps=resnet_eps,
489
- groups=resnet_groups,
490
- dropout=dropout,
491
- time_embedding_norm=resnet_time_scale_shift,
492
- non_linearity=resnet_act_fn,
493
- output_scale_factor=output_scale_factor,
494
- pre_norm=resnet_pre_norm,
495
- )
496
- )
497
- attentions.append(
498
- SpatialTransformer(
499
- out_channels,
500
- attn_num_head_channels,
501
- out_channels // attn_num_head_channels,
502
- depth=1,
503
- context_dim=cross_attention_dim,
504
- )
505
- )
506
- self.attentions = nn.ModuleList(attentions)
507
- self.resnets = nn.ModuleList(resnets)
508
-
509
- if add_downsample:
510
- self.downsamplers = nn.ModuleList(
511
- [
512
- Downsample2D(
513
- in_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
514
- )
515
- ]
516
- )
517
- else:
518
- self.downsamplers = None
519
-
520
- def set_attention_slice(self, slice_size):
521
- if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
522
- raise ValueError(
523
- f"Make sure slice_size {slice_size} is a divisor of "
524
- f"the number of heads used in cross_attention {self.attn_num_head_channels}"
525
- )
526
- if slice_size is not None and slice_size > self.attn_num_head_channels:
527
- raise ValueError(
528
- f"Chunk_size {slice_size} has to be smaller or equal to "
529
- f"the number of heads used in cross_attention {self.attn_num_head_channels}"
530
- )
531
-
532
- for attn in self.attentions:
533
- attn._set_attention_slice(slice_size)
534
-
535
- def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
536
- output_states = ()
537
-
538
- for resnet, attn in zip(self.resnets, self.attentions):
539
- hidden_states = resnet(hidden_states, temb)
540
- hidden_states = attn(hidden_states, context=encoder_hidden_states)
541
- output_states += (hidden_states,)
542
-
543
- if self.downsamplers is not None:
544
- for downsampler in self.downsamplers:
545
- hidden_states = downsampler(hidden_states)
546
-
547
- output_states += (hidden_states,)
548
-
549
- return hidden_states, output_states
550
-
551
-
552
- class DownBlock2D(nn.Module):
553
- def __init__(
554
- self,
555
- in_channels: int,
556
- out_channels: int,
557
- temb_channels: int,
558
- dropout: float = 0.0,
559
- num_layers: int = 1,
560
- resnet_eps: float = 1e-6,
561
- resnet_time_scale_shift: str = "default",
562
- resnet_act_fn: str = "swish",
563
- resnet_groups: int = 32,
564
- resnet_pre_norm: bool = True,
565
- output_scale_factor=1.0,
566
- add_downsample=True,
567
- downsample_padding=1,
568
- ):
569
- super().__init__()
570
- resnets = []
571
-
572
- for i in range(num_layers):
573
- in_channels = in_channels if i == 0 else out_channels
574
- resnets.append(
575
- ResnetBlock2D(
576
- in_channels=in_channels,
577
- out_channels=out_channels,
578
- temb_channels=temb_channels,
579
- eps=resnet_eps,
580
- groups=resnet_groups,
581
- dropout=dropout,
582
- time_embedding_norm=resnet_time_scale_shift,
583
- non_linearity=resnet_act_fn,
584
- output_scale_factor=output_scale_factor,
585
- pre_norm=resnet_pre_norm,
586
- )
587
- )
588
-
589
- self.resnets = nn.ModuleList(resnets)
590
-
591
- if add_downsample:
592
- self.downsamplers = nn.ModuleList(
593
- [
594
- Downsample2D(
595
- in_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
596
- )
597
- ]
598
- )
599
- else:
600
- self.downsamplers = None
601
-
602
- def forward(self, hidden_states, temb=None):
603
- output_states = ()
604
-
605
- for resnet in self.resnets:
606
- hidden_states = resnet(hidden_states, temb)
607
- output_states += (hidden_states,)
608
-
609
- if self.downsamplers is not None:
610
- for downsampler in self.downsamplers:
611
- hidden_states = downsampler(hidden_states)
612
-
613
- output_states += (hidden_states,)
614
-
615
- return hidden_states, output_states
616
-
617
-
618
- class DownEncoderBlock2D(nn.Module):
619
- def __init__(
620
- self,
621
- in_channels: int,
622
- out_channels: int,
623
- dropout: float = 0.0,
624
- num_layers: int = 1,
625
- resnet_eps: float = 1e-6,
626
- resnet_time_scale_shift: str = "default",
627
- resnet_act_fn: str = "swish",
628
- resnet_groups: int = 32,
629
- resnet_pre_norm: bool = True,
630
- output_scale_factor=1.0,
631
- add_downsample=True,
632
- downsample_padding=1,
633
- ):
634
- super().__init__()
635
- resnets = []
636
-
637
- for i in range(num_layers):
638
- in_channels = in_channels if i == 0 else out_channels
639
- resnets.append(
640
- ResnetBlock2D(
641
- in_channels=in_channels,
642
- out_channels=out_channels,
643
- temb_channels=None,
644
- eps=resnet_eps,
645
- groups=resnet_groups,
646
- dropout=dropout,
647
- time_embedding_norm=resnet_time_scale_shift,
648
- non_linearity=resnet_act_fn,
649
- output_scale_factor=output_scale_factor,
650
- pre_norm=resnet_pre_norm,
651
- )
652
- )
653
-
654
- self.resnets = nn.ModuleList(resnets)
655
-
656
- if add_downsample:
657
- self.downsamplers = nn.ModuleList(
658
- [
659
- Downsample2D(
660
- in_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
661
- )
662
- ]
663
- )
664
- else:
665
- self.downsamplers = None
666
-
667
- def forward(self, hidden_states):
668
- for resnet in self.resnets:
669
- hidden_states = resnet(hidden_states, temb=None)
670
-
671
- if self.downsamplers is not None:
672
- for downsampler in self.downsamplers:
673
- hidden_states = downsampler(hidden_states)
674
-
675
- return hidden_states
676
-
677
-
678
- class AttnDownEncoderBlock2D(nn.Module):
679
- def __init__(
680
- self,
681
- in_channels: int,
682
- out_channels: int,
683
- dropout: float = 0.0,
684
- num_layers: int = 1,
685
- resnet_eps: float = 1e-6,
686
- resnet_time_scale_shift: str = "default",
687
- resnet_act_fn: str = "swish",
688
- resnet_groups: int = 32,
689
- resnet_pre_norm: bool = True,
690
- attn_num_head_channels=1,
691
- output_scale_factor=1.0,
692
- add_downsample=True,
693
- downsample_padding=1,
694
- ):
695
- super().__init__()
696
- resnets = []
697
- attentions = []
698
-
699
- for i in range(num_layers):
700
- in_channels = in_channels if i == 0 else out_channels
701
- resnets.append(
702
- ResnetBlock2D(
703
- in_channels=in_channels,
704
- out_channels=out_channels,
705
- temb_channels=None,
706
- eps=resnet_eps,
707
- groups=resnet_groups,
708
- dropout=dropout,
709
- time_embedding_norm=resnet_time_scale_shift,
710
- non_linearity=resnet_act_fn,
711
- output_scale_factor=output_scale_factor,
712
- pre_norm=resnet_pre_norm,
713
- )
714
- )
715
- attentions.append(
716
- AttentionBlock(
717
- out_channels,
718
- num_head_channels=attn_num_head_channels,
719
- rescale_output_factor=output_scale_factor,
720
- eps=resnet_eps,
721
- num_groups=resnet_groups,
722
- )
723
- )
724
-
725
- self.attentions = nn.ModuleList(attentions)
726
- self.resnets = nn.ModuleList(resnets)
727
-
728
- if add_downsample:
729
- self.downsamplers = nn.ModuleList(
730
- [
731
- Downsample2D(
732
- in_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
733
- )
734
- ]
735
- )
736
- else:
737
- self.downsamplers = None
738
-
739
- def forward(self, hidden_states):
740
- for resnet, attn in zip(self.resnets, self.attentions):
741
- hidden_states = resnet(hidden_states, temb=None)
742
- hidden_states = attn(hidden_states)
743
-
744
- if self.downsamplers is not None:
745
- for downsampler in self.downsamplers:
746
- hidden_states = downsampler(hidden_states)
747
-
748
- return hidden_states
749
-
750
-
751
- class AttnSkipDownBlock2D(nn.Module):
752
- def __init__(
753
- self,
754
- in_channels: int,
755
- out_channels: int,
756
- temb_channels: int,
757
- dropout: float = 0.0,
758
- num_layers: int = 1,
759
- resnet_eps: float = 1e-6,
760
- resnet_time_scale_shift: str = "default",
761
- resnet_act_fn: str = "swish",
762
- resnet_pre_norm: bool = True,
763
- attn_num_head_channels=1,
764
- attention_type="default",
765
- output_scale_factor=np.sqrt(2.0),
766
- downsample_padding=1,
767
- add_downsample=True,
768
- ):
769
- super().__init__()
770
- self.attentions = nn.ModuleList([])
771
- self.resnets = nn.ModuleList([])
772
-
773
- self.attention_type = attention_type
774
-
775
- for i in range(num_layers):
776
- in_channels = in_channels if i == 0 else out_channels
777
- self.resnets.append(
778
- ResnetBlock2D(
779
- in_channels=in_channels,
780
- out_channels=out_channels,
781
- temb_channels=temb_channels,
782
- eps=resnet_eps,
783
- groups=min(in_channels // 4, 32),
784
- groups_out=min(out_channels // 4, 32),
785
- dropout=dropout,
786
- time_embedding_norm=resnet_time_scale_shift,
787
- non_linearity=resnet_act_fn,
788
- output_scale_factor=output_scale_factor,
789
- pre_norm=resnet_pre_norm,
790
- )
791
- )
792
- self.attentions.append(
793
- AttentionBlock(
794
- out_channels,
795
- num_head_channels=attn_num_head_channels,
796
- rescale_output_factor=output_scale_factor,
797
- eps=resnet_eps,
798
- )
799
- )
800
-
801
- if add_downsample:
802
- self.resnet_down = ResnetBlock2D(
803
- in_channels=out_channels,
804
- out_channels=out_channels,
805
- temb_channels=temb_channels,
806
- eps=resnet_eps,
807
- groups=min(out_channels // 4, 32),
808
- dropout=dropout,
809
- time_embedding_norm=resnet_time_scale_shift,
810
- non_linearity=resnet_act_fn,
811
- output_scale_factor=output_scale_factor,
812
- pre_norm=resnet_pre_norm,
813
- use_nin_shortcut=True,
814
- down=True,
815
- kernel="fir",
816
- )
817
- self.downsamplers = nn.ModuleList([FirDownsample2D(in_channels, out_channels=out_channels)])
818
- self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
819
- else:
820
- self.resnet_down = None
821
- self.downsamplers = None
822
- self.skip_conv = None
823
-
824
- def forward(self, hidden_states, temb=None, skip_sample=None):
825
- output_states = ()
826
-
827
- for resnet, attn in zip(self.resnets, self.attentions):
828
- hidden_states = resnet(hidden_states, temb)
829
- hidden_states = attn(hidden_states)
830
- output_states += (hidden_states,)
831
-
832
- if self.downsamplers is not None:
833
- hidden_states = self.resnet_down(hidden_states, temb)
834
- for downsampler in self.downsamplers:
835
- skip_sample = downsampler(skip_sample)
836
-
837
- hidden_states = self.skip_conv(skip_sample) + hidden_states
838
-
839
- output_states += (hidden_states,)
840
-
841
- return hidden_states, output_states, skip_sample
842
-
843
-
844
- class SkipDownBlock2D(nn.Module):
845
- def __init__(
846
- self,
847
- in_channels: int,
848
- out_channels: int,
849
- temb_channels: int,
850
- dropout: float = 0.0,
851
- num_layers: int = 1,
852
- resnet_eps: float = 1e-6,
853
- resnet_time_scale_shift: str = "default",
854
- resnet_act_fn: str = "swish",
855
- resnet_pre_norm: bool = True,
856
- output_scale_factor=np.sqrt(2.0),
857
- add_downsample=True,
858
- downsample_padding=1,
859
- ):
860
- super().__init__()
861
- self.resnets = nn.ModuleList([])
862
-
863
- for i in range(num_layers):
864
- in_channels = in_channels if i == 0 else out_channels
865
- self.resnets.append(
866
- ResnetBlock2D(
867
- in_channels=in_channels,
868
- out_channels=out_channels,
869
- temb_channels=temb_channels,
870
- eps=resnet_eps,
871
- groups=min(in_channels // 4, 32),
872
- groups_out=min(out_channels // 4, 32),
873
- dropout=dropout,
874
- time_embedding_norm=resnet_time_scale_shift,
875
- non_linearity=resnet_act_fn,
876
- output_scale_factor=output_scale_factor,
877
- pre_norm=resnet_pre_norm,
878
- )
879
- )
880
-
881
- if add_downsample:
882
- self.resnet_down = ResnetBlock2D(
883
- in_channels=out_channels,
884
- out_channels=out_channels,
885
- temb_channels=temb_channels,
886
- eps=resnet_eps,
887
- groups=min(out_channels // 4, 32),
888
- dropout=dropout,
889
- time_embedding_norm=resnet_time_scale_shift,
890
- non_linearity=resnet_act_fn,
891
- output_scale_factor=output_scale_factor,
892
- pre_norm=resnet_pre_norm,
893
- use_nin_shortcut=True,
894
- down=True,
895
- kernel="fir",
896
- )
897
- self.downsamplers = nn.ModuleList([FirDownsample2D(in_channels, out_channels=out_channels)])
898
- self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
899
- else:
900
- self.resnet_down = None
901
- self.downsamplers = None
902
- self.skip_conv = None
903
-
904
- def forward(self, hidden_states, temb=None, skip_sample=None):
905
- output_states = ()
906
-
907
- for resnet in self.resnets:
908
- hidden_states = resnet(hidden_states, temb)
909
- output_states += (hidden_states,)
910
-
911
- if self.downsamplers is not None:
912
- hidden_states = self.resnet_down(hidden_states, temb)
913
- for downsampler in self.downsamplers:
914
- skip_sample = downsampler(skip_sample)
915
-
916
- hidden_states = self.skip_conv(skip_sample) + hidden_states
917
-
918
- output_states += (hidden_states,)
919
-
920
- return hidden_states, output_states, skip_sample
921
-
922
-
923
- class AttnUpBlock2D(nn.Module):
924
- def __init__(
925
- self,
926
- in_channels: int,
927
- prev_output_channel: int,
928
- out_channels: int,
929
- temb_channels: int,
930
- dropout: float = 0.0,
931
- num_layers: int = 1,
932
- resnet_eps: float = 1e-6,
933
- resnet_time_scale_shift: str = "default",
934
- resnet_act_fn: str = "swish",
935
- resnet_groups: int = 32,
936
- resnet_pre_norm: bool = True,
937
- attention_type="default",
938
- attn_num_head_channels=1,
939
- output_scale_factor=1.0,
940
- add_upsample=True,
941
- ):
942
- super().__init__()
943
- resnets = []
944
- attentions = []
945
-
946
- self.attention_type = attention_type
947
-
948
- for i in range(num_layers):
949
- res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
950
- resnet_in_channels = prev_output_channel if i == 0 else out_channels
951
-
952
- resnets.append(
953
- ResnetBlock2D(
954
- in_channels=resnet_in_channels + res_skip_channels,
955
- out_channels=out_channels,
956
- temb_channels=temb_channels,
957
- eps=resnet_eps,
958
- groups=resnet_groups,
959
- dropout=dropout,
960
- time_embedding_norm=resnet_time_scale_shift,
961
- non_linearity=resnet_act_fn,
962
- output_scale_factor=output_scale_factor,
963
- pre_norm=resnet_pre_norm,
964
- )
965
- )
966
- attentions.append(
967
- AttentionBlock(
968
- out_channels,
969
- num_head_channels=attn_num_head_channels,
970
- rescale_output_factor=output_scale_factor,
971
- eps=resnet_eps,
972
- )
973
- )
974
-
975
- self.attentions = nn.ModuleList(attentions)
976
- self.resnets = nn.ModuleList(resnets)
977
-
978
- if add_upsample:
979
- self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
980
- else:
981
- self.upsamplers = None
982
-
983
- def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
984
- for resnet, attn in zip(self.resnets, self.attentions):
985
-
986
- # pop res hidden states
987
- res_hidden_states = res_hidden_states_tuple[-1]
988
- res_hidden_states_tuple = res_hidden_states_tuple[:-1]
989
- hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
990
-
991
- hidden_states = resnet(hidden_states, temb)
992
- hidden_states = attn(hidden_states)
993
-
994
- if self.upsamplers is not None:
995
- for upsampler in self.upsamplers:
996
- hidden_states = upsampler(hidden_states)
997
-
998
- return hidden_states
999
-
1000
-
1001
- class CrossAttnUpBlock2D(nn.Module):
1002
- def __init__(
1003
- self,
1004
- in_channels: int,
1005
- out_channels: int,
1006
- prev_output_channel: int,
1007
- temb_channels: int,
1008
- dropout: float = 0.0,
1009
- num_layers: int = 1,
1010
- resnet_eps: float = 1e-6,
1011
- resnet_time_scale_shift: str = "default",
1012
- resnet_act_fn: str = "swish",
1013
- resnet_groups: int = 32,
1014
- resnet_pre_norm: bool = True,
1015
- attn_num_head_channels=1,
1016
- cross_attention_dim=1280,
1017
- attention_type="default",
1018
- output_scale_factor=1.0,
1019
- downsample_padding=1,
1020
- add_upsample=True,
1021
- ):
1022
- super().__init__()
1023
- resnets = []
1024
- attentions = []
1025
-
1026
- self.attention_type = attention_type
1027
- self.attn_num_head_channels = attn_num_head_channels
1028
-
1029
- for i in range(num_layers):
1030
- res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1031
- resnet_in_channels = prev_output_channel if i == 0 else out_channels
1032
-
1033
- resnets.append(
1034
- ResnetBlock2D(
1035
- in_channels=resnet_in_channels + res_skip_channels,
1036
- out_channels=out_channels,
1037
- temb_channels=temb_channels,
1038
- eps=resnet_eps,
1039
- groups=resnet_groups,
1040
- dropout=dropout,
1041
- time_embedding_norm=resnet_time_scale_shift,
1042
- non_linearity=resnet_act_fn,
1043
- output_scale_factor=output_scale_factor,
1044
- pre_norm=resnet_pre_norm,
1045
- )
1046
- )
1047
- attentions.append(
1048
- SpatialTransformer(
1049
- out_channels,
1050
- attn_num_head_channels,
1051
- out_channels // attn_num_head_channels,
1052
- depth=1,
1053
- context_dim=cross_attention_dim,
1054
- )
1055
- )
1056
- self.attentions = nn.ModuleList(attentions)
1057
- self.resnets = nn.ModuleList(resnets)
1058
-
1059
- if add_upsample:
1060
- self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
1061
- else:
1062
- self.upsamplers = None
1063
-
1064
- def set_attention_slice(self, slice_size):
1065
- if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
1066
- raise ValueError(
1067
- f"Make sure slice_size {slice_size} is a divisor of "
1068
- f"the number of heads used in cross_attention {self.attn_num_head_channels}"
1069
- )
1070
- if slice_size is not None and slice_size > self.attn_num_head_channels:
1071
- raise ValueError(
1072
- f"Chunk_size {slice_size} has to be smaller or equal to "
1073
- f"the number of heads used in cross_attention {self.attn_num_head_channels}"
1074
- )
1075
-
1076
- for attn in self.attentions:
1077
- attn._set_attention_slice(slice_size)
1078
-
1079
- def forward(self, hidden_states, res_hidden_states_tuple, temb=None, encoder_hidden_states=None):
1080
- for resnet, attn in zip(self.resnets, self.attentions):
1081
-
1082
- # pop res hidden states
1083
- res_hidden_states = res_hidden_states_tuple[-1]
1084
- res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1085
- hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1086
-
1087
- hidden_states = resnet(hidden_states, temb)
1088
- hidden_states = attn(hidden_states, context=encoder_hidden_states)
1089
-
1090
- if self.upsamplers is not None:
1091
- for upsampler in self.upsamplers:
1092
- hidden_states = upsampler(hidden_states)
1093
-
1094
- return hidden_states
1095
-
1096
-
1097
- class UpBlock2D(nn.Module):
1098
- def __init__(
1099
- self,
1100
- in_channels: int,
1101
- prev_output_channel: int,
1102
- out_channels: int,
1103
- temb_channels: int,
1104
- dropout: float = 0.0,
1105
- num_layers: int = 1,
1106
- resnet_eps: float = 1e-6,
1107
- resnet_time_scale_shift: str = "default",
1108
- resnet_act_fn: str = "swish",
1109
- resnet_groups: int = 32,
1110
- resnet_pre_norm: bool = True,
1111
- output_scale_factor=1.0,
1112
- add_upsample=True,
1113
- ):
1114
- super().__init__()
1115
- resnets = []
1116
-
1117
- for i in range(num_layers):
1118
- res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1119
- resnet_in_channels = prev_output_channel if i == 0 else out_channels
1120
-
1121
- resnets.append(
1122
- ResnetBlock2D(
1123
- in_channels=resnet_in_channels + res_skip_channels,
1124
- out_channels=out_channels,
1125
- temb_channels=temb_channels,
1126
- eps=resnet_eps,
1127
- groups=resnet_groups,
1128
- dropout=dropout,
1129
- time_embedding_norm=resnet_time_scale_shift,
1130
- non_linearity=resnet_act_fn,
1131
- output_scale_factor=output_scale_factor,
1132
- pre_norm=resnet_pre_norm,
1133
- )
1134
- )
1135
-
1136
- self.resnets = nn.ModuleList(resnets)
1137
-
1138
- if add_upsample:
1139
- self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
1140
- else:
1141
- self.upsamplers = None
1142
-
1143
- def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
1144
- for resnet in self.resnets:
1145
-
1146
- # pop res hidden states
1147
- res_hidden_states = res_hidden_states_tuple[-1]
1148
- res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1149
- hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1150
-
1151
- hidden_states = resnet(hidden_states, temb)
1152
-
1153
- if self.upsamplers is not None:
1154
- for upsampler in self.upsamplers:
1155
- hidden_states = upsampler(hidden_states)
1156
-
1157
- return hidden_states
1158
-
1159
-
1160
- class UpDecoderBlock2D(nn.Module):
1161
- def __init__(
1162
- self,
1163
- in_channels: int,
1164
- out_channels: int,
1165
- dropout: float = 0.0,
1166
- num_layers: int = 1,
1167
- resnet_eps: float = 1e-6,
1168
- resnet_time_scale_shift: str = "default",
1169
- resnet_act_fn: str = "swish",
1170
- resnet_groups: int = 32,
1171
- resnet_pre_norm: bool = True,
1172
- output_scale_factor=1.0,
1173
- add_upsample=True,
1174
- ):
1175
- super().__init__()
1176
- resnets = []
1177
-
1178
- for i in range(num_layers):
1179
- input_channels = in_channels if i == 0 else out_channels
1180
-
1181
- resnets.append(
1182
- ResnetBlock2D(
1183
- in_channels=input_channels,
1184
- out_channels=out_channels,
1185
- temb_channels=None,
1186
- eps=resnet_eps,
1187
- groups=resnet_groups,
1188
- dropout=dropout,
1189
- time_embedding_norm=resnet_time_scale_shift,
1190
- non_linearity=resnet_act_fn,
1191
- output_scale_factor=output_scale_factor,
1192
- pre_norm=resnet_pre_norm,
1193
- )
1194
- )
1195
-
1196
- self.resnets = nn.ModuleList(resnets)
1197
-
1198
- if add_upsample:
1199
- self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
1200
- else:
1201
- self.upsamplers = None
1202
-
1203
- def forward(self, hidden_states):
1204
- for resnet in self.resnets:
1205
- hidden_states = resnet(hidden_states, temb=None)
1206
-
1207
- if self.upsamplers is not None:
1208
- for upsampler in self.upsamplers:
1209
- hidden_states = upsampler(hidden_states)
1210
-
1211
- return hidden_states
1212
-
1213
-
1214
- class AttnUpDecoderBlock2D(nn.Module):
1215
- def __init__(
1216
- self,
1217
- in_channels: int,
1218
- out_channels: int,
1219
- dropout: float = 0.0,
1220
- num_layers: int = 1,
1221
- resnet_eps: float = 1e-6,
1222
- resnet_time_scale_shift: str = "default",
1223
- resnet_act_fn: str = "swish",
1224
- resnet_groups: int = 32,
1225
- resnet_pre_norm: bool = True,
1226
- attn_num_head_channels=1,
1227
- output_scale_factor=1.0,
1228
- add_upsample=True,
1229
- ):
1230
- super().__init__()
1231
- resnets = []
1232
- attentions = []
1233
-
1234
- for i in range(num_layers):
1235
- input_channels = in_channels if i == 0 else out_channels
1236
-
1237
- resnets.append(
1238
- ResnetBlock2D(
1239
- in_channels=input_channels,
1240
- out_channels=out_channels,
1241
- temb_channels=None,
1242
- eps=resnet_eps,
1243
- groups=resnet_groups,
1244
- dropout=dropout,
1245
- time_embedding_norm=resnet_time_scale_shift,
1246
- non_linearity=resnet_act_fn,
1247
- output_scale_factor=output_scale_factor,
1248
- pre_norm=resnet_pre_norm,
1249
- )
1250
- )
1251
- attentions.append(
1252
- AttentionBlock(
1253
- out_channels,
1254
- num_head_channels=attn_num_head_channels,
1255
- rescale_output_factor=output_scale_factor,
1256
- eps=resnet_eps,
1257
- num_groups=resnet_groups,
1258
- )
1259
- )
1260
-
1261
- self.attentions = nn.ModuleList(attentions)
1262
- self.resnets = nn.ModuleList(resnets)
1263
-
1264
- if add_upsample:
1265
- self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
1266
- else:
1267
- self.upsamplers = None
1268
-
1269
- def forward(self, hidden_states):
1270
- for resnet, attn in zip(self.resnets, self.attentions):
1271
- hidden_states = resnet(hidden_states, temb=None)
1272
- hidden_states = attn(hidden_states)
1273
-
1274
- if self.upsamplers is not None:
1275
- for upsampler in self.upsamplers:
1276
- hidden_states = upsampler(hidden_states)
1277
-
1278
- return hidden_states
1279
-
1280
-
1281
- class AttnSkipUpBlock2D(nn.Module):
1282
- def __init__(
1283
- self,
1284
- in_channels: int,
1285
- prev_output_channel: int,
1286
- out_channels: int,
1287
- temb_channels: int,
1288
- dropout: float = 0.0,
1289
- num_layers: int = 1,
1290
- resnet_eps: float = 1e-6,
1291
- resnet_time_scale_shift: str = "default",
1292
- resnet_act_fn: str = "swish",
1293
- resnet_pre_norm: bool = True,
1294
- attn_num_head_channels=1,
1295
- attention_type="default",
1296
- output_scale_factor=np.sqrt(2.0),
1297
- upsample_padding=1,
1298
- add_upsample=True,
1299
- ):
1300
- super().__init__()
1301
- self.attentions = nn.ModuleList([])
1302
- self.resnets = nn.ModuleList([])
1303
-
1304
- self.attention_type = attention_type
1305
-
1306
- for i in range(num_layers):
1307
- res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1308
- resnet_in_channels = prev_output_channel if i == 0 else out_channels
1309
-
1310
- self.resnets.append(
1311
- ResnetBlock2D(
1312
- in_channels=resnet_in_channels + res_skip_channels,
1313
- out_channels=out_channels,
1314
- temb_channels=temb_channels,
1315
- eps=resnet_eps,
1316
- groups=min((resnet_in_channels + res_skip_channels) // 4, 32),
1317
- groups_out=min(out_channels // 4, 32),
1318
- dropout=dropout,
1319
- time_embedding_norm=resnet_time_scale_shift,
1320
- non_linearity=resnet_act_fn,
1321
- output_scale_factor=output_scale_factor,
1322
- pre_norm=resnet_pre_norm,
1323
- )
1324
- )
1325
-
1326
- self.attentions.append(
1327
- AttentionBlock(
1328
- out_channels,
1329
- num_head_channels=attn_num_head_channels,
1330
- rescale_output_factor=output_scale_factor,
1331
- eps=resnet_eps,
1332
- )
1333
- )
1334
-
1335
- self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
1336
- if add_upsample:
1337
- self.resnet_up = ResnetBlock2D(
1338
- in_channels=out_channels,
1339
- out_channels=out_channels,
1340
- temb_channels=temb_channels,
1341
- eps=resnet_eps,
1342
- groups=min(out_channels // 4, 32),
1343
- groups_out=min(out_channels // 4, 32),
1344
- dropout=dropout,
1345
- time_embedding_norm=resnet_time_scale_shift,
1346
- non_linearity=resnet_act_fn,
1347
- output_scale_factor=output_scale_factor,
1348
- pre_norm=resnet_pre_norm,
1349
- use_nin_shortcut=True,
1350
- up=True,
1351
- kernel="fir",
1352
- )
1353
- self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
1354
- self.skip_norm = torch.nn.GroupNorm(
1355
- num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
1356
- )
1357
- self.act = nn.SiLU()
1358
- else:
1359
- self.resnet_up = None
1360
- self.skip_conv = None
1361
- self.skip_norm = None
1362
- self.act = None
1363
-
1364
- def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None):
1365
- for resnet in self.resnets:
1366
- # pop res hidden states
1367
- res_hidden_states = res_hidden_states_tuple[-1]
1368
- res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1369
- hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1370
-
1371
- hidden_states = resnet(hidden_states, temb)
1372
-
1373
- hidden_states = self.attentions[0](hidden_states)
1374
-
1375
- if skip_sample is not None:
1376
- skip_sample = self.upsampler(skip_sample)
1377
- else:
1378
- skip_sample = 0
1379
-
1380
- if self.resnet_up is not None:
1381
- skip_sample_states = self.skip_norm(hidden_states)
1382
- skip_sample_states = self.act(skip_sample_states)
1383
- skip_sample_states = self.skip_conv(skip_sample_states)
1384
-
1385
- skip_sample = skip_sample + skip_sample_states
1386
-
1387
- hidden_states = self.resnet_up(hidden_states, temb)
1388
-
1389
- return hidden_states, skip_sample
1390
-
1391
-
1392
- class SkipUpBlock2D(nn.Module):
1393
- def __init__(
1394
- self,
1395
- in_channels: int,
1396
- prev_output_channel: int,
1397
- out_channels: int,
1398
- temb_channels: int,
1399
- dropout: float = 0.0,
1400
- num_layers: int = 1,
1401
- resnet_eps: float = 1e-6,
1402
- resnet_time_scale_shift: str = "default",
1403
- resnet_act_fn: str = "swish",
1404
- resnet_pre_norm: bool = True,
1405
- output_scale_factor=np.sqrt(2.0),
1406
- add_upsample=True,
1407
- upsample_padding=1,
1408
- ):
1409
- super().__init__()
1410
- self.resnets = nn.ModuleList([])
1411
-
1412
- for i in range(num_layers):
1413
- res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1414
- resnet_in_channels = prev_output_channel if i == 0 else out_channels
1415
-
1416
- self.resnets.append(
1417
- ResnetBlock2D(
1418
- in_channels=resnet_in_channels + res_skip_channels,
1419
- out_channels=out_channels,
1420
- temb_channels=temb_channels,
1421
- eps=resnet_eps,
1422
- groups=min((resnet_in_channels + res_skip_channels) // 4, 32),
1423
- groups_out=min(out_channels // 4, 32),
1424
- dropout=dropout,
1425
- time_embedding_norm=resnet_time_scale_shift,
1426
- non_linearity=resnet_act_fn,
1427
- output_scale_factor=output_scale_factor,
1428
- pre_norm=resnet_pre_norm,
1429
- )
1430
- )
1431
-
1432
- self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
1433
- if add_upsample:
1434
- self.resnet_up = ResnetBlock2D(
1435
- in_channels=out_channels,
1436
- out_channels=out_channels,
1437
- temb_channels=temb_channels,
1438
- eps=resnet_eps,
1439
- groups=min(out_channels // 4, 32),
1440
- groups_out=min(out_channels // 4, 32),
1441
- dropout=dropout,
1442
- time_embedding_norm=resnet_time_scale_shift,
1443
- non_linearity=resnet_act_fn,
1444
- output_scale_factor=output_scale_factor,
1445
- pre_norm=resnet_pre_norm,
1446
- use_nin_shortcut=True,
1447
- up=True,
1448
- kernel="fir",
1449
- )
1450
- self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
1451
- self.skip_norm = torch.nn.GroupNorm(
1452
- num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
1453
- )
1454
- self.act = nn.SiLU()
1455
- else:
1456
- self.resnet_up = None
1457
- self.skip_conv = None
1458
- self.skip_norm = None
1459
- self.act = None
1460
-
1461
- def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None):
1462
- for resnet in self.resnets:
1463
- # pop res hidden states
1464
- res_hidden_states = res_hidden_states_tuple[-1]
1465
- res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1466
- hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1467
-
1468
- hidden_states = resnet(hidden_states, temb)
1469
-
1470
- if skip_sample is not None:
1471
- skip_sample = self.upsampler(skip_sample)
1472
- else:
1473
- skip_sample = 0
1474
-
1475
- if self.resnet_up is not None:
1476
- skip_sample_states = self.skip_norm(hidden_states)
1477
- skip_sample_states = self.act(skip_sample_states)
1478
- skip_sample_states = self.skip_conv(skip_sample_states)
1479
-
1480
- skip_sample = skip_sample + skip_sample_states
1481
-
1482
- hidden_states = self.resnet_up(hidden_states, temb)
1483
-
1484
- return hidden_states, skip_sample
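For reference, a minimal sketch of how the cross-attention down block deleted above is driven. This assumes the vendored module removed in this commit (or the equivalent upstream `diffusers` module) is importable as `diffusers.models.unet_blocks`; the channel widths, head size, and tensor shapes mirror a Stable Diffusion v1-style configuration and are illustrative only, not prescriptive.

    import torch
    from diffusers.models.unet_blocks import CrossAttnDownBlock2D

    # Illustrative SD v1-style configuration (assumed values).
    block = CrossAttnDownBlock2D(
        in_channels=320,
        out_channels=320,
        temb_channels=1280,
        num_layers=2,
        attn_num_head_channels=8,   # SpatialTransformer gets heads of out_channels // 8 dims
        cross_attention_dim=768,    # width of the text-encoder hidden states
    )

    sample = torch.randn(1, 320, 64, 64)   # latent feature map
    temb = torch.randn(1, 1280)            # timestep embedding
    text = torch.randn(1, 77, 768)         # encoder hidden states (prompt tokens)

    # forward returns the downsampled features plus every intermediate state,
    # which the UNet later hands to the matching CrossAttnUpBlock2D as skip connections.
    hidden_states, output_states = block(sample, temb=temb, encoder_hidden_states=text)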
diffusers/models/vae.py DELETED
@@ -1,585 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import Optional, Tuple, Union
3
-
4
- import numpy as np
5
- import torch
6
- import torch.nn as nn
7
-
8
- from ..configuration_utils import ConfigMixin, register_to_config
9
- from ..modeling_utils import ModelMixin
10
- from ..utils import BaseOutput
11
- from .unet_blocks import UNetMidBlock2D, get_down_block, get_up_block
12
-
13
-
14
- @dataclass
15
- class DecoderOutput(BaseOutput):
16
- """
17
- Output of decoding method.
18
-
19
- Args:
20
- sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
21
- Decoded output sample of the model. Output of the last layer of the model.
22
- """
23
-
24
- sample: torch.FloatTensor
25
-
26
-
27
- @dataclass
28
- class VQEncoderOutput(BaseOutput):
29
- """
30
- Output of VQModel encoding method.
31
-
32
- Args:
33
- latents (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
34
- Encoded output sample of the model. Output of the last layer of the model.
35
- """
36
-
37
- latents: torch.FloatTensor
38
-
39
-
40
- @dataclass
41
- class AutoencoderKLOutput(BaseOutput):
42
- """
43
- Output of AutoencoderKL encoding method.
44
-
45
- Args:
46
- latent_dist (`DiagonalGaussianDistribution`):
47
- Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`.
48
- `DiagonalGaussianDistribution` allows for sampling latents from the distribution.
49
- """
50
-
51
- latent_dist: "DiagonalGaussianDistribution"
52
-
53
-
54
- class Encoder(nn.Module):
55
- def __init__(
56
- self,
57
- in_channels=3,
58
- out_channels=3,
59
- down_block_types=("DownEncoderBlock2D",),
60
- block_out_channels=(64,),
61
- layers_per_block=2,
62
- act_fn="silu",
63
- double_z=True,
64
- ):
65
- super().__init__()
66
- self.layers_per_block = layers_per_block
67
-
68
- self.conv_in = torch.nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1)
69
-
70
- self.mid_block = None
71
- self.down_blocks = nn.ModuleList([])
72
-
73
- # down
74
- output_channel = block_out_channels[0]
75
- for i, down_block_type in enumerate(down_block_types):
76
- input_channel = output_channel
77
- output_channel = block_out_channels[i]
78
- is_final_block = i == len(block_out_channels) - 1
79
-
80
- down_block = get_down_block(
81
- down_block_type,
82
- num_layers=self.layers_per_block,
83
- in_channels=input_channel,
84
- out_channels=output_channel,
85
- add_downsample=not is_final_block,
86
- resnet_eps=1e-6,
87
- downsample_padding=0,
88
- resnet_act_fn=act_fn,
89
- attn_num_head_channels=None,
90
- temb_channels=None,
91
- )
92
- self.down_blocks.append(down_block)
93
-
94
- # mid
95
- self.mid_block = UNetMidBlock2D(
96
- in_channels=block_out_channels[-1],
97
- resnet_eps=1e-6,
98
- resnet_act_fn=act_fn,
99
- output_scale_factor=1,
100
- resnet_time_scale_shift="default",
101
- attn_num_head_channels=None,
102
- resnet_groups=32,
103
- temb_channels=None,
104
- )
105
-
106
- # out
107
- num_groups_out = 32
108
- self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=num_groups_out, eps=1e-6)
109
- self.conv_act = nn.SiLU()
110
-
111
- conv_out_channels = 2 * out_channels if double_z else out_channels
112
- self.conv_out = nn.Conv2d(block_out_channels[-1], conv_out_channels, 3, padding=1)
113
-
114
- def forward(self, x):
115
- sample = x
116
- sample = self.conv_in(sample)
117
-
118
- # down
119
- for down_block in self.down_blocks:
120
- sample = down_block(sample)
121
-
122
- # middle
123
- sample = self.mid_block(sample)
124
-
125
- # post-process
126
- sample = self.conv_norm_out(sample)
127
- sample = self.conv_act(sample)
128
- sample = self.conv_out(sample)
129
-
130
- return sample
131
-
132
-
133
- class Decoder(nn.Module):
134
- def __init__(
135
- self,
136
- in_channels=3,
137
- out_channels=3,
138
- up_block_types=("UpDecoderBlock2D",),
139
- block_out_channels=(64,),
140
- layers_per_block=2,
141
- act_fn="silu",
142
- ):
143
- super().__init__()
144
- self.layers_per_block = layers_per_block
145
-
146
- self.conv_in = nn.Conv2d(in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1)
147
-
148
- self.mid_block = None
149
- self.up_blocks = nn.ModuleList([])
150
-
151
- # mid
152
- self.mid_block = UNetMidBlock2D(
153
- in_channels=block_out_channels[-1],
154
- resnet_eps=1e-6,
155
- resnet_act_fn=act_fn,
156
- output_scale_factor=1,
157
- resnet_time_scale_shift="default",
158
- attn_num_head_channels=None,
159
- resnet_groups=32,
160
- temb_channels=None,
161
- )
162
-
163
- # up
164
- reversed_block_out_channels = list(reversed(block_out_channels))
165
- output_channel = reversed_block_out_channels[0]
166
- for i, up_block_type in enumerate(up_block_types):
167
- prev_output_channel = output_channel
168
- output_channel = reversed_block_out_channels[i]
169
-
170
- is_final_block = i == len(block_out_channels) - 1
171
-
172
- up_block = get_up_block(
173
- up_block_type,
174
- num_layers=self.layers_per_block + 1,
175
- in_channels=prev_output_channel,
176
- out_channels=output_channel,
177
- prev_output_channel=None,
178
- add_upsample=not is_final_block,
179
- resnet_eps=1e-6,
180
- resnet_act_fn=act_fn,
181
- attn_num_head_channels=None,
182
- temb_channels=None,
183
- )
184
- self.up_blocks.append(up_block)
185
- prev_output_channel = output_channel
186
-
187
- # out
188
- num_groups_out = 32
189
- self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=num_groups_out, eps=1e-6)
190
- self.conv_act = nn.SiLU()
191
- self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)
192
-
193
- def forward(self, z):
194
- sample = z
195
- sample = self.conv_in(sample)
196
-
197
- # middle
198
- sample = self.mid_block(sample)
199
-
200
- # up
201
- for up_block in self.up_blocks:
202
- sample = up_block(sample)
203
-
204
- # post-process
205
- sample = self.conv_norm_out(sample)
206
- sample = self.conv_act(sample)
207
- sample = self.conv_out(sample)
208
-
209
- return sample
210
-
211
-
212
- class VectorQuantizer(nn.Module):
213
- """
214
- Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly avoids costly matrix
215
- multiplications and allows for post-hoc remapping of indices.
216
- """
217
-
218
- # NOTE: due to a bug the beta term was applied to the wrong term. for
219
- # backwards compatibility we use the buggy version by default, but you can
220
- # specify legacy=False to fix it.
221
- def __init__(self, n_e, e_dim, beta, remap=None, unknown_index="random", sane_index_shape=False, legacy=True):
222
- super().__init__()
223
- self.n_e = n_e
224
- self.e_dim = e_dim
225
- self.beta = beta
226
- self.legacy = legacy
227
-
228
- self.embedding = nn.Embedding(self.n_e, self.e_dim)
229
- self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
230
-
231
- self.remap = remap
232
- if self.remap is not None:
233
- self.register_buffer("used", torch.tensor(np.load(self.remap)))
234
- self.re_embed = self.used.shape[0]
235
- self.unknown_index = unknown_index # "random" or "extra" or integer
236
- if self.unknown_index == "extra":
237
- self.unknown_index = self.re_embed
238
- self.re_embed = self.re_embed + 1
239
- print(
240
- f"Remapping {self.n_e} indices to {self.re_embed} indices. "
241
- f"Using {self.unknown_index} for unknown indices."
242
- )
243
- else:
244
- self.re_embed = n_e
245
-
246
- self.sane_index_shape = sane_index_shape
247
-
248
- def remap_to_used(self, inds):
249
- ishape = inds.shape
250
- assert len(ishape) > 1
251
- inds = inds.reshape(ishape[0], -1)
252
- used = self.used.to(inds)
253
- match = (inds[:, :, None] == used[None, None, ...]).long()
254
- new = match.argmax(-1)
255
- unknown = match.sum(2) < 1
256
- if self.unknown_index == "random":
257
- new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(device=new.device)
258
- else:
259
- new[unknown] = self.unknown_index
260
- return new.reshape(ishape)
261
-
262
- def unmap_to_all(self, inds):
263
- ishape = inds.shape
264
- assert len(ishape) > 1
265
- inds = inds.reshape(ishape[0], -1)
266
- used = self.used.to(inds)
267
- if self.re_embed > self.used.shape[0]: # extra token
268
- inds[inds >= self.used.shape[0]] = 0 # simply set to zero
269
- back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds)
270
- return back.reshape(ishape)
271
-
272
- def forward(self, z):
273
- # reshape z -> (batch, height, width, channel) and flatten
274
- z = z.permute(0, 2, 3, 1).contiguous()
275
- z_flattened = z.view(-1, self.e_dim)
276
- # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
277
-
278
- d = (
279
- torch.sum(z_flattened**2, dim=1, keepdim=True)
280
- + torch.sum(self.embedding.weight**2, dim=1)
281
- - 2 * torch.einsum("bd,dn->bn", z_flattened, self.embedding.weight.t())
282
- )
283
-
284
- min_encoding_indices = torch.argmin(d, dim=1)
285
- z_q = self.embedding(min_encoding_indices).view(z.shape)
286
- perplexity = None
287
- min_encodings = None
288
-
289
- # compute loss for embedding
290
- if not self.legacy:
291
- loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + torch.mean((z_q - z.detach()) ** 2)
292
- else:
293
- loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean((z_q - z.detach()) ** 2)
294
-
295
- # preserve gradients
296
- z_q = z + (z_q - z).detach()
297
-
298
- # reshape back to match original input shape
299
- z_q = z_q.permute(0, 3, 1, 2).contiguous()
300
-
301
- if self.remap is not None:
302
- min_encoding_indices = min_encoding_indices.reshape(z.shape[0], -1) # add batch axis
303
- min_encoding_indices = self.remap_to_used(min_encoding_indices)
304
- min_encoding_indices = min_encoding_indices.reshape(-1, 1) # flatten
305
-
306
- if self.sane_index_shape:
307
- min_encoding_indices = min_encoding_indices.reshape(z_q.shape[0], z_q.shape[2], z_q.shape[3])
308
-
309
- return z_q, loss, (perplexity, min_encodings, min_encoding_indices)
310
-
311
- def get_codebook_entry(self, indices, shape):
312
- # shape specifying (batch, height, width, channel)
313
- if self.remap is not None:
314
- indices = indices.reshape(shape[0], -1) # add batch axis
315
- indices = self.unmap_to_all(indices)
316
- indices = indices.reshape(-1) # flatten again
317
-
318
- # get quantized latent vectors
319
- z_q = self.embedding(indices)
320
-
321
- if shape is not None:
322
- z_q = z_q.view(shape)
323
- # reshape back to match original input shape
324
- z_q = z_q.permute(0, 3, 1, 2).contiguous()
325
-
326
- return z_q
327
-
328
-
329
- class DiagonalGaussianDistribution(object):
330
- def __init__(self, parameters, deterministic=False):
331
- self.parameters = parameters
332
- self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
333
- self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
334
- self.deterministic = deterministic
335
- self.std = torch.exp(0.5 * self.logvar)
336
- self.var = torch.exp(self.logvar)
337
- if self.deterministic:
338
- self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)
339
-
340
- def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor:
341
- device = self.parameters.device
342
- sample_device = "cpu" if device.type == "mps" else device
343
- sample = torch.randn(self.mean.shape, generator=generator, device=sample_device).to(device)
344
- x = self.mean + self.std * sample
345
- return x
346
-
347
- def kl(self, other=None):
348
- if self.deterministic:
349
- return torch.Tensor([0.0])
350
- else:
351
- if other is None:
352
- return 0.5 * torch.sum(torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, dim=[1, 2, 3])
353
- else:
354
- return 0.5 * torch.sum(
355
- torch.pow(self.mean - other.mean, 2) / other.var
356
- + self.var / other.var
357
- - 1.0
358
- - self.logvar
359
- + other.logvar,
360
- dim=[1, 2, 3],
361
- )
362
-
363
- def nll(self, sample, dims=[1, 2, 3]):
364
- if self.deterministic:
365
- return torch.Tensor([0.0])
366
- logtwopi = np.log(2.0 * np.pi)
367
- return 0.5 * torch.sum(logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, dim=dims)
368
-
369
- def mode(self):
370
- return self.mean
371
-
372
-
373
- class VQModel(ModelMixin, ConfigMixin):
374
- r"""VQ-VAE model from the paper Neural Discrete Representation Learning by Aaron van den Oord, Oriol Vinyals and Koray
375
- Kavukcuoglu.
376
-
377
- This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
378
- implements for all the model (such as downloading or saving, etc.)
379
-
380
- Parameters:
381
- in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
382
- out_channels (int, *optional*, defaults to 3): Number of channels in the output.
383
- down_block_types (`Tuple[str]`, *optional*, defaults to :
384
- obj:`("DownEncoderBlock2D",)`): Tuple of downsample block types.
385
- up_block_types (`Tuple[str]`, *optional*, defaults to :
386
- obj:`("UpDecoderBlock2D",)`): Tuple of upsample block types.
387
- block_out_channels (`Tuple[int]`, *optional*, defaults to :
388
- obj:`(64,)`): Tuple of block output channels.
389
- act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
390
- latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space.
391
- sample_size (`int`, *optional*, defaults to `32`): TODO
392
- num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE.
393
- """
394
-
395
- @register_to_config
396
- def __init__(
397
- self,
398
- in_channels: int = 3,
399
- out_channels: int = 3,
400
- down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
401
- up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
402
- block_out_channels: Tuple[int] = (64,),
403
- layers_per_block: int = 1,
404
- act_fn: str = "silu",
405
- latent_channels: int = 3,
406
- sample_size: int = 32,
407
- num_vq_embeddings: int = 256,
408
- ):
409
- super().__init__()
410
-
411
- # pass init params to Encoder
412
- self.encoder = Encoder(
413
- in_channels=in_channels,
414
- out_channels=latent_channels,
415
- down_block_types=down_block_types,
416
- block_out_channels=block_out_channels,
417
- layers_per_block=layers_per_block,
418
- act_fn=act_fn,
419
- double_z=False,
420
- )
421
-
422
- self.quant_conv = torch.nn.Conv2d(latent_channels, latent_channels, 1)
423
- self.quantize = VectorQuantizer(
424
- num_vq_embeddings, latent_channels, beta=0.25, remap=None, sane_index_shape=False
425
- )
426
- self.post_quant_conv = torch.nn.Conv2d(latent_channels, latent_channels, 1)
427
-
428
- # pass init params to Decoder
429
- self.decoder = Decoder(
430
- in_channels=latent_channels,
431
- out_channels=out_channels,
432
- up_block_types=up_block_types,
433
- block_out_channels=block_out_channels,
434
- layers_per_block=layers_per_block,
435
- act_fn=act_fn,
436
- )
437
-
438
- def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput:
439
- h = self.encoder(x)
440
- h = self.quant_conv(h)
441
-
442
- if not return_dict:
443
- return (h,)
444
-
445
- return VQEncoderOutput(latents=h)
446
-
447
- def decode(
448
- self, h: torch.FloatTensor, force_not_quantize: bool = False, return_dict: bool = True
449
- ) -> Union[DecoderOutput, torch.FloatTensor]:
450
- # also go through quantization layer
451
- if not force_not_quantize:
452
- quant, emb_loss, info = self.quantize(h)
453
- else:
454
- quant = h
455
- quant = self.post_quant_conv(quant)
456
- dec = self.decoder(quant)
457
-
458
- return dec
459
-
460
- # if not return_dict:
461
- # return (dec,)
462
- #
463
- # return DecoderOutput(sample=dec)
464
-
465
- def forward(self, sample: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
466
- r"""
467
- Args:
468
- sample (`torch.FloatTensor`): Input sample.
469
- return_dict (`bool`, *optional*, defaults to `True`):
470
- Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
471
- """
472
- x = sample
473
- h = self.encode(x).latents
474
- dec = self.decode(h).sample
475
-
476
- if not return_dict:
477
- return (dec,)
478
-
479
- return DecoderOutput(sample=dec)
480
-
481
-
482
- class AutoencoderKL(ModelMixin, ConfigMixin):
483
- r"""Variational Autoencoder (VAE) model with KL loss from the paper Auto-Encoding Variational Bayes by Diederik P. Kingma
484
- and Max Welling.
485
-
486
- This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
487
- implements for all the model (such as downloading or saving, etc.)
488
-
489
- Parameters:
490
- in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
491
- out_channels (int, *optional*, defaults to 3): Number of channels in the output.
492
- down_block_types (`Tuple[str]`, *optional*, defaults to :
493
- obj:`("DownEncoderBlock2D",)`): Tuple of downsample block types.
494
- up_block_types (`Tuple[str]`, *optional*, defaults to :
495
- obj:`("UpDecoderBlock2D",)`): Tuple of upsample block types.
496
- block_out_channels (`Tuple[int]`, *optional*, defaults to :
497
- obj:`(64,)`): Tuple of block output channels.
498
- act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
499
- latent_channels (`int`, *optional*, defaults to `4`): Number of channels in the latent space.
500
- sample_size (`int`, *optional*, defaults to `32`): TODO
501
- """
502
-
503
- @register_to_config
504
- def __init__(
505
- self,
506
- in_channels: int = 3,
507
- out_channels: int = 3,
508
- down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
509
- up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
510
- block_out_channels: Tuple[int] = (64,),
511
- layers_per_block: int = 1,
512
- act_fn: str = "silu",
513
- latent_channels: int = 4,
514
- sample_size: int = 32,
515
- ):
516
- super().__init__()
517
-
518
- # pass init params to Encoder
519
- self.encoder = Encoder(
520
- in_channels=in_channels,
521
- out_channels=latent_channels,
522
- down_block_types=down_block_types,
523
- block_out_channels=block_out_channels,
524
- layers_per_block=layers_per_block,
525
- act_fn=act_fn,
526
- double_z=True,
527
- )
528
-
529
- # pass init params to Decoder
530
- self.decoder = Decoder(
531
- in_channels=latent_channels,
532
- out_channels=out_channels,
533
- up_block_types=up_block_types,
534
- block_out_channels=block_out_channels,
535
- layers_per_block=layers_per_block,
536
- act_fn=act_fn,
537
- )
538
-
539
- self.quant_conv = torch.nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)
540
- self.post_quant_conv = torch.nn.Conv2d(latent_channels, latent_channels, 1)
541
-
542
- def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
543
- h = self.encoder(x)
544
- moments = self.quant_conv(h)
545
- posterior = DiagonalGaussianDistribution(moments)
546
-
547
- if not return_dict:
548
- return (posterior,)
549
-
550
- return AutoencoderKLOutput(latent_dist=posterior)
551
-
552
- def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
553
- z = self.post_quant_conv(z)
554
- dec = self.decoder(z)
555
-
556
- return dec
557
- #
558
- # if not return_dict:
559
- # return (dec,)
560
- #
561
- # return DecoderOutput(sample=dec)
562
-
563
- def forward(
564
- self, sample: torch.FloatTensor, sample_posterior: bool = False, return_dict: bool = True
565
- ) -> Union[DecoderOutput, torch.FloatTensor]:
566
- r"""
567
- Args:
568
- sample (`torch.FloatTensor`): Input sample.
569
- sample_posterior (`bool`, *optional*, defaults to `False`):
570
- Whether to sample from the posterior.
571
- return_dict (`bool`, *optional*, defaults to `True`):
572
- Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
573
- """
574
- x = sample
575
- posterior = self.encode(x).latent_dist
576
- if sample_posterior:
577
- z = posterior.sample()
578
- else:
579
- z = posterior.mode()
580
- dec = self.decode(z).sample
581
-
582
- if not return_dict:
583
- return (dec,)
584
-
585
- return DecoderOutput(sample=dec)
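For reference, a minimal sketch of the encode/decode round trip exposed by the `AutoencoderKL` deleted above, assuming the vendored module is importable as `diffusers.models.vae`. The block layout and channel counts below follow the usual Stable Diffusion VAE configuration and are illustrative only; note that this fork's `decode` returns the raw tensor rather than a `DecoderOutput` wrapper, as seen in the diff.

    import torch
    from diffusers.models.vae import AutoencoderKL

    # Illustrative SD-style VAE configuration (assumed values); weights are random,
    # so the reconstruction is noise.
    vae = AutoencoderKL(
        down_block_types=("DownEncoderBlock2D",) * 4,
        up_block_types=("UpDecoderBlock2D",) * 4,
        block_out_channels=(128, 256, 512, 512),
        layers_per_block=2,
        latent_channels=4,
    )

    image = torch.randn(1, 3, 256, 256)
    posterior = vae.encode(image).latent_dist   # DiagonalGaussianDistribution
    latents = posterior.sample()                # or posterior.mode() for the mean
    recon = vae.decode(latents)                 # this fork returns the tensor directly,
                                                # not a DecoderOutput wrapper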
diffusers/onnx_utils.py DELETED
@@ -1,189 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2022 The HuggingFace Inc. team.
3
- # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
-
18
- import os
19
- import shutil
20
- from pathlib import Path
21
- from typing import Optional, Union
22
-
23
- import numpy as np
24
-
25
- from huggingface_hub import hf_hub_download
26
-
27
- from .utils import is_onnx_available, logging
28
-
29
-
30
- if is_onnx_available():
31
- import onnxruntime as ort
32
-
33
-
34
- ONNX_WEIGHTS_NAME = "model.onnx"
35
-
36
-
37
- logger = logging.get_logger(__name__)
38
-
39
-
40
- class OnnxRuntimeModel:
41
- base_model_prefix = "onnx_model"
42
-
43
- def __init__(self, model=None, **kwargs):
44
- logger.info("`diffusers.OnnxRuntimeModel` is experimental and might change in the future.")
45
- self.model = model
46
- self.model_save_dir = kwargs.get("model_save_dir", None)
47
- self.latest_model_name = kwargs.get("latest_model_name", "model.onnx")
48
-
49
- def __call__(self, **kwargs):
50
- inputs = {k: np.array(v) for k, v in kwargs.items()}
51
- return self.model.run(None, inputs)
52
-
53
- @staticmethod
54
- def load_model(path: Union[str, Path], provider=None):
55
- """
56
- Loads an ONNX Inference session with an ExecutionProvider. Default provider is `CPUExecutionProvider`
57
-
58
- Arguments:
59
- path (`str` or `Path`):
60
- Directory from which to load
61
- provider(`str`, *optional*):
62
- Onnxruntime execution provider to use for loading the model, defaults to `CPUExecutionProvider`
63
- """
64
- if provider is None:
65
- logger.info("No onnxruntime provider specified, using CPUExecutionProvider")
66
- provider = "CPUExecutionProvider"
67
-
68
- return ort.InferenceSession(path, providers=[provider])
69
-
70
- def _save_pretrained(self, save_directory: Union[str, Path], file_name: Optional[str] = None, **kwargs):
71
- """
72
- Save a model and its configuration file to a directory, so that it can be re-loaded using the
73
- [`~optimum.onnxruntime.modeling_ort.ORTModel.from_pretrained`] class method. It will always save the
74
- latest_model_name.
75
-
76
- Arguments:
77
- save_directory (`str` or `Path`):
78
- Directory where to save the model file.
79
- file_name(`str`, *optional*):
80
- Overwrites the default model file name from `"model.onnx"` to `file_name`. This allows you to save the
81
- model with a different name.
82
- """
83
- model_file_name = file_name if file_name is not None else ONNX_WEIGHTS_NAME
84
-
85
- src_path = self.model_save_dir.joinpath(self.latest_model_name)
86
- dst_path = Path(save_directory).joinpath(model_file_name)
87
- if not src_path.samefile(dst_path):
88
- shutil.copyfile(src_path, dst_path)
89
-
90
- def save_pretrained(
91
- self,
92
- save_directory: Union[str, os.PathLike],
93
- **kwargs,
94
- ):
95
- """
96
- Save a model to a directory, so that it can be re-loaded using the [`~OnnxModel.from_pretrained`] class
97
- method.
98
-
99
- Arguments:
100
- save_directory (`str` or `os.PathLike`):
101
- Directory to which to save. Will be created if it doesn't exist.
102
- """
103
- if os.path.isfile(save_directory):
104
- logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
105
- return
106
-
107
- os.makedirs(save_directory, exist_ok=True)
108
-
109
- # saving model weights/files
110
- self._save_pretrained(save_directory, **kwargs)
111
-
112
- @classmethod
113
- def _from_pretrained(
114
- cls,
115
- model_id: Union[str, Path],
116
- use_auth_token: Optional[Union[bool, str, None]] = None,
117
- revision: Optional[Union[str, None]] = None,
118
- force_download: bool = False,
119
- cache_dir: Optional[str] = None,
120
- file_name: Optional[str] = None,
121
- provider: Optional[str] = None,
122
- **kwargs,
123
- ):
124
- """
125
- Load a model from a directory or the HF Hub.
126
-
127
- Arguments:
128
- model_id (`str` or `Path`):
129
- Directory from which to load
130
- use_auth_token (`str` or `bool`):
131
- Is needed to load models from a private or gated repository
132
- revision (`str`):
133
- Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id
134
- cache_dir (`Union[str, Path]`, *optional*):
135
- Path to a directory in which a downloaded pretrained model configuration should be cached if the
136
- standard cache should not be used.
137
- force_download (`bool`, *optional*, defaults to `False`):
138
- Whether or not to force the (re-)download of the model weights and configuration files, overriding the
139
- cached versions if they exist.
140
- file_name(`str`):
141
- Overwrites the default model file name from `"model.onnx"` to `file_name`. This allows you to load
142
- different model files from the same repository or directory.
143
- provider(`str`):
144
- The ONNX runtime provider, e.g. `CPUExecutionProvider` or `CUDAExecutionProvider`.
145
- kwargs (`Dict`, *optional*):
146
- kwargs will be passed to the model during initialization
147
- """
148
- model_file_name = file_name if file_name is not None else ONNX_WEIGHTS_NAME
149
- # load model from local directory
150
- if os.path.isdir(model_id):
151
- model = OnnxRuntimeModel.load_model(os.path.join(model_id, model_file_name), provider=provider)
152
- kwargs["model_save_dir"] = Path(model_id)
153
- # load model from hub
154
- else:
155
- # download model
156
- model_cache_path = hf_hub_download(
157
- repo_id=model_id,
158
- filename=model_file_name,
159
- use_auth_token=use_auth_token,
160
- revision=revision,
161
- cache_dir=cache_dir,
162
- force_download=force_download,
163
- )
164
- kwargs["model_save_dir"] = Path(model_cache_path).parent
165
- kwargs["latest_model_name"] = Path(model_cache_path).name
166
- model = OnnxRuntimeModel.load_model(model_cache_path, provider=provider)
167
- return cls(model=model, **kwargs)
168
-
169
- @classmethod
170
- def from_pretrained(
171
- cls,
172
- model_id: Union[str, Path],
173
- force_download: bool = True,
174
- use_auth_token: Optional[str] = None,
175
- cache_dir: Optional[str] = None,
176
- **model_kwargs,
177
- ):
178
- revision = None
179
- if len(str(model_id).split("@")) == 2:
180
- model_id, revision = model_id.split("@")
181
-
182
- return cls._from_pretrained(
183
- model_id=model_id,
184
- revision=revision,
185
- cache_dir=cache_dir,
186
- force_download=force_download,
187
- use_auth_token=use_auth_token,
188
- **model_kwargs,
189
- )
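As a reference for the `OnnxRuntimeModel` wrapper deleted above, a hedged sketch that exports a toy PyTorch module to ONNX and runs it through the wrapper; the module and paths are placeholders, and it assumes the upstream `diffusers` package plus `onnx`/`onnxruntime` are installed.

```python
# Hypothetical usage sketch: export a toy module to ONNX, then call it through
# the OnnxRuntimeModel wrapper shown above.
import os
import tempfile

import torch
from diffusers import OnnxRuntimeModel  # upstream package, not this deleted copy

net = torch.nn.Linear(4, 2)
onnx_path = os.path.join(tempfile.mkdtemp(), "model.onnx")
torch.onnx.export(net, torch.randn(1, 4), onnx_path, input_names=["x"], output_names=["y"])

session = OnnxRuntimeModel.load_model(onnx_path, provider="CPUExecutionProvider")
model = OnnxRuntimeModel(session)

# __call__ converts keyword arguments to numpy arrays and feeds them to session.run
(y,) = model(x=torch.randn(1, 4).numpy())
print(y.shape)  # (1, 2)
```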
diffusers/optimization.py DELETED
@@ -1,275 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2022 The HuggingFace Inc. team.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """PyTorch optimization for diffusion models."""
16
-
17
- import math
18
- from enum import Enum
19
- from typing import Optional, Union
20
-
21
- from torch.optim import Optimizer
22
- from torch.optim.lr_scheduler import LambdaLR
23
-
24
- from .utils import logging
25
-
26
-
27
- logger = logging.get_logger(__name__)
28
-
29
-
30
- class SchedulerType(Enum):
31
- LINEAR = "linear"
32
- COSINE = "cosine"
33
- COSINE_WITH_RESTARTS = "cosine_with_restarts"
34
- POLYNOMIAL = "polynomial"
35
- CONSTANT = "constant"
36
- CONSTANT_WITH_WARMUP = "constant_with_warmup"
37
-
38
-
39
- def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1):
40
- """
41
- Create a schedule with a constant learning rate, using the learning rate set in optimizer.
42
-
43
- Args:
44
- optimizer ([`~torch.optim.Optimizer`]):
45
- The optimizer for which to schedule the learning rate.
46
- last_epoch (`int`, *optional*, defaults to -1):
47
- The index of the last epoch when resuming training.
48
-
49
- Return:
50
- `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
51
- """
52
- return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)
53
-
54
-
55
- def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, last_epoch: int = -1):
56
- """
57
- Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate
58
- increases linearly between 0 and the initial lr set in the optimizer.
59
-
60
- Args:
61
- optimizer ([`~torch.optim.Optimizer`]):
62
- The optimizer for which to schedule the learning rate.
63
- num_warmup_steps (`int`):
64
- The number of steps for the warmup phase.
65
- last_epoch (`int`, *optional*, defaults to -1):
66
- The index of the last epoch when resuming training.
67
-
68
- Return:
69
- `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
70
- """
71
-
72
- def lr_lambda(current_step: int):
73
- if current_step < num_warmup_steps:
74
- return float(current_step) / float(max(1.0, num_warmup_steps))
75
- return 1.0
76
-
77
- return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
78
-
79
-
80
- def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
81
- """
82
- Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
83
- a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
84
-
85
- Args:
86
- optimizer ([`~torch.optim.Optimizer`]):
87
- The optimizer for which to schedule the learning rate.
88
- num_warmup_steps (`int`):
89
- The number of steps for the warmup phase.
90
- num_training_steps (`int`):
91
- The total number of training steps.
92
- last_epoch (`int`, *optional*, defaults to -1):
93
- The index of the last epoch when resuming training.
94
-
95
- Return:
96
- `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
97
- """
98
-
99
- def lr_lambda(current_step: int):
100
- if current_step < num_warmup_steps:
101
- return float(current_step) / float(max(1, num_warmup_steps))
102
- return max(
103
- 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
104
- )
105
-
106
- return LambdaLR(optimizer, lr_lambda, last_epoch)
107
-
108
-
109
- def get_cosine_schedule_with_warmup(
110
- optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1
111
- ):
112
- """
113
- Create a schedule with a learning rate that decreases following the values of the cosine function from the
114
- initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
115
- initial lr set in the optimizer.
116
-
117
- Args:
118
- optimizer ([`~torch.optim.Optimizer`]):
119
- The optimizer for which to schedule the learning rate.
120
- num_warmup_steps (`int`):
121
- The number of steps for the warmup phase.
122
- num_training_steps (`int`):
123
- The total number of training steps.
124
- num_cycles (`float`, *optional*, defaults to 0.5):
125
- The number of waves in the cosine schedule (the default is to just decrease from the max value to 0
126
- following a half-cosine).
127
- last_epoch (`int`, *optional*, defaults to -1):
128
- The index of the last epoch when resuming training.
129
-
130
- Return:
131
- `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
132
- """
133
-
134
- def lr_lambda(current_step):
135
- if current_step < num_warmup_steps:
136
- return float(current_step) / float(max(1, num_warmup_steps))
137
- progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
138
- return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
139
-
140
- return LambdaLR(optimizer, lr_lambda, last_epoch)
141
-
142
-
143
- def get_cosine_with_hard_restarts_schedule_with_warmup(
144
- optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1
145
- ):
146
- """
147
- Create a schedule with a learning rate that decreases following the values of the cosine function from the
148
- initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases
149
- linearly between 0 and the initial lr set in the optimizer.
150
-
151
- Args:
152
- optimizer ([`~torch.optim.Optimizer`]):
153
- The optimizer for which to schedule the learning rate.
154
- num_warmup_steps (`int`):
155
- The number of steps for the warmup phase.
156
- num_training_steps (`int`):
157
- The total number of training steps.
158
- num_cycles (`int`, *optional*, defaults to 1):
159
- The number of hard restarts to use.
160
- last_epoch (`int`, *optional*, defaults to -1):
161
- The index of the last epoch when resuming training.
162
-
163
- Return:
164
- `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
165
- """
166
-
167
- def lr_lambda(current_step):
168
- if current_step < num_warmup_steps:
169
- return float(current_step) / float(max(1, num_warmup_steps))
170
- progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
171
- if progress >= 1.0:
172
- return 0.0
173
- return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))))
174
-
175
- return LambdaLR(optimizer, lr_lambda, last_epoch)
176
-
177
-
178
- def get_polynomial_decay_schedule_with_warmup(
179
- optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
180
- ):
181
- """
182
- Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the
183
- optimizer to end lr defined by *lr_end*, after a warmup period during which it increases linearly from 0 to the
184
- initial lr set in the optimizer.
185
-
186
- Args:
187
- optimizer ([`~torch.optim.Optimizer`]):
188
- The optimizer for which to schedule the learning rate.
189
- num_warmup_steps (`int`):
190
- The number of steps for the warmup phase.
191
- num_training_steps (`int`):
192
- The total number of training steps.
193
- lr_end (`float`, *optional*, defaults to 1e-7):
194
- The end LR.
195
- power (`float`, *optional*, defaults to 1.0):
196
- Power factor.
197
- last_epoch (`int`, *optional*, defaults to -1):
198
- The index of the last epoch when resuming training.
199
-
200
- Note: *power* defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT
201
- implementation at
202
- https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37
203
-
204
- Return:
205
- `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
206
-
207
- """
208
-
209
- lr_init = optimizer.defaults["lr"]
210
- if not (lr_init > lr_end):
211
- raise ValueError(f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})")
212
-
213
- def lr_lambda(current_step: int):
214
- if current_step < num_warmup_steps:
215
- return float(current_step) / float(max(1, num_warmup_steps))
216
- elif current_step > num_training_steps:
217
- return lr_end / lr_init # as LambdaLR multiplies by lr_init
218
- else:
219
- lr_range = lr_init - lr_end
220
- decay_steps = num_training_steps - num_warmup_steps
221
- pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps
222
- decay = lr_range * pct_remaining**power + lr_end
223
- return decay / lr_init # as LambdaLR multiplies by lr_init
224
-
225
- return LambdaLR(optimizer, lr_lambda, last_epoch)
226
-
227
-
228
- TYPE_TO_SCHEDULER_FUNCTION = {
229
- SchedulerType.LINEAR: get_linear_schedule_with_warmup,
230
- SchedulerType.COSINE: get_cosine_schedule_with_warmup,
231
- SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup,
232
- SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup,
233
- SchedulerType.CONSTANT: get_constant_schedule,
234
- SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup,
235
- }
236
-
237
-
238
- def get_scheduler(
239
- name: Union[str, SchedulerType],
240
- optimizer: Optimizer,
241
- num_warmup_steps: Optional[int] = None,
242
- num_training_steps: Optional[int] = None,
243
- ):
244
- """
245
- Unified API to get any scheduler from its name.
246
-
247
- Args:
248
- name (`str` or `SchedulerType`):
249
- The name of the scheduler to use.
250
- optimizer (`torch.optim.Optimizer`):
251
- The optimizer that will be used during training.
252
- num_warmup_steps (`int`, *optional*):
253
- The number of warmup steps to do. This is not required by all schedulers (hence the argument being
254
- optional), the function will raise an error if it's unset and the scheduler type requires it.
255
- num_training_steps (`int`, *optional*):
256
- The number of training steps to do. This is not required by all schedulers (hence the argument being
257
- optional), the function will raise an error if it's unset and the scheduler type requires it.
258
- """
259
- name = SchedulerType(name)
260
- schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
261
- if name == SchedulerType.CONSTANT:
262
- return schedule_func(optimizer)
263
-
264
- # All other schedulers require `num_warmup_steps`
265
- if num_warmup_steps is None:
266
- raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
267
-
268
- if name == SchedulerType.CONSTANT_WITH_WARMUP:
269
- return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)
270
-
271
- # All other schedulers require `num_training_steps`
272
- if num_training_steps is None:
273
- raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
274
-
275
- return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
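To make the deleted schedule helpers concrete, a small illustrative sketch of `get_scheduler` driving a warmup-then-cosine learning-rate schedule; the model, learning rate, and step counts are placeholders, and the import assumes the upstream `diffusers` package.

```python
# Illustrative sketch: warmup + cosine decay via get_scheduler, as documented above.
import torch
from diffusers.optimization import get_scheduler  # upstream package, not this deleted copy

model = torch.nn.Linear(16, 16)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

lr_scheduler = get_scheduler(
    "cosine",                  # resolved through TYPE_TO_SCHEDULER_FUNCTION
    optimizer=optimizer,
    num_warmup_steps=500,
    num_training_steps=10_000,
)

for _ in range(10_000):
    # ... the forward/backward pass of a real training loop would go here ...
    optimizer.step()
    lr_scheduler.step()        # one scheduler step per optimizer step
    optimizer.zero_grad()
```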
diffusers/pipeline_utils.py DELETED
@@ -1,417 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2022 The HuggingFace Inc. team.
3
- # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
- import importlib
18
- import inspect
19
- import os
20
- from dataclasses import dataclass
21
- from typing import List, Optional, Union
22
-
23
- import numpy as np
24
- import torch
25
-
26
- import diffusers
27
- import PIL
28
- from huggingface_hub import snapshot_download
29
- from PIL import Image
30
- from tqdm.auto import tqdm
31
-
32
- from .configuration_utils import ConfigMixin
33
- from .utils import DIFFUSERS_CACHE, BaseOutput, logging
34
-
35
-
36
- INDEX_FILE = "diffusion_pytorch_model.bin"
37
-
38
-
39
- logger = logging.get_logger(__name__)
40
-
41
-
42
- LOADABLE_CLASSES = {
43
- "diffusers": {
44
- "ModelMixin": ["save_pretrained", "from_pretrained"],
45
- "SchedulerMixin": ["save_config", "from_config"],
46
- "DiffusionPipeline": ["save_pretrained", "from_pretrained"],
47
- "OnnxRuntimeModel": ["save_pretrained", "from_pretrained"],
48
- },
49
- "transformers": {
50
- "PreTrainedTokenizer": ["save_pretrained", "from_pretrained"],
51
- "PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"],
52
- "PreTrainedModel": ["save_pretrained", "from_pretrained"],
53
- "FeatureExtractionMixin": ["save_pretrained", "from_pretrained"],
54
- },
55
- }
56
-
57
- ALL_IMPORTABLE_CLASSES = {}
58
- for library in LOADABLE_CLASSES:
59
- ALL_IMPORTABLE_CLASSES.update(LOADABLE_CLASSES[library])
60
-
61
-
62
- @dataclass
63
- class ImagePipelineOutput(BaseOutput):
64
- """
65
- Output class for image pipelines.
66
-
67
- Args:
68
- images (`List[PIL.Image.Image]` or `np.ndarray`)
69
- List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
70
- num_channels)`. The PIL images or NumPy array represent the denoised images of the diffusion pipeline.
71
- """
72
-
73
- images: Union[List[PIL.Image.Image], np.ndarray]
74
-
75
-
76
- class DiffusionPipeline(ConfigMixin):
77
- r"""
78
- Base class for all pipelines.
79
-
80
- [`DiffusionPipeline`] takes care of storing all components (models, schedulers, processors) for diffusion pipelines
81
- and handles methods for loading, downloading and saving models as well as a few methods common to all pipelines to:
82
-
83
- - move all PyTorch modules to the device of your choice
84
- enable/disable the progress bar for the denoising iteration
85
-
86
- Class attributes:
87
-
88
- - **config_name** ([`str`]) -- name of the config file that will store the class and module names of all
89
- components of the diffusion pipeline.
90
- """
91
- config_name = "model_index.json"
92
-
93
- def register_modules(self, **kwargs):
94
- # import it here to avoid circular import
95
- from diffusers import pipelines
96
-
97
- for name, module in kwargs.items():
98
- # retrieve library
99
- library = module.__module__.split(".")[0]
100
-
101
- # check if the module is a pipeline module
102
- pipeline_dir = module.__module__.split(".")[-2]
103
- path = module.__module__.split(".")
104
- is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir)
105
-
106
- # if library is not in LOADABLE_CLASSES, then it is a custom module.
107
- # Or if it's a pipeline module, then the module is inside the pipeline
108
- # folder so we set the library to module name.
109
- if library not in LOADABLE_CLASSES or is_pipeline_module:
110
- library = pipeline_dir
111
-
112
- # retrieve class_name
113
- class_name = module.__class__.__name__
114
-
115
- register_dict = {name: (library, class_name)}
116
-
117
- # save model index config
118
- self.register_to_config(**register_dict)
119
-
120
- # set models
121
- setattr(self, name, module)
122
-
123
- def save_pretrained(self, save_directory: Union[str, os.PathLike]):
124
- """
125
- Save all variables of the pipeline that can be saved and loaded as well as the pipeline's configuration file to
126
- a directory. A pipeline variable can be saved and loaded if its class implements both a save and loading
127
- method. The pipeline can easily be re-loaded using the [`~DiffusionPipeline.from_pretrained`] class method.
128
-
129
- Arguments:
130
- save_directory (`str` or `os.PathLike`):
131
- Directory to which to save. Will be created if it doesn't exist.
132
- """
133
- self.save_config(save_directory)
134
-
135
- model_index_dict = dict(self.config)
136
- model_index_dict.pop("_class_name")
137
- model_index_dict.pop("_diffusers_version")
138
- model_index_dict.pop("_module", None)
139
-
140
- for pipeline_component_name in model_index_dict.keys():
141
- sub_model = getattr(self, pipeline_component_name)
142
- model_cls = sub_model.__class__
143
-
144
- save_method_name = None
145
- # search for the model's base class in LOADABLE_CLASSES
146
- for library_name, library_classes in LOADABLE_CLASSES.items():
147
- library = importlib.import_module(library_name)
148
- for base_class, save_load_methods in library_classes.items():
149
- class_candidate = getattr(library, base_class)
150
- if issubclass(model_cls, class_candidate):
151
- # if we found a suitable base class in LOADABLE_CLASSES then grab its save method
152
- save_method_name = save_load_methods[0]
153
- break
154
- if save_method_name is not None:
155
- break
156
-
157
- save_method = getattr(sub_model, save_method_name)
158
- save_method(os.path.join(save_directory, pipeline_component_name))
159
-
160
- def to(self, torch_device: Optional[Union[str, torch.device]] = None):
161
- if torch_device is None:
162
- return self
163
-
164
- module_names, _ = self.extract_init_dict(dict(self.config))
165
- for name in module_names.keys():
166
- module = getattr(self, name)
167
- if isinstance(module, torch.nn.Module):
168
- module.to(torch_device)
169
- return self
170
-
171
- @property
172
- def device(self) -> torch.device:
173
- r"""
174
- Returns:
175
- `torch.device`: The torch device on which the pipeline is located.
176
- """
177
- module_names, _ = self.extract_init_dict(dict(self.config))
178
- for name in module_names.keys():
179
- module = getattr(self, name)
180
- if isinstance(module, torch.nn.Module):
181
- return module.device
182
- return torch.device("cpu")
183
-
184
- @classmethod
185
- def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
186
- r"""
187
- Instantiate a PyTorch diffusion pipeline from pre-trained pipeline weights.
188
-
189
- The pipeline is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated).
190
-
191
- The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
192
- pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
193
- task.
194
-
195
- The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
196
- weights are discarded.
197
-
198
- Parameters:
199
- pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
200
- Can be either:
201
-
202
- - A string, the *repo id* of a pretrained pipeline hosted inside a model repo on
203
- https://huggingface.co/ Valid repo ids have to be located under a user or organization name, like
204
- `CompVis/ldm-text2im-large-256`.
205
- - A path to a *directory* containing pipeline weights saved using
206
- [`~DiffusionPipeline.save_pretrained`], e.g., `./my_pipeline_directory/`.
207
- torch_dtype (`str` or `torch.dtype`, *optional*):
208
- Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
209
- will be automatically derived from the model's weights.
210
- force_download (`bool`, *optional*, defaults to `False`):
211
- Whether or not to force the (re-)download of the model weights and configuration files, overriding the
212
- cached versions if they exist.
213
- resume_download (`bool`, *optional*, defaults to `False`):
214
- Whether or not to delete incompletely received files. Will attempt to resume the download if such a
215
- file exists.
216
- proxies (`Dict[str, str]`, *optional*):
217
- A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
218
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
219
- output_loading_info(`bool`, *optional*, defaults to `False`):
220
- Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
221
- local_files_only(`bool`, *optional*, defaults to `False`):
222
- Whether or not to only look at local files (i.e., do not try to download the model).
223
- use_auth_token (`str` or *bool*, *optional*):
224
- The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
225
- when running `huggingface-cli login` (stored in `~/.huggingface`).
226
- revision (`str`, *optional*, defaults to `"main"`):
227
- The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
228
- git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
229
- identifier allowed by git.
230
- mirror (`str`, *optional*):
231
- Mirror source to accelerate downloads in China. If you are from China and have an accessibility
232
- problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
233
- Please refer to the mirror site for more information. Specify the folder name here.
234
-
235
- kwargs (remaining dictionary of keyword arguments, *optional*):
236
- Can be used to overwrite load- and saveable variables - *i.e.* the pipeline components - of the
237
- specific pipeline class. The overwritten components are then directly passed to the pipeline's `__init__`
238
- method. See example below for more information.
239
-
240
- <Tip>
241
-
242
- Passing `use_auth_token=True` is required when you want to use a private model, *e.g.*
243
- `"CompVis/stable-diffusion-v1-4"`
244
-
245
- </Tip>
246
-
247
- <Tip>
248
-
249
- Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use
250
- this method in a firewalled environment.
251
-
252
- </Tip>
253
-
254
- Examples:
255
-
256
- ```py
257
- >>> from diffusers import DiffusionPipeline
258
-
259
- >>> # Download pipeline from huggingface.co and cache.
260
- >>> pipeline = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
261
-
262
- >>> # Download pipeline that requires an authorization token
263
- >>> # For more information on access tokens, please refer to [this section
264
- >>> # of the documentation](https://huggingface.co/docs/hub/security-tokens)
265
- >>> pipeline = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=True)
266
-
267
- >>> # Download pipeline, but overwrite scheduler
268
- >>> from diffusers import LMSDiscreteScheduler
269
-
270
- >>> scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
271
- >>> pipeline = DiffusionPipeline.from_pretrained(
272
- ... "CompVis/stable-diffusion-v1-4", scheduler=scheduler, use_auth_token=True
273
- ... )
274
- ```
275
- """
276
- cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
277
- resume_download = kwargs.pop("resume_download", False)
278
- proxies = kwargs.pop("proxies", None)
279
- local_files_only = kwargs.pop("local_files_only", False)
280
- use_auth_token = kwargs.pop("use_auth_token", None)
281
- revision = kwargs.pop("revision", None)
282
- torch_dtype = kwargs.pop("torch_dtype", None)
283
- provider = kwargs.pop("provider", None)
284
-
285
- # 1. Download the checkpoints and configs
286
- # use snapshot download here to get it working from from_pretrained
287
- if not os.path.isdir(pretrained_model_name_or_path):
288
- cached_folder = snapshot_download(
289
- pretrained_model_name_or_path,
290
- cache_dir=cache_dir,
291
- resume_download=resume_download,
292
- proxies=proxies,
293
- local_files_only=local_files_only,
294
- use_auth_token=use_auth_token,
295
- revision=revision,
296
- )
297
- else:
298
- cached_folder = pretrained_model_name_or_path
299
-
300
- config_dict = cls.get_config_dict(cached_folder)
301
-
302
- # 2. Load the pipeline class, if using custom module then load it from the hub
303
- # if we load from explicit class, let's use it
304
- if cls != DiffusionPipeline:
305
- pipeline_class = cls
306
- else:
307
- diffusers_module = importlib.import_module(cls.__module__.split(".")[0])
308
- pipeline_class = getattr(diffusers_module, config_dict["_class_name"])
309
-
310
- # some modules can be passed directly to the init
311
- # in this case they are already instantiated in `kwargs`
312
- # extract them here
313
- expected_modules = set(inspect.signature(pipeline_class.__init__).parameters.keys())
314
- passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
315
-
316
- init_dict, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
317
-
318
- init_kwargs = {}
319
-
320
- # import it here to avoid circular import
321
- from diffusers import pipelines
322
-
323
- # 3. Load each module in the pipeline
324
- for name, (library_name, class_name) in init_dict.items():
325
- is_pipeline_module = hasattr(pipelines, library_name)
326
- loaded_sub_model = None
327
-
328
- # if the model is in a pipeline module, then we load it from the pipeline
329
- if name in passed_class_obj:
330
- # 1. check that passed_class_obj has correct parent class
331
- if not is_pipeline_module:
332
- library = importlib.import_module(library_name)
333
- class_obj = getattr(library, class_name)
334
- importable_classes = LOADABLE_CLASSES[library_name]
335
- class_candidates = {c: getattr(library, c) for c in importable_classes.keys()}
336
-
337
- expected_class_obj = None
338
- for class_name, class_candidate in class_candidates.items():
339
- if issubclass(class_obj, class_candidate):
340
- expected_class_obj = class_candidate
341
-
342
- if not issubclass(passed_class_obj[name].__class__, expected_class_obj):
343
- raise ValueError(
344
- f"{passed_class_obj[name]} is of type: {type(passed_class_obj[name])}, but should be"
345
- f" {expected_class_obj}"
346
- )
347
- else:
348
- logger.warn(
349
- f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it"
350
- " has the correct type"
351
- )
352
-
353
- # set passed class object
354
- loaded_sub_model = passed_class_obj[name]
355
- elif is_pipeline_module:
356
- pipeline_module = getattr(pipelines, library_name)
357
- class_obj = getattr(pipeline_module, class_name)
358
- importable_classes = ALL_IMPORTABLE_CLASSES
359
- class_candidates = {c: class_obj for c in importable_classes.keys()}
360
- else:
361
- # else we just import it from the library.
362
- library = importlib.import_module(library_name)
363
- class_obj = getattr(library, class_name)
364
- importable_classes = LOADABLE_CLASSES[library_name]
365
- class_candidates = {c: getattr(library, c) for c in importable_classes.keys()}
366
-
367
- if loaded_sub_model is None:
368
- load_method_name = None
369
- for class_name, class_candidate in class_candidates.items():
370
- if issubclass(class_obj, class_candidate):
371
- load_method_name = importable_classes[class_name][1]
372
-
373
- load_method = getattr(class_obj, load_method_name)
374
-
375
- loading_kwargs = {}
376
- if issubclass(class_obj, torch.nn.Module):
377
- loading_kwargs["torch_dtype"] = torch_dtype
378
- if issubclass(class_obj, diffusers.OnnxRuntimeModel):
379
- loading_kwargs["provider"] = provider
380
-
381
- # check if the module is in a subdirectory
382
- if os.path.isdir(os.path.join(cached_folder, name)):
383
- loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs)
384
- else:
385
- # else load from the root directory
386
- loaded_sub_model = load_method(cached_folder, **loading_kwargs)
387
-
388
- init_kwargs[name] = loaded_sub_model # UNet(...), # DiffusionSchedule(...)
389
-
390
- # 4. Instantiate the pipeline
391
- model = pipeline_class(**init_kwargs)
392
- return model
393
-
394
- @staticmethod
395
- def numpy_to_pil(images):
396
- """
397
- Convert a numpy image or a batch of images to a PIL image.
398
- """
399
- if images.ndim == 3:
400
- images = images[None, ...]
401
- images = (images * 255).round().astype("uint8")
402
- pil_images = [Image.fromarray(image) for image in images]
403
-
404
- return pil_images
405
-
406
- def progress_bar(self, iterable):
407
- if not hasattr(self, "_progress_bar_config"):
408
- self._progress_bar_config = {}
409
- elif not isinstance(self._progress_bar_config, dict):
410
- raise ValueError(
411
- f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
412
- )
413
-
414
- return tqdm(iterable, **self._progress_bar_config)
415
-
416
- def set_progress_bar_config(self, **kwargs):
417
- self._progress_bar_config = kwargs
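Finally, a hedged sketch of the `DiffusionPipeline` loading flow documented in the deleted `pipeline_utils.py`: `from_pretrained` snapshots the repo, reads `model_index.json`, loads each registered component, and the assembled pipeline can then be moved between devices. The repo id follows the docstring example above; the prompt is arbitrary, and the import assumes the upstream `diffusers` package.

```python
# Illustrative sketch: load a pipeline, move its torch modules to the right
# device, and silence the denoising progress bar.
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")  # moves every nn.Module component
pipe.set_progress_bar_config(disable=True)                      # kwargs are forwarded to tqdm

images = pipe(prompt="A painting of a squirrel eating a burger", num_inference_steps=50).images
```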
diffusers/pipelines/__init__.py DELETED
@@ -1,19 +0,0 @@
1
- from ..utils import is_onnx_available, is_transformers_available
2
- from .ddim import DDIMPipeline
3
- from .ddpm import DDPMPipeline
4
- from .latent_diffusion_uncond import LDMPipeline
5
- from .pndm import PNDMPipeline
6
- from .score_sde_ve import ScoreSdeVePipeline
7
- from .stochastic_karras_ve import KarrasVePipeline
8
-
9
-
10
- if is_transformers_available():
11
- from .latent_diffusion import LDMTextToImagePipeline
12
- from .stable_diffusion import (
13
- StableDiffusionImg2ImgPipeline,
14
- StableDiffusionInpaintPipeline,
15
- StableDiffusionPipeline,
16
- )
17
-
18
- if is_transformers_available() and is_onnx_available():
19
- from .stable_diffusion import StableDiffusionOnnxPipeline
diffusers/pipelines/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (829 Bytes)
 
diffusers/pipelines/ddim/__init__.py DELETED
@@ -1,2 +0,0 @@
1
- # flake8: noqa
2
- from .pipeline_ddim import DDIMPipeline
diffusers/pipelines/ddim/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (207 Bytes)