Update
- app.py +269 -0
- flux/__init__.py +0 -0
- flux/math.py +94 -0
- flux/model.py +120 -0
- flux/modules/attention_flax.py +494 -0
- flux/modules/autoencoder.py +343 -0
- flux/modules/conditioner.py +73 -0
- flux/modules/layers.py +292 -0
- flux/sampling.py +227 -0
- flux/util.py +340 -0
- flux/wrapper.py +139 -0
- requirements.txt +14 -0
app.py
ADDED
@@ -0,0 +1,269 @@
import os
import time
from io import BytesIO
import uuid

import torch
import gradio as gr
import spaces
import numpy as np
from einops import rearrange
from PIL import Image, ExifTags

from dataclasses import dataclass

from flux.sampling import denoise, get_noise, get_schedule, prepare, unpack
from flux.util import configs, embed_watermark, load_ae, load_clip, load_flow_model, load_t5


import jax
import jax.numpy as jnp
from flax import nnx
from jax import Array as Tensor
from einops import repeat


@dataclass
class SamplingOptions:
    prompt: str
    width: int
    height: int
    num_steps: int
    guidance: float
    seed: int | None


NSFW_THRESHOLD = 0.85


@spaces.GPU
def get_models(name: str, device: torch.device, offload: bool, is_schnell: bool):
    t5 = load_t5(device, max_length=256 if is_schnell else 512)
    clip = load_clip(device)
    model = load_flow_model(name, device="cpu" if offload else device)
    ae = load_ae(name, device="cpu" if offload else device)
    # nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)
    # return model, ae, t5, clip, nsfw_classifier
    return nnx.split(model), nnx.split(ae), nnx.split(t5), t5.tokenizer, nnx.split(clip), clip.tokenizer, None


@jax.jit
def encode(ae, x):
    ae = nnx.merge(*ae)
    return ae.encode(x)


def _generate(model, ae, t5, clip, x, t5_tokens, clip_tokens, num_steps, guidance,
              # init_image=None,
              # image2image_strength=0.0,
              shift=True):
    b, h, w, c = x.shape
    model = nnx.merge(*model)
    ae = nnx.merge(*ae)
    t5 = nnx.merge(*t5)
    clip = nnx.merge(*clip)
    timesteps = get_schedule(
        num_steps,
        x.shape[-1] * x.shape[-2] // 4,
        shift=shift,
    )
    # if init_image is not None:
    #     t_idx = int((1 - image2image_strength) * num_steps)
    #     t = timesteps[t_idx]
    #     timesteps = timesteps[t_idx:]
    #     x = t * x + (1.0 - t) * init_image.astype(x.dtype)
    inp = prepare(t5, clip, x, t5_tokens, clip_tokens)
    x = denoise(model, **inp, timesteps=timesteps, guidance=guidance)
    x = unpack(x.astype(jnp.float32), h * 8, w * 8)
    x = ae.decode(x)
    return x


generate = jax.jit(_generate, static_argnames=("num_steps", "shift"))


def prepare_tokens(t5_tokenizer, clip_tokenizer, prompt: str | list[str]) -> tuple[Tensor, Tensor]:
    if isinstance(prompt, str):
        prompt = [prompt]
    t5_tokens = t5_tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        return_length=False,
        return_overflowing_tokens=False,
        padding="max_length",
        return_tensors="jax",
    )["input_ids"]
    clip_tokens = clip_tokenizer(
        prompt,
        truncation=True,
        max_length=77,
        return_length=False,
        return_overflowing_tokens=False,
        padding="max_length",
        return_tensors="jax",
    )["input_ids"]
    return t5_tokens, clip_tokens


class FluxGenerator:
    def __init__(self, model_name: str, device: str, offload: bool):
        self.device = None
        self.offload = offload
        self.model_name = model_name
        self.is_schnell = model_name == "flux-schnell"
        self.model, self.ae, self.t5, self.t5_tokenizer, self.clip, self.clip_tokenizer, self.nsfw_classifier = get_models(
            model_name,
            device=self.device,
            offload=self.offload,
            is_schnell=self.is_schnell,
        )
        self.key = jax.random.key(0)

    @spaces.GPU(duration=180)
    def generate_image(
        self,
        img_size,
        num_steps,
        guidance,
        seed,
        prompt,
        # init_image=None,
        # image2image_strength=0.0,
        add_sampling_metadata=True,
    ):
        seed = int(seed)
        if seed == -1:
            seed = None
        if img_size == "1,024x1,024":
            width, height = 1024, 1024
        else:
            width, height = 512, 512

        opts = SamplingOptions(
            prompt=prompt,
            width=width,
            height=height,
            num_steps=num_steps,
            guidance=guidance,
            seed=seed,
        )

        if opts.seed is None:
            # opts.seed = torch.Generator(device="cpu").seed()
            key, self.key = jax.random.split(self.key, 2)
            opts.seed = jax.random.randint(key, (), 0, 2**30)
        print(f"Generating '{opts.prompt}' with seed {opts.seed}")
        t0 = time.perf_counter()

        # if init_image is not None:
        #     if isinstance(init_image, np.ndarray):
        #         init_image = jnp.asarray(init_image).astype(jnp.float32) / 255.0
        #     init_image = init_image[None]
        #     # init_image = torch.nn.functional.interpolate(init_image, (opts.height, opts.width))
        #     init_image = jax.image.resize(init_image, (opts.height, opts.width), method="lanczos5")
        #     # if self.offload:
        #     #     self.ae.encoder.to(self.device)
        #     # init_image = self.ae.encode(init_image)
        #     init_image = encode(self.ae, init_image)

        # prepare input
        t5_tokens, clip_tokens = prepare_tokens(self.t5_tokenizer, self.clip_tokenizer, prompt=opts.prompt)
        x = get_noise(
            1,
            opts.height,
            opts.width,
            device=None,
            dtype=jnp.bfloat16,
            seed=opts.seed,
        )

        x = generate(self.model, self.ae, self.t5, self.clip, x, t5_tokens, clip_tokens, opts.num_steps, opts.guidance, shift=(not self.is_schnell))

        t1 = time.perf_counter()
        runtime = t1 - t0
        # print(f"Done in {t1 - t0:.1f}s.")

        # bring into PIL format
        x = jnp.clip(x, -1, 1)
        # x = embed_watermark(x.astype(jnp.float32))
        # x = rearrange(x[0], "c h w -> h w c")
        img = Image.fromarray(np.asarray(127.5 * (x[0] + 1.0)).astype(np.uint8))
        # img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy())
        # nsfw_score = [x["score"] for x in self.nsfw_classifier(img) if x["label"] == "nsfw"][0]

        if True:
            filename = f"output/gradio/{uuid.uuid4()}.jpg"
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            exif_data = Image.Exif()
            # if init_image is None:
            exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux"
            # else:
            #     exif_data[ExifTags.Base.Software] = "AI generated;img2img;flux"
            exif_data[ExifTags.Base.Make] = "Black Forest Labs"
            exif_data[ExifTags.Base.Model] = self.model_name
            if add_sampling_metadata:
                exif_data[ExifTags.Base.ImageDescription] = prompt

            img.save(filename, format="jpeg", exif=exif_data, quality=95, subsampling=0)

            return img, runtime, str(opts.seed), filename, None
        else:
            return None, str(opts.seed), None, "Your generated image may contain NSFW content."


@spaces.GPU(duration=300)
def create_demo(model_name: str, device: str = "cuda", offload: bool = False):
    generator = FluxGenerator(model_name, device, offload)
    is_schnell = model_name == "flux-schnell"

    with open("./assets/banner.html") as f:
        banner = f.read()
    with gr.Blocks() as demo:
        with gr.Column(elem_id="app-container"):
            gr.HTML(f"""<iframe scrolling="no" style="width: 100%; height: 125px; border: 0" srcdoc='{banner}'>""")
            gr.Markdown("""🚀 [Flux-Flax](https://github.com/lkwq007/flux-flax) is a JAX implementation of Flux models. 1-step time statistics for `FLUX.1-schnell`: `0.4s` for 1024x1024, `0.1s` for 512x512; 2-step: `0.6s` for 1024x1024, `0.2s` for 512x512; 4-step: `2.4s` for 1024x1024, `0.8s` for 512x512.
""")

            with gr.Row():
                with gr.Column(scale=3):
                    output_image = gr.Image(label="Generated Image")
                    warning_text = gr.Textbox(label="Warning", visible=False)
                    download_btn = gr.File(label="Download full-resolution")
                    gr.Markdown("""
💡 Note: more resolutions are supported, but this demo is limited to 1024x1024 and 512x512 to avoid JIT recompilation (which takes about 130s). Flux-Flax also supports `FLUX.1-dev`; 50-step time statistics: `18s` for 1024x1024, `6s` for 512x512""")
                with gr.Column(scale=1):
                    prompt = gr.Textbox(label="Prompt", value="a photo of a forest with mist swirling around the tree trunks. The word \"FLUX\" is painted over it in big, red brush strokes with visible texture")
                    generate_btn = gr.Button("Generate")
                    with gr.Row():
                        seed_output = gr.Number(label="Used Seed")
                        runtime = gr.Number(label="Inference Time", precision=3)
                    with gr.Row():
                        seed = gr.Textbox(-1, label="Seed (-1 for random)")
                        img_size = gr.Radio(["1,024x1,024", "512x512"], label="Image Resolution", value="1,024x1,024")
                    num_steps = gr.Slider(1, 4, 1, step=1, label="Number of steps")
                    add_sampling_metadata = gr.Checkbox(label="Add sampling parameters to metadata?", value=True)
                    guidance = gr.Slider(1.0, 10.0, 3.5, step=0.1, label="Guidance", interactive=not is_schnell, visible=False)

            # def update_img2img(do_img2img):
            #     return {
            #         init_image: gr.update(visible=do_img2img),
            #         image2image_strength: gr.update(visible=do_img2img),
            #     }

            # do_img2img.change(update_img2img, do_img2img, [init_image, image2image_strength])
            generate_btn.click(
                fn=generator.generate_image,
                inputs=[img_size, num_steps, guidance, seed, prompt, add_sampling_metadata],
                outputs=[output_image, runtime, seed_output, download_btn, warning_text],
            )

    return demo


# if __name__ == "__main__":
#     import argparse
#     parser = argparse.ArgumentParser(description="Flux")
#     parser.add_argument("--name", type=str, default="flux-schnell", choices=list(configs.keys()), help="Model name")
#     parser.add_argument("--device", type=str, default="cpu", help="Device to use")
#     parser.add_argument("--offload", action="store_true", help="Offload model to CPU when not in use")
#     parser.add_argument("--share", action="store_true", help="Create a public link to your demo")
#     args = parser.parse_args()

demo = create_demo("flux-schnell", None, False)
demo.launch()
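The split/merge pattern in app.py is what lets the Flax NNX modules cross the `jax.jit` boundary: `get_models` returns each model as a `(graphdef, state)` pair from `nnx.split`, and the jitted functions rebuild the module with `nnx.merge` inside the trace. A minimal sketch of the same pattern, assuming only `jax` and `flax.nnx` are installed (the `nnx.Linear` layer is a stand-in, not part of this repo):

import jax
import jax.numpy as jnp
from flax import nnx

model = nnx.Linear(2, 3, rngs=nnx.Rngs(0))   # stand-in for the Flux modules
split_model = nnx.split(model)               # (graphdef, state): jit-friendly pieces

@jax.jit
def forward(split_model, x):
    m = nnx.merge(*split_model)              # rebuild the module inside the traced function
    return m(x)

y = forward(split_model, jnp.ones((1, 2)))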
flux/__init__.py
ADDED
File without changes
flux/math.py
ADDED
@@ -0,0 +1,94 @@
# import torch
import math
import jax
import jax.numpy as jnp
from einops import rearrange
from flax import nnx

Tensor = jax.Array


def check_tpu():
    return any('TPU' in d.device_kind for d in jax.devices())


# from torch import Tensor
if check_tpu():
    from jax.experimental.pallas.ops.tpu.flash_attention import flash_attention

    # q,  # [batch_size, num_heads, q_seq_len, d_model]
    # k,  # [batch_size, num_heads, kv_seq_len, d_model]
    # v,  # [batch_size, num_heads, kv_seq_len, d_model]
    def flash_mha(q, k, v):
        return flash_attention(q, k, v, sm_scale=1 / math.sqrt(q.shape[-1]))
else:
    from jax.experimental.pallas.ops.gpu.attention import mha, mha_reference

    def pallas_mha(q, k, v):
        # B L H D
        # return mha_reference(q, k, v, segment_ids=None, sm_scale=1/math.sqrt(q.shape[-1]))
        q_len = q.shape[1]
        # pad the sequence length up to a multiple of 128 for the pallas kernel
        diff = (-q_len) & 127
        segment_ids = jnp.zeros((q.shape[0], q.shape[1]), dtype=jnp.int32)
        segment_ids = jnp.pad(segment_ids, ((0, 0), (0, diff)), mode="constant", constant_values=1)
        # q, k, v = map(lambda x: jnp.pad(x, ((0, 0), (0, diff), (0, 0), (0, 0)), mode="constant", constant_values=0), (q, k, v))
        return mha(q, k, v, segment_ids=segment_ids, sm_scale=1 / math.sqrt(q.shape[-1]))  # [:, :q_len]

    # mha: batch_size, seq_len, num_heads, head_dim = q.shape
    from functools import partial
    from flux.modules.attention_flax import jax_memory_efficient_attention
    try:
        from flash_attn_jax import flash_mha
    except ImportError:
        flash_mha = pallas_mha
    # flash_mha = nnx.dot_product_attention


def dot_product_attention(q, k, v, sm_scale=1.0):
    q, k, v = map(lambda x: rearrange(x, "b h n d -> b n h d"), (q, k, v))
    # ret = pallas_mha(q, k, v)
    ret = nnx.dot_product_attention(q, k, v)
    # if q.shape[-3] % 64 == 0:
    #     query_chunk_size = int(q.shape[-3] / 64)
    # elif q.shape[-3] % 16 == 0:
    #     query_chunk_size = int(q.shape[-3] / 16)
    # elif q.shape[-3] % 4 == 0:
    #     query_chunk_size = int(q.shape[-3] / 4)
    # else:
    #     query_chunk_size = int(q.shape[-3])
    # ret = jax_memory_efficient_attention(q, k, v, query_chunk_size=query_chunk_size)
    return rearrange(ret, "b n h d -> b h n d")


def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
    q, k = apply_rope(q, k, pe)
    # x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
    # q is B H L D
    q, k, v = map(lambda x: rearrange(x, "B H L D -> B L H D"), (q, k, v))
    # x = nnx.dot_product_attention(q, k, v)
    x = flash_mha(q, k, v)
    # x = pallas_mha(q, k, v)
    # x = mha(q, k, v, None, sm_scale=1/math.sqrt(q.shape[-1]))
    x = rearrange(x, "B L H D -> B L (H D)")

    # x = rearrange(x, "B H L D -> B L (H D)")

    return x


def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    assert dim % 2 == 0
    # scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
    scale = jnp.arange(0, dim, 2, dtype=jnp.float32) / dim
    omega = 1.0 / (theta**scale)
    # out = torch.einsum("...n,d->...nd", pos, omega)
    out = jnp.einsum("...n,d->...nd", pos.astype(jnp.float32), omega)
    # out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
    out = jnp.stack([jnp.cos(out), -jnp.sin(out), jnp.sin(out), jnp.cos(out)], axis=-1)
    out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
    # return out.float()
    return out.astype(jnp.float32)


def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
    # xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
    # xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
    xq_ = xq.astype(jnp.float32).reshape(*xq.shape[:-1], -1, 1, 2)
    xk_ = xk.astype(jnp.float32).reshape(*xk.shape[:-1], -1, 1, 2)
    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
    # return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
    return xq_out.reshape(*xq.shape).astype(xq.dtype), xk_out.reshape(*xk.shape).astype(xk.dtype)
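The `rope` helper builds a (cos, -sin, sin, cos) 2x2 rotation per frequency, and `apply_rope` contracts it against queries and keys split into 2-vectors. A quick shape sanity check, as a sketch: the head axis added with `[:, None]` mirrors what the positional embedder is expected to produce (flux/modules/layers.py is part of this commit but not shown here), so treat that detail as an assumption.

import jax.numpy as jnp
from flux.math import rope, apply_rope

B, H, L, D = 1, 2, 8, 16
pos = jnp.arange(L, dtype=jnp.float32)[None, :]   # positions, shape (B, L)
pe = rope(pos, D, 10_000)[:, None]                # (B, 1, L, D//2, 2, 2), broadcast over heads
q = jnp.ones((B, H, L, D))
k = jnp.ones((B, H, L, D))
q_rot, k_rot = apply_rope(q, k, pe)
assert q_rot.shape == (B, H, L, D) and k_rot.shape == (B, H, L, D)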
flux/model.py
ADDED
@@ -0,0 +1,120 @@
from dataclasses import dataclass


import jax.numpy as jnp
from jax import Array as Tensor
from flax import nnx

from flux.wrapper import TorchWrapper

from flux.modules.layers import (DoubleStreamBlock, EmbedND, LastLayer,
                                 MLPEmbedder, SingleStreamBlock,
                                 timestep_embedding)


@dataclass
class FluxParams:
    in_channels: int
    vec_in_dim: int
    context_in_dim: int
    hidden_size: int
    mlp_ratio: float
    num_heads: int
    depth: int
    depth_single_blocks: int
    axes_dim: list[int]
    theta: int
    qkv_bias: bool
    guidance_embed: bool


DoubleStreamBlock_class, EmbedND_class, LastLayer_class, MLPEmbedder_class, SingleStreamBlock_class = DoubleStreamBlock, EmbedND, LastLayer, MLPEmbedder, SingleStreamBlock


class Flux(nnx.Module):
    """
    Transformer model for flow matching on sequences.
    """

    def __init__(self, params: FluxParams, dtype: jnp.dtype = jnp.float32, rngs: nnx.Rngs = None):
        nn = TorchWrapper(rngs=rngs, dtype=dtype)
        DoubleStreamBlock, EmbedND, LastLayer, MLPEmbedder, SingleStreamBlock = nn.declare_with_rng(DoubleStreamBlock_class, EmbedND_class, LastLayer_class, MLPEmbedder_class, SingleStreamBlock_class)
        self.params = params
        self.in_channels = params.in_channels
        self.out_channels = self.in_channels
        if params.hidden_size % params.num_heads != 0:
            raise ValueError(
                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
            )
        pe_dim = params.hidden_size // params.num_heads
        if sum(params.axes_dim) != pe_dim:
            raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
        self.hidden_size = params.hidden_size
        self.num_heads = params.num_heads
        self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
        self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
        self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
        self.guidance_in = (
            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity()
        )
        self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)

        self.double_blocks = nn.ModuleList(
            [
                DoubleStreamBlock(
                    self.hidden_size,
                    self.num_heads,
                    mlp_ratio=params.mlp_ratio,
                    qkv_bias=params.qkv_bias,
                )
                for _ in range(params.depth)
            ]
        )

        self.single_blocks = nn.ModuleList(
            [
                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio)
                for _ in range(params.depth_single_blocks)
            ]
        )

        self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)

    def __call__(
        self,
        img: Tensor,
        img_ids: Tensor,
        txt: Tensor,
        txt_ids: Tensor,
        timesteps: Tensor,
        y: Tensor,
        guidance: Tensor | None = None,
    ) -> Tensor:
        if img.ndim != 3 or txt.ndim != 3:
            raise ValueError("Input img and txt tensors must have 3 dimensions.")

        # running on sequences img
        img = self.img_in(img)
        vec = self.time_in(timestep_embedding(timesteps, 256))
        if self.params.guidance_embed:
            if guidance is None:
                raise ValueError("Didn't get guidance strength for guidance distilled model.")
            vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
        vec = vec + self.vector_in(y)
        txt = self.txt_in(txt)

        # ids = torch.cat((txt_ids, img_ids), dim=1)
        ids = jnp.concatenate((txt_ids, img_ids), axis=1)
        pe = self.pe_embedder(ids)

        for block in self.double_blocks:
            img, txt = block(img=img, txt=txt, vec=vec, pe=pe)

        # img = torch.cat((txt, img), 1)
        img = jnp.concatenate((txt, img), axis=1)
        for block in self.single_blocks:
            img = block(img, vec=vec, pe=pe)
        img = img[:, txt.shape[1]:, ...]

        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
        return img
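For reference, the model above is driven entirely by the `FluxParams` dataclass. A sketch of constructing it with the hyperparameters Black Forest Labs publishes for `flux-schnell`; the authoritative values live in flux/util.py's `configs` (part of this commit but not shown), so treat the numbers below as assumptions:

import jax.numpy as jnp
from flax import nnx
from flux.model import Flux, FluxParams

params = FluxParams(
    in_channels=64, vec_in_dim=768, context_in_dim=4096,
    hidden_size=3072, mlp_ratio=4.0, num_heads=24,
    depth=19, depth_single_blocks=38, axes_dim=[16, 56, 56],
    theta=10_000, qkv_bias=True, guidance_embed=False,
)
model = Flux(params, dtype=jnp.bfloat16, rngs=nnx.Rngs(0))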
flux/modules/attention_flax.py
ADDED
@@ -0,0 +1,494 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
import math

import flax.linen as nn
import jax
import jax.numpy as jnp


def _query_chunk_attention(query, key, value, precision, key_chunk_size: int = 4096):
    """Multi-head dot product attention with a limited number of queries."""
    num_kv, num_heads, k_features = key.shape[-3:]
    v_features = value.shape[-1]
    key_chunk_size = min(key_chunk_size, num_kv)
    query = query / jnp.sqrt(k_features)

    @functools.partial(jax.checkpoint, prevent_cse=False)
    def summarize_chunk(query, key, value):
        attn_weights = jnp.einsum("...qhd,...khd->...qhk", query, key, precision=precision)

        max_score = jnp.max(attn_weights, axis=-1, keepdims=True)
        max_score = jax.lax.stop_gradient(max_score)
        exp_weights = jnp.exp(attn_weights - max_score)

        exp_values = jnp.einsum("...vhf,...qhv->...qhf", value, exp_weights, precision=precision)
        max_score = jnp.einsum("...qhk->...qh", max_score)

        return (exp_values, exp_weights.sum(axis=-1), max_score)

    def chunk_scanner(chunk_idx):
        # julienne key array
        key_chunk = jax.lax.dynamic_slice(
            operand=key,
            start_indices=[0] * (key.ndim - 3) + [chunk_idx, 0, 0],  # [...,k,h,d]
            slice_sizes=list(key.shape[:-3]) + [key_chunk_size, num_heads, k_features],  # [...,k,h,d]
        )

        # julienne value array
        value_chunk = jax.lax.dynamic_slice(
            operand=value,
            start_indices=[0] * (value.ndim - 3) + [chunk_idx, 0, 0],  # [...,v,h,d]
            slice_sizes=list(value.shape[:-3]) + [key_chunk_size, num_heads, v_features],  # [...,v,h,d]
        )

        return summarize_chunk(query, key_chunk, value_chunk)

    chunk_values, chunk_weights, chunk_max = jax.lax.map(f=chunk_scanner, xs=jnp.arange(0, num_kv, key_chunk_size))

    global_max = jnp.max(chunk_max, axis=0, keepdims=True)
    max_diffs = jnp.exp(chunk_max - global_max)

    chunk_values *= jnp.expand_dims(max_diffs, axis=-1)
    chunk_weights *= max_diffs

    all_values = chunk_values.sum(axis=0)
    all_weights = jnp.expand_dims(chunk_weights, -1).sum(axis=0)

    return all_values / all_weights


def jax_memory_efficient_attention(
    query, key, value, precision=jax.lax.Precision.HIGHEST, query_chunk_size: int = 1024, key_chunk_size: int = 4096
):
    r"""
    Flax memory-efficient multi-head dot product attention. https://arxiv.org/abs/2112.05682v2
    https://github.com/AminRezaei0x443/memory-efficient-attention

    Args:
        query (`jnp.ndarray`): (batch..., query_length, head, query_key_depth_per_head)
        key (`jnp.ndarray`): (batch..., key_value_length, head, query_key_depth_per_head)
        value (`jnp.ndarray`): (batch..., key_value_length, head, value_depth_per_head)
        precision (`jax.lax.Precision`, *optional*, defaults to `jax.lax.Precision.HIGHEST`):
            numerical precision for the computation
        query_chunk_size (`int`, *optional*, defaults to 1024):
            chunk size used to split the query array; must divide query_length without remainder
        key_chunk_size (`int`, *optional*, defaults to 4096):
            chunk size used to split the key and value arrays; must divide key_value_length without remainder

    Returns:
        (`jnp.ndarray`) with shape of (batch..., query_length, head, value_depth_per_head)
    """
    num_q, num_heads, q_features = query.shape[-3:]

    def chunk_scanner(chunk_idx, _):
        # julienne query array
        query_chunk = jax.lax.dynamic_slice(
            operand=query,
            start_indices=([0] * (query.ndim - 3)) + [chunk_idx, 0, 0],  # [...,q,h,d]
            slice_sizes=list(query.shape[:-3]) + [min(query_chunk_size, num_q), num_heads, q_features],  # [...,q,h,d]
        )

        return (
            chunk_idx + query_chunk_size,  # unused; ignore it
            _query_chunk_attention(
                query=query_chunk, key=key, value=value, precision=precision, key_chunk_size=key_chunk_size
            ),
        )

    _, res = jax.lax.scan(
        f=chunk_scanner,
        init=0,  # start counter
        xs=None,
        length=math.ceil(num_q / query_chunk_size),  # stop counter
    )

    return jnp.concatenate(res, axis=-3)  # fuse the chunked result back


class FlaxAttention(nn.Module):
    r"""
    A Flax multi-head attention module as described in: https://arxiv.org/abs/1706.03762

    Parameters:
        query_dim (:obj:`int`):
            Input hidden states dimension
        heads (:obj:`int`, *optional*, defaults to 8):
            Number of heads
        dim_head (:obj:`int`, *optional*, defaults to 64):
            Hidden states dimension inside each head
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory-efficient attention https://arxiv.org/abs/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
    """

    query_dim: int
    heads: int = 8
    dim_head: int = 64
    dropout: float = 0.0
    use_memory_efficient_attention: bool = False
    split_head_dim: bool = False
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        inner_dim = self.dim_head * self.heads
        self.scale = self.dim_head**-0.5

        # Weights were exported with old names {to_q, to_k, to_v, to_out}
        self.query = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_q")
        self.key = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_k")
        self.value = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_v")

        self.proj_attn = nn.Dense(self.query_dim, dtype=self.dtype, name="to_out_0")
        self.dropout_layer = nn.Dropout(rate=self.dropout)

    def reshape_heads_to_batch_dim(self, tensor):
        batch_size, seq_len, dim = tensor.shape
        head_size = self.heads
        tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
        tensor = jnp.transpose(tensor, (0, 2, 1, 3))
        tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)
        return tensor

    def reshape_batch_dim_to_heads(self, tensor):
        batch_size, seq_len, dim = tensor.shape
        head_size = self.heads
        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
        tensor = jnp.transpose(tensor, (0, 2, 1, 3))
        tensor = tensor.reshape(batch_size // head_size, seq_len, dim * head_size)
        return tensor

    def __call__(self, hidden_states, context=None, deterministic=True):
        context = hidden_states if context is None else context

        query_proj = self.query(hidden_states)
        key_proj = self.key(context)
        value_proj = self.value(context)

        if self.split_head_dim:
            b = hidden_states.shape[0]
            query_states = jnp.reshape(query_proj, (b, -1, self.heads, self.dim_head))
            key_states = jnp.reshape(key_proj, (b, -1, self.heads, self.dim_head))
            value_states = jnp.reshape(value_proj, (b, -1, self.heads, self.dim_head))
        else:
            query_states = self.reshape_heads_to_batch_dim(query_proj)
            key_states = self.reshape_heads_to_batch_dim(key_proj)
            value_states = self.reshape_heads_to_batch_dim(value_proj)

        if self.use_memory_efficient_attention:
            query_states = query_states.transpose(1, 0, 2)
            key_states = key_states.transpose(1, 0, 2)
            value_states = value_states.transpose(1, 0, 2)

            # this if statement creates a chunk size for each layer of the unet;
            # the chunk size is equal to the query_length dimension of the deepest layer of the unet
            flatten_latent_dim = query_states.shape[-3]
            if flatten_latent_dim % 64 == 0:
                query_chunk_size = int(flatten_latent_dim / 64)
            elif flatten_latent_dim % 16 == 0:
                query_chunk_size = int(flatten_latent_dim / 16)
            elif flatten_latent_dim % 4 == 0:
                query_chunk_size = int(flatten_latent_dim / 4)
            else:
                query_chunk_size = int(flatten_latent_dim)

            hidden_states = jax_memory_efficient_attention(
                query_states, key_states, value_states, query_chunk_size=query_chunk_size, key_chunk_size=4096 * 4
            )

            hidden_states = hidden_states.transpose(1, 0, 2)
        else:
            # compute attentions
            if self.split_head_dim:
                attention_scores = jnp.einsum("b t n h, b f n h -> b n f t", key_states, query_states)
            else:
                attention_scores = jnp.einsum("b i d, b j d->b i j", query_states, key_states)

            attention_scores = attention_scores * self.scale
            attention_probs = nn.softmax(attention_scores, axis=-1 if self.split_head_dim else 2)

            # attend to values
            if self.split_head_dim:
                hidden_states = jnp.einsum("b n f t, b t n h -> b f n h", attention_probs, value_states)
                b = hidden_states.shape[0]
                hidden_states = jnp.reshape(hidden_states, (b, -1, self.heads * self.dim_head))
            else:
                hidden_states = jnp.einsum("b i j, b j d -> b i d", attention_probs, value_states)
                hidden_states = self.reshape_batch_dim_to_heads(hidden_states)

        hidden_states = self.proj_attn(hidden_states)
        return self.dropout_layer(hidden_states, deterministic=deterministic)


class FlaxBasicTransformerBlock(nn.Module):
    r"""
    A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in:
    https://arxiv.org/abs/1706.03762

    Parameters:
        dim (:obj:`int`):
            Inner hidden states dimension
        n_heads (:obj:`int`):
            Number of heads
        d_head (:obj:`int`):
            Hidden states dimension inside each head
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        only_cross_attention (`bool`, defaults to `False`):
            Whether to only apply cross attention.
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory-efficient attention https://arxiv.org/abs/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
    """

    dim: int
    n_heads: int
    d_head: int
    dropout: float = 0.0
    only_cross_attention: bool = False
    dtype: jnp.dtype = jnp.float32
    use_memory_efficient_attention: bool = False
    split_head_dim: bool = False

    def setup(self):
        # self attention (or cross attention if only_cross_attention is True)
        self.attn1 = FlaxAttention(
            self.dim,
            self.n_heads,
            self.d_head,
            self.dropout,
            self.use_memory_efficient_attention,
            self.split_head_dim,
            dtype=self.dtype,
        )
        # cross attention
        self.attn2 = FlaxAttention(
            self.dim,
            self.n_heads,
            self.d_head,
            self.dropout,
            self.use_memory_efficient_attention,
            self.split_head_dim,
            dtype=self.dtype,
        )
        self.ff = FlaxFeedForward(dim=self.dim, dropout=self.dropout, dtype=self.dtype)
        self.norm1 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
        self.norm2 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
        self.norm3 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
        self.dropout_layer = nn.Dropout(rate=self.dropout)

    def __call__(self, hidden_states, context, deterministic=True):
        # self attention
        residual = hidden_states
        if self.only_cross_attention:
            hidden_states = self.attn1(self.norm1(hidden_states), context, deterministic=deterministic)
        else:
            hidden_states = self.attn1(self.norm1(hidden_states), deterministic=deterministic)
        hidden_states = hidden_states + residual

        # cross attention
        residual = hidden_states
        hidden_states = self.attn2(self.norm2(hidden_states), context, deterministic=deterministic)
        hidden_states = hidden_states + residual

        # feed forward
        residual = hidden_states
        hidden_states = self.ff(self.norm3(hidden_states), deterministic=deterministic)
        hidden_states = hidden_states + residual

        return self.dropout_layer(hidden_states, deterministic=deterministic)


class FlaxTransformer2DModel(nn.Module):
    r"""
    A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in:
    https://arxiv.org/pdf/1506.02025.pdf

    Parameters:
        in_channels (:obj:`int`):
            Input number of channels
        n_heads (:obj:`int`):
            Number of heads
        d_head (:obj:`int`):
            Hidden states dimension inside each head
        depth (:obj:`int`, *optional*, defaults to 1):
            Number of transformer blocks
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        use_linear_projection (`bool`, defaults to `False`): tbd
        only_cross_attention (`bool`, defaults to `False`): tbd
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory-efficient attention https://arxiv.org/abs/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
    """

    in_channels: int
    n_heads: int
    d_head: int
    depth: int = 1
    dropout: float = 0.0
    use_linear_projection: bool = False
    only_cross_attention: bool = False
    dtype: jnp.dtype = jnp.float32
    use_memory_efficient_attention: bool = False
    split_head_dim: bool = False

    def setup(self):
        self.norm = nn.GroupNorm(num_groups=32, epsilon=1e-5)

        inner_dim = self.n_heads * self.d_head
        if self.use_linear_projection:
            self.proj_in = nn.Dense(inner_dim, dtype=self.dtype)
        else:
            self.proj_in = nn.Conv(
                inner_dim,
                kernel_size=(1, 1),
                strides=(1, 1),
                padding="VALID",
                dtype=self.dtype,
            )

        self.transformer_blocks = [
            FlaxBasicTransformerBlock(
                inner_dim,
                self.n_heads,
                self.d_head,
                dropout=self.dropout,
                only_cross_attention=self.only_cross_attention,
                dtype=self.dtype,
                use_memory_efficient_attention=self.use_memory_efficient_attention,
                split_head_dim=self.split_head_dim,
            )
            for _ in range(self.depth)
        ]

        if self.use_linear_projection:
            self.proj_out = nn.Dense(inner_dim, dtype=self.dtype)
        else:
            self.proj_out = nn.Conv(
                inner_dim,
                kernel_size=(1, 1),
                strides=(1, 1),
                padding="VALID",
                dtype=self.dtype,
            )

        self.dropout_layer = nn.Dropout(rate=self.dropout)

    def __call__(self, hidden_states, context, deterministic=True):
        batch, height, width, channels = hidden_states.shape
        residual = hidden_states
        hidden_states = self.norm(hidden_states)
        if self.use_linear_projection:
            hidden_states = hidden_states.reshape(batch, height * width, channels)
            hidden_states = self.proj_in(hidden_states)
        else:
            hidden_states = self.proj_in(hidden_states)
            hidden_states = hidden_states.reshape(batch, height * width, channels)

        for transformer_block in self.transformer_blocks:
            hidden_states = transformer_block(hidden_states, context, deterministic=deterministic)

        if self.use_linear_projection:
            hidden_states = self.proj_out(hidden_states)
            hidden_states = hidden_states.reshape(batch, height, width, channels)
        else:
            hidden_states = hidden_states.reshape(batch, height, width, channels)
            hidden_states = self.proj_out(hidden_states)

        hidden_states = hidden_states + residual
        return self.dropout_layer(hidden_states, deterministic=deterministic)


class FlaxFeedForward(nn.Module):
    r"""
    Flax module that encapsulates two Linear layers separated by a non-linearity. It is the counterpart of PyTorch's
    [`FeedForward`] class, with the following simplifications:
    - The activation function is currently hardcoded to a gated linear unit from:
      https://arxiv.org/abs/2002.05202
    - `dim_out` is equal to `dim`.
    - The number of hidden dimensions is hardcoded to `dim * 4` in [`FlaxGELU`].

    Parameters:
        dim (:obj:`int`):
            Inner hidden states dimension
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
    """

    dim: int
    dropout: float = 0.0
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        # The second linear layer needs to be called
        # net_2 for now to match the index of the Sequential layer
        self.net_0 = FlaxGEGLU(self.dim, self.dropout, self.dtype)
        self.net_2 = nn.Dense(self.dim, dtype=self.dtype)

    def __call__(self, hidden_states, deterministic=True):
        hidden_states = self.net_0(hidden_states, deterministic=deterministic)
        hidden_states = self.net_2(hidden_states)
        return hidden_states


class FlaxGEGLU(nn.Module):
    r"""
    Flax implementation of a Linear layer followed by the variant of the gated linear unit activation function from
    https://arxiv.org/abs/2002.05202.

    Parameters:
        dim (:obj:`int`):
            Input hidden states dimension
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
    """

    dim: int
    dropout: float = 0.0
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        inner_dim = self.dim * 4
        self.proj = nn.Dense(inner_dim * 2, dtype=self.dtype)
        self.dropout_layer = nn.Dropout(rate=self.dropout)

    def __call__(self, hidden_states, deterministic=True):
        hidden_states = self.proj(hidden_states)
        hidden_linear, hidden_gelu = jnp.split(hidden_states, 2, axis=2)
        return self.dropout_layer(hidden_linear * nn.gelu(hidden_gelu), deterministic=deterministic)
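As the docstring notes, `jax_memory_efficient_attention` never materializes the full attention matrix: queries are processed in `query_chunk_size` slices via `lax.scan`, and each slice reduces over key/value chunks with a running max for numerical stability. A minimal call, as a sketch; shapes are (batch, length, heads, head_dim), with chunk sizes chosen to divide the lengths as required:

import jax
import jax.numpy as jnp
from flux.modules.attention_flax import jax_memory_efficient_attention

key = jax.random.key(0)
q = jax.random.normal(key, (1, 1024, 8, 64))   # query_length = 1024
k = jax.random.normal(key, (1, 4096, 8, 64))   # key_value_length = 4096
v = jax.random.normal(key, (1, 4096, 8, 64))
out = jax_memory_efficient_attention(q, k, v, query_chunk_size=256, key_chunk_size=1024)
assert out.shape == q.shape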
flux/modules/autoencoder.py
ADDED
@@ -0,0 +1,343 @@
from dataclasses import dataclass

from einops import rearrange

import jax
import jax.numpy as jnp
from jax import Array as Tensor
from flax import nnx

from flux.wrapper import TorchWrapper
from flux.math import dot_product_attention


@dataclass
class AutoEncoderParams:
    resolution: int
    in_channels: int
    ch: int
    out_ch: int
    ch_mult: list[int]
    num_res_blocks: int
    z_channels: int
    scale_factor: float
    shift_factor: float


swish = nnx.swish


class AttnBlock(nnx.Module):
    def __init__(self, in_channels: int, dtype=jnp.float32, rngs: nnx.Rngs = None):
        nn = TorchWrapper(rngs, dtype=dtype)
        self.in_channels = in_channels

        self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)

        self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
        self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
        self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
        self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)

    def attention(self, h_: Tensor) -> Tensor:
        h_ = self.norm(h_)
        q = self.q(h_)
        k = self.k(h_)
        v = self.v(h_)

        # b, c, h, w = q.shape
        b, h, w, c = q.shape

        # q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
        # k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
        # v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
        q = rearrange(q, "b h w c -> b 1 (h w) c")
        k = rearrange(k, "b h w c -> b 1 (h w) c")
        v = rearrange(v, "b h w c -> b 1 (h w) c")
        # h_ = nn.functional.scaled_dot_product_attention(q, k, v)
        h_ = dot_product_attention(q, k, v)

        # return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
        return rearrange(h_, "b 1 (h w) c -> b h w c", h=h, w=w, c=c, b=b)

    def __call__(self, x: Tensor) -> Tensor:
        return x + self.proj_out(self.attention(x))


class ResnetBlock(nnx.Module):
    def __init__(self, in_channels: int, out_channels: int, dtype=jnp.float32, rngs: nnx.Rngs = None):
        nn = TorchWrapper(rngs, dtype=dtype)
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels

        self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        if self.in_channels != self.out_channels:
            self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def __call__(self, x):
        h = x
        h = self.norm1(h)
        h = swish(h)
        h = self.conv1(h)

        h = self.norm2(h)
        h = swish(h)
        h = self.conv2(h)

        if self.in_channels != self.out_channels:
            x = self.nin_shortcut(x)

        return x + h


class Downsample(nnx.Module):
    def __init__(self, in_channels: int, dtype=jnp.float32, rngs: nnx.Rngs = None):
        nn = TorchWrapper(rngs, dtype=dtype)
        # no asymmetric padding in torch conv, must do it ourselves
        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)

    def __call__(self, x: Tensor):
        # pad = (0, 1, 0, 1)
        # x = nn.functional.pad(x, pad, mode="constant", value=0)
        x = jnp.pad(x, ((0, 0), (0, 1), (0, 1), (0, 0)), mode="constant")
        x = self.conv(x)
        return x


class Upsample(nnx.Module):
    def __init__(self, in_channels: int, dtype=jnp.float32, rngs: nnx.Rngs = None):
        nn = TorchWrapper(rngs, dtype=dtype)
        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)

    def __call__(self, x: Tensor):
        # x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
        B, H, W, C = x.shape
        x = jax.image.resize(x, (B, H * 2, W * 2, C), method="nearest")
        x = self.conv(x)
        return x


ResnetBlock_class, Downsample_class, Upsample_class, AttnBlock_class = ResnetBlock, Downsample, Upsample, AttnBlock


class Encoder(nnx.Module):
    def __init__(
        self,
        resolution: int,
        in_channels: int,
        ch: int,
        ch_mult: list[int],
        num_res_blocks: int,
        z_channels: int,
        dtype=jnp.float32,
        rngs: nnx.Rngs = None,
    ):
        nn = TorchWrapper(rngs, dtype=dtype)
        ResnetBlock, Downsample, Upsample, AttnBlock = nn.declare_with_rng(ResnetBlock_class, Downsample_class, Upsample_class, AttnBlock_class)
        self.ch = ch
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        # downsampling
        self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)

        curr_res = resolution
        in_ch_mult = (1,) + tuple(ch_mult)
        self.in_ch_mult = in_ch_mult
        self.down = nn.ModuleList()
        block_in = self.ch
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = ch * in_ch_mult[i_level]
            block_out = ch * ch_mult[i_level]
            for _ in range(self.num_res_blocks):
                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
                block_in = block_out
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level != self.num_resolutions - 1:
                down.downsample = Downsample(block_in)
                curr_res = curr_res // 2
            self.down.append(down)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
        self.mid.attn_1 = AttnBlock(block_in)
        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)

        # end
        self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
        self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)

    def __call__(self, x: Tensor) -> Tensor:
        # downsampling
        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks):
                h = self.down[i_level].block[i_block](hs[-1])
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
                hs.append(h)
            if i_level != self.num_resolutions - 1:
                hs.append(self.down[i_level].downsample(hs[-1]))

        # middle
        h = hs[-1]
        h = self.mid.block_1(h)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h)
        # end
        h = self.norm_out(h)
        h = swish(h)
        h = self.conv_out(h)
        return h


class Decoder(nnx.Module):
    def __init__(
        self,
        ch: int,
        out_ch: int,
        ch_mult: list[int],
        num_res_blocks: int,
        in_channels: int,
        resolution: int,
        z_channels: int,
        dtype=jnp.float32,
        rngs: nnx.Rngs = None,
    ):
        nn = TorchWrapper(rngs, dtype=dtype)
        ResnetBlock, Downsample, Upsample, AttnBlock = nn.declare_with_rng(ResnetBlock_class, Downsample_class, Upsample_class, AttnBlock_class)
        self.ch = ch
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.ffactor = 2 ** (self.num_resolutions - 1)

        # compute in_ch_mult, block_in and curr_res at lowest res
        block_in = ch * ch_mult[self.num_resolutions - 1]
        curr_res = resolution // 2 ** (self.num_resolutions - 1)
        self.z_shape = (1, z_channels, curr_res, curr_res)

        # z to block_in
        self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
        self.mid.attn_1 = AttnBlock(block_in)
        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)

        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            for _ in range(self.num_res_blocks + 1):
                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
                block_in = block_out
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                up.upsample = Upsample(block_in)
                curr_res = curr_res * 2
            self.up.insert(0, up)  # prepend to get consistent order

        # end
        self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
        self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)

    def __call__(self, z: Tensor) -> Tensor:
        # z to block_in
        h = self.conv_in(z)

        # middle
        h = self.mid.block_1(h)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h)

        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.num_res_blocks + 1):
                h = self.up[i_level].block[i_block](h)
                if len(self.up[i_level].attn) > 0:
                    h = self.up[i_level].attn[i_block](h)
            if i_level != 0:
                h = self.up[i_level].upsample(h)

        # end
        h = self.norm_out(h)
        h = swish(h)
        h = self.conv_out(h)
        return h


class DiagonalGaussian(nnx.Module):
    def __init__(self, sample: bool = True, chunk_dim: int = -1, dtype=jnp.float32, rngs: nnx.Rngs = None):
        self.sample = sample
        self.chunk_dim = chunk_dim
        self.rngs = rngs
        self.dtype = dtype

    def __call__(self, z: Tensor) -> Tensor:
        # mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
        mean, logvar = jnp.split(z, 2, axis=self.chunk_dim)
        if self.sample:
            # std = torch.exp(0.5 * logvar)
            # return mean + std * torch.randn_like(mean)
            std = jnp.exp(0.5 * logvar)
            return mean + std * jax.random.normal(self.rngs(), mean.shape)
        else:
            return mean
|
303 |
+
|
304 |
+
|
305 |
+
Encoder_class, Decoder_class, DiagonalGaussian_class = Encoder, Decoder, DiagonalGaussian
|
306 |
+
|
307 |
+
class AutoEncoder(nnx.Module):
|
308 |
+
def __init__(self, params: AutoEncoderParams, dtype=jnp.float32, rngs: nnx.Rngs = None):
|
309 |
+
nn = TorchWrapper(rngs, dtype=dtype)
|
310 |
+
Encoder, Decoder, DiagonalGaussian = nn.declare_with_rng(Encoder_class, Decoder_class, DiagonalGaussian_class)
|
311 |
+
self.encoder = Encoder(
|
312 |
+
resolution=params.resolution,
|
313 |
+
in_channels=params.in_channels,
|
314 |
+
ch=params.ch,
|
315 |
+
ch_mult=params.ch_mult,
|
316 |
+
num_res_blocks=params.num_res_blocks,
|
317 |
+
z_channels=params.z_channels,
|
318 |
+
)
|
319 |
+
self.decoder = Decoder(
|
320 |
+
resolution=params.resolution,
|
321 |
+
in_channels=params.in_channels,
|
322 |
+
ch=params.ch,
|
323 |
+
out_ch=params.out_ch,
|
324 |
+
ch_mult=params.ch_mult,
|
325 |
+
num_res_blocks=params.num_res_blocks,
|
326 |
+
z_channels=params.z_channels,
|
327 |
+
)
|
328 |
+
self.reg = DiagonalGaussian()
|
329 |
+
|
330 |
+
self.scale_factor = params.scale_factor
|
331 |
+
self.shift_factor = params.shift_factor
|
332 |
+
|
333 |
+
def encode(self, x: Tensor) -> Tensor:
|
334 |
+
z = self.reg(self.encoder(x))
|
335 |
+
z = self.scale_factor * (z - self.shift_factor)
|
336 |
+
return z
|
337 |
+
|
338 |
+
def decode(self, z: Tensor) -> Tensor:
|
339 |
+
z = z / self.scale_factor + self.shift_factor
|
340 |
+
return self.decoder(z)
|
341 |
+
|
342 |
+
def __call__(self, x: Tensor) -> Tensor:
|
343 |
+
return self.decode(self.encode(x))
|
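For orientation, a minimal sketch of the NHWC encode/decode round trip. Modules built through TorchWrapper start with placeholder parameters (see fake_init in flux/wrapper.py below), so real weights must be loaded first; the shapes assume the ch_mult=[1, 2, 4, 4], z_channels=16 config from flux/util.py.

# A sketch, not part of the committed files: round-trip an image through the AE.
import jax.numpy as jnp
from flux.util import load_ae

ae = load_ae("flux-schnell")                    # downloads ae.safetensors and fills in weights
x = jnp.zeros((1, 256, 256, 3), jnp.bfloat16)   # NHWC, unlike the torch original's NCHW
z = ae.encode(x)                                # (1, 32, 32, 16): 2**3 = 8x spatial downsample
x_rec = ae.decode(z)                            # back to (1, 256, 256, 3)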
flux/modules/conditioner.py
ADDED
@@ -0,0 +1,73 @@
from flax import nnx
import jax.numpy as jnp
from jax import Array as Tensor

from transformers import (FlaxCLIPTextModel, CLIPTokenizer, FlaxT5EncoderModel,
                          T5Tokenizer)


class HFEmbedder(nnx.Module):
    def __init__(self, version: str, max_length: int, **hf_kwargs):
        self.is_clip = version.startswith("openai")
        self.max_length = max_length
        self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
        dtype = hf_kwargs.get("dtype", jnp.float32)
        if self.is_clip:
            self.tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(version, max_length=max_length)
            # self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(version, **hf_kwargs)
            self.hf_module: FlaxCLIPTextModel = FlaxCLIPTextModel.from_pretrained(version, **hf_kwargs)
        else:
            self.tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(version, max_length=max_length)
            # self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(version, **hf_kwargs)
            self.hf_module: FlaxT5EncoderModel = FlaxT5EncoderModel.from_pretrained(version, **hf_kwargs)
        if dtype == jnp.bfloat16:
            self.hf_module.params = self.hf_module.to_bf16(self.hf_module.params)

    def tokenize(self, text: list[str]) -> Tensor:
        batch_encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_length=False,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="jax",
        )
        return batch_encoding["input_ids"]

    def __call__(self, input_ids: Tensor) -> Tensor:
        # outputs = self.hf_module(
        #     input_ids=batch_encoding["input_ids"].to(self.hf_module.device),
        #     attention_mask=None,
        #     output_hidden_states=False,
        # )
        outputs = self.hf_module(
            input_ids=input_ids,
            attention_mask=None,
            output_hidden_states=False,
            train=False,
        )
        return outputs[self.output_key]

    # def __call__(self, text: list[str]) -> Tensor:
    #     batch_encoding = self.tokenizer(
    #         text,
    #         truncation=True,
    #         max_length=self.max_length,
    #         return_length=False,
    #         return_overflowing_tokens=False,
    #         padding="max_length",
    #         return_tensors="jax",
    #     )
    #     outputs = self.hf_module(
    #         input_ids=batch_encoding["input_ids"],
    #         attention_mask=None,
    #         output_hidden_states=False,
    #         train=False,
    #     )
    #     return outputs[self.output_key]
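A short usage sketch of HFEmbedder as wired up by load_t5 in flux/util.py below: tokenization runs on the host, embedding takes plain arrays, so only the latter has to live inside jit. The shapes assume the 256-token schnell setting and T5-XXL's 4096-dim hidden state; the encoder weights are downloaded on first use.

# A sketch, not part of the committed files.
from flux.util import load_t5

t5 = load_t5(max_length=256)                 # lnyan/t5-v1_1-xxl-encoder in bf16
tokens = t5.tokenize(["a photo of a cat"])   # (1, 256) integer input ids
emb = t5(tokens)                             # (1, 256, 4096) last_hidden_state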
flux/modules/layers.py
ADDED
@@ -0,0 +1,292 @@
import math
from dataclasses import dataclass

import jax
import jax.numpy as jnp
from jax import Array as Tensor
from flax import nnx
from einops import rearrange

from flux.wrapper import TorchWrapper
from flux.math import attention, rope


class EmbedND(nnx.Module):
    def __init__(self, dim: int, theta: int, axes_dim: list[int], dtype=jnp.float32, rngs: nnx.Rngs = None):
        self.dim = dim
        self.theta = theta
        self.axes_dim = axes_dim

    def __call__(self, ids: Tensor) -> Tensor:
        n_axes = ids.shape[-1]
        # emb = torch.cat(
        #     [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
        #     dim=-3,
        # )
        emb = jnp.concatenate(
            [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
            axis=-3,
        )

        # return emb.unsqueeze(1)
        return jnp.expand_dims(emb, 1)


def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
    """
    Create sinusoidal timestep embeddings.
    :param t: a 1-D Tensor of N indices, one per batch element.
        These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an (N, D) Tensor of positional embeddings.
    """
    t = time_factor * t
    half = dim // 2
    # freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
    #     t.device
    # )
    freqs = jnp.exp(-math.log(max_period) * jnp.arange(half, dtype=jnp.float32) / half)

    # args = t[:, None].float() * freqs[None]
    # embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
    args = t[:, None] * freqs[None]
    embedding = jnp.concatenate([jnp.cos(args), jnp.sin(args)], axis=-1)
    if dim % 2:
        # embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        embedding = jnp.concatenate([embedding, jnp.zeros_like(embedding[:, :1])], axis=-1)
    # if torch.is_floating_point(t):
    #     embedding = embedding.to(t)
    if jnp.issubdtype(t.dtype, jnp.floating):
        embedding = embedding.astype(t.dtype)
    return embedding

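A quick shape check for the sinusoidal embedding above (a sketch; dim=256 is an arbitrary choice for illustration):

import jax.numpy as jnp
from flux.modules.layers import timestep_embedding

t = jnp.array([0.25, 1.0])               # fractional timesteps are fine
emb = timestep_embedding(t, dim=256)     # (2, 256): 128 cosines followed by 128 sines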
class MLPEmbedder(nnx.Module):
    def __init__(self, in_dim: int, hidden_dim: int, dtype=jnp.float32, rngs: nnx.Rngs = None):
        nn = TorchWrapper(rngs=rngs, dtype=dtype)

        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
        self.silu = nn.SiLU()
        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)

    def __call__(self, x: Tensor) -> Tensor:
        return self.out_layer(self.silu(self.in_layer(x)))


class RMSNorm(nnx.Module):
    def __init__(self, dim: int, dtype=jnp.float32, rngs: nnx.Rngs = None):
        nn = TorchWrapper(rngs=rngs, dtype=dtype)
        # self.scale = nn.Parameter(torch.ones(dim))
        self.scale = nn.Parameter(jnp.ones((dim,)))

    def __call__(self, x: Tensor):
        x_dtype = x.dtype
        # x = x.float()
        x = x.astype(jnp.float32)
        # rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
        rrms = jax.lax.rsqrt(jnp.mean(x**2, axis=-1, keepdims=True) + 1e-6)
        # return (x * rrms).to(dtype=x_dtype) * self.scale
        # note: cast back to the caller's dtype, not x.dtype (x was upcast to float32 above)
        return (x * rrms).astype(x_dtype) * self.scale


RMSNorm_class = RMSNorm


class QKNorm(nnx.Module):
    def __init__(self, dim: int, dtype=jnp.float32, rngs: nnx.Rngs = None):
        nn = TorchWrapper(rngs=rngs, dtype=dtype)
        RMSNorm = nn.declare_with_rng(RMSNorm_class)
        self.query_norm = RMSNorm(dim)
        self.key_norm = RMSNorm(dim)

    def __call__(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
        q = self.query_norm(q)
        k = self.key_norm(k)
        # return q.to(v), k.to(v)
        return q.astype(v.dtype), k.astype(v.dtype)


QKNorm_class = QKNorm


class SelfAttention(nnx.Module):
    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, dtype=jnp.float32, rngs: nnx.Rngs = None):
        nn = TorchWrapper(rngs=rngs, dtype=dtype)
        QKNorm = nn.declare_with_rng(QKNorm_class)
        self.num_heads = num_heads
        head_dim = dim // num_heads

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.norm = QKNorm(head_dim)
        self.proj = nn.Linear(dim, dim)

    def __call__(self, x: Tensor, pe: Tensor) -> Tensor:
        qkv = self.qkv(x)
        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
        q, k = self.norm(q, k, v)
        x = attention(q, k, v, pe=pe)
        x = self.proj(x)
        return x


@dataclass
class ModulationOut:
    shift: Tensor
    scale: Tensor
    gate: Tensor


class Modulation(nnx.Module):
    def __init__(self, dim: int, double: bool, dtype=jnp.float32, rngs: nnx.Rngs = None):
        nn = TorchWrapper(rngs=rngs, dtype=dtype)
        self.is_double = double
        self.multiplier = 6 if double else 3
        self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)

    def __call__(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
        # out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
        out = self.lin(nnx.silu(vec))[:, None, :]
        out = jnp.split(out, self.multiplier, axis=-1)
        return (
            ModulationOut(*out[:3]),
            ModulationOut(*out[3:]) if self.is_double else None,
        )


Modulation_class, SelfAttention_class = Modulation, SelfAttention


class DoubleStreamBlock(nnx.Module):
    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, dtype=jnp.float32, rngs: nnx.Rngs = None):
        nn = TorchWrapper(rngs=rngs, dtype=dtype)
        Modulation, SelfAttention = nn.declare_with_rng(Modulation_class, SelfAttention_class)
        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.img_mod = Modulation(hidden_size, double=True)
        self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)

        self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
            nn.GELU(approximate="tanh"),
            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
        )

        self.txt_mod = Modulation(hidden_size, double=True)
        self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)

        self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
            nn.GELU(approximate="tanh"),
            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
        )

    def __call__(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> tuple[Tensor, Tensor]:
        img_mod1, img_mod2 = self.img_mod(vec)
        txt_mod1, txt_mod2 = self.txt_mod(vec)

        # prepare image for attention
        img_modulated = self.img_norm1(img)
        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
        img_qkv = self.img_attn.qkv(img_modulated)
        img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)

        # prepare txt for attention
        txt_modulated = self.txt_norm1(txt)
        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
        txt_qkv = self.txt_attn.qkv(txt_modulated)
        txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)

        # run actual attention
        # q = torch.cat((txt_q, img_q), dim=2)
        # k = torch.cat((txt_k, img_k), dim=2)
        # v = torch.cat((txt_v, img_v), dim=2)
        q = jnp.concatenate((txt_q, img_q), axis=2)
        k = jnp.concatenate((txt_k, img_k), axis=2)
        v = jnp.concatenate((txt_v, img_v), axis=2)

        attn = attention(q, k, v, pe=pe)
        txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]

        # calculate the img blocks
        img = img + img_mod1.gate * self.img_attn.proj(img_attn)
        img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)

        # calculate the txt blocks
        txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
        txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
        return img, txt


class SingleStreamBlock(nnx.Module):
    """
    A DiT block with parallel linear layers as described in
    https://arxiv.org/abs/2302.05442 and adapted modulation interface.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qk_scale: float | None = None,
        dtype=jnp.float32,
        rngs: nnx.Rngs = None,
    ):
        nn = TorchWrapper(rngs=rngs, dtype=dtype)
        QKNorm, Modulation = nn.declare_with_rng(QKNorm_class, Modulation_class)
        self.hidden_dim = hidden_size
        self.num_heads = num_heads
        head_dim = hidden_size // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
        # qkv and mlp_in
        self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
        # proj and mlp_out
        self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)

        self.norm = QKNorm(head_dim)

        self.hidden_size = hidden_size
        self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)

        self.mlp_act = nn.GELU(approximate="tanh")
        self.modulation = Modulation(hidden_size, double=False)

    def __call__(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
        mod, _ = self.modulation(vec)
        x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
        # qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
        qkv, mlp = jnp.split(self.linear1(x_mod), [3 * self.hidden_size], axis=-1)

        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
        q, k = self.norm(q, k, v)

        # compute attention
        attn = attention(q, k, v, pe=pe)
        # compute activation in mlp stream, cat again and run second linear layer
        # output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
        output = self.linear2(jnp.concatenate((attn, self.mlp_act(mlp)), axis=2))
        return x + mod.gate * output


class LastLayer(nnx.Module):
    def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=jnp.float32, rngs: nnx.Rngs = None):
        nn = TorchWrapper(rngs=rngs, dtype=dtype)
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))

    def __call__(self, x: Tensor, vec: Tensor) -> Tensor:
        # shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
        shift, scale = jnp.split(self.adaLN_modulation(vec), 2, axis=1)
        x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
        x = self.linear(x)
        return x
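To make the Modulation plumbing above concrete, here is the split it performs with double=True, written out in plain jnp ops (a sketch; hidden_size=3072 is taken from the configs in flux/util.py below):

import jax.numpy as jnp

out = jnp.zeros((1, 1, 6 * 3072))        # lin(silu(vec))[:, None, :] with double=True
chunks = jnp.split(out, 6, axis=-1)      # six (1, 1, 3072) tensors
mod1 = chunks[:3]                        # (shift, scale, gate) for the attention path
mod2 = chunks[3:]                        # (shift, scale, gate) for the MLP path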
flux/sampling.py
ADDED
@@ -0,0 +1,227 @@
import math
from typing import Callable

from einops import rearrange, repeat

import jax
import jax.numpy as jnp
from jax import Array as Tensor
from flax import nnx

from flux.model import Flux
from flux.modules.conditioner import HFEmbedder


def get_noise(
    num_samples: int,
    height: int,
    width: int,
    device,
    dtype: jnp.dtype,
    seed: int,
):
    # return torch.randn(
    #     num_samples,
    #     16,
    #     # allow for packing
    #     2 * math.ceil(height / 16),
    #     2 * math.ceil(width / 16),
    #     device=device,
    #     dtype=dtype,
    #     generator=torch.Generator(device=device).manual_seed(seed),
    # )
    key = jax.random.key(seed)
    return jax.random.normal(
        key,
        (
            num_samples,
            2 * math.ceil(height / 16),
            2 * math.ceil(width / 16),
            16,
        ),
        dtype=dtype,
    )


def prepare_tokens(t5: HFEmbedder, clip: HFEmbedder, prompt: str | list[str]) -> tuple[Tensor, Tensor]:
    if isinstance(prompt, str):
        prompt = [prompt]
    t5_tokens = t5.tokenize(prompt)
    clip_tokens = clip.tokenize(prompt)
    return t5_tokens, clip_tokens


def prepare(t5: HFEmbedder, clip: HFEmbedder, img: Tensor, t5_tokens: Tensor, clip_tokens: Tensor) -> dict[str, Tensor]:
    # bs, c, h, w = img.shape
    bs, h, w, c = img.shape

    if bs == 1:
        bs = t5_tokens.shape[0]

    # img = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
    img = rearrange(img, "b (h ph) (w pw) c -> b (h w) (c ph pw)", ph=2, pw=2)
    if img.shape[0] == 1 and bs > 1:
        img = repeat(img, "1 ... -> bs ...", bs=bs)

    # img_ids = torch.zeros(h // 2, w // 2, 3)
    img_ids = jnp.zeros((h // 2, w // 2, 3))
    # img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2)[:, None]
    # img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2)[None, :]
    img_ids = img_ids.at[..., 1].set(img_ids[..., 1] + jnp.arange(h // 2)[:, None])
    img_ids = img_ids.at[..., 2].set(img_ids[..., 2] + jnp.arange(w // 2)[None, :])
    img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)

    txt = t5(t5_tokens)
    if txt.shape[0] == 1 and bs > 1:
        txt = repeat(txt, "1 ... -> bs ...", bs=bs)
    # txt_ids = torch.zeros(bs, txt.shape[1], 3)
    txt_ids = jnp.zeros((bs, txt.shape[1], 3))

    vec = clip(clip_tokens)
    if vec.shape[0] == 1 and bs > 1:
        vec = repeat(vec, "1 ... -> bs ...", bs=bs)

    return {
        "img": img,
        "img_ids": img_ids,
        "txt": txt,
        "txt_ids": txt_ids,
        "vec": vec,
    }


def time_shift(mu: float, sigma: float, t: Tensor):
    return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)


def get_lin_function(
    x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
) -> Callable[[float], float]:
    m = (y2 - y1) / (x2 - x1)
    b = y1 - m * x1
    return lambda x: m * x + b


def get_schedule(
    num_steps: int,
    image_seq_len: int,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
    shift: bool = True,
) -> Tensor:
    # extra step for zero
    # timesteps = torch.linspace(1, 0, num_steps + 1)
    timesteps = jnp.linspace(1, 0, num_steps + 1)

    # shifting the schedule to favor high timesteps for higher signal images
    if shift:
        # estimate mu based on linear estimation between two points
        mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
        timesteps = time_shift(mu, 1.0, timesteps)

    return timesteps


DEBUG = False


def denoise_for(
    model: Flux,
    # model input
    img: Tensor,
    img_ids: Tensor,
    txt: Tensor,
    txt_ids: Tensor,
    vec: Tensor,
    # sampling parameters
    timesteps: Tensor,
    guidance: float = 4.0,
):
    # this is ignored for schnell
    # guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
    guidance_vec = jnp.full((img.shape[0],), guidance, dtype=img.dtype)
    timesteps = timesteps.tolist()
    for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):
        # t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
        t_vec = jnp.full((img.shape[0],), t_curr, dtype=img.dtype)
        pred = model(
            img=img,
            img_ids=img_ids,
            txt=txt,
            txt_ids=txt_ids,
            y=vec,
            timesteps=t_vec,
            guidance=guidance_vec,
        )

        img = img + (t_prev - t_curr) * pred
    return img


# @nnx.jit
def denoise(
    model: Flux,
    # model input
    img: Tensor,
    img_ids: Tensor,
    txt: Tensor,
    txt_ids: Tensor,
    vec: Tensor,
    # sampling parameters
    timesteps: Tensor,
    guidance: float = 4.0,
):
    # this is ignored for schnell
    guidance_vec = jnp.full((img.shape[0],), guidance, dtype=img.dtype)

    @nnx.scan
    def scan_func(acc, t_prev):
        img, t_curr = acc
        dtype = img.dtype
        t_vec = jnp.full((img.shape[0],), t_curr, dtype=img.dtype)
        pred = model(
            img=img,
            img_ids=img_ids,
            txt=txt,
            txt_ids=txt_ids,
            y=vec,
            timesteps=t_vec,
            guidance=guidance_vec,
        )

        img = img + (t_prev - t_curr) * pred
        return (img.astype(dtype), t_prev), pred

    acc, pred = scan_func((img, timesteps[0]), timesteps[1:])
    return acc[0]


def unpack(x: Tensor, height: int, width: int) -> Tensor:
    # return rearrange(
    #     x,
    #     "b (h w) (c ph pw) -> b c (h ph) (w pw)",
    #     h=math.ceil(height / 16),
    #     w=math.ceil(width / 16),
    #     ph=2,
    #     pw=2,
    # )
    return rearrange(
        x,
        "b (h w) (c ph pw) -> b (h ph) (w pw) c",
        h=math.ceil(height / 16),
        w=math.ceil(width / 16),
        ph=2,
        pw=2,
    )
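The noise and schedule helpers above are pure functions and can be exercised without any model weights; a small sketch of the latent layout and the timestep schedule (the 512x512 size and 4 steps are arbitrary example values):

import jax.numpy as jnp
from flux.sampling import get_noise, get_schedule

x = get_noise(1, 512, 512, device=None, dtype=jnp.bfloat16, seed=0)
# x: (1, 64, 64, 16) -- NHWC latents, 2 * ceil(512 / 16) = 64 per side

ts = get_schedule(4, image_seq_len=(512 // 16) * (512 // 16), shift=False)
# 5 values from 1.0 down to 0.0; shift=True would warp them via time_shift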
flux/util.py
ADDED
@@ -0,0 +1,340 @@
import os
from dataclasses import dataclass

import numpy as np
import jax
from jax import Array as Tensor
import jax.numpy as jnp
from flax import nnx
import torch
from einops import rearrange
from huggingface_hub import hf_hub_download
from imwatermark import WatermarkEncoder
from safetensors.torch import load_file as load_sft

from flux.model import Flux, FluxParams
from flux.modules.autoencoder import AutoEncoder, AutoEncoderParams
from flux.modules.conditioner import HFEmbedder


@dataclass
class ModelSpec:
    params: FluxParams
    ae_params: AutoEncoderParams
    ckpt_path: str | None
    ae_path: str | None
    repo_id: str | None
    repo_flow: str | None
    repo_ae: str | None


configs = {
    "flux-dev": ModelSpec(
        repo_id="black-forest-labs/FLUX.1-dev",
        repo_flow="flux1-dev.safetensors",
        repo_ae="ae.safetensors",
        ckpt_path=os.getenv("FLUX_DEV"),
        params=FluxParams(
            in_channels=64,
            vec_in_dim=768,
            context_in_dim=4096,
            hidden_size=3072,
            mlp_ratio=4.0,
            num_heads=24,
            depth=19,
            depth_single_blocks=38,
            axes_dim=[16, 56, 56],
            theta=10_000,
            qkv_bias=True,
            guidance_embed=True,
        ),
        ae_path=os.getenv("AE"),
        ae_params=AutoEncoderParams(
            resolution=256,
            in_channels=3,
            ch=128,
            out_ch=3,
            ch_mult=[1, 2, 4, 4],
            num_res_blocks=2,
            z_channels=16,
            scale_factor=0.3611,
            shift_factor=0.1159,
        ),
    ),
    "flux-schnell": ModelSpec(
        repo_id="black-forest-labs/FLUX.1-schnell",
        repo_flow="flux1-schnell.safetensors",
        repo_ae="ae.safetensors",
        ckpt_path=os.getenv("FLUX_SCHNELL"),
        params=FluxParams(
            in_channels=64,
            vec_in_dim=768,
            context_in_dim=4096,
            hidden_size=3072,
            mlp_ratio=4.0,
            num_heads=24,
            depth=19,
            depth_single_blocks=38,
            axes_dim=[16, 56, 56],
            theta=10_000,
            qkv_bias=True,
            guidance_embed=False,
        ),
        ae_path=os.getenv("AE"),
        ae_params=AutoEncoderParams(
            resolution=256,
            in_channels=3,
            ch=128,
            out_ch=3,
            ch_mult=[1, 2, 4, 4],
            num_res_blocks=2,
            z_channels=16,
            scale_factor=0.3611,
            shift_factor=0.1159,
        ),
    ),
}


try:
    import ml_dtypes
    # reinterpret the raw bf16 bits instead of a lossy round trip through float32
    from_torch_bf16 = lambda x: jnp.asarray(x.view(dtype=torch.uint16).numpy().view(ml_dtypes.bfloat16))
except ImportError:
    from_torch_bf16 = lambda x: jnp.asarray(x.float().numpy()).astype(jnp.bfloat16)


def load_from_torch(graph, state, state_dict: dict):
    cnt = 0
    torch_cnt = 0
    flax_cnt = 0
    val_cnt = 0
    print(f"Torch states: #{len(state_dict)}; Flax states: #{len(state.flat_state())}")

    def convert_to_jax(tensor):
        if tensor.dtype == torch.bfloat16:
            return from_torch_bf16(tensor)
        else:
            return jnp.asarray(tensor.numpy())

    for key in sorted(state_dict.keys()):
        ptr = state
        node = graph
        torch_cnt += 1
        try:
            for loc in key.split(".")[:-1]:
                if loc.isnumeric():
                    if "layers" in ptr:
                        ptr = ptr["layers"]
                        node = node.subgraphs["layers"]
                    loc = int(loc)
                ptr = ptr[loc]
                node = node.subgraphs[loc]
            last = key.split(".")[-1]
            if last not in ptr._mapping.keys():
                ptr_keys = list(ptr._mapping.keys())
                ptr_keys = list(filter(lambda x: x != "bias", ptr_keys))
                if len(ptr_keys) == 1:
                    ptr_key = ptr_keys[0]
                elif last == "weight" and "kernel" in ptr_keys:
                    ptr_key = "kernel"
                else:
                    cnt += 1
                    raise Exception(f"Mismatched: {key}: {ptr_keys}")
                val = ptr[ptr_key].value
            else:
                if isinstance(ptr[last], jax.Array):
                    val = ptr[last]
                else:
                    val = ptr[last].value
                ptr_key = last
                assert state_dict[key].shape == val.shape, f"{key} mismatched"

            if isinstance(ptr[ptr_key], jax.Array):
                assert state_dict[key].shape == val.shape, f"Array: [{node.type}]mismatched {state_dict[key].shape} {val.shape}"
                kernel = convert_to_jax(state_dict[key])
                val_cnt += 1
                continue
            elif ptr_key == "bias":
                assert state_dict[key].shape == val.shape, f"Bias: [{node.type}]mismatched {state_dict[key].shape} {val.shape}"
                kernel = nnx.Param(convert_to_jax(state_dict[key])).to_state()
            else:
                if 'kernel_size' in node.attributes:
                    # conv kernels: torch stores OIHW, flax expects HWIO
                    kernel = convert_to_jax(state_dict[key])
                    if len(kernel.shape) == 3:
                        kernel = jnp.transpose(kernel, (2, 1, 0))
                    elif len(kernel.shape) == 4:
                        kernel = jnp.transpose(kernel, (2, 3, 1, 0))
                    elif len(kernel.shape) == 5:
                        kernel = jnp.transpose(kernel, (2, 3, 4, 1, 0))
                elif 'dot_general' in node.attributes:
                    # linear weights: torch stores (out, in), flax expects (in, out)
                    kernel = convert_to_jax(state_dict[key])
                    kernel = jnp.transpose(kernel, (1, 0))
                else:
                    kernel = convert_to_jax(state_dict[key])
                assert val.shape == kernel.shape, f"[{node.type}]mismatched {val.shape} {kernel.shape}"
                kernel = nnx.Param(kernel).to_state()
            ptr._mapping[ptr_key] = kernel
            flax_cnt += 1
        except Exception as e:
            print(e, f"{key}")
    print(cnt, torch_cnt, flax_cnt, val_cnt)
    return state


def load_state_dict(model, state_dict):
    graph, state = nnx.split(model)
    state = load_from_torch(graph, state, state_dict)
    nnx.update(model, state)
    return model


def print_load_warning(missing: list[str], unexpected: list[str]) -> None:
    if len(missing) > 0 and len(unexpected) > 0:
        print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
        print("\n" + "-" * 79 + "\n")
        print(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))
    elif len(missing) > 0:
        print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
    elif len(unexpected) > 0:
        print(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))


def patch_dtype(model, dtype, patch_param=False):
    for path, module in model.iter_modules():
        if hasattr(module, "dtype") and (module.dtype is None or jnp.issubdtype(module.dtype, jnp.floating)):
            module.dtype = dtype
        if patch_param:
            if hasattr(module, "param_dtype") and jnp.issubdtype(module.param_dtype, jnp.floating):
                module.param_dtype = dtype
    if not patch_param:
        return model
    for path, parent in nnx.iter_graph(model):
        if isinstance(parent, nnx.Module):
            for name, value in vars(parent).items():
                if isinstance(value, nnx.Variable) and value.value is None:
                    pass
                elif isinstance(value, nnx.Variable):
                    if jnp.issubdtype(value.value.dtype, jnp.floating):
                        value.value = value.value.astype(dtype)
                elif isinstance(value, jax.Array):
                    if jnp.issubdtype(value.dtype, jnp.floating):
                        parent.__setattr__(name, value.astype(dtype))
    return model


def load_flow_model(name: str, device: str = "none", hf_download: bool = True):
    # Loading Flux
    print("Init model")
    ckpt_path = configs[name].ckpt_path
    if (
        ckpt_path is None
        and configs[name].repo_id is not None
        and configs[name].repo_flow is not None
        and hf_download
    ):
        ckpt_path = hf_hub_download(configs[name].repo_id, configs[name].repo_flow)

    # with torch.device("meta" if ckpt_path is not None else device):
    model = Flux(configs[name].params, dtype=jnp.bfloat16, rngs=nnx.Rngs(0))
    model = patch_dtype(model, jnp.bfloat16)
    if ckpt_path is not None:
        print("Loading checkpoint")
        # load_sft doesn't support torch.device
        sd = load_sft(ckpt_path, device="cpu")
        model = load_state_dict(model, sd)
    return model


def load_t5(device: str = "none", max_length: int = 512) -> HFEmbedder:
    # max length 64, 128, 256 and 512 should work (if your sequence is short enough)
    return HFEmbedder("lnyan/t5-v1_1-xxl-encoder", max_length=max_length, dtype=jnp.bfloat16)


def load_clip(device: str = "none") -> HFEmbedder:
    return HFEmbedder("openai/clip-vit-large-patch14", max_length=77, dtype=jnp.bfloat16)


def load_ae(name: str, device: str = "none", hf_download: bool = True) -> AutoEncoder:
    ckpt_path = configs[name].ae_path
    if (
        ckpt_path is None
        and configs[name].repo_id is not None
        and configs[name].repo_ae is not None
        and hf_download
    ):
        ckpt_path = hf_hub_download(configs[name].repo_id, configs[name].repo_ae)

    # Loading the autoencoder
    print("Init AE")
    ae = AutoEncoder(configs[name].ae_params, dtype=jnp.bfloat16, rngs=nnx.Rngs(0))
    ae = patch_dtype(ae, jnp.bfloat16)

    if ckpt_path is not None:
        sd = load_sft(ckpt_path, device="cpu")
        ae = load_state_dict(ae, sd)
    return ae


class WatermarkEmbedder:
    def __init__(self, watermark):
        self.watermark = watermark
        self.num_bits = len(WATERMARK_BITS)
        self.encoder = WatermarkEncoder()
        self.encoder.set_watermark("bits", self.watermark)

    def __call__(self, image: Tensor) -> Tensor:
        """
        Adds a predefined watermark to the input image

        Args:
            image: ([N,] B, H, W, RGB) in range [-1, 1]

        Returns:
            same as input but watermarked
        """
        image = 0.5 * image + 0.5
        squeeze = len(image.shape) == 4
        if squeeze:
            image = image[None, ...]
        n = image.shape[0]
        # image_np = rearrange((255 * image).detach().cpu(), "n b c h w -> (n b) h w c").numpy()[:, :, :, ::-1]
        image_np = np.array(rearrange((255 * image), "n b h w c -> (n b) h w c"))[:, :, :, ::-1]

        # jax (n, b, h, w, c) in [0, 1] -> numpy (n*b, h, w, c) in [0, 255]
        # the watermarking library expects input in cv2 BGR format
        for k in range(image_np.shape[0]):
            image_np[k] = self.encoder.encode(image_np[k], "dwtDct")
        image = jnp.asarray(rearrange(image_np[:, :, :, ::-1], "(n b) h w c -> n b h w c", n=n))
        # image = torch.clamp(image / 255, min=0.0, max=1.0)
        image = jnp.clip(image / 255, min=0.0, max=1.0)
        if squeeze:
            image = image[0]
        image = 2 * image - 1
        return image


# A fixed 48-bit message that was chosen at random
WATERMARK_MESSAGE = 0b001010101111111010000111100111001111010100101110
# bin(x)[2:] gives bits of x as str, use int to convert them to 0/1
WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]]
embed_watermark = WatermarkEmbedder(WATERMARK_BITS)
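Putting the loaders together, as app.py does in get_models (a sketch; the first run downloads many gigabytes of checkpoints from Hugging Face):

from flux.util import load_ae, load_clip, load_flow_model, load_t5

t5 = load_t5(max_length=256)             # 256 tokens is the schnell setting in app.py
clip = load_clip()
model = load_flow_model("flux-schnell")
ae = load_ae("flux-schnell")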
flux/wrapper.py
ADDED
@@ -0,0 +1,139 @@
# Copyright 2024 Lnyan (https://github.com/lkwq007). All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from functools import partial

import numpy as np
import jax
import jax.numpy as jnp
from jax import Array as Tensor
import flax
from flax import nnx
import flax.linen


def fake_init(key, feature_shape, param_dtype):
    # return a shape/dtype placeholder instead of allocating parameters;
    # real weights are filled in later by load_state_dict in flux/util.py
    return jax.ShapeDtypeStruct(feature_shape, param_dtype)


def wrap_LayerNorm(dim, *, eps=1e-5, elementwise_affine=True, bias=True, rngs: nnx.Rngs):
    return nnx.LayerNorm(dim, epsilon=eps, use_bias=elementwise_affine and bias, use_scale=elementwise_affine, bias_init=fake_init, scale_init=fake_init, rngs=rngs)


def wrap_Linear(dim, inner_dim, *, bias=True, rngs: nnx.Rngs):
    return nnx.Linear(dim, inner_dim, use_bias=bias, kernel_init=fake_init, bias_init=fake_init, rngs=rngs)


def wrap_GroupNorm(num_groups, num_channels, *, eps=1e-5, affine=True, rngs: nnx.Rngs):
    return nnx.GroupNorm(num_channels, num_groups=num_groups, epsilon=eps, use_bias=affine, use_scale=affine, bias_init=fake_init, scale_init=fake_init, rngs=rngs)


def wrap_Conv(in_channels, out_channels, kernel_size, *, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros', rngs: nnx.Rngs, conv_dim: int):
    if isinstance(kernel_size, int):
        kernel_tuple = (kernel_size,) * conv_dim
    else:
        assert len(kernel_size) == conv_dim
        kernel_tuple = kernel_size
    return nnx.Conv(in_channels, out_channels, kernel_tuple, strides=stride, padding=padding, use_bias=bias, kernel_init=fake_init, bias_init=fake_init, rngs=rngs)
    # return nnx.Conv(in_channels, out_channels, kernel_tuple, stride=stride, padding=padding, dilation=dilation, feature_group_count=groups, use_bias=bias, rngs=rngs)


class nn_GELU(nnx.Module):
    def __init__(self, approximate="none") -> None:
        self.approximate = approximate == "tanh"

    def __call__(self, x):
        return nnx.gelu(x, approximate=self.approximate)


class nn_SiLU(nnx.Module):
    def __init__(self) -> None:
        pass

    def __call__(self, x):
        return nnx.silu(x)


class nn_AvgPool(nnx.Module):
    def __init__(self, window_shape, strides=None, padding="VALID") -> None:
        self.window_shape = window_shape
        self.strides = strides
        self.padding = padding

    def __call__(self, x):
        return flax.linen.avg_pool(x, window_shape=self.window_shape, strides=self.strides, padding=self.padding)


# a wrapper class exposing a torch.nn-like surface over Flax NNX
class TorchWrapper:
    def __init__(self, rngs: nnx.Rngs, dtype=jnp.float32):
        self.rngs = rngs
        self.dtype = dtype

    def declare_with_rng(self, *args):
        ret = list(map(lambda f: partial(f, dtype=self.dtype, rngs=self.rngs), args))
        return ret if len(ret) > 1 else ret[0]

    def conv_nd(self, dims, *args, **kwargs):
        return wrap_Conv(*args, **kwargs, rngs=self.rngs, conv_dim=dims)

    def avg_pool(self, *args, **kwargs):
        return nn_AvgPool(*args, **kwargs)

    def linear(self, *args, **kwargs):
        return self.Linear(*args, **kwargs)

    def SiLU(self):
        return nn_SiLU()

    def GELU(self, approximate="none"):
        return nn_GELU(approximate)

    def Identity(self):
        return lambda x: x

    def LayerNorm(self, dim, eps=1e-5, elementwise_affine=True, bias=True):
        return wrap_LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine, bias=bias, rngs=self.rngs)

    def GroupNorm(self, *args, **kwargs):
        return wrap_GroupNorm(*args, **kwargs, rngs=self.rngs)

    def Linear(self, *args, **kwargs):
        return wrap_Linear(*args, **kwargs, rngs=self.rngs)

    def Parameter(self, value):
        return nnx.Param(value)

    def Dropout(self, p):
        return nnx.Dropout(rate=p, rngs=self.rngs)

    def Sequential(self, *args):
        return nnx.Sequential(*args)

    def Conv1d(self, *args, **kwargs):
        return wrap_Conv(*args, **kwargs, rngs=self.rngs, conv_dim=1)

    def Conv2d(self, *args, **kwargs):
        return wrap_Conv(*args, **kwargs, rngs=self.rngs, conv_dim=2)

    def Conv3d(self, *args, **kwargs):
        return wrap_Conv(*args, **kwargs, rngs=self.rngs, conv_dim=3)

    def ModuleList(self, lst=None):
        if lst is None:
            return []
        return list(lst)

    def Module(self, *args, **kwargs):
        return nnx.Dict()
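A small illustration of the wrapper's meta-initialization trick: layers are declared with torch-style arguments, but fake_init leaves each parameter as a jax.ShapeDtypeStruct placeholder until load_state_dict in flux/util.py swaps in the converted torch weights. A sketch:

import jax.numpy as jnp
from flax import nnx
from flux.wrapper import TorchWrapper

nn = TorchWrapper(rngs=nnx.Rngs(0), dtype=jnp.float32)
lin = nn.Linear(16, 32, bias=True)    # an nnx.Linear under the hood
print(lin.kernel.value)               # ShapeDtypeStruct((16, 32), float32) -- no memory allocated yet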
requirements.txt
ADDED
@@ -0,0 +1,14 @@
jax[cuda12]
flax==0.9.0
flash_attn_jax
torch
torchvision
opencv-python-headless
einops
huggingface_hub
transformers
tokenizers
sentencepiece
fire
invisible-watermark
ml-dtypes