Trained for 0 epochs and 500 steps.

Trained with datasets ['text-embeds', 'mj-v6']
Learning rate 8e-06, batch size 32, and 4 gradient accumulation steps.
Used DDPM noise scheduler for training with epsilon prediction type and rescaled_betas_zero_snr=False
Using 'trailing' timestep spacing.
Base model: toilaluan/SoraT2I
VAE: madebyollin/sdxl-vae-fp16-fix

Files changed (9) hide show

.gitattributes +1 -0
README.md +266 -0
optimizer.bin +3 -0
random_states_0.pkl +3 -0
scheduler.bin +3 -0
training_state-mj-v6.json +3 -0
training_state.json +1 -0
transformer/config.json +30 -0
transformer/diffusion_pytorch_model.safetensors +3 -0

.gitattributes CHANGED Viewed

@@ -66,3 +66,4 @@ assets/image_6_0.png filter=lfs diff=lfs merge=lfs -text
 assets/image_7_0.png filter=lfs diff=lfs merge=lfs -text
 assets/image_8_0.png filter=lfs diff=lfs merge=lfs -text
 assets/image_9_0.png filter=lfs diff=lfs merge=lfs -text
+training_state-mj-v6.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,266 @@

+---
+license: creativeml-openrail-m
+base_model: "toilaluan/SoraT2I"
+tags:
+  - stable-diffusion
+  - stable-diffusion-diffusers
+  - text-to-image
+  - diffusers
+  - full
+inference: true
+widget:
+- text: 'unconditional (blank prompt)'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_0_0.png
+- text: 'a woman sitting on the grass'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_1_0.png
+- text: 'a professional photo headshot of a man in studio lighting'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_2_0.png
+- text: 'a person holding a sign that reads ''SOON'''
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_3_0.png
+- text: 'Alien marketplace, bizarre creatures, exotic goods, vibrant colors, otherworldly atmosphere'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_4_0.png
+- text: 'Child holding a balloon, happy expression, colorful balloons, sunny day, high detail'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_5_0.png
+- text: 'a 4-panel comic strip showing an orange cat saying the words ''HELP'' and ''LASAGNA'''
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_6_0.png
+- text: 'a hand is holding a comic book with a cover that reads ''The Adventures of Superhero'''
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_7_0.png
+- text: 'Underground cave filled with crystals, glowing lights, reflective surfaces, fantasy environment, high detail'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_8_0.png
+- text: 'Bustling cyberpunk bazaar, vendors, neon signs, advanced tech, crowded, high detail'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_9_0.png
+- text: 'Cyberpunk hacker in a dark room, neon glow, multiple screens, intense focus, high detail'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_10_0.png
+- text: 'a cybernetic anne of green gables with neural implant and bio mech augmentations'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_11_0.png
+- text: 'Post-apocalyptic cityscape, ruined buildings, overgrown vegetation, dark and gritty, high detail'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_12_0.png
+- text: 'Magical castle in a lush forest, glowing windows, fantasy architecture, high resolution, detailed textures'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_13_0.png
+- text: 'Ruins of an ancient temple in an enchanted forest, glowing runes, mystical creatures, high detail'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_14_0.png
+- text: 'Mystical forest, glowing plants, fairies, magical creatures, fantasy art, high detail'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_15_0.png
+- text: 'Magical garden with glowing flowers, fairies, serene atmosphere, detailed plants, high resolution'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_16_0.png
+- text: 'Whimsical garden filled with fairies, magical plants, sparkling lights, serene atmosphere, high detail'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_17_0.png
+- text: 'Majestic dragon soaring through the sky, detailed scales, dynamic pose, fantasy art, high resolution'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_18_0.png
+- text: 'Fantasy world, floating islands in the sky, waterfalls, lush vegetation, detailed landscape, high resolution'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_19_0.png
+- text: 'Futuristic city skyline at night, neon lights, cyberpunk style, high contrast, sharp focus'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_20_0.png
+- text: 'Space battle scene, starships fighting, laser beams, explosions, cosmic background'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_21_0.png
+- text: 'Abandoned fairground at night, eerie rides, ghostly figures, fog, dark atmosphere, high detail'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_22_0.png
+- text: 'Spooky haunted mansion on a hill, dark and eerie, glowing windows, ghostly atmosphere, high detail'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_23_0.png
+- text: 'a hardcover physics textbook that is called PHYSICS FOR DUMMIES'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_24_0.png
+- text: 'Epic medieval battle, knights in armor, dynamic action, detailed landscape, high resolution'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_25_0.png
+- text: 'Bustling medieval market with merchants, knights, and jesters, vibrant colors, detailed'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_26_0.png
+- text: 'Cozy medieval tavern, warm firelight, adventurers drinking, detailed interior, rustic atmosphere'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_27_0.png
+- text: 'Futuristic city skyline at night, neon lights, cyberpunk style, high contrast, sharp focus'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_28_0.png
+- text: 'Forest with neon-lit trees, glowing plants, bioluminescence, surreal atmosphere, high detail'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_29_0.png
+- text: 'Bright neon sign in a busy city street, ''Open 24 Hours'', bold typography, glowing lights'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_30_0.png
+- text: 'Retro diner sign, ''Joe''s Diner'', classic 1950s design, neon lights, weathered look'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_31_0.png
+- text: 'Vintage store sign with elaborate typography, ''Antique Shop'', hand-painted, weathered look'
+  parameters:
+    negative_prompt: 'blurry, cropped, ugly'
+  output:
+    url: ./assets/image_32_0.png
+---
+# pixart-sora-t2i
+This is a full-rank fine-tune derived from [toilaluan/SoraT2I](https://huggingface.co/toilaluan/SoraT2I).
+No validation prompt was used during training.
+## Validation settings
+- CFG: `7.5`
+- CFG Rescale: `0.0`
+- Steps: `30`
+- Sampler: `euler`
+- Seed: `42`
+- Resolution: `1024`
+Note: The validation settings are not necessarily the same as the [training settings](#training-settings).
+You can find some example images in the following gallery:
+<Gallery />
+The text encoder **was not** trained.
+You may reuse the base model text encoder for inference.
+## Training settings
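+- Training epochs: 0 (500 steps at an effective batch size of 128 cover 64,000 of the 134,144 images, so the first epoch was not completed)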
+- Training steps: 500
+- Learning rate: 8e-06
+- Effective batch size: 128
+  - Micro-batch size: 32
+  - Gradient accumulation steps: 4
+  - Number of GPUs: 1
+- Prediction type: epsilon
+- Rescaled betas zero SNR: False
+- Optimizer: AdamW, stochastic bf16
+- Precision: Pure BF16
+- Xformers: Enabled
+## Datasets
+### mj-v6
+- Repeats: 0
+- Total number of images: 134144
+- Total number of aspect buckets: 1
+- Resolution: 1.0 megapixels
+- Cropped: False
+- Crop style: None
+- Crop aspect: None
+## Inference
+```python
+import torch
+from diffusers import DiffusionPipeline
+model_id = "pixart-sora-t2i"
+prompt = "An astronaut is riding a horse through the jungles of Thailand."
+negative_prompt = "malformed, disgusting, overexposed, washed-out"
+pipeline = DiffusionPipeline.from_pretrained(model_id)
+pipeline.to('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
+image = pipeline(
+    prompt=prompt,
+    negative_prompt='blurry, cropped, ugly',
+    num_inference_steps=30,
+    generator=torch.Generator(device='cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu').manual_seed(1641421826),
+    width=1152,
+    height=768,
+    guidance_scale=7.5,
+    guidance_rescale=0.0,
+).images[0]
+image.save("output.png", format="PNG")
+```

optimizer.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:60221f59d90258d0dbe2c074d7a9b84e30fcc5f3acaa8ee6ac6ca882eb0c231f
+size 3665677155

random_states_0.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5fdf07135febc8e555d2c56eea3c39ea245eafd412d6ec7a4ff4ffa4728e9def
+size 14344

scheduler.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:57feaeea732a8232dc14923ac8e8cff564f2d6d11728d1405a7f3cfc02efb7ed
+size 1000

training_state-mj-v6.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b10925e346b4ff56ccc9febfaf6d9cece752b1e439082211e8080f57349f7b3d
+size 15443107

training_state.json ADDED Viewed

	@@ -0,0 +1 @@


+{"global_step": 500, "epoch_step": 500, "epoch": 1, "exhausted_backends": [], "repeats": {}}

transformer/config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "_class_name": "PixArtTransformer2DModel",
+  "_diffusers_version": "0.29.0",
+  "_name_or_path": "toilaluan/SoraT2I",
+  "activation_fn": "gelu-approximate",
+  "attention_bias": true,
+  "attention_head_dim": 72,
+  "attention_type": "default",
+  "caption_channels": 4096,
+  "cross_attention_dim": 1152,
+  "double_self_attention": false,
+  "dropout": 0.0,
+  "in_channels": 4,
+  "interpolation_scale": 2,
+  "norm_elementwise_affine": false,
+  "norm_eps": 1e-06,
+  "norm_num_groups": 32,
+  "norm_type": "ada_norm_single",
+  "num_attention_heads": 16,
+  "num_embeds_ada_norm": 1000,
+  "num_layers": 28,
+  "num_vector_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 8,
+  "patch_size": 2,
+  "sample_size": 128,
+  "upcast_attention": false,
+  "use_additional_conditions": false,
+  "use_linear_projection": false
+}

transformer/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ea4fbe7ee4344fc745d218a639508db68b2484f5693affa92b341f90b477eb7
+size 1221780352