chaitnya26 TencentOpen committed on
Commit 22ee710 · verified · 0 Parent(s):

Duplicate from tencent/HunyuanImage-3.0


Co-authored-by: TencentOpen <TencentOpen@users.noreply.huggingface.co>

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +40 -0
  2. LICENSE +78 -0
  3. README.md +502 -0
  4. __init__.py +0 -0
  5. assets/WECHAT.md +6 -0
  6. assets/banner.png +3 -0
  7. assets/banner_all.jpg +3 -0
  8. assets/framework.png +3 -0
  9. assets/gsb.png +3 -0
  10. assets/logo.png +3 -0
  11. assets/pg_imgs/image1.png +3 -0
  12. assets/pg_imgs/image2.png +3 -0
  13. assets/pg_imgs/image3.png +3 -0
  14. assets/pg_imgs/image4.png +3 -0
  15. assets/pg_imgs/image5.png +3 -0
  16. assets/pg_imgs/image6.png +3 -0
  17. assets/pg_imgs/image7.png +3 -0
  18. assets/pg_imgs/image8.png +3 -0
  19. assets/robot.png +3 -0
  20. assets/ssae_side_by_side_comparison.png +3 -0
  21. assets/ssae_side_by_side_heatmap.png +3 -0
  22. assets/user.png +3 -0
  23. assets/wechat.png +3 -0
  24. autoencoder_kl_3d.py +793 -0
  25. config.json +273 -0
  26. configuration_hunyuan.py +285 -0
  27. generation_config.json +20 -0
  28. hunyuan.py +0 -0
  29. hunyuan_image_3_pipeline.py +879 -0
  30. image_processor.py +125 -0
  31. model-0001-of-0032.safetensors +3 -0
  32. model-0002-of-0032.safetensors +3 -0
  33. model-0003-of-0032.safetensors +3 -0
  34. model-0004-of-0032.safetensors +3 -0
  35. model-0005-of-0032.safetensors +3 -0
  36. model-0006-of-0032.safetensors +3 -0
  37. model-0007-of-0032.safetensors +3 -0
  38. model-0008-of-0032.safetensors +3 -0
  39. model-0009-of-0032.safetensors +3 -0
  40. model-0010-of-0032.safetensors +3 -0
  41. model-0011-of-0032.safetensors +3 -0
  42. model-0012-of-0032.safetensors +3 -0
  43. model-0013-of-0032.safetensors +3 -0
  44. model-0014-of-0032.safetensors +3 -0
  45. model-0015-of-0032.safetensors +3 -0
  46. model-0016-of-0032.safetensors +3 -0
  47. model-0017-of-0032.safetensors +3 -0
  48. model-0018-of-0032.safetensors +3 -0
  49. model-0019-of-0032.safetensors +3 -0
  50. model-0020-of-0032.safetensors +3 -0
.gitattributes ADDED
@@ -0,0 +1,40 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ assets/banner_all.jpg filter=lfs diff=lfs merge=lfs -text
38
+ *.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/**/*.png filter=lfs diff=lfs merge=lfs -text
40
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,78 @@
1
+ TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT
2
+ Tencent Hunyuan Image 3.0 Release Date: September 28, 2025
3
+ THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
4
+ By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying any portion or element of the Tencent Hunyuan Works, including via any Hosted Service, You will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
5
+ 1. DEFINITIONS.
6
+ a. “Acceptable Use Policy” shall mean the policy made available by Tencent as set forth in the Exhibit A.
7
+ b. “Agreement” shall mean the terms and conditions for use, reproduction, distribution, modification, performance and displaying of Tencent Hunyuan Works or any portion or element thereof set forth herein.
8
+ c. “Documentation” shall mean the specifications, manuals and documentation for Tencent Hunyuan made publicly available by Tencent.
9
+ d. “Hosted Service” shall mean a hosted service offered via an application programming interface (API), web access, or any other electronic or remote means.
10
+ e. “Licensee,” “You” or “Your” shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Tencent Hunyuan Works for any purpose and in any field of use.
11
+ f. “Materials” shall mean, collectively, Tencent’s proprietary Tencent Hunyuan and Documentation (and any portion thereof) as made available by Tencent under this Agreement.
12
+ g. “Model Derivatives” shall mean all: (i) modifications to Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; (ii) works based on Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Tencent Hunyuan or any Model Derivative of Tencent Hunyuan, to that model in order to cause that model to perform similarly to Tencent Hunyuan or a Model Derivative of Tencent Hunyuan, including distillation methods, methods that use intermediate data representations, or methods based on the generation of synthetic data Outputs by Tencent Hunyuan or a Model Derivative of Tencent Hunyuan for training that model. For clarity, Outputs by themselves are not deemed Model Derivatives.
13
+ h. “Output” shall mean the information and/or content output of Tencent Hunyuan or a Model Derivative that results from operating or otherwise using Tencent Hunyuan or a Model Derivative, including via a Hosted Service.
14
+ i. “Tencent,” “We” or “Us” shall mean the applicable entity or entities in the Tencent corporate family that own(s) intellectual property or other rights embodied in or utilized by the Materials.
15
+ j. “Tencent Hunyuan” shall mean the large language models, text/image/video/audio/3D generation models, and multimodal large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Us, including, without limitation to, Tencent Hunyuan Image 3.0 released at [
16
+ https://github.com/Tencent-Hunyuan/HunyuanImage-3.0;https://huggingface.co/tencent/HunyuanImage-3.0;https://huggingface.co/tencent/HunyuanImage-3.0-Instruct;https://modelscope.cn/models/Tencent-Hunyuan/HunyuanImage-3.0/;https://ai.gitcode.com/tencent_hunyuan/HunyuanImage-3.0].
17
+ k. “Tencent Hunyuan Works” shall mean: (i) the Materials; (ii) Model Derivatives; and (iii) all derivative works thereof.
18
+ l. “Territory” shall mean the worldwide territory, excluding the territory of the European Union, United Kingdom and South Korea.
19
+ m. “Third Party” or “Third Parties” shall mean individuals or legal entities that are not under common control with Us or You.
20
+ n. “including” shall mean including but not limited to.
21
+ 2. GRANT OF RIGHTS.
22
+ We grant You, for the Territory only, a non-exclusive, non-transferable and royalty-free limited license under Tencent’s intellectual property or other rights owned by Us embodied in or utilized by the Materials to use, reproduce, distribute, create derivative works of (including Model Derivatives), and make modifications to the Materials, only in accordance with the terms of this Agreement and the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of this Agreement or the Acceptable Use Policy.
23
+ 3. DISTRIBUTION.
24
+ You may, subject to Your compliance with this Agreement, distribute or make available to Third Parties the Tencent Hunyuan Works, exclusively in the Territory, provided that You meet all of the following conditions:
25
+ a. You must provide all such Third Party recipients of the Tencent Hunyuan Works or products or services using them a copy of this Agreement;
26
+ b. You must cause any modified files to carry prominent notices stating that You changed the files;
27
+ c. You are encouraged to: (i) publish at least one technology introduction blogpost or one public statement expressing Your experience of using the Tencent Hunyuan Works; and (ii) mark the products or services developed by using the Tencent Hunyuan Works to indicate that the product/service is “Powered by Tencent Hunyuan”; and
28
+ d. All distributions to Third Parties (other than through a Hosted Service) must be accompanied by a “Notice” text file that contains the following notice: “Tencent Hunyuan is licensed under the Tencent Hunyuan Community License Agreement, Copyright © 2025 Tencent. All Rights Reserved. The trademark rights of “Tencent Hunyuan” are owned by Tencent or its affiliate.”
29
+ You may add Your own copyright statement to Your modifications and, except as set forth in this Section and in Section 5, may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Model Derivatives as a whole, provided Your use, reproduction, modification, distribution, performance and display of the work otherwise complies with the terms and conditions of this Agreement (including as regards the Territory). If You receive Tencent Hunyuan Works from a Licensee as part of an integrated end user product, then this Section 3 of this Agreement will not apply to You.
30
+ 4. ADDITIONAL COMMERCIAL TERMS.
31
+ If, on the Tencent Hunyuan version release date, the monthly active users of all products or services made available by or for Licensee is greater than 100 million monthly active users in the preceding calendar month, You must request a license from Tencent, which Tencent may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until Tencent otherwise expressly grants You such rights.
32
+ 5. RULES OF USE.
33
+ a. Your use of the Tencent Hunyuan Works must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Tencent Hunyuan Works, which is hereby incorporated by reference into this Agreement. You must include the use restrictions referenced in these Sections 5(a) and 5(b) as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Tencent Hunyuan Works and You must provide notice to subsequent users to whom You distribute that Tencent Hunyuan Works are subject to the use restrictions in these Sections 5(a) and 5(b).
34
+ b. You must not use the Tencent Hunyuan Works or any Output or results of the Tencent Hunyuan Works to improve any other AI model (other than Tencent Hunyuan or Model Derivatives thereof).
35
+ c. You must not use, reproduce, modify, distribute, or display the Tencent Hunyuan Works, Output or results of the Tencent Hunyuan Works outside the Territory. Any such use outside the Territory is unlicensed and unauthorized under this Agreement.
36
+ 6. INTELLECTUAL PROPERTY.
37
+ a. Subject to Tencent’s ownership of Tencent Hunyuan Works made by or for Tencent and intellectual property rights therein, conditioned upon Your compliance with the terms and conditions of this Agreement, as between You and Tencent, You will be the owner of any derivative works and modifications of the Materials and any Model Derivatives that are made by or for You.
38
+ b. No trademark licenses are granted under this Agreement, and in connection with the Tencent Hunyuan Works, Licensee may not use any name or mark owned by or associated with Tencent or any of its affiliates, except as required for reasonable and customary use in describing and distributing the Tencent Hunyuan Works. Tencent hereby grants You a license to use “Tencent Hunyuan” (the “Mark”) in the Territory solely as required to comply with the provisions of Section 3(c), provided that You comply with any applicable laws related to trademark protection. All goodwill arising out of Your use of the Mark will inure to the benefit of Tencent.
39
+ c. If You commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any person or entity alleging that the Materials or any Output, or any portion of any of the foregoing, infringe any intellectual property or other right owned or licensable by You, then all licenses granted to You under this Agreement shall terminate as of the date such lawsuit or other proceeding is filed. You will defend, indemnify and hold harmless Us from and against any claim by any Third Party arising out of or related to Your or the Third Party’s use or distribution of the Tencent Hunyuan Works.
40
+ d. Tencent claims no rights in Outputs You generate. You and Your users are solely responsible for Outputs and their subsequent uses.
41
+ 7. DISCLAIMERS OF WARRANTY AND LIMITATIONS OF LIABILITY.
42
+ a. We are not obligated to support, update, provide training for, or develop any further version of the Tencent Hunyuan Works or to grant any license thereto.
43
+ b. UNLESS AND ONLY TO THE EXTENT REQUIRED BY APPLICABLE LAW, THE TENCENT HUNYUAN WORKS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED “AS IS” WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND INCLUDING ANY WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, COURSE OF DEALING, USAGE OF TRADE, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR OR A THIRD PARTY’S USE OR DISTRIBUTION OF ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
44
+ c. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL TENCENT OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, FOR ANY DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS, EVEN IF TENCENT OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
45
+ 8. SURVIVAL AND TERMINATION.
46
+ a. The term of this Agreement shall commence upon Your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
47
+ b. We may terminate this Agreement if You breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, You must promptly delete and cease use of the Tencent Hunyuan Works. Sections 6(a), 6(c), 7 and 9 shall survive the termination of this Agreement.
48
+ 9. GOVERNING LAW AND JURISDICTION.
49
+ a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of the Hong Kong Special Administrative Region of the People’s Republic of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
50
+ b. Exclusive jurisdiction and venue for any dispute arising out of or relating to this Agreement will be a court of competent jurisdiction in the Hong Kong Special Administrative Region of the People’s Republic of China, and Tencent and Licensee consent to the exclusive jurisdiction of such court with respect to any such dispute.
51
+
52
+ EXHIBIT A
53
+ ACCEPTABLE USE POLICY
54
+
55
+ Tencent reserves the right to update this Acceptable Use Policy from time to time.
56
+ Last modified: November 5, 2024
57
+
58
+ Tencent endeavors to promote safe and fair use of its tools and features, including Tencent Hunyuan. You agree not to use Tencent Hunyuan or Model Derivatives:
59
+ 1. Outside the Territory;
60
+ 2. In any way that violates any applicable national, federal, state, local, international or any other law or regulation;
61
+ 3. To harm Yourself or others;
62
+ 4. To repurpose or distribute output from Tencent Hunyuan or any Model Derivatives to harm Yourself or others;
63
+ 5. To override or circumvent the safety guardrails and safeguards We have put in place;
64
+ 6. For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
65
+ 7. To generate or disseminate verifiably false information and/or content with the purpose of harming others or influencing elections;
66
+ 8. To generate or facilitate false online engagement, including fake reviews and other means of fake online engagement;
67
+ 9. To intentionally defame, disparage or otherwise harass others;
68
+ 10. To generate and/or disseminate malware (including ransomware) or any other content to be used for the purpose of harming electronic systems;
69
+ 11. To generate or disseminate personal identifiable information with the purpose of harming others;
70
+ 12. To generate or disseminate information (including images, code, posts, articles), and place the information in any public context (including through the use of bot-generated tweets), without expressly and conspicuously identifying that the information and/or content is machine generated;
71
+ 13. To impersonate another individual without consent, authorization, or legal right;
72
+ 14. To make high-stakes automated decisions in domains that affect an individual’s safety, rights or wellbeing (e.g., law enforcement, migration, medicine/health, management of critical infrastructure, safety components of products, essential services, credit, employment, housing, education, social scoring, or insurance);
73
+ 15. In a manner that violates or disrespects the social ethics and moral standards of other countries or regions;
74
+ 16. To perform, facilitate, threaten, incite, plan, promote or encourage violent extremism or terrorism;
75
+ 17. For any use intended to discriminate against or harm individuals or groups based on protected characteristics or categories, online or offline social behavior or known or predicted personal or personality characteristics;
76
+ 18. To intentionally exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
77
+ 19. For military purposes;
78
+ 20. To engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or other professional practices.
README.md ADDED
@@ -0,0 +1,502 @@
1
+ ---
2
+ license: other
3
+ license_name: tencent-hunyuan-community
4
+ license_link: LICENSE
5
+ pipeline_tag: text-to-image
6
+ library_name: transformers
7
+ ---
8
+
9
+ <div align="center">
10
+
11
+ <img src="./assets/logo.png" alt="HunyuanImage-3.0 Logo" width="600">
12
+
13
+ # 🎨 HunyuanImage-3.0: A Powerful Native Multimodal Model for Image Generation
14
+
15
+ </div>
16
+
17
+
18
+ <div align="center">
19
+ <img src="./assets/banner.png" alt="HunyuanImage-3.0 Banner" width="800">
20
+
21
+ </div>
22
+
23
+ <div align="center">
24
+ <a href=https://hunyuan.tencent.com/image target="_blank"><img src=https://img.shields.io/badge/Official%20Site-333399.svg?logo=homepage height=22px></a>
25
+ <a href=https://huggingface.co/tencent/HunyuanImage-3.0 target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Models-d96902.svg height=22px></a>
26
+ <a href=https://github.com/Tencent-Hunyuan/HunyuanImage-3.0 target="_blank"><img src= https://img.shields.io/badge/Page-bb8a2e.svg?logo=github height=22px></a>
27
+ <a href=https://arxiv.org/pdf/2509.23951 target="_blank"><img src=https://img.shields.io/badge/Report-b5212f.svg?logo=arxiv height=22px></a>
28
+ <a href=https://x.com/TencentHunyuan target="_blank"><img src=https://img.shields.io/badge/Hunyuan-black.svg?logo=x height=22px></a>
29
+ <a href=https://docs.qq.com/doc/DUVVadmhCdG9qRXBU target="_blank"><img src=https://img.shields.io/badge/📚-PromptHandBook-blue.svg?logo=book height=22px></a>
30
+ </div>
31
+
32
+
33
+ <p align="center">
34
+ 👏 Join our <a href="./assets/WECHAT.md" target="_blank">WeChat</a> and <a href="https://discord.gg/ehjWMqF5wY">Discord</a> |
35
+ 💻 <a href="https://hunyuan.tencent.com/modelSquare/home/play?modelId=289&from=/visual">Official website (官网): Try our model!</a>&nbsp;&nbsp;
36
+ </p>
37
+
38
+ ## 🔥🔥🔥 News
39
+ - **September 28, 2025**: 📖 **HunyuanImage-3.0 Technical Report Released** - Comprehensive technical documentation now available
40
+ - **September 28, 2025**: 🚀 **HunyuanImage-3.0 Open Source Release** - Inference code and model weights publicly available
41
+
42
+
43
+ ## 🧩 Community Contributions
44
+
45
+ If you develop or use HunyuanImage-3.0 in your projects, please let us know.
46
+
47
+ ## 📑 Open-source Plan
48
+
49
+ - HunyuanImage-3.0 (Image Generation Model)
50
+ - [x] Inference
51
+ - [x] HunyuanImage-3.0 Checkpoints
52
+ - [ ] HunyuanImage-3.0-Instruct Checkpoints (with reasoning)
53
+ - [ ] vLLM Support
54
+ - [ ] Distilled Checkpoints
55
+ - [ ] Image-to-Image Generation
56
+ - [ ] Multi-turn Interaction
57
+
58
+
59
+ ## 🗂️ Contents
60
+ - [🔥🔥🔥 News](#-news)
61
+ - [🧩 Community Contributions](#-community-contributions)
62
+ - [📑 Open-source Plan](#-open-source-plan)
63
+ - [📖 Introduction](#-introduction)
64
+ - [✨ Key Features](#-key-features)
65
+ - [🛠️ Dependencies and Installation](#-dependencies-and-installation)
66
+ - [💻 System Requirements](#-system-requirements)
67
+ - [📦 Environment Setup](#-environment-setup)
68
+ - [📥 Install Dependencies](#-install-dependencies)
69
+ - [Performance Optimizations](#performance-optimizations)
70
+ - [🚀 Usage](#-usage)
71
+ - [🔥 Quick Start with Transformers](#-quick-start-with-transformers)
72
+ - [🏠 Local Installation & Usage](#-local-installation--usage)
73
+ - [🎨 Interactive Gradio Demo](#-interactive-gradio-demo)
74
+ - [🧱 Model Cards](#-model-cards)
75
+ - [📝 Prompt Guide](#-prompt-guide)
76
+ - [Manually Writing Prompts](#manually-writing-prompts)
77
+ - [System Prompt For Automatic Rewriting the Prompt](#system-prompt-for-automatic-rewriting-the-prompt)
78
+ - [Advanced Tips](#advanced-tips)
79
+ - [More Cases](#more-cases)
80
+ - [📊 Evaluation](#-evaluation)
81
+ - [📚 Citation](#-citation)
82
+ - [🙏 Acknowledgements](#-acknowledgements)
83
+ - [🌟🚀 GitHub Star History](#-github-star-history)
84
+
85
+ ---
86
+
87
+ ## 📖 Introduction
88
+
89
+ **HunyuanImage-3.0** is a groundbreaking native multimodal model that unifies multimodal understanding and generation within an autoregressive framework. Our text-to-image module achieves performance **comparable to or surpassing** leading closed-source models.
90
+
91
+
92
+ <div align="center">
93
+ <img src="./assets/framework.png" alt="HunyuanImage-3.0 Framework" width="90%">
94
+ </div>
95
+
96
+ ## ✨ Key Features
97
+
98
+ * 🧠 **Unified Multimodal Architecture:** Moving beyond the prevalent DiT-based architectures, HunyuanImage-3.0 employs a unified autoregressive framework. This design enables a more direct and integrated modeling of text and image modalities, leading to surprisingly effective and contextually rich image generation.
99
+
100
+ * 🏆 **The Largest Image Generation MoE Model:** This is the largest open-source image generation Mixture of Experts (MoE) model to date. It features 64 experts and a total of 80 billion parameters, with 13 billion activated per token, significantly enhancing its capacity and performance.
101
+
102
+ * 🎨 **Superior Image Generation Performance:** Through rigorous dataset curation and advanced reinforcement learning post-training, we've achieved an optimal balance between semantic accuracy and visual excellence. The model demonstrates exceptional prompt adherence while delivering photorealistic imagery with stunning aesthetic quality and fine-grained details.
103
+
104
+ * 💭 **Intelligent World-Knowledge Reasoning:** The unified multimodal architecture endows HunyuanImage-3.0 with powerful reasoning capabilities. It leverages its extensive world knowledge to intelligently interpret user intent, automatically elaborating on sparse prompts with contextually appropriate details to produce superior, more complete visual outputs.
105
+
106
+
107
+ ## 🛠️ Dependencies and Installation
108
+
109
+ ### 💻 System Requirements
110
+
111
+ * 🖥️ **Operating System:** Linux
112
+ * 🎮 **GPU:** NVIDIA GPU with CUDA support
113
+ * 💾 **Disk Space:** 170GB for model weights
114
+ * 🧠 **GPU Memory:** ≥3×80GB (4×80GB recommended for better performance)
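+ 
+ As a rough sanity check (not an official figure), the weight memory follows from the parameter count, assuming 2-byte bf16/fp16 weights; activations, KV cache, and framework overhead come on top, which is why several 80GB GPUs are recommended:
+ 
+ ```python
+ # Back-of-the-envelope estimate only: 80B parameters at 2 bytes each (bf16/fp16).
+ total_params = 80e9
+ weight_gib = total_params * 2 / 1024**3
+ print(f"weights alone: ~{weight_gib:.0f} GiB")  # ~149 GiB, broadly consistent with the 170GB on-disk size
+ print(f"spread over 3 x 80GB GPUs: ~{weight_gib / 3:.0f} GiB each")  # ~50 GiB per GPU, before activations
+ ```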
115
+
116
+ ### 📦 Environment Setup
117
+
118
+ * 🐍 **Python:** 3.12+ (recommended and tested)
119
+ * 🔥 **PyTorch:** 2.7.1
120
+ * ⚡ **CUDA:** 12.8
121
+
122
+ ### 📥 Install Dependencies
123
+
124
+ ```bash
125
+ # 1. First install PyTorch (CUDA 12.8 Version)
126
+ pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu128
127
+
128
+ # 2. Then install tencentcloud-sdk
129
+ pip install -i https://mirrors.tencent.com/pypi/simple/ --upgrade tencentcloud-sdk-python
130
+
131
+ # 3. Then install other dependencies
132
+ pip install -r requirements.txt
133
+ ```
134
+
135
+ #### Performance Optimizations
136
+
137
+ For **up to 3x faster inference**, install these optimizations:
138
+
139
+ ```bash
140
+ # FlashAttention for faster attention computation
141
+ pip install flash-attn==2.8.3 --no-build-isolation
142
+
143
+ # FlashInfer for optimized MoE inference. v0.3.1 has been tested.
144
+ pip install flashinfer-python
145
+ ```
146
+ > 💡**Installation Tips:** It is critical that the CUDA version used by PyTorch matches the system's CUDA version.
147
+ > FlashInfer relies on this compatibility when compiling kernels at runtime. PyTorch 2.7.1+cu128 has been tested.
148
+ > GCC version >=9 is recommended for compiling FlashAttention and FlashInfer.
149
+
150
+ > ⚡ **Performance Tips:** These optimizations can significantly speed up your inference!
151
+
152
+ > 💡**Note:** When FlashInfer is enabled, the first inference may be slower (about 10 minutes) due to kernel compilation. Subsequent inferences on the same machine will be much faster.
153
+
154
+ ## 🚀 Usage
155
+
156
+ ### 🔥 Quick Start with Transformers
157
+
158
+ #### 1️⃣ Download model weights
159
+
160
+ ```bash
161
+ # Download from HuggingFace and rename the directory.
162
+ # Note that the directory name should not contain dots, which can cause issues when loading with Transformers.
163
+ hf download tencent/HunyuanImage-3.0 --local-dir ./HunyuanImage-3
164
+ ```
165
+
166
+ #### 2️⃣ Run with Transformers
167
+
168
+ ```python
169
+ from transformers import AutoModelForCausalLM
170
+
171
+ # Load the model
172
+ model_id = "./HunyuanImage-3"
173
+ # Currently we cannot load the model using the HF model_id `tencent/HunyuanImage-3.0` directly
174
+ # due to the dot in the name.
175
+
176
+ kwargs = dict(
177
+ attn_implementation="sdpa", # Use "flash_attention_2" if FlashAttention is installed
178
+ trust_remote_code=True,
179
+ torch_dtype="auto",
180
+ device_map="auto",
181
+ moe_impl="eager", # Use "flashinfer" if FlashInfer is installed
182
+ )
183
+
184
+ model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
185
+ model.load_tokenizer(model_id)
186
+
187
+ # generate the image
188
+ prompt = "A brown and white dog is running on the grass"
189
+ image = model.generate_image(prompt=prompt, stream=True)
190
+ image.save("image.png")
191
+ ```
192
+
193
+ ### 🏠 Local Installation & Usage
194
+
195
+ #### 1️⃣ Clone the Repository
196
+
197
+ ```bash
198
+ git clone https://github.com/Tencent-Hunyuan/HunyuanImage-3.0.git
199
+ cd HunyuanImage-3.0/
200
+ ```
201
+
202
+ #### 2️⃣ Download Model Weights
203
+
204
+ ```bash
205
+ # Download from HuggingFace
206
+ hf download tencent/HunyuanImage-3.0 --local-dir ./HunyuanImage-3
207
+ ```
208
+
209
+ #### 3️⃣ Run the Demo
210
+ The Pretrain Checkpoint does not automatically rewrite or enhance input prompts. For optimal results at this stage, we recommend using DeepSeek to rewrite prompts; you can apply for an API key at [Tencent Cloud](https://cloud.tencent.com/document/product/1772/115963#.E5.BF.AB.E9.80.9F.E6.8E.A5.E5.85.A5).
211
+
212
+ ```bash
213
+ # set env
214
+ export DEEPSEEK_KEY_ID="your_deepseek_key_id"
215
+ export DEEPSEEK_KEY_SECRET="your_deepseek_key_secret"
216
+
217
+ python3 run_image_gen.py --model-id ./HunyuanImage-3 --verbose 1 --sys-deepseek-prompt "universal" --prompt "A brown and white dog is running on the grass"
218
+ ```
219
+
220
+ #### 4️⃣ Command Line Arguments
221
+
222
+ | Arguments | Description | Default |
223
+ | ----------------------- | ------------------------------------------------------------ | ----------- |
224
+ | `--prompt` | Input prompt | (Required) |
225
+ | `--model-id` | Model path | (Required) |
226
+ | `--attn-impl` | Attention implementation. Either `sdpa` or `flash_attention_2`. | `sdpa` |
227
+ | `--moe-impl` | MoE implementation. Either `eager` or `flashinfer` | `eager` |
228
+ | `--seed` | Random seed for image generation | `None` |
229
+ | `--diff-infer-steps` | Diffusion infer steps | `50` |
230
+ | `--image-size` | Image resolution. Can be `auto`, an explicit size like `1280x768`, or an aspect ratio like `16:9` | `auto` |
231
+ | `--save` | Image save path. | `image.png` |
232
+ | `--verbose` | Verbose level. 0: No log; 1: log inference information. | `0` |
233
+ | `--rewrite` | Whether to enable rewriting | `1` |
234
+ | `--sys-deepseek-prompt` | Select sys-prompt from `universal` or `text_rendering` | `universal` |
235
+
236
+ ### 🎨 Interactive Gradio Demo
237
+
238
+ Launch an interactive web interface for easy text-to-image generation.
239
+
240
+ #### 1️⃣ Install Gradio
241
+
242
+ ```bash
243
+ pip install "gradio>=4.21.0"
244
+ ```
245
+
246
+ #### 2️⃣ Configure Environment
247
+
248
+ ```bash
249
+ # Set your model path
250
+ export MODEL_ID="path/to/your/model"
251
+
252
+ # Optional: Configure GPU usage (default: 0,1,2,3)
253
+ export GPUS="0,1,2,3"
254
+
255
+ # Optional: Configure host and port (default: 0.0.0.0:443)
256
+ export HOST="0.0.0.0"
257
+ export PORT="443"
258
+ ```
259
+
260
+ #### 3️⃣ Launch the Web Interface
261
+
262
+ **Basic Launch:**
263
+ ```bash
264
+ sh run_app.sh
265
+ ```
266
+
267
+ **With Performance Optimizations:**
268
+ ```bash
269
+ # Use both optimizations for maximum performance
270
+ sh run_app.sh --moe-impl flashinfer --attn-impl flash_attention_2
271
+ ```
272
+
273
+ #### 4️⃣ Access the Interface
274
+
275
+ > 🌐 **Web Interface:** Open your browser and navigate to `http://localhost:443` (or your configured port)
276
+
277
+
278
+ ## 🧱 Model Cards
279
+
280
+ | Model | Params | Download | Recommended VRAM | Supported |
281
+ |---------------------------| --- | --- | --- | --- |
282
+ | HunyuanImage-3.0 | 80B total (13B active) | [HuggingFace](https://huggingface.co/tencent/HunyuanImage-3.0) | ≥ 3 × 80 GB | ✅ Text-to-Image
283
+ | HunyuanImage-3.0-Instruct | 80B total (13B active) | [HuggingFace](https://huggingface.co/tencent/HunyuanImage-3.0-Instruct) | ≥ 3 × 80 GB | ✅ Text-to-Image<br>✅ Prompt Self-Rewrite <br>✅ CoT Think
284
+
285
+
286
+
287
+ Notes:
288
+ - Install performance extras (FlashAttention, FlashInfer) for faster inference.
289
+ - Multi‑GPU inference is recommended for the Base model.
290
+
291
+
292
+ ## 📝 Prompt Guide
293
+
294
+ ### Manually Writing Prompts.
295
+ The Pretrain Checkpoint does not automatically rewrite or enhance input prompts, while the Instruct Checkpoint can rewrite and enhance them with thinking. For optimal results at this stage, we recommend consulting our official guide on how to write effective prompts.
296
+
297
+ Reference: [HunyuanImage 3.0 Prompt Handbook](
298
+ https://docs.qq.com/doc/DUVVadmhCdG9qRXBU)
299
+
300
+
301
+ ### System Prompt For Automatic Rewriting the Prompt.
302
+
303
+ We've included two system prompts in the PE folder of this repository that leverage DeepSeek to automatically enhance user inputs:
304
+
305
+ * **system_prompt_universal**: This system prompt converts photographic-style and artistic prompts into detailed ones.
306
+ * **system_prompt_text_rendering**: This system prompt converts UI/poster/text-rendering prompts into detailed ones that suit the model.
307
+
308
+ Note that these system prompts are in Chinese because DeepSeek works better with Chinese system prompts. If you want to use them with an English-oriented model, you may translate them into English or refer to the comments in the PE files as a guide.
309
+
310
+ We have also created a [Yuanqi workflow](https://yuanqi.tencent.com/agent/H69VgtJdj3Dz) that implements the universal prompt; you can try it directly.
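+ 
+ A minimal sketch of the rewrite-then-generate flow, reusing the `model` loaded in the Quick Start section and assuming the PE system prompt is available as a local file. The LLM call is left as a placeholder (the official demo uses DeepSeek via the Tencent Cloud SDK); the file name and helper below are illustrative, not part of the released API:
+ 
+ ```python
+ # Illustrative sketch: `call_llm` is a stub for whatever chat-completion client you use,
+ # and the PE file path is an assumption about the repository layout.
+ def call_llm(system_prompt: str, user_prompt: str) -> str:
+     raise NotImplementedError("plug in your own LLM client (e.g. DeepSeek) here")
+ 
+ with open("PE/system_prompt_universal.txt", encoding="utf-8") as f:
+     system_prompt = f.read()
+ 
+ user_prompt = "A brown and white dog is running on the grass"
+ rewritten_prompt = call_llm(system_prompt, user_prompt)   # detailed, enhanced prompt
+ image = model.generate_image(prompt=rewritten_prompt, stream=True)
+ image.save("image_rewritten.png")
+ ```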
311
+
312
+ ### Advanced Tips
313
+ - **Content Priority**: Focus on describing the main subject and action first, followed by details about the environment and style. A more general description framework is: **Main subject and scene + Image quality and style + Composition and perspective + Lighting and atmosphere + Technical parameters**. Keywords can be added both before and after this structure.
314
+
315
+ - **Image resolution**: Our model not only supports multiple resolutions but also offers both **automatic and specified resolution** options. In auto mode, the model automatically predicts the image resolution based on the input prompt. In specified mode (like traditional DiT), the model outputs an image resolution that strictly aligns with the user's chosen resolution.
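+ 
+ For illustration, the two modes might look like this in Python, assuming `generate_image` exposes the same option as the CLI's `--image-size` flag under the keyword `image_size` (an assumption, not a documented parameter name):
+ 
+ ```python
+ # Assumed keyword `image_size`, mirroring the CLI flag --image-size.
+ prompt = "A brown and white dog is running on the grass"
+ 
+ # Auto mode: the model predicts a suitable resolution from the prompt.
+ img_auto = model.generate_image(prompt=prompt, image_size="auto", stream=True)
+ 
+ # Specified mode: the output strictly follows the requested size or aspect ratio.
+ img_fixed = model.generate_image(prompt=prompt, image_size="1280x768", stream=True)
+ img_wide = model.generate_image(prompt=prompt, image_size="16:9", stream=True)
+ ```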
316
+
317
+ ### More Cases
318
+ Our model can follow complex instructions to generate high‑quality, creative images.
319
+
320
+ <div align="center">
321
+ <img src="./assets/banner_all.jpg" width=100% alt="HunyuanImage 3.0 Demo">
322
+ </div>
323
+
324
+ Our model can effectively process very long text inputs, enabling users to precisely control the finer details of generated images. Extended prompts allow for intricate elements to be accurately captured, making it ideal for complex projects requiring precision and creativity.
325
+
326
+ <p align="center">
327
+ <table>
328
+ <thead>
329
+ </thead>
330
+ <tbody>
331
+ <tr>
332
+ <td>
333
+ <img src="./assets/pg_imgs/image1.png" width=100%><details>
334
+ <summary>Show prompt</summary>
335
+ A cinematic medium shot captures a single Asian woman seated on a chair within a dimly lit room, creating an intimate and theatrical atmosphere. The composition is focused on the subject, rendered with rich colors and intricate textures that evoke a nostalgic and moody feeling.
336
+
337
+ The primary subject is a young Asian woman with a thoughtful and expressive countenance, her gaze directed slightly away from the camera. She is seated in a relaxed yet elegant posture on an ornate, vintage armchair. The chair is upholstered in a deep red velvet, its fabric showing detailed, intricate textures and slight signs of wear. She wears a simple, elegant dress in a dark teal hue, the material catching the light in a way that reveals its fine-woven texture. Her skin has a soft, matte quality, and the light delicately models the contours of her face and arms.
338
+
339
+ The surrounding room is characterized by its vintage decor, which contributes to the historic and evocative mood. In the immediate background, partially blurred due to a shallow depth of field consistent with a f/2.8 aperture, the wall is covered with wallpaper featuring a subtle, damask pattern. The overall color palette is a carefully balanced interplay of deep teal and rich red hues, creating a visually compelling and cohesive environment. The entire scene is detailed, from the fibers of the upholstery to the subtle patterns on the wall.
340
+
341
+ The lighting is highly dramatic and artistic, defined by high contrast and pronounced shadow play. A single key light source, positioned off-camera, projects gobo lighting patterns onto the scene, casting intricate shapes of light and shadow across the woman and the back wall. These dramatic shadows create a strong sense of depth and a theatrical quality. While some shadows are deep and defined, others remain soft, gently wrapping around the subject and preventing the loss of detail in darker areas. The soft focus on the background enhances the intimate feeling, drawing all attention to the expressive subject. The overall image presents a cinematic, photorealistic photography style.
342
+ </details>
343
+ </td>
344
+ <td><img src="./assets/pg_imgs/image2.png" width=100%><details>
345
+ <summary>Show prompt</summary>
346
+ A cinematic, photorealistic medium shot captures a high-contrast urban street corner, defined by the sharp intersection of light and shadow. The primary subject is the exterior corner of a building, rendered in a low-saturation, realistic style.
347
+
348
+ The building wall, which occupies the majority of the frame, is painted a warm orange with a finely detailed, rough stucco texture. Horizontal white stripes run across its surface. The base of the building is constructed from large, rough-hewn stone blocks, showing visible particles and texture. On the left, illuminated side of the building, there is a single window with closed, dark-colored shutters. Adjacent to the window, a simple black pendant lamp hangs from a thin, taut rope, casting a distinct, sharp-edged shadow onto the sunlit orange wall. The composition is split diagonally, with the right side of the building enveloped in a deep brown shadow. At the bottom of the frame, a smooth concrete sidewalk is visible, upon which the dynamic silhouette of a person is captured mid-stride, walking from right to left.
349
+
350
+ In the shallow background, the faint, out-of-focus outlines of another building and the bare, skeletal branches of trees are softly visible, contributing to the quiet urban atmosphere and adding a sense of depth to the scene. These elements are rendered with minimal detail to keep the focus on the foreground architecture.
351
+
352
+ The scene is illuminated by strong, natural sunlight originating from the upper left, creating a dramatic chiaroscuro effect. This hard light source casts deep, well-defined shadows, producing a sharp contrast between the brightly lit warm orange surfaces and the deep brown shadow areas. The lighting highlights the fine details in the wall texture and stone particles, emphasizing the photorealistic quality. The overall presentation reflects a high-quality photorealistic photography style, infused with a cinematic film noir aesthetic.
353
+ </details>
354
+ </td>
355
+ </tr>
356
+ <tr>
357
+ <td>
358
+ <img src="./assets/pg_imgs/image3.png" width=100%><details>
359
+ <summary>Show prompt</summary>
360
+ 一幅极具视觉张力的杂志封面风格人像特写。画面主体是一个身着古风汉服的人物,构图采用了从肩部以上的超级近距离特写,人物占据了画面的绝大部分,形成了强烈的视觉冲击力。
361
+
362
+ 画面中的人物以一种慵懒的姿态出现,微微倾斜着头部,裸露的一侧肩膀线条流畅。她正用一种妩媚而直接的眼神凝视着镜头,双眼微张,眼神深邃,传递出一种神秘而勾人的气质。人物的面部特征精致,皮肤质感细腻,在特定的光线下,面部轮廓清晰分明,展现出一种古典与现代融合的时尚美感。
363
+
364
+ 整个画面的背景被设定为一种简约而高级的纯红色。这种红色色调深沉,呈现出哑光质感,既纯粹又无任何杂质,为整个暗黑神秘的氛围奠定了沉稳而富有张力的基调。这个纯色的背景有效地突出了前景中的人物主体,使得所有视觉焦点都集中在其身上。
365
+
366
+ 光线和氛围的营造是这幅杂志风海报的关键。一束暗橘色的柔和光线作为主光源,从人物的一侧斜上方投射下来,精准地勾勒出人物的脸颊、鼻梁和肩膀的轮廓,在皮肤上形成微妙的光影过渡。同时,人物的周身萦绕着一层暗淡且低饱和度的银白色辉光,如同清冷的月光,形成一道朦胧的轮廓光。这道银辉为人物增添了几分疏离的幽灵感,强化了整体暗黑风格的神秘气质。光影的强烈对比与色彩的独特搭配,共同塑造了这张充满故事感的特写画面。整体图像呈现出一种融合了古典元素的现代时尚摄影风格。
367
+ </details>
368
+ </td>
369
+ <td>
370
+ <img src="./assets/pg_imgs/image4.png" width=100%><details>
371
+ <summary>Show prompt</summary>
372
+ 一幅采用极简俯视视角的油画作品,画面主体由一道居中斜向的红色笔触构成。
373
+
374
+ 这道醒目的红色笔触运用了厚涂技法,颜料堆叠形成了强烈的物理厚度和三维立体感。它从画面的左上角附近延伸至右下角附近,构成一个动态的对角线。颜料表面可以清晰地看到画刀刮擦和笔刷拖曳留下的痕迹,边缘处的颜料层相对较薄,而中央部分则高高隆起,形成了不规则的起伏。
375
+
376
+ 在这道立体的红色颜料之上,巧妙地构建了一处精致的微缩景观。景观的核心是一片模拟红海滩的区域,由细腻的深红色颜料点缀而成,与下方基底的鲜红色形成丰富的层次对比。紧邻着“红海滩”的是一小片湖泊,由一层平滑且带有光泽的蓝色与白色混合颜料构成,质感如同平静无波的水面。湖泊边缘,一小撮芦苇丛生,由几根纤细挺拔的、用淡黄色和棕色颜料勾勒出的线条来表现。一只小巧的白鹭立于芦苇旁,其形态由一小块纯白色的厚涂颜料塑造,仅用一抹精炼的黑色颜料点出其尖喙,姿态优雅宁静。
377
+
378
+ 整个构图的背景是大面积的留白,呈现为一张带有细微凹凸纹理的白色纸质基底,这种极简处理极大地突出了中央的红色笔触及其上的微缩景观。
379
+
380
+ 光线从画面一侧柔和地照射下来,在厚涂的颜料堆叠处投下淡淡的、轮廓分明的阴影,进一步增强了画面的三维立体感和油画质感。整幅画面呈现出一种结合了厚涂技法的现代极简主义油画风格。
381
+ </details>
382
+ </td>
383
+ </tr>
384
+ <tr>
385
+ <td>
386
+ <img src="./assets/pg_imgs/image5.png" width=100%><details>
387
+ <summary>Show prompt</summary>
388
+ 整体画面采用一个二乘二的四宫格布局,以产品可视化的风格,展示了一只兔子在四种不同材质下的渲染效果。每个宫格内都有一只姿态完全相同的兔子模型,它呈坐姿,双耳竖立,面朝前方。所有宫格的背景均是统一的中性深灰色,这种简约背景旨在最大限度地突出每种材质的独特质感。
389
+
390
+ 左上角的宫格中,兔子模型由哑光白色石膏材质构成。其表面平滑、均匀且无反射,在模型的耳朵根部、四肢交接处等凹陷区域呈现出柔和的环境光遮蔽阴影,这种微妙的阴影变化凸显了其纯粹的几何形态,整体感觉像一个用于美术研究的基础模型。
391
+
392
+ 右上角的宫格中,兔子模型由晶莹剔透的无瑕疵玻璃制成。它展现了逼真的物理折射效果,透过其透明的身体看到的背景呈现出轻微的扭曲。清晰的镜面高光沿着其身体的曲线轮廓流动,表面上还能看到微弱而清晰的环境反射,赋予其一种精致而易碎的质感。
393
+
394
+ 左下角的宫格中,兔子模型呈现为带有拉丝纹理的钛金属材质。金属表面具有明显的各向异性反射效果,呈现出冷峻的灰调金属光泽。锐利明亮的高光和深邃的阴影形成了强烈对比,精确地定义了其坚固的三维形态,展现了工业设计般的美感。
395
+
396
+ 右下角的宫格中,兔子模型覆盖着一层柔软浓密的灰色毛绒。根根分明的绒毛清晰可见,创造出一种温暖、可触摸的质地。光线照射在绒毛的末梢,形成柔和的光晕效果,而毛绒内部的阴影则显得深邃而柔软,展现了高度写实的毛发渲染效果。
397
+
398
+ 整个四宫格由来自多个方向的、柔和均匀的影棚灯光照亮,确保了每种材质的细节和特性都得到清晰的展现,没有任何刺眼的阴影或过曝的高光。这张图像以一种高度写实的3D渲染风格呈现,完美地诠释了产品可视化的精髓
399
+ </details>
400
+ </td>
401
+ <td>
402
+ <img src="./assets/pg_imgs/image6.png" width=100%><details>
403
+ <summary>Show prompt</summary>
404
+ 由一个两行两列的网格构成,共包含四个独立的场景,每个场景都以不同的艺术风格描绘了一个小男孩(小明)一天中的不同活动。
405
+
406
+ 左上角的第一个场景,以超写实摄影风格呈现。画面主体是一个大约8岁的东亚小男孩,他穿着整洁的小学制服——一件白色短袖衬衫和蓝色短裤,脖子上系着红领巾。他背着一个蓝色的双肩书包,正走在去上学的路上。他位于画面的前景偏右侧,面带微笑,步伐轻快。场景设定���清晨,柔和的阳光从左上方照射下来,在人行道上投下清晰而柔和的影子。背景是绿树成荫的街道和模糊可见的学校铁艺大门,营造出宁静的早晨氛围。这张图片的细节表现极为丰富,可以清晰地看到男孩头发的光泽、衣服的褶皱纹理以及书包的帆布材质,完全展现了专业摄影的质感。
407
+
408
+ 右上角的第二个场景,采用日式赛璐璐动漫风格绘制。画面中,小男孩坐在家中的木质餐桌旁吃午饭。他的形象被动漫化,拥有大而明亮的眼睛和简洁的五官线条。他身穿一件简单的黄色T恤,正用筷子夹起碗里的米饭。桌上摆放着一碗汤和两盘家常菜。背景是一个温馨的室内环境,一扇明亮的窗户透进正午的阳光,窗外是蓝天白云。整个画面色彩鲜艳、饱和度高,角色轮廓线清晰明确,阴影部分采用平涂的色块处理,是典型的赛璐璐动漫风格。
409
+
410
+ 左下角的第三个场景,以细腻的铅笔素描风格呈现。画面描绘了下午在操场上踢足球的小男孩。整个图像由不同灰度的石墨色调构成,没有其他颜色。小男孩身穿运动短袖和短裤,身体呈前倾姿态,右脚正要踢向一个足球,动作充满动感。背景是空旷的操场和远处的球门,用简练的线条和排线勾勒。艺术家通过交叉排线和涂抹技巧来表现光影和体积感,足球上的阴影、人物身上的肌肉线条以及地面粗糙的质感都通过铅笔的笔触得到了充分的展现。这张铅笔画突出了素描的光影关系和线条美感。
411
+
412
+ 右下角的第四个场景,以文森特·梵高的后印象派油画风格进行诠释。画面描绘了夜晚时分,小男孩独自在河边钓鱼的景象。他坐在一块岩石上,手持一根简易的钓鱼竿,身影在深蓝色的夜幕下显得很渺小。整个画面的视觉焦点是天空和水面,天空布满了旋转、卷曲的星云,星星和月亮被描绘成巨大、发光的光团,使用了厚涂的油画颜料(Impasto),笔触粗犷而充满能量。深蓝、亮黄和白色的颜料在画布上相互交织,形成强烈的视觉冲击力。水面倒映着天空中扭曲的光影,整个场景充满了梵高作品中特有的强烈情感和动荡不安的美感。这幅画作是对梵高风格的深度致敬。
413
+ </details>
414
+ </td>
415
+ </tr>
416
+ <tr>
417
+ <td>
418
+ <img src="./assets/pg_imgs/image7.png" width=100%><details>
419
+ <summary>Show prompt</summary>
420
+ 以平视视角,呈现了一幅关于如何用素描技法绘制鹦鹉的九宫格教学图。整体构图规整,九个大小一致的方形画框以三行三列的形式均匀分布在浅灰色背景上,清晰地展示了从基本形状到最终成品的全过程。
421
+
422
+ 第一行从左至右展示了绘画的初始步骤。左上角的第一个画框中,用简洁的铅笔线条勾勒出鹦鹉的基本几何形态:一个圆形代表头部,一个稍大的椭圆形代表身体。右上角有一个小号的无衬线字体数字“1”。中间的第二个画框中,在基础形态上添加了三角形的鸟喙轮廓和一条长长的弧线作为尾巴的雏形,头部和身体的连接处线条变得更加流畅;右上角标有数字“2”。右侧的第三个画框中,进一步精确了鹦鹉的整体轮廓,勾勒出头部顶端的羽冠和清晰的眼部圆形轮廓;右上角标有数字“3”。
423
+
424
+ 第二行专注于结构与细节的添加,描绘了绘画的中期阶段。左侧的第四个画框里,鹦鹉的身体上添加了翅膀的基本形状,同时在身体下方画出了一根作为栖木的横向树枝,鹦鹉的爪子初步搭在树枝上;右上角标有数字“4”。中间的第五个画框中,开始细化翅膀和尾部的羽毛分组,用短促的线条表现出层次感,并清晰地画出爪子紧握树枝的细节;右上角标有数字“5”。右侧的第六个画框里,开始为鹦鹉添加初步的阴影,使用交叉排线的素描技法在腹部、翅膀下方和颈部制造出体积感;右上角标有数字“6”。
425
+
426
+ 第三行则展示了最终的润色与完成阶段。左下角的第七个画框中,素描的排线更加密集,阴影层次更加丰富,羽毛的纹理细节被仔细刻画出来,眼珠也添加了高光点缀,显得炯炯有神;右上角标有数字“7”。中间的第八个画框里,描绘的重点转移到栖木上,增加了树枝的纹理和节疤细节,同时整体调整了鹦鹉身上的光影关系,使立体感更为突出;右上角标有数字“8”。右下角的第九个画框是最终完成图,所有线条都经过了精炼,光影对比强烈,鹦鹉的羽毛质感、木质栖木的粗糙感都表现得淋漓尽致,呈现出一幅完整且细节丰富的素描作品;右上角标有数字“9”。
427
+
428
+ 整个画面的光线均匀而明亮,没有任何特定的光源方向,确保了每个教学步骤的视觉清晰度。整体呈现出一种清晰、有条理的数字插画教程风格。
429
+ </details>
430
+ </td>
431
+ <td>
432
+ <img src="./assets/pg_imgs/image8.png" width=100%><details>
433
+ <summary>Show prompt</summary>
434
+ 一张现代平面设计风格的海报占据了整个画面,构图简洁且中心突出。
435
+
436
+ 海报的主体是位于画面正中央的一只腾讯QQ企鹅。这只企鹅采用了圆润可爱的3D卡通渲染风格,身体主要为饱满的黑色,腹部为纯白色。它的眼睛大而圆,眼神好奇地直视前方。黄色的嘴巴小巧而立体,双脚同样为鲜明的黄色,稳稳地站立着。一条标志性的红色围巾整齐地系在它的脖子上,围巾的材质带有轻微的布料质感,末端自然下垂。企鹅的整体造型干净利落,边缘光滑,呈现出一种精致的数字插画质感。
437
+
438
+ 海报的背景是一种从上到下由浅蓝色平滑过渡到白色的柔和渐变,营造出一种开阔、明亮的空间感。在企鹅的身后,散布着一些淡淡的、模糊的圆形光斑和几道柔和的抽象光束,为这个简约的平面设计海报增添了微妙的深度和科技感。
439
+
440
+ 画面的底部区域是文字部分,排版居中对齐。上半部分是一行稍大的黑色黑体字,内容为“Hunyuan Image 3.0”。紧随其下的是一行字号略小的深灰色黑体字,内容为“原生多模态大模型”。两行文字清晰易读,与整体的现代平面设计风格保持一致。
441
+
442
+ 整体光线明亮、均匀,没有明显的阴影,突出了企鹅和文字信息,符合现代设计海报的视觉要求。这张图像呈现了现代、简洁的平面设计海报风格。
443
+ </details>
444
+ </td>
445
+ </tr>
446
+ </tbody>
447
+ </table>
448
+ </p>
449
+
450
+ ## 📊 Evaluation
451
+
452
+ * 🤖 **SSAE (Machine Evaluation)**
453
+ SSAE (Structured Semantic Alignment Evaluation) is an intelligent evaluation metric for image-text alignment based on advanced multimodal large language models (MLLMs). We extracted 3,500 key points across 12 categories, then used MLLMs to automatically evaluate and score each generated image against these key points based on its visual content. Mean Image Accuracy first averages the key-point scores within each image and then averages over images, while Global Accuracy averages the scores over all key points pooled across images.
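+ 
+ A minimal sketch of the two aggregations, assuming per-image lists of binary key-point scores (the scoring itself is done by MLLM judges; only the averaging is shown, with made-up numbers):
+ 
+ ```python
+ # scores[i] holds the key-point scores (1 = aligned, 0 = not) for generated image i,
+ # as judged by the MLLM evaluator. Illustrative data only.
+ scores = [[1, 1, 0], [1, 0, 0, 1], [1, 1, 1, 1, 0]]
+ 
+ # Mean Image Accuracy: average within each image first, then across images.
+ mean_image_acc = sum(sum(s) / len(s) for s in scores) / len(scores)
+ 
+ # Global Accuracy: pool all key points across images and average once.
+ flat = [x for s in scores for x in s]
+ global_acc = sum(flat) / len(flat)
+ 
+ print(round(mean_image_acc, 3), round(global_acc, 3))  # 0.656 0.667
+ ```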
454
+
455
+ <p align="center">
456
+ <img src="./assets/ssae_side_by_side_comparison.png" width=98% alt="Human Evaluation with Other Models">
457
+ </p>
458
+
459
+ <p align="center">
460
+ <img src="./assets/ssae_side_by_side_heatmap.png" width=98% alt="Human Evaluation with Other Models">
461
+ </p>
462
+
463
+
464
+ * 👥 **GSB (Human Evaluation)**
465
+
466
+ We adopted the GSB (Good/Same/Bad) evaluation method commonly used to assess the relative performance between two models from an overall image perception perspective. In total, we utilized 1,000 text prompts, generating an equal number of image samples for all compared models in a single run. For a fair comparison, we conducted inference only once for each prompt, avoiding any cherry-picking of results. When comparing with the baseline methods, we maintained the default settings for all selected models. The evaluation was performed by more than 100 professional evaluators.
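+ 
+ For reference, a common way to condense GSB votes into a single relative score is the win rate (good - bad) / total; this is a conventional formula, not necessarily the exact aggregation behind the chart below:
+ 
+ ```python
+ # Conventional GSB summary with illustrative vote counts.
+ good, same, bad = 420, 360, 220
+ total = good + same + bad
+ win_rate = (good - bad) / total
+ print(f"G {good/total:.1%}  S {same/total:.1%}  B {bad/total:.1%}  win rate {win_rate:+.1%}")
+ ```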
467
+
468
+ <p align="center">
469
+ <img src="./assets/gsb.png" width=98% alt="Human Evaluation with Other Models">
470
+ </p>
471
+
472
+
473
+ ## 📚 Citation
474
+
475
+ If you find HunyuanImage-3.0 useful in your research, please cite our work:
476
+
477
+ ```bibtex
478
+ @article{cao2025hunyuanimage,
479
+ title={HunyuanImage 3.0 Technical Report},
480
+ author={Cao, Siyu and Chen, Hangting and Chen, Peng and Cheng, Yiji and Cui, Yutao and Deng, Xinchi and Dong, Ying and Gong, Kipper and Gu, Tianpeng and Gu, Xiusen and others},
481
+ journal={arXiv preprint arXiv:2509.23951},
482
+ year={2025}
483
+ }
484
+ ```
485
+
486
+ ## 🙏 Acknowledgements
487
+
488
+ We extend our heartfelt gratitude to the following open-source projects and communities for their invaluable contributions:
489
+
490
+ * 🤗 [Transformers](https://github.com/huggingface/transformers) - State-of-the-art NLP library
491
+ * 🎨 [Diffusers](https://github.com/huggingface/diffusers) - Diffusion models library
492
+ * 🌐 [HuggingFace](https://huggingface.co/) - AI model hub and community
493
+ * ⚡ [FlashAttention](https://github.com/Dao-AILab/flash-attention) - Memory-efficient attention
494
+ * 🚀 [FlashInfer](https://github.com/flashinfer-ai/flashinfer) - Optimized inference engine
495
+
496
+ ## 🌟🚀 GitHub Star History
497
+
498
+ [![GitHub stars](https://img.shields.io/github/stars/Tencent-Hunyuan/HunyuanImage-3.0?style=social)](https://github.com/Tencent-Hunyuan/HunyuanImage-3.0)
499
+ [![GitHub forks](https://img.shields.io/github/forks/Tencent-Hunyuan/HunyuanImage-3.0?style=social)](https://github.com/Tencent-Hunyuan/HunyuanImage-3.0)
500
+
501
+
502
+ [![Star History Chart](https://api.star-history.com/svg?repos=Tencent-Hunyuan/HunyuanImage-3.0&type=Date)](https://www.star-history.com/#Tencent-Hunyuan/HunyuanImage-3.0&Date)
__init__.py ADDED
File without changes
assets/WECHAT.md ADDED
@@ -0,0 +1,6 @@
1
+ <div align="center">
2
+ <img src=wechat.png width="60%"/>
3
+
4
+ <p> 扫码关注混元图像系列工作,加入「 腾讯混元生图交流群 」 </p>
5
+ <p> Scan the QR code to join the "Tencent Hunyuan Image Generation Discussion Group" </p>
6
+ </div>
assets/banner.png ADDED

Git LFS Details

  • SHA256: 53bef578e373fc53c8c16d26a1011f85ce8d4f46aeac1222be7263a09a3d8c7f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.41 MB
assets/banner_all.jpg ADDED

Git LFS Details

  • SHA256: 667e956a3c27f6722eceacebe907b56b6669cc35b706a4957893b7ae88b7fbc0
  • Pointer size: 133 Bytes
  • Size of remote file: 15.3 MB
assets/framework.png ADDED

Git LFS Details

  • SHA256: f6c0e6751b4bf0f30daeb6a2b7cdb4dc3276bd61ea58294ba528fd807a17473f
  • Pointer size: 131 Bytes
  • Size of remote file: 248 kB
assets/gsb.png ADDED

Git LFS Details

  • SHA256: 8570c87bbb477e206f61d99c036a53d3671dc589829b335656931dcf01194536
  • Pointer size: 131 Bytes
  • Size of remote file: 191 kB
assets/logo.png ADDED

Git LFS Details

  • SHA256: f59d594e65aff85c3ac35ff02aa7e14cccfb88d4c6296948efe5cea9a3bfb690
  • Pointer size: 130 Bytes
  • Size of remote file: 95.1 kB
assets/pg_imgs/image1.png ADDED

Git LFS Details

  • SHA256: a385db722efc89cff4d5e4afdb82c96156f223907d70f0d3c7eb8b3e59edbccb
  • Pointer size: 132 Bytes
  • Size of remote file: 1.64 MB
assets/pg_imgs/image2.png ADDED

Git LFS Details

  • SHA256: 84a7d37d3ff8452c32ecb79f98692ad38f8f190dc201922a924ae2fda4515e12
  • Pointer size: 132 Bytes
  • Size of remote file: 1.7 MB
assets/pg_imgs/image3.png ADDED

Git LFS Details

  • SHA256: 913376a1ad5d10bc1549f9f28fc25a0c9f94a119b99434618bacacc7429996fa
  • Pointer size: 132 Bytes
  • Size of remote file: 1.39 MB
assets/pg_imgs/image4.png ADDED

Git LFS Details

  • SHA256: 71dcfd968f4c76ccec2ccc1806e9ce97babed56c73441d744f7264433bf9339a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.55 MB
assets/pg_imgs/image5.png ADDED

Git LFS Details

  • SHA256: b93338e2f81f9809f8a9f674e0fe3da7c03de4fc4d7aba1819acb878384abb3e
  • Pointer size: 132 Bytes
  • Size of remote file: 3.31 MB
assets/pg_imgs/image6.png ADDED

Git LFS Details

  • SHA256: 84e7c73dafea831bf1ceb8c3dd76c16238c9b4a31ac30e3f46c38d61005b5895
  • Pointer size: 132 Bytes
  • Size of remote file: 2.02 MB
assets/pg_imgs/image7.png ADDED

Git LFS Details

  • SHA256: 701f2449436f8d46f537100bdaa63569586e39448e30fffe2e5bc3e95e558daa
  • Pointer size: 132 Bytes
  • Size of remote file: 1.55 MB
assets/pg_imgs/image8.png ADDED

Git LFS Details

  • SHA256: b80b97174d4f98030eda02d5dbaac2e294d814f086d957d47957482eb7b70251
  • Pointer size: 132 Bytes
  • Size of remote file: 1.27 MB
assets/robot.png ADDED

Git LFS Details

  • SHA256: 2a5b09f264c3752199536e92ca57836119604a79e3d08471d2818d2d576dd79b
  • Pointer size: 130 Bytes
  • Size of remote file: 16.4 kB
assets/ssae_side_by_side_comparison.png ADDED

Git LFS Details

  • SHA256: 665dce959769e799a14fa7d176d4a676feb33193c6575db21da97458732488fc
  • Pointer size: 132 Bytes
  • Size of remote file: 1.52 MB
assets/ssae_side_by_side_heatmap.png ADDED

Git LFS Details

  • SHA256: 00e2342afb5cabaf20b9b415587fb2986456f8c7c8cb96dd6ecc68455457045e
  • Pointer size: 131 Bytes
  • Size of remote file: 639 kB
assets/user.png ADDED

Git LFS Details

  • SHA256: 75543c163927df138a1c3d2958322e151ba259fc52fcd91bebb4cea92fc1af89
  • Pointer size: 130 Bytes
  • Size of remote file: 13.5 kB
assets/wechat.png ADDED

Git LFS Details

  • SHA256: 7bb1d5e06408b09ca6764ddd0a70fe9acfd045295c4144c99235f74336ea0169
  • Pointer size: 130 Bytes
  • Size of remote file: 30.9 kB
autoencoder_kl_3d.py ADDED
@@ -0,0 +1,793 @@
1
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ # ==============================================================================
13
+
14
+ from dataclasses import dataclass
15
+ from typing import Tuple, Optional
16
+ import math
17
+ import random
18
+ import numpy as np
19
+ from einops import rearrange
20
+ import torch
21
+ from torch import Tensor, nn
22
+ import torch.nn.functional as F
23
+
24
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
25
+ from diffusers.models.modeling_outputs import AutoencoderKLOutput
26
+ from diffusers.models.modeling_utils import ModelMixin
27
+ from diffusers.utils.torch_utils import randn_tensor
28
+ from diffusers.utils import BaseOutput
29
+
30
+
31
+ class DiagonalGaussianDistribution(object):
32
+ def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
33
+ if parameters.ndim == 3:
34
+ dim = 2 # (B, L, C)
35
+ elif parameters.ndim == 5 or parameters.ndim == 4:
36
+ dim = 1 # (B, C, T, H, W) / (B, C, H, W)
37
+ else:
38
+ raise NotImplementedError
39
+ self.parameters = parameters
40
+ self.mean, self.logvar = torch.chunk(parameters, 2, dim=dim)
41
+ self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
42
+ self.deterministic = deterministic
43
+ self.std = torch.exp(0.5 * self.logvar)
44
+ self.var = torch.exp(self.logvar)
45
+ if self.deterministic:
46
+ self.var = self.std = torch.zeros_like(
47
+ self.mean, device=self.parameters.device, dtype=self.parameters.dtype
48
+ )
49
+
50
+ def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor:
51
+ # make sure sample is on the same device as the parameters and has same dtype
52
+ sample = randn_tensor(
53
+ self.mean.shape,
54
+ generator=generator,
55
+ device=self.parameters.device,
56
+ dtype=self.parameters.dtype,
57
+ )
58
+ x = self.mean + self.std * sample
59
+ return x
60
+
61
+ def kl(self, other: "DiagonalGaussianDistribution" = None) -> torch.Tensor:
62
+ if self.deterministic:
63
+ return torch.Tensor([0.0])
64
+ else:
65
+ reduce_dim = list(range(1, self.mean.ndim))
66
+ if other is None:
67
+ return 0.5 * torch.sum(
68
+ torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
69
+ dim=reduce_dim,
70
+ )
71
+ else:
72
+ return 0.5 * torch.sum(
73
+ torch.pow(self.mean - other.mean, 2) / other.var +
74
+ self.var / other.var -
75
+ 1.0 -
76
+ self.logvar +
77
+ other.logvar,
78
+ dim=reduce_dim,
79
+ )
80
+
81
+ def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = (1, 2, 3)) -> torch.Tensor:
82
+ if self.deterministic:
83
+ return torch.Tensor([0.0])
84
+ logtwopi = np.log(2.0 * np.pi)
85
+ return 0.5 * torch.sum(
86
+ logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
87
+ dim=dims,
88
+ )
89
+
90
+ def mode(self) -> torch.Tensor:
91
+ return self.mean
92
+
93
+
94
+ @dataclass
95
+ class DecoderOutput(BaseOutput):
96
+ sample: torch.FloatTensor
97
+ posterior: Optional[DiagonalGaussianDistribution] = None
98
+
99
+
100
+ def swish(x: Tensor) -> Tensor:
101
+ return x * torch.sigmoid(x)
102
+
103
+
104
+ def forward_with_checkpointing(module, *inputs, use_checkpointing=False):
105
+ def create_custom_forward(module):
106
+ def custom_forward(*inputs):
107
+ return module(*inputs)
108
+ return custom_forward
109
+
110
+ if use_checkpointing:
111
+ return torch.utils.checkpoint.checkpoint(create_custom_forward(module), *inputs, use_reentrant=False)
112
+ else:
113
+ return module(*inputs)
114
+
115
+
116
+ class Conv3d(nn.Conv3d):
117
+ """
118
+ Perform Conv3d on temporal chunks; numerical differences from nn.Conv3d stay within 1e-5.
119
+ Only symmetric padding is supported.
120
+ """
121
+
122
+ def forward(self, input):
123
+ B, C, T, H, W = input.shape
124
+ memory_count = (C * T * H * W) * 2 / 1024**3
125
+ if memory_count > 2:
126
+ n_split = math.ceil(memory_count / 2)
127
+ assert n_split >= 2
128
+ chunks = torch.chunk(input, chunks=n_split, dim=-3)
129
+ padded_chunks = []
130
+ for i in range(len(chunks)):
131
+ if self.padding[0] > 0:
132
+ padded_chunk = F.pad(
133
+ chunks[i],
134
+ (0, 0, 0, 0, self.padding[0], self.padding[0]),
135
+ mode="constant" if self.padding_mode == "zeros" else self.padding_mode,
136
+ value=0,
137
+ )
138
+ if i > 0:
139
+ padded_chunk[:, :, :self.padding[0]] = chunks[i - 1][:, :, -self.padding[0]:]
140
+ if i < len(chunks) - 1:
141
+ padded_chunk[:, :, -self.padding[0]:] = chunks[i + 1][:, :, :self.padding[0]]
142
+ else:
143
+ padded_chunk = chunks[i]
144
+ padded_chunks.append(padded_chunk)
145
+ padding_bak = self.padding
146
+ self.padding = (0, self.padding[1], self.padding[2])
147
+ outputs = []
148
+ for i in range(len(padded_chunks)):
149
+ outputs.append(super().forward(padded_chunks[i]))
150
+ self.padding = padding_bak
151
+ return torch.cat(outputs, dim=-3)
152
+ else:
153
+ return super().forward(input)
154
+
155
+
156
+ class AttnBlock(nn.Module):
157
+ """ Attention with torch sdpa implementation. """
158
+ def __init__(self, in_channels: int):
159
+ super().__init__()
160
+ self.in_channels = in_channels
161
+
162
+ self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
163
+
164
+ self.q = Conv3d(in_channels, in_channels, kernel_size=1)
165
+ self.k = Conv3d(in_channels, in_channels, kernel_size=1)
166
+ self.v = Conv3d(in_channels, in_channels, kernel_size=1)
167
+ self.proj_out = Conv3d(in_channels, in_channels, kernel_size=1)
168
+
169
+ def attention(self, h_: Tensor) -> Tensor:
170
+ h_ = self.norm(h_)
171
+ q = self.q(h_)
172
+ k = self.k(h_)
173
+ v = self.v(h_)
174
+
175
+ b, c, f, h, w = q.shape
176
+ q = rearrange(q, "b c f h w -> b 1 (f h w) c").contiguous()
177
+ k = rearrange(k, "b c f h w -> b 1 (f h w) c").contiguous()
178
+ v = rearrange(v, "b c f h w -> b 1 (f h w) c").contiguous()
179
+ h_ = nn.functional.scaled_dot_product_attention(q, k, v)
180
+
181
+ return rearrange(h_, "b 1 (f h w) c -> b c f h w", f=f, h=h, w=w, c=c, b=b)
182
+
183
+ def forward(self, x: Tensor) -> Tensor:
184
+ return x + self.proj_out(self.attention(x))
185
+
186
+
187
+ class ResnetBlock(nn.Module):
188
+ def __init__(self, in_channels: int, out_channels: int):
189
+ super().__init__()
190
+ self.in_channels = in_channels
191
+ out_channels = in_channels if out_channels is None else out_channels
192
+ self.out_channels = out_channels
193
+
194
+ self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
195
+ self.conv1 = Conv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
196
+ self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
197
+ self.conv2 = Conv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
198
+ if self.in_channels != self.out_channels:
199
+ self.nin_shortcut = Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
200
+
201
+ def forward(self, x):
202
+ h = x
203
+ h = self.norm1(h)
204
+ h = swish(h)
205
+ h = self.conv1(h)
206
+
207
+ h = self.norm2(h)
208
+ h = swish(h)
209
+ h = self.conv2(h)
210
+
211
+ if self.in_channels != self.out_channels:
212
+ x = self.nin_shortcut(x)
213
+ return x + h
214
+
215
+
216
+ class Downsample(nn.Module):
217
+ def __init__(self, in_channels: int, add_temporal_downsample: bool = True):
218
+ super().__init__()
219
+ self.add_temporal_downsample = add_temporal_downsample
220
+ stride = (2, 2, 2) if add_temporal_downsample else (1, 2, 2) # THW
221
+ # no asymmetric padding in torch conv, must do it ourselves
222
+ self.conv = Conv3d(in_channels, in_channels, kernel_size=3, stride=stride, padding=0)
223
+
224
+ def forward(self, x: Tensor):
225
+ spatial_pad = (0, 1, 0, 1, 0, 0) # WHT
226
+ x = nn.functional.pad(x, spatial_pad, mode="constant", value=0)
227
+
228
+ temporal_pad = (0, 0, 0, 0, 0, 1) if self.add_temporal_downsample else (0, 0, 0, 0, 1, 1)
229
+ x = nn.functional.pad(x, temporal_pad, mode="replicate")
230
+
231
+ x = self.conv(x)
232
+ return x
233
+
234
+
235
+ class DownsampleDCAE(nn.Module):
236
+ def __init__(self, in_channels: int, out_channels: int, add_temporal_downsample: bool = True):
237
+ super().__init__()
238
+ factor = 2 * 2 * 2 if add_temporal_downsample else 1 * 2 * 2
239
+ assert out_channels % factor == 0
240
+ self.conv = Conv3d(in_channels, out_channels // factor, kernel_size=3, stride=1, padding=1)
241
+
242
+ self.add_temporal_downsample = add_temporal_downsample
243
+ self.group_size = factor * in_channels // out_channels
244
+
245
+ def forward(self, x: Tensor):
246
+ r1 = 2 if self.add_temporal_downsample else 1
247
+ h = self.conv(x)
248
+ h = rearrange(h, "b c (f r1) (h r2) (w r3) -> b (r1 r2 r3 c) f h w", r1=r1, r2=2, r3=2)
249
+ shortcut = rearrange(x, "b c (f r1) (h r2) (w r3) -> b (r1 r2 r3 c) f h w", r1=r1, r2=2, r3=2)
250
+
251
+ B, C, T, H, W = shortcut.shape
252
+ shortcut = shortcut.view(B, h.shape[1], self.group_size, T, H, W).mean(dim=2)
253
+ return h + shortcut
254
+
255
+
256
+ class Upsample(nn.Module):
257
+ def __init__(self, in_channels: int, add_temporal_upsample: bool = True):
258
+ super().__init__()
259
+ self.add_temporal_upsample = add_temporal_upsample
260
+ self.scale_factor = (2, 2, 2) if add_temporal_upsample else (1, 2, 2) # THW
261
+ self.conv = Conv3d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
262
+
263
+ def forward(self, x: Tensor):
264
+ x = nn.functional.interpolate(x, scale_factor=self.scale_factor, mode="nearest")
265
+ x = self.conv(x)
266
+ return x
267
+
268
+
269
+ class UpsampleDCAE(nn.Module):
270
+ def __init__(self, in_channels: int, out_channels: int, add_temporal_upsample: bool = True):
271
+ super().__init__()
272
+ factor = 2 * 2 * 2 if add_temporal_upsample else 1 * 2 * 2
273
+ self.conv = Conv3d(in_channels, out_channels * factor, kernel_size=3, stride=1, padding=1)
274
+
275
+ self.add_temporal_upsample = add_temporal_upsample
276
+ self.repeats = factor * out_channels // in_channels
277
+
278
+ def forward(self, x: Tensor):
279
+ r1 = 2 if self.add_temporal_upsample else 1
280
+ h = self.conv(x)
281
+ h = rearrange(h, "b (r1 r2 r3 c) f h w -> b c (f r1) (h r2) (w r3)", r1=r1, r2=2, r3=2)
282
+ shortcut = x.repeat_interleave(repeats=self.repeats, dim=1)
283
+ shortcut = rearrange(shortcut, "b (r1 r2 r3 c) f h w -> b c (f r1) (h r2) (w r3)", r1=r1, r2=2, r3=2)
284
+ return h + shortcut
285
+
286
+
287
+ class Encoder(nn.Module):
288
+ """
289
+ The encoder network of AutoencoderKLConv3D.
290
+ """
291
+ def __init__(
292
+ self,
293
+ in_channels: int,
294
+ z_channels: int,
295
+ block_out_channels: Tuple[int, ...],
296
+ num_res_blocks: int,
297
+ ffactor_spatial: int,
298
+ ffactor_temporal: int,
299
+ downsample_match_channel: bool = True,
300
+ ):
301
+ super().__init__()
302
+ assert block_out_channels[-1] % (2 * z_channels) == 0
303
+
304
+ self.z_channels = z_channels
305
+ self.block_out_channels = block_out_channels
306
+ self.num_res_blocks = num_res_blocks
307
+
308
+ # downsampling
309
+ self.conv_in = Conv3d(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1)
310
+
311
+ self.down = nn.ModuleList()
312
+ block_in = block_out_channels[0]
313
+ for i_level, ch in enumerate(block_out_channels):
314
+ block = nn.ModuleList()
315
+ block_out = ch
316
+ for _ in range(self.num_res_blocks):
317
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
318
+ block_in = block_out
319
+ down = nn.Module()
320
+ down.block = block
321
+
322
+ add_spatial_downsample = bool(i_level < np.log2(ffactor_spatial))
323
+ add_temporal_downsample = (add_spatial_downsample and
324
+ bool(i_level >= np.log2(ffactor_spatial // ffactor_temporal)))
325
+ if add_spatial_downsample or add_temporal_downsample:
326
+ assert i_level < len(block_out_channels) - 1
327
+ block_out = block_out_channels[i_level + 1] if downsample_match_channel else block_in
328
+ down.downsample = DownsampleDCAE(block_in, block_out, add_temporal_downsample)
329
+ block_in = block_out
330
+ self.down.append(down)
331
+
332
+ # middle
333
+ self.mid = nn.Module()
334
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
335
+ self.mid.attn_1 = AttnBlock(block_in)
336
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
337
+
338
+ # end
339
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
340
+ self.conv_out = Conv3d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
341
+
342
+ self.gradient_checkpointing = False
343
+
344
+ def forward(self, x: Tensor) -> Tensor:
345
+ use_checkpointing = bool(self.training and self.gradient_checkpointing)
346
+
347
+ # downsampling
348
+ h = self.conv_in(x)
349
+ for i_level in range(len(self.block_out_channels)):
350
+ for i_block in range(self.num_res_blocks):
351
+ h = forward_with_checkpointing(
352
+ self.down[i_level].block[i_block], h, use_checkpointing=use_checkpointing)
353
+ if hasattr(self.down[i_level], "downsample"):
354
+ h = forward_with_checkpointing(self.down[i_level].downsample, h, use_checkpointing=use_checkpointing)
355
+
356
+ # middle
357
+ h = forward_with_checkpointing(self.mid.block_1, h, use_checkpointing=use_checkpointing)
358
+ h = forward_with_checkpointing(self.mid.attn_1, h, use_checkpointing=use_checkpointing)
359
+ h = forward_with_checkpointing(self.mid.block_2, h, use_checkpointing=use_checkpointing)
360
+
361
+ # end
362
+ group_size = self.block_out_channels[-1] // (2 * self.z_channels)
363
+ shortcut = rearrange(h, "b (c r) f h w -> b c r f h w", r=group_size).mean(dim=2)
364
+ h = self.norm_out(h)
365
+ h = swish(h)
366
+ h = self.conv_out(h)
367
+ h += shortcut
368
+ return h
369
+
370
+
371
+ class Decoder(nn.Module):
372
+ """
373
+ The decoder network of AutoencoderKLConv3D.
374
+ """
375
+ def __init__(
376
+ self,
377
+ z_channels: int,
378
+ out_channels: int,
379
+ block_out_channels: Tuple[int, ...],
380
+ num_res_blocks: int,
381
+ ffactor_spatial: int,
382
+ ffactor_temporal: int,
383
+ upsample_match_channel: bool = True,
384
+ ):
385
+ super().__init__()
386
+ assert block_out_channels[0] % z_channels == 0
387
+
388
+ self.z_channels = z_channels
389
+ self.block_out_channels = block_out_channels
390
+ self.num_res_blocks = num_res_blocks
391
+
392
+ # z to block_in
393
+ block_in = block_out_channels[0]
394
+ self.conv_in = Conv3d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
395
+
396
+ # middle
397
+ self.mid = nn.Module()
398
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
399
+ self.mid.attn_1 = AttnBlock(block_in)
400
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
401
+
402
+ # upsampling
403
+ self.up = nn.ModuleList()
404
+ for i_level, ch in enumerate(block_out_channels):
405
+ block = nn.ModuleList()
406
+ block_out = ch
407
+ for _ in range(self.num_res_blocks + 1):
408
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
409
+ block_in = block_out
410
+ up = nn.Module()
411
+ up.block = block
412
+
413
+ add_spatial_upsample = bool(i_level < np.log2(ffactor_spatial))
414
+ add_temporal_upsample = bool(i_level < np.log2(ffactor_temporal))
415
+ if add_spatial_upsample or add_temporal_upsample:
416
+ assert i_level < len(block_out_channels) - 1
417
+ block_out = block_out_channels[i_level + 1] if upsample_match_channel else block_in
418
+ up.upsample = UpsampleDCAE(block_in, block_out, add_temporal_upsample)
419
+ block_in = block_out
420
+ self.up.append(up)
421
+
422
+ # end
423
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
424
+ self.conv_out = Conv3d(block_in, out_channels, kernel_size=3, stride=1, padding=1)
425
+
426
+ self.gradient_checkpointing = False
427
+
428
+ def forward(self, z: Tensor) -> Tensor:
429
+ use_checkpointing = bool(self.training and self.gradient_checkpointing)
430
+
431
+ # z to block_in
432
+ repeats = self.block_out_channels[0] // (self.z_channels)
433
+ h = self.conv_in(z) + z.repeat_interleave(repeats=repeats, dim=1)
434
+
435
+ # middle
436
+ h = forward_with_checkpointing(self.mid.block_1, h, use_checkpointing=use_checkpointing)
437
+ h = forward_with_checkpointing(self.mid.attn_1, h, use_checkpointing=use_checkpointing)
438
+ h = forward_with_checkpointing(self.mid.block_2, h, use_checkpointing=use_checkpointing)
439
+
440
+ # upsampling
441
+ for i_level in range(len(self.block_out_channels)):
442
+ for i_block in range(self.num_res_blocks + 1):
443
+ h = forward_with_checkpointing(self.up[i_level].block[i_block], h, use_checkpointing=use_checkpointing)
444
+ if hasattr(self.up[i_level], "upsample"):
445
+ h = forward_with_checkpointing(self.up[i_level].upsample, h, use_checkpointing=use_checkpointing)
446
+
447
+ # end
448
+ h = self.norm_out(h)
449
+ h = swish(h)
450
+ h = self.conv_out(h)
451
+ return h
452
+
453
+
454
+ class AutoencoderKLConv3D(ModelMixin, ConfigMixin):
455
+ """
456
+ Autoencoder model with KL-regularized latent space based on 3D convolutions.
457
+ """
458
+ _supports_gradient_checkpointing = True
459
+
460
+ @register_to_config
461
+ def __init__(
462
+ self,
463
+ in_channels: int,
464
+ out_channels: int,
465
+ latent_channels: int,
466
+ block_out_channels: Tuple[int, ...],
467
+ layers_per_block: int,
468
+ ffactor_spatial: int,
469
+ ffactor_temporal: int,
470
+ sample_size: int,
471
+ sample_tsize: int,
472
+ scaling_factor: float = None,
473
+ shift_factor: Optional[float] = None,
474
+ downsample_match_channel: bool = True,
475
+ upsample_match_channel: bool = True,
476
+ only_encoder: bool = False, # only build encoder for saving memory
477
+ only_decoder: bool = False, # only build decoder for saving memory
478
+ ):
479
+ super().__init__()
480
+ self.ffactor_spatial = ffactor_spatial
481
+ self.ffactor_temporal = ffactor_temporal
482
+ self.scaling_factor = scaling_factor
483
+ self.shift_factor = shift_factor
484
+
485
+ # build model
486
+ if not only_decoder:
487
+ self.encoder = Encoder(
488
+ in_channels=in_channels,
489
+ z_channels=latent_channels,
490
+ block_out_channels=block_out_channels,
491
+ num_res_blocks=layers_per_block,
492
+ ffactor_spatial=ffactor_spatial,
493
+ ffactor_temporal=ffactor_temporal,
494
+ downsample_match_channel=downsample_match_channel,
495
+ )
496
+ if not only_encoder:
497
+ self.decoder = Decoder(
498
+ z_channels=latent_channels,
499
+ out_channels=out_channels,
500
+ block_out_channels=list(reversed(block_out_channels)),
501
+ num_res_blocks=layers_per_block,
502
+ ffactor_spatial=ffactor_spatial,
503
+ ffactor_temporal=ffactor_temporal,
504
+ upsample_match_channel=upsample_match_channel,
505
+ )
506
+
507
+ # slicing and tiling related
508
+ self.use_slicing = False
509
+ self.slicing_bsz = 1
510
+ self.use_spatial_tiling = False
511
+ self.use_temporal_tiling = False
512
+ self.use_tiling_during_training = False
513
+
514
+ # only relevant if vae tiling is enabled
515
+ self.tile_sample_min_size = sample_size
516
+ self.tile_latent_min_size = sample_size // ffactor_spatial
517
+ self.tile_sample_min_tsize = sample_tsize
518
+ self.tile_latent_min_tsize = sample_tsize // ffactor_temporal
519
+ self.tile_overlap_factor = 0.25
520
+
521
+ # use torch.compile for faster encode speed
522
+ self.use_compile = False
523
+
524
+ def _set_gradient_checkpointing(self, module, value=False):
525
+ if isinstance(module, (Encoder, Decoder)):
526
+ module.gradient_checkpointing = value
527
+
528
+ def enable_tiling_during_training(self, use_tiling: bool = True):
529
+ self.use_tiling_during_training = use_tiling
530
+
531
+ def disable_tiling_during_training(self):
532
+ self.enable_tiling_during_training(False)
533
+
534
+ def enable_temporal_tiling(self, use_tiling: bool = True):
535
+ self.use_temporal_tiling = use_tiling
536
+
537
+ def disable_temporal_tiling(self):
538
+ self.enable_temporal_tiling(False)
539
+
540
+ def enable_spatial_tiling(self, use_tiling: bool = True):
541
+ self.use_spatial_tiling = use_tiling
542
+
543
+ def disable_spatial_tiling(self):
544
+ self.enable_spatial_tiling(False)
545
+
546
+ def enable_tiling(self, use_tiling: bool = True):
547
+ self.enable_spatial_tiling(use_tiling)
548
+
549
+ def disable_tiling(self):
550
+ self.disable_spatial_tiling()
551
+
552
+ def enable_slicing(self):
553
+ self.use_slicing = True
554
+
555
+ def disable_slicing(self):
556
+ self.use_slicing = False
557
+
558
+ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int):
559
+ blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
560
+ for x in range(blend_extent):
561
+ b[:, :, :, :, x] = \
562
+ a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (x / blend_extent)
563
+ return b
564
+
565
+ def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int):
566
+ blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
567
+ for y in range(blend_extent):
568
+ b[:, :, :, y, :] = \
569
+ a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (y / blend_extent)
570
+ return b
571
+
572
+ def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int):
573
+ blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
574
+ for x in range(blend_extent):
575
+ b[:, :, x, :, :] = \
576
+ a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (x / blend_extent)
577
+ return b
578
+
579
+ def spatial_tiled_encode(self, x: torch.Tensor):
580
+ """ Spatial tiling for frames. """
581
+ B, C, T, H, W = x.shape
582
+ overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) # 256 * (1 - 0.25) = 192
583
+ blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) # 8 * 0.25 = 2
584
+ row_limit = self.tile_latent_min_size - blend_extent # 8 - 2 = 6
585
+
586
+ rows = []
587
+ for i in range(0, H, overlap_size):
588
+ row = []
589
+ for j in range(0, W, overlap_size):
590
+ tile = x[:, :, :, i: i + self.tile_sample_min_size, j: j + self.tile_sample_min_size]
591
+ tile = self.encoder(tile)
592
+ row.append(tile)
593
+ rows.append(row)
594
+ result_rows = []
595
+ for i, row in enumerate(rows):
596
+ result_row = []
597
+ for j, tile in enumerate(row):
598
+ if i > 0:
599
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
600
+ if j > 0:
601
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
602
+ result_row.append(tile[:, :, :, :row_limit, :row_limit])
603
+ result_rows.append(torch.cat(result_row, dim=-1))
604
+ moments = torch.cat(result_rows, dim=-2)
605
+ return moments
606
+
607
+ def temporal_tiled_encode(self, x: torch.Tensor):
608
+ """ Temporal tiling for frames. """
609
+ B, C, T, H, W = x.shape
610
+ overlap_size = int(self.tile_sample_min_tsize * (1 - self.tile_overlap_factor)) # 64 * (1 - 0.25) = 48
611
+ blend_extent = int(self.tile_latent_min_tsize * self.tile_overlap_factor) # 8 * 0.25 = 2
612
+ t_limit = self.tile_latent_min_tsize - blend_extent # 8 - 2 = 6
613
+
614
+ row = []
615
+ for i in range(0, T, overlap_size):
616
+ tile = x[:, :, i: i + self.tile_sample_min_tsize, :, :]
617
+ if self.use_spatial_tiling and (
618
+ tile.shape[-1] > self.tile_sample_min_size or tile.shape[-2] > self.tile_sample_min_size):
619
+ tile = self.spatial_tiled_encode(tile)
620
+ else:
621
+ tile = self.encoder(tile)
622
+ row.append(tile)
623
+ result_row = []
624
+ for i, tile in enumerate(row):
625
+ if i > 0:
626
+ tile = self.blend_t(row[i - 1], tile, blend_extent)
627
+ result_row.append(tile[:, :, :t_limit, :, :])
628
+ moments = torch.cat(result_row, dim=-3)
629
+ return moments
630
+
631
+ def spatial_tiled_decode(self, z: torch.Tensor):
632
+ """ Spatial tiling for latents. """
633
+ B, C, T, H, W = z.shape
634
+ overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor)) # 8 * (1 - 0.25) = 6
635
+ blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor) # 256 * 0.25 = 64
636
+ row_limit = self.tile_sample_min_size - blend_extent # 256 - 64 = 192
637
+
638
+ rows = []
639
+ for i in range(0, H, overlap_size):
640
+ row = []
641
+ for j in range(0, W, overlap_size):
642
+ tile = z[:, :, :, i: i + self.tile_latent_min_size, j: j + self.tile_latent_min_size]
643
+ decoded = self.decoder(tile)
644
+ row.append(decoded)
645
+ rows.append(row)
646
+
647
+ result_rows = []
648
+ for i, row in enumerate(rows):
649
+ result_row = []
650
+ for j, tile in enumerate(row):
651
+ if i > 0:
652
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
653
+ if j > 0:
654
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
655
+ result_row.append(tile[:, :, :, :row_limit, :row_limit])
656
+ result_rows.append(torch.cat(result_row, dim=-1))
657
+ dec = torch.cat(result_rows, dim=-2)
658
+ return dec
659
+
660
+ def temporal_tiled_decode(self, z: torch.Tensor):
661
+ """ Temporal tiling for latents. """
662
+ B, C, T, H, W = z.shape
663
+ overlap_size = int(self.tile_latent_min_tsize * (1 - self.tile_overlap_factor)) # 8 * (1 - 0.25) = 6
664
+ blend_extent = int(self.tile_sample_min_tsize * self.tile_overlap_factor) # 64 * 0.25 = 16
665
+ t_limit = self.tile_sample_min_tsize - blend_extent # 64 - 16 = 48
666
+ assert 0 < overlap_size < self.tile_latent_min_tsize
667
+
668
+ row = []
669
+ for i in range(0, T, overlap_size):
670
+ tile = z[:, :, i: i + self.tile_latent_min_tsize, :, :]
671
+ if self.use_spatial_tiling and (
672
+ tile.shape[-1] > self.tile_latent_min_size or tile.shape[-2] > self.tile_latent_min_size):
673
+ decoded = self.spatial_tiled_decode(tile)
674
+ else:
675
+ decoded = self.decoder(tile)
676
+ row.append(decoded)
677
+
678
+ result_row = []
679
+ for i, tile in enumerate(row):
680
+ if i > 0:
681
+ tile = self.blend_t(row[i - 1], tile, blend_extent)
682
+ result_row.append(tile[:, :, :t_limit, :, :])
683
+ dec = torch.cat(result_row, dim=-3)
684
+ return dec
685
+
686
+ def encode(self, x: Tensor, return_dict: bool = True):
687
+ """
688
+ Encodes the input by passing through the encoder network.
689
+ Supports slicing and tiling for memory efficiency.
690
+ """
691
+ def _encode(x):
692
+ if self.use_temporal_tiling and x.shape[-3] > self.tile_sample_min_tsize:
693
+ return self.temporal_tiled_encode(x)
694
+ if self.use_spatial_tiling and (
695
+ x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
696
+ return self.spatial_tiled_encode(x)
697
+
698
+ if self.use_compile:
699
+ @torch.compile
700
+ def encoder(x):
701
+ return self.encoder(x)
702
+ return encoder(x)
703
+ return self.encoder(x)
704
+
705
+ if len(x.shape) != 5: # (B, C, T, H, W)
706
+ x = x[:, :, None]
707
+ assert len(x.shape) == 5 # (B, C, T, H, W)
708
+ if x.shape[2] == 1:
709
+ x = x.expand(-1, -1, self.ffactor_temporal, -1, -1)
710
+ else:
711
+ assert x.shape[2] != self.ffactor_temporal and x.shape[2] % self.ffactor_temporal == 0
712
+
713
+ if self.use_slicing and x.shape[0] > 1:
714
+ if self.slicing_bsz == 1:
715
+ encoded_slices = [_encode(x_slice) for x_slice in x.split(1)]
716
+ else:
717
+ sections = [self.slicing_bsz] * (x.shape[0] // self.slicing_bsz)
718
+ if x.shape[0] % self.slicing_bsz != 0:
719
+ sections.append(x.shape[0] % self.slicing_bsz)
720
+ encoded_slices = [_encode(x_slice) for x_slice in x.split(sections)]
721
+ h = torch.cat(encoded_slices)
722
+ else:
723
+ h = _encode(x)
724
+ posterior = DiagonalGaussianDistribution(h)
725
+
726
+ if not return_dict:
727
+ return (posterior,)
728
+
729
+ return AutoencoderKLOutput(latent_dist=posterior)
730
+
731
+ def decode(self, z: Tensor, return_dict: bool = True, generator=None):
732
+ """
733
+ Decodes the input by passing through the decoder network.
734
+ Supports slicing and tiling for memory efficiency.
735
+ """
736
+ def _decode(z):
737
+ if self.use_temporal_tiling and z.shape[-3] > self.tile_latent_min_tsize:
738
+ return self.temporal_tiled_decode(z)
739
+ if self.use_spatial_tiling and (
740
+ z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
741
+ return self.spatial_tiled_decode(z)
742
+ return self.decoder(z)
743
+
744
+ if self.use_slicing and z.shape[0] > 1:
745
+ decoded_slices = [_decode(z_slice) for z_slice in z.split(1)]
746
+ decoded = torch.cat(decoded_slices)
747
+ else:
748
+ decoded = _decode(z)
749
+
750
+ if z.shape[-3] == 1:
751
+ decoded = decoded[:, :, -1:]
752
+
753
+ if not return_dict:
754
+ return (decoded,)
755
+
756
+ return DecoderOutput(sample=decoded)
757
+
758
+ def forward(
759
+ self,
760
+ sample: torch.Tensor,
761
+ sample_posterior: bool = False,
762
+ return_posterior: bool = True,
763
+ return_dict: bool = True
764
+ ):
765
+ posterior = self.encode(sample).latent_dist
766
+ z = posterior.sample() if sample_posterior else posterior.mode()
767
+ dec = self.decode(z).sample
768
+ return DecoderOutput(sample=dec, posterior=posterior) if return_dict else (dec, posterior)
769
+
770
+ def random_reset_tiling(self, x: torch.Tensor):
771
+ if x.shape[-3] == 1:
772
+ self.disable_spatial_tiling()
773
+ self.disable_temporal_tiling()
774
+ return
775
+
776
+ # Use fixed shape here
777
+ min_sample_size = int(1 / self.tile_overlap_factor) * self.ffactor_spatial
778
+ min_sample_tsize = int(1 / self.tile_overlap_factor) * self.ffactor_temporal
779
+ sample_size = random.choice([None, 1 * min_sample_size, 2 * min_sample_size, 3 * min_sample_size])
780
+ if sample_size is None:
781
+ self.disable_spatial_tiling()
782
+ else:
783
+ self.tile_sample_min_size = sample_size
784
+ self.tile_latent_min_size = sample_size // self.ffactor_spatial
785
+ self.enable_spatial_tiling()
786
+
787
+ sample_tsize = random.choice([None, 1 * min_sample_tsize, 2 * min_sample_tsize, 3 * min_sample_tsize])
788
+ if sample_tsize is None:
789
+ self.disable_temporal_tiling()
790
+ else:
791
+ self.tile_sample_min_tsize = sample_tsize
792
+ self.tile_latent_min_tsize = sample_tsize // self.ffactor_temporal
793
+ self.enable_temporal_tiling()
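
The autoencoder above depends only on `torch`, `einops`, and `diffusers`, so it can be exercised on its own. The following is a minimal sketch, not part of the upstream code: it assumes the file is importable as `autoencoder_kl_3d`, builds the model with the hyperparameters from the `vae` block of `config.json` below (random weights, CPU), and round-trips a single image through `encode`/`decode` to show the shapes involved.

```python
# Minimal sketch (assumptions: autoencoder_kl_3d.py is on PYTHONPATH; weights are
# randomly initialized, so only the tensor shapes are meaningful). Hyperparameters
# mirror the "vae" block of config.json in this commit.
import torch
from autoencoder_kl_3d import AutoencoderKLConv3D

vae = AutoencoderKLConv3D(
    in_channels=3,
    out_channels=3,
    latent_channels=32,
    block_out_channels=(128, 256, 512, 1024, 1024),
    layers_per_block=2,
    ffactor_spatial=16,     # 16x spatial compression
    ffactor_temporal=4,     # 4x temporal compression
    sample_size=384,
    sample_tsize=96,
    scaling_factor=0.562679178327931,
).eval()

with torch.no_grad():
    image = torch.randn(1, 3, 256, 256)        # (B, C, H, W); encode() adds the T axis itself
    posterior = vae.encode(image).latent_dist  # DiagonalGaussianDistribution
    z = posterior.mode()                       # (1, 32, 1, 16, 16): H/16, W/16, single latent frame
    recon = vae.decode(z).sample               # (1, 3, 1, 256, 256)

print(z.shape, recon.shape)
```

Tiled and sliced encoding (`enable_spatial_tiling`, `enable_slicing`) only change how the same computation is chunked, so the shapes above are unaffected.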
config.json ADDED
@@ -0,0 +1,273 @@
1
+ {
2
+ "add_classification_head": false,
3
+ "anyres_pooling_size": 2,
4
+ "anyres_vit_max_image_size": null,
5
+ "anyres_vit_two_views": false,
6
+ "architectures": [
7
+ "HunyuanImage3ForCausalMM"
8
+ ],
9
+ "attention_bias": false,
10
+ "attention_dropout": 0.0,
11
+ "attention_head_dim": 128,
12
+ "auto_map": {
13
+ "AutoConfig": "configuration_hunyuan.HunyuanImage3Config",
14
+ "AutoModel": "hunyuan.HunyuanImage3Model",
15
+ "AutoModelForCausalLM": "hunyuan.HunyuanImage3ForCausalMM"
16
+ },
17
+ "bos_token_id": 127958,
18
+ "cla_share_factor": 2,
19
+ "class_num": 0,
20
+ "dense_list": [
21
+ 4096,
22
+ 0
23
+ ],
24
+ "eod_token_id": 3,
25
+ "eos_token_id": 127957,
26
+ "group_limited_greedy": false,
27
+ "hidden_act": "silu",
28
+ "hidden_size": 4096,
29
+ "im_end_id": 128001,
30
+ "im_newline_id": 11,
31
+ "im_start_id": 128000,
32
+ "image_token_id": 128006,
33
+ "initializer_range": 0.02,
34
+ "intermediate_size": 3072,
35
+ "kv_lora_rank": null,
36
+ "mask_init_id": 12,
37
+ "max_position_embeddings": 12800,
38
+ "mlp_bias": false,
39
+ "model_type": "hunyuan_image_3_moe",
40
+ "moe_drop_tokens": false,
41
+ "moe_intermediate_size": [
42
+ 3072,
43
+ 3072,
44
+ 3072,
45
+ 3072,
46
+ 3072,
47
+ 3072,
48
+ 3072,
49
+ 3072,
50
+ 3072,
51
+ 3072,
52
+ 3072,
53
+ 3072,
54
+ 3072,
55
+ 3072,
56
+ 3072,
57
+ 3072,
58
+ 3072,
59
+ 3072,
60
+ 3072,
61
+ 3072,
62
+ 3072,
63
+ 3072,
64
+ 3072,
65
+ 3072,
66
+ 3072,
67
+ 3072,
68
+ 3072,
69
+ 3072,
70
+ 3072,
71
+ 3072,
72
+ 3072,
73
+ 3072
74
+ ],
75
+ "moe_layer_num_skipped": 0,
76
+ "moe_random_routing_dropped_token": false,
77
+ "moe_topk": [
78
+ 8,
79
+ 8,
80
+ 8,
81
+ 8,
82
+ 8,
83
+ 8,
84
+ 8,
85
+ 8,
86
+ 8,
87
+ 8,
88
+ 8,
89
+ 8,
90
+ 8,
91
+ 8,
92
+ 8,
93
+ 8,
94
+ 8,
95
+ 8,
96
+ 8,
97
+ 8,
98
+ 8,
99
+ 8,
100
+ 8,
101
+ 8,
102
+ 8,
103
+ 8,
104
+ 8,
105
+ 8,
106
+ 8,
107
+ 8,
108
+ 8,
109
+ 8
110
+ ],
111
+ "n_group": false,
112
+ "norm_topk_prob": true,
113
+ "norm_type": "rms",
114
+ "num_attention_heads": 32,
115
+ "num_experts": 64,
116
+ "num_hidden_layers": 32,
117
+ "num_key_value_heads": 8,
118
+ "num_media_embeds": 257,
119
+ "num_shared_expert": [
120
+ 1,
121
+ 1,
122
+ 1,
123
+ 1,
124
+ 1,
125
+ 1,
126
+ 1,
127
+ 1,
128
+ 1,
129
+ 1,
130
+ 1,
131
+ 1,
132
+ 1,
133
+ 1,
134
+ 1,
135
+ 1,
136
+ 1,
137
+ 1,
138
+ 1,
139
+ 1,
140
+ 1,
141
+ 1,
142
+ 1,
143
+ 1,
144
+ 1,
145
+ 1,
146
+ 1,
147
+ 1,
148
+ 1,
149
+ 1,
150
+ 1,
151
+ 1
152
+ ],
153
+ "pad_id": 128009,
154
+ "pad_token_id": 128009,
155
+ "pool_type": "last",
156
+ "position_embedding_xdrope": false,
157
+ "pretraining_tp": 1,
158
+ "q_lora_rank": null,
159
+ "qk_nope_head_dim": null,
160
+ "qk_rope_head_dim": null,
161
+ "rms_norm_eps": 1e-05,
162
+ "rope_scaling": {
163
+ "alpha": 1.0,
164
+ "beta_fast": 32,
165
+ "beta_slow": 1,
166
+ "factor": 1.0,
167
+ "mscale": 1.0,
168
+ "mscale_all_dim": 1.0,
169
+ "type": "custom"
170
+ },
171
+ "rope_theta": 10000.0,
172
+ "routed_scaling_factor": false,
173
+ "skip_cls_token": false,
174
+ "text_end_id": 7,
175
+ "text_start_id": 6,
176
+ "tie_word_embeddings": false,
177
+ "topk_group": false,
178
+ "torch_dtype": "bfloat16",
179
+ "transformers_version": "4.50.0",
180
+ "use_cache": true,
181
+ "use_cla": false,
182
+ "use_mixed_mlp_moe": true,
183
+ "use_mla": false,
184
+ "use_qk_norm": true,
185
+ "use_rotary_pos_emb": true,
186
+ "v_head_dim": null,
187
+ "video_end_id": 10,
188
+ "video_start_id": 9,
189
+ "vit_add_patchemb_bias": false,
190
+ "vit_input_resolution": 224,
191
+ "vit_mapping_type": "resampler",
192
+ "vit_norm_type": "fused",
193
+ "vit_patch": 1,
194
+ "vit_path": null,
195
+ "vit_remove_prenorm": false,
196
+ "vit_token": 64,
197
+ "vit_type": null,
198
+ "vit_used_rms_norm": false,
199
+ "vocab_size": 133120,
200
+ "xdrope_section": null,
201
+ "head_dim": 128,
202
+ "vae_downsample_factor": [
203
+ 16,
204
+ 16
205
+ ],
206
+ "vae": {
207
+ "_class_name": "AutoencoderKLConv3D",
208
+ "block_out_channels": [
209
+ 128,
210
+ 256,
211
+ 512,
212
+ 1024,
213
+ 1024
214
+ ],
215
+ "in_channels": 3,
216
+ "out_channels": 3,
217
+ "latent_channels": 32,
218
+ "layers_per_block": 2,
219
+ "ffactor_spatial": 16,
220
+ "ffactor_temporal": 4,
221
+ "sample_size": 384,
222
+ "sample_tsize": 96,
223
+ "downsample_match_channel": true,
224
+ "upsample_match_channel": true,
225
+ "scaling_factor": 0.562679178327931
226
+ },
227
+ "vit": {
228
+ "_attn_implementation": "sdpa",
229
+ "attention_dropout": 0.0,
230
+ "hidden_act": "gelu_pytorch_tanh",
231
+ "hidden_size": 1152,
232
+ "intermediate_size": 4304,
233
+ "layer_norm_eps": 1e-06,
234
+ "num_attention_heads": 16,
235
+ "num_channels": 3,
236
+ "num_hidden_layers": 27,
237
+ "num_patches": 256,
238
+ "patch_size": 16,
239
+ "torch_dtype": "float32",
240
+ "output_attentions": false,
241
+ "output_hidden_states": false,
242
+ "use_return_dict": true
243
+ },
244
+ "vit_processor": {
245
+ "do_convert_rgb": null,
246
+ "do_normalize": true,
247
+ "do_rescale": true,
248
+ "do_resize": true,
249
+ "image_mean": [
250
+ 0.5,
251
+ 0.5,
252
+ 0.5
253
+ ],
254
+ "image_processor_type": "Siglip2ImageProcessorFast",
255
+ "image_std": [
256
+ 0.5,
257
+ 0.5,
258
+ 0.5
259
+ ],
260
+ "max_num_patches": 1024,
261
+ "patch_size": 16,
262
+ "processor_class": "Siglip2Processor",
263
+ "resample": 2,
264
+ "rescale_factor": 0.00392156862745098
265
+ },
266
+ "vit_aligner": {
267
+ "projector_type": "mlp_gelu",
268
+ "input_dim": 1152,
269
+ "n_embed": 4096,
270
+ "depth": 2,
271
+ "torch_dtype": "float32"
272
+ }
273
+ }
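
The top-level fields of `config.json` describe the MoE language-model backbone, while the nested `vae`, `vit`, `vit_processor`, and `vit_aligner` blocks configure the image components. A few relations have to hold between them; the sketch below (an illustration, not part of the repository) reads the raw JSON and checks them.

```python
# Minimal sketch (assumption: run from a local checkout that contains config.json).
import json

with open("config.json") as f:
    cfg = json.load(f)

# Per-head width is hidden_size split across the attention heads: 4096 / 32 = 128.
assert cfg["attention_head_dim"] == cfg["hidden_size"] // cfg["num_attention_heads"]

# The MoE lists carry one entry per transformer layer (32 layers).
assert len(cfg["moe_topk"]) == cfg["num_hidden_layers"]
assert len(cfg["moe_intermediate_size"]) == cfg["num_hidden_layers"]
assert len(cfg["num_shared_expert"]) == cfg["num_hidden_layers"]

# The backbone's notion of VAE compression must match the VAE itself: (16, 16).
assert list(cfg["vae_downsample_factor"]) == [cfg["vae"]["ffactor_spatial"]] * 2

print("config.json is internally consistent")
```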
configuration_hunyuan.py ADDED
@@ -0,0 +1,285 @@
1
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ # ==============================================================================
13
+
14
+ from transformers.configuration_utils import PretrainedConfig
15
+ from transformers.utils import logging
16
+ from typing import List, Union
17
+
18
+
19
+ logger = logging.get_logger(__name__)
20
+
21
+
22
+ class HunyuanImage3Config(PretrainedConfig):
23
+ r"""
24
+ This is the configuration class to store the configuration of a [`HunyuanImage3Model`]. It is used to instantiate
25
+ a Hunyuan model according to the specified arguments, defining the model architecture. Instantiating a
26
+ configuration with the defaults will yield a similar configuration to that of the Hunyuan-7B.
27
+
28
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
29
+ documentation from [`PretrainedConfig`] for more information.
30
+
31
+
32
+ Args:
33
+ vocab_size (`int`, *optional*, defaults to 32000):
34
+ Vocabulary size of the Hunyuan Image 3 model. Defines the number of different tokens that can be
35
+ represented by the `input_ids` passed when calling [`HunyuanImage3Model`]
36
+ hidden_size (`int`, *optional*, defaults to 4096):
37
+ Dimension of the hidden representations.
38
+ intermediate_size (`int`, *optional*, defaults to 11008):
39
+ Dimension of the MLP representations or shared MLP representations.
40
+ moe_intermediate_size (`int` or `List`, *optional*, defaults to 11008):
41
+ Dimension of the MLP representations in MoE. Use a list if you want a different size per layer.
42
+ num_hidden_layers (`int`, *optional*, defaults to 32):
43
+ Number of hidden layers in the Transformer decoder.
44
+ num_attention_heads (`int`, *optional*, defaults to 32):
45
+ Number of attention heads for each attention layer in the Transformer decoder.
46
+ num_key_value_heads (`int`, *optional*):
47
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
48
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
49
+ `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
50
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
51
+ by meanpooling all the original heads within that group. For more details checkout [this
52
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
53
+ `num_attention_heads`.
54
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
55
+ The non-linear activation function (function or string) in the decoder.
56
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
57
+ The maximum sequence length that this model might ever be used with.
58
+ initializer_range (`float`, *optional*, defaults to 0.02):
59
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
60
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
61
+ The epsilon used by the rms normalization layers.
62
+ use_cache (`bool`, *optional*, defaults to `True`):
63
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
64
+ relevant if `config.is_decoder=True`.
65
+ pad_token_id (`int`, *optional*):
66
+ Padding token id.
67
+ bos_token_id (`int`, *optional*, defaults to 1):
68
+ Beginning of stream token id.
69
+ eos_token_id (`int`, *optional*, defaults to 2):
70
+ End of stream token id.
71
+ pretraining_tp (`int`, *optional*, defaults to 1):
72
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
73
+ document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
74
+ necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
75
+ issue](https://github.com/pytorch/pytorch/issues/76232).
76
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
77
+ Whether to tie weight embeddings
78
+ rope_theta (`float`, *optional*, defaults to 10000.0):
79
+ The base period of the RoPE embeddings.
80
+ rope_scaling (`Dict`, *optional*):
81
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
82
+ strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
83
+ `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
84
+ `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
85
+ these scaling strategies behave:
86
+ https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
87
+ experimental feature, subject to breaking API changes in future versions.
88
+ attention_bias (`bool`, *optional*, defaults to `False`):
89
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
90
+ attention_dropout (`float`, *optional*, defaults to 0.0):
91
+ The dropout ratio for the attention probabilities.
92
+ use_qk_norm (`bool`, *optional*, defaults to `False`):
93
+ Whether to apply normalization to the query and key in attention (QK norm)
94
+ use_cla (`bool`, *optional*, defaults to `False`):
95
+ Whether to use CLA in attention
96
+ cla_share_factor (`int`, *optional*, defaults to 1):
97
+ The share factor of CLA
98
+ num_experts (`int` or `List`, *optional*, defaults to 1):
99
+ The number of experts for moe. If it is a list, it will be used as the number of experts for each layer.
100
+ num_shared_expert (`int` or `List`, *optional*, defaults to 1):
101
+ The number of shared experts for moe. If it is a list, it will be used as the number of shared experts
102
+ for each layer.
103
+ moe_topk (`int` or `List`, *optional*, defaults to 1):
104
+ The topk value for moe. If it is a list, it will be used as the topk value for each layer.
105
+ capacity_factor (`float` or `List`, *optional*, defaults to 1.0, currently unused):
106
+ The capacity factor for moe. If it is a list, it will be used as the capacity factor for each layer.
107
+ moe_layer_num_skipped (`int`, *optional*, defaults to 0):
108
+ First moe_layer_num_skipped layers do not use MoE.
109
+ """
110
+
111
+ model_type = "Hunyuan"
112
+ keys_to_ignore_at_inference = ["past_key_values"]
113
+
114
+ def __init__(
115
+ self,
116
+ vocab_size=290943,
117
+ hidden_size=4096,
118
+ intermediate_size: int=11008,
119
+ moe_intermediate_size: Union[int, List]=None,
120
+ num_hidden_layers=32,
121
+ num_attention_heads=32,
122
+ num_key_value_heads=None,
123
+ attention_head_dim=None,
124
+ hidden_act="silu",
125
+ max_position_embeddings=2048,
126
+ initializer_range=0.02,
127
+ rms_norm_eps=1e-5,
128
+ use_cache=True,
129
+ pad_token_id=0,
130
+ bos_token_id=1,
131
+ eos_token_id=2,
132
+ eod_token_id=3,
133
+ im_start_id=4,
134
+ im_end_id=5,
135
+ text_start_id=6,
136
+ text_end_id=7,
137
+ image_token_id=8,
138
+ video_start_id=9,
139
+ video_end_id=10,
140
+ im_newline_id=11,
141
+ mask_init_id=12,
142
+ pretraining_tp=1,
143
+ tie_word_embeddings=False,
144
+ rope_theta=10000.0,
145
+ rope_scaling=None,
146
+ attention_bias=False,
147
+ mlp_bias=False,
148
+ attention_dropout=0.0,
149
+ use_qk_norm=False,
150
+ use_rotary_pos_emb=True,
151
+ use_cla=False,
152
+ cla_share_factor=1,
153
+ norm_type="hf_rms",
154
+ num_experts: Union[int, List] = 1,
155
+ use_mixed_mlp_moe=False,
156
+ num_shared_expert: Union[int, List] = 1,
157
+ moe_topk: Union[int, List] = 1,
158
+ capacity_factor: float = 1.0,
159
+ moe_drop_tokens=False,
160
+ moe_random_routing_dropped_token=False,
161
+ use_mla=False,
162
+ kv_lora_rank=512,
163
+ q_lora_rank=1536,
164
+ qk_rope_head_dim=64,
165
+ v_head_dim=128,
166
+ qk_nope_head_dim=128,
167
+ moe_layer_num_skipped=0,
168
+ norm_topk_prob=True,
169
+ routed_scaling_factor=1.0,
170
+ group_limited_greedy=False,
171
+ n_group=None,
172
+ topk_group=None,
173
+ add_classification_head=False,
174
+ class_num=0,
175
+ pool_type="last",
176
+ pad_id=-1,
177
+ # Added
178
+ moe_impl="eager",
179
+ vae_downsample_factor=(16, 16), # (h, w)
180
+ img_proj_type="unet",
181
+ patch_size=1,
182
+ patch_embed_hidden_dim=1024,
183
+ image_base_size=1024,
184
+ vae=None,
185
+ vit=None,
186
+ vit_processor=None,
187
+ vit_aligner=None,
188
+ **kwargs,
189
+ ):
190
+ self.vocab_size = vocab_size
191
+ self.max_position_embeddings = max_position_embeddings
192
+ self.hidden_size = hidden_size
193
+ self.intermediate_size = intermediate_size
194
+ self.moe_intermediate_size = moe_intermediate_size
195
+ self.num_hidden_layers = num_hidden_layers
196
+ self.num_attention_heads = num_attention_heads
197
+ self.moe_impl = moe_impl
198
+ self.num_experts = num_experts
199
+ self.use_mixed_mlp_moe = use_mixed_mlp_moe
200
+ self.num_shared_expert = num_shared_expert
201
+ self.moe_topk = moe_topk
202
+ self.capacity_factor = capacity_factor
203
+ self.moe_drop_tokens = moe_drop_tokens
204
+ self.moe_random_routing_dropped_token = moe_random_routing_dropped_token
205
+
206
+ if attention_head_dim is not None:
207
+ self.attention_head_dim = attention_head_dim
208
+ else:
209
+ self.attention_head_dim = self.hidden_size // num_attention_heads
210
+
211
+ # for backward compatibility
212
+ if num_key_value_heads is None:
213
+ num_key_value_heads = num_attention_heads
214
+
215
+ self.num_key_value_heads = num_key_value_heads
216
+ self.hidden_act = hidden_act
217
+ self.initializer_range = initializer_range
218
+ self.rms_norm_eps = rms_norm_eps
219
+ self.pretraining_tp = pretraining_tp
220
+ self.use_cache = use_cache
221
+ self.rope_theta = rope_theta
222
+ self.rope_scaling = rope_scaling
223
+ self.attention_bias = attention_bias
224
+ self.mlp_bias = mlp_bias
225
+ self.attention_dropout = attention_dropout
226
+ self.use_qk_norm = use_qk_norm
227
+ self.use_rotary_pos_emb = use_rotary_pos_emb
228
+ self.use_cla = use_cla
229
+ self.cla_share_factor = cla_share_factor
230
+ self.norm_type = norm_type
231
+ # MLA args
232
+ self.use_mla = use_mla
233
+ self.kv_lora_rank = kv_lora_rank
234
+ self.q_lora_rank = q_lora_rank
235
+ self.qk_rope_head_dim = qk_rope_head_dim
236
+ self.qk_nope_head_dim = qk_nope_head_dim
237
+ self.v_head_dim = v_head_dim
238
+
239
+ # DeepSeek related args
240
+ self.moe_layer_num_skipped = moe_layer_num_skipped
241
+ self.norm_topk_prob = norm_topk_prob
242
+ self.routed_scaling_factor = routed_scaling_factor
243
+ self.group_limited_greedy = group_limited_greedy
244
+ self.n_group = n_group
245
+ self.topk_group = topk_group
246
+ self.add_classification_head = add_classification_head
247
+ self.class_num = class_num
248
+ self.pool_type = pool_type
249
+ self.pad_id = pad_id
250
+
251
+ if self.class_num is not None:
252
+ self.dense_list = [self.hidden_size, self.class_num]
253
+
254
+ # ViT args
255
+ self.vit = vit
256
+ self.vit_processor = vit_processor
257
+ self.vit_aligner = vit_aligner
258
+
259
+ # Image Gen args
260
+ self.vae = vae
261
+ self.vae_downsample_factor = vae_downsample_factor
262
+ self.img_proj_type = img_proj_type
263
+ self.patch_size = patch_size
264
+ self.patch_embed_hidden_dim = patch_embed_hidden_dim
265
+ self.image_base_size = image_base_size
266
+
267
+ # token id
268
+ self.eod_token_id = eod_token_id
269
+ self.im_start_id = im_start_id
270
+ self.im_end_id = im_end_id
271
+ self.text_start_id = text_start_id
272
+ self.text_end_id = text_end_id
273
+ self.image_token_id = image_token_id
274
+ self.video_start_id = video_start_id
275
+ self.video_end_id = video_end_id
276
+ self.im_newline_id = im_newline_id
277
+ self.mask_init_id = mask_init_id
278
+
279
+ super().__init__(
280
+ pad_token_id=pad_token_id,
281
+ bos_token_id=bos_token_id,
282
+ eos_token_id=eos_token_id,
283
+ tie_word_embeddings=tie_word_embeddings,
284
+ **kwargs,
285
+ )
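
As a quick check of how the defaults in this class interact, the sketch below (illustrative only; it assumes `configuration_hunyuan.py` is importable from the working directory) builds a config with a handful of the values used in `config.json` and shows the derived per-head dimension and grouped-query setting.

```python
# Minimal sketch (assumption: configuration_hunyuan.py is on PYTHONPATH).
from configuration_hunyuan import HunyuanImage3Config

config = HunyuanImage3Config(
    vocab_size=133120,
    hidden_size=4096,
    num_attention_heads=32,
    num_key_value_heads=8,       # 8 KV heads for 32 query heads -> grouped-query attention
    num_hidden_layers=32,
    num_experts=64,
    moe_topk=[8] * 32,           # route each token to 8 of 64 experts, per layer
    num_shared_expert=[1] * 32,
    use_mixed_mlp_moe=True,
    use_qk_norm=True,
    max_position_embeddings=12800,
)

print(config.attention_head_dim)   # 128, derived as hidden_size // num_attention_heads
print(config.num_key_value_heads)  # 8
```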
generation_config.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "disable_compile": true,
3
+ "eos_token_id": [
4
+ 127957
5
+ ],
6
+ "pad_token_id": 128009,
7
+ "do_sample": true,
8
+ "top_k": 1024,
9
+ "top_p": 0.95,
10
+ "temperature": 0.6,
11
+ "max_length": 12800,
12
+ "sequence_template": "pretrain",
13
+ "diff_infer_steps": 50,
14
+ "diff_guidance_scale": 5.0,
15
+ "flow_shift": 3.0,
16
+ "use_system_prompt": "None",
17
+ "drop_think": false,
18
+ "bot_task": "image",
19
+ "transformers_version": "4.50.0"
20
+ }
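
`generation_config.json` mixes standard text-sampling fields (`do_sample`, `top_k`, `top_p`, `temperature`) with image-generation knobs (`diff_infer_steps`, `diff_guidance_scale`, `flow_shift`, `bot_task`) that are consumed by the custom modeling code in `hunyuan.py`, whose diff is not rendered on this page. A hedged sketch of loading and overriding it with the standard `transformers` API:

```python
# Minimal sketch (assumptions: run from a local checkout of this repo; transformers'
# GenerationConfig keeps non-standard keys such as diff_infer_steps as plain
# attributes -- how they are interpreted is defined by hunyuan.py, not shown here).
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained(".")   # picks up ./generation_config.json

print(gen_cfg.top_p, gen_cfg.temperature)  # 0.95 0.6
print(gen_cfg.diff_infer_steps)            # 50 (custom field)

gen_cfg.diff_guidance_scale = 7.5          # stronger guidance for image generation
gen_cfg.diff_infer_steps = 100             # more denoising steps
```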
hunyuan.py ADDED
The diff for this file is too large to render. See raw diff
 
hunyuan_image_3_pipeline.py ADDED
@@ -0,0 +1,879 @@
1
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ # ==============================================================================
13
+ #
14
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
15
+ #
16
+ # Licensed under the Apache License, Version 2.0 (the "License");
17
+ # you may not use this file except in compliance with the License.
18
+ # You may obtain a copy of the License at
19
+ #
20
+ # http://www.apache.org/licenses/LICENSE-2.0
21
+ #
22
+ # Unless required by applicable law or agreed to in writing, software
23
+ # distributed under the License is distributed on an "AS IS" BASIS,
24
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25
+ # See the License for the specific language governing permissions and
26
+ # limitations under the License.
27
+ # ==============================================================================================
28
+
29
+ import inspect
30
+ import math
31
+ from dataclasses import dataclass
32
+ from typing import Any, Callable, Dict, List
33
+ from typing import Optional, Tuple, Union
34
+
35
+ import numpy as np
36
+ import torch
37
+ from PIL import Image
38
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
39
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
40
+ from diffusers.image_processor import VaeImageProcessor
41
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
42
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
43
+ from diffusers.utils import BaseOutput, logging
44
+ from diffusers.utils.torch_utils import randn_tensor
45
+
46
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
47
+
48
+
49
+ def retrieve_timesteps(
50
+ scheduler,
51
+ num_inference_steps: Optional[int] = None,
52
+ device: Optional[Union[str, torch.device]] = None,
53
+ timesteps: Optional[List[int]] = None,
54
+ sigmas: Optional[List[float]] = None,
55
+ **kwargs,
56
+ ):
57
+ """
58
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
59
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
60
+
61
+ Args:
62
+ scheduler (`SchedulerMixin`):
63
+ The scheduler to get timesteps from.
64
+ num_inference_steps (`int`):
65
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
66
+ must be `None`.
67
+ device (`str` or `torch.device`, *optional*):
68
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
69
+ timesteps (`List[int]`, *optional*):
70
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
71
+ `num_inference_steps` and `sigmas` must be `None`.
72
+ sigmas (`List[float]`, *optional*):
73
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
74
+ `num_inference_steps` and `timesteps` must be `None`.
75
+
76
+ Returns:
77
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
78
+ second element is the number of inference steps.
79
+ """
80
+ if timesteps is not None and sigmas is not None:
81
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
82
+ if timesteps is not None:
83
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
84
+ if not accepts_timesteps:
85
+ raise ValueError(
86
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
87
+ f" timestep schedules. Please check whether you are using the correct scheduler."
88
+ )
89
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
90
+ timesteps = scheduler.timesteps
91
+ num_inference_steps = len(timesteps)
92
+ elif sigmas is not None:
93
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
94
+ if not accept_sigmas:
95
+ raise ValueError(
96
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
97
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
98
+ )
99
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
100
+ timesteps = scheduler.timesteps
101
+ num_inference_steps = len(timesteps)
102
+ else:
103
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
104
+ timesteps = scheduler.timesteps
105
+ return timesteps, num_inference_steps
106
+
107
+
108
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
109
+ r"""
110
+ Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
111
+ Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
112
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf).
113
+
114
+ Args:
115
+ noise_cfg (`torch.Tensor`):
116
+ The predicted noise tensor for the guided diffusion process.
117
+ noise_pred_text (`torch.Tensor`):
118
+ The predicted noise tensor for the text-guided diffusion process.
119
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
120
+ A rescale factor applied to the noise predictions.
121
+ Returns:
122
+ noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
123
+ """
124
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
125
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
126
+ # rescale the results from guidance (fixes overexposure)
127
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
128
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
129
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
130
+ return noise_cfg
131
+
132
+
133
+ @dataclass
134
+ class HunyuanImage3Text2ImagePipelineOutput(BaseOutput):
135
+ samples: Union[List[Any], np.ndarray]
136
+
137
+
138
+ @dataclass
139
+ class FlowMatchDiscreteSchedulerOutput(BaseOutput):
140
+ """
141
+ Output class for the scheduler's `step` function output.
142
+
143
+ Args:
144
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
145
+ Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
146
+ denoising loop.
147
+ """
148
+
149
+ prev_sample: torch.FloatTensor
150
+
151
+
152
+ class FlowMatchDiscreteScheduler(SchedulerMixin, ConfigMixin):
153
+ """
154
+ Flow-matching discrete scheduler supporting Euler, Heun (2nd-order), midpoint (2nd-order), and classic 4th-order Runge-Kutta solvers.
155
+
156
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
157
+ methods the library implements for all schedulers such as loading and saving.
158
+
159
+ Args:
160
+ num_train_timesteps (`int`, defaults to 1000):
161
+ The number of diffusion steps to train the model.
162
+ solver (`str`, defaults to `"euler"`):
164
+ The ODE solver used during sampling. One of `"euler"`, `"heun-2"`, `"midpoint-2"`, or
165
+ `"kutta-4"`.
165
+ shift (`float`, defaults to 1.0):
166
+ The shift value for the timestep schedule.
167
+ reverse (`bool`, defaults to `True`):
168
+ Whether to reverse the timestep schedule.
169
+ """
170
+
171
+ _compatibles = []
172
+ order = 1
173
+
174
+ @register_to_config
175
+ def __init__(
176
+ self,
177
+ num_train_timesteps: int = 1000,
178
+ shift: float = 1.0,
179
+ reverse: bool = True,
180
+ solver: str = "euler",
181
+ use_flux_shift: bool = False,
182
+ flux_base_shift: float = 0.5,
183
+ flux_max_shift: float = 1.15,
184
+ n_tokens: Optional[int] = None,
185
+ ):
186
+ sigmas = torch.linspace(1, 0, num_train_timesteps + 1)
187
+
188
+ if not reverse:
189
+ sigmas = sigmas.flip(0)
190
+
191
+ self.sigmas = sigmas
192
+ # the value fed to model
193
+ self.timesteps = (sigmas[:-1] * num_train_timesteps).to(dtype=torch.float32)
194
+ self.timesteps_full = (sigmas * num_train_timesteps).to(dtype=torch.float32)
195
+
196
+ self._step_index = None
197
+ self._begin_index = None
198
+
199
+ self.supported_solver = [
200
+ "euler",
201
+ "heun-2", "midpoint-2",
202
+ "kutta-4",
203
+ ]
204
+ if solver not in self.supported_solver:
205
+ raise ValueError(f"Solver {solver} not supported. Supported solvers: {self.supported_solver}")
206
+
207
+ # empty dt and derivative (for heun)
208
+ self.derivative_1 = None
209
+ self.derivative_2 = None
210
+ self.derivative_3 = None
211
+ self.dt = None
212
+
213
+ @property
214
+ def step_index(self):
215
+ """
216
+ The index counter for the current timestep. It increases by 1 after each scheduler step.
217
+ """
218
+ return self._step_index
219
+
220
+ @property
221
+ def begin_index(self):
222
+ """
223
+ The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
224
+ """
225
+ return self._begin_index
226
+
227
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
228
+ def set_begin_index(self, begin_index: int = 0):
229
+ """
230
+ Sets the begin index for the scheduler. This function should be run from the pipeline before inference.
231
+
232
+ Args:
233
+ begin_index (`int`):
234
+ The begin index for the scheduler.
235
+ """
236
+ self._begin_index = begin_index
237
+
238
+ def _sigma_to_t(self, sigma):
239
+ return sigma * self.config.num_train_timesteps
240
+
241
+ @property
242
+ def state_in_first_order(self):
243
+ return self.derivative_1 is None
244
+
245
+ @property
246
+ def state_in_second_order(self):
247
+ return self.derivative_2 is None
248
+
249
+ @property
250
+ def state_in_third_order(self):
251
+ return self.derivative_3 is None
252
+
253
+ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None,
254
+ n_tokens: int = None):
255
+ """
256
+ Sets the discrete timesteps used for the diffusion chain (to be run before inference).
257
+
258
+ Args:
259
+ num_inference_steps (`int`):
260
+ The number of diffusion steps used when generating samples with a pre-trained model.
261
+ device (`str` or `torch.device`, *optional*):
262
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
263
+ n_tokens (`int`, *optional*):
264
+ Number of tokens in the input sequence.
265
+ """
266
+ self.num_inference_steps = num_inference_steps
267
+
268
+ sigmas = torch.linspace(1, 0, num_inference_steps + 1)
269
+
270
+ # Apply timestep shift
271
+ if self.config.use_flux_shift:
272
+ assert isinstance(n_tokens, int), "n_tokens should be provided for flux shift"
273
+ mu = self.get_lin_function(y1=self.config.flux_base_shift, y2=self.config.flux_max_shift)(n_tokens)
274
+ sigmas = self.flux_time_shift(mu, 1.0, sigmas)
275
+ elif self.config.shift != 1.:
276
+ sigmas = self.sd3_time_shift(sigmas)
277
+
278
+ if not self.config.reverse:
279
+ sigmas = 1 - sigmas
280
+
281
+ self.sigmas = sigmas
282
+ self.timesteps = (sigmas[:-1] * self.config.num_train_timesteps).to(dtype=torch.float32, device=device)
283
+ self.timesteps_full = (sigmas * self.config.num_train_timesteps).to(dtype=torch.float32, device=device)
284
+
285
+ # empty dt and derivative (for kutta)
286
+ self.derivative_1 = None
287
+ self.derivative_2 = None
288
+ self.derivative_3 = None
289
+ self.dt = None
290
+
291
+ # Reset step index
292
+ self._step_index = None
293
+
294
+ def index_for_timestep(self, timestep, schedule_timesteps=None):
295
+ if schedule_timesteps is None:
296
+ schedule_timesteps = self.timesteps
297
+
298
+ indices = (schedule_timesteps == timestep).nonzero()
299
+
300
+ # The sigma index that is taken for the **very** first `step`
301
+ # is always the second index (or the last index if there is only 1)
302
+ # This way we can ensure we don't accidentally skip a sigma in
303
+ # case we start in the middle of the denoising schedule (e.g. for image-to-image)
304
+ pos = 1 if len(indices) > 1 else 0
305
+
306
+ return indices[pos].item()
307
+
308
+ def _init_step_index(self, timestep):
309
+ if self.begin_index is None:
310
+ if isinstance(timestep, torch.Tensor):
311
+ timestep = timestep.to(self.timesteps.device)
312
+ self._step_index = self.index_for_timestep(timestep)
313
+ else:
314
+ self._step_index = self._begin_index
315
+
316
+ def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
317
+ return sample
318
+
319
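    # The three helpers below implement the timestep-shift schedules used in `set_timesteps`:
    # - get_lin_function: linearly maps the image token count to a shift parameter `mu`
    #   (flux_base_shift at 256 tokens, flux_max_shift at 4096 tokens by default).
    # - flux_time_shift: resolution-dependent shift, t' = exp(mu) / (exp(mu) + (1/t - 1)**sigma).
    # - sd3_time_shift: static shift, t' = shift * t / (1 + (shift - 1) * t).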
+ @staticmethod
320
+ def get_lin_function(x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15):
321
+ m = (y2 - y1) / (x2 - x1)
322
+ b = y1 - m * x1
323
+ return lambda x: m * x + b
324
+
325
+ @staticmethod
326
+ def flux_time_shift(mu: float, sigma: float, t: torch.Tensor):
327
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
328
+
329
+ def sd3_time_shift(self, t: torch.Tensor):
330
+ return (self.config.shift * t) / (1 + (self.config.shift - 1) * t)
331
+
332
+ def step(
333
+ self,
334
+ model_output: torch.FloatTensor,
335
+ timestep: Union[float, torch.FloatTensor],
336
+ sample: torch.FloatTensor,
337
+ pred_uncond: torch.FloatTensor = None,
338
+ generator: Optional[torch.Generator] = None,
339
+ n_tokens: Optional[int] = None,
340
+ return_dict: bool = True,
341
+ ) -> Union[FlowMatchDiscreteSchedulerOutput, Tuple]:
342
+ """
343
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
344
+ process from the learned model outputs (most often the predicted noise).
345
+
346
+ Args:
347
+ model_output (`torch.FloatTensor`):
348
+ The direct output from learned diffusion model.
349
+ timestep (`float`):
350
+ The current discrete timestep in the diffusion chain.
351
+ sample (`torch.FloatTensor`):
352
+ A current instance of a sample created by the diffusion process.
353
+ generator (`torch.Generator`, *optional*):
354
+ A random number generator.
355
+ n_tokens (`int`, *optional*):
356
+ Number of tokens in the input sequence.
357
+ return_dict (`bool`):
358
+ Whether or not to return a [`FlowMatchDiscreteSchedulerOutput`] or
359
+ tuple.
360
+
361
+ Returns:
362
+ [`FlowMatchDiscreteSchedulerOutput`] or `tuple`:
363
+ If return_dict is `True`, [`FlowMatchDiscreteSchedulerOutput`] is
364
+ returned, otherwise a tuple is returned where the first element is the sample tensor.
365
+ """
366
+
367
+ if (
368
+ isinstance(timestep, int)
369
+ or isinstance(timestep, torch.IntTensor)
370
+ or isinstance(timestep, torch.LongTensor)
371
+ ):
372
+ raise ValueError(
373
+ (
374
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
375
+ " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
376
+ " one of the `scheduler.timesteps` as a timestep."
377
+ ),
378
+ )
379
+
380
+ if self.step_index is None:
381
+ self._init_step_index(timestep)
382
+
383
+ # Upcast to avoid precision issues when computing prev_sample
384
+ sample = sample.to(torch.float32)
385
+ model_output = model_output.to(torch.float32)
386
+ pred_uncond = pred_uncond.to(torch.float32) if pred_uncond is not None else None
387
+
388
+ # dt = self.sigmas[self.step_index + 1] - self.sigmas[self.step_index]
389
+ sigma = self.sigmas[self.step_index]
390
+ sigma_next = self.sigmas[self.step_index + 1]
391
+
392
+ last_inner_step = True
393
+ if self.config.solver == "euler":
394
+ derivative, dt, sample, last_inner_step = self.first_order_method(model_output, sigma, sigma_next, sample)
395
+ elif self.config.solver in ["heun-2", "midpoint-2"]:
396
+ derivative, dt, sample, last_inner_step = self.second_order_method(model_output, sigma, sigma_next, sample)
397
+ elif self.config.solver == "kutta-4":
398
+ derivative, dt, sample, last_inner_step = self.fourth_order_method(model_output, sigma, sigma_next, sample)
399
+ else:
400
+ raise ValueError(f"Solver {self.config.solver} not supported. Supported solvers: {self.supported_solver}")
401
+
402
+ prev_sample = sample + derivative * dt
403
+
404
+ # Cast sample back to model compatible dtype
405
+ # prev_sample = prev_sample.to(model_output.dtype)
406
+
407
+ # upon completion increase step index by one
408
+ if last_inner_step:
409
+ self._step_index += 1
410
+
411
+ if not return_dict:
412
+ return (prev_sample,)
413
+
414
+ return FlowMatchDiscreteSchedulerOutput(prev_sample=prev_sample)
415
+
416
+ def first_order_method(self, model_output, sigma, sigma_next, sample):
417
+ derivative = model_output
418
+ dt = sigma_next - sigma
419
+ return derivative, dt, sample, True
420
+
421
+ def second_order_method(self, model_output, sigma, sigma_next, sample):
422
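        # Two-stage solver: the first call stores the derivative and the full dt, then steps by
        # dt (heun-2) or dt/2 (midpoint-2); the second call combines the stored derivative with
        # the new one (heun-2 averages the two, midpoint-2 keeps only the midpoint derivative)
        # and takes the full step from the stashed sample before resetting the state.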
+ if self.state_in_first_order:
423
+ # store for 2nd order step
424
+ self.derivative_1 = model_output
425
+ self.dt = sigma_next - sigma
426
+ self.sample = sample
427
+
428
+ derivative = model_output
429
+ if self.config.solver == 'heun-2':
430
+ dt = self.dt
431
+ elif self.config.solver == 'midpoint-2':
432
+ dt = self.dt / 2
433
+ else:
434
+ raise NotImplementedError(f"Solver {self.config.solver} not supported.")
435
+ last_inner_step = False
436
+
437
+ else:
438
+ if self.config.solver == 'heun-2':
439
+ derivative = 0.5 * (self.derivative_1 + model_output)
440
+ elif self.config.solver == 'midpoint-2':
441
+ derivative = model_output
442
+ else:
443
+ raise NotImplementedError(f"Solver {self.config.solver} not supported.")
444
+
445
+ # 3. take prev timestep & sample
446
+ dt = self.dt
447
+ sample = self.sample
448
+ last_inner_step = True
449
+
450
+ # free dt and derivative
451
+ # Note, this puts the scheduler in "first order mode"
452
+ self.derivative_1 = None
453
+ self.dt = None
454
+ self.sample = None
455
+
456
+ return derivative, dt, sample, last_inner_step
457
+
458
+ def fourth_order_method(self, model_output, sigma, sigma_next, sample):
459
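        # Classic RK4 staging: stages 1 and 2 step by dt/2 from the stashed sample, stage 3 steps
        # by the full dt, and the final stage combines the four derivatives with weights
        # 1/6, 1/3, 1/3, 1/6 before taking the full step from the original sample.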
+ if self.state_in_first_order:
460
+ self.derivative_1 = model_output
461
+ self.dt = sigma_next - sigma
462
+ self.sample = sample
463
+ derivative = model_output
464
+ dt = self.dt / 2
465
+ last_inner_step = False
466
+
467
+ elif self.state_in_second_order:
468
+ self.derivative_2 = model_output
469
+ derivative = model_output
470
+ dt = self.dt / 2
471
+ last_inner_step = False
472
+
473
+ elif self.state_in_third_order:
474
+ self.derivative_3 = model_output
475
+ derivative = model_output
476
+ dt = self.dt
477
+ last_inner_step = False
478
+
479
+ else:
480
+ derivative = (1/6 * self.derivative_1 + 1/3 * self.derivative_2 + 1/3 * self.derivative_3 +
481
+ 1/6 * model_output)
482
+
483
+ # 3. take prev timestep & sample
484
+ dt = self.dt
485
+ sample = self.sample
486
+ last_inner_step = True
487
+
488
+ # free dt and derivative
489
+ # Note, this puts the scheduler in "first order mode"
490
+ self.derivative_1 = None
491
+ self.derivative_2 = None
492
+ self.derivative_3 = None
493
+ self.dt = None
494
+ self.sample = None
495
+
496
+ return derivative, dt, sample, last_inner_step
497
+
498
+ def __len__(self):
499
+ return self.config.num_train_timesteps
500
+
501
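# [Editor's sketch] Driving the scheduler directly, not part of the original commit; the scheduler
# settings, latent shape, and zero "model output" are placeholders for the real network prediction.
def _example_scheduler_loop():
    scheduler = FlowMatchDiscreteScheduler(num_train_timesteps=1000, shift=3.0, solver="euler")
    scheduler.set_timesteps(num_inference_steps=20, device="cpu")
    sample = torch.randn(1, 16, 64, 64)
    for t in scheduler.timesteps:
        model_output = torch.zeros_like(sample)  # stand-in for the model's velocity prediction
        sample = scheduler.step(model_output, t, sample).prev_sample
    return sample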
+
502
+ class ClassifierFreeGuidance:
503
+ def __init__(
504
+ self,
505
+ use_original_formulation: bool = False,
506
+ start: float = 0.0,
507
+ stop: float = 1.0,
508
+ ):
509
+ super().__init__()
510
+ self.use_original_formulation = use_original_formulation
511
+
512
+ def __call__(
513
+ self,
514
+ pred_cond: torch.Tensor,
515
+ pred_uncond: Optional[torch.Tensor],
516
+ guidance_scale: float,
517
+ step: int,
518
+ ) -> torch.Tensor:
519
+
520
+ shift = pred_cond - pred_uncond
521
+ pred = pred_cond if self.use_original_formulation else pred_uncond
522
+ pred = pred + guidance_scale * shift
523
+
524
+ return pred
525
+
526
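# [Editor's note] Worked example of the combination above: with pred_uncond = 0.2, pred_cond = 0.8
# and guidance_scale = 7.5, the default formulation returns 0.2 + 7.5 * (0.8 - 0.2) = 4.7, while
# use_original_formulation=True returns 0.8 + 7.5 * (0.8 - 0.2) = 5.3.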
+
527
+ class HunyuanImage3Text2ImagePipeline(DiffusionPipeline):
528
+ r"""
529
+ Pipeline for text-conditioned image generation with HunyuanImage-3.0.
530
+
531
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
532
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
533
+
534
+ Args:
535
+ model ([`ModelMixin`]):
536
+ A model to denoise the diffused latents.
537
+ scheduler ([`SchedulerMixin`]):
538
+ A scheduler to be used in combination with `model` to denoise the diffused latents, typically
540
+ the `FlowMatchDiscreteScheduler` defined in this module.
540
+ """
541
+
542
+ model_cpu_offload_seq = ""
543
+ _optional_components = []
544
+ _exclude_from_cpu_offload = []
545
+ _callback_tensor_inputs = ["latents"]
546
+
547
+ def __init__(
548
+ self,
549
+ model,
550
+ scheduler: SchedulerMixin,
551
+ vae,
552
+ progress_bar_config: Dict[str, Any] = None,
553
+ ):
554
+ super().__init__()
555
+
556
+ # ==========================================================================================
557
+ if progress_bar_config is None:
558
+ progress_bar_config = {}
559
+ if not hasattr(self, '_progress_bar_config'):
560
+ self._progress_bar_config = {}
561
+ self._progress_bar_config.update(progress_bar_config)
562
+ # ==========================================================================================
563
+
564
+ self.register_modules(
565
+ model=model,
566
+ scheduler=scheduler,
567
+ vae=vae,
568
+ )
569
+
570
+ # `latent_scale_factor` may be None, an int, or a tuple/list with one entry per spatial
571
+ # dimension of the latents; if None, it is treated as a factor of 1 for every dimension.
572
+ self.latent_scale_factor = self.model.config.vae_downsample_factor
573
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.latent_scale_factor)
574
+
575
+ # Classifier-free guidance operator used to combine conditional and unconditional predictions
576
+ self.cfg_operator = ClassifierFreeGuidance()
577
+
578
+ @staticmethod
579
+ def denormalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
580
+ """
581
+ Denormalize an image array to [0,1].
582
+ """
583
+ return (images / 2 + 0.5).clamp(0, 1)
584
+
585
+ @staticmethod
586
+ def pt_to_numpy(images: torch.Tensor) -> np.ndarray:
587
+ """
588
+ Convert a PyTorch tensor to a NumPy image.
589
+ """
590
+ images = images.cpu().permute(0, 2, 3, 1).float().numpy()
591
+ return images
592
+
593
+ @staticmethod
594
+ def numpy_to_pil(images: np.ndarray):
595
+ """
596
+ Convert a numpy image or a batch of images to a PIL image.
597
+ """
598
+ if images.ndim == 3:
599
+ images = images[None, ...]
600
+ images = (images * 255).round().astype("uint8")
601
+ if images.shape[-1] == 1:
602
+ # special case for grayscale (single channel) images
603
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
604
+ else:
605
+ pil_images = [Image.fromarray(image) for image in images]
606
+
607
+ return pil_images
608
+
609
+ def prepare_extra_func_kwargs(self, func, kwargs):
610
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
611
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
612
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
613
+ # and should be between [0, 1]
614
+ extra_kwargs = {}
615
+
616
+ for k, v in kwargs.items():
617
+ accepts = k in set(inspect.signature(func).parameters.keys())
618
+ if accepts:
619
+ extra_kwargs[k] = v
620
+ return extra_kwargs
621
+
622
+ def prepare_latents(self, batch_size, latent_channel, image_size, dtype, device, generator, latents=None):
623
+ if self.latent_scale_factor is None:
624
+ latent_scale_factor = (1,) * len(image_size)
625
+ elif isinstance(self.latent_scale_factor, int):
626
+ latent_scale_factor = (self.latent_scale_factor,) * len(image_size)
627
+ elif isinstance(self.latent_scale_factor, tuple) or isinstance(self.latent_scale_factor, list):
628
+ assert len(self.latent_scale_factor) == len(image_size), \
629
+ "len(latent_scale_factor) shoudl be the same as len(image_size)"
630
+ latent_scale_factor = self.latent_scale_factor
631
+ else:
632
+ raise ValueError(
633
+ f"latent_scale_factor should be either None, int, tuple of int, or list of int, "
634
+ f"but got {self.latent_scale_factor}"
635
+ )
636
+
637
+ latents_shape = (
638
+ batch_size,
639
+ latent_channel,
640
+ *[int(s) // f for s, f in zip(image_size, latent_scale_factor)],
641
+ )
642
+ if isinstance(generator, list) and len(generator) != batch_size:
643
+ raise ValueError(
644
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
645
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
646
+ )
647
+
648
+ if latents is None:
649
+ latents = randn_tensor(latents_shape, generator=generator, device=device, dtype=dtype)
650
+ else:
651
+ latents = latents.to(device)
652
+
653
+ # Check existence to make it compatible with FlowMatchEulerDiscreteScheduler
654
+ if hasattr(self.scheduler, "init_noise_sigma"):
655
+ # scale the initial noise by the standard deviation required by the scheduler
656
+ latents = latents * self.scheduler.init_noise_sigma
657
+
658
+ return latents
659
+
660
+ @property
661
+ def guidance_scale(self):
662
+ return self._guidance_scale
663
+
664
+ @property
665
+ def guidance_rescale(self):
666
+ return self._guidance_rescale
667
+
668
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
669
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
670
+ # corresponds to doing no classifier free guidance.
671
+ @property
672
+ def do_classifier_free_guidance(self):
673
+ return self._guidance_scale > 1.0
674
+
675
+ @property
676
+ def num_timesteps(self):
677
+ return self._num_timesteps
678
+
679
+ def set_scheduler(self, new_scheduler):
680
+ self.register_modules(scheduler=new_scheduler)
681
+
682
+ @torch.no_grad()
683
+ def __call__(
684
+ self,
685
+ batch_size: int,
686
+ image_size: List[int],
687
+ num_inference_steps: int = 50,
688
+ timesteps: List[int] = None,
689
+ sigmas: List[float] = None,
690
+ guidance_scale: float = 7.5,
691
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
692
+ latents: Optional[torch.Tensor] = None,
693
+ output_type: Optional[str] = "pil",
694
+ return_dict: bool = True,
695
+ guidance_rescale: float = 0.0,
696
+ callback_on_step_end: Optional[
697
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
698
+ ] = None,
699
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
700
+ model_kwargs: Dict[str, Any] = None,
701
+ **kwargs,
702
+ ):
703
+ r"""
704
+ The call function to the pipeline for generation.
705
+
706
+ Args:
707
+ batch_size (`int`):
708
+ The number of samples to generate.
709
+ image_size (`Tuple[int]` or `List[int]`):
710
+ The size (height, width) of the generated image.
711
+ num_inference_steps (`int`, *optional*, defaults to 50):
712
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
713
+ expense of slower inference.
714
+ timesteps (`List[int]`, *optional*):
715
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
716
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
717
+ passed will be used. Must be in descending order.
718
+ sigmas (`List[float]`, *optional*):
719
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
720
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
721
+ will be used.
722
+ guidance_scale (`float`, *optional*, defaults to 7.5):
723
+ A higher guidance scale value encourages the model to generate samples closely linked to the
724
+ `condition` at the expense of lower sample quality. Guidance scale is enabled when `guidance_scale > 1`.
725
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
726
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
727
+ generation deterministic.
728
+ latents (`torch.Tensor`, *optional*):
729
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for sample
730
+ generation. Can be used to tweak the same generation with different conditions. If not provided,
731
+ a latents tensor is generated by sampling using the supplied random `generator`.
732
+ output_type (`str`, *optional*, defaults to `"pil"`):
733
+ The output format of the generated sample.
734
+ return_dict (`bool`, *optional*, defaults to `True`):
735
+ Whether or not to return a [`HunyuanImage3Text2ImagePipelineOutput`] instead of a
736
+ plain tuple.
737
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
738
+ Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
739
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
740
+ using zero terminal SNR.
741
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
742
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
743
+ each denoising step during inference, with the following arguments: `callback_on_step_end(self:
744
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
745
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
746
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
747
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
748
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
749
+ `._callback_tensor_inputs` attribute of your pipeline class.
750
+
751
+ Examples:
752
+
753
+ Returns:
754
+ [`HunyuanImage3Text2ImagePipelineOutput`] or `tuple`:
755
+ If `return_dict` is `True`, [`HunyuanImage3Text2ImagePipelineOutput`] is returned,
756
+ otherwise a `tuple` is returned where the first element is a list with the generated samples.
757
+ """
758
+
759
+ callback_steps = kwargs.pop("callback_steps", None)
760
+ pbar_steps = kwargs.pop("pbar_steps", None)
761
+
762
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
763
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
764
+
765
+ self._guidance_scale = guidance_scale
766
+ self._guidance_rescale = guidance_rescale
767
+
768
+ cfg_factor = 1 + self.do_classifier_free_guidance
769
+
770
+ # Define call parameters
771
+ device = self._execution_device
772
+
773
+ # Prepare timesteps
774
+ timesteps, num_inference_steps = retrieve_timesteps(
775
+ self.scheduler, num_inference_steps, device, timesteps, sigmas,
776
+ )
777
+
778
+ # Prepare latent variables
779
+ latents = self.prepare_latents(
780
+ batch_size=batch_size,
781
+ latent_channel=self.model.config.vae["latent_channels"],
782
+ image_size=image_size,
783
+ dtype=torch.bfloat16,
784
+ device=device,
785
+ generator=generator,
786
+ latents=latents,
787
+ )
788
+
789
+ # Prepare extra step kwargs.
790
+ _scheduler_step_extra_kwargs = self.prepare_extra_func_kwargs(
791
+ self.scheduler.step, {"generator": generator}
792
+ )
793
+
794
+ # Prepare model kwargs
795
+ input_ids = model_kwargs.pop("input_ids")
796
+ attention_mask = self.model._prepare_attention_mask_for_generation( # noqa
797
+ input_ids, self.model.generation_config, model_kwargs=model_kwargs,
798
+ )
799
+ model_kwargs["attention_mask"] = attention_mask.to(latents.device)
800
+
801
+ # Sampling loop
802
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
803
+ self._num_timesteps = len(timesteps)
804
+
805
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
806
+ for i, t in enumerate(timesteps):
807
+ # expand the latents if we are doing classifier free guidance
808
+ latent_model_input = torch.cat([latents] * cfg_factor)
809
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
810
+
811
+ t_expand = t.repeat(latent_model_input.shape[0])
812
+
813
+ model_inputs = self.model.prepare_inputs_for_generation(
814
+ input_ids,
815
+ images=latent_model_input,
816
+ timestep=t_expand,
817
+ **model_kwargs,
818
+ )
819
+
820
+ with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
821
+ model_output = self.model(**model_inputs, first_step=(i == 0))
822
+ pred = model_output["diffusion_prediction"]
823
+ pred = pred.to(dtype=torch.float32)
824
+
825
+ # perform guidance
826
+ if self.do_classifier_free_guidance:
827
+ pred_cond, pred_uncond = pred.chunk(2)
828
+ pred = self.cfg_operator(pred_cond, pred_uncond, self.guidance_scale, step=i)
829
+
830
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
831
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
832
+ pred = rescale_noise_cfg(pred, pred_cond, guidance_rescale=self.guidance_rescale)
833
+
834
+ # compute the previous noisy sample x_t -> x_t-1
835
+ latents = self.scheduler.step(pred, t, latents, **_scheduler_step_extra_kwargs, return_dict=False)[0]
836
+
837
+ if i != len(timesteps) - 1:
838
+ model_kwargs = self.model._update_model_kwargs_for_generation( # noqa
839
+ model_output,
840
+ model_kwargs,
841
+ )
842
+ if input_ids.shape[1] != model_kwargs["position_ids"].shape[1]:
843
+ input_ids = torch.gather(input_ids, 1, index=model_kwargs["position_ids"])
844
+
845
+ if callback_on_step_end is not None:
846
+ callback_kwargs = {}
847
+ for k in callback_on_step_end_tensor_inputs:
848
+ callback_kwargs[k] = locals()[k]
849
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
850
+
851
+ latents = callback_outputs.pop("latents", latents)
852
+
853
+ # call the callback, if provided
854
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
855
+ progress_bar.update()
856
+
857
+ if hasattr(self.vae.config, 'scaling_factor') and self.vae.config.scaling_factor:
858
+ latents = latents / self.vae.config.scaling_factor
859
+ if hasattr(self.vae.config, 'shift_factor') and self.vae.config.shift_factor:
860
+ latents = latents + self.vae.config.shift_factor
861
+
862
+ if hasattr(self.vae, "ffactor_temporal"):
863
+ latents = latents.unsqueeze(2)
864
+
865
+ with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=True):
866
+ image = self.vae.decode(latents, return_dict=False, generator=generator)[0]
867
+
868
+ # b c t h w
869
+ if hasattr(self.vae, "ffactor_temporal"):
870
+ assert image.shape[2] == 1, "image should have shape [B, C, T, H, W] and T should be 1"
871
+ image = image.squeeze(2)
872
+
873
+ do_denormalize = [True] * image.shape[0]
874
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
875
+
876
+ if not return_dict:
877
+ return (image,)
878
+
879
+ return HunyuanImage3Text2ImagePipelineOutput(samples=image)
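# [Editor's sketch] Rough end-to-end use of the pipeline, not part of the original commit. It
# assumes `model`, `vae`, and tokenized `model_kwargs` (containing at least "input_ids") have
# already been prepared by the surrounding HunyuanImage-3.0 code; the scheduler shift value and
# image size below are illustrative.
def _example_pipeline_call(model, vae, model_kwargs):
    pipeline = HunyuanImage3Text2ImagePipeline(
        model=model,
        scheduler=FlowMatchDiscreteScheduler(shift=3.0),
        vae=vae,
    )
    output = pipeline(
        batch_size=1,
        image_size=(1024, 1024),
        num_inference_steps=50,
        guidance_scale=7.5,
        model_kwargs=model_kwargs,
    )
    return output.samples[0]  # a PIL.Image when output_type="pil" (the default)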
image_processor.py ADDED
@@ -0,0 +1,125 @@
1
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ # ==============================================================================
13
+
14
+ from typing import Tuple
15
+
16
+ from PIL import Image
17
+ from torchvision import transforms
18
+ from transformers import Siglip2ImageProcessorFast
19
+
20
+ from .tokenizer_wrapper import ImageInfo, JointImageInfo, ResolutionGroup
21
+
22
+
23
+ def resize_and_crop(image: Image.Image, target_size: Tuple[int, int]) -> Image.Image:
24
+ tw, th = target_size
25
+ w, h = image.size
26
+
27
+ tr = th / tw
28
+ r = h / w
29
+
30
+ # resize
31
+ if r < tr:
32
+ resize_height = th
33
+ resize_width = int(round(th / h * w))
34
+ else:
35
+ resize_width = tw
36
+ resize_height = int(round(tw / w * h))
37
+
38
+ image = image.resize((resize_width, resize_height), resample=Image.Resampling.LANCZOS)
39
+
40
+ # center crop
41
+ crop_top = int(round((resize_height - th) / 2.0))
42
+ crop_left = int(round((resize_width - tw) / 2.0))
43
+
44
+ image = image.crop((crop_left, crop_top, crop_left + tw, crop_top + th))
45
+ return image
46
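# [Editor's sketch] Minimal illustration of `resize_and_crop`, not part of the original commit:
# the image is scaled to cover the target aspect ratio, then center-cropped to exactly (tw, th).
def _example_resize_and_crop():
    src = Image.new("RGB", (1920, 1080), color="white")
    out = resize_and_crop(src, (1024, 1024))
    assert out.size == (1024, 1024)
    return out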
+
47
+
48
+ class HunyuanImage3ImageProcessor(object):
49
+ def __init__(self, config):
50
+ self.config = config
51
+
52
+ self.reso_group = ResolutionGroup(base_size=config.image_base_size)
53
+ self.vae_processor = transforms.Compose([
54
+ transforms.ToTensor(),
55
+ transforms.Normalize([0.5], [0.5]), # transform to [-1, 1]
56
+ ])
57
+ self.vision_encoder_processor = Siglip2ImageProcessorFast.from_dict(config.vit_processor)
58
+
59
+ def build_image_info(self, image_size):
60
+ # parse image size (HxW, H:W, or <img_ratio_i>)
61
+ if isinstance(image_size, str):
62
+ if image_size.startswith("<img_ratio_"):
63
+ ratio_index = int(image_size.split("_")[-1].rstrip(">"))
64
+ reso = self.reso_group[ratio_index]
65
+ image_size = reso.height, reso.width
66
+ elif 'x' in image_size:
67
+ image_size = [int(s) for s in image_size.split('x')]
68
+ elif ':' in image_size:
69
+ image_size = [int(s) for s in image_size.split(':')]
70
+ else:
71
+ raise ValueError(
72
+ f"`image_size` should be in the format of 'HxW', 'H:W' or <img_ratio_i>, got {image_size}.")
73
+ assert len(image_size) == 2, f"`image_size` should be in the format of 'HxW', got {image_size}."
74
+ elif isinstance(image_size, (list, tuple)):
75
+ assert len(image_size) == 2 and all(isinstance(s, int) for s in image_size), \
76
+ f"`image_size` should be a tuple of two integers or a string in the format of 'HxW', got {image_size}."
77
+ else:
78
+ raise ValueError(f"`image_size` should be a tuple of two integers or a string in the format of 'WxH', "
79
+ f"got {image_size}.")
80
+ image_width, image_height = self.reso_group.get_target_size(image_size[1], image_size[0])
81
+ token_height = image_height // (self.config.vae_downsample_factor[0] * self.config.patch_size)
82
+ token_width = image_width // (self.config.vae_downsample_factor[1] * self.config.patch_size)
83
+ base_size, ratio_idx = self.reso_group.get_base_size_and_ratio_index(image_size[1], image_size[0])
84
+ image_info = ImageInfo(
85
+ image_type="gen_image", image_width=image_width, image_height=image_height,
86
+ token_width=token_width, token_height=token_height, base_size=base_size, ratio_index=ratio_idx,
87
+ )
88
+ return image_info
89
+
90
+ def preprocess(self, image: Image.Image):
91
+ # ==== VAE processor ====
92
+ image_width, image_height = self.reso_group.get_target_size(image.width, image.height)
93
+ resized_image = resize_and_crop(image, (image_width, image_height))
94
+ image_tensor = self.vae_processor(resized_image)
95
+ token_height = image_height // (self.config.vae_downsample_factor[0] * self.config.patch_size)
96
+ token_width = image_width // (self.config.vae_downsample_factor[1] * self.config.patch_size)
97
+ base_size, ratio_index = self.reso_group.get_base_size_and_ratio_index(width=image_width, height=image_height)
98
+ vae_image_info = ImageInfo(
99
+ image_type="vae",
100
+ image_tensor=image_tensor.unsqueeze(0), # include batch dim
101
+ image_width=image_width, image_height=image_height,
102
+ token_width=token_width, token_height=token_height,
103
+ base_size=base_size, ratio_index=ratio_index,
104
+ )
105
+
106
+ # ==== ViT processor ====
107
+ inputs = self.vision_encoder_processor(image)
108
+ image = inputs["pixel_values"].squeeze(0) # seq_len x dim
109
+ pixel_attention_mask = inputs["pixel_attention_mask"].squeeze(0) # seq_len
110
+ spatial_shapes = inputs["spatial_shapes"].squeeze(0) # 2 (h, w)
111
+ vision_encoder_kwargs = dict(
112
+ pixel_attention_mask=pixel_attention_mask,
113
+ spatial_shapes=spatial_shapes,
114
+ )
115
+ vision_image_info = ImageInfo(
116
+ image_type="vit",
117
+ image_tensor=image.unsqueeze(0), # 1 x seq_len x dim
118
+ image_width=spatial_shapes[1].item() * self.config.vit_processor["patch_size"],
119
+ image_height=spatial_shapes[0].item() * self.config.vit_processor["patch_size"],
120
+ token_width=spatial_shapes[1].item(),
121
+ token_height=spatial_shapes[0].item(),
122
+ image_token_length=self.config.vit_processor["max_num_patches"],
123
+ # may not be equal to token_width * token_height
124
+ )
125
+ return JointImageInfo(vae_image_info, vision_image_info, vision_encoder_kwargs)
model-0001-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dad22fa5e99dcda532c242aa4d4875f9ea6fd8b2ed59e39776dec4ea55baf4e5
3
+ size 5363066616
model-0002-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9987e8220f81b70d07b62f06ac6c92bb0faf38ccb0ddd3f30b65ed895ad4a2fb
3
+ size 5318937248
model-0003-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79f8d4d1b23562299da3360ac7e2437a4dd24be30b86bc8db580521b5f9b2616
3
+ size 5344627472
model-0004-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4faf1357831b25b9f9637594312e9024ee0fa1e87c734e20afdde2845fdaa516
3
+ size 5327343192
model-0005-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46189f8777c117c431e46cc57ec2328fe72050452119ac7bb676bdaca3f76575
3
+ size 5344103080
model-0006-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f9d5f386b7c2d0b171bd8a25f3f08e3150936fde2dfd92e9aa1f6e27dbf2e0d
3
+ size 5318937248
model-0007-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d30616044acead06484eacace50a4cab66267feb13555f235bac63d2540cf471
3
+ size 5344103088
model-0008-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:740ccbff8fa1dbb2847fe8c342654f7d24fa81f058065e82dfbccb89ce2743c1
3
+ size 5318937256
model-0009-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5fc3df50de8591735d29f7acfece39b64b3735cccef176eb4a137f4ede68430
3
+ size 5344103088
model-0010-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f6058eb7527741d18c17131cb7810f11d8bd4c69cce10962e093e684413cd2a
3
+ size 5318937304
model-0011-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c38d5fd2f18191d849b444e873ff91d3f048d8c4bcd71b3035ff0f7973ac273
3
+ size 5344103232
model-0012-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:688a6a818f6d164d345e3bb37c4f3fcee40cc7d458027d2a37f7486463843ec3
3
+ size 5318937400
model-0013-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f77757aa32fa67f75f8f8ec5bc831d358093483c2a8692bff7477378aea00f28
3
+ size 5344103232
model-0014-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3308c079c20008e1ac8852cfb986764064077278754492f2fd9ec893857b6489
3
+ size 5318937400
model-0015-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e32b467eb49473c7f42696db0916ca3275c01984c48a10433d78be4d351b7ff8
3
+ size 5344103232
model-0016-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b97d98195a45518bae971bc43c224225b60e1fbb8b2eb93115024d2bdf328dca
3
+ size 5318937400
model-0017-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f00339bad7371e59f2d3642fd0575abafa92fc4509803f8fe5a64492185d2ab
3
+ size 5344103224
model-0018-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b48a59d090d396aa9801765485381f8255d442c2da2d9e98f1c21a68c6b83b1
3
+ size 5327859080
model-0019-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd4e5a082f3db3b61774ce86675cfb171f33319fd3dd8f942cd952633834d334
3
+ size 5344111888
model-0020-of-0032.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f27fc2c0eedfc6b99ebe07e244c9689e89fa06dc65216d9c07aa6067783f86b5
3
+ size 5318937392