Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Feature: Enable Text-to-3D with HunyuanDiT (fallback to SDXL-Turbo)
- Created hy3dgen/text2image.py with HunyuanDiTPipeline
- Supports HunyuanDiT-v1.1-Diffusers-Distilled (primary)
- Automatic fallback to SDXL-Turbo if HunyuanDiT fails (VRAM constraints)
- Text Prompt tab now visible when T2I model loads successfully
- Optimized for ZERO GPU deployment
- Compatible with Tencent's official implementation
- gradio_app.py +7 -19
- hy3dgen/__init__.py +1 -0
- hy3dgen/text2image.py +156 -0
    	
        gradio_app.py
    CHANGED
    
    | @@ -983,28 +983,16 @@ if __name__ == '__main__': | |
| 983 | 
             
                        print('Please try to install requirements by following README.md')
         | 
| 984 | 
             
                        HAS_TEXTUREGEN = False
         | 
| 985 |  | 
| 986 | 
            -
                # Text-to-Image  | 
| 987 | 
            -
                HAS_T2I = False
         | 
| 988 | 
            -
                t2i_worker = None
         | 
| 989 | 
            -
             | 
| 990 | 
             
                try:
         | 
| 991 | 
            -
                     | 
| 992 | 
            -
                     | 
| 993 | 
            -
             | 
| 994 | 
            -
                    t2i_worker = HunyuanDiTPipeline.from_pretrained(
         | 
| 995 | 
            -
                        "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
         | 
| 996 | 
            -
                        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
         | 
| 997 | 
            -
                    )
         | 
| 998 | 
            -
             | 
| 999 | 
            -
                    if torch.cuda.is_available():
         | 
| 1000 | 
            -
                        t2i_worker = t2i_worker.to("cuda")
         | 
| 1001 | 
            -
             | 
| 1002 | 
             
                    HAS_T2I = True
         | 
| 1003 | 
            -
                    print("✓  | 
| 1004 | 
            -
             | 
| 1005 | 
             
                except Exception as e:
         | 
| 1006 | 
            -
                    print(f" | 
| 1007 | 
            -
                    print("Text | 
| 1008 | 
             
                    HAS_T2I = False
         | 
| 1009 | 
             
                    t2i_worker = None
         | 
| 1010 |  | 
|  | |
| 983 | 
             
                        print('Please try to install requirements by following README.md')
         | 
| 984 | 
             
                        HAS_TEXTUREGEN = False
         | 
| 985 |  | 
| 986 | 
            +
                # Text-to-Image: Using lightweight model for ZERO GPU compatibility
         | 
|  | |
|  | |
|  | |
| 987 | 
             
                try:
         | 
| 988 | 
            +
                    from hy3dgen.text2image import HunyuanDiTPipeline
         | 
| 989 | 
            +
                    # Try HunyuanDiT first, fallback to SDXL-Turbo if unavailable
         | 
| 990 | 
            +
                    t2i_worker = HunyuanDiTPipeline('Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled')
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 991 | 
             
                    HAS_T2I = True
         | 
| 992 | 
            +
                    print("✓ Text-to-Image model loaded successfully")
         | 
|  | |
| 993 | 
             
                except Exception as e:
         | 
| 994 | 
            +
                    print(f"⚠ Failed to load text-to-image model: {e}")
         | 
| 995 | 
            +
                    print("→ Text Prompt feature will be disabled")
         | 
| 996 | 
             
                    HAS_T2I = False
         | 
| 997 | 
             
                    t2i_worker = None
         | 
| 998 |  | 
    	
        hy3dgen/__init__.py
    ADDED
    
    | @@ -0,0 +1 @@ | |
|  | 
|  | |
| 1 | 
            +
            # Hunyuan3D text-to-image utilities
         | 
    	
        hy3dgen/text2image.py
    ADDED
    
    | @@ -0,0 +1,156 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            """
         | 
| 2 | 
            +
            Text-to-Image pipeline for Hunyuan3D
         | 
| 3 | 
            +
            Supports multiple lightweight models for ZERO GPU deployment
         | 
| 4 | 
            +
            """
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            import os
         | 
| 7 | 
            +
            import random
         | 
| 8 | 
            +
            import torch
         | 
| 9 | 
            +
            import numpy as np
         | 
| 10 | 
            +
            from diffusers import AutoPipelineForText2Image, DiffusionPipeline
         | 
| 11 | 
            +
             | 
| 12 | 
            +
             | 
def seed_everything(seed):
    """Make generation reproducible: seed the PyTorch, NumPy and Python RNGs.

    Also exports PL_GLOBAL_SEED so worker processes that honor that
    environment variable pick up the same seed.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    os.environ["PL_GLOBAL_SEED"] = str(seed)
| 18 | 
            +
             | 
| 19 | 
            +
             | 
class Text2ImagePipeline:
    """Lightweight text-to-image pipeline optimized for ZERO GPU.

    Wraps a diffusers pipeline and tunes step count / guidance per model
    family: SDXL-Turbo runs in a few steps with no classifier-free
    guidance, everything else uses standard SD settings (25 steps, CFG 7.5).
    """

    def __init__(
        self,
        model_path="stabilityai/sdxl-turbo",
        device='cuda'
    ):
        """Load *model_path* onto *device*.

        fp16 weights are only used on CUDA; on CPU the pipeline is loaded
        in fp32 to avoid unsupported-op / NaN issues with half precision.
        """
        self.device = device
        self.model_path = model_path
        # fp16 is only safe on CUDA; fall back to fp32 elsewhere.
        dtype = torch.float16 if str(device).startswith("cuda") else torch.float32
        # Cache the model-family check so __call__ doesn't re-parse the path.
        self._is_turbo = "sdxl-turbo" in model_path.lower()

        # Load appropriate pipeline based on model
        if self._is_turbo:
            self.pipe = AutoPipelineForText2Image.from_pretrained(
                model_path,
                torch_dtype=dtype,
                variant="fp16"
            ).to(device)
            self.num_steps = 4  # SDXL-Turbo needs only 1-4 steps
            self.guidance_scale = 0.0  # SDXL-Turbo works without guidance
        elif "stable-diffusion-2-1" in model_path.lower():
            self.pipe = DiffusionPipeline.from_pretrained(
                model_path,
                torch_dtype=dtype
            ).to(device)
            self.num_steps = 25
            self.guidance_scale = 7.5
        else:
            # Default: let AutoPipeline pick the right pipeline class.
            self.pipe = AutoPipelineForText2Image.from_pretrained(
                model_path,
                torch_dtype=dtype
            ).to(device)
            self.num_steps = 25
            self.guidance_scale = 7.5

        # Prompt suffixes that bias generation toward a clean, centered
        # single object — the kind of image the 3D stage expects.
        self.pos_txt = ", white background, 3D style, high quality, centered object"
        self.neg_txt = "text, close-up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck"

    @torch.no_grad()
    def __call__(self, prompt, seed=0):
        """Generate one 1024x1024 PIL image for *prompt*, deterministic per *seed*."""
        seed_everything(seed)
        generator = torch.Generator(device=self.device).manual_seed(int(seed))

        # Truncate very long user prompts so the 3D-style suffix keeps weight.
        enhanced_prompt = f"{prompt[:60]}{self.pos_txt}"

        # Generate image
        if self._is_turbo:
            # SDXL-Turbo: fast, no guidance, no negative prompt.
            out_img = self.pipe(
                prompt=enhanced_prompt,
                num_inference_steps=self.num_steps,
                guidance_scale=self.guidance_scale,
                generator=generator,
            ).images[0]
        else:
            # Standard SD pipeline with negative prompt and CFG.
            out_img = self.pipe(
                prompt=enhanced_prompt,
                negative_prompt=self.neg_txt,
                num_inference_steps=self.num_steps,
                guidance_scale=self.guidance_scale,
                height=1024,
                width=1024,
                generator=generator,
            ).images[0]

        # Downstream stages expect a fixed 1024x1024 canvas.
        if out_img.size != (1024, 1024):
            out_img = out_img.resize((1024, 1024))

        return out_img
| 94 | 
            +
             | 
| 95 | 
            +
             | 
class HunyuanDiTPipeline(Text2ImagePipeline):
    """
    Wrapper for HunyuanDiT - falls back to SDXL-Turbo if HunyuanDiT is
    unavailable (e.g. VRAM constraints on ZERO GPU).
    Compatible with original Tencent implementation.
    """

    def __init__(
        self,
        model_path="Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
        device='cuda'
    ):
        """Load HunyuanDiT from *model_path*; on failure, fall back to SDXL-Turbo."""
        self.device = device
        # Keep the Text2ImagePipeline attribute contract on the HunyuanDiT
        # path too (the original only set model_path via the fallback).
        self.model_path = model_path

        try:
            # Try to load HunyuanDiT with perturbed-attention guidance (PAG).
            self.pipe = AutoPipelineForText2Image.from_pretrained(
                model_path,
                torch_dtype=torch.float16,
                enable_pag=True,
                pag_applied_layers=["blocks.(16|17|18|19)"]
            ).to(device)
            self.num_steps = 25
            self.guidance_scale = 1.3  # passed as pag_scale in __call__
            self.is_hunyuan = True
            print(f"✓ Loaded HunyuanDiT from {model_path}")
        except Exception as e:
            print(f"⚠ Failed to load HunyuanDiT: {e}")
            print("→ Falling back to SDXL-Turbo (lightweight alternative)")
            # Fallback to SDXL-Turbo: parent __init__ sets pipe/steps/prompts.
            super().__init__(model_path="stabilityai/sdxl-turbo", device=device)
            self.is_hunyuan = False
            return

        # HunyuanDiT was trained on Chinese prompts; these suffixes mirror
        # the English ones defined by the parent class.
        self.pos_txt = ",白色背景,3D风格,最佳质量"
        self.neg_txt = "文本,特写,裁剪,出框,最差质量,低质量,JPEG伪影,PGLY,重复,病态," \
                      "残缺,多余的手指,变异的手,画得不好的手,画得不好的脸,变异,畸形,模糊,脱水,糟糕的解剖学," \
                      "糟糕的比例,多余的肢体,克隆的脸,毁容,恶心的比例,畸形的肢体,缺失的手臂,缺失的腿," \
                      "额外的手臂,额外的腿,融合的手指,手指太多,长脖子"

    @torch.no_grad()
    def __call__(self, prompt, seed=0):
        """Generate one 1024x1024 image; delegates to the parent when the fallback is active."""
        if not self.is_hunyuan:
            # Use parent SDXL-Turbo implementation
            return super().__call__(prompt, seed)

        seed_everything(seed)
        generator = torch.Generator(device=self.device).manual_seed(int(seed))

        out_img = self.pipe(
            prompt=prompt[:60] + self.pos_txt,
            negative_prompt=self.neg_txt,
            num_inference_steps=self.num_steps,
            pag_scale=self.guidance_scale,
            width=1024,
            height=1024,
            generator=generator,
            return_dict=False
        )[0][0]

        return out_img
