Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Feature: Enable Text-to-3D with HunyuanDiT (fallback to SDXL-Turbo)
- Created hy3dgen/text2image.py with HunyuanDiTPipeline
- Supports HunyuanDiT-v1.1-Diffusers-Distilled (primary)
- Automatic fallback to SDXL-Turbo if HunyuanDiT fails (VRAM constraints)
- Text Prompt tab now visible when T2I model loads successfully
- Optimized for ZERO GPU deployment
- Compatible with Tencent's official implementation
- gradio_app.py +7 -19
- hy3dgen/__init__.py +1 -0
- hy3dgen/text2image.py +156 -0
    	
        gradio_app.py
    CHANGED
    
    | @@ -983,28 +983,16 @@ if __name__ == '__main__': | |
| 983 | 
             
                        print('Please try to install requirements by following README.md')
         | 
| 984 | 
             
                        HAS_TEXTUREGEN = False
         | 
| 985 |  | 
| 986 | 
            -
                # Text-to-Image  | 
| 987 | 
            -
                HAS_T2I = False
         | 
| 988 | 
            -
                t2i_worker = None
         | 
| 989 | 
            -
             | 
| 990 | 
             
                try:
         | 
| 991 | 
            -
                     | 
| 992 | 
            -
                     | 
| 993 | 
            -
             | 
| 994 | 
            -
                    t2i_worker = HunyuanDiTPipeline.from_pretrained(
         | 
| 995 | 
            -
                        "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
         | 
| 996 | 
            -
                        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
         | 
| 997 | 
            -
                    )
         | 
| 998 | 
            -
             | 
| 999 | 
            -
                    if torch.cuda.is_available():
         | 
| 1000 | 
            -
                        t2i_worker = t2i_worker.to("cuda")
         | 
| 1001 | 
            -
             | 
| 1002 | 
             
                    HAS_T2I = True
         | 
| 1003 | 
            -
                    print("✓  | 
| 1004 | 
            -
             | 
| 1005 | 
             
                except Exception as e:
         | 
| 1006 | 
            -
                    print(f" | 
| 1007 | 
            -
                    print("Text | 
| 1008 | 
             
                    HAS_T2I = False
         | 
| 1009 | 
             
                    t2i_worker = None
         | 
| 1010 |  | 
|  | |
| 983 | 
             
                        print('Please try to install requirements by following README.md')
         | 
| 984 | 
             
                        HAS_TEXTUREGEN = False
         | 
| 985 |  | 
| 986 | 
            +
                # Text-to-Image: Using lightweight model for ZERO GPU compatibility
         | 
|  | |
|  | |
|  | |
| 987 | 
             
                try:
         | 
| 988 | 
            +
                    from hy3dgen.text2image import HunyuanDiTPipeline
         | 
| 989 | 
            +
                    # Try HunyuanDiT first, fallback to SDXL-Turbo if unavailable
         | 
| 990 | 
            +
                    t2i_worker = HunyuanDiTPipeline('Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled')
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 991 | 
             
                    HAS_T2I = True
         | 
| 992 | 
            +
                    print("✓ Text-to-Image model loaded successfully")
         | 
|  | |
| 993 | 
             
                except Exception as e:
         | 
| 994 | 
            +
                    print(f"⚠ Failed to load text-to-image model: {e}")
         | 
| 995 | 
            +
                    print("→ Text Prompt feature will be disabled")
         | 
| 996 | 
             
                    HAS_T2I = False
         | 
| 997 | 
             
                    t2i_worker = None
         | 
| 998 |  | 
    	
        hy3dgen/__init__.py
    ADDED
    
    | @@ -0,0 +1 @@ | |
|  | 
|  | |
| 1 | 
            +
            # Hunyuan3D text-to-image utilities
         | 
    	
        hy3dgen/text2image.py
    ADDED
    
    | @@ -0,0 +1,156 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            """
         | 
| 2 | 
            +
            Text-to-Image pipeline for Hunyuan3D
         | 
| 3 | 
            +
            Supports multiple lightweight models for ZERO GPU deployment
         | 
| 4 | 
            +
            """
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            import os
         | 
| 7 | 
            +
            import random
         | 
| 8 | 
            +
            import torch
         | 
| 9 | 
            +
            import numpy as np
         | 
| 10 | 
            +
            from diffusers import AutoPipelineForText2Image, DiffusionPipeline
         | 
| 11 | 
            +
             | 
| 12 | 
            +
             | 
def seed_everything(seed):
    """Make generation reproducible: seed the PyTorch, NumPy and Python RNGs.

    Also exports PL_GLOBAL_SEED so worker processes that honor that
    environment variable pick up the same seed.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    os.environ["PL_GLOBAL_SEED"] = str(seed)
| 18 | 
            +
             | 
| 19 | 
            +
             | 
class Text2ImagePipeline:
    """Lightweight text-to-image pipeline optimized for ZERO GPU.

    Wraps a diffusers pipeline and tunes step count / guidance per model
    family: SDXL-Turbo runs in a few steps with no classifier-free
    guidance, everything else uses standard SD settings (25 steps, CFG 7.5).
    """

    def __init__(
        self,
        model_path="stabilityai/sdxl-turbo",
        device='cuda'
    ):
        """Load *model_path* onto *device*.

        fp16 weights are only used on CUDA; on CPU the pipeline is loaded
        in fp32 to avoid unsupported-op / NaN issues with half precision.
        """
        self.device = device
        self.model_path = model_path
        # fp16 is only safe on CUDA; fall back to fp32 elsewhere.
        dtype = torch.float16 if str(device).startswith("cuda") else torch.float32
        # Cache the model-family check so __call__ doesn't re-parse the path.
        self._is_turbo = "sdxl-turbo" in model_path.lower()

        # Load appropriate pipeline based on model
        if self._is_turbo:
            self.pipe = AutoPipelineForText2Image.from_pretrained(
                model_path,
                torch_dtype=dtype,
                variant="fp16"
            ).to(device)
            self.num_steps = 4  # SDXL-Turbo needs only 1-4 steps
            self.guidance_scale = 0.0  # SDXL-Turbo works without guidance
        elif "stable-diffusion-2-1" in model_path.lower():
            self.pipe = DiffusionPipeline.from_pretrained(
                model_path,
                torch_dtype=dtype
            ).to(device)
            self.num_steps = 25
            self.guidance_scale = 7.5
        else:
            # Default: let AutoPipeline pick the right pipeline class.
            self.pipe = AutoPipelineForText2Image.from_pretrained(
                model_path,
                torch_dtype=dtype
            ).to(device)
            self.num_steps = 25
            self.guidance_scale = 7.5

        # Prompt suffixes that bias generation toward a clean, centered
        # single object — the kind of image the 3D stage expects.
        self.pos_txt = ", white background, 3D style, high quality, centered object"
        self.neg_txt = "text, close-up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck"

    @torch.no_grad()
    def __call__(self, prompt, seed=0):
        """Generate one 1024x1024 PIL image for *prompt*, deterministic per *seed*."""
        seed_everything(seed)
        generator = torch.Generator(device=self.device).manual_seed(int(seed))

        # Truncate very long user prompts so the 3D-style suffix keeps weight.
        enhanced_prompt = f"{prompt[:60]}{self.pos_txt}"

        # Generate image
        if self._is_turbo:
            # SDXL-Turbo: fast, no guidance, no negative prompt.
            out_img = self.pipe(
                prompt=enhanced_prompt,
                num_inference_steps=self.num_steps,
                guidance_scale=self.guidance_scale,
                generator=generator,
            ).images[0]
        else:
            # Standard SD pipeline with negative prompt and CFG.
            out_img = self.pipe(
                prompt=enhanced_prompt,
                negative_prompt=self.neg_txt,
                num_inference_steps=self.num_steps,
                guidance_scale=self.guidance_scale,
                height=1024,
                width=1024,
                generator=generator,
            ).images[0]

        # Downstream stages expect a fixed 1024x1024 canvas.
        if out_img.size != (1024, 1024):
            out_img = out_img.resize((1024, 1024))

        return out_img
| 94 | 
            +
             | 
| 95 | 
            +
             | 
class HunyuanDiTPipeline(Text2ImagePipeline):
    """
    Wrapper for HunyuanDiT - falls back to SDXL-Turbo if HunyuanDiT is
    unavailable (e.g. VRAM constraints on ZERO GPU).
    Compatible with original Tencent implementation.
    """

    def __init__(
        self,
        model_path="Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
        device='cuda'
    ):
        """Load HunyuanDiT from *model_path*; on failure, fall back to SDXL-Turbo."""
        self.device = device
        # Keep the Text2ImagePipeline attribute contract on the HunyuanDiT
        # path too (the original only set model_path via the fallback).
        self.model_path = model_path

        try:
            # Try to load HunyuanDiT with perturbed-attention guidance (PAG).
            self.pipe = AutoPipelineForText2Image.from_pretrained(
                model_path,
                torch_dtype=torch.float16,
                enable_pag=True,
                pag_applied_layers=["blocks.(16|17|18|19)"]
            ).to(device)
            self.num_steps = 25
            self.guidance_scale = 1.3  # passed as pag_scale in __call__
            self.is_hunyuan = True
            print(f"✓ Loaded HunyuanDiT from {model_path}")
        except Exception as e:
            print(f"⚠ Failed to load HunyuanDiT: {e}")
            print("→ Falling back to SDXL-Turbo (lightweight alternative)")
            # Fallback to SDXL-Turbo: parent __init__ sets pipe/steps/prompts.
            super().__init__(model_path="stabilityai/sdxl-turbo", device=device)
            self.is_hunyuan = False
            return

        # HunyuanDiT was trained on Chinese prompts; these suffixes mirror
        # the English ones defined by the parent class.
        self.pos_txt = ",白色背景,3D风格,最佳质量"
        self.neg_txt = "文本,特写,裁剪,出框,最差质量,低质量,JPEG伪影,PGLY,重复,病态," \
                      "残缺,多余的手指,变异的手,画得不好的手,画得不好的脸,变异,畸形,模糊,脱水,糟糕的解剖学," \
                      "糟糕的比例,多余的肢体,克隆的脸,毁容,恶心的比例,畸形的肢体,缺失的手臂,缺失的腿," \
                      "额外的手臂,额外的腿,融合的手指,手指太多,长脖子"

    @torch.no_grad()
    def __call__(self, prompt, seed=0):
        """Generate one 1024x1024 image; delegates to the parent when the fallback is active."""
        if not self.is_hunyuan:
            # Use parent SDXL-Turbo implementation
            return super().__call__(prompt, seed)

        seed_everything(seed)
        generator = torch.Generator(device=self.device).manual_seed(int(seed))

        out_img = self.pipe(
            prompt=prompt[:60] + self.pos_txt,
            negative_prompt=self.neg_txt,
            num_inference_steps=self.num_steps,
            pag_scale=self.guidance_scale,
            width=1024,
            height=1024,
            generator=generator,
            return_dict=False
        )[0][0]

        return out_img
