minhho committed
Commit 1ac5292 · Parent(s): ddf83e0

Feature: Enable Text-to-3D with HunyuanDiT (fallback to SDXL-Turbo)


- Created hy3dgen/text2image.py with HunyuanDiTPipeline
- Supports HunyuanDiT-v1.1-Diffusers-Distilled (primary)
- Automatic fallback to SDXL-Turbo if HunyuanDiT fails (VRAM constraints)
- Text Prompt tab now visible when T2I model loads successfully
- Optimized for ZERO GPU deployment
- Compatible with Tencent's official implementation

Files changed (3)
  1. gradio_app.py +7 -19
  2. hy3dgen/__init__.py +1 -0
  3. hy3dgen/text2image.py +156 -0
gradio_app.py CHANGED
@@ -983,28 +983,16 @@ if __name__ == '__main__':
         print('Please try to install requirements by following README.md')
         HAS_TEXTUREGEN = False
 
-    # Text-to-Image setup - Always try to load HunyuanDiT
-    HAS_T2I = False
-    t2i_worker = None
-
+    # Text-to-Image: Using lightweight model for ZERO GPU compatibility
     try:
-        print("Loading HunyuanDiT for text-to-image generation...")
-        from diffusers import HunyuanDiTPipeline
-
-        t2i_worker = HunyuanDiTPipeline.from_pretrained(
-            "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        )
-
-        if torch.cuda.is_available():
-            t2i_worker = t2i_worker.to("cuda")
-
+        from hy3dgen.text2image import HunyuanDiTPipeline
+        # Try HunyuanDiT first, fallback to SDXL-Turbo if unavailable
+        t2i_worker = HunyuanDiTPipeline('Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled')
         HAS_T2I = True
-        print("✓ HunyuanDiT loaded successfully for Text-to-3D feature")
-
+        print("✓ Text-to-Image model loaded successfully")
     except Exception as e:
-        print(f" Warning: Failed to load HunyuanDiT: {e}")
-        print("Text-to-3D feature will be disabled")
+        print(f" Failed to load text-to-image model: {e}")
+        print("Text Prompt feature will be disabled")
         HAS_T2I = False
         t2i_worker = None
 
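The commit message notes that the Text Prompt tab becomes visible only when the text-to-image model loads. The tab wiring itself is not part of this diff, so the following is only a minimal sketch of how HAS_T2I and t2i_worker would typically gate a Gradio tab; the tab label, component names, and handler below are assumptions, not code from gradio_app.py.

import gradio as gr

# Sketch only: the layout and handler are assumed, not taken from gradio_app.py.
with gr.Blocks() as demo:
    with gr.Tab("Text Prompt", visible=HAS_T2I):  # hidden when the T2I model failed to load
        prompt = gr.Textbox(label="Prompt")
        seed = gr.Number(value=0, precision=0, label="Seed")
        generate_btn = gr.Button("Generate Image")
        image_out = gr.Image(label="Generated Image")

        # t2i_worker(prompt, seed) returns a 1024x1024 PIL image (see hy3dgen/text2image.py).
        generate_btn.click(
            fn=lambda p, s: t2i_worker(p, seed=int(s)),
            inputs=[prompt, seed],
            outputs=image_out,
        )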
 
hy3dgen/__init__.py ADDED
@@ -0,0 +1 @@
+# Hunyuan3D text-to-image utilities
hy3dgen/text2image.py ADDED
@@ -0,0 +1,156 @@
+"""
+Text-to-Image pipeline for Hunyuan3D
+Supports multiple lightweight models for ZERO GPU deployment
+"""
+
+import os
+import random
+import torch
+import numpy as np
+from diffusers import AutoPipelineForText2Image, DiffusionPipeline
+
+
+def seed_everything(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    os.environ["PL_GLOBAL_SEED"] = str(seed)
+
+
+class Text2ImagePipeline:
+    """Lightweight text-to-image pipeline optimized for ZERO GPU"""
+
+    def __init__(
+        self,
+        model_path="stabilityai/sdxl-turbo",
+        device='cuda'
+    ):
+        self.device = device
+        self.model_path = model_path
+
+        # Load appropriate pipeline based on model
+        if "sdxl-turbo" in model_path.lower():
+            self.pipe = AutoPipelineForText2Image.from_pretrained(
+                model_path,
+                torch_dtype=torch.float16,
+                variant="fp16"
+            ).to(device)
+            self.num_steps = 4  # SDXL-Turbo needs only 1-4 steps
+            self.guidance_scale = 0.0  # SDXL-Turbo works without guidance
+        elif "stable-diffusion-2-1" in model_path.lower():
+            self.pipe = DiffusionPipeline.from_pretrained(
+                model_path,
+                torch_dtype=torch.float16
+            ).to(device)
+            self.num_steps = 25
+            self.guidance_scale = 7.5
+        else:
+            # Default: use AutoPipeline
+            self.pipe = AutoPipelineForText2Image.from_pretrained(
+                model_path,
+                torch_dtype=torch.float16
+            ).to(device)
+            self.num_steps = 25
+            self.guidance_scale = 7.5
+
+        # Enhanced prompt for 3D generation
+        self.pos_txt = ", white background, 3D style, high quality, centered object"
+        self.neg_txt = "text, close-up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck"
+
+    @torch.no_grad()
+    def __call__(self, prompt, seed=0):
+        seed_everything(seed)
+        generator = torch.Generator(device=self.device).manual_seed(int(seed))
+
+        # Enhance prompt for 3D object generation
+        enhanced_prompt = f"{prompt[:60]}{self.pos_txt}"
+
+        # Generate image
+        if "sdxl-turbo" in self.model_path.lower():
+            # SDXL-Turbo: fast, no guidance
+            out_img = self.pipe(
+                prompt=enhanced_prompt,
+                num_inference_steps=self.num_steps,
+                guidance_scale=self.guidance_scale,
+                generator=generator,
+            ).images[0]
+        else:
+            # Standard SD pipeline
+            out_img = self.pipe(
+                prompt=enhanced_prompt,
+                negative_prompt=self.neg_txt,
+                num_inference_steps=self.num_steps,
+                guidance_scale=self.guidance_scale,
+                height=1024,
+                width=1024,
+                generator=generator,
+            ).images[0]
+
+        # Resize to 1024x1024 if needed
+        if out_img.size != (1024, 1024):
+            out_img = out_img.resize((1024, 1024))
+
+        return out_img
+
+
+class HunyuanDiTPipeline(Text2ImagePipeline):
+    """
+    Wrapper for HunyuanDiT - falls back to SDXL-Turbo if HunyuanDiT is unavailable
+    Compatible with original Tencent implementation
+    """
+
+    def __init__(
+        self,
+        model_path="Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
+        device='cuda'
+    ):
+        self.device = device
+
+        try:
+            # Try to load HunyuanDiT
+            self.pipe = AutoPipelineForText2Image.from_pretrained(
+                model_path,
+                torch_dtype=torch.float16,
+                enable_pag=True,
+                pag_applied_layers=["blocks.(16|17|18|19)"]
+            ).to(device)
+            self.num_steps = 25
+            self.guidance_scale = 1.3
+            self.is_hunyuan = True
+            print(f"✓ Loaded HunyuanDiT from {model_path}")
+        except Exception as e:
+            print(f"⚠ Failed to load HunyuanDiT: {e}")
+            print("→ Falling back to SDXL-Turbo (lightweight alternative)")
+            # Fallback to SDXL-Turbo
+            super().__init__(model_path="stabilityai/sdxl-turbo", device=device)
+            self.is_hunyuan = False
+            return
+
+        # HunyuanDiT prompts (Chinese)
+        self.pos_txt = ",白色背景,3D风格,最佳质量"
+        self.neg_txt = "文本,特写,裁剪,出框,最差质量,低质量,JPEG伪影,PGLY,重复,病态," \
+                       "残缺,多余的手指,变异的手,画得不好的手,画得不好的脸,变异,畸形,模糊,脱水,糟糕的解剖学," \
+                       "糟糕的比例,多余的肢体,克隆的脸,毁容,恶心的比例,畸形的肢体,缺失的手臂,缺失的腿," \
+                       "额外的手臂,额外的腿,融合的手指,手指太多,长脖子"
+
+    @torch.no_grad()
+    def __call__(self, prompt, seed=0):
+        if not self.is_hunyuan:
+            # Use parent SDXL-Turbo implementation
+            return super().__call__(prompt, seed)
+
+        seed_everything(seed)
+        generator = torch.Generator(device=self.device).manual_seed(int(seed))
+
+        out_img = self.pipe(
+            prompt=prompt[:60] + self.pos_txt,
+            negative_prompt=self.neg_txt,
+            num_inference_steps=self.num_steps,
+            pag_scale=self.guidance_scale,
+            width=1024,
+            height=1024,
+            generator=generator,
+            return_dict=False
+        )[0][0]
+
+        return out_img
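
The new module can also be exercised on its own. A minimal usage sketch, assuming a CUDA device and network access to download the weights; the example prompt and output filename are placeholders:

from hy3dgen.text2image import HunyuanDiTPipeline

# Tries HunyuanDiT-v1.1-Distilled first; on any load failure (e.g. VRAM limits)
# the constructor switches to the SDXL-Turbo fallback.
t2i = HunyuanDiTPipeline('Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled', device='cuda')

image = t2i("a wooden toy robot", seed=0)  # returns a 1024x1024 PIL.Image
print("backend:", "HunyuanDiT" if t2i.is_hunyuan else "SDXL-Turbo fallback")
image.save("t2i_preview.png")

The is_hunyuan flag lets callers tell which backend actually loaded, while gradio_app.py only needs the single t2i_worker(prompt, seed) entry point either way.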