update

This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
- ckpts/.gitattributes +35 -0
- ckpts/README.md +117 -0
- ckpts/craftsman-v1-5 +1 -0
- craftsman/__pycache__/__init__.cpython-310.pyc +0 -0
- craftsman/__pycache__/__init__.cpython-311.pyc +0 -0
- craftsman/__pycache__/pipeline.cpython-310.pyc +0 -0
- craftsman/__pycache__/pipeline.cpython-311.pyc +0 -0
- craftsman/data/__pycache__/Objaverse.cpython-310.pyc +0 -0
- craftsman/data/__pycache__/__init__.cpython-310.pyc +0 -0
- craftsman/data/__pycache__/base.cpython-310.pyc +0 -0
- craftsman/data/base.py +9 -8
- craftsman/models/__pycache__/__init__.cpython-310.pyc +0 -0
- craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc +0 -0
- craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc +0 -0
- craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc +0 -0
- craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc +0 -0
- craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc +0 -0
- craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc +0 -0
- craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc +0 -0
- craftsman/models/conditional_encoders/cond_encoder.py +1 -22
- craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc +0 -0
- craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc +0 -0
- craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc +0 -0
- craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc +0 -0
- craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc +0 -0
- craftsman/models/denoisers/pixart_denoiser.py +2 -22
- craftsman/models/denoisers/utils.py +1 -185
- craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc +0 -0
- craftsman/models/geometry/__pycache__/base.cpython-310.pyc +0 -0
- craftsman/models/geometry/__pycache__/utils.cpython-310.pyc +0 -0
- craftsman/models/transformers/__pycache__/attention.cpython-310.pyc +0 -0
- craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc +0 -0
- craftsman/models/transformers/__pycache__/utils.cpython-310.pyc +0 -0
- craftsman/models/transformers/attention.py +1 -121
- craftsman/models/transformers/perceiver_1d.py +0 -0
- craftsman/models/transformers/utils.py +0 -0
- craftsman/pipeline.py +13 -0
- craftsman/systems/__pycache__/__init__.cpython-310.pyc +0 -0
- craftsman/systems/__pycache__/base.cpython-310.pyc +0 -0
- craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc +0 -0
- craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc +0 -0
- craftsman/systems/__pycache__/utils.cpython-310.pyc +0 -0
- craftsman/systems/pixart_diffusion.py +3 -3
- craftsman/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- craftsman/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- craftsman/utils/__pycache__/base.cpython-310.pyc +0 -0
- craftsman/utils/__pycache__/base.cpython-311.pyc +0 -0
- craftsman/utils/__pycache__/checkpoint.cpython-310.pyc +0 -0
- craftsman/utils/__pycache__/config.cpython-310.pyc +0 -0
- craftsman/utils/__pycache__/config.cpython-311.pyc +0 -0
ckpts/.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
ckpts/README.md
ADDED
@@ -0,0 +1,117 @@
+---
+library_name: craftsman-v1-5
+license: creativeml-openrail-m
+license_name: creativeml-openrail-m
+license_link: https://raw.githubusercontent.com/CompVis/stable-diffusion/refs/heads/main/LICENSE
+pipeline_tag: image-to-3d
+language:
+- en
+- zh
+---
+
+## **CraftsMan-v1-5**
+
+<p align="center">
+  <img src="./assets/teaser.png" height=200>
+</p>
+
+### <div align="center">CraftsMan: High-fidelity Mesh Generation <br> with 3D Native Generation and Interactive Geometry Refiner</div>
+##### <p align="center"> [Weiyu Li<sup>*1,2</sup>](https://wyysf-98.github.io/), Jiarui Liu<sup>*1,2</sup>, Hongyu Yan<sup>*1,2</sup>, [Rui Chen<sup>1,2</sup>](https://aruichen.github.io/), [Yixun Liang<sup>2,3</sup>](https://yixunliang.github.io/), [Xuelin Chen<sup>4</sup>](https://xuelin-chen.github.io/), [Ping Tan<sup>1,2</sup>](https://ece.hkust.edu.hk/pingtan), [Xiaoxiao Long<sup>1,2</sup>](https://www.xxlong.site/)</p>
+##### <p align="center"> <sup>1</sup>HKUST, <sup>2</sup>LightIllusions, <sup>3</sup>HKUST(GZ), <sup>4</sup>Tencent AI Lab</p>
+<div align="center">
+  <a href="https://craftsman3d.github.io/"><img src="https://img.shields.io/static/v1?label=Project%20Page&message=Github&color=blue&logo=github-pages"></a>
+  <a href="http://algodemo.bj.lightions.top:24926"><img src="https://www.gradio.app/_app/immutable/assets/gradio.CHB5adID.svg" height="25"/></a>
+  <a href="https://arxiv.org/pdf/2405.14979"><img src="https://img.shields.io/static/v1?label=Paper&message=Arxiv&color=red&logo=arxiv"></a>
+</div>
+
+# Usage
+
+To use the model, please refer to the [official repository](https://github.com/wyysf-98/CraftsMan) for installation and usage instructions.
+
+```python
+
+from craftsman import CraftsManPipeline
+import torch
+
+pipeline = CraftsManPipeline.from_pretrained("./ckpts/craftsman-v1-5", device="cuda:0", torch_dtype=torch.float32) # load from local ckpt
+mesh = pipeline("https://pub-f9073a756ec645d692ce3d171c2e1232.r2.dev/data/werewolf.png").meshes[0]
+mesh.export("werewolf.obj")
+
+```
+
+## 🔥🔥🔥 News!!
+
+* Nov 16, 2024: 💬 We release CraftsMan-v1-5.
+
+
+## 📑 Open-source Plan
+
+- [x] Inference
+- [x] Checkpoints
+- [x] Training
+- [ ] ComfyUI
+
+## 🎉 **CraftsMan-v1-5 Architecture**
+
+<p align="center">
+  <img src="./assets/arch.png" height=400>
+</p>
+
+
+## Get Started
+
+#### Begin by cloning the repository:
+
+```shell
+git clone https://github.com/wyysf-98/CraftsMan
+cd CraftsMan
+```
+
+#### Installation Guide for Linux
+
+We provide an env_install.sh script file for setting up the environment.
+
+```shell
+# step 1. create conda env
+conda create -n CraftsMan python=3.10
+conda activate CraftsMan
+
+
+# step 2. install torch-related packages
+conda install -c pytorch pytorch=2.3.0 torchvision=0.18.0 cudatoolkit=11.8
+
+# step 3. install other packages
+pip install -r docker/requirements.txt
+```
+<details>
+
+
+#### Using Gradio
+
+We have prepared a Gradio demo for you to try out the model. You can run the following command to start the demo.
+
+```shell
+# std
+python3 gradio.py
+```
+
+Then the demo can be accessed through the output link.
+
+
+## Citation
+
+If you found this repository helpful, please cite our report:
+```bibtex
+@misc{li2024craftsman,
+    title         = {CraftsMan: High-fidelity Mesh Generation with 3D Native Generation and Interactive Geometry Refiner},
+    author        = {Weiyu Li and Jiarui Liu and Rui Chen and Yixun Liang and Xuelin Chen and Ping Tan and Xiaoxiao Long},
+    year          = {2024},
+    archivePrefix = {arXiv preprint arXiv:2405.14979},
+    primaryClass  = {cs.CG}
+}
+```
+
+
+# License
+
+[creativeml-openrail-m](https://raw.githubusercontent.com/CompVis/stable-diffusion/refs/heads/main/LICENSE)
ckpts/craftsman-v1-5
ADDED
@@ -0,0 +1 @@
+Subproject commit 9a5e9189c2dfab20cf838885dd6acaf99b41844e
craftsman/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/craftsman/__pycache__/__init__.cpython-310.pyc and b/craftsman/__pycache__/__init__.cpython-310.pyc differ
craftsman/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (2.24 kB).
craftsman/__pycache__/pipeline.cpython-310.pyc
CHANGED
Binary files a/craftsman/__pycache__/pipeline.cpython-310.pyc and b/craftsman/__pycache__/pipeline.cpython-310.pyc differ
craftsman/__pycache__/pipeline.cpython-311.pyc
ADDED
Binary file (16.6 kB).
craftsman/data/__pycache__/Objaverse.cpython-310.pyc
CHANGED
Binary files a/craftsman/data/__pycache__/Objaverse.cpython-310.pyc and b/craftsman/data/__pycache__/Objaverse.cpython-310.pyc differ
craftsman/data/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/craftsman/data/__pycache__/__init__.cpython-310.pyc and b/craftsman/data/__pycache__/__init__.cpython-310.pyc differ
craftsman/data/__pycache__/base.cpython-310.pyc
CHANGED
Binary files a/craftsman/data/__pycache__/base.cpython-310.pyc and b/craftsman/data/__pycache__/base.cpython-310.pyc differ
craftsman/data/base.py
CHANGED
@@ -53,7 +53,7 @@ class BaseDataModuleConfig:
     # for occupancy and sdf data
     n_samples: int = 4096 # number of points in input point cloud
     upsample_ratio: int = 1 # upsample ratio for input point cloud
-    sampling_strategy: str =
+    sampling_strategy: Optional[str] = None # sampling strategy for input point cloud
     scale: float = 1.0 # scale of the input point cloud and target supervision
     load_supervision: bool = True # whether to load supervision
     supervision_type: str = "occupancy" # occupancy, sdf, tsdf
@@ -70,6 +70,8 @@ class BaseDataModuleConfig:
     idx: Optional[List[int]] = None # index of the image to load
     n_views: int = 1 # number of views
     marign_pix_dis: int = 30 # margin of the bounding box
+    batch_size: int = 32
+    num_workers: int = 8


 class BaseDataset(Dataset):
@@ -78,7 +80,7 @@ class BaseDataset(Dataset):
         self.cfg: BaseDataModuleConfig = cfg
         self.split = split

-        self.uids = json.load(open(f'{cfg.
+        self.uids = json.load(open(f'{cfg.local_dir}/{split}.json'))
         print(f"Loaded {len(self.uids)} {split} uids")

     def __len__(self):
@@ -94,10 +96,7 @@ class BaseDataset(Dataset):
                 surface = np.concatenate([surface, normal], axis=1)
             elif self.cfg.geo_data_type == "sdf":
                 # for sdf data with our own format
-
-                    data = np.load(f'{self.cfg.geo_data_path}/{self.uids[index]}.npz')
-                else:
-                    data = np.load(f'{self.uids[index]}.npz')
+                data = np.load(f'{self.cfg.geo_data_path}/{self.uids[index]}.npz')
                 # for input point cloud
                 surface = data["surface"]
             else:
@@ -112,6 +111,8 @@ class BaseDataset(Dataset):
                 import fpsample
                 kdline_fps_samples_idx = fpsample.bucket_fps_kdline_sampling(surface[:, :3], self.cfg.n_samples, h=5)
                 surface = surface[kdline_fps_samples_idx]
+            elif self.cfg.sampling_strategy is None:
+                pass
             else:
                 raise NotImplementedError(f"sampling strategy {self.cfg.sampling_strategy} not implemented")
             # rescale data
@@ -189,9 +190,9 @@ class BaseDataset(Dataset):
                 sel_idx = random.choice(self.cfg.idx)
                 ret["sel_image_idx"] = sel_idx
                 if self.cfg.image_type == "rgb":
-                    img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_rgb.
+                    img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_rgb.jpeg"
                 elif self.cfg.image_type == "normal":
-                    img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_normal.
+                    img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_normal.jpeg"
                 ret["image"], ret["mask"] = _load_single_image(img_path, background_color, self.cfg.marign_pix_dis)

             else:
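For context, a minimal sketch (not part of this commit) of what the reworked `sampling_strategy` switch above does to an input point cloud: an FPS-style strategy downsamples the surface with `fpsample`, while the new `None` default passes the surface through unchanged. The `"fps"` strategy name, array shapes, and the standalone helper below are illustrative assumptions; only the `fpsample.bucket_fps_kdline_sampling` call mirrors the diff.

```python
# Illustrative sketch of the sampling branch changed above; the "fps" name and
# array shapes are assumptions, not part of the diff.
import numpy as np
import fpsample  # farthest-point-sampling helpers used by BaseDataset

def subsample_surface(surface: np.ndarray, n_samples: int, sampling_strategy=None) -> np.ndarray:
    """surface: (N, 6) array of xyz + normal; returns (n_samples, 6) or the input unchanged."""
    if sampling_strategy == "fps":
        # bucket-based kd-line FPS over the xyz coordinates, as in BaseDataset.__getitem__
        idx = fpsample.bucket_fps_kdline_sampling(surface[:, :3], n_samples, h=5)
        return surface[idx]
    elif sampling_strategy is None:
        # new behaviour in this commit: no resampling at all
        return surface
    raise NotImplementedError(f"sampling strategy {sampling_strategy} not implemented")

points = np.random.rand(20000, 6).astype(np.float32)
print(subsample_surface(points, 4096, "fps").shape)  # (4096, 6)
print(subsample_surface(points, 4096, None).shape)   # (20000, 6)
```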
craftsman/models/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/__pycache__/__init__.cpython-310.pyc differ
craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc differ
craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc and b/craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc differ
craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc differ
craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc and b/craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc differ
craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc and b/craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc differ
craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc and b/craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc differ
craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc and b/craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc differ
craftsman/models/conditional_encoders/cond_encoder.py
CHANGED
@@ -46,7 +46,6 @@ class CondEmbedder(BaseEmbedder):
     enable_gradient_checkpointing: bool = False
     embeds_fusion_mode: int = 1 # 0: sum | 1: concat
     linear_proj_init: str = "constant"
-    text_model_type: str = "clip"
     text_max_length: int = 77
     image_size_clip: int = 224
     image_size_dino: int = 224
@@ -277,29 +276,9 @@ class CondEmbedder(BaseEmbedder):
         else:
             return vision_outputs.last_hidden_state

-    def post_process_embeds(self, text_embeds, visual_embeds):
-        clip_embeds, dino_embeds = visual_embeds.chunk(2, dim=2)
-        if self.cfg.normalize_embeds:
-            # post-process the text/visual embeds
-            if text_embeds is not None:
-                text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
-            if clip_embeds is not None:
-                clip_embeds = clip_embeds / clip_embeds.norm(dim=-1, keepdim=True)
-            if dino_embeds is not None:
-                dino_embeds = dino_embeds / dino_embeds.norm(dim=-1, keepdim=True)
-
-        assert text_embeds is not None or dino_embeds is not None or clip_embeds is not None
-
-        if text_embeds is not None and visual_embeds is not None:
-            return torch.cat([text_embeds, visual_embeds], dim=1)
-        elif text_embeds is not None:
-            return text_embeds
-        else:
-            return visual_embeds
-
     def encode_image(self, images: Iterable[Optional[ImageType]], cameras: Optional[torch.Tensor] = None, force_none_camera_embeds: bool = False, return_dict: bool = False, **kwargs) -> torch.FloatTensor:
         clip_embeds = self.encode_image_clip(images, cameras)
         dino_embeds = self.encode_image_dino(images, cameras)
         dino_embeds = self.linear_proj(dino_embeds)
         visual_embeds = torch.cat([clip_embeds, dino_embeds], dim=1)
-        return visual_embeds
+        return visual_embeds
craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc and b/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc differ
craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc and b/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc differ
craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc differ
craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc and b/craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc differ
craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc and b/craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc differ
craftsman/models/denoisers/pixart_denoiser.py
CHANGED
@@ -25,15 +25,11 @@ class PixArtDinoDenoiser(BaseModule):
         context_dim: int = 1024
         n_views: int = 1
         context_ln: bool = True
-        skip_ln: bool = False
         init_scale: float = 0.25
         use_checkpoint: bool = False
         drop_path: float = 0.
-        variance_type: str = ""
-        img_pos_embed: bool = False
         clip_weight: float = 1.0
         dino_weight: float = 1.0
-        dit_block: str = ""

     cfg: Config

@@ -63,9 +59,8 @@ class PixArtDinoDenoiser(BaseModule):

         init_scale = self.cfg.init_scale * math.sqrt(1.0 / self.cfg.width)
         drop_path = [x.item() for x in torch.linspace(0, self.cfg.drop_path, self.cfg.layers)]
-        ditblock = getattr(importlib.import_module("craftsman.models.denoisers.utils"), self.cfg.dit_block)
         self.blocks = nn.ModuleList([
-
+            DiTBlock(
                 width=self.cfg.width,
                 heads=self.cfg.heads,
                 init_scale=init_scale,
@@ -82,11 +77,7 @@ class PixArtDinoDenoiser(BaseModule):
         )

         # final layer
-
-            self.output_channels = self.cfg.output_channels * 2
-        else:
-            self.output_channels = self.cfg.output_channels
-        self.final_layer = T2IFinalLayer(self.cfg.width, self.output_channels)
+        self.final_layer = T2IFinalLayer(self.cfg.width, self.cfg.output_channels)

         self.identity_initialize()

@@ -99,17 +90,6 @@ class PixArtDinoDenoiser(BaseModule):
             self.denoiser_ckpt[k.replace('denoiser_model.', '')] = v
         self.load_state_dict(self.denoiser_ckpt, strict=False)

-    def forward_with_dpmsolver(self, model_input, timestep, context):
-        """
-        dpm solver donnot need variance prediction
-        """
-        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
-        model_out = self.forward(model_input, timestep, context)
-        if self.cfg.variance_type.upper() in ["LEARNED", "LEARNED_RANGE"]:
-            return model_out.chunk(2, dim=-1)[0]
-        else:
-            return model_out
-
     def identity_initialize(self):
         for block in self.blocks:
             nn.init.constant_(block.attn.c_proj.weight, 0)
craftsman/models/denoisers/utils.py
CHANGED
@@ -10,126 +10,6 @@ from timm.models.layers import DropPath
 from craftsman.models.transformers.utils import MLP
 from craftsman.models.transformers.attention import MultiheadAttention, MultiheadCrossAttention

-class PatchEmbed(nn.Module):
-    """ 2D Image to Patch Embedding
-    """
-    def __init__(
-        self,
-        patch_size=16,
-        in_chans=3,
-        embed_dim=768,
-        norm_layer=None,
-        flatten=True,
-        bias=True,
-    ):
-        super().__init__()
-        patch_size = to_2tuple(patch_size)
-        self.patch_size = patch_size
-        self.flatten = flatten
-        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
-        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
-
-    def forward(self, x):
-        x = self.proj(x)
-        if self.flatten:
-            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
-        x = self.norm(x)
-        return x
-
-class DiTBlock(nn.Module):
-    """
-    A PixArt block with adaptive layer norm (adaLN-single) conditioning.
-    """
-
-    def __init__(self, width, heads, init_scale=1.0, qkv_bias=True, use_flash=True, drop_path=0.0):
-        super().__init__()
-        self.norm1 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6)
-        self.attn = MultiheadAttention(
-            n_ctx=None,
-            width=width,
-            heads=heads,
-            init_scale=init_scale,
-            qkv_bias=qkv_bias,
-            use_flash=use_flash
-        )
-        self.cross_attn = MultiheadCrossAttention(
-            n_data=None,
-            width=width,
-            heads=heads,
-            data_width=None,
-            init_scale=init_scale,
-            qkv_bias=qkv_bias,
-            use_flash=use_flash,
-        )
-
-        self.norm2 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6)
-
-        self.mlp = MLP(width=width, init_scale=init_scale)
-        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
-        self.scale_shift_table = nn.Parameter(torch.randn(6, width) / width ** 0.5)
-
-    def forward(self, x, visual_cond, t, **kwargs):
-        B, N, C = x.shape
-
-        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1)
-        x = x + self.drop_path(gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C))
-        x = x + self.cross_attn(x, visual_cond)
-        x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))
-
-        return x
-
-class DiTBlock_text(nn.Module):
-    """
-    A PixArt block with adaptive layer norm (adaLN-single) conditioning.
-    """
-
-    def __init__(self, width, heads, init_scale=1.0, qkv_bias=True, use_flash=True, drop_path=0.0):
-        super().__init__()
-        self.norm1 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6)
-        self.attn = MultiheadAttention(
-            n_ctx=None,
-            width=width,
-            heads=heads,
-            init_scale=init_scale,
-            qkv_bias=qkv_bias,
-            use_flash=use_flash
-        )
-        self.cross_attn = MultiheadCrossAttention(
-            n_data=None,
-            width=width,
-            heads=heads,
-            data_width=None,
-            init_scale=init_scale,
-            qkv_bias=qkv_bias,
-            use_flash=use_flash,
-        )
-
-        self.cross_attn_extra = MultiheadCrossAttention(
-            n_data=None,
-            width=width,
-            heads=heads,
-            data_width=None,
-            init_scale=init_scale,
-            qkv_bias=qkv_bias,
-            use_flash=use_flash,
-        )
-        self.norm2 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6)
-
-        self.mlp = MLP(width=width, init_scale=init_scale)
-        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
-        self.scale_shift_table = nn.Parameter(torch.randn(6, width) / width ** 0.5)
-
-    def forward(self, x, visual_cond, text_cond, t, **kwargs):
-        B, N, C = x.shape
-
-        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1)
-        x = x + self.drop_path(gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C))
-        x = x + self.cross_attn(x, visual_cond)
-        x = x + self.cross_attn_extra(x, text_cond)
-        x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))
-
-        return x
-
 class DiTBlock(nn.Module):
     """
     A DiT block with adaptive layer norm (adaLN-single) conditioning.
@@ -174,11 +54,6 @@ class DiTBlock(nn.Module):
 def t2i_modulate(x, shift, scale):
     return x * (1 + scale) + shift

-# def t2i_modulate(x, shift, scale):
-#     a = torch.ones_like(scale)
-#     a[..., 768:] = 0
-#     return x * (a + scale) + shift
-
 def auto_grad_checkpoint(module, *args, **kwargs):
     if getattr(module, 'grad_checkpointing', False):
         if not isinstance(module, Iterable):
@@ -268,63 +143,4 @@ class T2IFinalLayer(nn.Module):
         shift, scale = (self.scale_shift_table[None] + t[:, None]).chunk(2, dim=1)
         x = t2i_modulate(self.norm_final(x), shift, scale)
         x = self.linear(x)
-        return x
-
-def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
-    """
-    embed_dim: output dimension for each position
-    pos: a list of positions to be encoded: size (M,)
-    out: (M, D)
-    """
-    assert embed_dim % 2 == 0
-    omega = np.arange(embed_dim // 2, dtype=np.float64)
-    omega /= embed_dim / 2.
-    omega = 1. / 10000 ** omega  # (D/2,)
-
-    pos = pos.reshape(-1)  # (M,)
-    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
-
-    emb_sin = np.sin(out)  # (M, D/2)
-    emb_cos = np.cos(out)  # (M, D/2)
-
-    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
-    return emb
-
-def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
-    assert embed_dim % 2 == 0
-
-    # use half of dimensions to encode grid_h
-    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
-    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
-
-    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
-    return emb
-
-def _ntuple(n):
-    def parse(x):
-        if isinstance(x, Iterable) and not isinstance(x, str):
-            return x
-        return tuple(repeat(x, n))
-    return parse
-
-to_1tuple = _ntuple(1)
-to_2tuple = _ntuple(2)
-
-def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, pe_interpolation=1.0, base_size=16):
-    """
-    grid_size: int of the grid height and width
-    return:
-    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
-    """
-    if isinstance(grid_size, int):
-        grid_size = to_2tuple(grid_size)
-    grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0]/base_size) / pe_interpolation
-    grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1]/base_size) / pe_interpolation
-    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
-    grid = np.stack(grid, axis=0)
-    grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
-
-    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
-    if cls_token and extra_tokens > 0:
-        pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
-    return pos_embed
+        return x
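The DiTBlock kept above conditions on the timestep through adaLN-single: the timestep embedding is projected to six width-sized vectors, offset by the learned `scale_shift_table`, and applied around the attention and MLP branches via `t2i_modulate` (`x * (1 + scale) + shift`). A small self-contained sketch of that modulation path; the tensor sizes are illustrative assumptions, only the 6-way chunk and `t2i_modulate` mirror the code above.

```python
# Sketch of the adaLN-single modulation used by DiTBlock; sizes are illustrative.
import torch
import torch.nn.functional as F

def t2i_modulate(x, shift, scale):
    return x * (1 + scale) + shift

B, N, width = 2, 256, 768
x = torch.randn(B, N, width)                  # latent tokens entering the block
t = torch.randn(B, 6 * width)                 # timestep embedding, projected to 6 * width
scale_shift_table = torch.randn(6, width) / width ** 0.5

# six per-sample modulation tensors of shape (B, 1, width)
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = \
    (scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1)

# the attention/MLP inputs are modulated and their outputs gated, as in DiTBlock.forward
h = t2i_modulate(F.layer_norm(x, (width,)), shift_msa, scale_msa)
print(h.shape, gate_msa.shape)                # torch.Size([2, 256, 768]) torch.Size([2, 1, 768])
```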
craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc differ
craftsman/models/geometry/__pycache__/base.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/geometry/__pycache__/base.cpython-310.pyc and b/craftsman/models/geometry/__pycache__/base.cpython-310.pyc differ
craftsman/models/geometry/__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/geometry/__pycache__/utils.cpython-310.pyc and b/craftsman/models/geometry/__pycache__/utils.cpython-310.pyc differ
craftsman/models/transformers/__pycache__/attention.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/transformers/__pycache__/attention.cpython-310.pyc and b/craftsman/models/transformers/__pycache__/attention.cpython-310.pyc differ
craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc and b/craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc differ
craftsman/models/transformers/__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/craftsman/models/transformers/__pycache__/utils.cpython-310.pyc and b/craftsman/models/transformers/__pycache__/utils.cpython-310.pyc differ
craftsman/models/transformers/attention.py
CHANGED
@@ -9,126 +9,6 @@ from craftsman.utils.checkpoint import checkpoint
 from .utils import init_linear, MLP
 from timm.models.vision_transformer import Attention

-def scaled_dot_product_gqa(
-    query: Tensor,
-    key: Tensor,
-    value: Tensor,
-    dropout: float = 0.0,
-    scale: Optional[float] = None,
-    mask: Optional[Tensor] = None,
-    is_causal: Optional[bool] = None,
-    need_weights: bool = False,
-    average_attn_weights: bool = False,
-    force_grouped: bool = False,
-):
-    """Scaled dot product attention with support for grouped queries.
-
-    Einstein notation:
-    - b: batch size
-    - n / s: sequence length
-    - h: number of heads
-    - g: number of groups
-    - d: dimension of query/key/value
-
-    Args:
-        query: Query tensor of shape (b, n, h, d)
-        key: Key tensor of shape (b, s, h, d)
-        value: Value tensor of shape (b, s, h, d)
-        dropout: Dropout probability (default: 0.0)
-        scale: Scale factor for query (default: d_query ** 0.5)
-        mask: Mask tensor of shape (b, n, s) or (b, s). If 'ndim == 2', the mask is
-            applied to all 'n' rows of the attention matrix. (default: None)
-        force_grouped: If True, apply grouped-query attention even if the number of
-            heads is equal for query, key, and value. (default: False)
-
-    Returns:
-        2-tuple of:
-        - Attention output with shape (b, n, h, d)
-        - (Optional) Attention weights with shape (b, h, n, s). Only returned if
-          'need_weights' is True.
-    """
-    if (mask is not None) and (is_causal is not None):
-        raise ValueError(
-            "Only one of 'mask' and 'is_causal' should be provided, but got both."
-        )
-    elif not query.ndim == key.ndim == value.ndim == 4:
-        raise ValueError(
-            f"Expected query, key, and value to be 4-dimensional, but got shapes "
-            f"{query.shape}, {key.shape}, and {value.shape}."
-        )
-
-    # Move sequence length dimension to axis 2.
-    # This makes the attention operations below *much* faster.
-    query = rearrange(query, "b n h d -> b h n d")
-    key = rearrange(key, "b s h d -> b h s d")
-    value = rearrange(value, "b s h d -> b h s d")
-
-    bq, hq, nq, dq = query.shape
-    bk, hk, nk, dk = key.shape
-    bv, hv, nv, dv = value.shape
-    if not (bq == bk == bv and dq == dk == dv):
-        raise ValueError(
-            "Expected query, key, and value to have the same batch size (dim=0) and "
-            f"embedding dimension (dim=3), but got query: {query.shape}, "
-            f"key: {key.shape}, and value: {value.shape}."
-        )
-    elif (hk != hv) or (nk != nv):
-        raise ValueError(
-            "Expected key and value to have the same size in dimensions 1 and 2, but "
-            f"got key: {key.shape} and value: {value.shape}."
-        )
-    elif hq % hk != 0:
-        raise ValueError(
-            "Expected query heads to be a multiple of key/value heads, but got "
-            f"query: {query.shape} and key/value: {key.shape}."
-        )
-
-    if scale is None:
-        scale = query.size(-1) ** 0.5
-    query = query / scale
-
-    num_head_groups = hq // hk
-    query = rearrange(query, "b (h g) n d -> b g h n d", g=num_head_groups)
-    similarity = einsum(query, key, "b g h n d, b h s d -> b g h n s")
-
-    if is_causal:
-        # Mask out the upper triangular portion of the attention matrix. This prevents
-        # the model from attending to tokens in the future.
-        mask = torch.ones((bq, nq, nk), device=query.device, dtype=torch.bool).tril_()
-
-    if mask is not None:
-        # Expand mask to match the shape of the attention matrix.
-        # If mask is 2D, assume that it is applied to the key/value sequence dimension.
-        # Else if mask is 3D, assume that it is applied to the query/key/value sequence
-        # dimension for all attention heads.
-        #
-        if mask.ndim == 2:
-            mask = rearrange(mask, "b s -> b () () () s")
-        elif mask.ndim == 3:
-            mask = rearrange(mask, "b n s -> b () () n s")
-        # Mask similarity values by setting them to negative infinity. This guarantees
-        # that they will not contribute to the softmax computation below.
-        similarity.masked_fill_(~mask, torch.finfo(similarity.dtype).min)
-
-    attention = F.softmax(similarity, dim=-1)
-    if dropout > 0.0:
-        attention = F.dropout(attention, p=dropout)
-
-    # Apply attention matrix to the value Tensor.
-    out = einsum(attention, value, "b g h n s, b h s d -> b g h n d")
-    # Move head dimension back to axis 2
-    out = rearrange(out, "b g h n d -> b n (h g) d")
-
-    attn_weights: Optional[Tensor] = None
-    if need_weights:
-        # Move the sequence dimensions back to positions 1, 2. Move the head dimension
-        # to position 3. This more closely matches the return shape of the attention
-        # output: (b, n, h, d).
-        attn_weights = rearrange(attention, "b g h n s -> b n s (h g)")
-        if average_attn_weights:
-            attn_weights = attn_weights.mean(dim=1)
-
-    return out, attn_weights

 class MultiheadAttention(nn.Module):
     def __init__(
@@ -327,4 +207,4 @@ class ResidualCrossAttentionBlock(nn.Module):
     def forward(self, x: torch.Tensor, data: torch.Tensor):
         x = x + self.attn(self.ln_1(x), self.ln_2(data))
         x = x + self.mlp(self.ln_3(x))
-        return x
+        return x
craftsman/models/transformers/perceiver_1d.py
CHANGED
File without changes
craftsman/models/transformers/utils.py
CHANGED
File without changes
craftsman/pipeline.py
CHANGED
@@ -158,6 +158,7 @@ class CraftsManPipeline():
         background_color: List[int] = [255, 255, 255],
         foreground_ratio: float = 0.95,
         mc_depth: int = 8,
+        only_max_component: bool = False,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -198,6 +199,9 @@ class CraftsManPipeline():
             mc_depth (`int`, *optional*, defaults to 8):
                 The resolution of the Marching Cubes algorithm. The resolution is the number of cubes in the x, y, and z.
                 8 means 2^8 = 256 cubes in each dimension. The higher the resolution, the more detailed the mesh will be.
+            only_max_component (`bool`, *optional*, defaults to `False`):
+                Whether to only keep the largest connected component of the mesh. This is useful when the mesh has
+                multiple components and only the largest one is needed.
         Examples:

         Returns:
@@ -258,6 +262,15 @@ class CraftsManPipeline():
             if output_type == "trimesh":
                 import trimesh
                 cur_mesh = trimesh.Trimesh(vertices=mesh_v_f[0][0], faces=mesh_v_f[0][1])
+                if only_max_component:
+                    components = cur_mesh.split(only_watertight=False)
+                    bbox = []
+                    for c in components:
+                        bbmin = c.vertices.min(0)
+                        bbmax = c.vertices.max(0)
+                        bbox.append((bbmax - bbmin).max())
+                    max_component = np.argmax(bbox)
+                    cur_mesh = components[max_component]
                 mesh.append(cur_mesh)
             elif output_type == "np":
                 mesh.append(mesh_v_f[0])
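A usage sketch (not part of the diff) for the new `only_max_component` flag, following the README example above; the checkpoint path and input image are the ones used there, and passing `mc_depth` and `only_max_component` as call-time keywords assumes the signature added in this hunk.

```python
# Hedged usage sketch of the new flag; paths follow the README example.
import torch
from craftsman import CraftsManPipeline

pipeline = CraftsManPipeline.from_pretrained(
    "./ckpts/craftsman-v1-5", device="cuda:0", torch_dtype=torch.float32
)
result = pipeline(
    "https://pub-f9073a756ec645d692ce3d171c2e1232.r2.dev/data/werewolf.png",
    mc_depth=8,               # 2^8 = 256 marching-cubes cells per axis
    only_max_component=True,  # drop floaters, keep only the largest connected component
)
result.meshes[0].export("werewolf.obj")
```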
craftsman/systems/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/craftsman/systems/__pycache__/__init__.cpython-310.pyc and b/craftsman/systems/__pycache__/__init__.cpython-310.pyc differ
craftsman/systems/__pycache__/base.cpython-310.pyc
CHANGED
Binary files a/craftsman/systems/__pycache__/base.cpython-310.pyc and b/craftsman/systems/__pycache__/base.cpython-310.pyc differ
craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc
CHANGED
Binary files a/craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc and b/craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc differ
craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc
CHANGED
Binary files a/craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc and b/craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc differ
craftsman/systems/__pycache__/utils.cpython-310.pyc
CHANGED
File without changes
craftsman/systems/pixart_diffusion.py
CHANGED
@@ -251,9 +251,9 @@ class PixArtDiffusionSystem(BaseSystem):
         return {
             "loss_diffusion": loss,
             "latents": latents,
-            "x_t":
+            "x_t": noisy_z,
             "noise": noise,
-            "noise_pred":
+            "noise_pred": noise_pred,
             "timesteps": timesteps,
         }

@@ -373,4 +373,4 @@ class PixArtDiffusionSystem(BaseSystem):
         return outputs

     def on_validation_epoch_end(self):
-        pass
+        pass
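For reference, a self-contained sketch of the quantities the corrected dictionary now returns: `x_t` is the noised latent and `noise_pred` the denoiser's epsilon estimate. The noise schedule, tensor shapes, and the stub denoiser below are illustrative assumptions; only the key names mirror the code above.

```python
# Illustrative epsilon-prediction step; schedule and shapes are assumptions.
import torch
import torch.nn.functional as F

B, N, C = 4, 256, 64
latents = torch.randn(B, N, C)             # clean shape latents z_0
noise = torch.randn_like(latents)          # epsilon
timesteps = torch.randint(0, 1000, (B,))   # per-sample diffusion step

# toy alpha_bar schedule just for the sketch
alphas_cumprod = torch.linspace(0.9999, 0.0001, 1000)
a_bar = alphas_cumprod[timesteps].view(B, 1, 1)
noisy_z = a_bar.sqrt() * latents + (1 - a_bar).sqrt() * noise   # x_t

def denoiser_model(x_t, t, context=None):   # stand-in for the PixArt denoiser
    return torch.zeros_like(x_t)

noise_pred = denoiser_model(noisy_z, timesteps)
loss = F.mse_loss(noise_pred, noise)        # epsilon-prediction objective

out = {"loss_diffusion": loss, "latents": latents, "x_t": noisy_z,
       "noise": noise, "noise_pred": noise_pred, "timesteps": timesteps}
```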
craftsman/utils/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/craftsman/utils/__pycache__/__init__.cpython-310.pyc and b/craftsman/utils/__pycache__/__init__.cpython-310.pyc differ
craftsman/utils/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (223 Bytes).
craftsman/utils/__pycache__/base.cpython-310.pyc
CHANGED
Binary files a/craftsman/utils/__pycache__/base.cpython-310.pyc and b/craftsman/utils/__pycache__/base.cpython-310.pyc differ
craftsman/utils/__pycache__/base.cpython-311.pyc
ADDED
Binary file (7.57 kB).
craftsman/utils/__pycache__/checkpoint.cpython-310.pyc
CHANGED
Binary files a/craftsman/utils/__pycache__/checkpoint.cpython-310.pyc and b/craftsman/utils/__pycache__/checkpoint.cpython-310.pyc differ
craftsman/utils/__pycache__/config.cpython-310.pyc
CHANGED
Binary files a/craftsman/utils/__pycache__/config.cpython-310.pyc and b/craftsman/utils/__pycache__/config.cpython-310.pyc differ
craftsman/utils/__pycache__/config.cpython-311.pyc
ADDED
Binary file (9.1 kB).