gartajackhats1985 committed
Commit
583c1c7
1 Parent(s): 21d588d

Upload 411 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +4 -0
  2. ComfyUI-PuLID-Flux-Enhanced/LICENSE +201 -0
  3. ComfyUI-PuLID-Flux-Enhanced/README.md +50 -0
  4. ComfyUI-PuLID-Flux-Enhanced/__init__.py +3 -0
  5. ComfyUI-PuLID-Flux-Enhanced/__pycache__/__init__.cpython-312.pyc +0 -0
  6. ComfyUI-PuLID-Flux-Enhanced/__pycache__/encoders_flux.cpython-312.pyc +0 -0
  7. ComfyUI-PuLID-Flux-Enhanced/__pycache__/online_train2.cpython-312.pyc +0 -0
  8. ComfyUI-PuLID-Flux-Enhanced/__pycache__/pulidflux.cpython-312.pyc +0 -0
  9. ComfyUI-PuLID-Flux-Enhanced/encoders_flux.py +207 -0
  10. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__init__.py +11 -0
  11. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/__init__.cpython-312.pyc +0 -0
  12. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/constants.cpython-312.pyc +0 -0
  13. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/eva_vit_model.cpython-312.pyc +0 -0
  14. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/factory.cpython-312.pyc +0 -0
  15. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/hf_configs.cpython-312.pyc +0 -0
  16. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/hf_model.cpython-312.pyc +0 -0
  17. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/loss.cpython-312.pyc +0 -0
  18. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/model.cpython-312.pyc +0 -0
  19. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/modified_resnet.cpython-312.pyc +0 -0
  20. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/openai.cpython-312.pyc +0 -0
  21. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/pretrained.cpython-312.pyc +0 -0
  22. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/rope.cpython-312.pyc +0 -0
  23. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/timm_model.cpython-312.pyc +0 -0
  24. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/tokenizer.cpython-312.pyc +0 -0
  25. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/transform.cpython-312.pyc +0 -0
  26. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/transformer.cpython-312.pyc +0 -0
  27. ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/utils.cpython-312.pyc +0 -0
  28. ComfyUI-PuLID-Flux-Enhanced/eva_clip/bpe_simple_vocab_16e6.txt.gz +3 -0
  29. ComfyUI-PuLID-Flux-Enhanced/eva_clip/constants.py +2 -0
  30. ComfyUI-PuLID-Flux-Enhanced/eva_clip/eva_vit_model.py +548 -0
  31. ComfyUI-PuLID-Flux-Enhanced/eva_clip/factory.py +517 -0
  32. ComfyUI-PuLID-Flux-Enhanced/eva_clip/hf_configs.py +57 -0
  33. ComfyUI-PuLID-Flux-Enhanced/eva_clip/hf_model.py +248 -0
  34. ComfyUI-PuLID-Flux-Enhanced/eva_clip/loss.py +138 -0
  35. ComfyUI-PuLID-Flux-Enhanced/eva_clip/model.py +439 -0
  36. ComfyUI-PuLID-Flux-Enhanced/eva_clip/model_configs/EVA01-CLIP-B-16.json +19 -0
  37. ComfyUI-PuLID-Flux-Enhanced/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json +24 -0
  38. ComfyUI-PuLID-Flux-Enhanced/eva_clip/model_configs/EVA01-CLIP-g-14.json +24 -0
  39. ComfyUI-PuLID-Flux-Enhanced/eva_clip/model_configs/EVA02-CLIP-B-16.json +29 -0
  40. ComfyUI-PuLID-Flux-Enhanced/eva_clip/model_configs/EVA02-CLIP-L-14-336.json +29 -0
  41. ComfyUI-PuLID-Flux-Enhanced/eva_clip/model_configs/EVA02-CLIP-L-14.json +29 -0
  42. ComfyUI-PuLID-Flux-Enhanced/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json +25 -0
  43. ComfyUI-PuLID-Flux-Enhanced/eva_clip/model_configs/EVA02-CLIP-bigE-14.json +25 -0
  44. ComfyUI-PuLID-Flux-Enhanced/eva_clip/modified_resnet.py +181 -0
  45. ComfyUI-PuLID-Flux-Enhanced/eva_clip/openai.py +144 -0
  46. ComfyUI-PuLID-Flux-Enhanced/eva_clip/pretrained.py +332 -0
  47. ComfyUI-PuLID-Flux-Enhanced/eva_clip/rope.py +137 -0
  48. ComfyUI-PuLID-Flux-Enhanced/eva_clip/timm_model.py +122 -0
  49. ComfyUI-PuLID-Flux-Enhanced/eva_clip/tokenizer.py +201 -0
  50. ComfyUI-PuLID-Flux-Enhanced/eva_clip/transform.py +103 -0
.gitattributes CHANGED
@@ -42,3 +42,7 @@ ComfyUI-AdvancedLivePortrait/sample/original_sample_asset/driving/d0.mp4 filter=
  ComfyUI-AdvancedLivePortrait/sample/original_sample_asset/driving/d3.mp4 filter=lfs diff=lfs merge=lfs -text
  ComfyUI-AdvancedLivePortrait/sample/original_sample_asset/driving/d6.mp4 filter=lfs diff=lfs merge=lfs -text
  ComfyUI-AdvancedLivePortrait/sample/original_sample_asset/driving/d9.mp4 filter=lfs diff=lfs merge=lfs -text
+ rgthree-comfy/docs/rgthree_context_metadata.png filter=lfs diff=lfs merge=lfs -text
+ was-node-suite-comfyui/repos/SAM/assets/masks1.png filter=lfs diff=lfs merge=lfs -text
+ was-node-suite-comfyui/repos/SAM/assets/minidemo.gif filter=lfs diff=lfs merge=lfs -text
+ was-node-suite-comfyui/repos/SAM/assets/notebook2.png filter=lfs diff=lfs merge=lfs -text
ComfyUI-PuLID-Flux-Enhanced/LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
ComfyUI-PuLID-Flux-Enhanced/README.md ADDED
@@ -0,0 +1,50 @@
+ # ComfyUI-PuLID-Flux-Enhanced
+ Adapted from https://github.com/balazik/ComfyUI-PuLID-Flux
+
+ Workflow: see the example flux_pulid_multi.json
+
+ ## Update Oct. 28, 2024
+ Added an optional prior image input to the node. When using the train_weight method, the prior image acts as the main ID image, guiding the other ID images as they are summed into an optimized ID embedding.
+
+ This prior was previously chosen at random; now you can assign it.
+
+ Leaving the prior image input empty is still fine, as before.
+
+ Choose what you consider the best ID image as the prior, or just experiment and see what happens.
+ ![oct28](https://github.com/user-attachments/assets/6a481cd9-2836-4f6f-9ad5-7458356c332a)
+
+ ## New features
+ ### Common fusion methods for multi-image input
+ mean (official), concat, max, etc.
+
+ ### Further experimental fusion methods
+ Using the norm of the conditions to weight them.
+
+ Using the max-norm token among images.
+
+ A novel, very fast embedding self-training method (explained here: https://github.com/balazik/ComfyUI-PuLID-Flux/issues/28).
+
+ ### Switch between the gray image (official) and RGB
+ In some cases, using the gray image causes a loss of detail.
+
+ ![2024-10-12_204047](https://github.com/user-attachments/assets/0ae96170-2eff-44e9-a53a-6a7447dbc0f1)
+
+ ## Tricks to make your generation better
+ ### Fusion methods leverage many ID images to enhance fidelity
+ 1. Besides mean fusion, you can try max or max_token, which can boost a face's major features (large eyes, a distinctive nose, and so on). It can, however, push past fidelity into distortion.
+ 2. With the train_weight method, fewer than 2000 training steps give a deeper fusion than the non-training methods. Be aware that too many training steps will make the result collapse toward the prior image.
+
+ ### Additional notes
+ 1. Flux is a high-capacity base model; it can even perceive the input image in ways humans cannot.
+ For example, resize your high-quality input image with the Lanczos method rather than nearest-area or bilinear and you get finer texture. When the base model is strong, taking care of your input image is what matters.
+ 2. The best PuLID weight is around 0.8-0.95 for Flux PuLID 0.9.0; 1.0 is not good. For 0.9.1 it is higher, around 0.9-1.0. Nonetheless, 0.9.1 is not always better than 0.9.0.
+ 3. The base model is flux-dev or one of its finetunes, and precision matters: fp16 should always be sound, fp8 is OK, and I don't recommend GGUF or NF4 variants.
+ 4. Some finetuned flux-dev models may carry strong biases; for example, they may sway the faces toward a certain ethnicity.
+ 5. Euler with the simple scheduler always works. Euler with the beta scheduler gives higher quality, especially if your input image is somewhat low quality.
+
+ ## Basic notes for common users
+ This is an experimental node. It can give enhanced results, but I am not promising step-by-step instructions for users with little Python or AI development background.
+
+ Please follow the ComfyUI instructions or https://github.com/balazik/ComfyUI-PuLID-Flux to get it running.
+
+ If you are just using SDXL PuLID, you can use https://github.com/cubiq/PuLID_ComfyUI. Some of the installation instructions there may also help.
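The fusion methods named in the README above (mean, concat, max, plus the experimental norm-weighted and max-token variants) are easiest to picture as operations over a stack of per-image ID embeddings. The sketch below is illustrative only, under the assumption of an `id_embeds` tensor of shape `(num_images, tokens, dim)`; the node's actual implementation lives in pulidflux.py / online_train2.py and may differ.

```python
# Hedged sketch of multi-image ID embedding fusion; shapes and option names are assumptions.
import torch

def fuse_id_embeddings(id_embeds: torch.Tensor, method: str = "mean") -> torch.Tensor:
    """Fuse per-image ID embeddings (num_images, tokens, dim) into one conditioning tensor."""
    if method == "mean":            # official behaviour: average across images
        return id_embeds.mean(dim=0, keepdim=True)
    if method == "concat":          # keep every token from every image
        return id_embeds.reshape(1, -1, id_embeds.shape[-1])
    if method == "max":             # element-wise maximum across images
        return id_embeds.max(dim=0, keepdim=True).values
    if method == "norm_weighted":   # weight each image by the norm of its embedding
        w = id_embeds.flatten(1).norm(dim=1)
        w = w / w.sum()
        return (id_embeds * w.view(-1, 1, 1)).sum(dim=0, keepdim=True)
    if method == "max_token":       # per token position, take the image whose token has the largest norm
        idx = id_embeds.norm(dim=-1).argmax(dim=0)                      # (tokens,)
        return id_embeds[idx, torch.arange(id_embeds.shape[1])].unsqueeze(0)
    raise ValueError(f"unknown fusion method: {method}")
```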
ComfyUI-PuLID-Flux-Enhanced/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .pulidflux import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
+
+ __all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']
ComfyUI-PuLID-Flux-Enhanced/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (337 Bytes). View file
 
ComfyUI-PuLID-Flux-Enhanced/__pycache__/encoders_flux.cpython-312.pyc ADDED
Binary file (10.1 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/__pycache__/online_train2.cpython-312.pyc ADDED
Binary file (4.03 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/__pycache__/pulidflux.cpython-312.pyc ADDED
Binary file (24.5 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/encoders_flux.py ADDED
@@ -0,0 +1,207 @@
+ import math
+
+ import torch
+ import torch.nn as nn
+
+
+ # FFN
+ def FeedForward(dim, mult=4):
+     inner_dim = int(dim * mult)
+     return nn.Sequential(
+         nn.LayerNorm(dim),
+         nn.Linear(dim, inner_dim, bias=False),
+         nn.GELU(),
+         nn.Linear(inner_dim, dim, bias=False),
+     )
+
+
+ def reshape_tensor(x, heads):
+     bs, length, width = x.shape
+     # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
+     x = x.view(bs, length, heads, -1)
+     # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
+     x = x.transpose(1, 2)
+     # keep (bs, n_heads, length, dim_per_head); the reshape just makes the layout contiguous
+     x = x.reshape(bs, heads, length, -1)
+     return x
+
+
+ class PerceiverAttentionCA(nn.Module):
+     def __init__(self, *, dim=3072, dim_head=128, heads=16, kv_dim=2048):
+         super().__init__()
+         self.scale = dim_head ** -0.5
+         self.dim_head = dim_head
+         self.heads = heads
+         inner_dim = dim_head * heads
+
+         self.norm1 = nn.LayerNorm(dim if kv_dim is None else kv_dim)
+         self.norm2 = nn.LayerNorm(dim)
+
+         self.to_q = nn.Linear(dim, inner_dim, bias=False)
+         self.to_kv = nn.Linear(dim if kv_dim is None else kv_dim, inner_dim * 2, bias=False)
+         self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+     def forward(self, x, latents):
+         """
+         Args:
+             x (torch.Tensor): image features
+                 shape (b, n1, D)
+             latents (torch.Tensor): latent features
+                 shape (b, n2, D)
+         """
+         x = self.norm1(x)
+         latents = self.norm2(latents)
+
+         b, seq_len, _ = latents.shape
+
+         q = self.to_q(latents)
+         k, v = self.to_kv(x).chunk(2, dim=-1)
+
+         q = reshape_tensor(q, self.heads)
+         k = reshape_tensor(k, self.heads)
+         v = reshape_tensor(v, self.heads)
+
+         # attention
+         scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+         weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
+         weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+         out = weight @ v
+
+         out = out.permute(0, 2, 1, 3).reshape(b, seq_len, -1)
+
+         return self.to_out(out)
+
+
+ class PerceiverAttention(nn.Module):
+     def __init__(self, *, dim, dim_head=64, heads=8, kv_dim=None):
+         super().__init__()
+         self.scale = dim_head ** -0.5
+         self.dim_head = dim_head
+         self.heads = heads
+         inner_dim = dim_head * heads
+
+         self.norm1 = nn.LayerNorm(dim if kv_dim is None else kv_dim)
+         self.norm2 = nn.LayerNorm(dim)
+
+         self.to_q = nn.Linear(dim, inner_dim, bias=False)
+         self.to_kv = nn.Linear(dim if kv_dim is None else kv_dim, inner_dim * 2, bias=False)
+         self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+     def forward(self, x, latents):
+         """
+         Args:
+             x (torch.Tensor): image features
+                 shape (b, n1, D)
+             latents (torch.Tensor): latent features
+                 shape (b, n2, D)
+         """
+         x = self.norm1(x)
+         latents = self.norm2(latents)
+
+         b, seq_len, _ = latents.shape
+
+         q = self.to_q(latents)
+         kv_input = torch.cat((x, latents), dim=-2)
+         k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+
+         q = reshape_tensor(q, self.heads)
+         k = reshape_tensor(k, self.heads)
+         v = reshape_tensor(v, self.heads)
+
+         # attention
+         scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+         weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
+         weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+         out = weight @ v
+
+         out = out.permute(0, 2, 1, 3).reshape(b, seq_len, -1)
+
+         return self.to_out(out)
+
+
+ class IDFormer(nn.Module):
+     """
+     - perceiver resampler like arch (compared with previous MLP-like arch)
+     - we concat id embedding (generated by arcface) and query tokens as latents
+     - latents will attend each other and interact with vit features through cross-attention
+     - vit features are multi-scaled and inserted into IDFormer in order, currently, each scale corresponds to two
+       IDFormer layers
+     """
+     def __init__(
+         self,
+         dim=1024,
+         depth=10,
+         dim_head=64,
+         heads=16,
+         num_id_token=5,
+         num_queries=32,
+         output_dim=2048,
+         ff_mult=4,
+     ):
+         super().__init__()
+
+         self.num_id_token = num_id_token
+         self.dim = dim
+         self.num_queries = num_queries
+         assert depth % 5 == 0
+         self.depth = depth // 5
+         scale = dim ** -0.5
+
+         self.latents = nn.Parameter(torch.randn(1, num_queries, dim) * scale)
+         self.proj_out = nn.Parameter(scale * torch.randn(dim, output_dim))
+
+         self.layers = nn.ModuleList([])
+         for _ in range(depth):
+             self.layers.append(
+                 nn.ModuleList(
+                     [
+                         PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                         FeedForward(dim=dim, mult=ff_mult),
+                     ]
+                 )
+             )
+
+         for i in range(5):
+             setattr(
+                 self,
+                 f'mapping_{i}',
+                 nn.Sequential(
+                     nn.Linear(1024, 1024),
+                     nn.LayerNorm(1024),
+                     nn.LeakyReLU(),
+                     nn.Linear(1024, 1024),
+                     nn.LayerNorm(1024),
+                     nn.LeakyReLU(),
+                     nn.Linear(1024, dim),
+                 ),
+             )
+
+         self.id_embedding_mapping = nn.Sequential(
+             nn.Linear(1280, 1024),
+             nn.LayerNorm(1024),
+             nn.LeakyReLU(),
+             nn.Linear(1024, 1024),
+             nn.LayerNorm(1024),
+             nn.LeakyReLU(),
+             nn.Linear(1024, dim * num_id_token),
+         )
+
+     def forward(self, x, y):
+
+         latents = self.latents.repeat(x.size(0), 1, 1)
+
+         x = self.id_embedding_mapping(x)
+         x = x.reshape(-1, self.num_id_token, self.dim)
+
+         latents = torch.cat((latents, x), dim=1)
+
+         for i in range(5):
+             vit_feature = getattr(self, f'mapping_{i}')(y[i])
+             ctx_feature = torch.cat((x, vit_feature), dim=1)
+             for attn, ff in self.layers[i * self.depth: (i + 1) * self.depth]:
+                 latents = attn(ctx_feature, latents) + latents
+                 latents = ff(latents) + latents
+
+         latents = latents[:, :self.num_queries]
+         latents = latents @ self.proj_out
+         return latents
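A quick smoke test makes the IDFormer interface above concrete. The shapes below are assumptions for illustration (a 1280-dim ArcFace-style ID embedding and five multi-scale ViT feature maps of 577 tokens x 1024 channels); in the actual node these inputs are produced by the InsightFace and EVA-CLIP encoders in pulidflux.py.

```python
# Hedged sketch, run alongside encoders_flux.py; shapes are illustrative assumptions.
import torch
from encoders_flux import IDFormer

former = IDFormer(dim=1024, depth=10, num_id_token=5, num_queries=32, output_dim=2048)

id_embed = torch.randn(1, 1280)                              # ArcFace-style ID embedding
vit_feats = [torch.randn(1, 577, 1024) for _ in range(5)]    # five multi-scale ViT feature maps

with torch.no_grad():
    cond = former(id_embed, vit_feats)                       # queries attend ID tokens + ViT features
print(cond.shape)  # torch.Size([1, 32, 2048]) -- 32 query tokens projected to output_dim
```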
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__init__.py ADDED
@@ -0,0 +1,11 @@
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
+ from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_transforms
+ from .factory import list_models, add_model_config, get_model_config, load_checkpoint
+ from .loss import ClipLoss
+ from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg,\
+     convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype
+ from .openai import load_openai_model, list_openai_models
+ from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model,\
+     get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
+ from .tokenizer import SimpleTokenizer, tokenize
+ from .transform import image_transform
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.36 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/constants.cpython-312.pyc ADDED
Binary file (332 Bytes). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/eva_vit_model.cpython-312.pyc ADDED
Binary file (32.1 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/factory.cpython-312.pyc ADDED
Binary file (19.9 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/hf_configs.cpython-312.pyc ADDED
Binary file (803 Bytes). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/hf_model.cpython-312.pyc ADDED
Binary file (12.8 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/loss.cpython-312.pyc ADDED
Binary file (6.13 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/model.cpython-312.pyc ADDED
Binary file (22.1 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/modified_resnet.cpython-312.pyc ADDED
Binary file (12.6 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/openai.cpython-312.pyc ADDED
Binary file (7.69 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/pretrained.cpython-312.pyc ADDED
Binary file (13.2 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/rope.cpython-312.pyc ADDED
Binary file (9.28 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/timm_model.cpython-312.pyc ADDED
Binary file (5.82 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/tokenizer.cpython-312.pyc ADDED
Binary file (12.4 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/transform.cpython-312.pyc ADDED
Binary file (4.39 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/transformer.cpython-312.pyc ADDED
Binary file (40.7 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/__pycache__/utils.cpython-312.pyc ADDED
Binary file (17.6 kB). View file
 
ComfyUI-PuLID-Flux-Enhanced/eva_clip/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
+ size 1356917
ComfyUI-PuLID-Flux-Enhanced/eva_clip/constants.py ADDED
@@ -0,0 +1,2 @@
+ OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+ OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
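These are the standard OpenAI CLIP normalization statistics. A minimal sketch of how such constants are typically applied during preprocessing (plain torchvision; the 336-pixel size is an assumption and this is not the repo's own transform.py):

```python
# Hedged sketch: normalize an image with the CLIP dataset statistics above.
from torchvision import transforms

OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)

preprocess = transforms.Compose([
    transforms.Resize(336, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(336),
    transforms.ToTensor(),                                   # HWC uint8 -> CHW float in [0, 1]
    transforms.Normalize(OPENAI_DATASET_MEAN, OPENAI_DATASET_STD),
])
```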
ComfyUI-PuLID-Flux-Enhanced/eva_clip/eva_vit_model.py ADDED
@@ -0,0 +1,548 @@
1
+ # --------------------------------------------------------
2
+ # Adapted from https://github.com/microsoft/unilm/tree/master/beit
3
+ # --------------------------------------------------------
4
+ import math
5
+ import os
6
+ from functools import partial
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ try:
11
+ from timm.models.layers import drop_path, to_2tuple, trunc_normal_
12
+ except:
13
+ from timm.layers import drop_path, to_2tuple, trunc_normal_
14
+
15
+ from .transformer import PatchDropout
16
+ from .rope import VisionRotaryEmbedding, VisionRotaryEmbeddingFast
17
+
18
+ if os.getenv('ENV_TYPE') == 'deepspeed':
19
+ try:
20
+ from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint
21
+ except:
22
+ from torch.utils.checkpoint import checkpoint
23
+ else:
24
+ from torch.utils.checkpoint import checkpoint
25
+
26
+ try:
27
+ import xformers
28
+ import xformers.ops as xops
29
+ XFORMERS_IS_AVAILBLE = True
30
+ except:
31
+ XFORMERS_IS_AVAILBLE = False
32
+
33
+ class DropPath(nn.Module):
34
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
35
+ """
36
+ def __init__(self, drop_prob=None):
37
+ super(DropPath, self).__init__()
38
+ self.drop_prob = drop_prob
39
+
40
+ def forward(self, x):
41
+ return drop_path(x, self.drop_prob, self.training)
42
+
43
+ def extra_repr(self) -> str:
44
+ return 'p={}'.format(self.drop_prob)
45
+
46
+
47
+ class Mlp(nn.Module):
48
+ def __init__(
49
+ self,
50
+ in_features,
51
+ hidden_features=None,
52
+ out_features=None,
53
+ act_layer=nn.GELU,
54
+ norm_layer=nn.LayerNorm,
55
+ drop=0.,
56
+ subln=False,
57
+
58
+ ):
59
+ super().__init__()
60
+ out_features = out_features or in_features
61
+ hidden_features = hidden_features or in_features
62
+ self.fc1 = nn.Linear(in_features, hidden_features)
63
+ self.act = act_layer()
64
+
65
+ self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity()
66
+
67
+ self.fc2 = nn.Linear(hidden_features, out_features)
68
+ self.drop = nn.Dropout(drop)
69
+
70
+ def forward(self, x):
71
+ x = self.fc1(x)
72
+ x = self.act(x)
73
+ # x = self.drop(x)
74
+ # commit this for the orignal BERT implement
75
+ x = self.ffn_ln(x)
76
+
77
+ x = self.fc2(x)
78
+ x = self.drop(x)
79
+ return x
80
+
81
+ class SwiGLU(nn.Module):
82
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0.,
83
+ norm_layer=nn.LayerNorm, subln=False):
84
+ super().__init__()
85
+ out_features = out_features or in_features
86
+ hidden_features = hidden_features or in_features
87
+
88
+ self.w1 = nn.Linear(in_features, hidden_features)
89
+ self.w2 = nn.Linear(in_features, hidden_features)
90
+
91
+ self.act = act_layer()
92
+ self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity()
93
+ self.w3 = nn.Linear(hidden_features, out_features)
94
+
95
+ self.drop = nn.Dropout(drop)
96
+
97
+ def forward(self, x):
98
+ x1 = self.w1(x)
99
+ x2 = self.w2(x)
100
+ hidden = self.act(x1) * x2
101
+ x = self.ffn_ln(hidden)
102
+ x = self.w3(x)
103
+ x = self.drop(x)
104
+ return x
105
+
106
+ class Attention(nn.Module):
107
+ def __init__(
108
+ self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
109
+ proj_drop=0., window_size=None, attn_head_dim=None, xattn=False, rope=None, subln=False, norm_layer=nn.LayerNorm):
110
+ super().__init__()
111
+ self.num_heads = num_heads
112
+ head_dim = dim // num_heads
113
+ if attn_head_dim is not None:
114
+ head_dim = attn_head_dim
115
+ all_head_dim = head_dim * self.num_heads
116
+ self.scale = qk_scale or head_dim ** -0.5
117
+
118
+ self.subln = subln
119
+ if self.subln:
120
+ self.q_proj = nn.Linear(dim, all_head_dim, bias=False)
121
+ self.k_proj = nn.Linear(dim, all_head_dim, bias=False)
122
+ self.v_proj = nn.Linear(dim, all_head_dim, bias=False)
123
+ else:
124
+ self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
125
+
126
+ if qkv_bias:
127
+ self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
128
+ self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
129
+ else:
130
+ self.q_bias = None
131
+ self.v_bias = None
132
+
133
+ if window_size:
134
+ self.window_size = window_size
135
+ self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
136
+ self.relative_position_bias_table = nn.Parameter(
137
+ torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH
138
+ # cls to token & token 2 cls & cls to cls
139
+
140
+ # get pair-wise relative position index for each token inside the window
141
+ coords_h = torch.arange(window_size[0])
142
+ coords_w = torch.arange(window_size[1])
143
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
144
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
145
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
146
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
147
+ relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
148
+ relative_coords[:, :, 1] += window_size[1] - 1
149
+ relative_coords[:, :, 0] *= 2 * window_size[1] - 1
150
+ relative_position_index = \
151
+ torch.zeros(size=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
152
+ relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
153
+ relative_position_index[0, 0:] = self.num_relative_distance - 3
154
+ relative_position_index[0:, 0] = self.num_relative_distance - 2
155
+ relative_position_index[0, 0] = self.num_relative_distance - 1
156
+
157
+ self.register_buffer("relative_position_index", relative_position_index)
158
+ else:
159
+ self.window_size = None
160
+ self.relative_position_bias_table = None
161
+ self.relative_position_index = None
162
+
163
+ self.attn_drop = nn.Dropout(attn_drop)
164
+ self.inner_attn_ln = norm_layer(all_head_dim) if subln else nn.Identity()
165
+ # self.proj = nn.Linear(all_head_dim, all_head_dim)
166
+ self.proj = nn.Linear(all_head_dim, dim)
167
+ self.proj_drop = nn.Dropout(proj_drop)
168
+ self.xattn = xattn
169
+ self.xattn_drop = attn_drop
170
+
171
+ self.rope = rope
172
+
173
+ def forward(self, x, rel_pos_bias=None, attn_mask=None):
174
+ B, N, C = x.shape
175
+ if self.subln:
176
+ q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias)
177
+ k = F.linear(input=x, weight=self.k_proj.weight, bias=None)
178
+ v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias)
179
+
180
+ q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) # B, num_heads, N, C
181
+ k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
182
+ v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
183
+ else:
184
+
185
+ qkv_bias = None
186
+ if self.q_bias is not None:
187
+ qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
188
+
189
+ qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
190
+ qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) # 3, B, num_heads, N, C
191
+ q, k, v = qkv[0], qkv[1], qkv[2]
192
+
193
+ if self.rope:
194
+ # slightly fast impl
195
+ q_t = q[:, :, 1:, :]
196
+ ro_q_t = self.rope(q_t)
197
+ q = torch.cat((q[:, :, :1, :], ro_q_t), -2).type_as(v)
198
+
199
+ k_t = k[:, :, 1:, :]
200
+ ro_k_t = self.rope(k_t)
201
+ k = torch.cat((k[:, :, :1, :], ro_k_t), -2).type_as(v)
202
+
203
+ if self.xattn:
204
+ q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C
205
+ k = k.permute(0, 2, 1, 3)
206
+ v = v.permute(0, 2, 1, 3)
207
+
208
+ x = xops.memory_efficient_attention(
209
+ q, k, v,
210
+ p=self.xattn_drop,
211
+ scale=self.scale,
212
+ )
213
+ x = x.reshape(B, N, -1)
214
+ x = self.inner_attn_ln(x)
215
+ x = self.proj(x)
216
+ x = self.proj_drop(x)
217
+ else:
218
+ q = q * self.scale
219
+ attn = (q @ k.transpose(-2, -1))
220
+
221
+ if self.relative_position_bias_table is not None:
222
+ relative_position_bias = \
223
+ self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
224
+ self.window_size[0] * self.window_size[1] + 1,
225
+ self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH
226
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
227
+ attn = attn + relative_position_bias.unsqueeze(0).type_as(attn)
228
+
229
+ if rel_pos_bias is not None:
230
+ attn = attn + rel_pos_bias.type_as(attn)
231
+
232
+ if attn_mask is not None:
233
+ attn_mask = attn_mask.bool()
234
+ attn = attn.masked_fill(~attn_mask[:, None, None, :], float("-inf"))
235
+
236
+ attn = attn.softmax(dim=-1)
237
+ attn = self.attn_drop(attn)
238
+
239
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
240
+ x = self.inner_attn_ln(x)
241
+ x = self.proj(x)
242
+ x = self.proj_drop(x)
243
+ return x
244
+
245
+
246
+ class Block(nn.Module):
247
+
248
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
249
+ drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
250
+ window_size=None, attn_head_dim=None, xattn=False, rope=None, postnorm=False,
251
+ subln=False, naiveswiglu=False):
252
+ super().__init__()
253
+ self.norm1 = norm_layer(dim)
254
+ self.attn = Attention(
255
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
256
+ attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim,
257
+ xattn=xattn, rope=rope, subln=subln, norm_layer=norm_layer)
258
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
259
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
260
+ self.norm2 = norm_layer(dim)
261
+ mlp_hidden_dim = int(dim * mlp_ratio)
262
+
263
+ if naiveswiglu:
264
+ self.mlp = SwiGLU(
265
+ in_features=dim,
266
+ hidden_features=mlp_hidden_dim,
267
+ subln=subln,
268
+ norm_layer=norm_layer,
269
+ )
270
+ else:
271
+ self.mlp = Mlp(
272
+ in_features=dim,
273
+ hidden_features=mlp_hidden_dim,
274
+ act_layer=act_layer,
275
+ subln=subln,
276
+ drop=drop
277
+ )
278
+
279
+ if init_values is not None and init_values > 0:
280
+ self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
281
+ self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
282
+ else:
283
+ self.gamma_1, self.gamma_2 = None, None
284
+
285
+ self.postnorm = postnorm
286
+
287
+ def forward(self, x, rel_pos_bias=None, attn_mask=None):
288
+ if self.gamma_1 is None:
289
+ if self.postnorm:
290
+ x = x + self.drop_path(self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)))
291
+ x = x + self.drop_path(self.norm2(self.mlp(x)))
292
+ else:
293
+ x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))
294
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
295
+ else:
296
+ if self.postnorm:
297
+ x = x + self.drop_path(self.gamma_1 * self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)))
298
+ x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x)))
299
+ else:
300
+ x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))
301
+ x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
302
+ return x
303
+
304
+
305
+ class PatchEmbed(nn.Module):
306
+ """ Image to Patch Embedding
307
+ """
308
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
309
+ super().__init__()
310
+ img_size = to_2tuple(img_size)
311
+ patch_size = to_2tuple(patch_size)
312
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
313
+ self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
314
+ self.img_size = img_size
315
+ self.patch_size = patch_size
316
+ self.num_patches = num_patches
317
+
318
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
319
+
320
+ def forward(self, x, **kwargs):
321
+ B, C, H, W = x.shape
322
+ # FIXME look at relaxing size constraints
323
+ assert H == self.img_size[0] and W == self.img_size[1], \
324
+ f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
325
+ x = self.proj(x).flatten(2).transpose(1, 2)
326
+ return x
327
+
328
+
329
+ class RelativePositionBias(nn.Module):
330
+
331
+ def __init__(self, window_size, num_heads):
332
+ super().__init__()
333
+ self.window_size = window_size
334
+ self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
335
+ self.relative_position_bias_table = nn.Parameter(
336
+ torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH
337
+ # cls to token & token 2 cls & cls to cls
338
+
339
+ # get pair-wise relative position index for each token inside the window
340
+ coords_h = torch.arange(window_size[0])
341
+ coords_w = torch.arange(window_size[1])
342
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
343
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
344
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
345
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
346
+ relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
347
+ relative_coords[:, :, 1] += window_size[1] - 1
348
+ relative_coords[:, :, 0] *= 2 * window_size[1] - 1
349
+ relative_position_index = \
350
+ torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
351
+ relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
352
+ relative_position_index[0, 0:] = self.num_relative_distance - 3
353
+ relative_position_index[0:, 0] = self.num_relative_distance - 2
354
+ relative_position_index[0, 0] = self.num_relative_distance - 1
355
+
356
+ self.register_buffer("relative_position_index", relative_position_index)
357
+
358
+ def forward(self):
359
+ relative_position_bias = \
360
+ self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
361
+ self.window_size[0] * self.window_size[1] + 1,
362
+ self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH
363
+ return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
364
+
365
+
366
+ class EVAVisionTransformer(nn.Module):
367
+ """ Vision Transformer with support for patch or hybrid CNN input stage
368
+ """
369
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
370
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
371
+ drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None, patch_dropout=0.,
372
+ use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, rope=False,
373
+ use_mean_pooling=True, init_scale=0.001, grad_checkpointing=False, xattn=False, postnorm=False,
374
+ pt_hw_seq_len=16, intp_freq=False, naiveswiglu=False, subln=False):
375
+ super().__init__()
376
+
377
+ if not XFORMERS_IS_AVAILBLE:
378
+ xattn = False
379
+
380
+ self.image_size = img_size
381
+ self.num_classes = num_classes
382
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
383
+
384
+ self.patch_embed = PatchEmbed(
385
+ img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
386
+ num_patches = self.patch_embed.num_patches
387
+
388
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
389
+ # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
390
+ if use_abs_pos_emb:
391
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
392
+ else:
393
+ self.pos_embed = None
394
+ self.pos_drop = nn.Dropout(p=drop_rate)
395
+
396
+ if use_shared_rel_pos_bias:
397
+ self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads)
398
+ else:
399
+ self.rel_pos_bias = None
400
+
401
+ if rope:
402
+ half_head_dim = embed_dim // num_heads // 2
403
+ hw_seq_len = img_size // patch_size
404
+ self.rope = VisionRotaryEmbeddingFast(
405
+ dim=half_head_dim,
406
+ pt_seq_len=pt_hw_seq_len,
407
+ ft_seq_len=hw_seq_len if intp_freq else None,
408
+ # patch_dropout=patch_dropout
409
+ )
410
+ else:
411
+ self.rope = None
412
+
413
+ self.naiveswiglu = naiveswiglu
414
+
415
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
416
+ self.use_rel_pos_bias = use_rel_pos_bias
417
+ self.blocks = nn.ModuleList([
418
+ Block(
419
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
420
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
421
+ init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None,
422
+ xattn=xattn, rope=self.rope, postnorm=postnorm, subln=subln, naiveswiglu=naiveswiglu)
423
+ for i in range(depth)])
424
+ self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim)
425
+ self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
426
+ self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
427
+
428
+ if self.pos_embed is not None:
429
+ trunc_normal_(self.pos_embed, std=.02)
430
+
431
+ trunc_normal_(self.cls_token, std=.02)
432
+ # trunc_normal_(self.mask_token, std=.02)
433
+
434
+ self.apply(self._init_weights)
435
+ self.fix_init_weight()
436
+
437
+ if isinstance(self.head, nn.Linear):
438
+ trunc_normal_(self.head.weight, std=.02)
439
+ self.head.weight.data.mul_(init_scale)
440
+ self.head.bias.data.mul_(init_scale)
441
+
442
+ # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn
443
+ self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. else nn.Identity()
444
+
445
+ self.grad_checkpointing = grad_checkpointing
446
+
447
+ def fix_init_weight(self):
448
+ def rescale(param, layer_id):
449
+ param.div_(math.sqrt(2.0 * layer_id))
450
+
451
+ for layer_id, layer in enumerate(self.blocks):
452
+ rescale(layer.attn.proj.weight.data, layer_id + 1)
453
+ if self.naiveswiglu:
454
+ rescale(layer.mlp.w3.weight.data, layer_id + 1)
455
+ else:
456
+ rescale(layer.mlp.fc2.weight.data, layer_id + 1)
457
+
458
+ def get_cast_dtype(self) -> torch.dtype:
459
+ return self.blocks[0].mlp.fc2.weight.dtype
460
+
461
+ def _init_weights(self, m):
462
+ if isinstance(m, nn.Linear):
463
+ trunc_normal_(m.weight, std=.02)
464
+ if m.bias is not None:
465
+ nn.init.constant_(m.bias, 0)
466
+ elif isinstance(m, nn.LayerNorm):
467
+ nn.init.constant_(m.bias, 0)
468
+ nn.init.constant_(m.weight, 1.0)
469
+
470
+ def get_num_layers(self):
471
+ return len(self.blocks)
472
+
473
+ def lock(self, unlocked_groups=0, freeze_bn_stats=False):
474
+ assert unlocked_groups == 0, 'partial locking not currently supported for this model'
475
+ for param in self.parameters():
476
+ param.requires_grad = False
477
+
478
+ @torch.jit.ignore
479
+ def set_grad_checkpointing(self, enable=True):
480
+ self.grad_checkpointing = enable
481
+
482
+ @torch.jit.ignore
483
+ def no_weight_decay(self):
484
+ return {'pos_embed', 'cls_token'}
485
+
486
+ def get_classifier(self):
487
+ return self.head
488
+
489
+ def reset_classifier(self, num_classes, global_pool=''):
490
+ self.num_classes = num_classes
491
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
492
+
493
+ def forward_features(self, x, return_all_features=False, return_hidden=False, shuffle=False):
494
+
495
+ x = self.patch_embed(x)
496
+ batch_size, seq_len, _ = x.size()
497
+
498
+ if shuffle:
499
+ idx = torch.randperm(x.shape[1]) + 1
500
+ zero = torch.LongTensor([0, ])
501
+ idx = torch.cat([zero, idx])
502
+ pos_embed = self.pos_embed[:, idx]
503
+
504
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
505
+ x = torch.cat((cls_tokens, x), dim=1)
506
+ if shuffle:
507
+ x = x + pos_embed
508
+ elif self.pos_embed is not None:
509
+ x = x + self.pos_embed
510
+ x = self.pos_drop(x)
511
+
512
+ # a patch_dropout of 0. would mean it is disabled and this function would do nothing but return what was passed in
513
+ if os.getenv('RoPE') == '1':
514
+ if self.training and not isinstance(self.patch_dropout, nn.Identity):
515
+ x, patch_indices_keep = self.patch_dropout(x)
516
+ self.rope.forward = partial(self.rope.forward, patch_indices_keep=patch_indices_keep)
517
+ else:
518
+ self.rope.forward = partial(self.rope.forward, patch_indices_keep=None)
519
+ x = self.patch_dropout(x)
520
+ else:
521
+ x = self.patch_dropout(x)
522
+
523
+ rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
524
+ hidden_states = []
525
+ for idx, blk in enumerate(self.blocks):
526
+ if (0 < idx <= 20) and (idx % 4 == 0) and return_hidden:
527
+ hidden_states.append(x)
528
+ if self.grad_checkpointing:
529
+ x = checkpoint(blk, x, (rel_pos_bias,))
530
+ else:
531
+ x = blk(x, rel_pos_bias=rel_pos_bias)
532
+
533
+ if not return_all_features:
534
+ x = self.norm(x)
535
+ if self.fc_norm is not None:
536
+ return self.fc_norm(x.mean(1)), hidden_states
537
+ else:
538
+ return x[:, 0], hidden_states
539
+ return x
540
+
541
+ def forward(self, x, return_all_features=False, return_hidden=False, shuffle=False):
542
+ if return_all_features:
543
+ return self.forward_features(x, return_all_features, return_hidden, shuffle)
544
+ x, hidden_states = self.forward_features(x, return_all_features, return_hidden, shuffle)
545
+ x = self.head(x)
546
+ if return_hidden:
547
+ return x, hidden_states
548
+ return x
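The return_hidden flag is the notable addition in this copy of the EVA ViT: besides the pooled output, the transformer hands back intermediate hidden states (captured at every 4th block up to block 20), which serve as the multi-scale ViT features consumed by the IDFormer. A rough sketch of calling it, with constructor values loosely modeled on the EVA02-CLIP-L-14-336 config under model_configs/ and to be treated as assumptions:

```python
# Hedged sketch; assumes the eva_clip package is importable so its relative imports resolve.
import torch
from eva_clip.eva_vit_model import EVAVisionTransformer

vit = EVAVisionTransformer(
    img_size=336, patch_size=14, num_classes=0, embed_dim=1024, depth=24,
    num_heads=16, mlp_ratio=2.6667, use_mean_pooling=False, subln=True, naiveswiglu=True,
)
face = torch.randn(1, 3, 336, 336)  # aligned face crop, CLIP-normalized

with torch.no_grad():
    pooled, hidden_states = vit(face, return_all_features=False, return_hidden=True)

print(pooled.shape)        # torch.Size([1, 1024]) -- CLS token (head is Identity with num_classes=0)
print(len(hidden_states))  # 5 captured scales, each of shape (1, 577, 1024)
```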
ComfyUI-PuLID-Flux-Enhanced/eva_clip/factory.py ADDED
@@ -0,0 +1,517 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import pathlib
5
+ import re
6
+ from copy import deepcopy
7
+ from pathlib import Path
8
+ from typing import Optional, Tuple, Union, Dict, Any
9
+ import torch
10
+
11
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
12
+ from .model import CLIP, CustomCLIP, convert_weights_to_lp, convert_to_custom_text_state_dict,\
13
+ get_cast_dtype
14
+ from .openai import load_openai_model
15
+ from .pretrained import is_pretrained_cfg, get_pretrained_cfg, download_pretrained, list_pretrained_tags_by_model
16
+ from .transform import image_transform
17
+ from .tokenizer import HFTokenizer, tokenize
18
+ from .utils import resize_clip_pos_embed, resize_evaclip_pos_embed, resize_visual_pos_embed, resize_eva_pos_embed
19
+
20
+
21
+ _MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"]
22
+ _MODEL_CONFIGS = {} # directory (model_name: config) of model architecture configs
23
+
24
+
25
+ def _natural_key(string_):
26
+ return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())]
27
+
28
+
29
+ def _rescan_model_configs():
30
+ global _MODEL_CONFIGS
31
+
32
+ config_ext = ('.json',)
33
+ config_files = []
34
+ for config_path in _MODEL_CONFIG_PATHS:
35
+ if config_path.is_file() and config_path.suffix in config_ext:
36
+ config_files.append(config_path)
37
+ elif config_path.is_dir():
38
+ for ext in config_ext:
39
+ config_files.extend(config_path.glob(f'*{ext}'))
40
+
41
+ for cf in config_files:
42
+ with open(cf, "r", encoding="utf8") as f:
43
+ model_cfg = json.load(f)
44
+ if all(a in model_cfg for a in ('embed_dim', 'vision_cfg', 'text_cfg')):
45
+ _MODEL_CONFIGS[cf.stem] = model_cfg
46
+
47
+ _MODEL_CONFIGS = dict(sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0])))
48
+
49
+
50
+ _rescan_model_configs() # initial populate of model config registry
51
+
52
+
53
+ def list_models():
54
+ """ enumerate available model architectures based on config files """
55
+ return list(_MODEL_CONFIGS.keys())
56
+
57
+
58
+ def add_model_config(path):
59
+ """ add model config path or file and update registry """
60
+ if not isinstance(path, Path):
61
+ path = Path(path)
62
+ _MODEL_CONFIG_PATHS.append(path)
63
+ _rescan_model_configs()
64
+
65
+
66
+ def get_model_config(model_name):
67
+ if model_name in _MODEL_CONFIGS:
68
+ return deepcopy(_MODEL_CONFIGS[model_name])
69
+ else:
70
+ return None
71
+
72
+
73
+ def get_tokenizer(model_name):
74
+ config = get_model_config(model_name)
75
+ tokenizer = HFTokenizer(config['text_cfg']['hf_tokenizer_name']) if 'hf_tokenizer_name' in config['text_cfg'] else tokenize
76
+ return tokenizer
77
+
78
+
79
+ # loading openai CLIP weights when is_openai=True for training
80
+ def load_state_dict(checkpoint_path: str, map_location: str='cpu', model_key: str='model|module|state_dict', is_openai: bool=False, skip_list: list=[]):
81
+ if is_openai:
82
+ model = torch.jit.load(checkpoint_path, map_location="cpu").eval()
83
+ state_dict = model.state_dict()
84
+ for key in ["input_resolution", "context_length", "vocab_size"]:
85
+ state_dict.pop(key, None)
86
+ else:
87
+ checkpoint = torch.load(checkpoint_path, map_location=map_location)
88
+ for mk in model_key.split('|'):
89
+ if isinstance(checkpoint, dict) and mk in checkpoint:
90
+ state_dict = checkpoint[mk]
91
+ break
92
+ else:
93
+ state_dict = checkpoint
94
+ if next(iter(state_dict.items()))[0].startswith('module'):
95
+ state_dict = {k[7:]: v for k, v in state_dict.items()}
96
+
97
+ for k in skip_list:
98
+ if k in list(state_dict.keys()):
99
+ logging.info(f"Removing key {k} from pretrained checkpoint")
100
+ del state_dict[k]
101
+
102
+ if os.getenv('RoPE') == '1':
103
+ for k in list(state_dict.keys()):
104
+ if 'freqs_cos' in k or 'freqs_sin' in k:
105
+ del state_dict[k]
106
+ return state_dict
107
+
108
+
109
+
110
+ def load_checkpoint(model, checkpoint_path, model_key="model|module|state_dict", strict=True):
111
+ state_dict = load_state_dict(checkpoint_path, model_key=model_key, is_openai=False)
112
+ # detect old format and make compatible with new format
113
+ if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'):
114
+ state_dict = convert_to_custom_text_state_dict(state_dict)
115
+ if 'text.logit_scale' in state_dict and hasattr(model, 'logit_scale'):
116
+ state_dict['logit_scale'] = state_dict['text.logit_scale']
117
+ del state_dict['text.logit_scale']
118
+
119
+ # resize_clip_pos_embed for CLIP and open CLIP
120
+ if 'visual.positional_embedding' in state_dict:
121
+ resize_clip_pos_embed(state_dict, model)
122
+ # specified to eva_vit_model
123
+ elif 'visual.pos_embed' in state_dict:
124
+ resize_evaclip_pos_embed(state_dict, model)
125
+
126
+ # resize_clip_pos_embed(state_dict, model)
127
+ incompatible_keys = model.load_state_dict(state_dict, strict=strict)
128
+ logging.info(f"incompatible_keys.missing_keys: {incompatible_keys.missing_keys}")
129
+ return incompatible_keys
130
+
131
+ def load_clip_visual_state_dict(checkpoint_path: str, map_location: str='cpu', is_openai: bool=False, skip_list:list=[]):
132
+ state_dict = load_state_dict(checkpoint_path, map_location=map_location, is_openai=is_openai, skip_list=skip_list)
133
+
134
+ for k in list(state_dict.keys()):
135
+ if not k.startswith('visual.'):
136
+ del state_dict[k]
137
+ for k in list(state_dict.keys()):
138
+ if k.startswith('visual.'):
139
+ new_k = k[7:]
140
+ state_dict[new_k] = state_dict[k]
141
+ del state_dict[k]
142
+ return state_dict
143
+
144
+ def load_clip_text_state_dict(checkpoint_path: str, map_location: str='cpu', is_openai: bool=False, skip_list:list=[]):
145
+ state_dict = load_state_dict(checkpoint_path, map_location=map_location, is_openai=is_openai, skip_list=skip_list)
146
+
147
+ for k in list(state_dict.keys()):
148
+ if k.startswith('visual.'):
149
+ del state_dict[k]
150
+ return state_dict
151
+
152
+ def get_pretrained_tag(pretrained_model):
153
+ pretrained_model = pretrained_model.lower()
154
+ if "laion" in pretrained_model or "open_clip" in pretrained_model:
155
+ return "open_clip"
156
+ elif "openai" in pretrained_model:
157
+ return "clip"
158
+ elif "eva" in pretrained_model and "clip" in pretrained_model:
159
+ return "eva_clip"
160
+ else:
161
+ return "other"
162
+
163
+ def load_pretrained_checkpoint(
164
+ model,
165
+ visual_checkpoint_path,
166
+ text_checkpoint_path,
167
+ strict=True,
168
+ visual_model=None,
169
+ text_model=None,
170
+ model_key="model|module|state_dict",
171
+ skip_list=[]):
172
+ visual_tag = get_pretrained_tag(visual_model)
173
+ text_tag = get_pretrained_tag(text_model)
174
+
175
+ logging.info(f"num of model state_dict keys: {len(model.state_dict().keys())}")
176
+ visual_incompatible_keys, text_incompatible_keys = None, None
177
+ if visual_checkpoint_path:
178
+ if visual_tag == "eva_clip" or visual_tag == "open_clip":
179
+ visual_state_dict = load_clip_visual_state_dict(visual_checkpoint_path, is_openai=False, skip_list=skip_list)
180
+ elif visual_tag == "clip":
181
+ visual_state_dict = load_clip_visual_state_dict(visual_checkpoint_path, is_openai=True, skip_list=skip_list)
182
+ else:
183
+ visual_state_dict = load_state_dict(visual_checkpoint_path, model_key=model_key, is_openai=False, skip_list=skip_list)
184
+
185
+ # resize_clip_pos_embed for CLIP and open CLIP
186
+ if 'positional_embedding' in visual_state_dict:
187
+ resize_visual_pos_embed(visual_state_dict, model)
188
+ # specified to EVA model
189
+ elif 'pos_embed' in visual_state_dict:
190
+ resize_eva_pos_embed(visual_state_dict, model)
191
+
192
+ visual_incompatible_keys = model.visual.load_state_dict(visual_state_dict, strict=strict)
193
+ logging.info(f"num of loaded visual_state_dict keys: {len(visual_state_dict.keys())}")
194
+ logging.info(f"visual_incompatible_keys.missing_keys: {visual_incompatible_keys.missing_keys}")
195
+
196
+ if text_checkpoint_path:
197
+ if text_tag == "eva_clip" or text_tag == "open_clip":
198
+ text_state_dict = load_clip_text_state_dict(text_checkpoint_path, is_openai=False, skip_list=skip_list)
199
+ elif text_tag == "clip":
200
+ text_state_dict = load_clip_text_state_dict(text_checkpoint_path, is_openai=True, skip_list=skip_list)
201
+ else:
202
+ text_state_dict = load_state_dict(text_checkpoint_path, model_key=model_key, is_openai=False, skip_list=skip_list)
203
+
204
+ text_incompatible_keys = model.text.load_state_dict(text_state_dict, strict=strict)
205
+
206
+ logging.info(f"num of loaded text_state_dict keys: {len(text_state_dict.keys())}")
207
+ logging.info(f"text_incompatible_keys.missing_keys: {text_incompatible_keys.missing_keys}")
208
+
209
+ return visual_incompatible_keys, text_incompatible_keys
210
+
211
+ def create_model(
212
+ model_name: str,
213
+ pretrained: Optional[str] = None,
214
+ precision: str = 'fp32',
215
+ device: Union[str, torch.device] = 'cpu',
216
+ jit: bool = False,
217
+ force_quick_gelu: bool = False,
218
+ force_custom_clip: bool = False,
219
+ force_patch_dropout: Optional[float] = None,
220
+ pretrained_image: str = '',
221
+ pretrained_text: str = '',
222
+ pretrained_hf: bool = True,
223
+ pretrained_visual_model: str = None,
224
+ pretrained_text_model: str = None,
225
+ cache_dir: Optional[str] = None,
226
+ skip_list: list = [],
227
+ ):
228
+ model_name = model_name.replace('/', '-') # for callers using old naming with / in ViT names
229
+ if isinstance(device, str):
230
+ device = torch.device(device)
231
+
232
+ if pretrained and pretrained.lower() == 'openai':
233
+ logging.info(f'Loading pretrained {model_name} from OpenAI.')
234
+ model = load_openai_model(
235
+ model_name,
236
+ precision=precision,
237
+ device=device,
238
+ jit=jit,
239
+ cache_dir=cache_dir,
240
+ )
241
+ else:
242
+ model_cfg = get_model_config(model_name)
243
+ if model_cfg is not None:
244
+ logging.info(f'Loaded {model_name} model config.')
245
+ else:
246
+ logging.error(f'Model config for {model_name} not found; available models {list_models()}.')
247
+ raise RuntimeError(f'Model config for {model_name} not found.')
248
+
249
+ if 'rope' in model_cfg.get('vision_cfg', {}):
250
+ if model_cfg['vision_cfg']['rope']:
251
+ os.environ['RoPE'] = "1"
252
+ else:
253
+ os.environ['RoPE'] = "0"
254
+
255
+ if force_quick_gelu:
256
+ # override for use of QuickGELU on non-OpenAI transformer models
257
+ model_cfg["quick_gelu"] = True
258
+
259
+ if force_patch_dropout is not None:
260
+ # override the default patch dropout value
261
+ model_cfg['vision_cfg']["patch_dropout"] = force_patch_dropout
262
+
263
+ cast_dtype = get_cast_dtype(precision)
264
+ custom_clip = model_cfg.pop('custom_text', False) or force_custom_clip or ('hf_model_name' in model_cfg['text_cfg'])
265
+
266
+
267
+ if custom_clip:
268
+ if 'hf_model_name' in model_cfg.get('text_cfg', {}):
269
+ model_cfg['text_cfg']['hf_model_pretrained'] = pretrained_hf
270
+ model = CustomCLIP(**model_cfg, cast_dtype=cast_dtype)
271
+ else:
272
+ model = CLIP(**model_cfg, cast_dtype=cast_dtype)
273
+
274
+ pretrained_cfg = {}
275
+ if pretrained:
276
+ checkpoint_path = ''
277
+ pretrained_cfg = get_pretrained_cfg(model_name, pretrained)
278
+ if pretrained_cfg:
279
+ checkpoint_path = download_pretrained(pretrained_cfg, cache_dir=cache_dir)
280
+ elif os.path.exists(pretrained):
281
+ checkpoint_path = pretrained
282
+
283
+ if checkpoint_path:
284
+ logging.info(f'Loading pretrained {model_name} weights ({pretrained}).')
285
+ load_checkpoint(model,
286
+ checkpoint_path,
287
+ model_key="model|module|state_dict",
288
+ strict=False
289
+ )
290
+ else:
291
+ error_str = (
292
+ f'Pretrained weights ({pretrained}) not found for model {model_name}.'
293
+ f' Available pretrained tags ({list_pretrained_tags_by_model(model_name)}).')
294
+ logging.warning(error_str)
295
+ raise RuntimeError(error_str)
296
+ else:
297
+ visual_checkpoint_path = ''
298
+ text_checkpoint_path = ''
299
+
300
+ if pretrained_image:
301
+ pretrained_visual_model = pretrained_visual_model.replace('/', '-') # for callers using old naming with / in ViT names
302
+ pretrained_image_cfg = get_pretrained_cfg(pretrained_visual_model, pretrained_image)
303
+ if 'timm_model_name' in model_cfg.get('vision_cfg', {}):
304
+ # pretrained weight loading for timm models set via vision_cfg
305
+ model_cfg['vision_cfg']['timm_model_pretrained'] = True
306
+ elif pretrained_image_cfg:
307
+ visual_checkpoint_path = download_pretrained(pretrained_image_cfg, cache_dir=cache_dir)
308
+ elif os.path.exists(pretrained_image):
309
+ visual_checkpoint_path = pretrained_image
310
+ else:
311
+ logging.warning(f'Pretrained weights ({visual_checkpoint_path}) not found for model {model_name}.visual.')
312
+ raise RuntimeError(f'Pretrained weights ({visual_checkpoint_path}) not found for model {model_name}.visual.')
313
+
314
+ if pretrained_text:
315
+ pretrained_text_model = pretrained_text_model.replace('/', '-') # for callers using old naming with / in ViT names
316
+ pretrained_text_cfg = get_pretrained_cfg(pretrained_text_model, pretrained_text)
317
+ if pretrained_text_cfg:
318
+ text_checkpoint_path = download_pretrained(pretrained_text_cfg, cache_dir=cache_dir)
319
+ elif os.path.exists(pretrained_text):
320
+ text_checkpoint_path = pretrained_text
321
+ else:
322
+ logging.warning(f'Pretrained weights ({text_checkpoint_path}) not found for model {model_name}.text.')
323
+ raise RuntimeError(f'Pretrained weights ({text_checkpoint_path}) not found for model {model_name}.text.')
324
+
325
+ if visual_checkpoint_path:
326
+ logging.info(f'Loading pretrained {model_name}.visual weights ({visual_checkpoint_path}).')
327
+ if text_checkpoint_path:
328
+ logging.info(f'Loading pretrained {model_name}.text weights ({text_checkpoint_path}).')
329
+
330
+ if visual_checkpoint_path or text_checkpoint_path:
331
+ load_pretrained_checkpoint(
332
+ model,
333
+ visual_checkpoint_path,
334
+ text_checkpoint_path,
335
+ strict=False,
336
+ visual_model=pretrained_visual_model,
337
+ text_model=pretrained_text_model,
338
+ model_key="model|module|state_dict",
339
+ skip_list=skip_list
340
+ )
341
+
342
+ if "fp16" in precision or "bf16" in precision:
343
+ logging.info(f'convert precision to {precision}')
344
+ model = model.to(torch.bfloat16) if 'bf16' in precision else model.to(torch.float16)
345
+
346
+ model.to(device=device)
347
+
348
+ # set image / mean metadata from pretrained_cfg if available, or use default
349
+ model.visual.image_mean = pretrained_cfg.get('mean', None) or OPENAI_DATASET_MEAN
350
+ model.visual.image_std = pretrained_cfg.get('std', None) or OPENAI_DATASET_STD
351
+
352
+ if jit:
353
+ model = torch.jit.script(model)
354
+
355
+ return model
356
+
357
+
358
+ def create_model_and_transforms(
359
+ model_name: str,
360
+ pretrained: Optional[str] = None,
361
+ precision: str = 'fp32',
362
+ device: Union[str, torch.device] = 'cpu',
363
+ jit: bool = False,
364
+ force_quick_gelu: bool = False,
365
+ force_custom_clip: bool = False,
366
+ force_patch_dropout: Optional[float] = None,
367
+ pretrained_image: str = '',
368
+ pretrained_text: str = '',
369
+ pretrained_hf: bool = True,
370
+ pretrained_visual_model: str = None,
371
+ pretrained_text_model: str = None,
372
+ image_mean: Optional[Tuple[float, ...]] = None,
373
+ image_std: Optional[Tuple[float, ...]] = None,
374
+ cache_dir: Optional[str] = None,
375
+ skip_list: list = [],
376
+ ):
377
+ model = create_model(
378
+ model_name,
379
+ pretrained,
380
+ precision=precision,
381
+ device=device,
382
+ jit=jit,
383
+ force_quick_gelu=force_quick_gelu,
384
+ force_custom_clip=force_custom_clip,
385
+ force_patch_dropout=force_patch_dropout,
386
+ pretrained_image=pretrained_image,
387
+ pretrained_text=pretrained_text,
388
+ pretrained_hf=pretrained_hf,
389
+ pretrained_visual_model=pretrained_visual_model,
390
+ pretrained_text_model=pretrained_text_model,
391
+ cache_dir=cache_dir,
392
+ skip_list=skip_list,
393
+ )
394
+
395
+ image_mean = image_mean or getattr(model.visual, 'image_mean', None)
396
+ image_std = image_std or getattr(model.visual, 'image_std', None)
397
+ preprocess_train = image_transform(
398
+ model.visual.image_size,
399
+ is_train=True,
400
+ mean=image_mean,
401
+ std=image_std
402
+ )
403
+ preprocess_val = image_transform(
404
+ model.visual.image_size,
405
+ is_train=False,
406
+ mean=image_mean,
407
+ std=image_std
408
+ )
409
+
410
+ return model, preprocess_train, preprocess_val
411
+
412
+
413
+ def create_transforms(
414
+ model_name: str,
415
+ pretrained: Optional[str] = None,
416
+ precision: str = 'fp32',
417
+ device: Union[str, torch.device] = 'cpu',
418
+ jit: bool = False,
419
+ force_quick_gelu: bool = False,
420
+ force_custom_clip: bool = False,
421
+ force_patch_dropout: Optional[float] = None,
422
+ pretrained_image: str = '',
423
+ pretrained_text: str = '',
424
+ pretrained_hf: bool = True,
425
+ pretrained_visual_model: str = None,
426
+ pretrained_text_model: str = None,
427
+ image_mean: Optional[Tuple[float, ...]] = None,
428
+ image_std: Optional[Tuple[float, ...]] = None,
429
+ cache_dir: Optional[str] = None,
430
+ skip_list: list = [],
431
+ ):
432
+ model = create_model(
433
+ model_name,
434
+ pretrained,
435
+ precision=precision,
436
+ device=device,
437
+ jit=jit,
438
+ force_quick_gelu=force_quick_gelu,
439
+ force_custom_clip=force_custom_clip,
440
+ force_patch_dropout=force_patch_dropout,
441
+ pretrained_image=pretrained_image,
442
+ pretrained_text=pretrained_text,
443
+ pretrained_hf=pretrained_hf,
444
+ pretrained_visual_model=pretrained_visual_model,
445
+ pretrained_text_model=pretrained_text_model,
446
+ cache_dir=cache_dir,
447
+ skip_list=skip_list,
448
+ )
449
+
450
+
451
+ image_mean = image_mean or getattr(model.visual, 'image_mean', None)
452
+ image_std = image_std or getattr(model.visual, 'image_std', None)
453
+ preprocess_train = image_transform(
454
+ model.visual.image_size,
455
+ is_train=True,
456
+ mean=image_mean,
457
+ std=image_std
458
+ )
459
+ preprocess_val = image_transform(
460
+ model.visual.image_size,
461
+ is_train=False,
462
+ mean=image_mean,
463
+ std=image_std
464
+ )
465
+ del model
466
+
467
+ return preprocess_train, preprocess_val
468
+
469
+ def create_model_from_pretrained(
470
+ model_name: str,
471
+ pretrained: str,
472
+ precision: str = 'fp32',
473
+ device: Union[str, torch.device] = 'cpu',
474
+ jit: bool = False,
475
+ force_quick_gelu: bool = False,
476
+ force_custom_clip: bool = False,
477
+ force_patch_dropout: Optional[float] = None,
478
+ return_transform: bool = True,
479
+ image_mean: Optional[Tuple[float, ...]] = None,
480
+ image_std: Optional[Tuple[float, ...]] = None,
481
+ cache_dir: Optional[str] = None,
482
+ is_frozen: bool = False,
483
+ ):
484
+ if not is_pretrained_cfg(model_name, pretrained) and not os.path.exists(pretrained):
485
+ raise RuntimeError(
486
+ f'{pretrained} is not a valid pretrained cfg or checkpoint for {model_name}.'
487
+ f' Use open_clip.list_pretrained() to find one.')
488
+
489
+ model = create_model(
490
+ model_name,
491
+ pretrained,
492
+ precision=precision,
493
+ device=device,
494
+ jit=jit,
495
+ force_quick_gelu=force_quick_gelu,
496
+ force_custom_clip=force_custom_clip,
497
+ force_patch_dropout=force_patch_dropout,
498
+ cache_dir=cache_dir,
499
+ )
500
+
501
+ if is_frozen:
502
+ for param in model.parameters():
503
+ param.requires_grad = False
504
+
505
+ if not return_transform:
506
+ return model
507
+
508
+ image_mean = image_mean or getattr(model.visual, 'image_mean', None)
509
+ image_std = image_std or getattr(model.visual, 'image_std', None)
510
+ preprocess = image_transform(
511
+ model.visual.image_size,
512
+ is_train=False,
513
+ mean=image_mean,
514
+ std=image_std
515
+ )
516
+
517
+ return model, preprocess
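
A hedged sketch of the intended call pattern for this factory, assuming the eva_clip package is importable; 'EVA01-CLIP-B-16' is one of the bundled configs, and the model is randomly initialised here because no checkpoint is given (in practice you would pass pretrained=<path to an EVA-CLIP checkpoint>, a placeholder):

import torch
from PIL import Image
from eva_clip.factory import create_model_and_transforms, get_tokenizer

model, preprocess_train, preprocess_val = create_model_and_transforms(
    'EVA01-CLIP-B-16', precision='fp32', device='cpu')
tokenizer = get_tokenizer('EVA01-CLIP-B-16')   # falls back to the built-in BPE tokenize()

image = preprocess_val(Image.new('RGB', (224, 224))).unsqueeze(0)   # dummy PIL image
tokens = tokenizer(['a photo of a cat'])
with torch.no_grad():
    image_features, text_features, logit_scale = model(image, tokens)
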
ComfyUI-PuLID-Flux-Enhanced/eva_clip/hf_configs.py ADDED
@@ -0,0 +1,57 @@
1
+ # HF architecture dict:
2
+ arch_dict = {
3
+ # https://huggingface.co/docs/transformers/model_doc/roberta#roberta
4
+ "roberta": {
5
+ "config_names": {
6
+ "context_length": "max_position_embeddings",
7
+ "vocab_size": "vocab_size",
8
+ "width": "hidden_size",
9
+ "heads": "num_attention_heads",
10
+ "layers": "num_hidden_layers",
11
+ "layer_attr": "layer",
12
+ "token_embeddings_attr": "embeddings"
13
+ },
14
+ "pooler": "mean_pooler",
15
+ },
16
+ # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig
17
+ "xlm-roberta": {
18
+ "config_names": {
19
+ "context_length": "max_position_embeddings",
20
+ "vocab_size": "vocab_size",
21
+ "width": "hidden_size",
22
+ "heads": "num_attention_heads",
23
+ "layers": "num_hidden_layers",
24
+ "layer_attr": "layer",
25
+ "token_embeddings_attr": "embeddings"
26
+ },
27
+ "pooler": "mean_pooler",
28
+ },
29
+ # https://huggingface.co/docs/transformers/model_doc/mt5#mt5
30
+ "mt5": {
31
+ "config_names": {
32
+ # unlimited seqlen
33
+ # https://github.com/google-research/text-to-text-transfer-transformer/issues/273
34
+ # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374
35
+ "context_length": "",
36
+ "vocab_size": "vocab_size",
37
+ "width": "d_model",
38
+ "heads": "num_heads",
39
+ "layers": "num_layers",
40
+ "layer_attr": "block",
41
+ "token_embeddings_attr": "embed_tokens"
42
+ },
43
+ "pooler": "mean_pooler",
44
+ },
45
+ "bert": {
46
+ "config_names": {
47
+ "context_length": "max_position_embeddings",
48
+ "vocab_size": "vocab_size",
49
+ "width": "hidden_size",
50
+ "heads": "num_attention_heads",
51
+ "layers": "num_hidden_layers",
52
+ "layer_attr": "layer",
53
+ "token_embeddings_attr": "embeddings"
54
+ },
55
+ "pooler": "mean_pooler",
56
+ }
57
+ }
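
arch_dict above tells the HF adapter where to find the width, layer list and token embeddings of each supported text architecture. A hedged lookup sketch; the model id is an assumption and fetching its config needs the transformers package plus network access:

from transformers import AutoConfig
from eva_clip.hf_configs import arch_dict

cfg = AutoConfig.from_pretrained('xlm-roberta-base')     # assumed HF model id
names = arch_dict[cfg.model_type]['config_names']
width = getattr(cfg, names['width'])                     # hidden_size, 768 for this model
default_pooler = arch_dict[cfg.model_type]['pooler']     # 'mean_pooler'
print(cfg.model_type, width, default_pooler)
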
ComfyUI-PuLID-Flux-Enhanced/eva_clip/hf_model.py ADDED
@@ -0,0 +1,248 @@
1
+ """ huggingface model adapter
2
+
3
+ Wraps HuggingFace transformers (https://github.com/huggingface/transformers) models for use as a text tower in CLIP model.
4
+ """
5
+
6
+ import re
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from torch.nn import functional as F
11
+ from torch import TensorType
12
+ try:
13
+ import transformers
14
+ from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer, AutoConfig, PretrainedConfig
15
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, \
16
+ BaseModelOutputWithPoolingAndCrossAttentions
17
+ except ImportError as e:
18
+ transformers = None
19
+
20
+
21
+ class BaseModelOutput:
22
+ pass
23
+
24
+
25
+ class PretrainedConfig:
26
+ pass
27
+
28
+ from .hf_configs import arch_dict
29
+
30
+ # utils
31
+ def _camel2snake(s):
32
+ return re.sub(r'(?<!^)(?=[A-Z])', '_', s).lower()
33
+
34
+ # TODO: ?last - for gpt-like models
35
+ _POOLERS = {}
36
+
37
+ def register_pooler(cls):
38
+ """Decorator registering pooler class"""
39
+ _POOLERS[_camel2snake(cls.__name__)] = cls
40
+ return cls
41
+
42
+
43
+ @register_pooler
44
+ class MeanPooler(nn.Module):
45
+ """Mean pooling"""
46
+ def forward(self, x:BaseModelOutput, attention_mask:TensorType):
47
+ masked_output = x.last_hidden_state * attention_mask.unsqueeze(-1)
48
+ return masked_output.sum(dim=1) / attention_mask.sum(-1, keepdim=True)
49
+
50
+ @register_pooler
51
+ class MaxPooler(nn.Module):
52
+ """Max pooling"""
53
+ def forward(self, x:BaseModelOutput, attention_mask:TensorType):
54
+ masked_output = x.last_hidden_state.masked_fill(attention_mask.unsqueeze(-1), -torch.inf)
55
+ return masked_output.max(1).values
56
+
57
+ @register_pooler
58
+ class ClsPooler(nn.Module):
59
+ """CLS token pooling"""
60
+ def __init__(self, use_pooler_output=True):
61
+ super().__init__()
62
+ self.cls_token_position = 0
63
+ self.use_pooler_output = use_pooler_output
64
+
65
+ def forward(self, x:BaseModelOutput, attention_mask:TensorType):
66
+
67
+ if (self.use_pooler_output and
68
+ isinstance(x, (BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndCrossAttentions)) and
69
+ (x.pooler_output is not None)
70
+ ):
71
+ return x.pooler_output
72
+
73
+ return x.last_hidden_state[:, self.cls_token_position, :]
74
+
75
+ class HFTextEncoder(nn.Module):
76
+ """HuggingFace model adapter"""
77
+ def __init__(
78
+ self,
79
+ model_name_or_path: str,
80
+ output_dim: int,
81
+ tokenizer_name: str = None,
82
+ config: PretrainedConfig = None,
83
+ pooler_type: str = None,
84
+ proj: str = None,
85
+ pretrained: bool = True,
86
+ masked_language_modeling: bool = False):
87
+ super().__init__()
88
+
89
+ self.output_dim = output_dim
90
+
91
+ # TODO: find better way to get this information
92
+ uses_transformer_pooler = (pooler_type == "cls_pooler")
93
+
94
+ if transformers is None:
95
+ raise RuntimeError("Please `pip install transformers` to use pre-trained HuggingFace models")
96
+ if config is None:
97
+ self.config = AutoConfig.from_pretrained(model_name_or_path)
98
+ if masked_language_modeling:
99
+ create_func, model_args = (AutoModelForMaskedLM.from_pretrained, model_name_or_path) if pretrained else (
100
+ AutoModelForMaskedLM.from_config, self.config)
101
+ else:
102
+ create_func, model_args = (AutoModel.from_pretrained, model_name_or_path) if pretrained else (
103
+ AutoModel.from_config, self.config)
104
+ # TODO: do all model configs have this attribute? PretrainedConfig does so yes??
105
+ if hasattr(self.config, "is_encoder_decoder") and self.config.is_encoder_decoder:
106
+ self.transformer = create_func(model_args)
107
+ self.transformer = self.transformer.encoder
108
+ else:
109
+ self.transformer = create_func(model_args, add_pooling_layer=uses_transformer_pooler)
110
+ else:
111
+ self.config = config
112
+ if masked_language_modeling:
113
+ self.transformer = AutoModelForMaskedLM.from_config(config)
114
+ else:
115
+ self.transformer = AutoModel.from_config(config)
116
+
117
+ if pooler_type is None: # get default arch pooler
118
+ self.pooler = _POOLERS[(arch_dict[self.config.model_type]["pooler"])]()
119
+ else:
120
+ self.pooler = _POOLERS[pooler_type]()
121
+
122
+ d_model = getattr(self.config, arch_dict[self.config.model_type]["config_names"]["width"])
123
+ if (d_model == output_dim) and (proj is None): # do we always need a proj?
124
+ self.proj = nn.Identity()
125
+ elif proj == 'linear':
126
+ self.proj = nn.Linear(d_model, output_dim, bias=False)
127
+ elif proj == 'mlp':
128
+ hidden_size = (d_model + output_dim) // 2
129
+ self.proj = nn.Sequential(
130
+ nn.Linear(d_model, hidden_size, bias=False),
131
+ nn.GELU(),
132
+ nn.Linear(hidden_size, output_dim, bias=False),
133
+ )
134
+
135
+ # self.itm_proj = nn.Linear(d_model, 2, bias=False)
136
+ # self.mlm_proj = nn.Linear(d_model, self.config.vocab_size), bias=False)
137
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
138
+
139
+ # def forward_itm(self, x:TensorType, image_embeds:TensorType) -> TensorType:
140
+ # image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(x.device)
141
+ # attn_mask = (x != self.config.pad_token_id).long()
142
+ # out = self.transformer(
143
+ # input_ids=x,
144
+ # attention_mask=attn_mask,
145
+ # encoder_hidden_states = image_embeds,
146
+ # encoder_attention_mask = image_atts,
147
+ # )
148
+ # pooled_out = self.pooler(out, attn_mask)
149
+
150
+ # return self.itm_proj(pooled_out)
151
+
152
+ def mask(self, input_ids, vocab_size, device, targets=None, masked_indices=None, probability_matrix=None):
153
+ if masked_indices is None:
154
+ masked_indices = torch.bernoulli(probability_matrix).bool()
155
+
156
+ masked_indices[input_ids == self.tokenizer.pad_token_id] = False
157
+ masked_indices[input_ids == self.tokenizer.cls_token_id] = False
158
+
159
+ if targets is not None:
160
+ targets[~masked_indices] = -100 # We only compute loss on masked tokens
161
+
162
+ # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
163
+ indices_replaced = torch.bernoulli(torch.full(input_ids.shape, 0.8)).bool() & masked_indices
164
+ input_ids[indices_replaced] = self.tokenizer.mask_token_id
165
+
166
+ # 10% of the time, we replace masked input tokens with random word
167
+ indices_random = torch.bernoulli(torch.full(input_ids.shape, 0.5)).bool() & masked_indices & ~indices_replaced
168
+ random_words = torch.randint(vocab_size, input_ids.shape, dtype=torch.long).to(device)
169
+ input_ids[indices_random] = random_words[indices_random]
170
+ # The rest of the time (10% of the time) we keep the masked input tokens unchanged
171
+
172
+ if targets is not None:
173
+ return input_ids, targets
174
+ else:
175
+ return input_ids
176
+
177
+ def forward_mlm(self, input_ids, image_embeds, mlm_probability=0.25):
178
+ labels = input_ids.clone()
179
+ attn_mask = (input_ids != self.config.pad_token_id).long()
180
+ image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(input_ids.device)
181
+ vocab_size = getattr(self.config, arch_dict[self.config.model_type]["config_names"]["vocab_size"])
182
+ probability_matrix = torch.full(labels.shape, mlm_probability)
183
+ input_ids, labels = self.mask(input_ids, vocab_size, input_ids.device, targets=labels,
184
+ probability_matrix = probability_matrix)
185
+ mlm_output = self.transformer(input_ids,
186
+ attention_mask = attn_mask,
187
+ encoder_hidden_states = image_embeds,
188
+ encoder_attention_mask = image_atts,
189
+ return_dict = True,
190
+ labels = labels,
191
+ )
192
+ return mlm_output.loss
193
+ # mlm_output = self.transformer(input_ids,
194
+ # attention_mask = attn_mask,
195
+ # encoder_hidden_states = image_embeds,
196
+ # encoder_attention_mask = image_atts,
197
+ # return_dict = True,
198
+ # ).last_hidden_state
199
+ # logits = self.mlm_proj(mlm_output)
200
+
201
+ # # logits = logits[:, :-1, :].contiguous().view(-1, vocab_size)
202
+ # logits = logits[:, 1:, :].contiguous().view(-1, vocab_size)
203
+ # labels = labels[:, 1:].contiguous().view(-1)
204
+
205
+ # mlm_loss = F.cross_entropy(
206
+ # logits,
207
+ # labels,
208
+ # # label_smoothing=0.1,
209
+ # )
210
+ # return mlm_loss
211
+
212
+
213
+ def forward(self, x:TensorType) -> TensorType:
214
+ attn_mask = (x != self.config.pad_token_id).long()
215
+ out = self.transformer(input_ids=x, attention_mask=attn_mask)
216
+ pooled_out = self.pooler(out, attn_mask)
217
+
218
+ return self.proj(pooled_out)
219
+
220
+ def lock(self, unlocked_layers:int=0, freeze_layer_norm:bool=True):
221
+ if not unlocked_layers: # full freezing
222
+ for n, p in self.transformer.named_parameters():
223
+ p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False
224
+ return
225
+
226
+ encoder = self.transformer.encoder if hasattr(self.transformer, 'encoder') else self.transformer
227
+ layer_list = getattr(encoder, arch_dict[self.config.model_type]["config_names"]["layer_attr"])
228
+ print(f"Unlocking {unlocked_layers}/{len(layer_list) + 1} layers of hf model")
229
+ embeddings = getattr(
230
+ self.transformer, arch_dict[self.config.model_type]["config_names"]["token_embeddings_attr"])
231
+ modules = [embeddings, *layer_list][:-unlocked_layers]
232
+ # freeze layers
233
+ for module in modules:
234
+ for n, p in module.named_parameters():
235
+ p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False
236
+
237
+
238
+ @torch.jit.ignore
239
+ def set_grad_checkpointing(self, enable=True):
240
+ self.transformer.gradient_checkpointing_enable()
241
+
242
+ def get_num_layers(self):
243
+ encoder = self.transformer.encoder if hasattr(self.transformer, 'encoder') else self.transformer
244
+ layer_list = getattr(encoder, arch_dict[self.config.model_type]["config_names"]["layer_attr"])
245
+ return len(layer_list)
246
+
247
+ def init_parameters(self):
248
+ pass
ComfyUI-PuLID-Flux-Enhanced/eva_clip/loss.py ADDED
@@ -0,0 +1,138 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.nn import functional as F
5
+
6
+ try:
7
+ import torch.distributed.nn
8
+ from torch import distributed as dist
9
+ has_distributed = True
10
+ except ImportError:
11
+ has_distributed = False
12
+
13
+ try:
14
+ import horovod.torch as hvd
15
+ except ImportError:
16
+ hvd = None
17
+
18
+ from timm.loss import LabelSmoothingCrossEntropy
19
+
20
+
21
+ def gather_features(
22
+ image_features,
23
+ text_features,
24
+ local_loss=False,
25
+ gather_with_grad=False,
26
+ rank=0,
27
+ world_size=1,
28
+ use_horovod=False
29
+ ):
30
+ assert has_distributed, 'torch.distributed did not import correctly, please use a PyTorch version with support.'
31
+ if use_horovod:
32
+ assert hvd is not None, 'Please install horovod'
33
+ if gather_with_grad:
34
+ all_image_features = hvd.allgather(image_features)
35
+ all_text_features = hvd.allgather(text_features)
36
+ else:
37
+ with torch.no_grad():
38
+ all_image_features = hvd.allgather(image_features)
39
+ all_text_features = hvd.allgather(text_features)
40
+ if not local_loss:
41
+ # ensure grads for local rank when all_* features don't have a gradient
42
+ gathered_image_features = list(all_image_features.chunk(world_size, dim=0))
43
+ gathered_text_features = list(all_text_features.chunk(world_size, dim=0))
44
+ gathered_image_features[rank] = image_features
45
+ gathered_text_features[rank] = text_features
46
+ all_image_features = torch.cat(gathered_image_features, dim=0)
47
+ all_text_features = torch.cat(gathered_text_features, dim=0)
48
+ else:
49
+ # We gather tensors from all gpus
50
+ if gather_with_grad:
51
+ all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features), dim=0)
52
+ all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features), dim=0)
53
+ # all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features, async_op=True), dim=0)
54
+ # all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features, async_op=True), dim=0)
55
+ else:
56
+ gathered_image_features = [torch.zeros_like(image_features) for _ in range(world_size)]
57
+ gathered_text_features = [torch.zeros_like(text_features) for _ in range(world_size)]
58
+ dist.all_gather(gathered_image_features, image_features)
59
+ dist.all_gather(gathered_text_features, text_features)
60
+ if not local_loss:
61
+ # ensure grads for local rank when all_* features don't have a gradient
62
+ gathered_image_features[rank] = image_features
63
+ gathered_text_features[rank] = text_features
64
+ all_image_features = torch.cat(gathered_image_features, dim=0)
65
+ all_text_features = torch.cat(gathered_text_features, dim=0)
66
+
67
+ return all_image_features, all_text_features
68
+
69
+
70
+ class ClipLoss(nn.Module):
71
+
72
+ def __init__(
73
+ self,
74
+ local_loss=False,
75
+ gather_with_grad=False,
76
+ cache_labels=False,
77
+ rank=0,
78
+ world_size=1,
79
+ use_horovod=False,
80
+ smoothing=0.,
81
+ ):
82
+ super().__init__()
83
+ self.local_loss = local_loss
84
+ self.gather_with_grad = gather_with_grad
85
+ self.cache_labels = cache_labels
86
+ self.rank = rank
87
+ self.world_size = world_size
88
+ self.use_horovod = use_horovod
89
+ self.label_smoothing_cross_entropy = LabelSmoothingCrossEntropy(smoothing=smoothing) if smoothing > 0 else None
90
+
91
+ # cache state
92
+ self.prev_num_logits = 0
93
+ self.labels = {}
94
+
95
+ def forward(self, image_features, text_features, logit_scale=1.):
96
+ device = image_features.device
97
+ if self.world_size > 1:
98
+ all_image_features, all_text_features = gather_features(
99
+ image_features, text_features,
100
+ self.local_loss, self.gather_with_grad, self.rank, self.world_size, self.use_horovod)
101
+
102
+ if self.local_loss:
103
+ logits_per_image = logit_scale * image_features @ all_text_features.T
104
+ logits_per_text = logit_scale * text_features @ all_image_features.T
105
+ else:
106
+ logits_per_image = logit_scale * all_image_features @ all_text_features.T
107
+ logits_per_text = logits_per_image.T
108
+ else:
109
+ logits_per_image = logit_scale * image_features @ text_features.T
110
+ logits_per_text = logit_scale * text_features @ image_features.T
111
+ # calculated ground-truth and cache if enabled
112
+ num_logits = logits_per_image.shape[0]
113
+ if self.prev_num_logits != num_logits or device not in self.labels:
114
+ labels = torch.arange(num_logits, device=device, dtype=torch.long)
115
+ if self.world_size > 1 and self.local_loss:
116
+ labels = labels + num_logits * self.rank
117
+ if self.cache_labels:
118
+ self.labels[device] = labels
119
+ self.prev_num_logits = num_logits
120
+ else:
121
+ labels = self.labels[device]
122
+
123
+ if self.label_smoothing_cross_entropy:
124
+ total_loss = (
125
+ self.label_smoothing_cross_entropy(logits_per_image, labels) +
126
+ self.label_smoothing_cross_entropy(logits_per_text, labels)
127
+ ) / 2
128
+ else:
129
+ total_loss = (
130
+ F.cross_entropy(logits_per_image, labels) +
131
+ F.cross_entropy(logits_per_text, labels)
132
+ ) / 2
133
+
134
+ acc = None
135
+ i2t_acc = (logits_per_image.argmax(-1) == labels).sum() / len(logits_per_image)
136
+ t2i_acc = (logits_per_text.argmax(-1) == labels).sum() / len(logits_per_text)
137
+ acc = {"i2t": i2t_acc, "t2i": t2i_acc}
138
+ return total_loss, acc
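
A hedged single-process sketch of ClipLoss (world_size=1, so no feature gathering), with dummy normalised features standing in for model outputs; note the module imports LabelSmoothingCrossEntropy, so timm must be installed:

import torch
import torch.nn.functional as F
from eva_clip.loss import ClipLoss

loss_fn = ClipLoss(cache_labels=True)
image_features = F.normalize(torch.randn(8, 512), dim=-1)
text_features = F.normalize(torch.randn(8, 512), dim=-1)
logit_scale = torch.tensor(1 / 0.07)   # roughly what model.logit_scale.exp() returns at init
total_loss, acc = loss_fn(image_features, text_features, logit_scale)
print(total_loss.item(), acc['i2t'].item(), acc['t2i'].item())
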
ComfyUI-PuLID-Flux-Enhanced/eva_clip/model.py ADDED
@@ -0,0 +1,439 @@
1
+ """ CLIP Model
2
+
3
+ Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
4
+ """
5
+ import os
6
+ from dataclasses import dataclass
7
+ from typing import Optional, Tuple, Union
8
+ from functools import partial
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from torch import nn
14
+
15
+ try:
16
+ from .hf_model import HFTextEncoder
17
+ except ImportError:
18
+ HFTextEncoder = None
19
+ from .modified_resnet import ModifiedResNet
20
+ from .timm_model import TimmModel
21
+ from .eva_vit_model import EVAVisionTransformer
22
+ from .transformer import LayerNorm, QuickGELU, Attention, VisionTransformer, TextTransformer
23
+
24
+ try:
25
+ from apex.normalization import FusedLayerNorm
26
+ except ImportError:
27
+ FusedLayerNorm = LayerNorm
28
+ print("Nvidia APEX normalization not installed, using PyTorch LayerNorm")
29
+
30
+ try:
31
+ import xformers.ops as xops
32
+ except ImportError:
33
+ xops = None
34
+ #print("Please 'pip install xformers'")
35
+
36
+ @dataclass
37
+ class CLIPVisionCfg:
38
+ layers: Union[Tuple[int, int, int, int], int] = 12
39
+ width: int = 768
40
+ head_width: int = 64
41
+ mlp_ratio: float = 4.0
42
+ patch_size: int = 16
43
+ image_size: Union[Tuple[int, int], int] = 224
44
+ ls_init_value: Optional[float] = None # layer scale initial value
45
+ patch_dropout: float = 0. # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
46
+ global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580)
47
+ drop_path_rate: Optional[float] = None # drop path rate
48
+ timm_model_name: str = None # a valid model name overrides layers, width, patch_size
49
+ timm_model_pretrained: bool = False # use (imagenet) pretrained weights for named model
50
+ timm_pool: str = 'avg' # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
51
+ timm_proj: str = 'linear' # linear projection for timm model output ('linear', 'mlp', '')
52
+ timm_proj_bias: bool = False # enable bias final projection
53
+ eva_model_name: str = None # a valid eva model name overrides layers, width, patch_size
54
+ qkv_bias: bool = True
55
+ fusedLN: bool = False
56
+ xattn: bool = False
57
+ postnorm: bool = False
58
+ rope: bool = False
59
+ pt_hw_seq_len: int = 16 # 224/14
60
+ intp_freq: bool = False
61
+ naiveswiglu: bool = False
62
+ subln: bool = False
63
+
64
+
65
+ @dataclass
66
+ class CLIPTextCfg:
67
+ context_length: int = 77
68
+ vocab_size: int = 49408
69
+ width: int = 512
70
+ heads: int = 8
71
+ layers: int = 12
72
+ ls_init_value: Optional[float] = None # layer scale initial value
73
+ hf_model_name: str = None
74
+ hf_tokenizer_name: str = None
75
+ hf_model_pretrained: bool = True
76
+ proj: str = 'mlp'
77
+ pooler_type: str = 'mean_pooler'
78
+ masked_language_modeling: bool = False
79
+ fusedLN: bool = False
80
+ xattn: bool = False
81
+ attn_mask: bool = True
82
+
83
+ def get_cast_dtype(precision: str):
84
+ cast_dtype = None
85
+ if precision == 'bf16':
86
+ cast_dtype = torch.bfloat16
87
+ elif precision == 'fp16':
88
+ cast_dtype = torch.float16
89
+ return cast_dtype
90
+
91
+
92
+ def _build_vision_tower(
93
+ embed_dim: int,
94
+ vision_cfg: CLIPVisionCfg,
95
+ quick_gelu: bool = False,
96
+ cast_dtype: Optional[torch.dtype] = None
97
+ ):
98
+ if isinstance(vision_cfg, dict):
99
+ vision_cfg = CLIPVisionCfg(**vision_cfg)
100
+
101
+ # OpenAI models are pretrained w/ QuickGELU but native nn.GELU is both faster and more
102
+ # memory efficient in recent PyTorch releases (>= 1.10).
103
+ # NOTE: timm models always use native GELU regardless of quick_gelu flag.
104
+ act_layer = QuickGELU if quick_gelu else nn.GELU
105
+
106
+ if vision_cfg.eva_model_name:
107
+ vision_heads = vision_cfg.width // vision_cfg.head_width
108
+ norm_layer = LayerNorm
109
+
110
+ visual = EVAVisionTransformer(
111
+ img_size=vision_cfg.image_size,
112
+ patch_size=vision_cfg.patch_size,
113
+ num_classes=embed_dim,
114
+ use_mean_pooling=vision_cfg.global_average_pool, #False
115
+ init_values=vision_cfg.ls_init_value,
116
+ patch_dropout=vision_cfg.patch_dropout,
117
+ embed_dim=vision_cfg.width,
118
+ depth=vision_cfg.layers,
119
+ num_heads=vision_heads,
120
+ mlp_ratio=vision_cfg.mlp_ratio,
121
+ qkv_bias=vision_cfg.qkv_bias,
122
+ drop_path_rate=vision_cfg.drop_path_rate,
123
+ norm_layer= partial(FusedLayerNorm, eps=1e-6) if vision_cfg.fusedLN else partial(norm_layer, eps=1e-6),
124
+ xattn=vision_cfg.xattn,
125
+ rope=vision_cfg.rope,
126
+ postnorm=vision_cfg.postnorm,
127
+ pt_hw_seq_len= vision_cfg.pt_hw_seq_len, # 224/14
128
+ intp_freq= vision_cfg.intp_freq,
129
+ naiveswiglu= vision_cfg.naiveswiglu,
130
+ subln= vision_cfg.subln
131
+ )
132
+ elif vision_cfg.timm_model_name:
133
+ visual = TimmModel(
134
+ vision_cfg.timm_model_name,
135
+ pretrained=vision_cfg.timm_model_pretrained,
136
+ pool=vision_cfg.timm_pool,
137
+ proj=vision_cfg.timm_proj,
138
+ proj_bias=vision_cfg.timm_proj_bias,
139
+ embed_dim=embed_dim,
140
+ image_size=vision_cfg.image_size
141
+ )
142
+ act_layer = nn.GELU # so that text transformer doesn't use QuickGELU w/ timm models
143
+ elif isinstance(vision_cfg.layers, (tuple, list)):
144
+ vision_heads = vision_cfg.width * 32 // vision_cfg.head_width
145
+ visual = ModifiedResNet(
146
+ layers=vision_cfg.layers,
147
+ output_dim=embed_dim,
148
+ heads=vision_heads,
149
+ image_size=vision_cfg.image_size,
150
+ width=vision_cfg.width
151
+ )
152
+ else:
153
+ vision_heads = vision_cfg.width // vision_cfg.head_width
154
+ norm_layer = LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm
155
+ visual = VisionTransformer(
156
+ image_size=vision_cfg.image_size,
157
+ patch_size=vision_cfg.patch_size,
158
+ width=vision_cfg.width,
159
+ layers=vision_cfg.layers,
160
+ heads=vision_heads,
161
+ mlp_ratio=vision_cfg.mlp_ratio,
162
+ ls_init_value=vision_cfg.ls_init_value,
163
+ patch_dropout=vision_cfg.patch_dropout,
164
+ global_average_pool=vision_cfg.global_average_pool,
165
+ output_dim=embed_dim,
166
+ act_layer=act_layer,
167
+ norm_layer=norm_layer,
168
+ )
169
+
170
+ return visual
171
+
172
+
173
+ def _build_text_tower(
174
+ embed_dim: int,
175
+ text_cfg: CLIPTextCfg,
176
+ quick_gelu: bool = False,
177
+ cast_dtype: Optional[torch.dtype] = None,
178
+ ):
179
+ if isinstance(text_cfg, dict):
180
+ text_cfg = CLIPTextCfg(**text_cfg)
181
+
182
+ if text_cfg.hf_model_name:
183
+ text = HFTextEncoder(
184
+ text_cfg.hf_model_name,
185
+ output_dim=embed_dim,
186
+ tokenizer_name=text_cfg.hf_tokenizer_name,
187
+ proj=text_cfg.proj,
188
+ pooler_type=text_cfg.pooler_type,
189
+ masked_language_modeling=text_cfg.masked_language_modeling
190
+ )
191
+ else:
192
+ act_layer = QuickGELU if quick_gelu else nn.GELU
193
+ norm_layer = LayerNorm
194
+
195
+ text = TextTransformer(
196
+ context_length=text_cfg.context_length,
197
+ vocab_size=text_cfg.vocab_size,
198
+ width=text_cfg.width,
199
+ heads=text_cfg.heads,
200
+ layers=text_cfg.layers,
201
+ ls_init_value=text_cfg.ls_init_value,
202
+ output_dim=embed_dim,
203
+ act_layer=act_layer,
204
+ norm_layer= FusedLayerNorm if text_cfg.fusedLN else norm_layer,
205
+ xattn=text_cfg.xattn,
206
+ attn_mask=text_cfg.attn_mask,
207
+ )
208
+ return text
209
+
210
+ class CLIP(nn.Module):
211
+ def __init__(
212
+ self,
213
+ embed_dim: int,
214
+ vision_cfg: CLIPVisionCfg,
215
+ text_cfg: CLIPTextCfg,
216
+ quick_gelu: bool = False,
217
+ cast_dtype: Optional[torch.dtype] = None,
218
+ ):
219
+ super().__init__()
220
+ self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
221
+
222
+ text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype)
223
+ self.transformer = text.transformer
224
+ self.vocab_size = text.vocab_size
225
+ self.token_embedding = text.token_embedding
226
+ self.positional_embedding = text.positional_embedding
227
+ self.ln_final = text.ln_final
228
+ self.text_projection = text.text_projection
229
+ self.register_buffer('attn_mask', text.attn_mask, persistent=False)
230
+
231
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
232
+
233
+ def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False):
234
+ # lock image tower as per LiT - https://arxiv.org/abs/2111.07991
235
+ self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats)
236
+
237
+ @torch.jit.ignore
238
+ def set_grad_checkpointing(self, enable=True):
239
+ self.visual.set_grad_checkpointing(enable)
240
+ self.transformer.grad_checkpointing = enable
241
+
242
+ @torch.jit.ignore
243
+ def no_weight_decay(self):
244
+ return {'logit_scale'}
245
+
246
+ def encode_image(self, image, normalize: bool = False):
247
+ features = self.visual(image)
248
+ return F.normalize(features, dim=-1) if normalize else features
249
+
250
+ def encode_text(self, text, normalize: bool = False):
251
+ cast_dtype = self.transformer.get_cast_dtype()
252
+
253
+ x = self.token_embedding(text).to(cast_dtype) # [batch_size, n_ctx, d_model]
254
+
255
+ x = x + self.positional_embedding.to(cast_dtype)
256
+ x = x.permute(1, 0, 2) # NLD -> LND
257
+ x = self.transformer(x, attn_mask=self.attn_mask)
258
+ x = x.permute(1, 0, 2) # LND -> NLD
259
+ x = self.ln_final(x) # [batch_size, n_ctx, transformer.width]
260
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
261
+ x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
262
+ return F.normalize(x, dim=-1) if normalize else x
263
+
264
+ def forward(self, image, text):
265
+ image_features = self.encode_image(image, normalize=True)
266
+ text_features = self.encode_text(text, normalize=True)
267
+ return image_features, text_features, self.logit_scale.exp()
268
+
269
+
270
+ class CustomCLIP(nn.Module):
271
+ def __init__(
272
+ self,
273
+ embed_dim: int,
274
+ vision_cfg: CLIPVisionCfg,
275
+ text_cfg: CLIPTextCfg,
276
+ quick_gelu: bool = False,
277
+ cast_dtype: Optional[torch.dtype] = None,
278
+ itm_task: bool = False,
279
+ ):
280
+ super().__init__()
281
+ self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
282
+ self.text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype)
283
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
284
+
285
+ def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False):
286
+ # lock image tower as per LiT - https://arxiv.org/abs/2111.07991
287
+ self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats)
288
+
289
+ def lock_text_tower(self, unlocked_layers:int=0, freeze_layer_norm:bool=True):
290
+ self.text.lock(unlocked_layers, freeze_layer_norm)
291
+
292
+ @torch.jit.ignore
293
+ def set_grad_checkpointing(self, enable=True):
294
+ self.visual.set_grad_checkpointing(enable)
295
+ self.text.set_grad_checkpointing(enable)
296
+
297
+ @torch.jit.ignore
298
+ def no_weight_decay(self):
299
+ return {'logit_scale'}
300
+
301
+ def encode_image(self, image, normalize: bool = False):
302
+ features = self.visual(image)
303
+ return F.normalize(features, dim=-1) if normalize else features
304
+
305
+ def encode_text(self, text, normalize: bool = False):
306
+ features = self.text(text)
307
+ return F.normalize(features, dim=-1) if normalize else features
308
+
309
+ def forward(self, image, text):
310
+ image_features = self.encode_image(image, normalize=True)
311
+ text_features = self.encode_text(text, normalize=True)
312
+ return image_features, text_features, self.logit_scale.exp()
313
+
314
+
315
+ def convert_weights_to_lp(model: nn.Module, dtype=torch.float16):
316
+ """Convert applicable model parameters to low-precision (bf16 or fp16)"""
317
+
318
+ def _convert_weights(l):
319
+
320
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
321
+ l.weight.data = l.weight.data.to(dtype)
322
+ if l.bias is not None:
323
+ l.bias.data = l.bias.data.to(dtype)
324
+
325
+ if isinstance(l, (nn.MultiheadAttention, Attention)):
326
+ for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
327
+ tensor = getattr(l, attr, None)
328
+ if tensor is not None:
329
+ tensor.data = tensor.data.to(dtype)
330
+
331
+ if isinstance(l, nn.Parameter):
332
+ l.data = l.data.to(dtype)
333
+
334
+ for name in ["text_projection", "proj"]:
335
+ if hasattr(l, name) and isinstance(l, nn.Parameter):
336
+ attr = getattr(l, name, None)
337
+ if attr is not None:
338
+ attr.data = attr.data.to(dtype)
339
+
340
+ model.apply(_convert_weights)
341
+
342
+
343
+ convert_weights_to_fp16 = convert_weights_to_lp # backwards compat
344
+
345
+
346
+ # used to maintain checkpoint compatibility
347
+ def convert_to_custom_text_state_dict(state_dict: dict):
348
+ if 'text_projection' in state_dict:
349
+ # old format state_dict, move text tower -> .text
350
+ new_state_dict = {}
351
+ for k, v in state_dict.items():
352
+ if any(k.startswith(p) for p in (
353
+ 'text_projection',
354
+ 'positional_embedding',
355
+ 'token_embedding',
356
+ 'transformer',
357
+ 'ln_final',
358
+ 'logit_scale'
359
+ )):
360
+ k = 'text.' + k
361
+ new_state_dict[k] = v
362
+ return new_state_dict
363
+ return state_dict
364
+
365
+
366
+ def build_model_from_openai_state_dict(
367
+ state_dict: dict,
368
+ quick_gelu=True,
369
+ cast_dtype=torch.float16,
370
+ ):
371
+ vit = "visual.proj" in state_dict
372
+
373
+ if vit:
374
+ vision_width = state_dict["visual.conv1.weight"].shape[0]
375
+ vision_layers = len(
376
+ [k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
377
+ vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
378
+ grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
379
+ image_size = vision_patch_size * grid_size
380
+ else:
381
+ counts: list = [
382
+ len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
383
+ vision_layers = tuple(counts)
384
+ vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
385
+ output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
386
+ vision_patch_size = None
387
+ assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
388
+ image_size = output_width * 32
389
+
390
+ embed_dim = state_dict["text_projection"].shape[1]
391
+ context_length = state_dict["positional_embedding"].shape[0]
392
+ vocab_size = state_dict["token_embedding.weight"].shape[0]
393
+ transformer_width = state_dict["ln_final.weight"].shape[0]
394
+ transformer_heads = transformer_width // 64
395
+ transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks")))
396
+
397
+ vision_cfg = CLIPVisionCfg(
398
+ layers=vision_layers,
399
+ width=vision_width,
400
+ patch_size=vision_patch_size,
401
+ image_size=image_size,
402
+ )
403
+ text_cfg = CLIPTextCfg(
404
+ context_length=context_length,
405
+ vocab_size=vocab_size,
406
+ width=transformer_width,
407
+ heads=transformer_heads,
408
+ layers=transformer_layers
409
+ )
410
+ model = CLIP(
411
+ embed_dim,
412
+ vision_cfg=vision_cfg,
413
+ text_cfg=text_cfg,
414
+ quick_gelu=quick_gelu, # OpenAI models were trained with QuickGELU
415
+ cast_dtype=cast_dtype,
416
+ )
417
+
418
+ for key in ["input_resolution", "context_length", "vocab_size"]:
419
+ state_dict.pop(key, None)
420
+
421
+ convert_weights_to_fp16(model) # OpenAI state dicts are partially converted to float16
422
+ model.load_state_dict(state_dict)
423
+ return model.eval()
424
+
425
+
426
+ def trace_model(model, batch_size=256, device=torch.device('cpu')):
427
+ model.eval()
428
+ image_size = model.visual.image_size
429
+ example_images = torch.ones((batch_size, 3, image_size, image_size), device=device)
430
+ example_text = torch.zeros((batch_size, model.context_length), dtype=torch.int, device=device)
431
+ model = torch.jit.trace_module(
432
+ model,
433
+ inputs=dict(
434
+ forward=(example_images, example_text),
435
+ encode_text=(example_text,),
436
+ encode_image=(example_images,)
437
+ ))
438
+ model.visual.image_size = image_size
439
+ return model
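
A hedged sketch tying the config registry and the CLIP class together: fetch one of the bundled JSON configs and build the two-tower model directly (randomly initialised, since no state dict is loaded); imports assume the eva_clip package layout shown in this commit:

import torch
from eva_clip.factory import get_model_config
from eva_clip.model import CLIP, get_cast_dtype
from eva_clip.tokenizer import tokenize

cfg = get_model_config('EVA01-CLIP-B-16')                 # embed_dim + vision_cfg + text_cfg
model = CLIP(**cfg, cast_dtype=get_cast_dtype('fp32'))    # 'fp32' -> cast_dtype None
images = torch.randn(2, 3, 224, 224)
texts = tokenize(['a diagram', 'a dog'])
with torch.no_grad():
    image_features, text_features, logit_scale = model(images, texts)
    probs = (logit_scale * image_features @ text_features.T).softmax(dim=-1)
print(probs.shape)   # [2, 2] image-to-text similarities
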
ComfyUI-PuLID-Flux-Enhanced/eva_clip/model_configs/EVA01-CLIP-B-16.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 12,
6
+ "width": 768,
7
+ "patch_size": 16,
8
+ "eva_model_name": "eva-clip-b-16",
9
+ "ls_init_value": 0.1,
10
+ "drop_path_rate": 0.0
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 512,
16
+ "heads": 8,
17
+ "layers": 12
18
+ }
19
+ }
ComfyUI-PuLID-Flux-Enhanced/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 40,
6
+ "width": 1408,
7
+ "head_width": 88,
8
+ "mlp_ratio": 4.3637,
9
+ "patch_size": 14,
10
+ "eva_model_name": "eva-clip-g-14-x",
11
+ "drop_path_rate": 0,
12
+ "xattn": true,
13
+ "fusedLN": true
14
+ },
15
+ "text_cfg": {
16
+ "context_length": 77,
17
+ "vocab_size": 49408,
18
+ "width": 1024,
19
+ "heads": 16,
20
+ "layers": 24,
21
+ "xattn": false,
22
+ "fusedLN": true
23
+ }
24
+ }
ComfyUI-PuLID-Flux-Enhanced/eva_clip/model_configs/EVA01-CLIP-g-14.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 40,
6
+ "width": 1408,
7
+ "head_width": 88,
8
+ "mlp_ratio": 4.3637,
9
+ "patch_size": 14,
10
+ "eva_model_name": "eva-clip-g-14-x",
11
+ "drop_path_rate": 0.4,
12
+ "xattn": true,
13
+ "fusedLN": true
14
+ },
15
+ "text_cfg": {
16
+ "context_length": 77,
17
+ "vocab_size": 49408,
18
+ "width": 768,
19
+ "heads": 12,
20
+ "layers": 12,
21
+ "xattn": false,
22
+ "fusedLN": true
23
+ }
24
+ }
ComfyUI-PuLID-Flux-Enhanced/eva_clip/model_configs/EVA02-CLIP-B-16.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 12,
6
+ "width": 768,
7
+ "head_width": 64,
8
+ "patch_size": 16,
9
+ "mlp_ratio": 2.6667,
10
+ "eva_model_name": "eva-clip-b-16-X",
11
+ "drop_path_rate": 0.0,
12
+ "xattn": true,
13
+ "fusedLN": true,
14
+ "rope": true,
15
+ "pt_hw_seq_len": 16,
16
+ "intp_freq": true,
17
+ "naiveswiglu": true,
18
+ "subln": true
19
+ },
20
+ "text_cfg": {
21
+ "context_length": 77,
22
+ "vocab_size": 49408,
23
+ "width": 512,
24
+ "heads": 8,
25
+ "layers": 12,
26
+ "xattn": true,
27
+ "fusedLN": true
28
+ }
29
+ }
ComfyUI-PuLID-Flux-Enhanced/eva_clip/model_configs/EVA02-CLIP-L-14-336.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 336,
5
+ "layers": 24,
6
+ "width": 1024,
7
+ "drop_path_rate": 0,
8
+ "head_width": 64,
9
+ "mlp_ratio": 2.6667,
10
+ "patch_size": 14,
11
+ "eva_model_name": "eva-clip-l-14-336",
12
+ "xattn": true,
13
+ "fusedLN": true,
14
+ "rope": true,
15
+ "pt_hw_seq_len": 16,
16
+ "intp_freq": true,
17
+ "naiveswiglu": true,
18
+ "subln": true
19
+ },
20
+ "text_cfg": {
21
+ "context_length": 77,
22
+ "vocab_size": 49408,
23
+ "width": 768,
24
+ "heads": 12,
25
+ "layers": 12,
26
+ "xattn": false,
27
+ "fusedLN": true
28
+ }
29
+ }
ComfyUI-PuLID-Flux-Enhanced/eva_clip/model_configs/EVA02-CLIP-L-14.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 24,
6
+ "width": 1024,
7
+ "drop_path_rate": 0,
8
+ "head_width": 64,
9
+ "mlp_ratio": 2.6667,
10
+ "patch_size": 14,
11
+ "eva_model_name": "eva-clip-l-14",
12
+ "xattn": true,
13
+ "fusedLN": true,
14
+ "rope": true,
15
+ "pt_hw_seq_len": 16,
16
+ "intp_freq": true,
17
+ "naiveswiglu": true,
18
+ "subln": true
19
+ },
20
+ "text_cfg": {
21
+ "context_length": 77,
22
+ "vocab_size": 49408,
23
+ "width": 768,
24
+ "heads": 12,
25
+ "layers": 12,
26
+ "xattn": false,
27
+ "fusedLN": true
28
+ }
29
+ }
ComfyUI-PuLID-Flux-Enhanced/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 64,
6
+ "width": 1792,
7
+ "head_width": 112,
8
+ "mlp_ratio": 8.571428571428571,
9
+ "patch_size": 14,
10
+ "eva_model_name": "eva-clip-4b-14-x",
11
+ "drop_path_rate": 0,
12
+ "xattn": true,
13
+ "postnorm": true,
14
+ "fusedLN": true
15
+ },
16
+ "text_cfg": {
17
+ "context_length": 77,
18
+ "vocab_size": 49408,
19
+ "width": 1280,
20
+ "heads": 20,
21
+ "layers": 32,
22
+ "xattn": false,
23
+ "fusedLN": true
24
+ }
25
+ }
ComfyUI-PuLID-Flux-Enhanced/eva_clip/model_configs/EVA02-CLIP-bigE-14.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 64,
6
+ "width": 1792,
7
+ "head_width": 112,
8
+ "mlp_ratio": 8.571428571428571,
9
+ "patch_size": 14,
10
+ "eva_model_name": "eva-clip-4b-14-x",
11
+ "drop_path_rate": 0,
12
+ "xattn": true,
13
+ "postnorm": true,
14
+ "fusedLN": true
15
+ },
16
+ "text_cfg": {
17
+ "context_length": 77,
18
+ "vocab_size": 49408,
19
+ "width": 1024,
20
+ "heads": 16,
21
+ "layers": 24,
22
+ "xattn": false,
23
+ "fusedLN": true
24
+ }
25
+ }
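For the bigE configs, head_width and mlp_ratio determine the per-block shapes. Assuming the usual open_clip/EVA convention (heads = width // head_width, MLP hidden size = width * mlp_ratio), which the JSON itself does not spell out:

width, head_width, mlp_ratio = 1792, 112, 8.571428571428571
heads = width // head_width            # 16 attention heads per block
mlp_hidden = round(width * mlp_ratio)  # 15360 (mlp_ratio is 60/7)
print(heads, mlp_hidden)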
ComfyUI-PuLID-Flux-Enhanced/eva_clip/modified_resnet.py ADDED
@@ -0,0 +1,181 @@
1
+ from collections import OrderedDict
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+ from .utils import freeze_batch_norm_2d
8
+
9
+
10
+ class Bottleneck(nn.Module):
11
+ expansion = 4
12
+
13
+ def __init__(self, inplanes, planes, stride=1):
14
+ super().__init__()
15
+
16
+ # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
17
+ self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
18
+ self.bn1 = nn.BatchNorm2d(planes)
19
+ self.act1 = nn.ReLU(inplace=True)
20
+
21
+ self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
22
+ self.bn2 = nn.BatchNorm2d(planes)
23
+ self.act2 = nn.ReLU(inplace=True)
24
+
25
+ self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
26
+
27
+ self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
28
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
29
+ self.act3 = nn.ReLU(inplace=True)
30
+
31
+ self.downsample = None
32
+ self.stride = stride
33
+
34
+ if stride > 1 or inplanes != planes * Bottleneck.expansion:
35
+ # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
36
+ self.downsample = nn.Sequential(OrderedDict([
37
+ ("-1", nn.AvgPool2d(stride)),
38
+ ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
39
+ ("1", nn.BatchNorm2d(planes * self.expansion))
40
+ ]))
41
+
42
+ def forward(self, x: torch.Tensor):
43
+ identity = x
44
+
45
+ out = self.act1(self.bn1(self.conv1(x)))
46
+ out = self.act2(self.bn2(self.conv2(out)))
47
+ out = self.avgpool(out)
48
+ out = self.bn3(self.conv3(out))
49
+
50
+ if self.downsample is not None:
51
+ identity = self.downsample(x)
52
+
53
+ out += identity
54
+ out = self.act3(out)
55
+ return out
56
+
57
+
58
+ class AttentionPool2d(nn.Module):
59
+ def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
60
+ super().__init__()
61
+ self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
62
+ self.k_proj = nn.Linear(embed_dim, embed_dim)
63
+ self.q_proj = nn.Linear(embed_dim, embed_dim)
64
+ self.v_proj = nn.Linear(embed_dim, embed_dim)
65
+ self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
66
+ self.num_heads = num_heads
67
+
68
+ def forward(self, x):
69
+ x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC
70
+ x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
71
+ x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
72
+ x, _ = F.multi_head_attention_forward(
73
+ query=x, key=x, value=x,
74
+ embed_dim_to_check=x.shape[-1],
75
+ num_heads=self.num_heads,
76
+ q_proj_weight=self.q_proj.weight,
77
+ k_proj_weight=self.k_proj.weight,
78
+ v_proj_weight=self.v_proj.weight,
79
+ in_proj_weight=None,
80
+ in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
81
+ bias_k=None,
82
+ bias_v=None,
83
+ add_zero_attn=False,
84
+ dropout_p=0.,
85
+ out_proj_weight=self.c_proj.weight,
86
+ out_proj_bias=self.c_proj.bias,
87
+ use_separate_proj_weight=True,
88
+ training=self.training,
89
+ need_weights=False
90
+ )
91
+
92
+ return x[0]
93
+
94
+
95
+ class ModifiedResNet(nn.Module):
96
+ """
97
+ A ResNet class that is similar to torchvision's but contains the following changes:
98
+ - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
99
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
100
+ - The final pooling layer is a QKV attention instead of an average pool
101
+ """
102
+
103
+ def __init__(self, layers, output_dim, heads, image_size=224, width=64):
104
+ super().__init__()
105
+ self.output_dim = output_dim
106
+ self.image_size = image_size
107
+
108
+ # the 3-layer stem
109
+ self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
110
+ self.bn1 = nn.BatchNorm2d(width // 2)
111
+ self.act1 = nn.ReLU(inplace=True)
112
+ self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
113
+ self.bn2 = nn.BatchNorm2d(width // 2)
114
+ self.act2 = nn.ReLU(inplace=True)
115
+ self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
116
+ self.bn3 = nn.BatchNorm2d(width)
117
+ self.act3 = nn.ReLU(inplace=True)
118
+ self.avgpool = nn.AvgPool2d(2)
119
+
120
+ # residual layers
121
+ self._inplanes = width # this is a *mutable* variable used during construction
122
+ self.layer1 = self._make_layer(width, layers[0])
123
+ self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
124
+ self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
125
+ self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
126
+
127
+ embed_dim = width * 32 # the ResNet feature dimension
128
+ self.attnpool = AttentionPool2d(image_size // 32, embed_dim, heads, output_dim)
129
+
130
+ self.init_parameters()
131
+
132
+ def _make_layer(self, planes, blocks, stride=1):
133
+ layers = [Bottleneck(self._inplanes, planes, stride)]
134
+
135
+ self._inplanes = planes * Bottleneck.expansion
136
+ for _ in range(1, blocks):
137
+ layers.append(Bottleneck(self._inplanes, planes))
138
+
139
+ return nn.Sequential(*layers)
140
+
141
+ def init_parameters(self):
142
+ if self.attnpool is not None:
143
+ std = self.attnpool.c_proj.in_features ** -0.5
144
+ nn.init.normal_(self.attnpool.q_proj.weight, std=std)
145
+ nn.init.normal_(self.attnpool.k_proj.weight, std=std)
146
+ nn.init.normal_(self.attnpool.v_proj.weight, std=std)
147
+ nn.init.normal_(self.attnpool.c_proj.weight, std=std)
148
+
149
+ for resnet_block in [self.layer1, self.layer2, self.layer3, self.layer4]:
150
+ for name, param in resnet_block.named_parameters():
151
+ if name.endswith("bn3.weight"):
152
+ nn.init.zeros_(param)
153
+
154
+ def lock(self, unlocked_groups=0, freeze_bn_stats=False):
155
+ assert unlocked_groups == 0, 'partial locking not currently supported for this model'
156
+ for param in self.parameters():
157
+ param.requires_grad = False
158
+ if freeze_bn_stats:
159
+ freeze_batch_norm_2d(self)
160
+
161
+ @torch.jit.ignore
162
+ def set_grad_checkpointing(self, enable=True):
163
+ # FIXME support for non-transformer
164
+ pass
165
+
166
+ def stem(self, x):
167
+ x = self.act1(self.bn1(self.conv1(x)))
168
+ x = self.act2(self.bn2(self.conv2(x)))
169
+ x = self.act3(self.bn3(self.conv3(x)))
170
+ x = self.avgpool(x)
171
+ return x
172
+
173
+ def forward(self, x):
174
+ x = self.stem(x)
175
+ x = self.layer1(x)
176
+ x = self.layer2(x)
177
+ x = self.layer3(x)
178
+ x = self.layer4(x)
179
+ x = self.attnpool(x)
180
+
181
+ return x
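A minimal sketch of driving this class directly (not one of the uploaded files; assumes ComfyUI-PuLID-Flux-Enhanced is on sys.path so eva_clip imports as a package). The layer counts and head count mirror CLIP's RN50; output_dim is an arbitrary choice here:

import torch
from eva_clip.modified_resnet import ModifiedResNet

model = ModifiedResNet(layers=[3, 4, 6, 3], output_dim=1024, heads=32, image_size=224, width=64)
model.eval()
with torch.no_grad():
    feats = model(torch.randn(2, 3, 224, 224))  # stem downsamples /4, four stages reach /32, then attention pooling
print(feats.shape)  # torch.Size([2, 1024])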
ComfyUI-PuLID-Flux-Enhanced/eva_clip/openai.py ADDED
@@ -0,0 +1,144 @@
1
+ """ OpenAI pretrained model functions
2
+
3
+ Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
4
+ """
5
+
6
+ import os
7
+ import warnings
8
+ from typing import List, Optional, Union
9
+
10
+ import torch
11
+
12
+ from .model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype
13
+ from .pretrained import get_pretrained_url, list_pretrained_models_by_tag, download_pretrained_from_url
14
+
15
+ __all__ = ["list_openai_models", "load_openai_model"]
16
+
17
+
18
+ def list_openai_models() -> List[str]:
19
+ """Returns the names of available CLIP models"""
20
+ return list_pretrained_models_by_tag('openai')
21
+
22
+
23
+ def load_openai_model(
24
+ name: str,
25
+ precision: Optional[str] = None,
26
+ device: Optional[Union[str, torch.device]] = None,
27
+ jit: bool = True,
28
+ cache_dir: Optional[str] = None,
29
+ ):
30
+ """Load a CLIP model
31
+
32
+ Parameters
33
+ ----------
34
+ name : str
35
+ A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
36
+ precision: str
37
+ Model precision, if None defaults to 'fp32' if device == 'cpu' else 'fp16'.
38
+ device : Union[str, torch.device]
39
+ The device to put the loaded model
40
+ jit : bool
41
+ Whether to load the optimized JIT model (default) or more hackable non-JIT model.
42
+ cache_dir : Optional[str]
43
+ The directory to cache the downloaded model weights
44
+
45
+ Returns
46
+ -------
47
+ model : torch.nn.Module
48
+ The CLIP model
49
+ preprocess : Callable[[PIL.Image], torch.Tensor]
50
+ A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
51
+ """
52
+ if device is None:
53
+ device = "cuda" if torch.cuda.is_available() else "cpu"
54
+ if precision is None:
55
+ precision = 'fp32' if device == 'cpu' else 'fp16'
56
+
57
+ if get_pretrained_url(name, 'openai'):
58
+ model_path = download_pretrained_from_url(get_pretrained_url(name, 'openai'), cache_dir=cache_dir)
59
+ elif os.path.isfile(name):
60
+ model_path = name
61
+ else:
62
+ raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}")
63
+
64
+ try:
65
+ # loading JIT archive
66
+ model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
67
+ state_dict = None
68
+ except RuntimeError:
69
+ # loading saved state dict
70
+ if jit:
71
+ warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
72
+ jit = False
73
+ state_dict = torch.load(model_path, map_location="cpu")
74
+
75
+ if not jit:
76
+ # Build a non-jit model from the OpenAI jitted model state dict
77
+ cast_dtype = get_cast_dtype(precision)
78
+ try:
79
+ model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype)
80
+ except KeyError:
81
+ sd = {k[7:]: v for k, v in state_dict["state_dict"].items()}
82
+ model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype)
83
+
84
+ # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use
85
+ model = model.to(device)
86
+ if precision.startswith('amp') or precision == 'fp32':
87
+ model.float()
88
+ elif precision == 'bf16':
89
+ convert_weights_to_lp(model, dtype=torch.bfloat16)
90
+
91
+ return model
92
+
93
+ # patch the device names
94
+ device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
95
+ device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]
96
+
97
+ def patch_device(module):
98
+ try:
99
+ graphs = [module.graph] if hasattr(module, "graph") else []
100
+ except RuntimeError:
101
+ graphs = []
102
+
103
+ if hasattr(module, "forward1"):
104
+ graphs.append(module.forward1.graph)
105
+
106
+ for graph in graphs:
107
+ for node in graph.findAllNodes("prim::Constant"):
108
+ if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
109
+ node.copyAttributes(device_node)
110
+
111
+ model.apply(patch_device)
112
+ patch_device(model.encode_image)
113
+ patch_device(model.encode_text)
114
+
115
+ # patch dtype to float32 (typically for CPU)
116
+ if precision == 'fp32':
117
+ float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
118
+ float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
119
+ float_node = float_input.node()
120
+
121
+ def patch_float(module):
122
+ try:
123
+ graphs = [module.graph] if hasattr(module, "graph") else []
124
+ except RuntimeError:
125
+ graphs = []
126
+
127
+ if hasattr(module, "forward1"):
128
+ graphs.append(module.forward1.graph)
129
+
130
+ for graph in graphs:
131
+ for node in graph.findAllNodes("aten::to"):
132
+ inputs = list(node.inputs())
133
+ for i in [1, 2]: # dtype can be the second or third argument to aten::to()
134
+ if inputs[i].node()["value"] == 5:
135
+ inputs[i].node().copyAttributes(float_node)
136
+
137
+ model.apply(patch_float)
138
+ patch_float(model.encode_image)
139
+ patch_float(model.encode_text)
140
+ model.float()
141
+
142
+ # ensure image_size attr available at consistent location for both jit and non-jit
143
+ model.visual.image_size = model.input_resolution.item()
144
+ return model
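A usage sketch for the loader above (assumes eva_clip is importable and network access is available; the OpenAI ViT-B/32 checkpoint, a few hundred MB, is cached under ~/.cache/clip on first use). The model name comes from the pretrained registry in eva_clip/pretrained.py:

import torch
from eva_clip.openai import list_openai_models, load_openai_model

print(list_openai_models())  # architectures that carry an 'openai' pretrained tag
model = load_openai_model("OpenaiCLIP-B-32", device="cpu", jit=False)  # fp32 on CPU by default
with torch.no_grad():
    image_features = model.encode_image(torch.randn(1, 3, 224, 224))
print(image_features.shape)  # torch.Size([1, 512])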
ComfyUI-PuLID-Flux-Enhanced/eva_clip/pretrained.py ADDED
@@ -0,0 +1,332 @@
1
+ import hashlib
2
+ import os
3
+ import urllib
4
+ import warnings
5
+ from functools import partial
6
+ from typing import Dict, Union
7
+
8
+ from tqdm import tqdm
9
+
10
+ try:
11
+ from huggingface_hub import hf_hub_download
12
+ _has_hf_hub = True
13
+ except ImportError:
14
+ hf_hub_download = None
15
+ _has_hf_hub = False
16
+
17
+
18
+ def _pcfg(url='', hf_hub='', filename='', mean=None, std=None):
19
+ return dict(
20
+ url=url,
21
+ hf_hub=hf_hub,
22
+ mean=mean,
23
+ std=std,
24
+ )
25
+
26
+ _VITB32 = dict(
27
+ openai=_pcfg(
28
+ "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt"),
29
+ laion400m_e31=_pcfg(
30
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e31-d867053b.pt"),
31
+ laion400m_e32=_pcfg(
32
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e32-46683a32.pt"),
33
+ laion2b_e16=_pcfg(
34
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-laion2b_e16-af8dbd0c.pth"),
35
+ laion2b_s34b_b79k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-laion2B-s34B-b79K/')
36
+ )
37
+
38
+ _VITB32_quickgelu = dict(
39
+ openai=_pcfg(
40
+ "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt"),
41
+ laion400m_e31=_pcfg(
42
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e31-d867053b.pt"),
43
+ laion400m_e32=_pcfg(
44
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e32-46683a32.pt"),
45
+ )
46
+
47
+ _VITB16 = dict(
48
+ openai=_pcfg(
49
+ "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt"),
50
+ laion400m_e31=_pcfg(
51
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16-laion400m_e31-00efa78f.pt"),
52
+ laion400m_e32=_pcfg(
53
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16-laion400m_e32-55e67d44.pt"),
54
+ laion2b_s34b_b88k=_pcfg(hf_hub='laion/CLIP-ViT-B-16-laion2B-s34B-b88K/'),
55
+ )
56
+
57
+ _EVAB16 = dict(
58
+ eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_B_psz14to16.pt'),
59
+ eva02=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_B_psz14to16.pt'),
60
+ eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_B_psz16_s8B.pt'),
61
+ eva02_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_B_psz16_s8B.pt'),
62
+ )
63
+
64
+ _VITB16_PLUS_240 = dict(
65
+ laion400m_e31=_pcfg(
66
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16_plus_240-laion400m_e31-8fb26589.pt"),
67
+ laion400m_e32=_pcfg(
68
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16_plus_240-laion400m_e32-699c4b84.pt"),
69
+ )
70
+
71
+ _VITL14 = dict(
72
+ openai=_pcfg(
73
+ "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt"),
74
+ laion400m_e31=_pcfg(
75
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_l_14-laion400m_e31-69988bb6.pt"),
76
+ laion400m_e32=_pcfg(
77
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_l_14-laion400m_e32-3d133497.pt"),
78
+ laion2b_s32b_b82k=_pcfg(
79
+ hf_hub='laion/CLIP-ViT-L-14-laion2B-s32B-b82K/',
80
+ mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
81
+ )
82
+
83
+ _EVAL14 = dict(
84
+ eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_L_psz14.pt'),
85
+ eva02=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_L_psz14.pt'),
86
+ eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_s4B.pt'),
87
+ eva02_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_s4B.pt'),
88
+ )
89
+
90
+ _VITL14_336 = dict(
91
+ openai=_pcfg(
92
+ "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt"),
93
+ )
94
+
95
+ _EVAL14_336 = dict(
96
+ eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_336_psz14_s6B.pt'),
97
+ eva02_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_336_psz14_s6B.pt'),
98
+ eva_clip_224to336=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_224to336.pt'),
99
+ eva02_clip_224to336=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_224to336.pt'),
100
+ )
101
+
102
+ _VITH14 = dict(
103
+ laion2b_s32b_b79k=_pcfg(hf_hub='laion/CLIP-ViT-H-14-laion2B-s32B-b79K/'),
104
+ )
105
+
106
+ _VITg14 = dict(
107
+ laion2b_s12b_b42k=_pcfg(hf_hub='laion/CLIP-ViT-g-14-laion2B-s12B-b42K/'),
108
+ laion2b_s34b_b88k=_pcfg(hf_hub='laion/CLIP-ViT-g-14-laion2B-s34B-b88K/'),
109
+ )
110
+
111
+ _EVAg14 = dict(
112
+ eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/'),
113
+ eva01=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_g_psz14.pt'),
114
+ eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_CLIP_g_14_psz14_s11B.pt'),
115
+ eva01_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_CLIP_g_14_psz14_s11B.pt'),
116
+ )
117
+
118
+ _EVAg14_PLUS = dict(
119
+ eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/'),
120
+ eva01=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_g_psz14.pt'),
121
+ eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_CLIP_g_14_plus_psz14_s11B.pt'),
122
+ eva01_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_CLIP_g_14_plus_psz14_s11B.pt'),
123
+ )
124
+
125
+ _VITbigG14 = dict(
126
+ laion2b_s39b_b160k=_pcfg(hf_hub='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k/'),
127
+ )
128
+
129
+ _EVAbigE14 = dict(
130
+ eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_E_psz14.pt'),
131
+ eva02=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_E_psz14.pt'),
132
+ eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_s4B.pt'),
133
+ eva02_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_s4B.pt'),
134
+ )
135
+
136
+ _EVAbigE14_PLUS = dict(
137
+ eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_E_psz14.pt'),
138
+ eva02=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_E_psz14.pt'),
139
+ eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt'),
140
+ eva02_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt'),
141
+ )
142
+
143
+
144
+ _PRETRAINED = {
145
+ # "ViT-B-32": _VITB32,
146
+ "OpenaiCLIP-B-32": _VITB32,
147
+ "OpenCLIP-B-32": _VITB32,
148
+
149
+ # "ViT-B-32-quickgelu": _VITB32_quickgelu,
150
+ "OpenaiCLIP-B-32-quickgelu": _VITB32_quickgelu,
151
+ "OpenCLIP-B-32-quickgelu": _VITB32_quickgelu,
152
+
153
+ # "ViT-B-16": _VITB16,
154
+ "OpenaiCLIP-B-16": _VITB16,
155
+ "OpenCLIP-B-16": _VITB16,
156
+
157
+ "EVA02-B-16": _EVAB16,
158
+ "EVA02-CLIP-B-16": _EVAB16,
159
+
160
+ # "ViT-B-16-plus-240": _VITB16_PLUS_240,
161
+ "OpenCLIP-B-16-plus-240": _VITB16_PLUS_240,
162
+
163
+ # "ViT-L-14": _VITL14,
164
+ "OpenaiCLIP-L-14": _VITL14,
165
+ "OpenCLIP-L-14": _VITL14,
166
+
167
+ "EVA02-L-14": _EVAL14,
168
+ "EVA02-CLIP-L-14": _EVAL14,
169
+
170
+ # "ViT-L-14-336": _VITL14_336,
171
+ "OpenaiCLIP-L-14-336": _VITL14_336,
172
+
173
+ "EVA02-CLIP-L-14-336": _EVAL14_336,
174
+
175
+ # "ViT-H-14": _VITH14,
176
+ # "ViT-g-14": _VITg14,
177
+ "OpenCLIP-H-14": _VITH14,
178
+ "OpenCLIP-g-14": _VITg14,
179
+
180
+ "EVA01-CLIP-g-14": _EVAg14,
181
+ "EVA01-CLIP-g-14-plus": _EVAg14_PLUS,
182
+
183
+ # "ViT-bigG-14": _VITbigG14,
184
+ "OpenCLIP-bigG-14": _VITbigG14,
185
+
186
+ "EVA02-CLIP-bigE-14": _EVAbigE14,
187
+ "EVA02-CLIP-bigE-14-plus": _EVAbigE14_PLUS,
188
+ }
189
+
190
+
191
+ def _clean_tag(tag: str):
192
+ # normalize pretrained tags
193
+ return tag.lower().replace('-', '_')
194
+
195
+
196
+ def list_pretrained(as_str: bool = False):
197
+ """ returns list of pretrained models
198
+ Returns a tuple (model_name, pretrain_tag) by default or 'name:tag' if as_str == True
199
+ """
200
+ return [':'.join([k, t]) if as_str else (k, t) for k in _PRETRAINED.keys() for t in _PRETRAINED[k].keys()]
201
+
202
+
203
+ def list_pretrained_models_by_tag(tag: str):
204
+ """ return all models having the specified pretrain tag """
205
+ models = []
206
+ tag = _clean_tag(tag)
207
+ for k in _PRETRAINED.keys():
208
+ if tag in _PRETRAINED[k]:
209
+ models.append(k)
210
+ return models
211
+
212
+
213
+ def list_pretrained_tags_by_model(model: str):
214
+ """ return all pretrain tags for the specified model architecture """
215
+ tags = []
216
+ if model in _PRETRAINED:
217
+ tags.extend(_PRETRAINED[model].keys())
218
+ return tags
219
+
220
+
221
+ def is_pretrained_cfg(model: str, tag: str):
222
+ if model not in _PRETRAINED:
223
+ return False
224
+ return _clean_tag(tag) in _PRETRAINED[model]
225
+
226
+
227
+ def get_pretrained_cfg(model: str, tag: str):
228
+ if model not in _PRETRAINED:
229
+ return {}
230
+ model_pretrained = _PRETRAINED[model]
231
+ return model_pretrained.get(_clean_tag(tag), {})
232
+
233
+
234
+ def get_pretrained_url(model: str, tag: str):
235
+ cfg = get_pretrained_cfg(model, _clean_tag(tag))
236
+ return cfg.get('url', '')
237
+
238
+
239
+ def download_pretrained_from_url(
240
+ url: str,
241
+ cache_dir: Union[str, None] = None,
242
+ ):
243
+ if not cache_dir:
244
+ cache_dir = os.path.expanduser("~/.cache/clip")
245
+ os.makedirs(cache_dir, exist_ok=True)
246
+ filename = os.path.basename(url)
247
+
248
+ if 'openaipublic' in url:
249
+ expected_sha256 = url.split("/")[-2]
250
+ elif 'mlfoundations' in url:
251
+ expected_sha256 = os.path.splitext(filename)[0].split("-")[-1]
252
+ else:
253
+ expected_sha256 = ''
254
+
255
+ download_target = os.path.join(cache_dir, filename)
256
+
257
+ if os.path.exists(download_target) and not os.path.isfile(download_target):
258
+ raise RuntimeError(f"{download_target} exists and is not a regular file")
259
+
260
+ if os.path.isfile(download_target):
261
+ if expected_sha256:
262
+ if hashlib.sha256(open(download_target, "rb").read()).hexdigest().startswith(expected_sha256):
263
+ return download_target
264
+ else:
265
+ warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
266
+ else:
267
+ return download_target
268
+
269
+ with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
270
+ with tqdm(total=int(source.headers.get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop:
271
+ while True:
272
+ buffer = source.read(8192)
273
+ if not buffer:
274
+ break
275
+
276
+ output.write(buffer)
277
+ loop.update(len(buffer))
278
+
279
+ if expected_sha256 and not hashlib.sha256(open(download_target, "rb").read()).hexdigest().startswith(expected_sha256):
280
+ raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not not match")
281
+
282
+ return download_target
283
+
284
+
285
+ def has_hf_hub(necessary=False):
286
+ if not _has_hf_hub and necessary:
287
+ # if no HF Hub module installed, and it is necessary to continue, raise error
288
+ raise RuntimeError(
289
+ 'Hugging Face hub model specified but package not installed. Run `pip install huggingface_hub`.')
290
+ return _has_hf_hub
291
+
292
+
293
+ def download_pretrained_from_hf(
294
+ model_id: str,
295
+ filename: str = 'open_clip_pytorch_model.bin',
296
+ revision=None,
297
+ cache_dir: Union[str, None] = None,
298
+ ):
299
+ has_hf_hub(True)
300
+ cached_file = hf_hub_download(model_id, filename, revision=revision, cache_dir=cache_dir)
301
+ return cached_file
302
+
303
+
304
+ def download_pretrained(
305
+ cfg: Dict,
306
+ force_hf_hub: bool = False,
307
+ cache_dir: Union[str, None] = None,
308
+ ):
309
+ target = ''
310
+ if not cfg:
311
+ return target
312
+
313
+ download_url = cfg.get('url', '')
314
+ download_hf_hub = cfg.get('hf_hub', '')
315
+ if download_hf_hub and force_hf_hub:
316
+ # use HF hub even if url exists
317
+ download_url = ''
318
+
319
+ if download_url:
320
+ target = download_pretrained_from_url(download_url, cache_dir=cache_dir)
321
+ elif download_hf_hub:
322
+ has_hf_hub(True)
323
+ # we assume the hf_hub entries in pretrained config combine model_id + filename in
324
+ # 'org/model_name/filename.pt' form. To specify just the model id w/o filename and
325
+ # use 'open_clip_pytorch_model.bin' default, there must be a trailing slash 'org/model_name/'.
326
+ model_id, filename = os.path.split(download_hf_hub)
327
+ if filename:
328
+ target = download_pretrained_from_hf(model_id, filename=filename, cache_dir=cache_dir)
329
+ else:
330
+ target = download_pretrained_from_hf(model_id, cache_dir=cache_dir)
331
+
332
+ return target
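A sketch of querying this registry (assumes eva_clip is importable; downloading additionally needs huggingface_hub and network access). Note the hf_hub convention documented above: a trailing slash means "use the default filename", otherwise the entry is 'org/model_name/filename.pt':

from eva_clip.pretrained import list_pretrained_tags_by_model, get_pretrained_cfg, download_pretrained

print(list_pretrained_tags_by_model("EVA02-CLIP-L-14"))  # ['eva', 'eva02', 'eva_clip', 'eva02_clip']
cfg = get_pretrained_cfg("EVA02-CLIP-L-14", "eva02_clip")
print(cfg["hf_hub"])  # QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_s4B.pt
# path = download_pretrained(cfg)  # would fetch the checkpoint into the local HF cache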
ComfyUI-PuLID-Flux-Enhanced/eva_clip/rope.py ADDED
@@ -0,0 +1,137 @@
1
+ from math import pi
2
+ import torch
3
+ from torch import nn
4
+ from einops import rearrange, repeat
5
+ import logging
6
+
7
+ def broadcat(tensors, dim = -1):
8
+ num_tensors = len(tensors)
9
+ shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
10
+ assert len(shape_lens) == 1, 'tensors must all have the same number of dimensions'
11
+ shape_len = list(shape_lens)[0]
12
+ dim = (dim + shape_len) if dim < 0 else dim
13
+ dims = list(zip(*map(lambda t: list(t.shape), tensors)))
14
+ expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
15
+ assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), 'invalid dimensions for broadcastable concatenation'
16
+ max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
17
+ expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
18
+ expanded_dims.insert(dim, (dim, dims[dim]))
19
+ expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
20
+ tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
21
+ return torch.cat(tensors, dim = dim)
22
+
23
+ def rotate_half(x):
24
+ x = rearrange(x, '... (d r) -> ... d r', r = 2)
25
+ x1, x2 = x.unbind(dim = -1)
26
+ x = torch.stack((-x2, x1), dim = -1)
27
+ return rearrange(x, '... d r -> ... (d r)')
28
+
29
+
30
+ class VisionRotaryEmbedding(nn.Module):
31
+ def __init__(
32
+ self,
33
+ dim,
34
+ pt_seq_len,
35
+ ft_seq_len=None,
36
+ custom_freqs = None,
37
+ freqs_for = 'lang',
38
+ theta = 10000,
39
+ max_freq = 10,
40
+ num_freqs = 1,
41
+ ):
42
+ super().__init__()
43
+ if custom_freqs:
44
+ freqs = custom_freqs
45
+ elif freqs_for == 'lang':
46
+ freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
47
+ elif freqs_for == 'pixel':
48
+ freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
49
+ elif freqs_for == 'constant':
50
+ freqs = torch.ones(num_freqs).float()
51
+ else:
52
+ raise ValueError(f'unknown modality {freqs_for}')
53
+
54
+ if ft_seq_len is None: ft_seq_len = pt_seq_len
55
+ t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len
56
+
57
+ freqs_h = torch.einsum('..., f -> ... f', t, freqs)
58
+ freqs_h = repeat(freqs_h, '... n -> ... (n r)', r = 2)
59
+
60
+ freqs_w = torch.einsum('..., f -> ... f', t, freqs)
61
+ freqs_w = repeat(freqs_w, '... n -> ... (n r)', r = 2)
62
+
63
+ freqs = broadcat((freqs_h[:, None, :], freqs_w[None, :, :]), dim = -1)
64
+
65
+ self.register_buffer("freqs_cos", freqs.cos())
66
+ self.register_buffer("freqs_sin", freqs.sin())
67
+
68
+ logging.info(f'Shape of rope freq: {self.freqs_cos.shape}')
69
+
70
+ def forward(self, t, start_index = 0):
71
+ rot_dim = self.freqs_cos.shape[-1]
72
+ end_index = start_index + rot_dim
73
+ assert rot_dim <= t.shape[-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}'
74
+ t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:]
75
+ t = (t * self.freqs_cos) + (rotate_half(t) * self.freqs_sin)
76
+
77
+ return torch.cat((t_left, t, t_right), dim = -1)
78
+
79
+ class VisionRotaryEmbeddingFast(nn.Module):
80
+ def __init__(
81
+ self,
82
+ dim,
83
+ pt_seq_len,
84
+ ft_seq_len=None,
85
+ custom_freqs = None,
86
+ freqs_for = 'lang',
87
+ theta = 10000,
88
+ max_freq = 10,
89
+ num_freqs = 1,
90
+ patch_dropout = 0.
91
+ ):
92
+ super().__init__()
93
+ if custom_freqs:
94
+ freqs = custom_freqs
95
+ elif freqs_for == 'lang':
96
+ freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
97
+ elif freqs_for == 'pixel':
98
+ freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
99
+ elif freqs_for == 'constant':
100
+ freqs = torch.ones(num_freqs).float()
101
+ else:
102
+ raise ValueError(f'unknown modality {freqs_for}')
103
+
104
+ if ft_seq_len is None: ft_seq_len = pt_seq_len
105
+ t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len
106
+
107
+ freqs = torch.einsum('..., f -> ... f', t, freqs)
108
+ freqs = repeat(freqs, '... n -> ... (n r)', r = 2)
109
+ freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim = -1)
110
+
111
+ freqs_cos = freqs.cos().view(-1, freqs.shape[-1])
112
+ freqs_sin = freqs.sin().view(-1, freqs.shape[-1])
113
+
114
+ self.patch_dropout = patch_dropout
115
+
116
+ self.register_buffer("freqs_cos", freqs_cos)
117
+ self.register_buffer("freqs_sin", freqs_sin)
118
+
119
+ logging.info(f'Shape of rope freq: {self.freqs_cos.shape}')
120
+
121
+ def forward(self, t, patch_indices_keep=None):
122
+ if patch_indices_keep is not None:
123
+ batch = t.size()[0]
124
+ batch_indices = torch.arange(batch)
125
+ batch_indices = batch_indices[..., None]
126
+
127
+ freqs_cos = repeat(self.freqs_cos, 'i j -> n i m j', n=t.shape[0], m=t.shape[1])
128
+ freqs_sin = repeat(self.freqs_sin, 'i j -> n i m j', n=t.shape[0], m=t.shape[1])
129
+
130
+ freqs_cos = freqs_cos[batch_indices, patch_indices_keep]
131
+ freqs_cos = rearrange(freqs_cos, 'n i m j -> n m i j')
132
+ freqs_sin = freqs_sin[batch_indices, patch_indices_keep]
133
+ freqs_sin = rearrange(freqs_sin, 'n i m j -> n m i j')
134
+
135
+ return t * freqs_cos + rotate_half(t) * freqs_sin
136
+
137
+ return t * self.freqs_cos + rotate_half(t) * self.freqs_sin
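A shape sketch for VisionRotaryEmbeddingFast (assumes eva_clip and einops are importable). With dim=32 per axis and a 16x16 patch grid, the cos/sin buffers flatten to (256, 64), and forward rotates any tensor whose trailing dimensions broadcast against them:

import torch
from eva_clip.rope import VisionRotaryEmbeddingFast

rope = VisionRotaryEmbeddingFast(dim=32, pt_seq_len=16)  # 16x16 grid of patch positions
print(rope.freqs_cos.shape)  # torch.Size([256, 64])

q = torch.randn(2, 8, 256, 64)  # (batch, heads, patch tokens, rotated channel slice)
print(rope(q).shape)            # unchanged shape, with position information mixed into the channels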
ComfyUI-PuLID-Flux-Enhanced/eva_clip/timm_model.py ADDED
@@ -0,0 +1,122 @@
1
+ """ timm model adapter
2
+
3
+ Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in CLIP model.
4
+ """
5
+ import logging
6
+ from collections import OrderedDict
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ try:
12
+ import timm
13
+ from timm.models.layers import Mlp, to_2tuple
14
+ try:
15
+ # old timm imports < 0.8.1
16
+ from timm.models.layers.attention_pool2d import RotAttentionPool2d
17
+ from timm.models.layers.attention_pool2d import AttentionPool2d as AbsAttentionPool2d
18
+ except ImportError:
19
+ # new timm imports >= 0.8.1
20
+ from timm.layers import RotAttentionPool2d
21
+ from timm.layers import AttentionPool2d as AbsAttentionPool2d
22
+ except ImportError:
23
+ timm = None
24
+
25
+ from .utils import freeze_batch_norm_2d
26
+
27
+
28
+ class TimmModel(nn.Module):
29
+ """ timm model adapter
30
+ # FIXME this adapter is a work in progress, may change in ways that break weight compat
31
+ """
32
+
33
+ def __init__(
34
+ self,
35
+ model_name,
36
+ embed_dim,
37
+ image_size=224,
38
+ pool='avg',
39
+ proj='linear',
40
+ proj_bias=False,
41
+ drop=0.,
42
+ pretrained=False):
43
+ super().__init__()
44
+ if timm is None:
45
+ raise RuntimeError("Please `pip install timm` to use timm models.")
46
+
47
+ self.image_size = to_2tuple(image_size)
48
+ self.trunk = timm.create_model(model_name, pretrained=pretrained)
49
+ feat_size = self.trunk.default_cfg.get('pool_size', None)
50
+ feature_ndim = 1 if not feat_size else 2
51
+ if pool in ('abs_attn', 'rot_attn'):
52
+ assert feature_ndim == 2
53
+ # if attn pooling used, remove both classifier and default pool
54
+ self.trunk.reset_classifier(0, global_pool='')
55
+ else:
56
+ # reset global pool if pool config set, otherwise leave as network default
57
+ reset_kwargs = dict(global_pool=pool) if pool else {}
58
+ self.trunk.reset_classifier(0, **reset_kwargs)
59
+ prev_chs = self.trunk.num_features
60
+
61
+ head_layers = OrderedDict()
62
+ if pool == 'abs_attn':
63
+ head_layers['pool'] = AbsAttentionPool2d(prev_chs, feat_size=feat_size, out_features=embed_dim)
64
+ prev_chs = embed_dim
65
+ elif pool == 'rot_attn':
66
+ head_layers['pool'] = RotAttentionPool2d(prev_chs, out_features=embed_dim)
67
+ prev_chs = embed_dim
68
+ else:
69
+ assert proj, 'projection layer needed if non-attention pooling is used.'
70
+
71
+ # NOTE attention pool ends with a projection layer, so proj should usually be set to '' if such pooling is used
72
+ if proj == 'linear':
73
+ head_layers['drop'] = nn.Dropout(drop)
74
+ head_layers['proj'] = nn.Linear(prev_chs, embed_dim, bias=proj_bias)
75
+ elif proj == 'mlp':
76
+ head_layers['mlp'] = Mlp(prev_chs, 2 * embed_dim, embed_dim, drop=drop, bias=(True, proj_bias))
77
+
78
+ self.head = nn.Sequential(head_layers)
79
+
80
+ def lock(self, unlocked_groups=0, freeze_bn_stats=False):
81
+ """ lock modules
82
+ Args:
83
+ unlocked_groups (int): leave last n layer groups unlocked (default: 0)
84
+ """
85
+ if not unlocked_groups:
86
+ # lock full model
87
+ for param in self.trunk.parameters():
88
+ param.requires_grad = False
89
+ if freeze_bn_stats:
90
+ freeze_batch_norm_2d(self.trunk)
91
+ else:
92
+ # NOTE: partial freeze requires latest timm (master) branch and is subject to change
93
+ try:
94
+ # FIXME import here until API stable and in an official release
95
+ from timm.models.helpers import group_parameters, group_modules
96
+ except ImportError:
97
+ raise RuntimeError(
98
+ 'Please install latest timm `pip install git+https://github.com/rwightman/pytorch-image-models`')
99
+ matcher = self.trunk.group_matcher()
100
+ gparams = group_parameters(self.trunk, matcher)
101
+ max_layer_id = max(gparams.keys())
102
+ max_layer_id = max_layer_id - unlocked_groups
103
+ for group_idx in range(max_layer_id + 1):
104
+ group = gparams[group_idx]
105
+ for param in group:
106
+ self.trunk.get_parameter(param).requires_grad = False
107
+ if freeze_bn_stats:
108
+ gmodules = group_modules(self.trunk, matcher, reverse=True)
109
+ gmodules = {k for k, v in gmodules.items() if v <= max_layer_id}
110
+ freeze_batch_norm_2d(self.trunk, gmodules)
111
+
112
+ @torch.jit.ignore
113
+ def set_grad_checkpointing(self, enable=True):
114
+ try:
115
+ self.trunk.set_grad_checkpointing(enable)
116
+ except Exception:
117
+ logging.warning('grad checkpointing not supported for this timm image tower, continuing without...')
118
+
119
+ def forward(self, x):
120
+ x = self.trunk(x)
121
+ x = self.head(x)
122
+ return x
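A sketch of wrapping a small timm backbone as a vision tower (requires `pip install timm`; 'resnet18' and embed_dim=512 are arbitrary choices for illustration, not something this repo ships):

import torch
from eva_clip.timm_model import TimmModel

tower = TimmModel("resnet18", embed_dim=512, image_size=224, pool="avg", proj="linear", pretrained=False)
tower.eval()
with torch.no_grad():
    out = tower(torch.randn(2, 3, 224, 224))  # trunk -> global average pool -> linear projection
print(out.shape)  # torch.Size([2, 512])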
ComfyUI-PuLID-Flux-Enhanced/eva_clip/tokenizer.py ADDED
@@ -0,0 +1,201 @@
1
+ """ CLIP tokenizer
2
+
3
+ Copied from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
4
+ """
5
+ import gzip
6
+ import html
7
+ import os
8
+ from functools import lru_cache
9
+ from typing import Union, List
10
+
11
+ import ftfy
12
+ import regex as re
13
+ import torch
14
+
15
+ # https://stackoverflow.com/q/62691279
16
+ import os
17
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
18
+
19
+
20
+ @lru_cache()
21
+ def default_bpe():
22
+ return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
23
+
24
+
25
+ @lru_cache()
26
+ def bytes_to_unicode():
27
+ """
28
+ Returns a list of utf-8 bytes and a corresponding list of unicode strings.
29
+ The reversible bpe codes work on unicode strings.
30
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
31
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
32
+ This is a significant percentage of your normal, say, 32K bpe vocab.
33
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
34
+ This also avoids mapping to whitespace/control characters that the bpe code barfs on.
35
+ """
36
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
37
+ cs = bs[:]
38
+ n = 0
39
+ for b in range(2**8):
40
+ if b not in bs:
41
+ bs.append(b)
42
+ cs.append(2**8+n)
43
+ n += 1
44
+ cs = [chr(n) for n in cs]
45
+ return dict(zip(bs, cs))
46
+
47
+
48
+ def get_pairs(word):
49
+ """Return set of symbol pairs in a word.
50
+ Word is represented as tuple of symbols (symbols being variable-length strings).
51
+ """
52
+ pairs = set()
53
+ prev_char = word[0]
54
+ for char in word[1:]:
55
+ pairs.add((prev_char, char))
56
+ prev_char = char
57
+ return pairs
58
+
59
+
60
+ def basic_clean(text):
61
+ text = ftfy.fix_text(text)
62
+ text = html.unescape(html.unescape(text))
63
+ return text.strip()
64
+
65
+
66
+ def whitespace_clean(text):
67
+ text = re.sub(r'\s+', ' ', text)
68
+ text = text.strip()
69
+ return text
70
+
71
+
72
+ class SimpleTokenizer(object):
73
+ def __init__(self, bpe_path: str = default_bpe(), special_tokens=None):
74
+ self.byte_encoder = bytes_to_unicode()
75
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
76
+ merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
77
+ merges = merges[1:49152-256-2+1]
78
+ merges = [tuple(merge.split()) for merge in merges]
79
+ vocab = list(bytes_to_unicode().values())
80
+ vocab = vocab + [v+'</w>' for v in vocab]
81
+ for merge in merges:
82
+ vocab.append(''.join(merge))
83
+ if not special_tokens:
84
+ special_tokens = ['<start_of_text>', '<end_of_text>']
85
+ else:
86
+ special_tokens = ['<start_of_text>', '<end_of_text>'] + special_tokens
87
+ vocab.extend(special_tokens)
88
+ self.encoder = dict(zip(vocab, range(len(vocab))))
89
+ self.decoder = {v: k for k, v in self.encoder.items()}
90
+ self.bpe_ranks = dict(zip(merges, range(len(merges))))
91
+ self.cache = {t:t for t in special_tokens}
92
+ special = "|".join(special_tokens)
93
+ self.pat = re.compile(special + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
94
+
95
+ self.vocab_size = len(self.encoder)
96
+ self.all_special_ids = [self.encoder[t] for t in special_tokens]
97
+
98
+ def bpe(self, token):
99
+ if token in self.cache:
100
+ return self.cache[token]
101
+ word = tuple(token[:-1]) + ( token[-1] + '</w>',)
102
+ pairs = get_pairs(word)
103
+
104
+ if not pairs:
105
+ return token+'</w>'
106
+
107
+ while True:
108
+ bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
109
+ if bigram not in self.bpe_ranks:
110
+ break
111
+ first, second = bigram
112
+ new_word = []
113
+ i = 0
114
+ while i < len(word):
115
+ try:
116
+ j = word.index(first, i)
117
+ new_word.extend(word[i:j])
118
+ i = j
119
+ except ValueError:  # 'first' not found in the remainder of the word
120
+ new_word.extend(word[i:])
121
+ break
122
+
123
+ if word[i] == first and i < len(word)-1 and word[i+1] == second:
124
+ new_word.append(first+second)
125
+ i += 2
126
+ else:
127
+ new_word.append(word[i])
128
+ i += 1
129
+ new_word = tuple(new_word)
130
+ word = new_word
131
+ if len(word) == 1:
132
+ break
133
+ else:
134
+ pairs = get_pairs(word)
135
+ word = ' '.join(word)
136
+ self.cache[token] = word
137
+ return word
138
+
139
+ def encode(self, text):
140
+ bpe_tokens = []
141
+ text = whitespace_clean(basic_clean(text)).lower()
142
+ for token in re.findall(self.pat, text):
143
+ token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
144
+ bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
145
+ return bpe_tokens
146
+
147
+ def decode(self, tokens):
148
+ text = ''.join([self.decoder[token] for token in tokens])
149
+ text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
150
+ return text
151
+
152
+
153
+ _tokenizer = SimpleTokenizer()
154
+
155
+
156
+ def tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor:
157
+ """
158
+ Returns the tokenized representation of given input string(s)
159
+
160
+ Parameters
161
+ ----------
162
+ texts : Union[str, List[str]]
163
+ An input string or a list of input strings to tokenize
164
+ context_length : int
165
+ The context length to use; all CLIP models use 77 as the context length
166
+
167
+ Returns
168
+ -------
169
+ A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
170
+ """
171
+ if isinstance(texts, str):
172
+ texts = [texts]
173
+
174
+ sot_token = _tokenizer.encoder["<start_of_text>"]
175
+ eot_token = _tokenizer.encoder["<end_of_text>"]
176
+ all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
177
+ result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
178
+
179
+ for i, tokens in enumerate(all_tokens):
180
+ if len(tokens) > context_length:
181
+ tokens = tokens[:context_length] # Truncate
182
+ tokens[-1] = eot_token
183
+ result[i, :len(tokens)] = torch.tensor(tokens)
184
+
185
+ return result
186
+
187
+
188
+ class HFTokenizer:
189
+ "HuggingFace tokenizer wrapper"
190
+ def __init__(self, tokenizer_name:str):
191
+ from transformers import AutoTokenizer
192
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
193
+
194
+ def __call__(self, texts:Union[str, List[str]], context_length:int=77) -> torch.Tensor:
195
+ # same cleaning as for default tokenizer, except lowercasing
196
+ # adding lower (for case-sensitive tokenizers) will make it more robust but less sensitive to nuance
197
+ if isinstance(texts, str):
198
+ texts = [texts]
199
+ texts = [whitespace_clean(basic_clean(text)) for text in texts]
200
+ input_ids = self.tokenizer(texts, return_tensors='pt', max_length=context_length, padding='max_length', truncation=True).input_ids
201
+ return input_ids
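A sketch of the default tokenizer (requires ftfy and regex; importing the module builds SimpleTokenizer from the bundled bpe_simple_vocab_16e6.txt.gz). Every input string becomes a zero-padded LongTensor of length 77 wrapped in start/end tokens:

from eva_clip.tokenizer import tokenize, _tokenizer

tokens = tokenize(["a photo of a cat", "a diagram"])
print(tokens.shape)  # torch.Size([2, 77])
print(tokens[0, 0].item() == _tokenizer.encoder["<start_of_text>"])  # True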
ComfyUI-PuLID-Flux-Enhanced/eva_clip/transform.py ADDED
@@ -0,0 +1,103 @@
1
+ from typing import Optional, Sequence, Tuple
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torchvision.transforms.functional as F
6
+
7
+ from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \
8
+ CenterCrop
9
+
10
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
11
+
12
+
13
+ class ResizeMaxSize(nn.Module):
14
+
15
+ def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn='max', fill=0):
16
+ super().__init__()
17
+ if not isinstance(max_size, int):
18
+ raise TypeError(f"Size should be int. Got {type(max_size)}")
19
+ self.max_size = max_size
20
+ self.interpolation = interpolation
21
+ self.fn = min if fn == 'min' else max
22
+ self.fill = fill
23
+
24
+ def forward(self, img):
25
+ if isinstance(img, torch.Tensor):
26
+ height, width = img.shape[:2]
27
+ else:
28
+ width, height = img.size
29
+ scale = self.max_size / float(max(height, width))
30
+ if scale != 1.0:
31
+ new_size = tuple(round(dim * scale) for dim in (height, width))
32
+ img = F.resize(img, new_size, self.interpolation)
33
+ pad_h = self.max_size - new_size[0]
34
+ pad_w = self.max_size - new_size[1]
35
+ img = F.pad(img, padding=[pad_w//2, pad_h//2, pad_w - pad_w//2, pad_h - pad_h//2], fill=self.fill)
36
+ return img
37
+
38
+
39
+ def _convert_to_rgb(image):
40
+ return image.convert('RGB')
41
+
42
+
43
+ # class CatGen(nn.Module):
44
+ # def __init__(self, num=4):
45
+ # self.num = num
46
+ # def mixgen_batch(image, text):
47
+ # batch_size = image.shape[0]
48
+ # index = np.random.permutation(batch_size)
49
+
50
+ # cat_images = []
51
+ # for i in range(batch_size):
52
+ # # image mixup
53
+ # image[i,:] = lam * image[i,:] + (1 - lam) * image[index[i],:]
54
+ # # text concat
55
+ # text[i] = tokenizer((str(text[i]) + " " + str(text[index[i]])))[0]
56
+ # text = torch.stack(text)
57
+ # return image, text
58
+
59
+
60
+ def image_transform(
61
+ image_size: int,
62
+ is_train: bool,
63
+ mean: Optional[Tuple[float, ...]] = None,
64
+ std: Optional[Tuple[float, ...]] = None,
65
+ resize_longest_max: bool = False,
66
+ fill_color: int = 0,
67
+ ):
68
+ mean = mean or OPENAI_DATASET_MEAN
69
+ if not isinstance(mean, (list, tuple)):
70
+ mean = (mean,) * 3
71
+
72
+ std = std or OPENAI_DATASET_STD
73
+ if not isinstance(std, (list, tuple)):
74
+ std = (std,) * 3
75
+
76
+ if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]:
77
+ # for square size, pass size as int so that Resize() uses aspect preserving shortest edge
78
+ image_size = image_size[0]
79
+
80
+ normalize = Normalize(mean=mean, std=std)
81
+ if is_train:
82
+ return Compose([
83
+ RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC),
84
+ _convert_to_rgb,
85
+ ToTensor(),
86
+ normalize,
87
+ ])
88
+ else:
89
+ if resize_longest_max:
90
+ transforms = [
91
+ ResizeMaxSize(image_size, fill=fill_color)
92
+ ]
93
+ else:
94
+ transforms = [
95
+ Resize(image_size, interpolation=InterpolationMode.BICUBIC),
96
+ CenterCrop(image_size),
97
+ ]
98
+ transforms.extend([
99
+ _convert_to_rgb,
100
+ ToTensor(),
101
+ normalize,
102
+ ])
103
+ return Compose(transforms)
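A sketch of the eval-time pipeline built by image_transform (requires torchvision and Pillow): resize the short side, center-crop, convert to RGB, then normalize with the OpenAI statistics:

from PIL import Image
from eva_clip.transform import image_transform

preprocess = image_transform(image_size=224, is_train=False)
img = Image.new("RGB", (640, 480), color=(128, 128, 128))  # stand-in for a real photo
tensor = preprocess(img)
print(tensor.shape)  # torch.Size([3, 224, 224])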