valhalla committed on
Commit b442155
1 Parent(s): 4e3891e
CITATION.cff ADDED
@@ -0,0 +1,17 @@
+ cff-version: 1.2.0
+ message: "If you find this repository useful in your research, please cite"
+ authors:
+ - family-names: Kim
+   given-names: Saehoon
+ - family-names: Cho
+   given-names: Sanghun
+ - family-names: Kim
+   given-names: Chiheon
+ - family-names: Lee
+   given-names: Doyup
+ - family-names: Baek
+   given-names: Woonhyuk
+ title: "minDALL-E on Conceptual Captions"
+ version: 0.1
+ date-released: 2021-12-14
+ repository-code: https://github.com/kakaobrain/minDALL-E
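The CITATION.cff above is plain YAML, so it can also be consumed programmatically. A minimal sketch, assuming PyYAML is installed; the one-line citation format below is our own choice for illustration, not something the repository defines:

```python
# Hedged helper: turn the CITATION.cff shown above into a one-line citation string.
# Assumes PyYAML is available (`pip install pyyaml`); field names follow CFF 1.2.0.
import yaml

with open("CITATION.cff", "r", encoding="utf-8") as f:
    cff = yaml.safe_load(f)

authors = ", ".join(
    f"{a['given-names']} {a['family-names']}" for a in cff["authors"]
)
print(f"{authors}. \"{cff['title']}\" (v{cff['version']}), "
      f"{cff['date-released']}. {cff['repository-code']}")
```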
LICENSE ADDED
@@ -0,0 +1,2 @@
+ The `source code` is licensed under the [Apache 2.0](LICENSE.apache-2.0) License.
+ The `stage2 pretrained weights` are licensed under the [CC-BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) License.
LICENSE.apache-2.0 ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [2021.11.13] [Kakao Brain]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
LICENSE.cc-by-nc-sa-4.0 ADDED
@@ -0,0 +1,437 @@
1
+ Attribution-NonCommercial-ShareAlike 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More_considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
58
+ Public License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial-ShareAlike 4.0 International Public License
63
+ ("Public License"). To the extent this Public License may be
64
+ interpreted as a contract, You are granted the Licensed Rights in
65
+ consideration of Your acceptance of these terms and conditions, and the
66
+ Licensor grants You such rights in consideration of benefits the
67
+ Licensor receives from making the Licensed Material available under
68
+ these terms and conditions.
69
+
70
+
71
+ Section 1 -- Definitions.
72
+
73
+ a. Adapted Material means material subject to Copyright and Similar
74
+ Rights that is derived from or based upon the Licensed Material
75
+ and in which the Licensed Material is translated, altered,
76
+ arranged, transformed, or otherwise modified in a manner requiring
77
+ permission under the Copyright and Similar Rights held by the
78
+ Licensor. For purposes of this Public License, where the Licensed
79
+ Material is a musical work, performance, or sound recording,
80
+ Adapted Material is always produced where the Licensed Material is
81
+ synched in timed relation with a moving image.
82
+
83
+ b. Adapter's License means the license You apply to Your Copyright
84
+ and Similar Rights in Your contributions to Adapted Material in
85
+ accordance with the terms and conditions of this Public License.
86
+
87
+ c. BY-NC-SA Compatible License means a license listed at
88
+ creativecommons.org/compatiblelicenses, approved by Creative
89
+ Commons as essentially the equivalent of this Public License.
90
+
91
+ d. Copyright and Similar Rights means copyright and/or similar rights
92
+ closely related to copyright including, without limitation,
93
+ performance, broadcast, sound recording, and Sui Generis Database
94
+ Rights, without regard to how the rights are labeled or
95
+ categorized. For purposes of this Public License, the rights
96
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
97
+ Rights.
98
+
99
+ e. Effective Technological Measures means those measures that, in the
100
+ absence of proper authority, may not be circumvented under laws
101
+ fulfilling obligations under Article 11 of the WIPO Copyright
102
+ Treaty adopted on December 20, 1996, and/or similar international
103
+ agreements.
104
+
105
+ f. Exceptions and Limitations means fair use, fair dealing, and/or
106
+ any other exception or limitation to Copyright and Similar Rights
107
+ that applies to Your use of the Licensed Material.
108
+
109
+ g. License Elements means the license attributes listed in the name
110
+ of a Creative Commons Public License. The License Elements of this
111
+ Public License are Attribution, NonCommercial, and ShareAlike.
112
+
113
+ h. Licensed Material means the artistic or literary work, database,
114
+ or other material to which the Licensor applied this Public
115
+ License.
116
+
117
+ i. Licensed Rights means the rights granted to You subject to the
118
+ terms and conditions of this Public License, which are limited to
119
+ all Copyright and Similar Rights that apply to Your use of the
120
+ Licensed Material and that the Licensor has authority to license.
121
+
122
+ j. Licensor means the individual(s) or entity(ies) granting rights
123
+ under this Public License.
124
+
125
+ k. NonCommercial means not primarily intended for or directed towards
126
+ commercial advantage or monetary compensation. For purposes of
127
+ this Public License, the exchange of the Licensed Material for
128
+ other material subject to Copyright and Similar Rights by digital
129
+ file-sharing or similar means is NonCommercial provided there is
130
+ no payment of monetary compensation in connection with the
131
+ exchange.
132
+
133
+ l. Share means to provide material to the public by any means or
134
+ process that requires permission under the Licensed Rights, such
135
+ as reproduction, public display, public performance, distribution,
136
+ dissemination, communication, or importation, and to make material
137
+ available to the public including in ways that members of the
138
+ public may access the material from a place and at a time
139
+ individually chosen by them.
140
+
141
+ m. Sui Generis Database Rights means rights other than copyright
142
+ resulting from Directive 96/9/EC of the European Parliament and of
143
+ the Council of 11 March 1996 on the legal protection of databases,
144
+ as amended and/or succeeded, as well as other essentially
145
+ equivalent rights anywhere in the world.
146
+
147
+ n. You means the individual or entity exercising the Licensed Rights
148
+ under this Public License. Your has a corresponding meaning.
149
+
150
+
151
+ Section 2 -- Scope.
152
+
153
+ a. License grant.
154
+
155
+ 1. Subject to the terms and conditions of this Public License,
156
+ the Licensor hereby grants You a worldwide, royalty-free,
157
+ non-sublicensable, non-exclusive, irrevocable license to
158
+ exercise the Licensed Rights in the Licensed Material to:
159
+
160
+ a. reproduce and Share the Licensed Material, in whole or
161
+ in part, for NonCommercial purposes only; and
162
+
163
+ b. produce, reproduce, and Share Adapted Material for
164
+ NonCommercial purposes only.
165
+
166
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
167
+ Exceptions and Limitations apply to Your use, this Public
168
+ License does not apply, and You do not need to comply with
169
+ its terms and conditions.
170
+
171
+ 3. Term. The term of this Public License is specified in Section
172
+ 6(a).
173
+
174
+ 4. Media and formats; technical modifications allowed. The
175
+ Licensor authorizes You to exercise the Licensed Rights in
176
+ all media and formats whether now known or hereafter created,
177
+ and to make technical modifications necessary to do so. The
178
+ Licensor waives and/or agrees not to assert any right or
179
+ authority to forbid You from making technical modifications
180
+ necessary to exercise the Licensed Rights, including
181
+ technical modifications necessary to circumvent Effective
182
+ Technological Measures. For purposes of this Public License,
183
+ simply making modifications authorized by this Section 2(a)
184
+ (4) never produces Adapted Material.
185
+
186
+ 5. Downstream recipients.
187
+
188
+ a. Offer from the Licensor -- Licensed Material. Every
189
+ recipient of the Licensed Material automatically
190
+ receives an offer from the Licensor to exercise the
191
+ Licensed Rights under the terms and conditions of this
192
+ Public License.
193
+
194
+ b. Additional offer from the Licensor -- Adapted Material.
195
+ Every recipient of Adapted Material from You
196
+ automatically receives an offer from the Licensor to
197
+ exercise the Licensed Rights in the Adapted Material
198
+ under the conditions of the Adapter's License You apply.
199
+
200
+ c. No downstream restrictions. You may not offer or impose
201
+ any additional or different terms or conditions on, or
202
+ apply any Effective Technological Measures to, the
203
+ Licensed Material if doing so restricts exercise of the
204
+ Licensed Rights by any recipient of the Licensed
205
+ Material.
206
+
207
+ 6. No endorsement. Nothing in this Public License constitutes or
208
+ may be construed as permission to assert or imply that You
209
+ are, or that Your use of the Licensed Material is, connected
210
+ with, or sponsored, endorsed, or granted official status by,
211
+ the Licensor or others designated to receive attribution as
212
+ provided in Section 3(a)(1)(A)(i).
213
+
214
+ b. Other rights.
215
+
216
+ 1. Moral rights, such as the right of integrity, are not
217
+ licensed under this Public License, nor are publicity,
218
+ privacy, and/or other similar personality rights; however, to
219
+ the extent possible, the Licensor waives and/or agrees not to
220
+ assert any such rights held by the Licensor to the limited
221
+ extent necessary to allow You to exercise the Licensed
222
+ Rights, but not otherwise.
223
+
224
+ 2. Patent and trademark rights are not licensed under this
225
+ Public License.
226
+
227
+ 3. To the extent possible, the Licensor waives any right to
228
+ collect royalties from You for the exercise of the Licensed
229
+ Rights, whether directly or through a collecting society
230
+ under any voluntary or waivable statutory or compulsory
231
+ licensing scheme. In all other cases the Licensor expressly
232
+ reserves any right to collect such royalties, including when
233
+ the Licensed Material is used other than for NonCommercial
234
+ purposes.
235
+
236
+
237
+ Section 3 -- License Conditions.
238
+
239
+ Your exercise of the Licensed Rights is expressly made subject to the
240
+ following conditions.
241
+
242
+ a. Attribution.
243
+
244
+ 1. If You Share the Licensed Material (including in modified
245
+ form), You must:
246
+
247
+ a. retain the following if it is supplied by the Licensor
248
+ with the Licensed Material:
249
+
250
+ i. identification of the creator(s) of the Licensed
251
+ Material and any others designated to receive
252
+ attribution, in any reasonable manner requested by
253
+ the Licensor (including by pseudonym if
254
+ designated);
255
+
256
+ ii. a copyright notice;
257
+
258
+ iii. a notice that refers to this Public License;
259
+
260
+ iv. a notice that refers to the disclaimer of
261
+ warranties;
262
+
263
+ v. a URI or hyperlink to the Licensed Material to the
264
+ extent reasonably practicable;
265
+
266
+ b. indicate if You modified the Licensed Material and
267
+ retain an indication of any previous modifications; and
268
+
269
+ c. indicate the Licensed Material is licensed under this
270
+ Public License, and include the text of, or the URI or
271
+ hyperlink to, this Public License.
272
+
273
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
274
+ reasonable manner based on the medium, means, and context in
275
+ which You Share the Licensed Material. For example, it may be
276
+ reasonable to satisfy the conditions by providing a URI or
277
+ hyperlink to a resource that includes the required
278
+ information.
279
+ 3. If requested by the Licensor, You must remove any of the
280
+ information required by Section 3(a)(1)(A) to the extent
281
+ reasonably practicable.
282
+
283
+ b. ShareAlike.
284
+
285
+ In addition to the conditions in Section 3(a), if You Share
286
+ Adapted Material You produce, the following conditions also apply.
287
+
288
+ 1. The Adapter's License You apply must be a Creative Commons
289
+ license with the same License Elements, this version or
290
+ later, or a BY-NC-SA Compatible License.
291
+
292
+ 2. You must include the text of, or the URI or hyperlink to, the
293
+ Adapter's License You apply. You may satisfy this condition
294
+ in any reasonable manner based on the medium, means, and
295
+ context in which You Share Adapted Material.
296
+
297
+ 3. You may not offer or impose any additional or different terms
298
+ or conditions on, or apply any Effective Technological
299
+ Measures to, Adapted Material that restrict exercise of the
300
+ rights granted under the Adapter's License You apply.
301
+
302
+
303
+ Section 4 -- Sui Generis Database Rights.
304
+
305
+ Where the Licensed Rights include Sui Generis Database Rights that
306
+ apply to Your use of the Licensed Material:
307
+
308
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
309
+ to extract, reuse, reproduce, and Share all or a substantial
310
+ portion of the contents of the database for NonCommercial purposes
311
+ only;
312
+
313
+ b. if You include all or a substantial portion of the database
314
+ contents in a database in which You have Sui Generis Database
315
+ Rights, then the database in which You have Sui Generis Database
316
+ Rights (but not its individual contents) is Adapted Material,
317
+ including for purposes of Section 3(b); and
318
+
319
+ c. You must comply with the conditions in Section 3(a) if You Share
320
+ all or a substantial portion of the contents of the database.
321
+
322
+ For the avoidance of doubt, this Section 4 supplements and does not
323
+ replace Your obligations under this Public License where the Licensed
324
+ Rights include other Copyright and Similar Rights.
325
+
326
+
327
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
328
+
329
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
330
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
331
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
332
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
333
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
334
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
335
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
336
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
337
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
338
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
339
+
340
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
341
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
342
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
343
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
344
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
345
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
346
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
347
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
348
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
349
+
350
+ c. The disclaimer of warranties and limitation of liability provided
351
+ above shall be interpreted in a manner that, to the extent
352
+ possible, most closely approximates an absolute disclaimer and
353
+ waiver of all liability.
354
+
355
+
356
+ Section 6 -- Term and Termination.
357
+
358
+ a. This Public License applies for the term of the Copyright and
359
+ Similar Rights licensed here. However, if You fail to comply with
360
+ this Public License, then Your rights under this Public License
361
+ terminate automatically.
362
+
363
+ b. Where Your right to use the Licensed Material has terminated under
364
+ Section 6(a), it reinstates:
365
+
366
+ 1. automatically as of the date the violation is cured, provided
367
+ it is cured within 30 days of Your discovery of the
368
+ violation; or
369
+
370
+ 2. upon express reinstatement by the Licensor.
371
+
372
+ For the avoidance of doubt, this Section 6(b) does not affect any
373
+ right the Licensor may have to seek remedies for Your violations
374
+ of this Public License.
375
+
376
+ c. For the avoidance of doubt, the Licensor may also offer the
377
+ Licensed Material under separate terms or conditions or stop
378
+ distributing the Licensed Material at any time; however, doing so
379
+ will not terminate this Public License.
380
+
381
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
382
+ License.
383
+
384
+
385
+ Section 7 -- Other Terms and Conditions.
386
+
387
+ a. The Licensor shall not be bound by any additional or different
388
+ terms or conditions communicated by You unless expressly agreed.
389
+
390
+ b. Any arrangements, understandings, or agreements regarding the
391
+ Licensed Material not stated herein are separate from and
392
+ independent of the terms and conditions of this Public License.
393
+
394
+
395
+ Section 8 -- Interpretation.
396
+
397
+ a. For the avoidance of doubt, this Public License does not, and
398
+ shall not be interpreted to, reduce, limit, restrict, or impose
399
+ conditions on any use of the Licensed Material that could lawfully
400
+ be made without permission under this Public License.
401
+
402
+ b. To the extent possible, if any provision of this Public License is
403
+ deemed unenforceable, it shall be automatically reformed to the
404
+ minimum extent necessary to make it enforceable. If the provision
405
+ cannot be reformed, it shall be severed from this Public License
406
+ without affecting the enforceability of the remaining terms and
407
+ conditions.
408
+
409
+ c. No term or condition of this Public License will be waived and no
410
+ failure to comply consented to unless expressly agreed to by the
411
+ Licensor.
412
+
413
+ d. Nothing in this Public License constitutes or may be interpreted
414
+ as a limitation upon, or waiver of, any privileges and immunities
415
+ that apply to the Licensor or You, including from the legal
416
+ processes of any jurisdiction or authority.
417
+
418
+ =======================================================================
419
+
420
+ Creative Commons is not a party to its public
421
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
422
+ its public licenses to material it publishes and in those instances
423
+ will be considered the “Licensor.” The text of the Creative Commons
424
+ public licenses is dedicated to the public domain under the CC0 Public
425
+ Domain Dedication. Except for the limited purpose of indicating that
426
+ material is shared under a Creative Commons public license or as
427
+ otherwise permitted by the Creative Commons policies published at
428
+ creativecommons.org/policies, Creative Commons does not authorize the
429
+ use of the trademark "Creative Commons" or any other trademark or logo
430
+ of Creative Commons without its prior written consent including,
431
+ without limitation, in connection with any unauthorized modifications
432
+ to any of its public licenses or any other arrangements,
433
+ understandings, or agreements concerning use of licensed material. For
434
+ the avoidance of doubt, this paragraph does not form part of the
435
+ public licenses.
436
+
437
+ Creative Commons may be contacted at creativecommons.org.
README.md CHANGED
@@ -1,9 +1,9 @@
  ---
- title: MinDALLE
- emoji: 🏃
- colorFrom: blue
- colorTo: blue
- sdk: streamlit
+ title: MinDALL E
+ emoji: 🔥
+ colorFrom: red
+ colorTo: yellow
+ sdk: gradio
  app_file: app.py
  pinned: false
  ---
app.py ADDED
@@ -0,0 +1,141 @@
1
+ import os
2
+ import sys
3
+
4
+ import numpy as np
5
+ import streamlit as st
6
+ from PIL import Image
7
+
8
+ # import clip
9
+
10
+
11
+ # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
12
+
13
+ # import gradio as gr
14
+ # from dalle.models import Dalle
15
+ # from dalle.utils.utils import clip_score, set_seed
16
+
17
+
18
+ device = "cpu"
19
+ # model = Dalle.from_pretrained("minDALL-E/1.3B") # This will automatically download the pretrained model.
20
+ # model.to(device=device)
21
+
22
+ # model_clip, preprocess_clip = clip.load("ViT-B/32", device=device)
23
+ # model_clip.to(device=device)
24
+
25
+
26
+ # def sample(prompt):
27
+ # # Sampling
28
+ # images = (
29
+ # model.sampling(prompt=prompt, top_k=256, top_p=None, softmax_temperature=1.0, num_candidates=3, device=device)
30
+ # .cpu()
31
+ # .numpy()
32
+ # )
33
+ # images = np.transpose(images, (0, 2, 3, 1))
34
+
35
+ # # CLIP Re-ranking
36
+ # rank = clip_score(
37
+ # prompt=prompt, images=images, model_clip=model_clip, preprocess_clip=preprocess_clip, device=device
38
+ # )
39
+
40
+ # # Save images
41
+ # images = images[rank]
42
+ # # print(rank, images.shape)
43
+ # pil_images = []
44
+ # for i in range(len(images)):
45
+ # im = Image.fromarray((images[i] * 255).astype(np.uint8))
46
+ # pil_images.append(im)
47
+
48
+ # # im = Image.fromarray((images[0] * 255).astype(np.uint8))
49
+ # return pil_images
50
+
51
+
52
+ # title = "Interactive demo: ImageGPT"
53
+ # description = "Demo for OpenAI's ImageGPT: Generative Pretraining from Pixels. To use it, simply upload an image or use the example image below and click 'submit'. Results will show up in a few seconds."
54
+ # article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2109.10282'>ImageGPT: Generative Pretraining from Pixels</a> | <a href='https://openai.com/blog/image-gpt/'>Official blog</a></p>"
55
+
56
+ # iface = gr.Interface(
57
+ # fn=sample,
58
+ # inputs=[gr.inputs.Textbox(label="What would you like to see?")],
59
+ # outputs=gr.outputs.Image(type="pil", label="Model input + completions"),
60
+ # title=title,
61
+ # description=description,
62
+ # article=article,
63
+ # #examples=examples,
64
+ # enable_queue=True,
65
+ # )
66
+ # iface.launch(debug=True)
67
+
68
+ #!/usr/bin/env python
69
+ # coding: utf-8
70
+
71
+
72
+ st.sidebar.markdown(
73
+ """
74
+ <style>
75
+ .aligncenter {
76
+ text-align: center;
77
+ }
78
+ </style>
79
+ <p class="aligncenter">
80
+ <img src="https://raw.githubusercontent.com/borisdayma/dalle-mini/main/img/logo.png"/>
81
+ </p>
82
+ """,
83
+ unsafe_allow_html=True,
84
+ )
85
+ st.sidebar.markdown(
86
+ """
87
+ ___
88
+ <p style='text-align: center'>
89
+ DALL·E mini is an AI model that generates images from any prompt you give!
90
+ </p>
91
+
92
+ <p style='text-align: center'>
93
+ Created by Boris Dayma et al. 2021
94
+ <br/>
95
+ <a href="https://github.com/borisdayma/dalle-mini" target="_blank">GitHub</a> | <a href="https://wandb.ai/dalle-mini/dalle-mini/reports/DALL-E-mini--Vmlldzo4NjIxODA" target="_blank">Project Report</a>
96
+ </p>
97
+ """,
98
+ unsafe_allow_html=True,
99
+ )
100
+
101
+ st.header("DALL·E mini")
102
+ st.subheader("Generate images from text")
103
+
104
+ prompt = st.text_input("What do you want to see?")
105
+
106
+ DEBUG = False
107
+ # if prompt != "":
108
+ # container = st.empty()
109
+ # container.markdown(
110
+ # f"""
111
+ # <style> p {{ margin:0 }} div {{ margin:0 }} </style>
112
+ # <div data-stale="false" class="element-container css-1e5imcs e1tzin5v1">
113
+ # <div class="stAlert">
114
+ # <div role="alert" data-baseweb="notification" class="st-ae st-af st-ag st-ah st-ai st-aj st-ak st-g3 st-am st-b8 st-ao st-ap st-aq st-ar st-as st-at st-au st-av st-aw st-ax st-ay st-az st-b9 st-b1 st-b2 st-b3 st-b4 st-b5 st-b6">
115
+ # <div class="st-b7">
116
+ # <div class="css-whx05o e13vu3m50">
117
+ # <div data-testid="stMarkdownContainer" class="css-1ekf893 e16nr0p30">
118
+ # <img src="https://raw.githubusercontent.com/borisdayma/dalle-mini/main/app/streamlit/img/loading.gif" width="30"/>
119
+ # Generating predictions for: <b>{prompt}</b>
120
+ # </div>
121
+ # </div>
122
+ # </div>
123
+ # </div>
124
+ # </div>
125
+ # </div>
126
+ # <small><i>Predictions may take up to 40s under high load. Please stand by.</i></small>
127
+ # """,
128
+ # unsafe_allow_html=True,
129
+ # )
130
+
131
+ # print(f"Getting selections: {prompt}")
132
+ # selected = sample(prompt)
133
+
134
+ # margin = 0.1 # for better position of zoom in arrow
135
+ # n_columns = 3
136
+ # cols = st.columns([1] + [margin, 1] * (n_columns - 1))
137
+ # for i, img in enumerate(selected):
138
+ # cols[(i % n_columns) * 2].image(img)
139
+ # container.markdown(f"**{prompt}**")
140
+
141
+ # st.button("Again!", key="again_button")
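For reference, the text-to-image path that app.py keeps commented out boils down to the sketch below. It is not the live code of this Space (which currently only renders the Streamlit placeholder UI above); it assumes the `dalle` package from this repo and OpenAI's `clip` are installed, exactly as the commented block does.

```python
# Sketch of the disabled sampling pipeline in app.py above. Assumptions: the
# `dalle` package and OpenAI CLIP are importable; pretrained weights are
# downloaded on first use by Dalle.from_pretrained.
import numpy as np
from PIL import Image
import clip
from dalle.models import Dalle
from dalle.utils.utils import clip_score

device = "cpu"
model = Dalle.from_pretrained("minDALL-E/1.3B")  # downloads the pretrained model
model.to(device=device)
model_clip, preprocess_clip = clip.load("ViT-B/32", device=device)

def sample(prompt: str, num_candidates: int = 3):
    # Stage 2 token sampling followed by stage 1 decoding; returns [B, H, W, C] in [0, 1].
    images = model.sampling(prompt=prompt, top_k=256,
                            num_candidates=num_candidates, device=device).cpu().numpy()
    images = np.transpose(images, (0, 2, 3, 1))
    # CLIP re-ranking: order candidates by prompt-image similarity.
    rank = clip_score(prompt=prompt, images=images, model_clip=model_clip,
                      preprocess_clip=preprocess_clip, device=device)
    return [Image.fromarray((img * 255).astype(np.uint8)) for img in images[rank]]
```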
configs/dalle-1.3B.yaml ADDED
@@ -0,0 +1,33 @@
+ stage1:
+   type: vqgan
+   embed_dim: 256
+   n_embed: 16384
+   hparams:
+     double_z: False
+     z_channels: 256
+     resolution: 256
+     in_channels: 3
+     out_ch: 3
+     ch: 128
+     ch_mult: [1, 1, 2, 2, 4]
+     num_res_blocks: 2
+     attn_resolutions: [16]
+     pdrop: 0.0
+
+ stage2:
+   type: transformer1d
+   vocab_size_txt: 16384
+   vocab_size_img: 16384
+   hparams:
+     embed_dim: 1536
+     n_layers: 42
+     n_heads: 24
+     n_dense_layers: 42
+     ctx_len_img: 256
+     ctx_len_txt: 64
+     embd_pdrop: 0.0
+     resid_pdrop: 0.0
+     attn_pdrop: 0.0
+     mlp_bias: True
+     attn_bias: True
+     gelu_use_approx: False
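These config files are read with OmegaConf by the model classes in `dalle/models/__init__.py` further down this diff (`Dalle.from_pretrained` merges a base config with the one shipped next to the checkpoint). A minimal sketch of inspecting a config by hand; the file path is illustrative:

```python
# Hedged sketch: load a model config the same way the package does, via OmegaConf.
from omegaconf import OmegaConf

config = OmegaConf.load("configs/dalle-1.3B.yaml")  # path is an assumption for illustration
print(config.stage1.type)              # vqgan
print(config.stage2.hparams.n_layers)  # 42
print(OmegaConf.to_yaml(config.stage2.hparams))
```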
configs/transfer-imagenet-clscond-gen.yaml ADDED
@@ -0,0 +1,49 @@
+ dataset:
+   dataset: imagenet
+   image_resolution: 256
+
+ stage1:
+   type: vqgan
+   embed_dim: 256
+   n_embed: 16384
+   hparams:
+     double_z: False
+     z_channels: 256
+     resolution: 256
+     in_channels: 3
+     out_ch: 3
+     ch: 128
+     ch_mult: [1, 1, 2, 2, 4]
+     num_res_blocks: 2
+     attn_resolutions: [16]
+     pdrop: 0.0
+
+ stage2:
+   type: igpt
+   use_cls_cond: True
+   vocab_size_img: 16384
+   hparams:
+     embed_dim: 1536
+     n_layers: 42
+     n_heads: 24
+     n_dense_layers: 42
+     ctx_len_img: 256
+     embd_pdrop: 0.0
+     resid_pdrop: 0.0
+     attn_pdrop: 0.0
+     mlp_bias: True
+     attn_bias: True
+     gelu_use_approx: False
+     n_classes: 1000
+
+ optimizer:
+   opt_type: adamW
+   base_lr: 1e-4
+   weight_decay: 0.0
+   betas: [0.9, 0.95]
+   grad_clip_norm: 4.0
+
+ experiment:
+   local_batch_size: 2
+   total_batch_size: 512
+   epochs: 8
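In the `experiment` block, `local_batch_size` is the per-device batch while `total_batch_size` is the effective optimization batch, so the gap is presumably covered by data parallelism and/or gradient accumulation. A quick sanity check; the GPU count is an assumption, not part of the config:

```python
# Back-of-the-envelope check of the experiment block above.
local_batch_size = 2      # per-GPU batch from the config
total_batch_size = 512    # effective batch from the config
num_gpus = 8              # assumption: not specified in the config

accum_steps, remainder = divmod(total_batch_size, local_batch_size * num_gpus)
assert remainder == 0, "total batch must be divisible by local batch * GPUs"
print(f"gradient accumulation steps: {accum_steps}")  # 32 with these numbers
```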
configs/transfer-imagenet-uncond-gen.yaml ADDED
@@ -0,0 +1,48 @@
+ dataset:
+   dataset: imagenet
+   image_resolution: 256
+
+ stage1:
+   type: vqgan
+   embed_dim: 256
+   n_embed: 16384
+   hparams:
+     double_z: False
+     z_channels: 256
+     resolution: 256
+     in_channels: 3
+     out_ch: 3
+     ch: 128
+     ch_mult: [1, 1, 2, 2, 4]
+     num_res_blocks: 2
+     attn_resolutions: [16]
+     pdrop: 0.0
+
+ stage2:
+   type: igpt
+   use_cls_cond: False
+   vocab_size_img: 16384
+   hparams:
+     embed_dim: 1536
+     n_layers: 42
+     n_heads: 24
+     n_dense_layers: 42
+     ctx_len_img: 256
+     embd_pdrop: 0.0
+     resid_pdrop: 0.0
+     attn_pdrop: 0.0
+     mlp_bias: True
+     attn_bias: True
+     gelu_use_approx: False
+
+ optimizer:
+   opt_type: adamW
+   base_lr: 1e-4
+   weight_decay: 0.0
+   betas: [0.9, 0.95]
+   grad_clip_norm: 4.0
+
+ experiment:
+   local_batch_size: 2
+   total_batch_size: 512
+   epochs: 8
dalle/__init__.py ADDED
File without changes
dalle/models/__init__.py ADDED
@@ -0,0 +1,202 @@
1
+ # ------------------------------------------------------------------------------------
2
+ # Minimal DALL-E
3
+ # Copyright (c) 2021 KakaoBrain. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------
6
+
7
+ import os
8
+ import torch
9
+ import torch.nn as nn
10
+ import pytorch_lightning as pl
11
+ from typing import Optional, Tuple
12
+ from omegaconf import OmegaConf
13
+ from torch.cuda.amp import autocast
14
+ from torch.optim.lr_scheduler import CosineAnnealingLR
15
+ from torch.nn import functional as F
16
+ from .stage1.vqgan import VQGAN
17
+ from .stage2.transformer import Transformer1d, iGPT
18
+ from .. import utils
19
+ from ..utils.config import get_base_config
20
+ from ..utils.sampling import sampling, sampling_igpt
21
+ from .tokenizer import build_tokenizer
22
+
23
+ _MODELS = {
24
+ 'minDALL-E/1.3B': 'https://arena.kakaocdn.net/brainrepo/models/minDALL-E/57b008f02ceaa02b779c8b7463143315/1.3B.tar.gz'
25
+ }
26
+
27
+
28
+ class Dalle(nn.Module):
29
+ def __init__(self,
30
+ config: OmegaConf) -> None:
31
+ super().__init__()
32
+ self.tokenizer = None
33
+ self.stage1 = VQGAN(n_embed=config.stage1.n_embed,
34
+ embed_dim=config.stage1.embed_dim,
35
+ hparams=config.stage1.hparams)
36
+ self.stage2 = Transformer1d(vocab_size_txt=config.stage2.vocab_size_txt,
37
+ vocab_size_img=config.stage2.vocab_size_img,
38
+ hparams=config.stage2.hparams)
39
+ self.config_stage1 = config.stage1
40
+ self.config_stage2 = config.stage2
41
+ self.config_dataset = config.dataset
42
+
43
+ @classmethod
44
+ def from_pretrained(cls,
45
+ path: str) -> nn.Module:
46
+ path = _MODELS[path] if path in _MODELS else path
47
+ path = utils.realpath_url_or_path(path, root=os.path.expanduser("~/.cache/minDALL-E"))
48
+
49
+ config_base = get_base_config()
50
+ config_new = OmegaConf.load(os.path.join(path, 'config.yaml'))
51
+ config_update = OmegaConf.merge(config_base, config_new)
52
+
53
+ model = cls(config_update)
54
+ model.tokenizer = build_tokenizer(os.path.join(path, 'tokenizer'),
55
+ context_length=model.config_dataset.context_length,
56
+ lowercase=True,
57
+ dropout=None)
58
+ model.stage1.from_ckpt(os.path.join(path, 'stage1_last.ckpt'))
59
+ model.stage2.from_ckpt(os.path.join(path, 'stage2_last.ckpt'))
60
+ return model
61
+
62
+ @torch.no_grad()
63
+ def sampling(self,
64
+ prompt: str,
65
+ top_k: int = 256,
66
+ top_p: Optional[float] = None,
67
+ softmax_temperature: float = 1.0,
68
+ num_candidates: int = 96,
69
+ device: str = 'cuda:0',
70
+ use_fp16: bool = True) -> torch.FloatTensor:
71
+ self.stage1.eval()
72
+ self.stage2.eval()
73
+
74
+ tokens = self.tokenizer.encode(prompt)
75
+ tokens = torch.LongTensor(tokens.ids)
76
+ tokens = torch.repeat_interleave(tokens.unsqueeze(0), num_candidates, dim=0)
77
+
78
+ # Check if the encoding works as intended
79
+ # print(self.tokenizer.decode_batch(tokens.tolist(), skip_special_tokens=True)[0])
80
+
81
+ tokens = tokens.to(device)
82
+ codes = sampling(self.stage2,
83
+ tokens,
84
+ top_k=top_k,
85
+ top_p=top_p,
86
+ softmax_temperature=softmax_temperature,
87
+ use_fp16=use_fp16)
88
+ codes = codes.view(num_candidates, 16, 16) # [B, 16, 16]
89
+ pixels = torch.clamp(self.stage1.decode_code(codes) * 0.5 + 0.5, 0, 1) # [B, 256, 256]
90
+ return pixels
91
+
92
+
93
+ class ImageGPT(pl.LightningModule):
94
+ def __init__(self,
95
+ config: OmegaConf) -> None:
96
+ super().__init__()
97
+ self.stage1 = VQGAN(n_embed=config.stage1.n_embed,
98
+ embed_dim=config.stage1.embed_dim,
99
+ hparams=config.stage1.hparams)
100
+ self.stage2 = iGPT(vocab_size_img=config.stage2.vocab_size_img,
101
+ use_cls_cond=config.stage2.use_cls_cond,
102
+ hparams=config.stage2.hparams)
103
+ self.config = config
104
+ self.use_cls_cond = config.stage2.use_cls_cond
105
+
106
+ # make the parameters in stage 1 not trainable
107
+ self.stage1.eval()
108
+ for p in self.stage1.parameters():
109
+ p.requires_grad = False
110
+
111
+ @classmethod
112
+ def from_pretrained(cls,
113
+ path_upstream: str,
114
+ path_downstream: str) -> Tuple[nn.Module, OmegaConf]:
115
+ config_base = get_base_config(use_default=False)
116
+ config_down = OmegaConf.load(path_downstream)
117
+ config_down = OmegaConf.merge(config_base, config_down)
118
+
119
+ model = cls(config_down)
120
+ model.stage1.from_ckpt(os.path.join(path_upstream, 'stage1_last.ckpt'), strict=True)
121
+ model.stage2.from_ckpt(os.path.join(path_upstream, 'stage2_last.ckpt'), strict=False)
122
+ return model, config_down
123
+
124
+ def sample(self,
125
+ cls_idx: Optional[int] = None,
126
+ top_k: int = 256,
127
+ top_p: Optional[float] = None,
128
+ softmax_temperature: float = 1.0,
129
+ num_candidates: int = 16,
130
+ device: str = 'cuda:0',
131
+ use_fp16: bool = True,
132
+ is_tqdm: bool = True) -> torch.FloatTensor:
133
+ self.stage1.eval()
134
+ self.stage2.eval()
135
+
136
+ if cls_idx is None:
137
+ sos = self.stage2.sos.repeat(num_candidates, 1, 1)
138
+ else:
139
+ sos = torch.LongTensor([cls_idx]).to(device=device)
140
+ sos = sos.repeat(num_candidates)
141
+ sos = self.stage2.sos(sos).unsqueeze(1)
142
+
143
+ codes = sampling_igpt(self.stage2,
144
+ sos=sos,
145
+ top_k=top_k,
146
+ top_p=top_p,
147
+ softmax_temperature=softmax_temperature,
148
+ use_fp16=use_fp16,
149
+ is_tqdm=is_tqdm)
150
+ codes = codes.view(num_candidates, 16, 16) # [B, 16, 16]
151
+ pixels = torch.clamp(self.stage1.decode_code(codes) * 0.5 + 0.5, 0, 1) # [B, 256, 256]
152
+ return pixels
153
+
154
+ def forward(self,
155
+ images: torch.FloatTensor,
156
+ labels: Optional[torch.LongTensor] = None) -> torch.FloatTensor:
157
+ B, C, H, W = images.shape
158
+ with torch.no_grad():
159
+ with autocast(enabled=False):
160
+ codes = self.stage1.get_codes(images).detach()
161
+ logits = self.stage2(codes, labels)
162
+ return logits, codes
163
+
164
+ def training_step(self, batch, batch_idx):
165
+ images, labels = batch
166
+ logits, codes = self(images, labels=labels if self.use_cls_cond else None)
167
+ loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), codes.view(-1))
168
+ self.log("train/loss", loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
169
+ return loss
170
+
171
+ def validation_step(self, batch, batch_idx):
172
+ images, labels = batch
173
+ logits, codes = self(images, labels=labels if self.use_cls_cond else None)
174
+ loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), codes.view(-1))
175
+ self.log("val/loss", loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
176
+ return loss
177
+
178
+ def configure_optimizers(self):
179
+ assert self.config.optimizer.opt_type == 'adamW'
180
+ assert self.config.optimizer.sched_type == 'cosine'
181
+
182
+ opt = torch.optim.AdamW(self.parameters(),
183
+ lr=self.config.optimizer.base_lr,
184
+ betas=self.config.optimizer.betas,
185
+ weight_decay=self.config.optimizer.weight_decay)
186
+ sched = CosineAnnealingLR(opt,
187
+ T_max=self.config.optimizer.max_steps,
188
+ eta_min=self.config.optimizer.min_lr)
189
+ sched = {
190
+ 'scheduler': sched,
191
+ 'name': 'cosine'
192
+ }
193
+ return [opt], [sched]
194
+
195
+ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure,
196
+ on_tpu=False, using_native_amp=False, using_lbfgs=False):
197
+ optimizer.step(closure=optimizer_closure)
198
+ self.lr_schedulers().step()
199
+ self.log("lr", self.lr_schedulers().get_last_lr()[0], on_step=True, on_epoch=False, prog_bar=True, logger=True)
200
+
201
+ def on_epoch_start(self):
202
+ self.stage1.eval()
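Taken together, the two classes above are the public entry points of the package: `Dalle.from_pretrained(...).sampling(...)` for text-to-image generation (used by the commented-out code in app.py) and `ImageGPT.from_pretrained(path_upstream, path_downstream)` plus `sample(...)` for the ImageNet transfer configs. A hedged usage sketch for the class-conditional case, following the signatures defined above; the checkpoint directory is a placeholder, not something shipped with this Space:

```python
# Sketch only: class-conditional sampling with ImageGPT, wiring together the
# methods defined in dalle/models/__init__.py. Paths are placeholders and a
# CUDA device is assumed.
import torch
from dalle.models import ImageGPT

model, config = ImageGPT.from_pretrained(
    path_upstream="/path/to/minDALL-E/1.3B",  # placeholder: dir with stage1_last.ckpt / stage2_last.ckpt
    path_downstream="configs/transfer-imagenet-clscond-gen.yaml",
)
n_classes = config.stage2.hparams.n_classes   # 1000 for the class-conditional config
model = model.to("cuda:0")
model.eval()

with torch.no_grad():
    pixels = model.sample(cls_idx=0,           # any index in [0, n_classes); 0 is just an example
                          top_k=256,
                          num_candidates=4,
                          device="cuda:0",
                          use_fp16=True)
# pixels: 4 candidate images with values in [0, 1], decoded at 256x256 resolution
```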
dalle/models/stage1/layers.py ADDED
@@ -0,0 +1,373 @@
1
+ # ------------------------------------------------------------------------------------
2
+ # Modified from VQGAN (https://github.com/CompVis/taming-transformers)
3
+ # Copyright (c) 2020 Patrick Esser and Robin Rombach and Björn Ommer. All Rights Reserved.
4
+ # ------------------------------------------------------------------------------------
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from typing import Tuple, Optional
9
+
10
+
11
+ def nonlinearity(x):
12
+ # swish
13
+ return x*torch.sigmoid(x)
14
+
15
+
16
+ def Normalize(in_channels):
17
+ return torch.nn.GroupNorm(num_groups=32,
18
+ num_channels=in_channels,
19
+ eps=1e-6,
20
+ affine=True)
21
+
22
+
23
+ class Upsample(nn.Module):
24
+ def __init__(self, in_channels, with_conv):
25
+ super().__init__()
26
+ self.with_conv = with_conv
27
+ if self.with_conv:
28
+ self.conv = torch.nn.Conv2d(in_channels,
29
+ in_channels,
30
+ kernel_size=3,
31
+ stride=1,
32
+ padding=1)
33
+
34
+ def forward(self, x):
35
+ x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
36
+ if self.with_conv:
37
+ x = self.conv(x)
38
+ return x
39
+
40
+
41
+ class Downsample(nn.Module):
42
+ def __init__(self, in_channels, with_conv):
43
+ super().__init__()
44
+ self.with_conv = with_conv
45
+ if self.with_conv:
46
+ # no asymmetric padding in torch conv, must do it ourselves
47
+ self.conv = torch.nn.Conv2d(in_channels,
48
+ in_channels,
49
+ kernel_size=3,
50
+ stride=2,
51
+ padding=0)
52
+
53
+ def forward(self, x):
54
+ if self.with_conv:
55
+ pad = (0, 1, 0, 1)
56
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
57
+ x = self.conv(x)
58
+ else:
59
+ x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
60
+ return x
61
+
62
+
63
+ class ResnetBlock(nn.Module):
64
+ def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
65
+ dropout, temb_channels=512):
66
+ assert temb_channels == 0
67
+ super().__init__()
68
+ self.in_channels = in_channels
69
+ out_channels = in_channels if out_channels is None else out_channels
70
+ self.out_channels = out_channels
71
+ self.use_conv_shortcut = conv_shortcut
72
+
73
+ self.norm1 = Normalize(in_channels)
74
+ self.conv1 = torch.nn.Conv2d(in_channels,
75
+ out_channels,
76
+ kernel_size=3,
77
+ stride=1,
78
+ padding=1)
79
+ self.norm2 = Normalize(out_channels)
80
+ self.dropout = torch.nn.Dropout(dropout)
81
+ self.conv2 = torch.nn.Conv2d(out_channels,
82
+ out_channels,
83
+ kernel_size=3,
84
+ stride=1,
85
+ padding=1)
86
+ if self.in_channels != self.out_channels:
87
+ if self.use_conv_shortcut:
88
+ self.conv_shortcut = torch.nn.Conv2d(in_channels,
89
+ out_channels,
90
+ kernel_size=3,
91
+ stride=1,
92
+ padding=1)
93
+ else:
94
+ self.nin_shortcut = torch.nn.Conv2d(in_channels,
95
+ out_channels,
96
+ kernel_size=1,
97
+ stride=1,
98
+ padding=0)
99
+
100
+ def forward(self, x, temb=None):
101
+ assert temb is None
102
+
103
+ h = x
104
+ h = self.norm1(h)
105
+ h = nonlinearity(h)
106
+ h = self.conv1(h)
107
+
108
+ h = self.norm2(h)
109
+ h = nonlinearity(h)
110
+ h = self.dropout(h)
111
+ h = self.conv2(h)
112
+
113
+ if self.in_channels != self.out_channels:
114
+ if self.use_conv_shortcut:
115
+ x = self.conv_shortcut(x)
116
+ else:
117
+ x = self.nin_shortcut(x)
118
+ return x+h
119
+
120
+
121
+ class AttnBlock(nn.Module):
122
+ def __init__(self, in_channels):
123
+ super().__init__()
124
+ self.in_channels = in_channels
125
+
126
+ self.norm = Normalize(in_channels)
127
+ self.q = torch.nn.Conv2d(in_channels,
128
+ in_channels,
129
+ kernel_size=1,
130
+ stride=1,
131
+ padding=0)
132
+ self.k = torch.nn.Conv2d(in_channels,
133
+ in_channels,
134
+ kernel_size=1,
135
+ stride=1,
136
+ padding=0)
137
+ self.v = torch.nn.Conv2d(in_channels,
138
+ in_channels,
139
+ kernel_size=1,
140
+ stride=1,
141
+ padding=0)
142
+ self.proj_out = torch.nn.Conv2d(in_channels,
143
+ in_channels,
144
+ kernel_size=1,
145
+ stride=1,
146
+ padding=0)
147
+
148
+ def forward(self, x):
149
+ h_ = x
150
+ h_ = self.norm(h_)
151
+ q = self.q(h_)
152
+ k = self.k(h_)
153
+ v = self.v(h_)
154
+
155
+ # compute attention
156
+ b, c, h, w = q.shape
157
+ q = q.reshape(b, c, h*w)
158
+ q = q.permute(0, 2, 1) # b,hw,c
159
+ k = k.reshape(b, c, h*w) # b,c,hw
160
+ w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
161
+ w_ = w_ * (int(c)**(-0.5))
162
+ w_ = torch.nn.functional.softmax(w_, dim=2)
163
+
164
+ # attend to values
165
+ v = v.reshape(b, c, h*w)
166
+ w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q)
167
+ h_ = torch.bmm(v, w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
168
+ h_ = h_.reshape(b, c, h, w)
169
+
170
+ h_ = self.proj_out(h_)
171
+ return x+h_
172
+
173
+
174
+ class Encoder(nn.Module):
175
+ def __init__(self,
176
+ *, # forced to use named arguments
177
+ ch: int,
178
+ out_ch: int,
179
+ ch_mult: Tuple[int] = (1, 2, 4, 8),
180
+ num_res_blocks: int,
181
+ attn_resolutions: Tuple[int],
182
+ pdrop: float = 0.0,
183
+ resamp_with_conv: bool = True,
184
+ in_channels: int,
185
+ resolution: int,
186
+ z_channels: int,
187
+ double_z: Optional[bool] = None) -> None:
188
+ super().__init__()
189
+ self.ch = ch
190
+ self.temb_ch = 0
191
+ self.num_resolutions = len(ch_mult)
192
+ self.num_res_blocks = num_res_blocks
193
+ self.resolution = resolution
194
+ self.in_channels = in_channels
195
+
196
+ # downsampling
197
+ self.conv_in = torch.nn.Conv2d(in_channels,
198
+ self.ch,
199
+ kernel_size=3,
200
+ stride=1,
201
+ padding=1)
202
+
203
+ curr_res = resolution
204
+ in_ch_mult = (1,)+tuple(ch_mult)
205
+ self.down = nn.ModuleList()
206
+ for i_level in range(self.num_resolutions):
207
+ block = nn.ModuleList()
208
+ attn = nn.ModuleList()
209
+ block_in = ch*in_ch_mult[i_level]
210
+ block_out = ch*ch_mult[i_level]
211
+ for i_block in range(self.num_res_blocks):
212
+ block.append(ResnetBlock(in_channels=block_in,
213
+ out_channels=block_out,
214
+ temb_channels=self.temb_ch,
215
+ dropout=pdrop))
216
+ block_in = block_out
217
+ if curr_res in attn_resolutions:
218
+ attn.append(AttnBlock(block_in))
219
+ down = nn.Module()
220
+ down.block = block
221
+ down.attn = attn
222
+ if i_level != self.num_resolutions-1:
223
+ down.downsample = Downsample(block_in, resamp_with_conv)
224
+ curr_res = curr_res // 2
225
+ self.down.append(down)
226
+
227
+ # middle
228
+ self.mid = nn.Module()
229
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
230
+ out_channels=block_in,
231
+ temb_channels=self.temb_ch,
232
+ dropout=pdrop)
233
+ self.mid.attn_1 = AttnBlock(block_in)
234
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
235
+ out_channels=block_in,
236
+ temb_channels=self.temb_ch,
237
+ dropout=pdrop)
238
+
239
+ # end
240
+ self.norm_out = Normalize(block_in)
241
+ self.conv_out = torch.nn.Conv2d(block_in,
242
+ 2*z_channels if double_z else z_channels,
243
+ kernel_size=3,
244
+ stride=1,
245
+ padding=1)
246
+
247
+ def forward(self, x):
248
+ assert x.shape[2] == x.shape[3] == self.resolution, \
249
+ "{}, {}".format(x.shape, self.resolution)
250
+
251
+ # downsampling
252
+ h = self.conv_in(x)
253
+ for i_level in range(self.num_resolutions):
254
+ for i_block in range(self.num_res_blocks):
255
+ h = self.down[i_level].block[i_block](h)
256
+ if len(self.down[i_level].attn) > 0:
257
+ h = self.down[i_level].attn[i_block](h)
258
+ if i_level != self.num_resolutions-1:
259
+ h = self.down[i_level].downsample(h)
260
+
261
+ # middle
262
+ h = self.mid.block_1(h)
263
+ h = self.mid.attn_1(h)
264
+ h = self.mid.block_2(h)
265
+
266
+ # end
267
+ h = self.norm_out(h)
268
+ h = nonlinearity(h)
269
+ h = self.conv_out(h)
270
+ return h
271
+
272
+
273
+ class Decoder(nn.Module):
274
+ def __init__(self,
275
+ *, # forced to use named arguments
276
+ ch: int,
277
+ out_ch: int,
278
+ ch_mult: Tuple[int] = (1, 2, 4, 8),
279
+ num_res_blocks: int,
280
+ attn_resolutions: Tuple[int],
281
+ pdrop: float = 0.0,
282
+ resamp_with_conv: bool = True,
283
+ in_channels: int,
284
+ resolution: int,
285
+ z_channels: int,
286
+ double_z: bool) -> None:
287
+ super().__init__()
288
+ self.ch = ch
289
+ self.temb_ch = 0
290
+ self.num_resolutions = len(ch_mult)
291
+ self.num_res_blocks = num_res_blocks
292
+ self.resolution = resolution
293
+ self.in_channels = in_channels
294
+
295
+ # compute in_ch_mult, block_in and curr_res at lowest res
296
+ block_in = ch*ch_mult[self.num_resolutions-1]
297
+ curr_res = resolution // 2**(self.num_resolutions-1)
298
+ self.z_shape = (1, z_channels, curr_res, curr_res)
299
+
300
+ # z to block_in
301
+ self.conv_in = torch.nn.Conv2d(z_channels,
302
+ block_in,
303
+ kernel_size=3,
304
+ stride=1,
305
+ padding=1)
306
+
307
+ # middle
308
+ self.mid = nn.Module()
309
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
310
+ out_channels=block_in,
311
+ temb_channels=self.temb_ch,
312
+ dropout=pdrop)
313
+ self.mid.attn_1 = AttnBlock(block_in)
314
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
315
+ out_channels=block_in,
316
+ temb_channels=self.temb_ch,
317
+ dropout=pdrop)
318
+
319
+ # upsampling
320
+ self.up = nn.ModuleList()
321
+ for i_level in reversed(range(self.num_resolutions)):
322
+ block = nn.ModuleList()
323
+ attn = nn.ModuleList()
324
+ block_out = ch*ch_mult[i_level]
325
+ for i_block in range(self.num_res_blocks+1):
326
+ block.append(ResnetBlock(in_channels=block_in,
327
+ out_channels=block_out,
328
+ temb_channels=self.temb_ch,
329
+ dropout=pdrop))
330
+ block_in = block_out
331
+ if curr_res in attn_resolutions:
332
+ attn.append(AttnBlock(block_in))
333
+ up = nn.Module()
334
+ up.block = block
335
+ up.attn = attn
336
+ if i_level != 0:
337
+ up.upsample = Upsample(block_in, resamp_with_conv)
338
+ curr_res = curr_res * 2
339
+ self.up.insert(0, up) # prepend to get consistent order
340
+
341
+ # end
342
+ self.norm_out = Normalize(block_in)
343
+ self.conv_out = torch.nn.Conv2d(block_in,
344
+ out_ch,
345
+ kernel_size=3,
346
+ stride=1,
347
+ padding=1)
348
+
349
+ def forward(self, z):
350
+ assert z.shape[1:] == self.z_shape[1:]
351
+ self.last_z_shape = z.shape
352
+
353
+ # z to block_in
354
+ h = self.conv_in(z)
355
+
356
+ # middle
357
+ h = self.mid.block_1(h)
358
+ h = self.mid.attn_1(h)
359
+ h = self.mid.block_2(h)
360
+
361
+ # upsampling
362
+ for i_level in reversed(range(self.num_resolutions)):
363
+ for i_block in range(self.num_res_blocks+1):
364
+ h = self.up[i_level].block[i_block](h)
365
+ if len(self.up[i_level].attn) > 0:
366
+ h = self.up[i_level].attn[i_block](h)
367
+ if i_level != 0:
368
+ h = self.up[i_level].upsample(h)
369
+
370
+ h = self.norm_out(h)
371
+ h = nonlinearity(h)
372
+ h = self.conv_out(h)
373
+ return h
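For orientation, here is a minimal shape-check sketch of the Encoder/Decoder pair defined above. It is not part of the commit: it assumes the repository root is on PYTHONPATH and reuses the Stage1Hparams defaults declared later in this commit in dalle/utils/config.py (ch=128, ch_mult=[1, 1, 2, 2, 4], attn_resolutions=[16]). With five resolution levels the encoder halves the spatial size four times, so a 256x256 image maps to a 16x16 latent grid.

import torch
from dalle.models.stage1.layers import Encoder, Decoder

hparams = dict(double_z=False, z_channels=256, resolution=256, in_channels=3,
               out_ch=3, ch=128, ch_mult=[1, 1, 2, 2, 4], num_res_blocks=2,
               attn_resolutions=[16], pdrop=0.0)

enc = Encoder(**hparams)          # 4 Downsample stages: 256 -> 128 -> 64 -> 32 -> 16
dec = Decoder(**hparams)          # mirror image: 4 Upsample stages back to 256

x = torch.randn(1, 3, 256, 256)   # dummy RGB image standing in for a real input
z = enc(x)
print(z.shape)                    # torch.Size([1, 256, 16, 16]): z_channels at 16x16
print(dec(z).shape)               # torch.Size([1, 3, 256, 256])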
dalle/models/stage1/vqgan.py ADDED
@@ -0,0 +1,93 @@
1
+ # ------------------------------------------------------------------------------------
2
+ # Modified from VQGAN (https://github.com/CompVis/taming-transformers)
3
+ # Copyright (c) 2020 Patrick Esser and Robin Rombach and Björn Ommer. All Rights Reserved.
4
+ # ------------------------------------------------------------------------------------
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from typing import List, Tuple, Optional
9
+ from einops import rearrange
10
+ from omegaconf import OmegaConf
11
+ from .layers import Encoder, Decoder
12
+
13
+
14
+ class VectorQuantizer(nn.Module):
15
+ """
16
+ Simplified VectorQuantizer in the original VQGAN repository
17
+ by removing unnecessary modules for sampling
18
+ """
19
+ def __init__(self, dim: int, n_embed: int, beta: float) -> None:
20
+ super().__init__()
21
+ self.n_embed = n_embed
22
+ self.dim = dim
23
+ self.beta = beta
24
+
25
+ self.embedding = nn.Embedding(self.n_embed, self.dim)
26
+ self.embedding.weight.data.uniform_(-1.0 / self.n_embed, 1.0 / self.n_embed)
27
+
28
+ def forward(self,
29
+ z: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.LongTensor]:
30
+ z = rearrange(z, 'b c h w -> b h w c').contiguous() # [B,C,H,W] -> [B,H,W,C]
31
+ z_flattened = z.view(-1, self.dim)
32
+
33
+ d = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + \
34
+ torch.sum(self.embedding.weight**2, dim=1) - 2 * \
35
+ torch.einsum('bd,dn->bn', z_flattened, rearrange(self.embedding.weight, 'n d -> d n'))
36
+
37
+ min_encoding_indices = torch.argmin(d, dim=1)
38
+ z_q = self.embedding(min_encoding_indices).view(z.shape)
39
+ return z_q, min_encoding_indices
40
+
41
+ def get_codebook_entry(self,
42
+ indices: torch.LongTensor,
43
+ shape: Optional[List[int]] = None) -> torch.FloatTensor:
44
+ z_q = self.embedding(indices)
45
+ if shape is not None:
46
+ z_q = z_q.view(shape)
47
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
48
+ return z_q
49
+
50
+
51
+ class VQGAN(nn.Module):
52
+ def __init__(self, n_embed: int, embed_dim: int, hparams: OmegaConf) -> None:
53
+ super().__init__()
54
+ self.encoder = Encoder(**hparams)
55
+ self.decoder = Decoder(**hparams)
56
+ self.quantize = VectorQuantizer(dim=embed_dim, n_embed=n_embed, beta=0.25)
57
+ self.quant_conv = torch.nn.Conv2d(hparams.z_channels, embed_dim, 1)
58
+ self.post_quant_conv = torch.nn.Conv2d(embed_dim, hparams.z_channels, 1)
59
+ self.latent_dim = hparams.attn_resolutions[0]
60
+
61
+ def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
62
+ quant = self.encode(x)
63
+ dec = self.decode(quant)
64
+ return dec
65
+
66
+ def encode(self, x: torch.FloatTensor) -> torch.FloatTensor:
67
+ h = self.encoder(x)
68
+ h = self.quant_conv(h)
69
+ quant = self.quantize(h)[0]
70
+ quant = rearrange(quant, 'b h w c -> b c h w').contiguous()
71
+ return quant
72
+
73
+ def decode(self, quant: torch.FloatTensor) -> torch.FloatTensor:
74
+ quant = self.post_quant_conv(quant)
75
+ dec = self.decoder(quant)
76
+ return dec
77
+
78
+ def decode_code(self, code: torch.LongTensor) -> torch.FloatTensor:
79
+ quant = self.quantize.get_codebook_entry(code)
80
+ quant = quant.permute(0, 3, 1, 2)
81
+ dec = self.decode(quant)
82
+ return dec
83
+
84
+ def get_codes(self, x: torch.FloatTensor) -> torch.LongTensor:
85
+ h = self.encoder(x)
86
+ h = self.quant_conv(h)
87
+ codes = self.quantize(h)[1].view(x.shape[0], self.latent_dim ** 2)
88
+ return codes
89
+
90
+ def from_ckpt(self, path: str, strict: bool = True) -> None:
91
+ ckpt = torch.load(path, map_location='cpu')['state_dict']
92
+ self.load_state_dict(ckpt, strict=strict)
93
+ print(f'{path} successfully restored..')
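As a hedged usage sketch (random weights, no checkpoint loaded), the class above can be exercised end to end to see the code and latent shapes: get_codes turns an image into a flat sequence of 16*16 = 256 codebook indices, and decode_code maps a (B, 16, 16) grid of indices back to pixel space. The hyperparameters mirror Stage1Hparams from dalle/utils/config.py.

import torch
from omegaconf import OmegaConf
from dalle.models.stage1.vqgan import VQGAN

hparams = OmegaConf.create({'double_z': False, 'z_channels': 256, 'resolution': 256,
                            'in_channels': 3, 'out_ch': 3, 'ch': 128,
                            'ch_mult': [1, 1, 2, 2, 4], 'num_res_blocks': 2,
                            'attn_resolutions': [16], 'pdrop': 0.0})
model = VQGAN(n_embed=16384, embed_dim=256, hparams=hparams).eval()

x = torch.randn(1, 3, 256, 256)
codes = model.get_codes(x)                     # (1, 256) discrete codebook indices
print(codes.shape, model.latent_dim)           # torch.Size([1, 256]) 16
recon = model.decode_code(codes.view(1, 16, 16))
print(recon.shape)                             # torch.Size([1, 3, 256, 256])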
dalle/models/stage2/layers.py ADDED
@@ -0,0 +1,140 @@
1
+ # ------------------------------------------------------------------------------------
2
+ # Minimal DALL-E
3
+ # Copyright (c) 2021 KakaoBrain. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------
6
+ # Modified from minGPT (https://github.com/karpathy/minGPT)
7
+ # Copyright (c) 2020 Andrej Karpathy. All Rights Reserved.
8
+ # ------------------------------------------------------------------------------------
9
+
10
+ import math
11
+ import torch
12
+ import torch.nn as nn
13
+ from torch.nn import functional as F
14
+
15
+
16
+ class GELU(nn.Module):
17
+ def __init__(self, use_approx=False):
18
+ super().__init__()
19
+ self.use_approx = use_approx
20
+
21
+ def forward(self, x):
22
+ if self.use_approx:
23
+ return x * torch.sigmoid(1.702 * x)
24
+ else:
25
+ return F.gelu(x)
26
+
27
+
28
+ class MultiHeadSelfAttention(nn.Module):
29
+
30
+ def __init__(self,
31
+ ctx_len: int,
32
+ embed_dim: int,
33
+ n_heads: int,
34
+ resid_pdrop: float,
35
+ attn_pdrop: float,
36
+ attn_bias: bool,
37
+ use_mask: bool = True):
38
+ super().__init__()
39
+ assert embed_dim % n_heads == 0
40
+
41
+ # key, query, value projections for all heads
42
+ self.key = nn.Linear(embed_dim, embed_dim, bias=attn_bias)
43
+ self.query = nn.Linear(embed_dim, embed_dim, bias=attn_bias)
44
+ self.value = nn.Linear(embed_dim, embed_dim, bias=attn_bias)
45
+
46
+ # regularization
47
+ self.attn_drop = nn.Dropout(attn_pdrop)
48
+ self.resid_drop = nn.Dropout(resid_pdrop)
49
+
50
+ # output projection
51
+ self.proj = nn.Linear(embed_dim, embed_dim, attn_bias)
52
+
53
+ self.n_heads = n_heads
54
+ self.ctx_len = ctx_len
55
+ self.use_mask = use_mask
56
+ if self.use_mask:
57
+ self.register_buffer("mask", torch.ones(ctx_len, ctx_len), persistent=False)
58
+ self.mask = torch.tril(self.mask).view(1, ctx_len, ctx_len)
59
+
60
+ def forward(self, x, use_cache=False, layer_past=None):
61
+ B, T, C = x.shape
62
+ x = x.transpose(0, 1).contiguous() # (B, T, C) -> (T, B, C)
63
+
64
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
65
+ k = self.key(x).view(T, B*self.n_heads, C//self.n_heads).transpose(0, 1) # (B*nh, T, hs)
66
+ q = self.query(x).view(T, B*self.n_heads, C//self.n_heads).transpose(0, 1) # (B*nh, T, hs)
67
+ v = self.value(x).view(T, B*self.n_heads, C//self.n_heads).transpose(0, 1) # (B*nh, T, hs)
68
+
69
+ if use_cache:
70
+ present = torch.stack([k, v])
71
+
72
+ if layer_past is not None:
73
+ past_key, past_value = layer_past
74
+ k = torch.cat([past_key, k], dim=-2)
75
+ v = torch.cat([past_value, v], dim=-2)
76
+
77
+ if use_cache and layer_past is not None:
78
+ # Tensor shape below: (B * nh, 1, hs) X (B * nh, hs, K) -> (B * nh, 1, K)
79
+ att = torch.bmm(q, (k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))))
80
+ att = F.softmax(att, dim=-1)
81
+ att = self.attn_drop(att)
82
+ y = torch.bmm(att, v) # (B*nh, 1, K) X (B*nh, K, hs) -> (B*nh, 1, hs)
83
+ else:
84
+ # Tensor shape below: (B * nh, T, hs) X (B * nh, hs, T) -> (B * nh, T, T)
85
+ att = torch.bmm(q, (k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))))
86
+ if self.use_mask:
87
+ mask = self.mask if T == self.ctx_len else self.mask[:, :T, :T]
88
+ att = att.masked_fill(mask == 0, float('-inf'))
89
+ att = F.softmax(att, dim=-1)
90
+ att = self.attn_drop(att)
91
+ y = torch.bmm(att, v) # (B*nh, T, T) X (B*nh, T, hs) -> (B*nh, T, hs)
92
+ y = y.transpose(0, 1).contiguous().view(T, B, C) # re-assemble all head outputs side by side
93
+
94
+ # output projection
95
+ y = self.resid_drop(self.proj(y))
96
+ if use_cache:
97
+ return y.transpose(0, 1).contiguous(), present # (T, B, C) -> (B, T, C)
98
+ else:
99
+ return y.transpose(0, 1).contiguous() # (T, B, C) -> (B, T, C)
100
+
101
+
102
+ class Block(nn.Module):
103
+
104
+ def __init__(self,
105
+ ctx_len: int,
106
+ embed_dim: int,
107
+ n_heads: int,
108
+ mlp_bias: bool,
109
+ attn_bias: bool,
110
+ resid_pdrop: float,
111
+ attn_pdrop: float,
112
+ gelu_use_approx: bool):
113
+ super().__init__()
114
+ self.ln1 = nn.LayerNorm(embed_dim)
115
+ self.ln2 = nn.LayerNorm(embed_dim)
116
+
117
+ self.attn = MultiHeadSelfAttention(ctx_len=ctx_len,
118
+ embed_dim=embed_dim,
119
+ n_heads=n_heads,
120
+ attn_pdrop=attn_pdrop,
121
+ resid_pdrop=resid_pdrop,
122
+ attn_bias=attn_bias,
123
+ use_mask=True)
124
+ self.mlp = nn.Sequential(
125
+ nn.Linear(embed_dim, 4 * embed_dim, bias=mlp_bias),
126
+ GELU(gelu_use_approx),
127
+ nn.Linear(4 * embed_dim, embed_dim, bias=mlp_bias),
128
+ nn.Dropout(resid_pdrop),
129
+ )
130
+
131
+ def forward(self, x):
132
+ x = x + self.attn(self.ln1(x))
133
+ x = x + self.mlp(self.ln2(x))
134
+ return x
135
+
136
+ def sample(self, x, layer_past=None):
137
+ attn, present = self.attn(self.ln1(x), use_cache=True, layer_past=layer_past)
138
+ x = x + attn
139
+ x = x + self.mlp(self.ln2(x))
140
+ return x, present
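A small self-contained check of the block above (a sketch, not part of the commit): with dropout at zero and the module in eval mode, running a sequence through Block.forward in one shot and running it token by token through Block.sample with the key/value cache should give the same output, because the causal mask in the full pass and the growing cache in the incremental pass describe the same dependency structure. Note that `present` holds only the keys/values of the tokens passed in the current call, so the caller concatenates it onto the cache along the time axis, exactly as dalle/utils/sampling.py later does.

import torch
from dalle.models.stage2.layers import Block

torch.manual_seed(0)
blk = Block(ctx_len=8, embed_dim=32, n_heads=4, mlp_bias=True, attn_bias=True,
            resid_pdrop=0.0, attn_pdrop=0.0, gelu_use_approx=False).eval()

x = torch.randn(1, 8, 32)
full = blk(x)                                     # one pass, causal mask applied inside

out, past = blk.sample(x[:, :1])                  # prime the cache with the first token
outs = [out]
for t in range(1, 8):
    out, present = blk.sample(x[:, t:t+1], layer_past=past)
    past = torch.cat([past, present], dim=-2)     # grow cached keys/values over time
    outs.append(out)
incremental = torch.cat(outs, dim=1)

print(torch.allclose(full, incremental, atol=1e-5))   # expected: True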
dalle/models/stage2/transformer.py ADDED
@@ -0,0 +1,255 @@
1
+ # ------------------------------------------------------------------------------------
2
+ # Minimal DALL-E
3
+ # Copyright (c) 2021 KakaoBrain. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------
6
+ # Modified from minGPT (https://github.com/karpathy/minGPT)
7
+ # Copyright (c) 2020 Andrej Karpathy. All Rights Reserved.
8
+ # ------------------------------------------------------------------------------------
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from typing import Optional, Tuple, List
13
+ from torch.cuda.amp import autocast
14
+ from omegaconf import OmegaConf
15
+ from .layers import Block
16
+
17
+
18
+ class Transformer1d(nn.Module):
19
+
20
+ def __init__(self,
21
+ vocab_size_txt: int,
22
+ vocab_size_img: int,
23
+ hparams: OmegaConf) -> None:
24
+ super().__init__()
25
+ assert hparams.n_layers == hparams.n_dense_layers
26
+
27
+ # input embedding for image and text
28
+ self.tok_emb_img = nn.Embedding(vocab_size_img, hparams.embed_dim)
29
+ self.tok_emb_txt = nn.Embedding(vocab_size_txt, hparams.embed_dim)
30
+
31
+ self.pos_emb_img = nn.Embedding(hparams.ctx_len_img, hparams.embed_dim)
32
+ self.pos_emb_txt = nn.Embedding(hparams.ctx_len_txt, hparams.embed_dim)
33
+
34
+ self.drop = nn.Dropout(hparams.embd_pdrop)
35
+
36
+ # transformer blocks
37
+ self.blocks = [Block(ctx_len=hparams.ctx_len_img + hparams.ctx_len_txt,
38
+ embed_dim=hparams.embed_dim,
39
+ n_heads=hparams.n_heads,
40
+ mlp_bias=hparams.mlp_bias,
41
+ attn_bias=hparams.attn_bias,
42
+ resid_pdrop=hparams.resid_pdrop,
43
+ attn_pdrop=hparams.attn_pdrop,
44
+ gelu_use_approx=hparams.gelu_use_approx) for i in range(1, hparams.n_layers+1)]
45
+ self.blocks = nn.Sequential(*self.blocks)
46
+
47
+ # heads for image and text
48
+ self.ln_f = nn.LayerNorm(hparams.embed_dim)
49
+ self.head_img = nn.Linear(hparams.embed_dim, vocab_size_img, bias=False)
50
+ self.head_txt = nn.Linear(hparams.embed_dim, vocab_size_txt, bias=False)
51
+
52
+ self.ctx_len_img = hparams.ctx_len_img
53
+ self.ctx_len_txt = hparams.ctx_len_txt
54
+ self.n_layers = hparams.n_layers
55
+
56
+ self.apply(self._init_weights)
57
+
58
+ def _init_weights(self, module: nn.Module) -> None:
59
+ if isinstance(module, (nn.Linear, nn.Embedding)):
60
+ module.weight.data.normal_(mean=0.0, std=0.02)
61
+ if isinstance(module, nn.Linear) and module.bias is not None:
62
+ module.bias.data.zero_()
63
+ elif isinstance(module, nn.LayerNorm):
64
+ module.bias.data.zero_()
65
+ module.weight.data.fill_(1.0)
66
+
67
+ def forward(self,
68
+ images: torch.LongTensor,
69
+ texts: torch.LongTensor,
70
+ pos_images: torch.LongTensor,
71
+ pos_texts: torch.LongTensor) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
72
+ B, T = images.shape
73
+ _, N = texts.shape
74
+
75
+ assert T <= self.ctx_len_img, "Already reached the maximum context length (image)."
76
+ assert N == self.ctx_len_txt, "Already reached the maximum context length (text)."
77
+
78
+ texts = self.tok_emb_txt(texts)
79
+ images = self.tok_emb_img(images)
80
+
81
+ texts = texts + self.pos_emb_txt(pos_texts)
82
+ images = images + self.pos_emb_img(pos_images)
83
+
84
+ x = torch.cat([texts, images], axis=1).contiguous()
85
+ x = self.drop(x)
86
+ x = self.blocks(x)
87
+ x = self.ln_f(x)
88
+
89
+ texts = x[:, :N-1].contiguous()
90
+ images = x[:, N-1:-1].contiguous()
91
+
92
+ logits_txt = self.head_txt(texts)
93
+ logits_img = self.head_img(images)
94
+ return logits_img, logits_txt
95
+
96
+ @torch.no_grad()
97
+ def sampling(self,
98
+ images: torch.LongTensor,
99
+ texts: torch.LongTensor,
100
+ pos_images: torch.LongTensor,
101
+ pos_texts: torch.LongTensor,
102
+ use_fp16: bool = True,
103
+ past: Optional[List[torch.Tensor]] = None) -> Tuple[torch.FloatTensor, List[torch.FloatTensor]]:
104
+ _, N = texts.shape
105
+ assert N == self.ctx_len_txt, "Already reached the maximum context length (text)."
106
+
107
+ with autocast(enabled=use_fp16):
108
+ if images is None:
109
+ assert past is None
110
+
111
+ texts = self.tok_emb_txt(texts)
112
+ x = texts + self.pos_emb_txt(pos_texts)
113
+ x = self.drop(x)
114
+
115
+ presents = []
116
+ for i, block in enumerate(self.blocks):
117
+ x, present = block.sample(x, layer_past=None)
118
+ presents.append(present)
119
+ x = self.ln_f(x)
120
+ x = x[:, N-1].contiguous()
121
+ logits = self.head_img(x)
122
+ else:
123
+ if past is None:
124
+ texts = self.tok_emb_txt(texts)
125
+ images = self.tok_emb_img(images)
126
+ texts = texts + self.pos_emb_txt(pos_texts)
127
+ images = images + self.pos_emb_img(pos_images)
128
+ x = torch.cat([texts, images], axis=1).contiguous()
129
+ else:
130
+ images = self.tok_emb_img(images)
131
+ x = images + self.pos_emb_img(pos_images)
132
+ x = self.drop(x)
133
+
134
+ if past is not None:
135
+ past = torch.cat(past, dim=-2)
136
+ presents = []
137
+ for i, block in enumerate(self.blocks):
138
+ x, present = block.sample(x, layer_past=None if past is None else past[i])
139
+ presents.append(present)
140
+ x = self.ln_f(x)
141
+ x = x[:, -1].contiguous()
142
+ logits = self.head_img(x)
143
+ return logits, presents
144
+
145
+ def from_ckpt(self, path: str) -> None:
146
+ ckpt = torch.load(path, map_location='cpu')['state_dict']
147
+ self.load_state_dict(ckpt, strict=True)
148
+ print(f'{path} successfully restored..')
149
+
150
+
151
+ class iGPT(nn.Module):
152
+ def __init__(self,
153
+ vocab_size_img: int,
154
+ use_cls_cond: bool,
155
+ hparams: OmegaConf) -> None:
156
+ super().__init__()
157
+ self.use_cls_cond = use_cls_cond
158
+
159
+ # sos token embedding
160
+ if self.use_cls_cond:
161
+ self.sos = nn.Embedding(hparams.n_classes, hparams.embed_dim)
162
+ else:
163
+ self.sos = nn.Parameter(torch.randn(1, 1, hparams.embed_dim))
164
+
165
+ # input embedding
166
+ self.tok_emb_img = nn.Embedding(vocab_size_img, hparams.embed_dim)
167
+ self.pos_emb_img = nn.Embedding(hparams.ctx_len_img, hparams.embed_dim)
168
+
169
+ self.drop = nn.Dropout(hparams.embd_pdrop)
170
+
171
+ # transformer blocks
172
+ self.blocks = [Block(ctx_len=hparams.ctx_len_img + 1,
173
+ embed_dim=hparams.embed_dim,
174
+ n_heads=hparams.n_heads,
175
+ mlp_bias=hparams.mlp_bias,
176
+ attn_bias=hparams.attn_bias,
177
+ resid_pdrop=hparams.resid_pdrop,
178
+ attn_pdrop=hparams.attn_pdrop,
179
+ gelu_use_approx=hparams.gelu_use_approx) for i in range(1, hparams.n_layers+1)]
180
+ self.blocks = nn.Sequential(*self.blocks)
181
+
182
+ # head
183
+ self.ln_f = nn.LayerNorm(hparams.embed_dim)
184
+ self.head = nn.Linear(hparams.embed_dim, vocab_size_img, bias=False)
185
+
186
+ self.ctx_len_img = hparams.ctx_len_img
187
+ self.n_layers = hparams.n_layers
188
+
189
+ self.apply(self._init_weights)
190
+
191
+ def _init_weights(self, module: nn.Module) -> None:
192
+ if isinstance(module, (nn.Linear, nn.Embedding)):
193
+ module.weight.data.normal_(mean=0.0, std=0.02)
194
+ if isinstance(module, nn.Linear) and module.bias is not None:
195
+ module.bias.data.zero_()
196
+ elif isinstance(module, nn.LayerNorm):
197
+ module.bias.data.zero_()
198
+ module.weight.data.fill_(1.0)
199
+
200
+ @torch.no_grad()
201
+ def sampling(self,
202
+ sos: torch.FloatTensor,
203
+ codes: torch.LongTensor,
204
+ pos_codes: torch.LongTensor,
205
+ n_samples: int = 16,
206
+ use_fp16: bool = True,
207
+ past: Optional[torch.Tensor] = None) -> Tuple[torch.FloatTensor, List[torch.FloatTensor]]:
208
+ with autocast(enabled=use_fp16):
209
+ if codes is None:
210
+ assert past is None
211
+ xs = self.drop(sos)
212
+ presents = []
213
+ for i, block in enumerate(self.blocks):
214
+ xs, present = block.sample(xs, layer_past=None)
215
+ presents.append(present)
216
+ xs = self.ln_f(xs)
217
+ logits = self.head(xs)[:, -1]
218
+ else:
219
+ if past is None:
220
+ xs = self.tok_emb_img(codes) + self.pos_emb_img(pos_codes)
221
+ xs = torch.cat([sos, xs], dim=1)
222
+ else:
223
+ xs = self.tok_emb_img(codes) + self.pos_emb_img(pos_codes)
224
+ xs = self.drop(xs)
225
+
226
+ past = torch.cat(past, dim=-2) if past is not None else past
227
+ presents = []
228
+ for i, block in enumerate(self.blocks):
229
+ xs, present = block.sample(xs, layer_past=None if past is None else past[i])
230
+ presents.append(present)
231
+
232
+ xs = self.ln_f(xs)
233
+ logits = self.head(xs)[:, -1]
234
+ return logits, presents
235
+
236
+ def forward(self,
237
+ codes: torch.LongTensor,
238
+ labels: Optional[torch.LongTensor] = None) -> torch.FloatTensor:
239
+ B, T = codes.shape
240
+ xps = torch.arange(T, device=codes.device).repeat((B, 1))
241
+ sos = self.sos.repeat((B, 1, 1)) if labels is None else self.sos(labels).unsqueeze(1)
242
+
243
+ h = self.tok_emb_img(codes) + self.pos_emb_img(xps)
244
+ h = torch.cat([sos, h[:, :-1]], dim=1).contiguous()
245
+
246
+ h = self.drop(h)
247
+ h = self.blocks(h)
248
+ h = self.ln_f(h)
249
+ logits = self.head(h)
250
+ return logits
251
+
252
+ def from_ckpt(self, path: str, strict: bool = True) -> None:
253
+ ckpt = torch.load(path, map_location='cpu')['state_dict']
254
+ self.load_state_dict(ckpt, strict=strict)
255
+ print(f'{path} successfully restored..')
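To make the teacher-forcing convention in iGPT.forward concrete: the sos embedding is prepended and the last code embedding is dropped, so logits[:, t] is the prediction for codes[:, t]. Below is a toy-sized sketch; the hyperparameter values are made up for illustration and are nothing like the 1.3B configuration.

import torch
from omegaconf import OmegaConf
from dalle.models.stage2.transformer import iGPT

hparams = OmegaConf.create({'embed_dim': 64, 'n_layers': 2, 'n_heads': 4,
                            'ctx_len_img': 16, 'embd_pdrop': 0.0, 'resid_pdrop': 0.0,
                            'attn_pdrop': 0.0, 'mlp_bias': True, 'attn_bias': True,
                            'gelu_use_approx': False})
model = iGPT(vocab_size_img=32, use_cls_cond=False, hparams=hparams).eval()

codes = torch.randint(0, 32, (2, 16))   # two sequences of 16 image tokens
logits = model(codes)                    # unconditional: the learned sos vector is prepended
print(logits.shape)                      # torch.Size([2, 16, 32]); logits[:, t] predicts codes[:, t]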
dalle/models/tokenizer.py ADDED
@@ -0,0 +1,26 @@
1
+ # ------------------------------------------------------------------------------------
2
+ # Minimal DALL-E
3
+ # Copyright (c) 2021 KakaoBrain. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------
6
+
7
+ import os
8
+ from functools import partial
9
+ from tokenizers import CharBPETokenizer
10
+
11
+
12
+ def build_tokenizer(path: str,
13
+ context_length: int = 64,
14
+ *args,
15
+ **kwargs):
16
+ from_file = partial(CharBPETokenizer.from_file,
17
+ vocab_filename=os.path.join(path, 'bpe-16k-vocab.json'),
18
+ merges_filename=os.path.join(path, 'bpe-16k-merges.txt'),
19
+ unk_token='[UNK]')
20
+ tokenizer = from_file(*args, **kwargs)
21
+ tokenizer.add_special_tokens(['[PAD]'])
22
+ tokenizer.enable_padding(length=context_length,
23
+ pad_id=tokenizer.token_to_id('[PAD]'))
24
+ tokenizer.enable_truncation(max_length=context_length)
25
+ print(f'{path} successfully restored..')
26
+ return tokenizer
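A hedged usage sketch for build_tokenizer. The path is assumed to be a directory containing bpe-16k-vocab.json and bpe-16k-merges.txt, which (judging by the rest of this commit) ships inside the downloaded 1.3B checkpoint under a tokenizer/ folder; adjust the path to wherever those files live on your machine.

import os
from dalle.models.tokenizer import build_tokenizer

# hypothetical location; the download helper places the checkpoint under the user's cache dir
tok_path = os.path.expanduser('~/.cache/minDALL-E/1.3B/tokenizer')
tokenizer = build_tokenizer(tok_path, context_length=64)

enc = tokenizer.encode('A painting of a tree on the ocean')
print(len(enc.ids))       # 64: always padded/truncated to context_length
print(enc.tokens[:5])     # first few character-level BPE tokens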
dalle/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .utils import *
2
+ from .config import *
3
+ from .sampling import *
dalle/utils/config.py ADDED
@@ -0,0 +1,123 @@
1
+ # ------------------------------------------------------------------------------------
2
+ # Minimal DALL-E
3
+ # Copyright (c) 2021 KakaoBrain. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------
6
+
7
+ from typing import Optional, List
8
+ from dataclasses import dataclass, field
9
+ from omegaconf import OmegaConf
10
+
11
+
12
+ @dataclass
13
+ class DataConfig:
14
+ dataset: Optional[str] = None
15
+ tokenizer_type: str = 'CharBPE'
16
+ context_length: int = 64
17
+ image_resolution: int = 256
18
+ transforms: str = 'dalle-vqvae'
19
+ bpe_pdrop: Optional[float] = None
20
+
21
+
22
+ @dataclass
23
+ class Stage1Hparams:
24
+ double_z: bool = False
25
+ z_channels: int = 256
26
+ resolution: int = 256
27
+ in_channels: int = 3
28
+ out_ch: int = 3
29
+ ch: int = 128
30
+ ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4])
31
+ num_res_blocks: int = 2
32
+ attn_resolutions: List[int] = field(default_factory=lambda: [16])
33
+ pdrop: float = 0.0
34
+
35
+
36
+ @dataclass
37
+ class Stage2Hparams:
38
+ embed_dim: int = 1536
39
+ n_layers: int = 42
40
+ n_heads: int = 24
41
+ n_dense_layers: int = 42
42
+ ctx_len_img: int = 256
43
+ ctx_len_txt: int = 64
44
+ embd_pdrop: float = 0.0
45
+ resid_pdrop: float = 0.0
46
+ attn_pdrop: float = 0.0
47
+ mlp_bias: bool = True
48
+ attn_bias: bool = True
49
+ gelu_use_approx: bool = False
50
+ use_head_txt: bool = True
51
+ n_classes: Optional[int] = None
52
+
53
+
54
+ @dataclass
55
+ class Stage1Config:
56
+ type: str = 'vqgan'
57
+ embed_dim: int = 256
58
+ n_embed: int = 16384
59
+ hparams: Stage1Hparams = Stage1Hparams()
60
+
61
+
62
+ @dataclass
63
+ class Stage2Config:
64
+ type: str = 'transformer1d'
65
+ vocab_size_txt: int = 16384
66
+ vocab_size_img: int = 16384
67
+ use_cls_cond: Optional[bool] = None
68
+ hparams: Stage2Hparams = Stage2Hparams()
69
+
70
+
71
+ @dataclass
72
+ class WarmupConfig:
73
+ epoch: int = 1
74
+ multiplier: int = 1
75
+ buffer_epoch: int = 0
76
+ min_lr: float = 0.0
77
+ mode: str = 'fix'
78
+ peak_lr: float = 1e-4
79
+ start_from_zero: bool = True
80
+
81
+
82
+ @dataclass
83
+ class OptConfig:
84
+ opt_type: str = 'adamW'
85
+ base_lr: float = 1e-4
86
+ weight_decay: float = 1e-4
87
+ betas: List[float] = field(default_factory=lambda: [0.9, 0.99])
88
+ grad_clip_norm: float = 1.0
89
+
90
+ sched_type: str = 'cosine'
91
+ max_steps: int = 0
92
+ min_lr: float = 0.0
93
+
94
+
95
+ @dataclass
96
+ class ExpConfig:
97
+ local_batch_size: int = 4
98
+ total_batch_size: int = 512
99
+ valid_batch_size: int = 32
100
+ epochs: int = 10
101
+ save_ckpt_freq: int = 2
102
+ test_freq: int = 1
103
+ use_amp: bool = True
104
+
105
+
106
+ @dataclass
107
+ class DefaultConfig:
108
+ dataset: DataConfig = DataConfig()
109
+ stage1: Stage1Config = Stage1Config()
110
+ stage2: Stage2Config = Stage2Config()
111
+
112
+
113
+ @dataclass
114
+ class FineTuningConfig:
115
+ dataset: DataConfig = DataConfig()
116
+ stage1: Stage1Config = Stage1Config()
117
+ stage2: Stage2Config = Stage2Config()
118
+ optimizer: OptConfig = OptConfig()
119
+ experiment: ExpConfig = ExpConfig()
120
+
121
+
122
+ def get_base_config(use_default=True):
123
+ return OmegaConf.structured(DefaultConfig if use_default else FineTuningConfig)
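A short sketch of how these dataclasses are meant to be consumed: OmegaConf.structured builds a typed config tree from DefaultConfig, and field overrides (from a plain dict here, or a YAML file via OmegaConf.load in practice) are merged on top while nested dot-access keeps working.

from omegaconf import OmegaConf
from dalle.utils.config import get_base_config

config = OmegaConf.merge(
    get_base_config(use_default=True),
    OmegaConf.create({'stage2': {'hparams': {'n_layers': 12, 'n_heads': 12}}}),
)

print(config.stage1.hparams.ch_mult)      # [1, 1, 2, 2, 4]
print(config.stage2.hparams.n_layers)     # 12, overriding the 42-layer default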
dalle/utils/sampling.py ADDED
@@ -0,0 +1,152 @@
1
+ # ------------------------------------------------------------------------------------
2
+ # Minimal DALL-E
3
+ # Copyright (c) 2021 KakaoBrain. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------
6
+
7
+ import torch
8
+ from typing import Optional
9
+ from tqdm import tqdm
10
+ from torch.nn import functional as F
11
+
12
+
13
+ def cutoff_topk_logits(logits: torch.FloatTensor, k: int) -> torch.FloatTensor:
14
+ if k is None:
15
+ return logits
16
+ else:
17
+ v, ix = torch.topk(logits, k)
18
+ out = logits.clone()
19
+ out[out < v[:, [-1]]] = -float('Inf')
20
+ return out
21
+
22
+
23
+ def cutoff_topp_probs(probs: torch.FloatTensor, p: float) -> torch.FloatTensor:
24
+ if p is None:
25
+ return probs
26
+ else:
27
+ sorted_probs, sorted_indices = torch.sort(probs, dim=-1, descending=True)
28
+ cum_probs = torch.cumsum(sorted_probs, dim=-1)
29
+
30
+ sorted_idx_remove_cond = cum_probs >= p
31
+
32
+ sorted_idx_remove_cond[..., 1:] = sorted_idx_remove_cond[..., :-1].clone()
33
+ sorted_idx_remove_cond[..., 0] = 0
34
+
35
+ indices_to_remove = sorted_idx_remove_cond.scatter(-1, sorted_indices, sorted_idx_remove_cond)
36
+ probs = probs.masked_fill(indices_to_remove, 0.0)
37
+ norm_probs = probs / torch.sum(probs, dim=-1, keepdim=True)
38
+ return norm_probs
39
+
40
+
41
+ def get_positional_encoding(inputs: torch.LongTensor, mode: str = '1d') -> torch.LongTensor:
42
+ device = inputs.device
43
+ if mode == '1d':
44
+ B, N = inputs.shape
45
+ xs_pos = torch.arange(N, device=device).repeat((B, 1))
46
+ elif mode == '2d':
47
+ B, H, W = inputs.shape
48
+ xs_pos_h = torch.arange(H, device=device).repeat(B, W, 1).transpose(1, 2)
49
+ xs_pos_w = torch.arange(W, device=device).repeat(B, H, 1)
50
+ xs_pos = (xs_pos_h, xs_pos_w)
51
+ else:
52
+ raise ValueError('%s positional encoding invalid' % mode)
53
+ return xs_pos
54
+
55
+
56
+ @torch.no_grad()
57
+ def sampling(model: torch.nn.Module,
58
+ tokens: torch.LongTensor,
59
+ top_k: Optional[float] = None,
60
+ top_p: Optional[float] = None,
61
+ softmax_temperature: float = 1.0,
62
+ is_tqdm: bool = True,
63
+ use_fp16: bool = True,
64
+ max_seq_len: int = 256) -> torch.LongTensor:
65
+ code = None
66
+ past = None
67
+
68
+ pbar = tqdm(range(max_seq_len), total=max_seq_len) if is_tqdm else range(max_seq_len)
69
+ pos_enc_tokens = get_positional_encoding(tokens, mode='1d')
70
+
71
+ for cnt, h in enumerate(pbar):
72
+ if code is None:
73
+ code_ = None
74
+ pos_enc_code_ = None
75
+ else:
76
+ code_ = code.clone().detach()
77
+ pos_enc_code_ = get_positional_encoding(code_, mode='1d')
78
+ code_ = code_[:, cnt-1].unsqueeze(-1)
79
+ pos_enc_code_ = pos_enc_code_[:, cnt-1].unsqueeze(-1)
80
+
81
+ logits, present = model.sampling(images=code_,
82
+ texts=tokens,
83
+ pos_images=pos_enc_code_,
84
+ pos_texts=pos_enc_tokens,
85
+ use_fp16=use_fp16,
86
+ past=past)
87
+ logits = logits.to(dtype=torch.float32)
88
+ logits = logits / softmax_temperature
89
+
90
+ present = torch.stack(present).clone().detach()
91
+ if past is None:
92
+ past = [present]
93
+ else:
94
+ past.append(present)
95
+
96
+ logits = cutoff_topk_logits(logits, top_k)
97
+ probs = F.softmax(logits, dim=-1)
98
+ probs = cutoff_topp_probs(probs, top_p)
99
+
100
+ idx = torch.multinomial(probs, num_samples=1).clone().detach()
101
+ code = idx if code is None else torch.cat([code, idx], axis=1)
102
+
103
+ del past
104
+ return code
105
+
106
+
107
+ @torch.no_grad()
108
+ def sampling_igpt(model: torch.nn.Module,
109
+ sos: torch.FloatTensor,
110
+ top_k: Optional[float] = None,
111
+ top_p: Optional[float] = None,
112
+ softmax_temperature: float = 1.0,
113
+ is_tqdm: bool = True,
114
+ use_fp16: bool = True,
115
+ max_seq_len: int = 256) -> torch.LongTensor:
116
+ code = None
117
+ past = None
118
+ pbar = tqdm(range(max_seq_len), total=max_seq_len) if is_tqdm else range(max_seq_len)
119
+
120
+ for cnt, h in enumerate(pbar):
121
+ if code is None:
122
+ code_ = None
123
+ pos_enc_code_ = None
124
+ else:
125
+ code_ = code.clone().detach()
126
+ pos_enc_code_ = get_positional_encoding(code_, mode='1d')
127
+ code_ = code_[:, cnt-1].unsqueeze(-1)
128
+ pos_enc_code_ = pos_enc_code_[:, cnt-1].unsqueeze(-1)
129
+
130
+ logits, present = model.sampling(sos=sos,
131
+ codes=code_,
132
+ pos_codes=pos_enc_code_,
133
+ use_fp16=use_fp16,
134
+ past=past)
135
+ logits = logits.to(dtype=torch.float32)
136
+ logits = logits / softmax_temperature
137
+
138
+ present = torch.stack(present).clone().detach()
139
+ if past is None:
140
+ past = [present]
141
+ else:
142
+ past.append(present)
143
+
144
+ logits = cutoff_topk_logits(logits, top_k)
145
+ probs = F.softmax(logits, dim=-1)
146
+ probs = cutoff_topp_probs(probs, top_p)
147
+
148
+ idx = torch.multinomial(probs, num_samples=1).clone().detach()
149
+ code = idx if code is None else torch.cat([code, idx], axis=1)
150
+
151
+ del past
152
+ return code
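The two cutoff helpers are easiest to see on a tiny hand-made distribution (a sketch, independent of any model): cutoff_topk_logits pushes everything outside the k largest logits to -inf, and cutoff_topp_probs zeroes out the low-probability tail whose cumulative mass exceeds p, then renormalises what is left.

import torch
from torch.nn import functional as F
from dalle.utils.sampling import cutoff_topk_logits, cutoff_topp_probs

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])

print(cutoff_topk_logits(logits, k=2))
# tensor([[2., 1., -inf, -inf]]): only the two largest logits survive

probs = F.softmax(logits, dim=-1)
print(cutoff_topp_probs(probs, p=0.9))
# the least likely token is zeroed out and the remaining three are renormalised to sum to 1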
dalle/utils/utils.py ADDED
@@ -0,0 +1,84 @@
1
+ # ------------------------------------------------------------------------------------
2
+ # Minimal DALL-E
3
+ # Copyright (c) 2021 KakaoBrain. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------
6
+
7
+ import os
8
+ import random
9
+ import urllib
10
+ import hashlib
11
+ import tarfile
12
+ import torch
13
+ import clip
14
+ import numpy as np
15
+ from PIL import Image
16
+ from torch.nn import functional as F
17
+ from tqdm import tqdm
18
+
19
+
20
+ def set_seed(seed: int):
21
+ random.seed(seed)
22
+ np.random.seed(seed)
23
+ torch.manual_seed(seed)
24
+ torch.cuda.manual_seed_all(seed)
25
+
26
+
27
+ @torch.no_grad()
28
+ def clip_score(prompt: str,
29
+ images: np.ndarray,
30
+ model_clip: torch.nn.Module,
31
+ preprocess_clip,
32
+ device: str) -> np.ndarray:
33
+ images = [preprocess_clip(Image.fromarray((image*255).astype(np.uint8))) for image in images]
34
+ images = torch.stack(images, dim=0).to(device=device)
35
+ texts = clip.tokenize(prompt).to(device=device)
36
+ texts = torch.repeat_interleave(texts, images.shape[0], dim=0)
37
+
38
+ image_features = model_clip.encode_image(images)
39
+ text_features = model_clip.encode_text(texts)
40
+
41
+ scores = F.cosine_similarity(image_features, text_features).squeeze()
42
+ rank = torch.argsort(scores, descending=True).cpu().numpy()
43
+ return rank
44
+
45
+
46
+ def download(url: str, root: str) -> str:
47
+ os.makedirs(root, exist_ok=True)
48
+ filename = os.path.basename(url)
49
+ pathname = filename[:-len('.tar.gz')]
50
+
51
+ expected_md5 = url.split("/")[-2]
52
+ download_target = os.path.join(root, filename)
53
+ result_path = os.path.join(root, pathname)
54
+
55
+ if os.path.isfile(download_target) and (os.path.exists(result_path) and not os.path.isfile(result_path)):
56
+ return result_path
57
+
58
+ with urllib.request.urlopen(url) as source, open(download_target, 'wb') as output:
59
+ with tqdm(total=int(source.info().get('Content-Length')), ncols=80, unit='iB', unit_scale=True,
60
+ unit_divisor=1024) as loop:
61
+ while True:
62
+ buffer = source.read(8192)
63
+ if not buffer:
64
+ break
65
+
66
+ output.write(buffer)
67
+ loop.update(len(buffer))
68
+
69
+ if hashlib.md5(open(download_target, 'rb').read()).hexdigest() != expected_md5:
70
+ raise RuntimeError('Model has been downloaded but the md5 checksum does not match')
71
+
72
+ with tarfile.open(download_target, 'r:gz') as f:
73
+ pbar = tqdm(f.getmembers(), total=len(f.getmembers()))
74
+ for member in pbar:
75
+ pbar.set_description(f'extracting: {member.name} (size:{member.size // (1024 * 1024)}MB)')
76
+ f.extract(member=member, path=root)
77
+
78
+ return result_path
79
+
80
+
81
+ def realpath_url_or_path(url_or_path: str, root: str = None) -> str:
82
+ if urllib.parse.urlparse(url_or_path).scheme in ('http', 'https'):
83
+ return download(url_or_path, root)
84
+ return url_or_path
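One detail of download() worth spelling out is the URL convention it assumes: the expected md5 checksum is the second-to-last path segment and the archive name ends in .tar.gz. A small sketch with a hypothetical URL, no network access, just the string handling mirrored from the function above:

url = 'https://example.com/checkpoints/0b62b9e383cd2ad47292eb364a5b0b26/1.3B.tar.gz'  # hypothetical

expected_md5 = url.split('/')[-2]         # '0b62b9e383cd2ad47292eb364a5b0b26'
filename = url.split('/')[-1]             # '1.3B.tar.gz', saved as <root>/1.3B.tar.gz
result_dir = filename[:-len('.tar.gz')]   # '1.3B', extracted to <root>/1.3B

print(expected_md5, filename, result_dir)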
examples/sampling_ex.py ADDED
@@ -0,0 +1,63 @@
1
+ # ------------------------------------------------------------------------------------
2
+ # Minimal DALL-E
3
+ # Copyright (c) 2021 KakaoBrain. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------
6
+
7
+ import os
8
+ import sys
9
+ import argparse
10
+ import clip
11
+ import numpy as np
12
+ from PIL import Image
13
+
14
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
15
+
16
+ from dalle.models import Dalle
17
+ from dalle.utils.utils import set_seed, clip_score
18
+
19
+
20
+ parser = argparse.ArgumentParser()
21
+ parser.add_argument('-n', '--num_candidates', type=int, default=96)
22
+ parser.add_argument('--prompt', type=str, default='A painting of a tree on the ocean')
23
+ parser.add_argument('--softmax-temperature', type=float, default=1.0)
24
+ parser.add_argument('--top-k', type=int, default=256)
25
+ parser.add_argument('--top-p', type=float, default=None, help='0.0 <= top-p <= 1.0')
26
+ parser.add_argument('--seed', type=int, default=0)
27
+
28
+ args = parser.parse_args()
29
+
30
+ # Setup
31
+ assert args.top_k <= 256, "It is recommended that top_k is set no higher than 256."
32
+
33
+ set_seed(args.seed)
34
+ device = 'cuda:0'
35
+ model = Dalle.from_pretrained('minDALL-E/1.3B') # This will automatically download the pretrained model.
36
+ model.to(device=device)
37
+
38
+ # Sampling
39
+ images = model.sampling(prompt=args.prompt,
40
+ top_k=args.top_k,
41
+ top_p=args.top_p,
42
+ softmax_temperature=args.softmax_temperature,
43
+ num_candidates=args.num_candidates,
44
+ device=device).cpu().numpy()
45
+ images = np.transpose(images, (0, 2, 3, 1))
46
+
47
+ # CLIP Re-ranking
48
+ model_clip, preprocess_clip = clip.load("ViT-B/32", device=device)
49
+ model_clip.to(device=device)
50
+ rank = clip_score(prompt=args.prompt,
51
+ images=images,
52
+ model_clip=model_clip,
53
+ preprocess_clip=preprocess_clip,
54
+ device=device)
55
+
56
+ # Save images
57
+ images = images[rank]
58
+ print(rank, images.shape)
59
+ if not os.path.exists('./figures'):
60
+ os.makedirs('./figures')
61
+ for i in range(min(16, args.num_candidates)):
62
+ im = Image.fromarray((images[i]*255).astype(np.uint8))
63
+ im.save(f'./figures/{args.prompt}_{i}.png')
examples/sampling_interactive_demo.ipynb ADDED
@@ -0,0 +1,298 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "cdf36725-ec00-4027-95d6-374340c2264e",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stderr",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "100%|█████████████████████████████████████| 4.72G/4.72G [02:04<00:00, 40.7MiB/s]\n",
14
+ "extracting: ./1.3B/tokenizer/bpe-16k-vocab.json (size:0MB): 100%|██████████| 7/7 [00:59<00:00, 8.51s/it]\n"
15
+ ]
16
+ },
17
+ {
18
+ "name": "stdout",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "/root/.cache/minDALL-E/1.3B/tokenizer successfully restored..\n",
22
+ "/root/.cache/minDALL-E/1.3B/stage1_last.ckpt successfully restored..\n"
23
+ ]
24
+ },
25
+ {
26
+ "name": "stderr",
27
+ "output_type": "stream",
28
+ "text": [
29
+ " 0%| | 0.00/338M [00:00<?, ?iB/s]"
30
+ ]
31
+ },
32
+ {
33
+ "name": "stdout",
34
+ "output_type": "stream",
35
+ "text": [
36
+ "/root/.cache/minDALL-E/1.3B/stage2_last.ckpt succesfully restored..\n"
37
+ ]
38
+ },
39
+ {
40
+ "name": "stderr",
41
+ "output_type": "stream",
42
+ "text": [
43
+ "100%|███████████████████████████████████████| 338M/338M [00:09<00:00, 38.5MiB/s]\n"
44
+ ]
45
+ }
46
+ ],
47
+ "source": [
48
+ "import os\n",
49
+ "import sys\n",
50
+ "import math\n",
51
+ "import argparse\n",
52
+ "import clip\n",
53
+ "import numpy as np\n",
54
+ "%matplotlib inline\n",
55
+ "import matplotlib.pyplot as plt\n",
56
+ "from PIL import Image\n",
57
+ "\n",
58
+ "sys.path.append(os.path.dirname(os.getcwd()))\n",
59
+ "\n",
60
+ "from dalle.models import Dalle\n",
61
+ "from dalle.utils.utils import set_seed, clip_score\n",
62
+ "\n",
63
+ "device = 'cuda:0'\n",
64
+ "model = Dalle.from_pretrained(\"minDALL-E/1.3B\")\n",
65
+ "model_clip, preprocess_clip = clip.load(\"ViT-B/32\", device=device)\n",
66
+ "\n",
67
+ "model_clip.to(device=device)\n",
68
+ "model.to(device=device)\n",
69
+ "\n",
70
+ "def sampling(prompt, top_k, softmax_temperature, seed, num_candidates=96, num_samples_for_display=36):\n",
71
+ " # Setup\n",
72
+ " n_row = int(math.sqrt(num_samples_for_display))\n",
73
+ " n_col = int(math.sqrt(num_samples_for_display))\n",
74
+ " set_seed(seed)\n",
75
+ " \n",
76
+ " # Sampling\n",
77
+ " images = model.sampling(prompt=prompt,\n",
78
+ " top_k=top_k,\n",
79
+ " top_p=None,\n",
80
+ " softmax_temperature=softmax_temperature,\n",
81
+ " num_candidates=num_candidates,\n",
82
+ " device=device).cpu().numpy()\n",
83
+ " images = np.transpose(images, (0, 2, 3, 1))\n",
84
+ "\n",
85
+ " # CLIP Re-ranking\n",
86
+ " rank = clip_score(prompt=prompt, images=images, model_clip=model_clip, preprocess_clip=preprocess_clip, device=device)\n",
87
+ " images = images[rank]\n",
88
+ " \n",
89
+ " images = images[:num_samples_for_display]\n",
90
+ " fig = plt.figure(figsize=(8*n_row, 8*n_col))\n",
91
+ "\n",
92
+ " for i in range(num_samples_for_display):\n",
93
+ " ax = fig.add_subplot(n_row, n_col, i+1)\n",
94
+ " ax.imshow(images[i])\n",
95
+ " ax.set_axis_off()\n",
96
+ "\n",
97
+ " plt.tight_layout()\n",
98
+ " plt.show()"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": 2,
104
+ "id": "619add15-073e-40f4-9a97-06b89d647c81",
105
+ "metadata": {},
106
+ "outputs": [
107
+ {
108
+ "data": {
109
+ "application/vnd.jupyter.widget-view+json": {
110
+ "model_id": "ee477531ea0e4b86b20d997f8cb83767",
111
+ "version_major": 2,
112
+ "version_minor": 0
113
+ },
114
+ "text/plain": [
115
+ "IntSlider(value=0, description='RND SEED: ', max=1024)"
116
+ ]
117
+ },
118
+ "metadata": {},
119
+ "output_type": "display_data"
120
+ },
121
+ {
122
+ "data": {
123
+ "application/vnd.jupyter.widget-view+json": {
124
+ "model_id": "d63edc4725ef4f4e8a6f03f7693a481d",
125
+ "version_major": 2,
126
+ "version_minor": 0
127
+ },
128
+ "text/plain": [
129
+ "FloatSlider(value=1.0, description='SOFTMAX TEMPERATURE:', max=5.0, step=0.2)"
130
+ ]
131
+ },
132
+ "metadata": {},
133
+ "output_type": "display_data"
134
+ },
135
+ {
136
+ "data": {
137
+ "application/vnd.jupyter.widget-view+json": {
138
+ "model_id": "5bb9170e9e8b4686a661799d8aff3901",
139
+ "version_major": 2,
140
+ "version_minor": 0
141
+ },
142
+ "text/plain": [
143
+ "IntSlider(value=256, description='TOP-K:', max=512, step=16)"
144
+ ]
145
+ },
146
+ "metadata": {},
147
+ "output_type": "display_data"
148
+ },
149
+ {
150
+ "data": {
151
+ "application/vnd.jupyter.widget-view+json": {
152
+ "model_id": "6b97b49debfc4f7ab002748e9fd89864",
153
+ "version_major": 2,
154
+ "version_minor": 0
155
+ },
156
+ "text/plain": [
157
+ "Text(value='A painting of a monkey with sunglasses in the frame', description='String:', placeholder='Text pro…"
158
+ ]
159
+ },
160
+ "metadata": {},
161
+ "output_type": "display_data"
162
+ },
163
+ {
164
+ "data": {
165
+ "application/vnd.jupyter.widget-view+json": {
166
+ "model_id": "a520b10d8c0b4dd0bb6db56dc37b4422",
167
+ "version_major": 2,
168
+ "version_minor": 0
169
+ },
170
+ "text/plain": [
171
+ "Button(description='Generate!', style=ButtonStyle())"
172
+ ]
173
+ },
174
+ "metadata": {},
175
+ "output_type": "display_data"
176
+ },
177
+ {
178
+ "data": {
179
+ "application/vnd.jupyter.widget-view+json": {
180
+ "model_id": "5a98437abf964636a467677dc4f816bb",
181
+ "version_major": 2,
182
+ "version_minor": 0
183
+ },
184
+ "text/plain": [
185
+ "Output()"
186
+ ]
187
+ },
188
+ "metadata": {},
189
+ "output_type": "display_data"
190
+ },
191
+ {
192
+ "data": {
193
+ "application/vnd.jupyter.widget-view+json": {
194
+ "model_id": "90d05006d50e4d88b8fb7c36095b12e7",
195
+ "version_major": 2,
196
+ "version_minor": 0
197
+ },
198
+ "text/plain": [
199
+ "Output()"
200
+ ]
201
+ },
202
+ "metadata": {},
203
+ "output_type": "display_data"
204
+ }
205
+ ],
206
+ "source": [
207
+ "import ipywidgets as widgets\n",
208
+ "from IPython.display import display\n",
209
+ "from IPython.display import clear_output\n",
210
+ "\n",
211
+ "output = widgets.Output()\n",
212
+ "plot_output = widgets.Output()\n",
213
+ "\n",
214
+ "def btn_eventhandler(obj):\n",
215
+ " output.clear_output()\n",
216
+ " plot_output.clear_output()\n",
217
+ " \n",
218
+ " with output:\n",
219
+ " print(f'SEED: {slider_seed.value}')\n",
220
+ " print(f'Softmax Temperature: {slider_temp.value}')\n",
221
+ " print(f'Top-K: {slider_topk.value}')\n",
222
+ " print(f'Text prompt: {wd_text.value}')\n",
223
+ " \n",
224
+ " with plot_output:\n",
225
+ " sampling(prompt=wd_text.value, top_k=slider_topk.value, softmax_temperature=slider_temp.value, seed=slider_seed.value)\n",
226
+ " \n",
227
+ "slider_seed = widgets.IntSlider(\n",
228
+ " min=0,\n",
229
+ " max=1024,\n",
230
+ " step=1,\n",
231
+ " description='RND SEED: ',\n",
232
+ " value=0\n",
233
+ ")\n",
234
+ "slider_topk = widgets.IntSlider(\n",
235
+ " min=0,\n",
236
+ " max=512,\n",
237
+ " step=16,\n",
238
+ " description='TOP-K:',\n",
239
+ " value=256\n",
240
+ ")\n",
241
+ "slider_temp = widgets.FloatSlider(\n",
242
+ " min=0.0,\n",
243
+ " max=5.0,\n",
244
+ " step=0.2,\n",
245
+ " description='SOFTMAX TEMPERATURE:',\n",
246
+ " value=1.0\n",
247
+ ")\n",
248
+ "wd_text = widgets.Text(\n",
249
+ " value='A painting of a monkey with sunglasses in the frame',\n",
250
+ " placeholder='Text prompt',\n",
251
+ " description='String:',\n",
252
+ " disabled=False\n",
253
+ ")\n",
254
+ "\n",
255
+ "display(slider_seed)\n",
256
+ "display(slider_temp)\n",
257
+ "display(slider_topk)\n",
258
+ "display(wd_text)\n",
259
+ "\n",
260
+ "btn = widgets.Button(description='Generate!')\n",
261
+ "display(btn)\n",
262
+ "btn.on_click(btn_eventhandler)\n",
263
+ "\n",
264
+ "display(output)\n",
265
+ "display(plot_output)"
266
+ ]
267
+ },
268
+ {
269
+ "cell_type": "code",
270
+ "execution_count": null,
271
+ "id": "20571236-3b9a-426e-ab29-96b643c8cbe1",
272
+ "metadata": {},
273
+ "outputs": [],
274
+ "source": []
275
+ }
276
+ ],
277
+ "metadata": {
278
+ "kernelspec": {
279
+ "display_name": "Python 3 (ipykernel)",
280
+ "language": "python",
281
+ "name": "python3"
282
+ },
283
+ "language_info": {
284
+ "codemirror_mode": {
285
+ "name": "ipython",
286
+ "version": 3
287
+ },
288
+ "file_extension": ".py",
289
+ "mimetype": "text/x-python",
290
+ "name": "python",
291
+ "nbconvert_exporter": "python",
292
+ "pygments_lexer": "ipython3",
293
+ "version": "3.7.7"
294
+ }
295
+ },
296
+ "nbformat": 4,
297
+ "nbformat_minor": 5
298
+ }
examples/transfer_learning_ex.py ADDED
@@ -0,0 +1,172 @@
1
+ # ------------------------------------------------------------------------------------
2
+ # Minimal DALL-E
3
+ # Copyright (c) 2021 KakaoBrain. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------
6
+
7
+ import os
8
+ import sys
9
+ import argparse
10
+ from typing import Optional
11
+ from datetime import datetime
12
+
13
+ import torch
14
+ from torch.utils.data import DataLoader
15
+ import torchvision
16
+ import torchvision.transforms as transforms
17
+ import pytorch_lightning as pl
18
+ from pytorch_lightning.callbacks import ModelCheckpoint, Callback
19
+ from pytorch_lightning.loggers import TensorBoardLogger
20
+ from pytorch_lightning.utilities.distributed import rank_zero_only
21
+
22
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
23
+
24
+ from dalle.models import ImageGPT
25
+
26
+
27
+ parser = argparse.ArgumentParser()
28
+
29
+ parser.add_argument('-d', '--config-downstream', type=str, default=None, required=True)
30
+ parser.add_argument('-u', '--path-upstream', type=str, default=None, required=True)
31
+ parser.add_argument('-r', '--result-path', type=str, default=None, required=True)
32
+ parser.add_argument('--imagenet-path', type=str, default=None, required=True)
33
+
34
+ parser.add_argument('--n-gpus', type=int, default=1)
35
+ parser.add_argument('--seed', type=int, default=0)
36
+
37
+
38
+ args = parser.parse_args()
39
+
40
+
41
+ class ImageLogger(Callback):
42
+ def __init__(self):
43
+ super().__init__()
44
+
45
+ @rank_zero_only
46
+ def log_img(self, pl_module, batch, current_epoch, split="train"):
47
+ with torch.no_grad():
48
+ images, labels = batch
49
+ recons = pl_module.stage1(images)
50
+ images = images.cpu()
51
+ recons = recons.cpu()
52
+
53
+ grid_org = (torchvision.utils.make_grid(images, nrow=8) + 1.0) / 2.0
54
+ grid_rec = (torchvision.utils.make_grid(recons, nrow=8) + 1.0) / 2.0
55
+ grid_rec = torch.clip(grid_rec, min=0, max=1)
56
+
57
+ pl_module.logger.experiment.add_image(f"images_org/{split}", grid_org, global_step=current_epoch)
58
+ pl_module.logger.experiment.add_image(f"images_rec/{split}", grid_rec, global_step=current_epoch)
59
+
60
+ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
61
+ if batch_idx == 0 and trainer.current_epoch < 5:
62
+ self.log_img(pl_module, batch, current_epoch=trainer.current_epoch, split="train")
63
+
64
+ def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
65
+ if batch_idx == 0 and trainer.current_epoch < 5:
66
+ self.log_img(pl_module, batch, current_epoch=trainer.current_epoch, split="test")
67
+
68
+
69
+ class ImageNetDataModule(pl.LightningDataModule):
70
+ def __init__(self,
71
+ data_dir: Optional[str] = None,
72
+ image_resolution: int = 256,
73
+ train_batch_size: int = 2,
74
+ valid_batch_size: int = 32,
75
+ num_workers: int = 8):
76
+ super().__init__()
77
+
78
+ self.data_dir = data_dir
79
+ self.image_resolution = image_resolution
80
+ self.train_batch_size = train_batch_size
81
+ self.valid_batch_size = valid_batch_size
82
+ self.num_workers = num_workers
83
+
84
+ self.train_transform = transforms.Compose(
85
+ [transforms.Resize(image_resolution),
86
+ transforms.RandomCrop(image_resolution),
87
+ transforms.ToTensor(),
88
+ transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])]
89
+ )
90
+ self.valid_transform = transforms.Compose(
91
+ [transforms.Resize(image_resolution),
92
+ transforms.CenterCrop(image_resolution),
93
+ transforms.ToTensor(),
94
+ transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])]
95
+ )
96
+
97
+ def setup(self, stage=None):
98
+ self.trainset = torchvision.datasets.ImageNet(root=self.data_dir, split='train', transform=self.train_transform)
99
+ self.validset = torchvision.datasets.ImageNet(root=self.data_dir, split='val', transform=self.valid_transform)
100
+
101
+ def train_dataloader(self):
102
+ return DataLoader(self.trainset,
103
+ batch_size=self.train_batch_size,
104
+ num_workers=self.num_workers,
105
+ pin_memory=True)
106
+
107
+ def valid_dataloader(self):
108
+ return DataLoader(self.validset,
109
+ batch_size=self.valid_batch_size,
110
+ num_workers=self.num_workers,
111
+ pin_memory=True)
112
+
113
+
114
+ def setup_callbacks(config):
115
+ # Setup callbacks
116
+ now = datetime.now().strftime('%d%m%Y_%H%M%S')
117
+ result_path = os.path.join(args.result_path,
118
+ os.path.basename(args.config_downstream).split('.')[0],
119
+ now)
120
+ ckpt_path = os.path.join(result_path, 'ckpt')
121
+ log_path = os.path.join(result_path, 'log')
122
+
123
+ checkpoint_callback = ModelCheckpoint(
124
+ dirpath=ckpt_path,
125
+ filename="imagenet-clscond-gen-{epoch:02d}" if config.stage2.use_cls_cond else
126
+ "imagenet-uncond-gen-{epoch:02d}",
127
+ every_n_epochs=config.experiment.save_ckpt_freq,
128
+ save_weights_only=True,
129
+ save_last=True
130
+ )
131
+ logger = TensorBoardLogger(log_path, name="iGPT")
132
+ logger_img = ImageLogger()
133
+ return checkpoint_callback, logger, logger_img
134
+
135
+
136
+ if __name__ == '__main__':
137
+ pl.seed_everything(args.seed)
138
+
139
+ # Build iGPT
140
+ model, config = ImageGPT.from_pretrained(args.path_upstream, args.config_downstream)
141
+
142
+ # Setup callbacks
143
+ ckpt_callback, logger, logger_img = setup_callbacks(config)
144
+
145
+ # Build data modules
146
+ dataset = ImageNetDataModule(data_dir=args.imagenet_path,
147
+ image_resolution=config.dataset.image_resolution,
148
+ train_batch_size=config.experiment.local_batch_size,
149
+ valid_batch_size=config.experiment.valid_batch_size,
150
+ num_workers=16)
151
+ dataset.setup()
152
+ train_dataloader = dataset.train_dataloader()
153
+ valid_dataloader = dataset.valid_dataloader()
154
+ print(f"len(train_dataset) = {len(dataset.trainset)}")
155
+ print(f"len(valid_dataset) = {len(dataset.validset)}")
156
+
157
+ # Calculate how many batches are accumulated
158
+ assert config.experiment.total_batch_size % (config.experiment.local_batch_size * args.n_gpus) == 0
159
+ grad_accm_steps = config.experiment.total_batch_size // (config.experiment.local_batch_size * args.n_gpus)
160
+ config.optimizer.max_steps = len(dataset.trainset) // config.experiment.total_batch_size * config.experiment.epochs
161
+
162
+ # Build trainer
163
+ trainer = pl.Trainer(max_epochs=config.experiment.epochs,
164
+ accumulate_grad_batches=grad_accm_steps,
165
+ gradient_clip_val=config.optimizer.grad_clip_norm,
166
+ precision=16 if config.experiment.use_amp else 32,
167
+ callbacks=[ckpt_callback, logger_img],
168
+ accelerator="gpu",
169
+ devices=args.n_gpus,
170
+ strategy="ddp",
171
+ logger=logger)
172
+ trainer.fit(model, train_dataloader, valid_dataloader)
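The batch-size bookkeeping near the end of the script is worth a worked example. With the ExpConfig defaults (total batch 512, local batch 4) on a hypothetical 8-GPU node, each optimizer update accumulates 16 micro-batches, and the scheduler length is derived from the dataset size and epoch count; the 1,281,167 figure below is the usual ImageNet-1k train split size.

total_batch_size = 512        # config.experiment.total_batch_size
local_batch_size = 4          # config.experiment.local_batch_size
n_gpus = 8                    # hypothetical; passed via --n-gpus
epochs = 10                   # config.experiment.epochs
len_trainset = 1_281_167      # ImageNet-1k training images

assert total_batch_size % (local_batch_size * n_gpus) == 0
grad_accm_steps = total_batch_size // (local_batch_size * n_gpus)
max_steps = len_trainset // total_batch_size * epochs

print(grad_accm_steps)        # 16 micro-batches accumulated per optimizer step
print(max_steps)              # 25020 scheduler steps over 10 epochs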
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ torch==1.8.0
2
+ torchvision>=0.8.2
3
+ tokenizers>=0.10.2
4
+ pyflakes>=2.2.0
5
+ tqdm>=4.46.0
6
+ pytorch-lightning>=1.5
7
+ einops
8
+ omegaconf
9
+ git+https://github.com/openai/CLIP.git
10
+ matplotlib
setup.cfg ADDED
@@ -0,0 +1,3 @@
1
+ [flake8]
2
+ max-line-length = 120
3
+ ignore = E226, E402, W504