UncleFish committed
Commit 92b6d07
1 Parent(s): bc78887

init release
.gitignore ADDED
@@ -0,0 +1,3 @@
+ __pycache__/**
+ debug.py
+ sanity_check.ipynb
LICENSE.txt ADDED
@@ -0,0 +1,437 @@
1
+ Attribution-NonCommercial-ShareAlike 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
58
+ Public License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial-ShareAlike 4.0 International Public License
63
+ ("Public License"). To the extent this Public License may be
64
+ interpreted as a contract, You are granted the Licensed Rights in
65
+ consideration of Your acceptance of these terms and conditions, and the
66
+ Licensor grants You such rights in consideration of benefits the
67
+ Licensor receives from making the Licensed Material available under
68
+ these terms and conditions.
69
+
70
+
71
+ Section 1 -- Definitions.
72
+
73
+ a. Adapted Material means material subject to Copyright and Similar
74
+ Rights that is derived from or based upon the Licensed Material
75
+ and in which the Licensed Material is translated, altered,
76
+ arranged, transformed, or otherwise modified in a manner requiring
77
+ permission under the Copyright and Similar Rights held by the
78
+ Licensor. For purposes of this Public License, where the Licensed
79
+ Material is a musical work, performance, or sound recording,
80
+ Adapted Material is always produced where the Licensed Material is
81
+ synched in timed relation with a moving image.
82
+
83
+ b. Adapter's License means the license You apply to Your Copyright
84
+ and Similar Rights in Your contributions to Adapted Material in
85
+ accordance with the terms and conditions of this Public License.
86
+
87
+ c. BY-NC-SA Compatible License means a license listed at
88
+ creativecommons.org/compatiblelicenses, approved by Creative
89
+ Commons as essentially the equivalent of this Public License.
90
+
91
+ d. Copyright and Similar Rights means copyright and/or similar rights
92
+ closely related to copyright including, without limitation,
93
+ performance, broadcast, sound recording, and Sui Generis Database
94
+ Rights, without regard to how the rights are labeled or
95
+ categorized. For purposes of this Public License, the rights
96
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
97
+ Rights.
98
+
99
+ e. Effective Technological Measures means those measures that, in the
100
+ absence of proper authority, may not be circumvented under laws
101
+ fulfilling obligations under Article 11 of the WIPO Copyright
102
+ Treaty adopted on December 20, 1996, and/or similar international
103
+ agreements.
104
+
105
+ f. Exceptions and Limitations means fair use, fair dealing, and/or
106
+ any other exception or limitation to Copyright and Similar Rights
107
+ that applies to Your use of the Licensed Material.
108
+
109
+ g. License Elements means the license attributes listed in the name
110
+ of a Creative Commons Public License. The License Elements of this
111
+ Public License are Attribution, NonCommercial, and ShareAlike.
112
+
113
+ h. Licensed Material means the artistic or literary work, database,
114
+ or other material to which the Licensor applied this Public
115
+ License.
116
+
117
+ i. Licensed Rights means the rights granted to You subject to the
118
+ terms and conditions of this Public License, which are limited to
119
+ all Copyright and Similar Rights that apply to Your use of the
120
+ Licensed Material and that the Licensor has authority to license.
121
+
122
+ j. Licensor means the individual(s) or entity(ies) granting rights
123
+ under this Public License.
124
+
125
+ k. NonCommercial means not primarily intended for or directed towards
126
+ commercial advantage or monetary compensation. For purposes of
127
+ this Public License, the exchange of the Licensed Material for
128
+ other material subject to Copyright and Similar Rights by digital
129
+ file-sharing or similar means is NonCommercial provided there is
130
+ no payment of monetary compensation in connection with the
131
+ exchange.
132
+
133
+ l. Share means to provide material to the public by any means or
134
+ process that requires permission under the Licensed Rights, such
135
+ as reproduction, public display, public performance, distribution,
136
+ dissemination, communication, or importation, and to make material
137
+ available to the public including in ways that members of the
138
+ public may access the material from a place and at a time
139
+ individually chosen by them.
140
+
141
+ m. Sui Generis Database Rights means rights other than copyright
142
+ resulting from Directive 96/9/EC of the European Parliament and of
143
+ the Council of 11 March 1996 on the legal protection of databases,
144
+ as amended and/or succeeded, as well as other essentially
145
+ equivalent rights anywhere in the world.
146
+
147
+ n. You means the individual or entity exercising the Licensed Rights
148
+ under this Public License. Your has a corresponding meaning.
149
+
150
+
151
+ Section 2 -- Scope.
152
+
153
+ a. License grant.
154
+
155
+ 1. Subject to the terms and conditions of this Public License,
156
+ the Licensor hereby grants You a worldwide, royalty-free,
157
+ non-sublicensable, non-exclusive, irrevocable license to
158
+ exercise the Licensed Rights in the Licensed Material to:
159
+
160
+ a. reproduce and Share the Licensed Material, in whole or
161
+ in part, for NonCommercial purposes only; and
162
+
163
+ b. produce, reproduce, and Share Adapted Material for
164
+ NonCommercial purposes only.
165
+
166
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
167
+ Exceptions and Limitations apply to Your use, this Public
168
+ License does not apply, and You do not need to comply with
169
+ its terms and conditions.
170
+
171
+ 3. Term. The term of this Public License is specified in Section
172
+ 6(a).
173
+
174
+ 4. Media and formats; technical modifications allowed. The
175
+ Licensor authorizes You to exercise the Licensed Rights in
176
+ all media and formats whether now known or hereafter created,
177
+ and to make technical modifications necessary to do so. The
178
+ Licensor waives and/or agrees not to assert any right or
179
+ authority to forbid You from making technical modifications
180
+ necessary to exercise the Licensed Rights, including
181
+ technical modifications necessary to circumvent Effective
182
+ Technological Measures. For purposes of this Public License,
183
+ simply making modifications authorized by this Section 2(a)
184
+ (4) never produces Adapted Material.
185
+
186
+ 5. Downstream recipients.
187
+
188
+ a. Offer from the Licensor -- Licensed Material. Every
189
+ recipient of the Licensed Material automatically
190
+ receives an offer from the Licensor to exercise the
191
+ Licensed Rights under the terms and conditions of this
192
+ Public License.
193
+
194
+ b. Additional offer from the Licensor -- Adapted Material.
195
+ Every recipient of Adapted Material from You
196
+ automatically receives an offer from the Licensor to
197
+ exercise the Licensed Rights in the Adapted Material
198
+ under the conditions of the Adapter's License You apply.
199
+
200
+ c. No downstream restrictions. You may not offer or impose
201
+ any additional or different terms or conditions on, or
202
+ apply any Effective Technological Measures to, the
203
+ Licensed Material if doing so restricts exercise of the
204
+ Licensed Rights by any recipient of the Licensed
205
+ Material.
206
+
207
+ 6. No endorsement. Nothing in this Public License constitutes or
208
+ may be construed as permission to assert or imply that You
209
+ are, or that Your use of the Licensed Material is, connected
210
+ with, or sponsored, endorsed, or granted official status by,
211
+ the Licensor or others designated to receive attribution as
212
+ provided in Section 3(a)(1)(A)(i).
213
+
214
+ b. Other rights.
215
+
216
+ 1. Moral rights, such as the right of integrity, are not
217
+ licensed under this Public License, nor are publicity,
218
+ privacy, and/or other similar personality rights; however, to
219
+ the extent possible, the Licensor waives and/or agrees not to
220
+ assert any such rights held by the Licensor to the limited
221
+ extent necessary to allow You to exercise the Licensed
222
+ Rights, but not otherwise.
223
+
224
+ 2. Patent and trademark rights are not licensed under this
225
+ Public License.
226
+
227
+ 3. To the extent possible, the Licensor waives any right to
228
+ collect royalties from You for the exercise of the Licensed
229
+ Rights, whether directly or through a collecting society
230
+ under any voluntary or waivable statutory or compulsory
231
+ licensing scheme. In all other cases the Licensor expressly
232
+ reserves any right to collect such royalties, including when
233
+ the Licensed Material is used other than for NonCommercial
234
+ purposes.
235
+
236
+
237
+ Section 3 -- License Conditions.
238
+
239
+ Your exercise of the Licensed Rights is expressly made subject to the
240
+ following conditions.
241
+
242
+ a. Attribution.
243
+
244
+ 1. If You Share the Licensed Material (including in modified
245
+ form), You must:
246
+
247
+ a. retain the following if it is supplied by the Licensor
248
+ with the Licensed Material:
249
+
250
+ i. identification of the creator(s) of the Licensed
251
+ Material and any others designated to receive
252
+ attribution, in any reasonable manner requested by
253
+ the Licensor (including by pseudonym if
254
+ designated);
255
+
256
+ ii. a copyright notice;
257
+
258
+ iii. a notice that refers to this Public License;
259
+
260
+ iv. a notice that refers to the disclaimer of
261
+ warranties;
262
+
263
+ v. a URI or hyperlink to the Licensed Material to the
264
+ extent reasonably practicable;
265
+
266
+ b. indicate if You modified the Licensed Material and
267
+ retain an indication of any previous modifications; and
268
+
269
+ c. indicate the Licensed Material is licensed under this
270
+ Public License, and include the text of, or the URI or
271
+ hyperlink to, this Public License.
272
+
273
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
274
+ reasonable manner based on the medium, means, and context in
275
+ which You Share the Licensed Material. For example, it may be
276
+ reasonable to satisfy the conditions by providing a URI or
277
+ hyperlink to a resource that includes the required
278
+ information.
279
+ 3. If requested by the Licensor, You must remove any of the
280
+ information required by Section 3(a)(1)(A) to the extent
281
+ reasonably practicable.
282
+
283
+ b. ShareAlike.
284
+
285
+ In addition to the conditions in Section 3(a), if You Share
286
+ Adapted Material You produce, the following conditions also apply.
287
+
288
+ 1. The Adapter's License You apply must be a Creative Commons
289
+ license with the same License Elements, this version or
290
+ later, or a BY-NC-SA Compatible License.
291
+
292
+ 2. You must include the text of, or the URI or hyperlink to, the
293
+ Adapter's License You apply. You may satisfy this condition
294
+ in any reasonable manner based on the medium, means, and
295
+ context in which You Share Adapted Material.
296
+
297
+ 3. You may not offer or impose any additional or different terms
298
+ or conditions on, or apply any Effective Technological
299
+ Measures to, Adapted Material that restrict exercise of the
300
+ rights granted under the Adapter's License You apply.
301
+
302
+
303
+ Section 4 -- Sui Generis Database Rights.
304
+
305
+ Where the Licensed Rights include Sui Generis Database Rights that
306
+ apply to Your use of the Licensed Material:
307
+
308
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
309
+ to extract, reuse, reproduce, and Share all or a substantial
310
+ portion of the contents of the database for NonCommercial purposes
311
+ only;
312
+
313
+ b. if You include all or a substantial portion of the database
314
+ contents in a database in which You have Sui Generis Database
315
+ Rights, then the database in which You have Sui Generis Database
316
+ Rights (but not its individual contents) is Adapted Material,
317
+ including for purposes of Section 3(b); and
318
+
319
+ c. You must comply with the conditions in Section 3(a) if You Share
320
+ all or a substantial portion of the contents of the database.
321
+
322
+ For the avoidance of doubt, this Section 4 supplements and does not
323
+ replace Your obligations under this Public License where the Licensed
324
+ Rights include other Copyright and Similar Rights.
325
+
326
+
327
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
328
+
329
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
330
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
331
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
332
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
333
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
334
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
335
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
336
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
337
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
338
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
339
+
340
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
341
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
342
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
343
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
344
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
345
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
346
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
347
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
348
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
349
+
350
+ c. The disclaimer of warranties and limitation of liability provided
351
+ above shall be interpreted in a manner that, to the extent
352
+ possible, most closely approximates an absolute disclaimer and
353
+ waiver of all liability.
354
+
355
+
356
+ Section 6 -- Term and Termination.
357
+
358
+ a. This Public License applies for the term of the Copyright and
359
+ Similar Rights licensed here. However, if You fail to comply with
360
+ this Public License, then Your rights under this Public License
361
+ terminate automatically.
362
+
363
+ b. Where Your right to use the Licensed Material has terminated under
364
+ Section 6(a), it reinstates:
365
+
366
+ 1. automatically as of the date the violation is cured, provided
367
+ it is cured within 30 days of Your discovery of the
368
+ violation; or
369
+
370
+ 2. upon express reinstatement by the Licensor.
371
+
372
+ For the avoidance of doubt, this Section 6(b) does not affect any
373
+ right the Licensor may have to seek remedies for Your violations
374
+ of this Public License.
375
+
376
+ c. For the avoidance of doubt, the Licensor may also offer the
377
+ Licensed Material under separate terms or conditions or stop
378
+ distributing the Licensed Material at any time; however, doing so
379
+ will not terminate this Public License.
380
+
381
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
382
+ License.
383
+
384
+
385
+ Section 7 -- Other Terms and Conditions.
386
+
387
+ a. The Licensor shall not be bound by any additional or different
388
+ terms or conditions communicated by You unless expressly agreed.
389
+
390
+ b. Any arrangements, understandings, or agreements regarding the
391
+ Licensed Material not stated herein are separate from and
392
+ independent of the terms and conditions of this Public License.
393
+
394
+
395
+ Section 8 -- Interpretation.
396
+
397
+ a. For the avoidance of doubt, this Public License does not, and
398
+ shall not be interpreted to, reduce, limit, restrict, or impose
399
+ conditions on any use of the Licensed Material that could lawfully
400
+ be made without permission under this Public License.
401
+
402
+ b. To the extent possible, if any provision of this Public License is
403
+ deemed unenforceable, it shall be automatically reformed to the
404
+ minimum extent necessary to make it enforceable. If the provision
405
+ cannot be reformed, it shall be severed from this Public License
406
+ without affecting the enforceability of the remaining terms and
407
+ conditions.
408
+
409
+ c. No term or condition of this Public License will be waived and no
410
+ failure to comply consented to unless expressly agreed to by the
411
+ Licensor.
412
+
413
+ d. Nothing in this Public License constitutes or may be interpreted
414
+ as a limitation upon, or waiver of, any privileges and immunities
415
+ that apply to the Licensor or You, including from the legal
416
+ processes of any jurisdiction or authority.
417
+
418
+ =======================================================================
419
+
420
+ Creative Commons is not a party to its public
421
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
422
+ its public licenses to material it publishes and in those instances
423
+ will be considered the “Licensor.” The text of the Creative Commons
424
+ public licenses is dedicated to the public domain under the CC0 Public
425
+ Domain Dedication. Except for the limited purpose of indicating that
426
+ material is shared under a Creative Commons public license or as
427
+ otherwise permitted by the Creative Commons policies published at
428
+ creativecommons.org/policies, Creative Commons does not authorize the
429
+ use of the trademark "Creative Commons" or any other trademark or logo
430
+ of Creative Commons without its prior written consent including,
431
+ without limitation, in connection with any unauthorized modifications
432
+ to any of its public licenses or any other arrangements,
433
+ understandings, or agreements concerning use of licensed material. For
434
+ the avoidance of doubt, this paragraph does not form part of the
435
+ public licenses.
436
+
437
+ Creative Commons may be contacted at creativecommons.org.
README.md ADDED
@@ -0,0 +1,150 @@
+ ---
+ license: cc-by-nc-4.0
+ language:
+ - en
+ pipeline_tag: image-text-to-text
+ ---
+
+
+ # Model description
+
+ `BLIP3` is a series of foundational vision-language models (VLMs) developed by Salesforce AI Research. \
+ These models have been trained at scale on high-quality image caption datasets and interleaved image-text data. BLIP3 highlights a few features below:
+
+ * The pretrained foundation model, `blip3-phi3-mini-base-r-v1`, achieves state-of-the-art performance under 5B parameters and demonstrates strong in-context learning capabilities.
+ * The instruct fine-tuned model, `blip3-phi3-mini-instruct-r-v1`, achieves state-of-the-art performance among open-source and closed-source VLMs under 5B parameters.
+ * `blip3-phi3-mini-instruct-r-v1` supports flexible high-resolution image encoding with efficient visual token sampling.
+
+ More technical details will follow in a technical report soon.
+
+
+ # Datasets
+
+ | Dataset Type | Dataset(s) Used |
+ |--------|------------------------------------------|
+ | Pretrain | Caption data (DataComp, CC12M, CC3M, SBU, VG) and interleaved data (OBELICS) |
+ | Instruction Tuning | LLaVA-Instruct-150K, ShareGPT4V captions, a mixture of academic VQA data including OCR/Document/Chart-focused tasks, and publicly available text-only instruction data |
+
+ # Results
+
+ ### Pretrain
+ | Model | Shot | COCO (val) | NoCaps (val) | TextCaps (val) | OKVQA (val) | TextVQA (val) | VizWiz (testdev) | VQAv2 (testdev) |
+ |-------------|------|------------|--------------|----------------|--------------|---------------|------------------|-----------------|
+ | Flamingo-3B | 4 | 85.0 | - | - | 43.3 | 32.7 | 34 | 53.2 |
+ | | 8 | 90.6 | - | - | 44.6 | 32.4 | 38.4 | 55.4 |
+ | MM1-3B | 0 | 73.5 | 55.6 | 63.3 | 26.1 | 29.4 | 15.6 | 46.2 |
+ | | 4 | 112.3 | 99.7 | 84.1 | 48.6 | 45.3 | 38.0 | 57.9 |
+ | | 8 | 114.6 | 104.7 | 88.8 | 48.4 | 44.6 | 46.4 | 63.6 |
+ | **blip3-phi3-mini-base-r-v1 (Ours)** | 0 | **81.7** | **80.2** | 60.7 | **26.5** | **36.0** | **21.2** | **48.1** |
+ | | 4 | 110.5 | **101.7** | **84.6** | **49.2** | **46.1** | **38.4** | **63.9** |
+ | | 8 | 112.1 | 104.4 | 87.7 | **49.1** | **46.4** | 44.3 | **63.8** |
+
+ ### Instruct
+ | Model | SEED-IMG | MMBench (dev) | MME-total | MME-P | MME-C | MMStar | MMMU (val) | MMVet | MathVista (mini) | ScienceQA (test) | POPE | AI2D |
+ |----------------------------|----------|--------------|-----------|----------|---------|----------|------------|----------|------------------|------------------|----------|----------|
+ | MM1-3B-Chat | 68.8 | 75.9 | 1761 | **1482** | 279 | - | 33.9 | 43.7 | - | - | **87.4** | - |
+ | openbmb/MiniCPM-V-2 | 67.1 | 69.6 | 1808 | - | - | - | 38.2 | - | 38.7 | - | - | - |
+ | VILA1.5-3B | 67.9 | 63.4 | - | 1442 | - | - | 33.3 | 35.4 | - | 69.0 | 85.9 | - |
+ | xtuner/llava-phi-3-mini-hf | 70.0 | 69.2 | 1790 | 1477 | 313 | 43.7 | **41.4** | - | - | 73.7 | 87.3 | 69.3 |
+ | **blip3-phi3-mini-instruct-r-v1 (Ours)** | **72.1** | **74.1** | **1827** | 1467 | **360** | **44.6** | 39.8 | **45.1** | **39.3** | **74.2** | 87.2 | **75.8** |
+
+
+ # Bias, Risks, Limitations, and Ethical Considerations
+ We removed LAION from our training data due to known CSAM concerns.
+ The other main data sources are from the internet, including webpages,
+ image stock sites, and curated datasets released by the research community.
+ The model may be subject to bias from the original data sources, as well as bias from LLMs and commercial APIs.
+ We strongly recommend that users assess safety and fairness before applying the model to downstream applications.
+ # How to use
+
+ > We require the development version (`"4.41.0.dev0"`) of the `transformers` library. To get it, as of 05/07/2024, one can run `pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers`.
+
+ ```python
+ from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor, StoppingCriteria
+ import torch
+ import requests
+ from PIL import Image
+
+ # define the prompt template
+ def apply_prompt_template(prompt):
+     s = (
+         '<|system|>\nA chat between a curious user and an artificial intelligence assistant. '
+         "The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n"
+         f'<|user|>\n<image>\n{prompt}<|end|>\n<|assistant|>\n'
+     )
+     return s
+ class EosListStoppingCriteria(StoppingCriteria):
+     def __init__(self, eos_sequence = [32007]):
+         self.eos_sequence = eos_sequence
+
+     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+         last_ids = input_ids[:,-len(self.eos_sequence):].tolist()
+         return self.eos_sequence in last_ids
+
+ # load models
+ model_name_or_path = "Salesforce/blip3-phi3-mini-instruct-r-v1"
+ model = AutoModelForVision2Seq.from_pretrained(model_name_or_path, trust_remote_code=True)
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True, use_fast=False, legacy=False)
+ image_processor = AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)
+ tokenizer = model.update_special_tokens(tokenizer)
+
+ # craft a test sample
+ img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
+ raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
+ query = "how many dogs are in the picture?"
+
+ model = model.cuda()
+ inputs = image_processor([raw_image], return_tensors="pt", image_aspect_ratio='anyres')
+ prompt = apply_prompt_template(query)
+ language_inputs = tokenizer([prompt], return_tensors="pt")
+ inputs.update(language_inputs)
+ inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
+ generated_text = model.generate(**inputs, image_size=[raw_image.size],
+                                 pad_token_id=tokenizer.pad_token_id,
+                                 do_sample=False, max_new_tokens=768, top_p=None, num_beams=1,
+                                 stopping_criteria = [EosListStoppingCriteria()],
+                                 )
+ prediction = tokenizer.decode(generated_text[0], skip_special_tokens=True).split("<|end|>")[0]
+ print("==> prediction: ", prediction)
+ # output: ==> prediction: There is one dog in the picture.
+ ```
+
+ More comprehensive examples can be found in the [notebook](demo.ipynb).
+
+ # Reproducibility
+
+ Our SFT evaluation is based on VLMEvalKit, in which we fixed some inconsistencies with the official benchmarks (e.g., the LLM judge API). During development, we noticed that the raw resolution of the input image can noticeably affect the model output in some cases.
+
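As an illustration of this resolution sensitivity, here is a minimal probe that continues from the "How to use" example above (it assumes `model`, `tokenizer`, `image_processor`, `apply_prompt_template`, `EosListStoppingCriteria`, `raw_image`, and `query` are already defined there); the `run_query` helper and the half-size resize are only an example, not part of the released code.

```python
# Hypothetical probe (not part of the released code): rerun the same query at a lower
# raw resolution and compare the outputs.
def run_query(image, question):
    inputs = image_processor([image], return_tensors="pt", image_aspect_ratio='anyres')
    inputs.update(tokenizer([apply_prompt_template(question)], return_tensors="pt"))
    inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
    out = model.generate(**inputs, image_size=[image.size],
                         pad_token_id=tokenizer.pad_token_id,
                         do_sample=False, max_new_tokens=768, top_p=None, num_beams=1,
                         stopping_criteria=[EosListStoppingCriteria()])
    return tokenizer.decode(out[0], skip_special_tokens=True).split("<|end|>")[0]

small_image = raw_image.resize((raw_image.width // 2, raw_image.height // 2))
print(run_query(raw_image, query))    # original resolution
print(run_query(small_image, query))  # half resolution; the answer may differ
```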
+
+ # License
+
+ Our code and weights are released under the Creative Commons Attribution-NonCommercial 4.0 [LICENSE](LICENSE.txt). Please fill out [this form](https://forms.gle/ffPc9oZC2ZGeJ1N68) to inquire about commercial use of the model weights.
+
+ # Code acknowledgement
+
+ [LAVIS](https://github.com/salesforce/LAVIS) \
+ [openflamingo](https://github.com/mlfoundations/open_flamingo) \
+ [VLMEvalKit](https://github.com/open-compass/VLMEvalKit/tree/main)
+
+
+ # Citation
+ ```
+ @misc{blip3_phi3_mini,
+     title={BLIP3-phi3-mini-instruct Model Card},
+     url={https://huggingface.co/Salesforce/blip3-phi3-mini-instruct-r-v1},
+     author={Salesforce AI Research},
+     month={May},
+     year={2024}
+ }
+ ```
+
+ # Troubleshoot
+
+ 1. If you are missing any packages, consider installing the following:
+
+ ```
+ pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121
+ pip install open_clip_torch==2.24.0
+ pip install einops
+ pip install einops-exts
+ ```
added_tokens.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "<pad>": 32011,
+   "<|assistant|>": 32001,
+   "<|endoftext|>": 32000,
+   "<|end|>": 32007,
+   "<|placeholder1|>": 32002,
+   "<|placeholder2|>": 32003,
+   "<|placeholder3|>": 32004,
+   "<|placeholder4|>": 32005,
+   "<|placeholder5|>": 32008,
+   "<|placeholder6|>": 32009,
+   "<|system|>": 32006,
+   "<|user|>": 32010
+ }
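For reference, the `<|end|>` token above (id 32007) is the turn terminator that the stopping criterion in the README's usage example watches for, and `<pad>` (32011) matches the `pad_token_id` default in `configuration_blip_3.py`. A minimal check, assuming the repo's tokenizer files are reachable (Hub or local copy):

```python
# Quick sanity check of the special-token ids listed above; <|end|> is the id used by
# EosListStoppingCriteria in the README example.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Salesforce/blip3-phi3-mini-instruct-r-v1",
                                    trust_remote_code=True, use_fast=False, legacy=False)
print(tok.convert_tokens_to_ids("<|end|>"))  # 32007
print(tok.convert_tokens_to_ids("<pad>"))    # 32011
```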
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "architectures": [
+     "Blip3ModelForConditionalGeneration"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_blip_3.Blip3Config",
+     "AutoModelForVision2Seq": "modeling_blip_3.Blip3ModelForConditionalGeneration"
+   },
+   "model_type": "blip_3",
+   "text_config": {
+     "initial_tokenizer_len": 32012,
+     "model_type": "phi3",
+     "sliding_window": 2047,
+     "torch_dtype": "bfloat16"
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.41.0.dev0",
+   "vision_encoder_config": {
+     "anyres_patch_sampling": true,
+     "image_aspect_ratio": "anyres",
+     "model_type": "blip_3_vision_encoder"
+   },
+   "vision_tokenizer_config": {
+     "model_type": "blip_3_vision_tokenizer"
+   }
+ }
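Because the `auto_map` entry above routes `AutoConfig` to `configuration_blip_3.Blip3Config`, the nested configs can be inspected without loading the weights; an illustrative snippet (expected values are the ones shown in the JSON above):

```python
# Illustrative only: load just the configuration via the auto_map above.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("Salesforce/blip3-phi3-mini-instruct-r-v1", trust_remote_code=True)
print(cfg.model_type)                                # blip_3
print(cfg.text_config.model_type)                    # phi3
print(cfg.vision_encoder_config.image_aspect_ratio)  # anyres
```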
configuration_blip_3.py ADDED
@@ -0,0 +1,161 @@
1
+ from transformers import PretrainedConfig
2
+ from transformers import logging
3
+ from transformers import CONFIG_MAPPING
4
+
5
+ logger = logging.get_logger(__name__)
6
+
7
+ class Blip3VisionEncoderConfig(PretrainedConfig):
8
+ model_type = "blip_3_vision_encoder"
9
+
10
+ def __init__(self,
11
+ model_name: str = 'ViT-H-14-378-quickgelu',
12
+ force_image_size: int = 378,
13
+ **kwargs):
14
+ self.model_name = model_name
15
+ self.force_image_size = force_image_size
16
+ super().__init__(**kwargs)
17
+
18
+
19
+ class Blip3VisionTokenizerConfig(PretrainedConfig):
20
+ model_type = "blip_3_vision_tokenizer"
21
+
22
+ def __init__(self,
23
+ vis_feature_dim: int = 1280,
24
+ lang_embedding_dim: int = 3072,
25
+ num_vis_tokens: int = 128,
26
+ image_aspect_ratio: str = 'anyres',
27
+ repeat_latents: bool = False,
28
+ **kwargs):
29
+ self.vis_feature_dim = vis_feature_dim
30
+ self.lang_embedding_dim = lang_embedding_dim
31
+ self.num_vis_tokens = num_vis_tokens
32
+ self.image_aspect_ratio = image_aspect_ratio
33
+ self.repeat_latents = repeat_latents
34
+ super().__init__(**kwargs)
35
+
36
+
37
+ class Blip3Config(PretrainedConfig):
38
+ model_type = "blip_3"
39
+
40
+ def __init__(self,
41
+ vision_encoder_config: dict = None,
42
+ vision_tokenizer_config: dict = None,
43
+ text_config: dict = None,
44
+ **kwargs):
45
+
46
+ if vision_encoder_config is None:
47
+ vision_encoder_config = {'image_aspect_ratio': 'anyres', 'anyres_patch_sampling': True}
48
+ logger.info("vision_encoder_config is None. initializing the Blip3VisionEncoderConfig with default values.")
49
+
50
+ if vision_tokenizer_config is None:
51
+ vision_tokenizer_config = {}
52
+ logger.info("vision_tokenizer_config is None. Initializing the Blip3VisionTokenizerConfig with default values.")
53
+
54
+ if text_config is None:
55
+ text_config = {
56
+ 'initial_tokenizer_len':32012,
57
+ 'pad_token_id':32011,
58
+ 'bos_token_id':1,
59
+ 'eos_token_id':32000,
60
+ 'vocab_size': 32064,
61
+ 'hidden_size': 3072,
62
+ 'intermediate_size': 8192,
63
+ 'num_hidden_layers': 32,
64
+ 'num_attention_heads': 32,
65
+ 'num_key_value_heads': 32,
66
+ 'resid_pdrop': 0.0,
67
+ 'embd_pdrop': 0.0,
68
+ 'attention_dropout': 0.0,
69
+ 'hidden_act': 'silu',
70
+ 'max_position_embeddings': 4096,
71
+ 'original_max_position_embeddings': 4096,
72
+ 'initializer_range': 0.02,
73
+ 'rms_norm_eps': 1e-05,
74
+ 'use_cache': True,
75
+ 'rope_theta': 10000.0,
76
+ 'rope_scaling': None,
77
+ 'sliding_window': 2047,
78
+ 'return_dict': True,
79
+ 'output_hidden_states': False,
80
+ 'output_attentions': False,
81
+ 'torchscript': False,
82
+ 'torch_dtype': 'bfloat16',
83
+ 'use_bfloat16': False,
84
+ 'tf_legacy_loss': False,
85
+ 'pruned_heads': {},
86
+ 'tie_word_embeddings': False,
87
+ 'chunk_size_feed_forward': 0,
88
+ 'is_encoder_decoder': False,
89
+ 'is_decoder': False,
90
+ 'cross_attention_hidden_size': None,
91
+ 'add_cross_attention': False,
92
+ 'tie_encoder_decoder': False,
93
+ 'max_length': 20,
94
+ 'min_length': 0,
95
+ 'do_sample': False,
96
+ 'early_stopping': False,
97
+ 'num_beams': 1,
98
+ 'num_beam_groups': 1,
99
+ 'diversity_penalty': 0.0,
100
+ 'temperature': 1.0,
101
+ 'top_k': 50,
102
+ 'top_p': 1.0,
103
+ 'typical_p': 1.0,
104
+ 'repetition_penalty': 1.0,
105
+ 'length_penalty': 1.0,
106
+ 'no_repeat_ngram_size': 0,
107
+ 'encoder_no_repeat_ngram_size': 0,
108
+ 'bad_words_ids': None,
109
+ 'num_return_sequences': 1,
110
+ 'output_scores': False,
111
+ 'return_dict_in_generate': False,
112
+ 'forced_bos_token_id': None,
113
+ 'forced_eos_token_id': None,
114
+ 'remove_invalid_values': False,
115
+ 'exponential_decay_length_penalty': None,
116
+ 'suppress_tokens': None,
117
+ 'begin_suppress_tokens': None,
118
+ 'finetuning_task': None,
119
+ 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'},
120
+ 'label2id': {'LABEL_0': 0, 'LABEL_1': 1},
121
+ 'tokenizer_class': None,
122
+ 'prefix': None,
123
+ 'bos_token_id': 1,
124
+ 'pad_token_id': 32000,
125
+ 'eos_token_id': 32000,
126
+ 'sep_token_id': None,
127
+ 'decoder_start_token_id': None,
128
+ 'task_specific_params': None,
129
+ 'problem_type': None,
130
+ 'model_type': 'phi3'
131
+ }
132
+ logger.info("text_config is None. Initializing the text config with default values (`Phi3Config`).")
133
+
134
+ self.vision_encoder_config = Blip3VisionEncoderConfig(**vision_encoder_config)
135
+
136
+ self.vision_tokenizer_config = Blip3VisionTokenizerConfig(**vision_tokenizer_config)
137
+
138
+ text_model_type = text_config["model_type"] if "model_type" in text_config else "phi3"
139
+ self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
140
+
141
+ for key in ['initial_tokenizer_len', 'pad_token_id']:
142
+ if key not in self.text_config.to_dict():
143
+ raise ValueError(f"The key `{key}` is missing in the text_config.")
144
+
145
+ super().__init__(**kwargs)
146
+
147
+ @classmethod
148
+ def from_vision_encoder_vision_tokenizer_text_configs(
149
+ cls,
150
+ vision_encoder_config: Blip3VisionEncoderConfig,
151
+ vision_tokenizer_config: Blip3VisionTokenizerConfig,
152
+ text_config: PretrainedConfig,
153
+ **kwargs):
154
+
155
+ return cls(
156
+ vision_encoder_config=vision_encoder_config.to_dict(),
157
+ vision_tokenizer_config=vision_tokenizer_config.to_dict(),
158
+ text_config=text_config.to_dict(),
159
+ **kwargs,
160
+ )
161
+
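A minimal sketch of how the three sub-configs above compose, assuming `configuration_blip_3.py` has been downloaded from this repo and is importable; this is not an official entry point, just an illustration of `from_vision_encoder_vision_tokenizer_text_configs`:

```python
# Sketch only: compose a Blip3Config from its sub-configs (file assumed to be available locally).
from transformers import CONFIG_MAPPING
from configuration_blip_3 import (Blip3Config, Blip3VisionEncoderConfig,
                                  Blip3VisionTokenizerConfig)

vision_cfg = Blip3VisionEncoderConfig()      # defaults: ViT-H-14-378-quickgelu at 378px
vis_tok_cfg = Blip3VisionTokenizerConfig()   # defaults: 128 visual tokens
text_cfg = CONFIG_MAPPING["phi3"](initial_tokenizer_len=32012, pad_token_id=32011)

cfg = Blip3Config.from_vision_encoder_vision_tokenizer_text_configs(
    vision_cfg, vis_tok_cfg, text_cfg)
print(cfg.vision_encoder_config.model_name)  # ViT-H-14-378-quickgelu
print(cfg.text_config.model_type)            # phi3
```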
demo.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 32000,
+   "pad_token_id": 32000,
+   "transformers_version": "4.41.0.dev0"
+ }
image_processing_blip_3.py ADDED
@@ -0,0 +1,409 @@
1
+ import random
2
+ from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
3
+ import torchvision.transforms.functional as F
4
+ from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \
5
+ CenterCrop, ColorJitter, Grayscale
6
+ import numbers
7
+ import torch
8
+ import ast
9
+ import math
10
+ from PIL import Image
11
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
12
+ from transformers.image_utils import ImageInput
13
+ from transformers.utils import TensorType
14
+
15
+
16
+ class Blip3ImageProcessor(BaseImageProcessor):
17
+
18
+ def __init__(
19
+ self,
20
+ do_resize: bool = True,
21
+ resize_mode: str = "squash",
22
+ interpolation_mode: str = "bicubic",
23
+ size: Union[Tuple[int, int], List[int]] = None,
24
+ image_mean: Optional[Union[float, List[float]]] = None,
25
+ image_std: Optional[Union[float, List[float]]] = None,
26
+ **kwargs,
27
+ ) -> None:
28
+ super().__init__(**kwargs)
29
+ self.do_resize = do_resize
30
+ self.resize_mode = resize_mode
31
+ self.interpolation_mode = interpolation_mode
32
+ self.size = size if size is not None else (378, 378)
33
+ self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073]
34
+ self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711]
35
+
36
+
37
+ @classmethod
38
+ def resize(cls, image_size, resize_mode, interpolation='bicubic', fill_color=0):
39
+ interpolation_mode = InterpolationMode.BILINEAR if interpolation == 'bilinear' else InterpolationMode.BICUBIC
40
+ if resize_mode == 'longest':
41
+ transforms = [
42
+ ResizeKeepRatio(image_size, interpolation=interpolation_mode, longest=1),
43
+ CenterCropOrPad(image_size, fill=fill_color)
44
+ ]
45
+ elif resize_mode == 'squash':
46
+ if isinstance(image_size, int):
47
+ image_size = (image_size, image_size)
48
+ transforms = [
49
+ Resize(image_size, interpolation=interpolation_mode),
50
+ ]
51
+ else:
52
+ assert resize_mode == 'shortest'
53
+ if not isinstance(image_size, (tuple, list)):
54
+ image_size = (image_size, image_size)
55
+ if image_size[0] == image_size[1]:
56
+ # simple case, use torchvision built-in Resize w/ shortest edge mode (scalar size arg)
57
+ transforms = [
58
+ Resize(image_size[0], interpolation=interpolation_mode)
59
+ ]
60
+ else:
61
+ # resize shortest edge to matching target dim for non-square target
62
+ transforms = [ResizeKeepRatio(image_size)]
63
+ transforms += [CenterCrop(image_size)]
64
+ return transforms
65
+
66
+ @classmethod
67
+ def convert_rgb(cls, image):
68
+ return image.convert("RGB")
69
+
70
+
71
+ def _preprocess(self,
72
+ images: ImageInput
73
+ ) -> torch.Tensor:
74
+ transforms = self.resize(self.size, self.resize_mode, self.interpolation_mode)
75
+ transforms.extend([
76
+ self.convert_rgb,
77
+ ToTensor(),
78
+ Normalize(mean=self.image_mean, std=self.image_std)
79
+ ])
80
+ composed_transforms = Compose(transforms)
81
+ images_tensor = composed_transforms(images)
82
+ return images_tensor
83
+
84
+ def preprocess(self,
85
+ images: ImageInput,
86
+ return_tensors: Optional[Union[str, TensorType]] = None,
87
+ **kwargs) -> BatchFeature:
88
+ if 'image_aspect_ratio' in kwargs:
89
+ image_aspect_ratio = kwargs['image_aspect_ratio']
90
+ else:
91
+ image_aspect_ratio = 'pad'
92
+ new_images = []
93
+ if image_aspect_ratio == 'pad':
94
+ for image in images:
95
+ image = self._preprocess(image)
96
+ new_images.append(image)
97
+ else:
98
+ if isinstance(self.size, (tuple, list)):
99
+ base_img_size = self.size[0]
100
+ else:
101
+ raise ValueError("size should be list or tuple")
102
+ for image in images:
103
+ image = process_anyres_image(image, self._preprocess, self.size,
104
+ [
105
+ [base_img_size,base_img_size*2],
106
+ [base_img_size*2,base_img_size],
107
+ [base_img_size*2,base_img_size*2],
108
+ [base_img_size*3,base_img_size],
109
+ [base_img_size,base_img_size*3]
110
+ ])
111
+ new_images.append(image)
112
+
113
+ if all(x.shape == new_images[0].shape for x in new_images):
114
+ new_images = torch.stack(new_images, dim=0)
115
+ if image_aspect_ratio == 'pad':
116
+ new_images = BatchFeature(data={"pixel_values": new_images.unsqueeze(0).unsqueeze(0)}, tensor_type=return_tensors)
117
+ else:
118
+ new_images = BatchFeature(data={"pixel_values": new_images.unsqueeze(0)}, tensor_type=return_tensors)
119
+ return new_images
120
+ # def preprocess(self,
121
+ # images: ImageInput,
122
+ # return_tensors: Optional[Union[str, TensorType]] = None,
123
+ # **kwargs) -> BatchFeature:
124
+ # transforms = self.resize(self.size, self.resize_mode, self.interpolation_mode)
125
+ # transforms.extend([
126
+ # self.convert_rgb,
127
+ # ToTensor(),
128
+ # Normalize(mean=self.image_mean, std=self.image_std)
129
+ # ])
130
+ # composed_transforms = Compose(transforms)
131
+ # images_tensor = composed_transforms(images).unsqueeze(0).unsqueeze(1).unsqueeze(0)
132
+ # encoded_outputs = BatchFeature(data={"pixel_values": images_tensor}, tensor_type=return_tensors)
133
+ # return encoded_outputs
134
+
135
+
136
+ class ResizeKeepRatio:
137
+ """ Resize and Keep Ratio
138
+
139
+ Copy & paste from `timm`
140
+ """
141
+
142
+ def __init__(
143
+ self,
144
+ size,
145
+ longest=0.,
146
+ interpolation=InterpolationMode.BICUBIC,
147
+ random_scale_prob=0.,
148
+ random_scale_range=(0.85, 1.05),
149
+ random_aspect_prob=0.,
150
+ random_aspect_range=(0.9, 1.11)
151
+ ):
152
+ if isinstance(size, (list, tuple)):
153
+ self.size = tuple(size)
154
+ else:
155
+ self.size = (size, size)
156
+ self.interpolation = interpolation
157
+ self.longest = float(longest) # [0, 1] where 0 == shortest edge, 1 == longest
158
+ self.random_scale_prob = random_scale_prob
159
+ self.random_scale_range = random_scale_range
160
+ self.random_aspect_prob = random_aspect_prob
161
+ self.random_aspect_range = random_aspect_range
162
+
163
+ @staticmethod
164
+ def get_params(
165
+ img,
166
+ target_size,
167
+ longest,
168
+ random_scale_prob=0.,
169
+ random_scale_range=(0.85, 1.05),
170
+ random_aspect_prob=0.,
171
+ random_aspect_range=(0.9, 1.11)
172
+ ):
173
+ """Get parameters
174
+ """
175
+ source_size = img.size[::-1] # h, w
176
+ h, w = source_size
177
+ target_h, target_w = target_size
178
+ ratio_h = h / target_h
179
+ ratio_w = w / target_w
180
+ ratio = max(ratio_h, ratio_w) * longest + min(ratio_h, ratio_w) * (1. - longest)
181
+ if random_scale_prob > 0 and random.random() < random_scale_prob:
182
+ ratio_factor = random.uniform(random_scale_range[0], random_scale_range[1])
183
+ ratio_factor = (ratio_factor, ratio_factor)
184
+ else:
185
+ ratio_factor = (1., 1.)
186
+ if random_aspect_prob > 0 and random.random() < random_aspect_prob:
187
+ aspect_factor = random.uniform(random_aspect_range[0], random_aspect_range[1])
188
+ ratio_factor = (ratio_factor[0] / aspect_factor, ratio_factor[1] * aspect_factor)
189
+ size = [round(x * f / ratio) for x, f in zip(source_size, ratio_factor)]
190
+ return size
191
+
192
+ def __call__(self, img):
193
+ """
194
+ Args:
195
+ img (PIL Image): Image to be cropped and resized.
196
+
197
+ Returns:
198
+ PIL Image: Resized, padded to at least target size, possibly cropped to exactly target size
199
+ """
200
+ size = self.get_params(
201
+ img, self.size, self.longest,
202
+ self.random_scale_prob, self.random_scale_range,
203
+ self.random_aspect_prob, self.random_aspect_range
204
+ )
205
+ img = F.resize(img, size, self.interpolation)
206
+ return img
207
+
208
+ def __repr__(self):
209
+ format_string = self.__class__.__name__ + '(size={0}'.format(self.size)
210
+ format_string += f', interpolation={self.interpolation})'
211
+ format_string += f', longest={self.longest:.3f})'
212
+ return format_string
213
+
214
+ def _setup_size(size, error_msg):
215
+ if isinstance(size, numbers.Number):
216
+ return int(size), int(size)
217
+
218
+ if isinstance(size, Sequence) and len(size) == 1:
219
+ return size[0], size[0]
220
+
221
+ if len(size) != 2:
222
+ raise ValueError(error_msg)
223
+
224
+ return size
225
+
226
+ def center_crop_or_pad(img: torch.Tensor, output_size: List[int], fill=0) -> torch.Tensor:
227
+ """Center crops and/or pads the given image.
228
+ If the image is torch Tensor, it is expected
229
+ to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
230
+ If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.
231
+
232
+ Args:
233
+ img (PIL Image or Tensor): Image to be cropped.
234
+ output_size (sequence or int): (height, width) of the crop box. If int or sequence with single int,
235
+ it is used for both directions.
236
+ fill (int, Tuple[int]): Padding color
237
+
238
+ Returns:
239
+ PIL Image or Tensor: Cropped image.
240
+ """
241
+ if isinstance(output_size, numbers.Number):
242
+ output_size = (int(output_size), int(output_size))
243
+ elif isinstance(output_size, (tuple, list)) and len(output_size) == 1:
244
+ output_size = (output_size[0], output_size[0])
245
+
246
+ _, image_height, image_width = F.get_dimensions(img)
247
+ crop_height, crop_width = output_size
248
+
249
+ if crop_width > image_width or crop_height > image_height:
250
+ padding_ltrb = [
251
+ (crop_width - image_width) // 2 if crop_width > image_width else 0,
252
+ (crop_height - image_height) // 2 if crop_height > image_height else 0,
253
+ (crop_width - image_width + 1) // 2 if crop_width > image_width else 0,
254
+ (crop_height - image_height + 1) // 2 if crop_height > image_height else 0,
255
+ ]
256
+ img = F.pad(img, padding_ltrb, fill=fill)
257
+ _, image_height, image_width = F.get_dimensions(img)
258
+ if crop_width == image_width and crop_height == image_height:
259
+ return img
260
+
261
+ crop_top = int(round((image_height - crop_height) / 2.0))
262
+ crop_left = int(round((image_width - crop_width) / 2.0))
263
+ return F.crop(img, crop_top, crop_left, crop_height, crop_width)
264
+
265
+ class CenterCropOrPad(torch.nn.Module):
266
+ """Crops the given image at the center.
267
+ If the image is torch Tensor, it is expected
268
+ to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
269
+ If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.
270
+
271
+ Args:
272
+ size (sequence or int): Desired output size of the crop. If size is an
273
+ int instead of sequence like (h, w), a square crop (size, size) is
274
+ made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
275
+ """
276
+
277
+ def __init__(self, size, fill=0):
278
+ super().__init__()
279
+ self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
280
+ self.fill = fill
281
+
282
+ def forward(self, img):
283
+ """
284
+ Args:
285
+ img (PIL Image or Tensor): Image to be cropped.
286
+
287
+ Returns:
288
+ PIL Image or Tensor: Cropped image.
289
+ """
290
+ return center_crop_or_pad(img, self.size, fill=self.fill)
291
+
292
+ def __repr__(self) -> str:
293
+ return f"{self.__class__.__name__}(size={self.size})"
294
+
295
+ def process_anyres_image(image, processor, processor_size, grid_pinpoints):
296
+ """
297
+ Process an image with variable resolutions.
298
+
299
+ Args:
300
+ image (PIL.Image.Image): The input image to be processed.
301
+ processor: The image processor object.
302
+ processor_size (tuple, list): The size of the image processor.
303
+ grid_pinpoints (str): A string representation of a list of possible resolutions.
304
+
305
+ Returns:
306
+ torch.Tensor: A tensor containing the processed image patches.
307
+ """
308
+ # FIXME: determine grid_pinpoints from image sizes.
309
+ if type(grid_pinpoints) is list:
310
+ possible_resolutions = grid_pinpoints
311
+ else:
312
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
313
+ best_resolution = select_best_resolution(image.size, possible_resolutions)
314
+ image_padded = resize_and_pad_image(image, best_resolution)
315
+
316
+ # processor_size = processor.transforms[0].size
317
+ patches = divide_to_patches(image_padded, processor_size[0])
318
+
319
+ image_original_resize = image.resize((processor_size[0], processor_size[0]))
320
+
321
+ image_patches = [image_original_resize] + patches
322
+ image_patches = [processor(image_patch)
323
+ for image_patch in image_patches]
324
+ return torch.stack(image_patches, dim=0)
325
+
326
+
327
+ def select_best_resolution(original_size, possible_resolutions):
328
+ """
329
+ Selects the best resolution from a list of possible resolutions based on the original size.
330
+
331
+ Args:
332
+ original_size (tuple): The original size of the image in the format (width, height).
333
+ possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
334
+
335
+ Returns:
336
+ tuple: The best fit resolution in the format (width, height).
337
+ """
338
+ original_width, original_height = original_size
339
+ best_fit = None
340
+ max_effective_resolution = 0
341
+ min_wasted_resolution = float('inf')
342
+
343
+ for width, height in possible_resolutions:
344
+ scale = min(width / original_width, height / original_height)
345
+ downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
346
+ effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
347
+ wasted_resolution = (width * height) - effective_resolution
348
+
349
+ if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
350
+ max_effective_resolution = effective_resolution
351
+ min_wasted_resolution = wasted_resolution
352
+ best_fit = (width, height)
353
+
354
+ return best_fit
355
+
356
+ def resize_and_pad_image(image, target_resolution):
357
+ """
358
+ Resize and pad an image to a target resolution while maintaining aspect ratio.
359
+
360
+ Args:
361
+ image (PIL.Image.Image): The input image.
362
+ target_resolution (tuple): The target resolution (width, height) of the image.
363
+
364
+ Returns:
365
+ PIL.Image.Image: The resized and padded image.
366
+ """
367
+ original_width, original_height = image.size
368
+ target_width, target_height = target_resolution
369
+
370
+ scale_w = target_width / original_width
371
+ scale_h = target_height / original_height
372
+
373
+ if scale_w < scale_h:
374
+ new_width = target_width
375
+ new_height = min(math.ceil(original_height * scale_w), target_height)
376
+ else:
377
+ new_height = target_height
378
+ new_width = min(math.ceil(original_width * scale_h), target_width)
379
+
380
+ # Resize the image
381
+ resized_image = image.resize((new_width, new_height))
382
+
383
+ new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
384
+ paste_x = (target_width - new_width) // 2
385
+ paste_y = (target_height - new_height) // 2
386
+ new_image.paste(resized_image, (paste_x, paste_y))
387
+
388
+ return new_image
389
+
390
+ def divide_to_patches(image, patch_size):
391
+ """
392
+ Divides an image into patches of a specified size.
393
+
394
+ Args:
395
+ image (PIL.Image.Image): The input image.
396
+ patch_size (int): The size of each patch.
397
+
398
+ Returns:
399
+ list: A list of PIL.Image.Image objects representing the patches.
400
+ """
401
+ patches = []
402
+ width, height = image.size
403
+ for i in range(0, height, patch_size):
404
+ for j in range(0, width, patch_size):
405
+ box = (j, i, j + patch_size, i + patch_size)
406
+ patch = image.crop(box)
407
+ patches.append(patch)
408
+
409
+ return patches
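As a concrete illustration of the anyres logic above (assuming `image_processing_blip_3.py` has been downloaded from this repo and is importable), the candidate grid used in `preprocess` is built from the 378px base size, and `select_best_resolution` picks the layout that preserves the most effective pixels:

```python
# Illustration only: the candidate grid mirrors the one built in Blip3ImageProcessor.preprocess.
from image_processing_blip_3 import select_best_resolution

base = 378  # default base image size of Blip3ImageProcessor
grid = [[base, base * 2], [base * 2, base], [base * 2, base * 2],
        [base * 3, base], [base, base * 3]]

# a wide 1600x400 input is matched to the 3x1 layout
print(select_best_resolution((1600, 400), grid))  # (1134, 378)
```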
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e6acb1fba540ae862c8ec5a3f0898a25f97df07029235aa164be537e13e664b
3
+ size 4977054880
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edf7152efe2adbf112f09395c1a443d53b439cba0342285dae7ab6225281dcaa
3
+ size 4983112128
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1afa3c02b194f2a1cd36f296ef7690c42f356b9b48e98644011b59336b0699a
3
+ size 4983112168
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b69d6172f961180def49586fe73b71c2bd2e4ba968564f276486e86030a1da36
3
+ size 3414256548
model.safetensors.index.json ADDED
@@ -0,0 +1,673 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 18357448972
4
+ },
5
+ "weight_map": {
6
+ "vlm.lang_model.lm_head.additional_fc.bias": "model-00004-of-00004.safetensors",
7
+ "vlm.lang_model.lm_head.additional_fc.weight": "model-00004-of-00004.safetensors",
8
+ "vlm.lang_model.lm_head.bias": "model-00004-of-00004.safetensors",
9
+ "vlm.lang_model.lm_head.weight": "model-00004-of-00004.safetensors",
10
+ "vlm.lang_model.model.embed_tokens.additional_embedding.weight": "model-00001-of-00004.safetensors",
11
+ "vlm.lang_model.model.embed_tokens.weight": "model-00001-of-00004.safetensors",
12
+ "vlm.lang_model.model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "vlm.lang_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
14
+ "vlm.lang_model.model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
15
+ "vlm.lang_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
16
+ "vlm.lang_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "vlm.lang_model.model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
18
+ "vlm.lang_model.model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
19
+ "vlm.lang_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
20
+ "vlm.lang_model.model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
21
+ "vlm.lang_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "vlm.lang_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
23
+ "vlm.lang_model.model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
24
+ "vlm.lang_model.model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
25
+ "vlm.lang_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
26
+ "vlm.lang_model.model.layers.10.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
27
+ "vlm.lang_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
28
+ "vlm.lang_model.model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
29
+ "vlm.lang_model.model.layers.10.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
30
+ "vlm.lang_model.model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
31
+ "vlm.lang_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
32
+ "vlm.lang_model.model.layers.11.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
33
+ "vlm.lang_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
34
+ "vlm.lang_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
35
+ "vlm.lang_model.model.layers.11.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
36
+ "vlm.lang_model.model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
37
+ "vlm.lang_model.model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
38
+ "vlm.lang_model.model.layers.12.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
39
+ "vlm.lang_model.model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
40
+ "vlm.lang_model.model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
41
+ "vlm.lang_model.model.layers.12.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
42
+ "vlm.lang_model.model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
43
+ "vlm.lang_model.model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
44
+ "vlm.lang_model.model.layers.13.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
45
+ "vlm.lang_model.model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "vlm.lang_model.model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
47
+ "vlm.lang_model.model.layers.13.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
48
+ "vlm.lang_model.model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors",
49
+ "vlm.lang_model.model.layers.14.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
50
+ "vlm.lang_model.model.layers.14.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
51
+ "vlm.lang_model.model.layers.14.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
52
+ "vlm.lang_model.model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
53
+ "vlm.lang_model.model.layers.14.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
54
+ "vlm.lang_model.model.layers.15.input_layernorm.weight": "model-00003-of-00004.safetensors",
55
+ "vlm.lang_model.model.layers.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
56
+ "vlm.lang_model.model.layers.15.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
57
+ "vlm.lang_model.model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
58
+ "vlm.lang_model.model.layers.15.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
59
+ "vlm.lang_model.model.layers.15.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
60
+ "vlm.lang_model.model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors",
61
+ "vlm.lang_model.model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
62
+ "vlm.lang_model.model.layers.16.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
63
+ "vlm.lang_model.model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
64
+ "vlm.lang_model.model.layers.16.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
65
+ "vlm.lang_model.model.layers.16.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
66
+ "vlm.lang_model.model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
67
+ "vlm.lang_model.model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
68
+ "vlm.lang_model.model.layers.17.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
69
+ "vlm.lang_model.model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
70
+ "vlm.lang_model.model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
71
+ "vlm.lang_model.model.layers.17.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
72
+ "vlm.lang_model.model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
73
+ "vlm.lang_model.model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
74
+ "vlm.lang_model.model.layers.18.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
75
+ "vlm.lang_model.model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
76
+ "vlm.lang_model.model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
77
+ "vlm.lang_model.model.layers.18.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
78
+ "vlm.lang_model.model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
79
+ "vlm.lang_model.model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
80
+ "vlm.lang_model.model.layers.19.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
81
+ "vlm.lang_model.model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
82
+ "vlm.lang_model.model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
83
+ "vlm.lang_model.model.layers.19.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
84
+ "vlm.lang_model.model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
85
+ "vlm.lang_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
86
+ "vlm.lang_model.model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
87
+ "vlm.lang_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
88
+ "vlm.lang_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
89
+ "vlm.lang_model.model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
90
+ "vlm.lang_model.model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
91
+ "vlm.lang_model.model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
92
+ "vlm.lang_model.model.layers.20.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
93
+ "vlm.lang_model.model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
94
+ "vlm.lang_model.model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
95
+ "vlm.lang_model.model.layers.20.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
96
+ "vlm.lang_model.model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
97
+ "vlm.lang_model.model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
98
+ "vlm.lang_model.model.layers.21.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
99
+ "vlm.lang_model.model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
100
+ "vlm.lang_model.model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
101
+ "vlm.lang_model.model.layers.21.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
102
+ "vlm.lang_model.model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
103
+ "vlm.lang_model.model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
104
+ "vlm.lang_model.model.layers.22.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
105
+ "vlm.lang_model.model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
106
+ "vlm.lang_model.model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
107
+ "vlm.lang_model.model.layers.22.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
108
+ "vlm.lang_model.model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
109
+ "vlm.lang_model.model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
110
+ "vlm.lang_model.model.layers.23.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
111
+ "vlm.lang_model.model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
112
+ "vlm.lang_model.model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
113
+ "vlm.lang_model.model.layers.23.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
114
+ "vlm.lang_model.model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
115
+ "vlm.lang_model.model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
116
+ "vlm.lang_model.model.layers.24.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
117
+ "vlm.lang_model.model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
118
+ "vlm.lang_model.model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
119
+ "vlm.lang_model.model.layers.24.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
120
+ "vlm.lang_model.model.layers.25.input_layernorm.weight": "model-00004-of-00004.safetensors",
121
+ "vlm.lang_model.model.layers.25.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
122
+ "vlm.lang_model.model.layers.25.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
123
+ "vlm.lang_model.model.layers.25.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
124
+ "vlm.lang_model.model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
125
+ "vlm.lang_model.model.layers.25.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
126
+ "vlm.lang_model.model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
127
+ "vlm.lang_model.model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
128
+ "vlm.lang_model.model.layers.26.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
129
+ "vlm.lang_model.model.layers.26.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
130
+ "vlm.lang_model.model.layers.26.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
131
+ "vlm.lang_model.model.layers.26.self_attn.qkv_proj.weight": "model-00004-of-00004.safetensors",
132
+ "vlm.lang_model.model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
133
+ "vlm.lang_model.model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
134
+ "vlm.lang_model.model.layers.27.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
135
+ "vlm.lang_model.model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
136
+ "vlm.lang_model.model.layers.27.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
137
+ "vlm.lang_model.model.layers.27.self_attn.qkv_proj.weight": "model-00004-of-00004.safetensors",
138
+ "vlm.lang_model.model.layers.28.input_layernorm.weight": "model-00004-of-00004.safetensors",
139
+ "vlm.lang_model.model.layers.28.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
140
+ "vlm.lang_model.model.layers.28.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
141
+ "vlm.lang_model.model.layers.28.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
142
+ "vlm.lang_model.model.layers.28.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
143
+ "vlm.lang_model.model.layers.28.self_attn.qkv_proj.weight": "model-00004-of-00004.safetensors",
144
+ "vlm.lang_model.model.layers.29.input_layernorm.weight": "model-00004-of-00004.safetensors",
145
+ "vlm.lang_model.model.layers.29.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
146
+ "vlm.lang_model.model.layers.29.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
147
+ "vlm.lang_model.model.layers.29.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
148
+ "vlm.lang_model.model.layers.29.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
149
+ "vlm.lang_model.model.layers.29.self_attn.qkv_proj.weight": "model-00004-of-00004.safetensors",
150
+ "vlm.lang_model.model.layers.3.input_layernorm.weight": "model-00002-of-00004.safetensors",
151
+ "vlm.lang_model.model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
152
+ "vlm.lang_model.model.layers.3.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
153
+ "vlm.lang_model.model.layers.3.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
154
+ "vlm.lang_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
155
+ "vlm.lang_model.model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
156
+ "vlm.lang_model.model.layers.30.input_layernorm.weight": "model-00004-of-00004.safetensors",
157
+ "vlm.lang_model.model.layers.30.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
158
+ "vlm.lang_model.model.layers.30.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
159
+ "vlm.lang_model.model.layers.30.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
160
+ "vlm.lang_model.model.layers.30.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
161
+ "vlm.lang_model.model.layers.30.self_attn.qkv_proj.weight": "model-00004-of-00004.safetensors",
162
+ "vlm.lang_model.model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
163
+ "vlm.lang_model.model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
164
+ "vlm.lang_model.model.layers.31.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
165
+ "vlm.lang_model.model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
166
+ "vlm.lang_model.model.layers.31.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
167
+ "vlm.lang_model.model.layers.31.self_attn.qkv_proj.weight": "model-00004-of-00004.safetensors",
168
+ "vlm.lang_model.model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors",
169
+ "vlm.lang_model.model.layers.4.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
170
+ "vlm.lang_model.model.layers.4.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
171
+ "vlm.lang_model.model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
172
+ "vlm.lang_model.model.layers.4.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
173
+ "vlm.lang_model.model.layers.4.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
174
+ "vlm.lang_model.model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors",
175
+ "vlm.lang_model.model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
176
+ "vlm.lang_model.model.layers.5.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
177
+ "vlm.lang_model.model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
178
+ "vlm.lang_model.model.layers.5.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
179
+ "vlm.lang_model.model.layers.5.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
180
+ "vlm.lang_model.model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
181
+ "vlm.lang_model.model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
182
+ "vlm.lang_model.model.layers.6.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
183
+ "vlm.lang_model.model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
184
+ "vlm.lang_model.model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
185
+ "vlm.lang_model.model.layers.6.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
186
+ "vlm.lang_model.model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
187
+ "vlm.lang_model.model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
188
+ "vlm.lang_model.model.layers.7.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
189
+ "vlm.lang_model.model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
190
+ "vlm.lang_model.model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
191
+ "vlm.lang_model.model.layers.7.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
192
+ "vlm.lang_model.model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
193
+ "vlm.lang_model.model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
194
+ "vlm.lang_model.model.layers.8.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
195
+ "vlm.lang_model.model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
196
+ "vlm.lang_model.model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
197
+ "vlm.lang_model.model.layers.8.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
198
+ "vlm.lang_model.model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
199
+ "vlm.lang_model.model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
200
+ "vlm.lang_model.model.layers.9.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
201
+ "vlm.lang_model.model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
202
+ "vlm.lang_model.model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
203
+ "vlm.lang_model.model.layers.9.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
204
+ "vlm.lang_model.model.norm.weight": "model-00004-of-00004.safetensors",
205
+ "vlm.vision_encoder.class_embedding": "model-00001-of-00004.safetensors",
206
+ "vlm.vision_encoder.conv1.weight": "model-00001-of-00004.safetensors",
207
+ "vlm.vision_encoder.ln_post.bias": "model-00001-of-00004.safetensors",
208
+ "vlm.vision_encoder.ln_post.weight": "model-00001-of-00004.safetensors",
209
+ "vlm.vision_encoder.ln_pre.bias": "model-00001-of-00004.safetensors",
210
+ "vlm.vision_encoder.ln_pre.weight": "model-00001-of-00004.safetensors",
211
+ "vlm.vision_encoder.positional_embedding": "model-00001-of-00004.safetensors",
212
+ "vlm.vision_encoder.proj": "model-00001-of-00004.safetensors",
213
+ "vlm.vision_encoder.transformer.resblocks.0.attn.in_proj_bias": "model-00001-of-00004.safetensors",
214
+ "vlm.vision_encoder.transformer.resblocks.0.attn.in_proj_weight": "model-00001-of-00004.safetensors",
215
+ "vlm.vision_encoder.transformer.resblocks.0.attn.out_proj.bias": "model-00001-of-00004.safetensors",
216
+ "vlm.vision_encoder.transformer.resblocks.0.attn.out_proj.weight": "model-00001-of-00004.safetensors",
217
+ "vlm.vision_encoder.transformer.resblocks.0.ln_1.bias": "model-00001-of-00004.safetensors",
218
+ "vlm.vision_encoder.transformer.resblocks.0.ln_1.weight": "model-00001-of-00004.safetensors",
219
+ "vlm.vision_encoder.transformer.resblocks.0.ln_2.bias": "model-00001-of-00004.safetensors",
220
+ "vlm.vision_encoder.transformer.resblocks.0.ln_2.weight": "model-00001-of-00004.safetensors",
221
+ "vlm.vision_encoder.transformer.resblocks.0.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
222
+ "vlm.vision_encoder.transformer.resblocks.0.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
223
+ "vlm.vision_encoder.transformer.resblocks.0.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
224
+ "vlm.vision_encoder.transformer.resblocks.0.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
225
+ "vlm.vision_encoder.transformer.resblocks.1.attn.in_proj_bias": "model-00001-of-00004.safetensors",
226
+ "vlm.vision_encoder.transformer.resblocks.1.attn.in_proj_weight": "model-00001-of-00004.safetensors",
227
+ "vlm.vision_encoder.transformer.resblocks.1.attn.out_proj.bias": "model-00001-of-00004.safetensors",
228
+ "vlm.vision_encoder.transformer.resblocks.1.attn.out_proj.weight": "model-00001-of-00004.safetensors",
229
+ "vlm.vision_encoder.transformer.resblocks.1.ln_1.bias": "model-00001-of-00004.safetensors",
230
+ "vlm.vision_encoder.transformer.resblocks.1.ln_1.weight": "model-00001-of-00004.safetensors",
231
+ "vlm.vision_encoder.transformer.resblocks.1.ln_2.bias": "model-00001-of-00004.safetensors",
232
+ "vlm.vision_encoder.transformer.resblocks.1.ln_2.weight": "model-00001-of-00004.safetensors",
233
+ "vlm.vision_encoder.transformer.resblocks.1.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
234
+ "vlm.vision_encoder.transformer.resblocks.1.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
235
+ "vlm.vision_encoder.transformer.resblocks.1.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
236
+ "vlm.vision_encoder.transformer.resblocks.1.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
237
+ "vlm.vision_encoder.transformer.resblocks.10.attn.in_proj_bias": "model-00001-of-00004.safetensors",
238
+ "vlm.vision_encoder.transformer.resblocks.10.attn.in_proj_weight": "model-00001-of-00004.safetensors",
239
+ "vlm.vision_encoder.transformer.resblocks.10.attn.out_proj.bias": "model-00001-of-00004.safetensors",
240
+ "vlm.vision_encoder.transformer.resblocks.10.attn.out_proj.weight": "model-00001-of-00004.safetensors",
241
+ "vlm.vision_encoder.transformer.resblocks.10.ln_1.bias": "model-00001-of-00004.safetensors",
242
+ "vlm.vision_encoder.transformer.resblocks.10.ln_1.weight": "model-00001-of-00004.safetensors",
243
+ "vlm.vision_encoder.transformer.resblocks.10.ln_2.bias": "model-00001-of-00004.safetensors",
244
+ "vlm.vision_encoder.transformer.resblocks.10.ln_2.weight": "model-00001-of-00004.safetensors",
245
+ "vlm.vision_encoder.transformer.resblocks.10.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
246
+ "vlm.vision_encoder.transformer.resblocks.10.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
247
+ "vlm.vision_encoder.transformer.resblocks.10.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
248
+ "vlm.vision_encoder.transformer.resblocks.10.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
249
+ "vlm.vision_encoder.transformer.resblocks.11.attn.in_proj_bias": "model-00001-of-00004.safetensors",
250
+ "vlm.vision_encoder.transformer.resblocks.11.attn.in_proj_weight": "model-00001-of-00004.safetensors",
251
+ "vlm.vision_encoder.transformer.resblocks.11.attn.out_proj.bias": "model-00001-of-00004.safetensors",
252
+ "vlm.vision_encoder.transformer.resblocks.11.attn.out_proj.weight": "model-00001-of-00004.safetensors",
253
+ "vlm.vision_encoder.transformer.resblocks.11.ln_1.bias": "model-00001-of-00004.safetensors",
254
+ "vlm.vision_encoder.transformer.resblocks.11.ln_1.weight": "model-00001-of-00004.safetensors",
255
+ "vlm.vision_encoder.transformer.resblocks.11.ln_2.bias": "model-00001-of-00004.safetensors",
256
+ "vlm.vision_encoder.transformer.resblocks.11.ln_2.weight": "model-00001-of-00004.safetensors",
257
+ "vlm.vision_encoder.transformer.resblocks.11.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
258
+ "vlm.vision_encoder.transformer.resblocks.11.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
259
+ "vlm.vision_encoder.transformer.resblocks.11.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
260
+ "vlm.vision_encoder.transformer.resblocks.11.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
261
+ "vlm.vision_encoder.transformer.resblocks.12.attn.in_proj_bias": "model-00001-of-00004.safetensors",
262
+ "vlm.vision_encoder.transformer.resblocks.12.attn.in_proj_weight": "model-00001-of-00004.safetensors",
263
+ "vlm.vision_encoder.transformer.resblocks.12.attn.out_proj.bias": "model-00001-of-00004.safetensors",
264
+ "vlm.vision_encoder.transformer.resblocks.12.attn.out_proj.weight": "model-00001-of-00004.safetensors",
265
+ "vlm.vision_encoder.transformer.resblocks.12.ln_1.bias": "model-00001-of-00004.safetensors",
266
+ "vlm.vision_encoder.transformer.resblocks.12.ln_1.weight": "model-00001-of-00004.safetensors",
267
+ "vlm.vision_encoder.transformer.resblocks.12.ln_2.bias": "model-00001-of-00004.safetensors",
268
+ "vlm.vision_encoder.transformer.resblocks.12.ln_2.weight": "model-00001-of-00004.safetensors",
269
+ "vlm.vision_encoder.transformer.resblocks.12.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
270
+ "vlm.vision_encoder.transformer.resblocks.12.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
271
+ "vlm.vision_encoder.transformer.resblocks.12.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
272
+ "vlm.vision_encoder.transformer.resblocks.12.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
273
+ "vlm.vision_encoder.transformer.resblocks.13.attn.in_proj_bias": "model-00001-of-00004.safetensors",
274
+ "vlm.vision_encoder.transformer.resblocks.13.attn.in_proj_weight": "model-00001-of-00004.safetensors",
275
+ "vlm.vision_encoder.transformer.resblocks.13.attn.out_proj.bias": "model-00001-of-00004.safetensors",
276
+ "vlm.vision_encoder.transformer.resblocks.13.attn.out_proj.weight": "model-00001-of-00004.safetensors",
277
+ "vlm.vision_encoder.transformer.resblocks.13.ln_1.bias": "model-00001-of-00004.safetensors",
278
+ "vlm.vision_encoder.transformer.resblocks.13.ln_1.weight": "model-00001-of-00004.safetensors",
279
+ "vlm.vision_encoder.transformer.resblocks.13.ln_2.bias": "model-00001-of-00004.safetensors",
280
+ "vlm.vision_encoder.transformer.resblocks.13.ln_2.weight": "model-00001-of-00004.safetensors",
281
+ "vlm.vision_encoder.transformer.resblocks.13.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
282
+ "vlm.vision_encoder.transformer.resblocks.13.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
283
+ "vlm.vision_encoder.transformer.resblocks.13.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
284
+ "vlm.vision_encoder.transformer.resblocks.13.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
285
+ "vlm.vision_encoder.transformer.resblocks.14.attn.in_proj_bias": "model-00001-of-00004.safetensors",
286
+ "vlm.vision_encoder.transformer.resblocks.14.attn.in_proj_weight": "model-00001-of-00004.safetensors",
287
+ "vlm.vision_encoder.transformer.resblocks.14.attn.out_proj.bias": "model-00001-of-00004.safetensors",
288
+ "vlm.vision_encoder.transformer.resblocks.14.attn.out_proj.weight": "model-00001-of-00004.safetensors",
289
+ "vlm.vision_encoder.transformer.resblocks.14.ln_1.bias": "model-00001-of-00004.safetensors",
290
+ "vlm.vision_encoder.transformer.resblocks.14.ln_1.weight": "model-00001-of-00004.safetensors",
291
+ "vlm.vision_encoder.transformer.resblocks.14.ln_2.bias": "model-00001-of-00004.safetensors",
292
+ "vlm.vision_encoder.transformer.resblocks.14.ln_2.weight": "model-00001-of-00004.safetensors",
293
+ "vlm.vision_encoder.transformer.resblocks.14.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
294
+ "vlm.vision_encoder.transformer.resblocks.14.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
295
+ "vlm.vision_encoder.transformer.resblocks.14.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
296
+ "vlm.vision_encoder.transformer.resblocks.14.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
297
+ "vlm.vision_encoder.transformer.resblocks.15.attn.in_proj_bias": "model-00001-of-00004.safetensors",
298
+ "vlm.vision_encoder.transformer.resblocks.15.attn.in_proj_weight": "model-00001-of-00004.safetensors",
299
+ "vlm.vision_encoder.transformer.resblocks.15.attn.out_proj.bias": "model-00001-of-00004.safetensors",
300
+ "vlm.vision_encoder.transformer.resblocks.15.attn.out_proj.weight": "model-00001-of-00004.safetensors",
301
+ "vlm.vision_encoder.transformer.resblocks.15.ln_1.bias": "model-00001-of-00004.safetensors",
302
+ "vlm.vision_encoder.transformer.resblocks.15.ln_1.weight": "model-00001-of-00004.safetensors",
303
+ "vlm.vision_encoder.transformer.resblocks.15.ln_2.bias": "model-00001-of-00004.safetensors",
304
+ "vlm.vision_encoder.transformer.resblocks.15.ln_2.weight": "model-00001-of-00004.safetensors",
305
+ "vlm.vision_encoder.transformer.resblocks.15.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
306
+ "vlm.vision_encoder.transformer.resblocks.15.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
307
+ "vlm.vision_encoder.transformer.resblocks.15.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
308
+ "vlm.vision_encoder.transformer.resblocks.15.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
309
+ "vlm.vision_encoder.transformer.resblocks.16.attn.in_proj_bias": "model-00001-of-00004.safetensors",
310
+ "vlm.vision_encoder.transformer.resblocks.16.attn.in_proj_weight": "model-00001-of-00004.safetensors",
311
+ "vlm.vision_encoder.transformer.resblocks.16.attn.out_proj.bias": "model-00001-of-00004.safetensors",
312
+ "vlm.vision_encoder.transformer.resblocks.16.attn.out_proj.weight": "model-00001-of-00004.safetensors",
313
+ "vlm.vision_encoder.transformer.resblocks.16.ln_1.bias": "model-00001-of-00004.safetensors",
314
+ "vlm.vision_encoder.transformer.resblocks.16.ln_1.weight": "model-00001-of-00004.safetensors",
315
+ "vlm.vision_encoder.transformer.resblocks.16.ln_2.bias": "model-00001-of-00004.safetensors",
316
+ "vlm.vision_encoder.transformer.resblocks.16.ln_2.weight": "model-00001-of-00004.safetensors",
317
+ "vlm.vision_encoder.transformer.resblocks.16.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
318
+ "vlm.vision_encoder.transformer.resblocks.16.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
319
+ "vlm.vision_encoder.transformer.resblocks.16.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
320
+ "vlm.vision_encoder.transformer.resblocks.16.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
321
+ "vlm.vision_encoder.transformer.resblocks.17.attn.in_proj_bias": "model-00001-of-00004.safetensors",
322
+ "vlm.vision_encoder.transformer.resblocks.17.attn.in_proj_weight": "model-00001-of-00004.safetensors",
323
+ "vlm.vision_encoder.transformer.resblocks.17.attn.out_proj.bias": "model-00001-of-00004.safetensors",
324
+ "vlm.vision_encoder.transformer.resblocks.17.attn.out_proj.weight": "model-00001-of-00004.safetensors",
325
+ "vlm.vision_encoder.transformer.resblocks.17.ln_1.bias": "model-00001-of-00004.safetensors",
326
+ "vlm.vision_encoder.transformer.resblocks.17.ln_1.weight": "model-00001-of-00004.safetensors",
327
+ "vlm.vision_encoder.transformer.resblocks.17.ln_2.bias": "model-00001-of-00004.safetensors",
328
+ "vlm.vision_encoder.transformer.resblocks.17.ln_2.weight": "model-00001-of-00004.safetensors",
329
+ "vlm.vision_encoder.transformer.resblocks.17.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
330
+ "vlm.vision_encoder.transformer.resblocks.17.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
331
+ "vlm.vision_encoder.transformer.resblocks.17.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
332
+ "vlm.vision_encoder.transformer.resblocks.17.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
333
+ "vlm.vision_encoder.transformer.resblocks.18.attn.in_proj_bias": "model-00001-of-00004.safetensors",
334
+ "vlm.vision_encoder.transformer.resblocks.18.attn.in_proj_weight": "model-00001-of-00004.safetensors",
335
+ "vlm.vision_encoder.transformer.resblocks.18.attn.out_proj.bias": "model-00001-of-00004.safetensors",
336
+ "vlm.vision_encoder.transformer.resblocks.18.attn.out_proj.weight": "model-00001-of-00004.safetensors",
337
+ "vlm.vision_encoder.transformer.resblocks.18.ln_1.bias": "model-00001-of-00004.safetensors",
338
+ "vlm.vision_encoder.transformer.resblocks.18.ln_1.weight": "model-00001-of-00004.safetensors",
339
+ "vlm.vision_encoder.transformer.resblocks.18.ln_2.bias": "model-00001-of-00004.safetensors",
340
+ "vlm.vision_encoder.transformer.resblocks.18.ln_2.weight": "model-00001-of-00004.safetensors",
341
+ "vlm.vision_encoder.transformer.resblocks.18.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
342
+ "vlm.vision_encoder.transformer.resblocks.18.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
343
+ "vlm.vision_encoder.transformer.resblocks.18.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
344
+ "vlm.vision_encoder.transformer.resblocks.18.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
345
+ "vlm.vision_encoder.transformer.resblocks.19.attn.in_proj_bias": "model-00001-of-00004.safetensors",
346
+ "vlm.vision_encoder.transformer.resblocks.19.attn.in_proj_weight": "model-00001-of-00004.safetensors",
347
+ "vlm.vision_encoder.transformer.resblocks.19.attn.out_proj.bias": "model-00001-of-00004.safetensors",
348
+ "vlm.vision_encoder.transformer.resblocks.19.attn.out_proj.weight": "model-00001-of-00004.safetensors",
349
+ "vlm.vision_encoder.transformer.resblocks.19.ln_1.bias": "model-00001-of-00004.safetensors",
350
+ "vlm.vision_encoder.transformer.resblocks.19.ln_1.weight": "model-00001-of-00004.safetensors",
351
+ "vlm.vision_encoder.transformer.resblocks.19.ln_2.bias": "model-00001-of-00004.safetensors",
352
+ "vlm.vision_encoder.transformer.resblocks.19.ln_2.weight": "model-00001-of-00004.safetensors",
353
+ "vlm.vision_encoder.transformer.resblocks.19.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
354
+ "vlm.vision_encoder.transformer.resblocks.19.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
355
+ "vlm.vision_encoder.transformer.resblocks.19.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
356
+ "vlm.vision_encoder.transformer.resblocks.19.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
357
+ "vlm.vision_encoder.transformer.resblocks.2.attn.in_proj_bias": "model-00001-of-00004.safetensors",
358
+ "vlm.vision_encoder.transformer.resblocks.2.attn.in_proj_weight": "model-00001-of-00004.safetensors",
359
+ "vlm.vision_encoder.transformer.resblocks.2.attn.out_proj.bias": "model-00001-of-00004.safetensors",
360
+ "vlm.vision_encoder.transformer.resblocks.2.attn.out_proj.weight": "model-00001-of-00004.safetensors",
361
+ "vlm.vision_encoder.transformer.resblocks.2.ln_1.bias": "model-00001-of-00004.safetensors",
362
+ "vlm.vision_encoder.transformer.resblocks.2.ln_1.weight": "model-00001-of-00004.safetensors",
363
+ "vlm.vision_encoder.transformer.resblocks.2.ln_2.bias": "model-00001-of-00004.safetensors",
364
+ "vlm.vision_encoder.transformer.resblocks.2.ln_2.weight": "model-00001-of-00004.safetensors",
365
+ "vlm.vision_encoder.transformer.resblocks.2.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
366
+ "vlm.vision_encoder.transformer.resblocks.2.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
367
+ "vlm.vision_encoder.transformer.resblocks.2.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
368
+ "vlm.vision_encoder.transformer.resblocks.2.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
369
+ "vlm.vision_encoder.transformer.resblocks.20.attn.in_proj_bias": "model-00001-of-00004.safetensors",
370
+ "vlm.vision_encoder.transformer.resblocks.20.attn.in_proj_weight": "model-00001-of-00004.safetensors",
371
+ "vlm.vision_encoder.transformer.resblocks.20.attn.out_proj.bias": "model-00001-of-00004.safetensors",
372
+ "vlm.vision_encoder.transformer.resblocks.20.attn.out_proj.weight": "model-00001-of-00004.safetensors",
373
+ "vlm.vision_encoder.transformer.resblocks.20.ln_1.bias": "model-00001-of-00004.safetensors",
374
+ "vlm.vision_encoder.transformer.resblocks.20.ln_1.weight": "model-00001-of-00004.safetensors",
375
+ "vlm.vision_encoder.transformer.resblocks.20.ln_2.bias": "model-00001-of-00004.safetensors",
376
+ "vlm.vision_encoder.transformer.resblocks.20.ln_2.weight": "model-00001-of-00004.safetensors",
377
+ "vlm.vision_encoder.transformer.resblocks.20.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
378
+ "vlm.vision_encoder.transformer.resblocks.20.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
379
+ "vlm.vision_encoder.transformer.resblocks.20.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
380
+ "vlm.vision_encoder.transformer.resblocks.20.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
381
+ "vlm.vision_encoder.transformer.resblocks.21.attn.in_proj_bias": "model-00001-of-00004.safetensors",
382
+ "vlm.vision_encoder.transformer.resblocks.21.attn.in_proj_weight": "model-00001-of-00004.safetensors",
383
+ "vlm.vision_encoder.transformer.resblocks.21.attn.out_proj.bias": "model-00001-of-00004.safetensors",
384
+ "vlm.vision_encoder.transformer.resblocks.21.attn.out_proj.weight": "model-00001-of-00004.safetensors",
385
+ "vlm.vision_encoder.transformer.resblocks.21.ln_1.bias": "model-00001-of-00004.safetensors",
386
+ "vlm.vision_encoder.transformer.resblocks.21.ln_1.weight": "model-00001-of-00004.safetensors",
387
+ "vlm.vision_encoder.transformer.resblocks.21.ln_2.bias": "model-00001-of-00004.safetensors",
388
+ "vlm.vision_encoder.transformer.resblocks.21.ln_2.weight": "model-00001-of-00004.safetensors",
389
+ "vlm.vision_encoder.transformer.resblocks.21.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
390
+ "vlm.vision_encoder.transformer.resblocks.21.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
391
+ "vlm.vision_encoder.transformer.resblocks.21.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
392
+ "vlm.vision_encoder.transformer.resblocks.21.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
393
+ "vlm.vision_encoder.transformer.resblocks.22.attn.in_proj_bias": "model-00001-of-00004.safetensors",
394
+ "vlm.vision_encoder.transformer.resblocks.22.attn.in_proj_weight": "model-00001-of-00004.safetensors",
395
+ "vlm.vision_encoder.transformer.resblocks.22.attn.out_proj.bias": "model-00001-of-00004.safetensors",
396
+ "vlm.vision_encoder.transformer.resblocks.22.attn.out_proj.weight": "model-00001-of-00004.safetensors",
397
+ "vlm.vision_encoder.transformer.resblocks.22.ln_1.bias": "model-00001-of-00004.safetensors",
398
+ "vlm.vision_encoder.transformer.resblocks.22.ln_1.weight": "model-00001-of-00004.safetensors",
399
+ "vlm.vision_encoder.transformer.resblocks.22.ln_2.bias": "model-00001-of-00004.safetensors",
400
+ "vlm.vision_encoder.transformer.resblocks.22.ln_2.weight": "model-00001-of-00004.safetensors",
401
+ "vlm.vision_encoder.transformer.resblocks.22.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
402
+ "vlm.vision_encoder.transformer.resblocks.22.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
403
+ "vlm.vision_encoder.transformer.resblocks.22.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
404
+ "vlm.vision_encoder.transformer.resblocks.22.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
405
+ "vlm.vision_encoder.transformer.resblocks.23.attn.in_proj_bias": "model-00001-of-00004.safetensors",
406
+ "vlm.vision_encoder.transformer.resblocks.23.attn.in_proj_weight": "model-00001-of-00004.safetensors",
407
+ "vlm.vision_encoder.transformer.resblocks.23.attn.out_proj.bias": "model-00001-of-00004.safetensors",
408
+ "vlm.vision_encoder.transformer.resblocks.23.attn.out_proj.weight": "model-00001-of-00004.safetensors",
409
+ "vlm.vision_encoder.transformer.resblocks.23.ln_1.bias": "model-00001-of-00004.safetensors",
410
+ "vlm.vision_encoder.transformer.resblocks.23.ln_1.weight": "model-00001-of-00004.safetensors",
411
+ "vlm.vision_encoder.transformer.resblocks.23.ln_2.bias": "model-00001-of-00004.safetensors",
412
+ "vlm.vision_encoder.transformer.resblocks.23.ln_2.weight": "model-00001-of-00004.safetensors",
413
+ "vlm.vision_encoder.transformer.resblocks.23.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
414
+ "vlm.vision_encoder.transformer.resblocks.23.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
415
+ "vlm.vision_encoder.transformer.resblocks.23.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
416
+ "vlm.vision_encoder.transformer.resblocks.23.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
417
+ "vlm.vision_encoder.transformer.resblocks.24.attn.in_proj_bias": "model-00001-of-00004.safetensors",
418
+ "vlm.vision_encoder.transformer.resblocks.24.attn.in_proj_weight": "model-00001-of-00004.safetensors",
419
+ "vlm.vision_encoder.transformer.resblocks.24.attn.out_proj.bias": "model-00001-of-00004.safetensors",
420
+ "vlm.vision_encoder.transformer.resblocks.24.attn.out_proj.weight": "model-00001-of-00004.safetensors",
421
+ "vlm.vision_encoder.transformer.resblocks.24.ln_1.bias": "model-00001-of-00004.safetensors",
422
+ "vlm.vision_encoder.transformer.resblocks.24.ln_1.weight": "model-00001-of-00004.safetensors",
423
+ "vlm.vision_encoder.transformer.resblocks.24.ln_2.bias": "model-00001-of-00004.safetensors",
424
+ "vlm.vision_encoder.transformer.resblocks.24.ln_2.weight": "model-00001-of-00004.safetensors",
425
+ "vlm.vision_encoder.transformer.resblocks.24.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
426
+ "vlm.vision_encoder.transformer.resblocks.24.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
427
+ "vlm.vision_encoder.transformer.resblocks.24.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
428
+ "vlm.vision_encoder.transformer.resblocks.24.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
429
+ "vlm.vision_encoder.transformer.resblocks.25.attn.in_proj_bias": "model-00001-of-00004.safetensors",
430
+ "vlm.vision_encoder.transformer.resblocks.25.attn.in_proj_weight": "model-00001-of-00004.safetensors",
431
+ "vlm.vision_encoder.transformer.resblocks.25.attn.out_proj.bias": "model-00001-of-00004.safetensors",
432
+ "vlm.vision_encoder.transformer.resblocks.25.attn.out_proj.weight": "model-00001-of-00004.safetensors",
433
+ "vlm.vision_encoder.transformer.resblocks.25.ln_1.bias": "model-00001-of-00004.safetensors",
434
+ "vlm.vision_encoder.transformer.resblocks.25.ln_1.weight": "model-00001-of-00004.safetensors",
435
+ "vlm.vision_encoder.transformer.resblocks.25.ln_2.bias": "model-00001-of-00004.safetensors",
436
+ "vlm.vision_encoder.transformer.resblocks.25.ln_2.weight": "model-00001-of-00004.safetensors",
437
+ "vlm.vision_encoder.transformer.resblocks.25.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
438
+ "vlm.vision_encoder.transformer.resblocks.25.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
439
+ "vlm.vision_encoder.transformer.resblocks.25.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
440
+ "vlm.vision_encoder.transformer.resblocks.25.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
441
+ "vlm.vision_encoder.transformer.resblocks.26.attn.in_proj_bias": "model-00001-of-00004.safetensors",
442
+ "vlm.vision_encoder.transformer.resblocks.26.attn.in_proj_weight": "model-00001-of-00004.safetensors",
443
+ "vlm.vision_encoder.transformer.resblocks.26.attn.out_proj.bias": "model-00001-of-00004.safetensors",
444
+ "vlm.vision_encoder.transformer.resblocks.26.attn.out_proj.weight": "model-00001-of-00004.safetensors",
445
+ "vlm.vision_encoder.transformer.resblocks.26.ln_1.bias": "model-00001-of-00004.safetensors",
446
+ "vlm.vision_encoder.transformer.resblocks.26.ln_1.weight": "model-00001-of-00004.safetensors",
447
+ "vlm.vision_encoder.transformer.resblocks.26.ln_2.bias": "model-00001-of-00004.safetensors",
448
+ "vlm.vision_encoder.transformer.resblocks.26.ln_2.weight": "model-00001-of-00004.safetensors",
449
+ "vlm.vision_encoder.transformer.resblocks.26.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
450
+ "vlm.vision_encoder.transformer.resblocks.26.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
451
+ "vlm.vision_encoder.transformer.resblocks.26.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
452
+ "vlm.vision_encoder.transformer.resblocks.26.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
453
+ "vlm.vision_encoder.transformer.resblocks.27.attn.in_proj_bias": "model-00001-of-00004.safetensors",
454
+ "vlm.vision_encoder.transformer.resblocks.27.attn.in_proj_weight": "model-00001-of-00004.safetensors",
455
+ "vlm.vision_encoder.transformer.resblocks.27.attn.out_proj.bias": "model-00001-of-00004.safetensors",
456
+ "vlm.vision_encoder.transformer.resblocks.27.attn.out_proj.weight": "model-00001-of-00004.safetensors",
457
+ "vlm.vision_encoder.transformer.resblocks.27.ln_1.bias": "model-00001-of-00004.safetensors",
458
+ "vlm.vision_encoder.transformer.resblocks.27.ln_1.weight": "model-00001-of-00004.safetensors",
459
+ "vlm.vision_encoder.transformer.resblocks.27.ln_2.bias": "model-00001-of-00004.safetensors",
460
+ "vlm.vision_encoder.transformer.resblocks.27.ln_2.weight": "model-00001-of-00004.safetensors",
461
+ "vlm.vision_encoder.transformer.resblocks.27.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
462
+ "vlm.vision_encoder.transformer.resblocks.27.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
463
+ "vlm.vision_encoder.transformer.resblocks.27.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
464
+ "vlm.vision_encoder.transformer.resblocks.27.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
465
+ "vlm.vision_encoder.transformer.resblocks.28.attn.in_proj_bias": "model-00001-of-00004.safetensors",
466
+ "vlm.vision_encoder.transformer.resblocks.28.attn.in_proj_weight": "model-00001-of-00004.safetensors",
467
+ "vlm.vision_encoder.transformer.resblocks.28.attn.out_proj.bias": "model-00001-of-00004.safetensors",
468
+ "vlm.vision_encoder.transformer.resblocks.28.attn.out_proj.weight": "model-00001-of-00004.safetensors",
469
+ "vlm.vision_encoder.transformer.resblocks.28.ln_1.bias": "model-00001-of-00004.safetensors",
470
+ "vlm.vision_encoder.transformer.resblocks.28.ln_1.weight": "model-00001-of-00004.safetensors",
471
+ "vlm.vision_encoder.transformer.resblocks.28.ln_2.bias": "model-00001-of-00004.safetensors",
472
+ "vlm.vision_encoder.transformer.resblocks.28.ln_2.weight": "model-00001-of-00004.safetensors",
473
+ "vlm.vision_encoder.transformer.resblocks.28.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
474
+ "vlm.vision_encoder.transformer.resblocks.28.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
475
+ "vlm.vision_encoder.transformer.resblocks.28.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
476
+ "vlm.vision_encoder.transformer.resblocks.28.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
477
+ "vlm.vision_encoder.transformer.resblocks.29.attn.in_proj_bias": "model-00001-of-00004.safetensors",
478
+ "vlm.vision_encoder.transformer.resblocks.29.attn.in_proj_weight": "model-00001-of-00004.safetensors",
479
+ "vlm.vision_encoder.transformer.resblocks.29.attn.out_proj.bias": "model-00001-of-00004.safetensors",
480
+ "vlm.vision_encoder.transformer.resblocks.29.attn.out_proj.weight": "model-00001-of-00004.safetensors",
481
+ "vlm.vision_encoder.transformer.resblocks.29.ln_1.bias": "model-00001-of-00004.safetensors",
482
+ "vlm.vision_encoder.transformer.resblocks.29.ln_1.weight": "model-00001-of-00004.safetensors",
483
+ "vlm.vision_encoder.transformer.resblocks.29.ln_2.bias": "model-00001-of-00004.safetensors",
484
+ "vlm.vision_encoder.transformer.resblocks.29.ln_2.weight": "model-00001-of-00004.safetensors",
485
+ "vlm.vision_encoder.transformer.resblocks.29.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
486
+ "vlm.vision_encoder.transformer.resblocks.29.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
487
+ "vlm.vision_encoder.transformer.resblocks.29.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
488
+ "vlm.vision_encoder.transformer.resblocks.29.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
489
+ "vlm.vision_encoder.transformer.resblocks.3.attn.in_proj_bias": "model-00001-of-00004.safetensors",
490
+ "vlm.vision_encoder.transformer.resblocks.3.attn.in_proj_weight": "model-00001-of-00004.safetensors",
491
+ "vlm.vision_encoder.transformer.resblocks.3.attn.out_proj.bias": "model-00001-of-00004.safetensors",
492
+ "vlm.vision_encoder.transformer.resblocks.3.attn.out_proj.weight": "model-00001-of-00004.safetensors",
493
+ "vlm.vision_encoder.transformer.resblocks.3.ln_1.bias": "model-00001-of-00004.safetensors",
494
+ "vlm.vision_encoder.transformer.resblocks.3.ln_1.weight": "model-00001-of-00004.safetensors",
495
+ "vlm.vision_encoder.transformer.resblocks.3.ln_2.bias": "model-00001-of-00004.safetensors",
496
+ "vlm.vision_encoder.transformer.resblocks.3.ln_2.weight": "model-00001-of-00004.safetensors",
497
+ "vlm.vision_encoder.transformer.resblocks.3.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
498
+ "vlm.vision_encoder.transformer.resblocks.3.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
499
+ "vlm.vision_encoder.transformer.resblocks.3.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
500
+ "vlm.vision_encoder.transformer.resblocks.3.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
501
+ "vlm.vision_encoder.transformer.resblocks.30.attn.in_proj_bias": "model-00001-of-00004.safetensors",
502
+ "vlm.vision_encoder.transformer.resblocks.30.attn.in_proj_weight": "model-00001-of-00004.safetensors",
503
+ "vlm.vision_encoder.transformer.resblocks.30.attn.out_proj.bias": "model-00001-of-00004.safetensors",
504
+ "vlm.vision_encoder.transformer.resblocks.30.attn.out_proj.weight": "model-00001-of-00004.safetensors",
505
+ "vlm.vision_encoder.transformer.resblocks.30.ln_1.bias": "model-00001-of-00004.safetensors",
506
+ "vlm.vision_encoder.transformer.resblocks.30.ln_1.weight": "model-00001-of-00004.safetensors",
507
+ "vlm.vision_encoder.transformer.resblocks.30.ln_2.bias": "model-00001-of-00004.safetensors",
508
+ "vlm.vision_encoder.transformer.resblocks.30.ln_2.weight": "model-00001-of-00004.safetensors",
509
+ "vlm.vision_encoder.transformer.resblocks.30.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
510
+ "vlm.vision_encoder.transformer.resblocks.30.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
511
+ "vlm.vision_encoder.transformer.resblocks.30.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
512
+ "vlm.vision_encoder.transformer.resblocks.30.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
513
+ "vlm.vision_encoder.transformer.resblocks.31.attn.in_proj_bias": "model-00001-of-00004.safetensors",
514
+ "vlm.vision_encoder.transformer.resblocks.31.attn.in_proj_weight": "model-00001-of-00004.safetensors",
515
+ "vlm.vision_encoder.transformer.resblocks.31.attn.out_proj.bias": "model-00001-of-00004.safetensors",
516
+ "vlm.vision_encoder.transformer.resblocks.31.attn.out_proj.weight": "model-00001-of-00004.safetensors",
517
+ "vlm.vision_encoder.transformer.resblocks.31.ln_1.bias": "model-00001-of-00004.safetensors",
518
+ "vlm.vision_encoder.transformer.resblocks.31.ln_1.weight": "model-00001-of-00004.safetensors",
519
+ "vlm.vision_encoder.transformer.resblocks.31.ln_2.bias": "model-00001-of-00004.safetensors",
520
+ "vlm.vision_encoder.transformer.resblocks.31.ln_2.weight": "model-00001-of-00004.safetensors",
521
+ "vlm.vision_encoder.transformer.resblocks.31.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
522
+ "vlm.vision_encoder.transformer.resblocks.31.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
523
+ "vlm.vision_encoder.transformer.resblocks.31.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
524
+ "vlm.vision_encoder.transformer.resblocks.31.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
525
+ "vlm.vision_encoder.transformer.resblocks.4.attn.in_proj_bias": "model-00001-of-00004.safetensors",
526
+ "vlm.vision_encoder.transformer.resblocks.4.attn.in_proj_weight": "model-00001-of-00004.safetensors",
527
+ "vlm.vision_encoder.transformer.resblocks.4.attn.out_proj.bias": "model-00001-of-00004.safetensors",
528
+ "vlm.vision_encoder.transformer.resblocks.4.attn.out_proj.weight": "model-00001-of-00004.safetensors",
529
+ "vlm.vision_encoder.transformer.resblocks.4.ln_1.bias": "model-00001-of-00004.safetensors",
530
+ "vlm.vision_encoder.transformer.resblocks.4.ln_1.weight": "model-00001-of-00004.safetensors",
531
+ "vlm.vision_encoder.transformer.resblocks.4.ln_2.bias": "model-00001-of-00004.safetensors",
532
+ "vlm.vision_encoder.transformer.resblocks.4.ln_2.weight": "model-00001-of-00004.safetensors",
533
+ "vlm.vision_encoder.transformer.resblocks.4.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
534
+ "vlm.vision_encoder.transformer.resblocks.4.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
535
+ "vlm.vision_encoder.transformer.resblocks.4.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
536
+ "vlm.vision_encoder.transformer.resblocks.4.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
537
+ "vlm.vision_encoder.transformer.resblocks.5.attn.in_proj_bias": "model-00001-of-00004.safetensors",
538
+ "vlm.vision_encoder.transformer.resblocks.5.attn.in_proj_weight": "model-00001-of-00004.safetensors",
539
+ "vlm.vision_encoder.transformer.resblocks.5.attn.out_proj.bias": "model-00001-of-00004.safetensors",
540
+ "vlm.vision_encoder.transformer.resblocks.5.attn.out_proj.weight": "model-00001-of-00004.safetensors",
541
+ "vlm.vision_encoder.transformer.resblocks.5.ln_1.bias": "model-00001-of-00004.safetensors",
542
+ "vlm.vision_encoder.transformer.resblocks.5.ln_1.weight": "model-00001-of-00004.safetensors",
543
+ "vlm.vision_encoder.transformer.resblocks.5.ln_2.bias": "model-00001-of-00004.safetensors",
544
+ "vlm.vision_encoder.transformer.resblocks.5.ln_2.weight": "model-00001-of-00004.safetensors",
545
+ "vlm.vision_encoder.transformer.resblocks.5.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
546
+ "vlm.vision_encoder.transformer.resblocks.5.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
547
+ "vlm.vision_encoder.transformer.resblocks.5.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
548
+ "vlm.vision_encoder.transformer.resblocks.5.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
549
+ "vlm.vision_encoder.transformer.resblocks.6.attn.in_proj_bias": "model-00001-of-00004.safetensors",
550
+ "vlm.vision_encoder.transformer.resblocks.6.attn.in_proj_weight": "model-00001-of-00004.safetensors",
551
+ "vlm.vision_encoder.transformer.resblocks.6.attn.out_proj.bias": "model-00001-of-00004.safetensors",
552
+ "vlm.vision_encoder.transformer.resblocks.6.attn.out_proj.weight": "model-00001-of-00004.safetensors",
553
+ "vlm.vision_encoder.transformer.resblocks.6.ln_1.bias": "model-00001-of-00004.safetensors",
554
+ "vlm.vision_encoder.transformer.resblocks.6.ln_1.weight": "model-00001-of-00004.safetensors",
555
+ "vlm.vision_encoder.transformer.resblocks.6.ln_2.bias": "model-00001-of-00004.safetensors",
556
+ "vlm.vision_encoder.transformer.resblocks.6.ln_2.weight": "model-00001-of-00004.safetensors",
557
+ "vlm.vision_encoder.transformer.resblocks.6.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
558
+ "vlm.vision_encoder.transformer.resblocks.6.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
559
+ "vlm.vision_encoder.transformer.resblocks.6.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
560
+ "vlm.vision_encoder.transformer.resblocks.6.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
561
+ "vlm.vision_encoder.transformer.resblocks.7.attn.in_proj_bias": "model-00001-of-00004.safetensors",
562
+ "vlm.vision_encoder.transformer.resblocks.7.attn.in_proj_weight": "model-00001-of-00004.safetensors",
563
+ "vlm.vision_encoder.transformer.resblocks.7.attn.out_proj.bias": "model-00001-of-00004.safetensors",
564
+ "vlm.vision_encoder.transformer.resblocks.7.attn.out_proj.weight": "model-00001-of-00004.safetensors",
565
+ "vlm.vision_encoder.transformer.resblocks.7.ln_1.bias": "model-00001-of-00004.safetensors",
566
+ "vlm.vision_encoder.transformer.resblocks.7.ln_1.weight": "model-00001-of-00004.safetensors",
567
+ "vlm.vision_encoder.transformer.resblocks.7.ln_2.bias": "model-00001-of-00004.safetensors",
568
+ "vlm.vision_encoder.transformer.resblocks.7.ln_2.weight": "model-00001-of-00004.safetensors",
569
+ "vlm.vision_encoder.transformer.resblocks.7.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
570
+ "vlm.vision_encoder.transformer.resblocks.7.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
571
+ "vlm.vision_encoder.transformer.resblocks.7.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
572
+ "vlm.vision_encoder.transformer.resblocks.7.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
573
+ "vlm.vision_encoder.transformer.resblocks.8.attn.in_proj_bias": "model-00001-of-00004.safetensors",
574
+ "vlm.vision_encoder.transformer.resblocks.8.attn.in_proj_weight": "model-00001-of-00004.safetensors",
575
+ "vlm.vision_encoder.transformer.resblocks.8.attn.out_proj.bias": "model-00001-of-00004.safetensors",
576
+ "vlm.vision_encoder.transformer.resblocks.8.attn.out_proj.weight": "model-00001-of-00004.safetensors",
577
+ "vlm.vision_encoder.transformer.resblocks.8.ln_1.bias": "model-00001-of-00004.safetensors",
578
+ "vlm.vision_encoder.transformer.resblocks.8.ln_1.weight": "model-00001-of-00004.safetensors",
579
+ "vlm.vision_encoder.transformer.resblocks.8.ln_2.bias": "model-00001-of-00004.safetensors",
580
+ "vlm.vision_encoder.transformer.resblocks.8.ln_2.weight": "model-00001-of-00004.safetensors",
581
+ "vlm.vision_encoder.transformer.resblocks.8.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
582
+ "vlm.vision_encoder.transformer.resblocks.8.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
583
+ "vlm.vision_encoder.transformer.resblocks.8.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
584
+ "vlm.vision_encoder.transformer.resblocks.8.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
585
+ "vlm.vision_encoder.transformer.resblocks.9.attn.in_proj_bias": "model-00001-of-00004.safetensors",
586
+ "vlm.vision_encoder.transformer.resblocks.9.attn.in_proj_weight": "model-00001-of-00004.safetensors",
587
+ "vlm.vision_encoder.transformer.resblocks.9.attn.out_proj.bias": "model-00001-of-00004.safetensors",
588
+ "vlm.vision_encoder.transformer.resblocks.9.attn.out_proj.weight": "model-00001-of-00004.safetensors",
589
+ "vlm.vision_encoder.transformer.resblocks.9.ln_1.bias": "model-00001-of-00004.safetensors",
590
+ "vlm.vision_encoder.transformer.resblocks.9.ln_1.weight": "model-00001-of-00004.safetensors",
591
+ "vlm.vision_encoder.transformer.resblocks.9.ln_2.bias": "model-00001-of-00004.safetensors",
592
+ "vlm.vision_encoder.transformer.resblocks.9.ln_2.weight": "model-00001-of-00004.safetensors",
593
+ "vlm.vision_encoder.transformer.resblocks.9.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
594
+ "vlm.vision_encoder.transformer.resblocks.9.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
595
+ "vlm.vision_encoder.transformer.resblocks.9.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
596
+ "vlm.vision_encoder.transformer.resblocks.9.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
597
+ "vlm.vision_tokenizer.latents": "model-00001-of-00004.safetensors",
598
+ "vlm.vision_tokenizer.layers.0.0.norm_latents.bias": "model-00001-of-00004.safetensors",
599
+ "vlm.vision_tokenizer.layers.0.0.norm_latents.weight": "model-00001-of-00004.safetensors",
600
+ "vlm.vision_tokenizer.layers.0.0.norm_media.bias": "model-00001-of-00004.safetensors",
601
+ "vlm.vision_tokenizer.layers.0.0.norm_media.weight": "model-00001-of-00004.safetensors",
602
+ "vlm.vision_tokenizer.layers.0.0.to_kv.weight": "model-00001-of-00004.safetensors",
603
+ "vlm.vision_tokenizer.layers.0.0.to_out.weight": "model-00001-of-00004.safetensors",
604
+ "vlm.vision_tokenizer.layers.0.0.to_q.weight": "model-00001-of-00004.safetensors",
605
+ "vlm.vision_tokenizer.layers.0.1.0.bias": "model-00001-of-00004.safetensors",
606
+ "vlm.vision_tokenizer.layers.0.1.0.weight": "model-00001-of-00004.safetensors",
607
+ "vlm.vision_tokenizer.layers.0.1.1.weight": "model-00001-of-00004.safetensors",
608
+ "vlm.vision_tokenizer.layers.0.1.3.weight": "model-00001-of-00004.safetensors",
609
+ "vlm.vision_tokenizer.layers.1.0.norm_latents.bias": "model-00001-of-00004.safetensors",
610
+ "vlm.vision_tokenizer.layers.1.0.norm_latents.weight": "model-00001-of-00004.safetensors",
611
+ "vlm.vision_tokenizer.layers.1.0.norm_media.bias": "model-00001-of-00004.safetensors",
612
+ "vlm.vision_tokenizer.layers.1.0.norm_media.weight": "model-00001-of-00004.safetensors",
613
+ "vlm.vision_tokenizer.layers.1.0.to_kv.weight": "model-00001-of-00004.safetensors",
614
+ "vlm.vision_tokenizer.layers.1.0.to_out.weight": "model-00001-of-00004.safetensors",
615
+ "vlm.vision_tokenizer.layers.1.0.to_q.weight": "model-00001-of-00004.safetensors",
616
+ "vlm.vision_tokenizer.layers.1.1.0.bias": "model-00001-of-00004.safetensors",
617
+ "vlm.vision_tokenizer.layers.1.1.0.weight": "model-00001-of-00004.safetensors",
618
+ "vlm.vision_tokenizer.layers.1.1.1.weight": "model-00001-of-00004.safetensors",
619
+ "vlm.vision_tokenizer.layers.1.1.3.weight": "model-00001-of-00004.safetensors",
620
+ "vlm.vision_tokenizer.layers.2.0.norm_latents.bias": "model-00001-of-00004.safetensors",
621
+ "vlm.vision_tokenizer.layers.2.0.norm_latents.weight": "model-00001-of-00004.safetensors",
622
+ "vlm.vision_tokenizer.layers.2.0.norm_media.bias": "model-00001-of-00004.safetensors",
623
+ "vlm.vision_tokenizer.layers.2.0.norm_media.weight": "model-00001-of-00004.safetensors",
624
+ "vlm.vision_tokenizer.layers.2.0.to_kv.weight": "model-00001-of-00004.safetensors",
625
+ "vlm.vision_tokenizer.layers.2.0.to_out.weight": "model-00001-of-00004.safetensors",
626
+ "vlm.vision_tokenizer.layers.2.0.to_q.weight": "model-00001-of-00004.safetensors",
627
+ "vlm.vision_tokenizer.layers.2.1.0.bias": "model-00001-of-00004.safetensors",
628
+ "vlm.vision_tokenizer.layers.2.1.0.weight": "model-00001-of-00004.safetensors",
629
+ "vlm.vision_tokenizer.layers.2.1.1.weight": "model-00001-of-00004.safetensors",
630
+ "vlm.vision_tokenizer.layers.2.1.3.weight": "model-00001-of-00004.safetensors",
631
+ "vlm.vision_tokenizer.layers.3.0.norm_latents.bias": "model-00001-of-00004.safetensors",
632
+ "vlm.vision_tokenizer.layers.3.0.norm_latents.weight": "model-00001-of-00004.safetensors",
633
+ "vlm.vision_tokenizer.layers.3.0.norm_media.bias": "model-00001-of-00004.safetensors",
634
+ "vlm.vision_tokenizer.layers.3.0.norm_media.weight": "model-00001-of-00004.safetensors",
635
+ "vlm.vision_tokenizer.layers.3.0.to_kv.weight": "model-00001-of-00004.safetensors",
636
+ "vlm.vision_tokenizer.layers.3.0.to_out.weight": "model-00001-of-00004.safetensors",
637
+ "vlm.vision_tokenizer.layers.3.0.to_q.weight": "model-00001-of-00004.safetensors",
638
+ "vlm.vision_tokenizer.layers.3.1.0.bias": "model-00001-of-00004.safetensors",
639
+ "vlm.vision_tokenizer.layers.3.1.0.weight": "model-00001-of-00004.safetensors",
640
+ "vlm.vision_tokenizer.layers.3.1.1.weight": "model-00001-of-00004.safetensors",
641
+ "vlm.vision_tokenizer.layers.3.1.3.weight": "model-00001-of-00004.safetensors",
642
+ "vlm.vision_tokenizer.layers.4.0.norm_latents.bias": "model-00001-of-00004.safetensors",
643
+ "vlm.vision_tokenizer.layers.4.0.norm_latents.weight": "model-00001-of-00004.safetensors",
644
+ "vlm.vision_tokenizer.layers.4.0.norm_media.bias": "model-00001-of-00004.safetensors",
645
+ "vlm.vision_tokenizer.layers.4.0.norm_media.weight": "model-00001-of-00004.safetensors",
646
+ "vlm.vision_tokenizer.layers.4.0.to_kv.weight": "model-00001-of-00004.safetensors",
647
+ "vlm.vision_tokenizer.layers.4.0.to_out.weight": "model-00001-of-00004.safetensors",
648
+ "vlm.vision_tokenizer.layers.4.0.to_q.weight": "model-00001-of-00004.safetensors",
649
+ "vlm.vision_tokenizer.layers.4.1.0.bias": "model-00001-of-00004.safetensors",
650
+ "vlm.vision_tokenizer.layers.4.1.0.weight": "model-00001-of-00004.safetensors",
651
+ "vlm.vision_tokenizer.layers.4.1.1.weight": "model-00001-of-00004.safetensors",
652
+ "vlm.vision_tokenizer.layers.4.1.3.weight": "model-00001-of-00004.safetensors",
653
+ "vlm.vision_tokenizer.layers.5.0.norm_latents.bias": "model-00001-of-00004.safetensors",
654
+ "vlm.vision_tokenizer.layers.5.0.norm_latents.weight": "model-00001-of-00004.safetensors",
655
+ "vlm.vision_tokenizer.layers.5.0.norm_media.bias": "model-00001-of-00004.safetensors",
656
+ "vlm.vision_tokenizer.layers.5.0.norm_media.weight": "model-00001-of-00004.safetensors",
657
+ "vlm.vision_tokenizer.layers.5.0.to_kv.weight": "model-00001-of-00004.safetensors",
658
+ "vlm.vision_tokenizer.layers.5.0.to_out.weight": "model-00001-of-00004.safetensors",
659
+ "vlm.vision_tokenizer.layers.5.0.to_q.weight": "model-00001-of-00004.safetensors",
660
+ "vlm.vision_tokenizer.layers.5.1.0.bias": "model-00001-of-00004.safetensors",
661
+ "vlm.vision_tokenizer.layers.5.1.0.weight": "model-00001-of-00004.safetensors",
662
+ "vlm.vision_tokenizer.layers.5.1.1.weight": "model-00001-of-00004.safetensors",
663
+ "vlm.vision_tokenizer.layers.5.1.3.weight": "model-00001-of-00004.safetensors",
664
+ "vlm.vision_tokenizer.norm.bias": "model-00001-of-00004.safetensors",
665
+ "vlm.vision_tokenizer.norm.weight": "model-00001-of-00004.safetensors",
666
+ "vlm.vision_tokenizer.projection.bias": "model-00001-of-00004.safetensors",
667
+ "vlm.vision_tokenizer.projection.weight": "model-00001-of-00004.safetensors",
668
+ "vlm.vision_tokenizer.text_projection.0.bias": "model-00001-of-00004.safetensors",
669
+ "vlm.vision_tokenizer.text_projection.0.weight": "model-00001-of-00004.safetensors",
670
+ "vlm.vision_tokenizer.text_projection.2.bias": "model-00001-of-00004.safetensors",
671
+ "vlm.vision_tokenizer.text_projection.2.weight": "model-00001-of-00004.safetensors"
672
+ }
673
+ }
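Note: the weight_map above ties every parameter name to one of the four safetensors shards, so a single tensor can be read without loading the full checkpoint. A minimal sketch (the file and parameter names come from the index itself; everything else is illustrative and not part of this release):

import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "vlm.vision_tokenizer.latents"      # any key from the weight_map
shard = index["weight_map"][name]          # e.g. "model-00001-of-00004.safetensors"
with safe_open(shard, framework="pt", device="cpu") as shard_file:
    tensor = shard_file.get_tensor(name)   # reads only this tensor from the shard
print(name, tuple(tensor.shape))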
modeling_blip_3.py ADDED
@@ -0,0 +1,110 @@
1
+ from transformers import PreTrainedModel, AutoModelForCausalLM
2
+ import torch
3
+ import open_clip
4
+ from typing import List, Optional, Tuple, Union
5
+ from .utils import check_embedding_fns
6
+ from .vlm import InstructPerceiverResampler, KosmosInstruct
7
+ from .configuration_blip_3 import Blip3VisionEncoderConfig, Blip3VisionTokenizerConfig, Blip3Config
8
+
9
+ class Blip3VisionEncoder(PreTrainedModel):
10
+ main_input_name = "pixel_values"
11
+ config_class = Blip3VisionEncoderConfig
12
+
13
+ def __init__(self, config: Blip3VisionEncoderConfig):
14
+ super().__init__(config)
15
+ if config.model_name != 'ViT-H-14-378-quickgelu':
16
+ raise ValueError(f"Unsupported model {config.model_name}. New vision models will be added soon.")
17
+ self.model, _, _ = open_clip.create_model_and_transforms(
18
+ model_name = config.model_name,
19
+ force_image_size=config.force_image_size
20
+ )
21
+
22
+ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
23
+ # assert pixel_values.ndim == 4, f"Expected 4D tensor (bs, c, h, w), got {pixel_values.ndim}"
24
+ return self.model.encode_image(pixel_values)
25
+
26
+
27
+ # vision tokenizer
28
+ class Blip3VisionTokenizer(PreTrainedModel):
29
+ config_class = Blip3VisionTokenizerConfig
30
+ def __init__(self, config: Blip3VisionTokenizerConfig):
31
+ super().__init__(config)
32
+ self.model = InstructPerceiverResampler(
33
+ dim_llm=config.lang_embedding_dim,
34
+ dim=config.vis_feature_dim,
35
+ dim_inner=config.lang_embedding_dim,
36
+ num_latents=config.num_vis_tokens,
37
+ repeat_latents=config.repeat_latents
38
+ )
39
+
40
+ def forward(self,
41
+ vision_features: torch.Tensor,
42
+ vision_attn_masks: torch.Tensor):
43
+ return self.model(vision_features, vision_attn_masks)
44
+
45
+ # Blip3 model
46
+ class Blip3ModelForConditionalGeneration(PreTrainedModel):
47
+ config_class = Blip3Config
48
+
49
+ def __init__(self, config: Blip3Config):
50
+ super().__init__(config)
51
+
52
+ # vision encoder initialization
53
+ vision_encoder = Blip3VisionEncoder(config.vision_encoder_config).model
54
+ vision_encoder.visual.output_tokens = True
55
+ vision_encoder = vision_encoder.visual
56
+
57
+ # language model initialization
58
+ language_model = AutoModelForCausalLM.from_config(config.text_config)
59
+ check_embedding_fns(language_model)
60
+ # Update _tied_weights_keys using the base model used.
61
+ if language_model._tied_weights_keys is not None:
62
+ self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
63
+
64
+ # vision tokenizer initialization
65
+ if config.vision_tokenizer_config.lang_embedding_dim != language_model.get_input_embeddings().weight.shape[1]:
66
+ overwrite = language_model.get_input_embeddings().weight.shape[1]
67
+ config.vision_tokenizer_config.lang_embedding_dim = overwrite
68
+ print(f"Warning: The language embedding dimension in the vision tokenizer config is different from the language model's embedding dimension. Overwriting the language embedding dimension in the vision tokenizer config to {overwrite}.")
69
+
70
+ vision_tokenizer = Blip3VisionTokenizer(config.vision_tokenizer_config).model
71
+
72
+ self.vlm = KosmosInstruct(
73
+ vision_encoder=vision_encoder,
74
+ vision_tokenizer=vision_tokenizer,
75
+ lang_model=language_model,
76
+ initial_tokenizer_len = config.text_config.initial_tokenizer_len,
77
+ pad_token_id = config.text_config.pad_token_id,
78
+ image_aspect_ratio = config.vision_encoder_config.image_aspect_ratio,
79
+ anyres_patch_sampling = config.vision_encoder_config.anyres_patch_sampling
80
+ )
81
+ # Initialize weights and apply final processing
82
+ self.post_init()
83
+
84
+ @torch.no_grad()
85
+ def generate(
86
+ self,
87
+ pixel_values: torch.FloatTensor,
88
+ input_ids: Optional[torch.LongTensor] = None,
89
+ attention_mask: Optional[torch.LongTensor] = None,
90
+ **generate_kwargs,
91
+ ) -> torch.LongTensor:
92
+ self.vlm = self.vlm.eval()
93
+ return self.vlm.generate(
94
+ vision_x = pixel_values,
95
+ lang_x = input_ids,
96
+ attention_mask = attention_mask,
97
+ **generate_kwargs)
98
+
99
+ def update_special_tokens(self, tokenizer):
100
+ tokenizer.add_special_tokens(
101
+ {"additional_special_tokens": list(self.vlm.special_tokens.values())}
102
+ )
103
+ self.vlm.lang_model.config.vocab_size = len(tokenizer)
104
+ self.vlm.set_special_token_ids(
105
+ {
106
+ v: tokenizer.convert_tokens_to_ids(v) for v in self.vlm.special_tokens.values()
107
+ }
108
+ )
109
+ return tokenizer
110
+
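Note: a minimal sketch of driving Blip3ModelForConditionalGeneration end to end. The generate and update_special_tokens calls match the class above; the Hub repo id, the registered Auto class, the image-token placeholder, and the Blip3ImageProcessor output format are assumptions, since that processor's code is not part of this diff:

from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor
from PIL import Image

repo = "<this-model-repo>"  # placeholder Hub id
model = AutoModelForVision2Seq.from_pretrained(repo, trust_remote_code=True)   # Auto class assumed
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True, use_fast=False)
tokenizer = model.update_special_tokens(tokenizer)   # registers the VLM's special tokens and records their ids
image_processor = AutoImageProcessor.from_pretrained(repo, trust_remote_code=True)

image = Image.open("./test_samples/images/1074.jpg").convert("RGB")
pixel_values = image_processor([image], return_tensors="pt")["pixel_values"]   # output key assumed
prompt = "<|user|>\n<image>\nCan you identify the season in which the picture was taken?<|end|>\n<|assistant|>\n"  # image placeholder assumed
inputs = tokenizer(prompt, return_tensors="pt")
generated = model.generate(pixel_values=pixel_values,
                           input_ids=inputs.input_ids,
                           attention_mask=inputs.attention_mask,
                           max_new_tokens=64)
print(tokenizer.decode(generated[0], skip_special_tokens=True))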
preprocessor_config.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_blip_3.Blip3ImageProcessor"
4
+ },
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Blip3ImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "interpolation_mode": "bicubic",
18
+ "resize_mode": "squash",
19
+ "size": [
20
+ 378,
21
+ 378
22
+ ]
23
+ }
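Note: the image_mean/image_std above are the standard CLIP normalization statistics, and size 378x378 with bicubic "squash" resizing matches the ViT-H-14-378 vision encoder. A rough torchvision equivalent (the actual Blip3ImageProcessor lives in image_processing_blip_3.py, which is not shown in this diff, so treat this as an approximation):

from torchvision import transforms

clip_like_preprocess = transforms.Compose([
    # "squash" resize: scale both sides to 378 without preserving aspect ratio
    transforms.Resize((378, 378), interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073),
                         std=(0.26862954, 0.26130258, 0.27577711)),
])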
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
test_samples/images/1074.jpg ADDED
test_samples/images/1148.jpg ADDED
test_samples/images/152.jpg ADDED
test_samples/images/1614.jpg ADDED
test_samples/images/26302.jpg ADDED
test_samples/images/45711.jpg ADDED
test_samples/test.json ADDED
@@ -0,0 +1,36 @@
1
+ [
2
+ {
3
+ "image_path": "./test_samples/images/152.jpg",
4
+ "question": ["Can you explain this meme?"]
5
+ },
6
+ {
7
+ "image_path": "./test_samples/images/26302.jpg",
8
+ "question": [
9
+ "In the food web, what are the predators?",
10
+ "What is the role of Mouse?",
11
+ "What does Grasshopper and Rabbit have in common?"
12
+ ]
13
+ },
14
+ {
15
+ "image_path": "./test_samples/images/1614.jpg",
16
+ "question": ["How is this image taken?"]
17
+ },
18
+ {
19
+ "image_path": "./test_samples/images/1074.jpg",
20
+ "question": ["Can you identify the season in which the picture was taken?"]
21
+ },
22
+ {
23
+ "image_path": "./test_samples/images/1148.jpg",
24
+ "question": ["What can be the relationship between the two persons in this image?"]
25
+ },
26
+ {
27
+
28
+ "image_path": "./test_samples/images/45711.jpg",
29
+ "question": [
30
+ "What is this meeting about?",
31
+ "How many things are discussed in the meeting?",
32
+ "What is the 2nd agenda?",
33
+ "When is the next meeting held?"
34
+ ]
35
+ }
36
+ ]
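Note: each entry pairs one image with a list of questions. A minimal loop over the samples (the model call itself is left out):

import json
from PIL import Image

with open("./test_samples/test.json") as f:
    samples = json.load(f)

for sample in samples:
    image = Image.open(sample["image_path"]).convert("RGB")
    for question in sample["question"]:
        # feed (image, question) to the model here
        print(sample["image_path"], "->", question)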
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,137 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": true,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "32000": {
30
+ "content": "<|endoftext|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "32001": {
38
+ "content": "<|assistant|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": true,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "32002": {
46
+ "content": "<|placeholder1|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": true,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "32003": {
54
+ "content": "<|placeholder2|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": true,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "32004": {
62
+ "content": "<|placeholder3|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": true,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "32005": {
70
+ "content": "<|placeholder4|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": true,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "32006": {
78
+ "content": "<|system|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": true,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "32007": {
86
+ "content": "<|end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": true,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "32008": {
94
+ "content": "<|placeholder5|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": true,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "32009": {
102
+ "content": "<|placeholder6|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": true,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "32010": {
110
+ "content": "<|user|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": true,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "32011": {
118
+ "content": "<pad>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ }
125
+ },
126
+ "bos_token": "<s>",
127
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
128
+ "clean_up_tokenization_spaces": false,
129
+ "eos_token": "<|endoftext|>",
130
+ "model_max_length": 4096,
131
+ "pad_token": "<pad>",
132
+ "padding_side": "left",
133
+ "sp_model_kwargs": {},
134
+ "tokenizer_class": "LlamaTokenizer",
135
+ "unk_token": "<unk>",
136
+ "use_default_system_prompt": false
137
+ }
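Note: the chat_template above renders Phi-3-style turns and already appends the "<|assistant|>\n" generation prefix after every user message, so no add_generation_prompt flag is needed. A small sketch (placeholder repo id):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("<this-model-repo>", use_fast=False)  # placeholder Hub id
messages = [{"role": "user", "content": "Can you explain this meme?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
# -> "<s><|user|>\nCan you explain this meme?<|end|>\n<|assistant|>\n"
print(prompt)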
utils.py ADDED
@@ -0,0 +1,383 @@
1
+ import torch
2
+ import ast
3
+ import math
4
+ from PIL import Image
5
+
6
+
7
+ def has_fn(model, fn_name):
8
+ """Check if model has a function fn_name"""
9
+ return callable(getattr(model, fn_name, None))
10
+
11
+ def exists(val):
12
+ return val is not None
13
+
14
+ def num_params(module, filter_to_trainable=False):
15
+ """Returns the number of parameters in the module, or optionally only the trainable parameters"""
16
+ if filter_to_trainable:
17
+ return sum(p.numel() for p in module.parameters() if p.requires_grad)
18
+ else:
19
+ return sum(p.numel() for p in module.parameters())
20
+
21
+ def hasattr_recursive(obj, att):
22
+ """
23
+ Check if obj has nested attribute
24
+ Example: hasattr_recursive(obj, 'a.b.c') is equivalent to hasattr(obj, 'a') and hasattr(obj.a, 'b') and hasattr(obj.a.b, 'c')
25
+ """
26
+ if att == "":
27
+ return True
28
+ i = att.find(".")
29
+ if i < 0:
30
+ return hasattr(obj, att)
31
+ else:
32
+ try:
33
+ return hasattr_recursive(getattr(obj, att[:i]), att[i + 1 :])
34
+ except:
35
+ return False
36
+
37
+ def getattr_recursive(obj, att):
38
+ """
39
+ Return nested attribute of obj
40
+ Example: getattr_recursive(obj, 'a.b.c') is equivalent to obj.a.b.c
41
+ """
42
+ if att == "":
43
+ return obj
44
+ i = att.find(".")
45
+ if i < 0:
46
+ return getattr(obj, att)
47
+ else:
48
+ return getattr_recursive(getattr(obj, att[:i]), att[i + 1 :])
49
+
50
+
51
+ def setattr_recursive(obj, att, val):
52
+ """
53
+ Set nested attribute of obj
54
+ Example: setattr_recursive(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val
55
+ """
56
+ if "." in att:
57
+ obj = getattr_recursive(obj, ".".join(att.split(".")[:-1]))
58
+ setattr(obj, att.split(".")[-1], val)
59
+
60
+
61
+ def stack_with_padding(list_of_tensors, padding_value=0, padding_side="right"):
62
+ """
63
+ Stack a list of tensors with padding on one side
64
+ Args:
65
+ list_of_tensors (list[torch.Tensor]): List of tensors to stack
66
+ padding_value (int, optional): Value to pad with. Defaults to 0.
67
+ padding_side (str, optional): Side to pad on. Defaults to "right".
68
+ Returns:
69
+ torch.Tensor: Stacked tensors
70
+ """
71
+ max_tokens = max(tensor.size(0) for tensor in list_of_tensors)
72
+ padded_tensors = []
73
+ for tensor in list_of_tensors:
74
+ num_tokens = tensor.size(0)
75
+ if len(tensor.size()) == 1:
76
+ padding = torch.full(
77
+ (max_tokens - num_tokens,),
78
+ padding_value,
79
+ dtype=tensor.dtype,
80
+ device=tensor.device,
81
+ )
82
+ else:
83
+ padding = torch.full(
84
+ (max_tokens - num_tokens, tensor.size(1)),
85
+ padding_value,
86
+ dtype=tensor.dtype,
87
+ device=tensor.device,
88
+ )
89
+ padded_tensor = (
90
+ torch.cat((tensor, padding), dim=0)
91
+ if padding_side == "right"
92
+ else torch.cat((padding, tensor), dim=0)
93
+ )
94
+ padded_tensors.append(padded_tensor)
95
+ return torch.stack(padded_tensors)
96
+
97
+
98
+ def check_embedding_fns(lang_model):
99
+ """Checks for and attempts to set {get/set}_{input/output}_embeddings functions to the model"""
100
+ if not has_fn(lang_model, "get_input_embeddings"):
101
+ if hasattr_recursive(lang_model, "transformer.wte"): # MPT
102
+ lang_model.get_input_embeddings = lambda: lang_model.transformer.wte
103
+ elif hasattr_recursive(lang_model, "model.decoder.embed_tokens"): # OPT
104
+ lang_model.get_input_embeddings = lambda: lang_model.model.decoder.embed_tokens
105
+ else:
106
+ raise ValueError(
107
+ "We require the language encoder to have a get_input_embeddings method but we couldn't determine the name of the input embeddings attribute. Please supply this manually in factory.py."
108
+ )
109
+
110
+ if not has_fn(lang_model, "set_input_embeddings"):
111
+ if hasattr_recursive(lang_model, "transformer.wte"): # MPT
112
+ lang_model.set_input_embeddings = lambda x: setattr_recursive(
113
+ lang_model, "transformer.wte", x
114
+ )
115
+ elif hasattr_recursive(lang_model, "model.decoder.embed_tokens"): # OPT
116
+ lang_model.set_input_embeddings = lambda x: setattr_recursive(
117
+ lang_model, "model.decoder.embed_tokens", x
118
+ )
119
+ else:
120
+ raise ValueError(
121
+ "We require the language encoder to have a set_input_embeddings method but we couldn't determine the name of the input embeddings attribute. Please supply this manually in factory.py."
122
+ )
123
+
124
+ if not has_fn(lang_model, "get_output_embeddings"):
125
+ if hasattr_recursive(lang_model, "lm_head"):
126
+ lang_model.get_output_embeddings = lambda: lang_model.lm_head
127
+ else:
128
+ raise ValueError(
129
+ "We require the language encoder to have a get_output_embeddings method but we couldn't determine the name of the output embeddings attribute. Please supply this manually in factory.py."
130
+ )
131
+
132
+ if not has_fn(lang_model, "set_output_embeddings"):
133
+ if hasattr_recursive(lang_model, "lm_head"):
134
+ lang_model.set_output_embeddings = lambda x: setattr_recursive(
135
+ lang_model, "lm_head", x
136
+ )
137
+ else:
138
+ raise ValueError(
139
+ "We require the language encoder to have a set_output_embeddings method but we couldn't determine the name of the output embeddings attribute. Please supply this manually in factory.py."
140
+ )
141
+
142
+
143
+ def has_fn(model, fn_name):
144
+ """Check if model has a function fn_name"""
145
+ return callable(getattr(model, fn_name, None))
146
+
147
+
148
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
149
+ #
150
+ # Licensed under the Apache License, Version 2.0 (the "License");
151
+ # you may not use this file except in compliance with the License.
152
+ # You may obtain a copy of the License at
153
+ #
154
+ # http://www.apache.org/licenses/LICENSE-2.0
155
+ #
156
+ # Unless required by applicable law or agreed to in writing, software
157
+ # distributed under the License is distributed on an "AS IS" BASIS,
158
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
159
+ # See the License for the specific language governing permissions and
160
+ # limitations under the License.
161
+
162
+ def unpad_image(tensor, original_size, keep_original_shape=False):
163
+ """
164
+ Unpads a PyTorch tensor of a padded and resized image.
165
+
166
+ Args:
167
+ tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
168
+ original_size (tuple): The original size of the image (width, height).
169
+
170
+ Returns:
171
+ torch.Tensor: The unpadded image tensor.
172
+ """
173
+ original_width, original_height = original_size
174
+ current_height, current_width = tensor.shape[1:]
175
+
176
+ original_aspect_ratio = original_width / original_height
177
+ current_aspect_ratio = current_width / current_height
178
+
179
+ if original_aspect_ratio > current_aspect_ratio:
180
+ scale_factor = current_width / original_width
181
+ new_height = int(original_height * scale_factor)
182
+ padding = (current_height - new_height) // 2
183
+ if keep_original_shape:
184
+ attention_mask = torch.ones((current_height, current_width), device=tensor.device)
185
+ attention_mask[:padding, :] = 0
186
+ attention_mask[current_height - padding:, :] = 0
187
+ return tensor, attention_mask
188
+ else:
189
+ unpadded_tensor = tensor[:, padding:current_height - padding, :]
190
+ return unpadded_tensor, None
191
+ else:
192
+ scale_factor = current_height / original_height
193
+ new_width = int(original_width * scale_factor)
194
+ padding = (current_width - new_width) // 2
195
+ if keep_original_shape:
196
+ attention_mask = torch.ones((current_height, current_width), device=tensor.device)
197
+ attention_mask[:, :padding] = 0
198
+ attention_mask[:, current_width - padding:] = 0
199
+ return tensor, attention_mask
200
+ else:
201
+ unpadded_tensor = tensor[:, :, padding:current_width - padding]
202
+ return unpadded_tensor, None
203
+
204
+
205
+ def select_best_resolution(original_size, possible_resolutions):
206
+ """
207
+ Selects the best resolution from a list of possible resolutions based on the original size.
208
+
209
+ Args:
210
+ original_size (tuple): The original size of the image in the format (width, height).
211
+ possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
212
+
213
+ Returns:
214
+ tuple: The best fit resolution in the format (width, height).
215
+ """
216
+ original_width, original_height = original_size
217
+ best_fit = None
218
+ max_effective_resolution = 0
219
+ min_wasted_resolution = float('inf')
220
+
221
+ for width, height in possible_resolutions:
222
+ scale = min(width / original_width, height / original_height)
223
+ downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
224
+ effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
225
+ wasted_resolution = (width * height) - effective_resolution
226
+
227
+ if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
228
+ max_effective_resolution = effective_resolution
229
+ min_wasted_resolution = wasted_resolution
230
+ best_fit = (width, height)
231
+
232
+ return best_fit
233
+
234
+
235
+ def resize_and_pad_image(image, target_resolution):
236
+ """
237
+ Resize and pad an image to a target resolution while maintaining aspect ratio.
238
+
239
+ Args:
240
+ image (PIL.Image.Image): The input image.
241
+ target_resolution (tuple): The target resolution (width, height) of the image.
242
+
243
+ Returns:
244
+ PIL.Image.Image: The resized and padded image.
245
+ """
246
+ original_width, original_height = image.size
247
+ target_width, target_height = target_resolution
248
+
249
+ scale_w = target_width / original_width
250
+ scale_h = target_height / original_height
251
+
252
+ if scale_w < scale_h:
253
+ new_width = target_width
254
+ new_height = min(math.ceil(original_height * scale_w), target_height)
255
+ else:
256
+ new_height = target_height
257
+ new_width = min(math.ceil(original_width * scale_h), target_width)
258
+
259
+ # Resize the image
260
+ resized_image = image.resize((new_width, new_height))
261
+
262
+ new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
263
+ paste_x = (target_width - new_width) // 2
264
+ paste_y = (target_height - new_height) // 2
265
+ new_image.paste(resized_image, (paste_x, paste_y))
266
+
267
+ return new_image
268
+
269
+
270
+ def divide_to_patches(image, patch_size):
271
+ """
272
+ Divides an image into patches of a specified size.
273
+
274
+ Args:
275
+ image (PIL.Image.Image): The input image.
276
+ patch_size (int): The size of each patch.
277
+
278
+ Returns:
279
+ list: A list of PIL.Image.Image objects representing the patches.
280
+ """
281
+ patches = []
282
+ width, height = image.size
283
+ for i in range(0, height, patch_size):
284
+ for j in range(0, width, patch_size):
285
+ box = (j, i, j + patch_size, i + patch_size)
286
+ patch = image.crop(box)
287
+ patches.append(patch)
288
+
289
+ return patches
290
+
291
+
292
+ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
293
+ """
294
+ Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
295
+
296
+ Args:
297
+ image_size (tuple): The size of the input image in the format (width, height).
298
+ grid_pinpoints (str): A string representation of a list of possible resolutions.
299
+ patch_size (int): The size of each image patch.
300
+
301
+ Returns:
302
+ tuple: The shape of the image patch grid in the format (width, height).
303
+ """
304
+ if type(grid_pinpoints) is list:
305
+ possible_resolutions = grid_pinpoints
306
+ else:
307
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
308
+ width, height = select_best_resolution(image_size, possible_resolutions)
309
+ return width // patch_size, height // patch_size
310
+
311
+
312
+ def process_anyres_image(image, processor, grid_pinpoints):
313
+ """
314
+ Process an image with variable resolutions.
315
+
316
+ Args:
317
+ image (PIL.Image.Image): The input image to be processed.
318
+ processor: The image processor object.
319
+ grid_pinpoints (str): A string representation of a list of possible resolutions.
320
+
321
+ Returns:
322
+ torch.Tensor: A tensor containing the processed image patches.
323
+ """
324
+ # FIXME: determine grid_pinpoints from image sizes.
325
+ if type(grid_pinpoints) is list:
326
+ possible_resolutions = grid_pinpoints
327
+ else:
328
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
329
+ best_resolution = select_best_resolution(image.size, possible_resolutions)
330
+ image_padded = resize_and_pad_image(image, best_resolution)
331
+
332
+ processor_size = processor.transforms[0].size
333
+ patches = divide_to_patches(image_padded, processor_size[0])
334
+
335
+ image_original_resize = image.resize((processor_size[0], processor_size[0]))
336
+
337
+ image_patches = [image_original_resize] + patches
338
+ image_patches = [processor(image_patch)
339
+ for image_patch in image_patches]
340
+ return torch.stack(image_patches, dim=0)
341
+
342
+
343
+ def expand2square(pil_img, background_color):
344
+ width, height = pil_img.size
345
+ if width == height:
346
+ return pil_img
347
+ elif width > height:
348
+ result = Image.new(pil_img.mode, (width, width), background_color)
349
+ result.paste(pil_img, (0, (width - height) // 2))
350
+ return result
351
+ else:
352
+ result = Image.new(pil_img.mode, (height, height), background_color)
353
+ result.paste(pil_img, ((height - width) // 2, 0))
354
+ return result
355
+
356
+
357
+ def process_images(images, image_processor, model_cfg):
358
+ image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
359
+ new_images = []
360
+ if image_aspect_ratio == 'pad':
361
+ for image in images:
362
+ image = expand2square(image, tuple(int(x*255) for x in image_processor.transforms[-1].mean))
363
+ image = image_processor(image)
364
+ new_images.append(image)
365
+ elif image_aspect_ratio in ["anyres", "anyres-legacy"]:
366
+ base_img_size = image_processor.transforms[0].size[0]
367
+ for image in images:
368
+ image = process_anyres_image(image, image_processor, [[base_img_size,base_img_size*2],
369
+ [base_img_size*2,base_img_size],
370
+ [base_img_size*2,base_img_size*2],
371
+ [base_img_size*3,base_img_size],
372
+ [base_img_size,base_img_size*3]])
373
+
374
+ # Debug any res inference by only using 672x672.
375
+ # image = process_anyres_image(image, image_processor, [[base_img_size*2,base_img_size*2]])
376
+ new_images.append(image)
377
+ else:
378
+ return image_processor(images)
379
+ if all(x.shape == new_images[0].shape for x in new_images):
380
+ new_images = torch.stack(new_images, dim=0)
381
+ return new_images
382
+
383
+
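Note: a minimal sketch of the "anyres" path above (select_best_resolution -> resize_and_pad_image -> divide_to_patches). The 378-px base size and the candidate grid mirror the hard-coded list in process_images; the flat import assumes this file is importable as utils:

from PIL import Image
from utils import select_best_resolution, resize_and_pad_image, divide_to_patches

base = 378
grid = [[base, base * 2], [base * 2, base], [base * 2, base * 2],
        [base * 3, base], [base, base * 3]]

image = Image.open("./test_samples/images/1614.jpg").convert("RGB")
best = select_best_resolution(image.size, grid)   # candidate that wastes the least area
padded = resize_and_pad_image(image, best)        # letterboxed to that resolution
patches = divide_to_patches(padded, base)         # row-major, base-sized crops
print(best, len(patches))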
vlm.py ADDED
@@ -0,0 +1,1531 @@
1
+
2
+ import torch
3
+ from torch import einsum, nn
4
+ from einops import rearrange, repeat
5
+ from einops_exts import rearrange_many
6
+ from einops import rearrange
7
+ from typing import List, Optional, Tuple, Union
8
+ import torch.nn.functional as F
9
+ from transformers.modeling_outputs import CausalLMOutputWithPast
10
+ from dataclasses import dataclass
11
+ from transformers import CLIPVisionModel
12
+ import transformers
13
+
14
+ from .utils import num_params, getattr_recursive, stack_with_padding, get_anyres_image_grid_shape, unpad_image
15
+
16
+
17
+ class VisionTokenizer(nn.Module):
18
+ def __init__(self, dim_media, num_tokens_per_media):
19
+ super().__init__()
20
+ self.dim_media = dim_media
21
+ self.num_tokens_per_media = num_tokens_per_media
22
+
23
+ class PerceiverAttention(nn.Module):
24
+ def __init__(self, *, dim, dim_head=64, heads=8):
25
+ super().__init__()
26
+ self.scale = dim_head**-0.5
27
+ self.heads = heads
28
+ inner_dim = dim_head * heads
29
+
30
+ self.norm_media = nn.LayerNorm(dim)
31
+ self.norm_latents = nn.LayerNorm(dim)
32
+
33
+ self.to_q = nn.Linear(dim, inner_dim, bias=False)
34
+ self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
35
+ self.to_out = nn.Linear(inner_dim, dim, bias=False)
36
+
37
+ def forward(self, x, latents, vision_attn_masks=None):
38
+ """
39
+ Args:
40
+ x (torch.Tensor): image features
41
+ shape (b, T, n1, D)
42
+ latents (torch.Tensor): latent features
43
+ shape (b, T, n2, D)
44
+ """
45
+ x = self.norm_media(x)
46
+ latents = self.norm_latents(latents)
47
+
48
+ h = self.heads
49
+
50
+ q = self.to_q(latents)
51
+ kv_input = torch.cat((x, latents), dim=-2) # TODO: Change the shape of vision attention mask according to this.
52
+ if vision_attn_masks is not None:
53
+ vision_attn_masks = torch.cat((vision_attn_masks,
54
+ torch.ones((latents.shape[0], latents.shape[-2]), dtype=latents.dtype, device=latents.device)),
55
+ dim=-1)
56
+ k, v = self.to_kv(kv_input).chunk(2, dim=-1)
57
+ q, k, v = rearrange_many((q, k, v), "b t n (h d) -> b h t n d", h=h)
58
+ q = q * self.scale
59
+
60
+ # attention
61
+ sim = einsum("... i d, ... j d -> ... i j", q, k)
62
+ # Apply vision attention mask here.
63
+ # Reference: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html#torch.nn.functional.scaled_dot_product_attention
64
+ if vision_attn_masks is not None:
65
+ attn_bias = torch.zeros((q.size(0), 1, 1, q.size(-2), k.size(-2)), dtype=q.dtype, device=q.device)
66
+ vision_attn_masks = repeat(vision_attn_masks, 'b n -> b 1 1 l n', l=q.size(-2))
67
+ attn_bias.masked_fill_(vision_attn_masks.logical_not(), float("-inf"))
68
+ sim += attn_bias
69
+
70
+ sim = sim - sim.amax(dim=-1, keepdim=True).detach()
71
+ attn = sim.softmax(dim=-1)
72
+
73
+
74
+ out = einsum("... i j, ... j d -> ... i d", attn, v)
75
+ out = rearrange(out, "b h t n d -> b t n (h d)", h=h)
76
+ return self.to_out(out)
77
+
78
+
79
+ def FeedForward(dim, mult=4):
80
+ inner_dim = int(dim * mult)
81
+ return nn.Sequential(
82
+ nn.LayerNorm(dim),
83
+ nn.Linear(dim, inner_dim, bias=False),
84
+ nn.GELU(),
85
+ nn.Linear(inner_dim, dim, bias=False),
86
+ )
87
+
88
+
89
+ class InstructPerceiverResampler(VisionTokenizer):
90
+ def __init__(
91
+ self,
92
+ *,
93
+ dim_llm,
94
+ dim,
95
+ dim_inner=None,
96
+ depth=6,
97
+ dim_head=96,
98
+ heads=16,
99
+ num_latents=64,
100
+ repeat_latents=False,
101
+ max_num_media=None,
102
+ max_num_frames=None,
103
+ ff_mult=4,
104
+ ):
105
+ """
106
+ Perceiver module which takes in image features and outputs image tokens.
107
+ Args:
108
+ dim (int): dimension of the incoming image features
109
+ dim_inner (int, optional): final dimension to project the incoming image features to;
110
+ also the final dimension of the outputted features. If None, no projection is used, and dim_inner = dim.
111
+ depth (int, optional): number of layers. Defaults to 6.
112
+ dim_head (int, optional): dimension of each head. Defaults to 96.
113
+ heads (int, optional): number of heads. Defaults to 16.
114
+ num_latents (int, optional): number of latent tokens to use in the Perceiver;
115
+ also corresponds to number of tokens per sequence to output. Defaults to 64.
116
+ max_num_media (int, optional): maximum number of media per sequence to input into the Perceiver
117
+ and keep positional embeddings for. If None, no positional embeddings are used.
118
+ max_num_frames (int, optional): maximum number of frames to input into the Perceiver
119
+ and keep positional embeddings for. If None, no positional embeddings are used.
120
+ ff_mult (int, optional): dimension multiplier for the feedforward network. Defaults to 4.
121
+ """
122
+ if dim_inner is not None:
123
+ projection = nn.Linear(dim, dim_inner)
124
+ else:
125
+ projection = None
126
+ dim_inner = dim
127
+ super().__init__(dim_media=dim, num_tokens_per_media=num_latents)
128
+ self.projection = projection
129
+
130
+ # Text embedding projection.
131
+ # self.text_projection = nn.Linear(dim_llm, dim)
132
+ modules = [nn.Linear(dim_llm, dim)]
133
+ for _ in range(1, 2):
134
+ modules.append(nn.GELU())
135
+ modules.append(nn.Linear(dim, dim))
136
+ self.text_projection = nn.Sequential(*modules)
137
+
138
+ self.latents = nn.Parameter(torch.randn(num_latents, dim))
139
+ self.repeat_latents = repeat_latents
140
+ # positional embeddings
141
+ self.frame_embs = (
142
+ nn.Parameter(torch.randn(max_num_frames, dim))
143
+ if exists(max_num_frames)
144
+ else None
145
+ )
146
+ self.media_time_embs = (
147
+ nn.Parameter(torch.randn(max_num_media, 1, dim))
148
+ if exists(max_num_media)
149
+ else None
150
+ )
151
+
152
+ self.layers = nn.ModuleList([])
153
+ for _ in range(depth):
154
+ self.layers.append(
155
+ nn.ModuleList(
156
+ [
157
+ PerceiverAttention(
158
+ dim=dim, dim_head=dim_head, heads=heads
159
+ ),
160
+ FeedForward(dim=dim, mult=ff_mult),
161
+ ]
162
+ )
163
+ )
164
+
165
+ self.norm = nn.LayerNorm(dim)
166
+ # TODO: write a new forward function that takes in text input and append them to the query tokens.
167
+ def forward(self, x, text_embeds=None):
168
+ """
169
+ Args:
170
+ x (torch.Tensor): image features
171
+ shape (b, T, F, v, D)
172
+ Returns:
173
+ shape (b, T, n, D) where n is self.num_latents
174
+ """
175
+ b, T, F, v = x.shape[:4]
176
+
177
+ # frame and media time embeddings
178
+ if exists(self.frame_embs):
179
+ frame_embs = repeat(self.frame_embs[:F], "F d -> b T F v d", b=b, T=T, v=v)
180
+ x = x + frame_embs
181
+ x = rearrange(
182
+ x, "b T F v d -> b T (F v) d"
183
+ ) # flatten the frame and spatial dimensions
184
+ if exists(self.media_time_embs):
185
+ x = x + self.media_time_embs[:T]
186
+
187
+ # blocks
188
+ # FIXME: extending query tokens proportional to the vision sequence length. Hard-coded as dfn5b token_len=729.
189
+ if self.repeat_latents:
190
+ r = v // 729 # Repeat the query tokens for r times.
191
+ latents = repeat(self.latents, "n d -> (n repeat) d", repeat=r)
192
+ else:
193
+ latents = self.latents
194
+ latents = repeat(latents, "n d -> b T n d", b=b, T=T)
195
+ # latents = repeat(self.latents, "n d -> b T n d", b=b, T=T)
196
+ # Append text embedding.
197
+ if exists(text_embeds):
198
+ text_embeds = self.text_projection(text_embeds)
199
+ text_embeds = text_embeds[:, None, :, :]
200
+ latents = torch.cat((latents, text_embeds), dim=2) # FIXME: check latents shape.
201
+
202
+ for attn, ff in self.layers:
203
+ latents = attn(x, latents) + latents
204
+ latents = ff(latents) + latents
205
+
206
+ # Truncate latents to only keep query tokens.
207
+ if exists(text_embeds):
208
+ latents = latents[:, :, :self.latents.shape[0], :]
209
+
210
+ if exists(self.projection):
211
+ return self.projection(self.norm(latents))
212
+ else:
213
+ return self.norm(latents)
214
+
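Note: shape sketch for the block above. PerceiverAttention lets the learned latents cross-attend to the image tokens, and the resampler as a whole maps (b, T, F, v, d) image features to (b, T, num_latents, dim_inner) visual tokens. Illustrative sizes only, not the release configuration:

import torch

attn = PerceiverAttention(dim=1280, dim_head=96, heads=16)
image_feats = torch.randn(2, 1, 729, 1280)   # (b, T_img, image tokens, vision dim)
latents = torch.randn(2, 1, 128, 1280)       # (b, T_img, latent queries, vision dim)
out = attn(image_feats, latents)             # -> (2, 1, 128, 1280): one output row per latent query
print(out.shape)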
215
+ class DecoupledEmbedding(nn.Embedding):
216
+ # Derived from https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html#Embedding
217
+ """
218
+ Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings. In practice, the
219
+ regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0,
220
+ then it will create `num_additional_embeddings` additional parameters that are always trained. If
221
+ `num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
222
+ """
223
+
224
+ def __init__(
225
+ self,
226
+ max_original_id: int,
227
+ num_additional_embeddings: int = 0,
228
+ _weight: torch.Tensor = None,
229
+ num_original_embeddings: int = None,
230
+ embedding_dim: int = None,
231
+ partially_freeze=True,
232
+ device=None,
233
+ dtype=None,
234
+ pad_token_id=None,
235
+ ) -> None:
236
+ """
237
+ Args:
238
+ max_original_id (`int`):
239
+ The largest token id that should be embedded using the regular embedding (regular `weight`).
240
+ This is usually len(tokenizer) - 1 before additional tokens are added.
241
+ Note that this may not equal self.weight.shape[0]
242
+ num_additional_embeddings (`int`):
243
+ Number of additional tokens to initialize an Embedding matrix for (`additional_weight`).
244
+ _weight (`torch.Tensor`, *optional*, defaults to `None`): The regular weight tensor.
245
+ If provided, this sets the `num_original_embeddings` and `embedding_dim` parameters.
246
+ num_original_embeddings (`int`):
247
+ self.weight.shape[0]
248
+ embedding_dim (`int`):
249
+ The size of each embedding vector
250
+ partially_freeze: (`bool`, *optional*, defaults to `True`):
251
+ If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
252
+ padding_idx (`int`, *optional*):
253
+ The padding index (needs to be less than num_embeddings)
254
+
255
+ Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`,
256
+ `max_norm` or `norm_type`. We are not supporting these.
257
+ """
258
+ # validate args
259
+ if pad_token_id is not None and pad_token_id > max_original_id:
260
+ raise ValueError(
261
+ f"pad_token_id must be <= max_original_id. Got {pad_token_id} and {max_original_id}."
262
+ + "If the original tokenizer does not have a pad_token_id, use pad_token_id=None."
263
+ )
264
+ if _weight is not None:
265
+ assert (num_original_embeddings is None) or (
266
+ _weight.shape[0] == num_original_embeddings
267
+ ), f"num_original_embeddings={num_original_embeddings} but _weight.shape[0]={_weight.shape[0]}"
268
+ assert (embedding_dim is None) or (
269
+ _weight.shape[1] == embedding_dim
270
+ ), f"embedding_dim={embedding_dim} but _weight.shape[1]={_weight.shape[1]}"
271
+ num_original_embeddings = _weight.shape[0]
272
+ embedding_dim = _weight.shape[1]
273
+ else:
274
+ assert (
275
+ num_original_embeddings is not None
276
+ ), "num_original_embeddings must be provided if _weight is not provided"
277
+ assert (
278
+ embedding_dim is not None
279
+ ), "embedding_dim must be provided if _weight is not provided"
280
+
281
+ super().__init__(
282
+ num_embeddings=num_original_embeddings,
283
+ embedding_dim=embedding_dim,
284
+ device=device,
285
+ dtype=dtype,
286
+ padding_idx=pad_token_id,
287
+ _weight=_weight,
288
+ )
289
+ self.max_original_id = max_original_id
290
+ self.padding_idx = pad_token_id
291
+ self.num_additional_embeddings = num_additional_embeddings
292
+ if self.num_additional_embeddings > 0:
293
+ self.additional_embedding = nn.Embedding(
294
+ num_embeddings=self.num_additional_embeddings,
295
+ embedding_dim=embedding_dim,
296
+ device=device,
297
+ dtype=dtype,
298
+ )
299
+ self.set_requires_grad(
300
+ require_regular_grad=not partially_freeze, require_additional_grad=True
301
+ )
302
+
303
+ def set_requires_grad(self, require_regular_grad, require_additional_grad):
304
+ """
305
+ Helper function to separately set the requires_grad flag for the regular weight and the additional weight.
306
+ """
307
+ self.weight.requires_grad_(require_regular_grad)
308
+ self.additional_embedding.requires_grad_(require_additional_grad)
309
+
310
+ def forward(self, input_ids):
311
+ """
312
+ we have 2 embeddings, with different indices - one pretrained self.weight and another
313
+ self.additional_embedding.weight that is being trained.
314
+
315
+ in order to make a lookup of the input ids, we:
316
+ 1. find out the indices of the entries belonging to the 2nd embedding
317
+ 2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
318
+ embedding starts from 0 and not num_embeddings
319
+ 3. perform the 2nd embedding lookup
320
+ 4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
321
+ 5. perform the 1st embedding lookup
322
+ 6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup
323
+
324
+ note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
325
+ then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
326
+ i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
327
+ usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
328
+ measure.
329
+
330
+ """
331
+ if self.num_additional_embeddings == 0:
332
+ return F.embedding(input_ids, self.weight)
333
+
334
+ # Clone so that we don't modify the original input_ids later on
335
+ input_ids = input_ids.clone()
336
+ additional_vocab_indices = torch.where(input_ids > self.max_original_id)
337
+ input_ids_additional_vocab = input_ids[additional_vocab_indices]
338
+ additional_embeddings = self.additional_embedding(
339
+ input_ids_additional_vocab - self.max_original_id - 1
340
+ )
341
+
342
+ # for successful lookup replace input_ids with 0, the results of these will be discarded anyway
343
+ input_ids[additional_vocab_indices] = 0
344
+ full_vector = F.embedding(input_ids, self.weight)
345
+
346
+ # overwrite the records with high indices
347
+ full_vector[additional_vocab_indices] = additional_embeddings
348
+
349
+ return full_vector
350
+
351
+ def extra_repr(self) -> str:
352
+ return "num_original_embeddings={}, num_additional_embeddings={}, embedding_dim={}, partially_freeze={}".format(
353
+ self.max_original_id + 1,
354
+ self.num_additional_embeddings,
355
+ self.embedding_dim,
356
+ (not self.weight.requires_grad),
357
+ )
358
+
359
+
360
+ class DecoupledLinear(nn.Linear):
361
+ # Derived from https://pytorch.org/docs/stable/_modules/torch/nn/modules/linear.html#Linear
362
+ """
363
+ Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practice, the
364
+ regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `additional_out_features` > 0,
365
+ then it will create `additional_out_features * in_features` additional parameters that are always trained. If
366
+ `additional_out_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
367
+ """
368
+
369
+ def __init__(
370
+ self,
371
+ max_original_id: int,
372
+ additional_out_features: int = 0,
373
+ _weight: torch.Tensor = None,
374
+ _bias: torch.Tensor = None,
375
+ in_features: int = None,
376
+ original_out_features: int = None,
377
+ bias: bool = True,
378
+ partially_freeze: bool = True,
379
+ device=None,
380
+ dtype=None,
381
+ ) -> None:
382
+ """
383
+ Args:
384
+ max_original_id (`int`): The largest token id that should be extracted from the regular weight.
385
+ This is usually len(tokenizer) - 1 before additional tokens are added.
386
+ Note that this may not equal original_out_features - 1
387
+ _weight: torch.Tensor, *optional*, defaults to `None`. The regular weight tensor.
388
+ If provided, this sets the `in_features` and `original_out_features` parameters.
389
+ _bias: torch.Tensor, *optional*, defaults to `None`. The regular bias tensor.
390
+ in_features: int. Input hidden size.
391
+ original_out_features: int. Original out_features of the language model's get_output_embeddings() function.
392
+ additional_out_features: int. Number of additional trainable dimensions.
393
+ bias: bool. Whether to include a bias term.
394
+ partially_freeze: bool, *optional*, defaults to `True`): If `True`, the regular `weight` will be frozen.
395
+ """
396
+ # argument validation
397
+ if _weight is not None:
398
+ assert (_weight.shape[0] == original_out_features) or (
399
+ original_out_features is None
400
+ ), f"original_out_features={original_out_features} but _weight.shape[0]={_weight.shape[0]}"
401
+ assert (_weight.shape[1] == in_features) or (
402
+ in_features is None
403
+ ), f"in_features={in_features} but _weight.shape[1]={_weight.shape[1]}"
404
+ in_features = _weight.shape[1]
405
+ original_out_features = _weight.shape[0]
406
+ else:
407
+ assert (
408
+ in_features is not None
409
+ ), "in_features must be provided if _weight is not provided"
410
+ assert (
411
+ original_out_features is not None
412
+ ), "original_out_features must be provided if _weight is not provided"
413
+
414
+ if _bias is not None:
415
+ assert bias is True, "bias must be True if _bias is provided"
416
+
417
+ # initialize original linear
418
+ super().__init__(
419
+ in_features,
420
+ original_out_features,
421
+ bias,
422
+ device,
423
+ dtype)
424
+
425
+ # set weight and bias manually
426
+ if _weight is not None:
427
+ self.weight = nn.Parameter(_weight)
428
+ if _bias is not None:
429
+ self.bias = nn.Parameter(_bias)
430
+
431
+ self.in_features = in_features
432
+ self.original_out_features = original_out_features
433
+ self.max_original_id = max_original_id
434
+
435
+ # initialize additional linear
436
+ self.additional_out_features = additional_out_features
437
+ self.has_bias = bias
438
+ if additional_out_features > 0:
439
+ self.additional_fc = nn.Linear(
440
+ in_features=in_features,
441
+ out_features=additional_out_features,
442
+ bias=self.has_bias,
443
+ device=device,
444
+ dtype=dtype,
445
+ )
446
+ self.set_requires_grad(
447
+ require_regular_grad=not partially_freeze, require_additional_grad=True
448
+ )
449
+
450
+ def set_requires_grad(self, require_regular_grad, require_additional_grad):
451
+ """
452
+ Helper function to separately set the requires_grad flag for the regular weight and the additional weight.
453
+ """
454
+ self.weight.requires_grad_(require_regular_grad)
455
+ if self.has_bias:
456
+ self.bias.requires_grad_(require_regular_grad)
457
+ self.additional_fc.requires_grad_(require_additional_grad)
458
+
459
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
460
+ output = F.linear(input, self.weight, self.bias)
461
+ output = output[..., : self.max_original_id + 1]
462
+
463
+ if self.additional_out_features > 0:
464
+ additional_features = F.linear(
465
+ input, self.additional_fc.weight, self.additional_fc.bias
466
+ )
467
+ output = torch.cat((output, additional_features), -1)
468
+ return output
469
+
470
+ def extra_repr(self) -> str:
471
+ """Overwriting `nn.Linear.extra_repr` to include new parameters."""
472
+ return "in_features={}, out_features={}, additional_out_features={}, bias={}, partially_freeze={}".format(
473
+ self.in_features,
474
+ self.max_original_id + 1,
475
+ self.additional_out_features,
476
+ self.bias is not None,
477
+ (not self.weight.requires_grad or not self.bias.requires_grad),
478
+ )
479
+
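+ # Illustrative usage sketch (hypothetical helper, not referenced elsewhere): with the defaults,
+ # the original output rows stay frozen while the additional rows remain trainable.
+ def _decoupled_linear_example():
+     base = nn.Linear(8, 10)
+     lin = DecoupledLinear(
+         max_original_id=9,          # keep all 10 original outputs
+         additional_out_features=2,  # 2 always-trainable extra outputs
+         _weight=base.weight,
+         _bias=base.bias,
+     )
+     x = torch.randn(3, 8)
+     return lin(x)  # shape (3, 12) = 10 original logits (frozen) + 2 additional logits (trainable)
+ 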
480
+ class VLM(nn.Module):
481
+ """
482
+ Generic vision-language model (VLM) class.
483
+ A VLM consists of four components:
484
+ 1. A vision encoder that extracts features from pixels, e.g. CLIP
485
+ input: (B, T_img, F, C, H, W)
486
+ output: (B, T_img, F, v, d)
487
+ 2. A vision tokenizer that converts these features to visual token-like embeddings, e.g. Perceiver, or a linear projection head
488
+ input: (B, T_img, F, v, d)
489
+ output: (B, T_img, n, d)
490
+ 3. A fusion method that allows the language model to attend to these tokens, e.g. cross-attention, or placing the tokens directly in the language model's input sequence
491
+ 4. A language model
492
+ """
493
+
494
+ def __init__(
495
+ self,
496
+ vision_encoder: nn.Module,
497
+ vision_tokenizer: nn.Module,
498
+ lang_model: nn.Module,
499
+ initial_tokenizer_len: int,
500
+ pad_token_id: int,
501
+ gradient_checkpointing: bool = False,
502
+ ):
503
+ """
504
+ Args:
505
+ vision_encoder (nn.Module): e.g. CLIP
506
+ vision_tokenizer (nn.Module): e.g. PerceiverResampler
507
+ lang_model (nn.Module): e.g. MPT
508
+ initial_tokenizer_len (int): size of the original tokenizer vocab
509
+ pad_token_id (int): id of the pad token
510
+ gradient_checkpointing (bool, optional): Whether to use gradient checkpointing. Defaults to False.
511
+ """
512
+ super().__init__()
513
+
514
+ # save dimension information
515
+ self.lang_embedding_dim = lang_model.get_input_embeddings().weight.shape[1]
516
+ if hasattr(lang_model.config, "d_model"):
517
+ self.lang_hidden_dim = lang_model.config.d_model # mpt uses d_model
518
+ else:
519
+ self.lang_hidden_dim = lang_model.config.hidden_size
520
+ self.vis_embedding_dim = vision_tokenizer.dim_media
521
+ self.num_tokens_per_vis = vision_tokenizer.num_tokens_per_media
522
+
523
+ # core components
524
+ self.vision_encoder = vision_encoder
525
+ self.vision_tokenizer = vision_tokenizer
526
+ self.lang_model = lang_model
527
+
528
+ # lm embeddings
529
+ self.pad_token_id = pad_token_id
530
+ self.initial_tokenizer_len = initial_tokenizer_len
531
+ input_embeds = DecoupledEmbedding(
532
+ max_original_id=initial_tokenizer_len - 1,
533
+ num_additional_embeddings=len(self.special_tokens),
534
+ _weight=self.lang_model.get_input_embeddings().weight,
535
+ pad_token_id=self.pad_token_id,
536
+ )
537
+ if hasattr(input_embeds, "additional_embedding"):
538
+ input_embeds.additional_embedding.weight.data.normal_(
539
+ mean=0.0,
540
+ std=self.lang_model.config.initializer_range
541
+ if hasattr(self.lang_model.config, "initializer_range")
542
+ else 0.02,
543
+ )
544
+ self.lang_model.set_input_embeddings(input_embeds)
545
+
546
+ out_embeds = DecoupledLinear(
547
+ max_original_id=initial_tokenizer_len - 1,
548
+ additional_out_features=len(self.special_tokens),
549
+ _weight=self.lang_model.get_output_embeddings().weight,
550
+ _bias=self.lang_model.get_output_embeddings().bias if hasattr(self.lang_model.get_output_embeddings(), "bias") else None,
551
+ )
552
+ if hasattr(out_embeds, "additional_fc"):
553
+ out_embeds.additional_fc.weight.data.normal_(
554
+ mean=0.0,
555
+ std=self.lang_model.config.initializer_range
556
+ if hasattr(self.lang_model.config, "initializer_range")
557
+ else 0.02,
558
+ )
559
+ self.lang_model.set_output_embeddings(out_embeds)
560
+
561
+ # gradient checkpointing
562
+ self.vision_tokenizer._use_gradient_checkpointing = gradient_checkpointing
563
+
564
+ def forward(
565
+ self,
566
+ vision_x: Optional[torch.Tensor],
567
+ lang_x: torch.Tensor,
568
+ attention_mask: Optional[torch.Tensor] = None,
569
+ labels: Optional[torch.Tensor] = None,
570
+ past_key_values: Optional[
571
+ List[Union[torch.Tensor, Tuple[torch.Tensor]]]
572
+ ] = None,
573
+ past_media_locations: Optional[torch.Tensor] = None,
574
+ past_vision_tokens: Optional[torch.Tensor] = None,
575
+ use_cache: Optional[bool] = False,
576
+ **kwargs,
577
+ ):
578
+ """
579
+ Args:
580
+ vision_x: Vision input
581
+ shape (B, T_img, F, C, H, W) with F=1
582
+ only F = 1 is supported (single-frame videos)
583
+ if T_img > the number of media tokens in the corresponding input_ids (lang_x),
584
+ only as many images as there are media tokens in lang_x are used
585
+ lang_x: Language input ids, with media tokens denoting where
586
+ visual media should be inserted.
587
+ shape (B, T_txt)
588
+ attention_mask: Attention mask. Defaults to None.
589
+ labels: Labels. Defaults to None.
590
+ shape (B, T_txt)
591
+ past_key_values (Tuple[torch.Tensor]], optional): Past key value pairs for each of the T_txt previous tokens in the language model. Defaults to None.
592
+ list of length = number of decoder layers in the LM
593
+ exact implementation depends on LM, see Hugging Face docs
594
+ past_media_locations (torch.Tensor, optional): boolean mask denoting which of the previous T_txt tokens were media tokens. Defaults to None.
595
+ shape (B, T_txt)
596
+ past_vision_tokens (torch.Tensor, optional): Previous vision tokens. Defaults to None.
597
+ use_cache (Optional[bool], optional): Whether to use cache. Defaults to False.
598
+ If True, includes key_values, media_locations, and vision_tokens in the output.
599
+ """
600
+ assert not (past_vision_tokens is None) ^ (
601
+ past_media_locations is None
602
+ ), "past_vision_tokens and past_media_locations must both be None or both be not None"
603
+
604
+ # convert pixels to vision tokens
605
+ if vision_x is not None:
606
+ vision_features = self._encode_vision_x(vision_x=vision_x)
607
+ vision_tokens = self.vision_tokenizer(vision_features)
608
+ else:
609
+ vision_tokens = None
610
+
611
+ # fuse the vision and language tokens
612
+ new_inputs = self._prepare_inputs_for_forward(
613
+ vision_tokens=vision_tokens,
614
+ lang_x=lang_x,
615
+ attention_mask=attention_mask,
616
+ labels=labels,
617
+ past_key_values=past_key_values,
618
+ past_media_locations=past_media_locations,
619
+ padding_side="right",
620
+ past_vision_tokens=past_vision_tokens,
621
+ )
622
+ output = self.lang_model(
623
+ **new_inputs,
624
+ use_cache=use_cache,
625
+ past_key_values=past_key_values,
626
+ **kwargs,
627
+ )
628
+
629
+ # postprocessing may be needed, e.g. to remove extra tokens from logits that were inserted into the language stream
630
+ # or to add the past_vision_tokens and past_media_locations to the output
631
+ output = self._postprocess_outputs_from_forward(
632
+ output=output,
633
+ lang_x=lang_x,
634
+ vision_tokens=vision_tokens,
635
+ use_cache=use_cache,
636
+ past_vision_tokens=past_vision_tokens,
637
+ past_media_locations=past_media_locations,
638
+ )
639
+
640
+ # postforward hooks
641
+ self._post_forward_hook()
642
+ return output
643
+
644
+ def _encode_vision_x_anyres(self, samples, device):
645
+ image_raw = samples["image"] # list of patch tensors, each of shape [1, N_patch, C, H, W]
646
+ image_sizes = samples["image_size"]
647
+
648
+ # concatenate the lists of patches into one big batch for anyres encoding.
649
+ images = [x.squeeze(0) for x in image_raw] # [N_patch, C, H, W]
650
+ image = torch.cat(images, dim=0) # [\sum{B}{N_patch_i}, C, H, W]
651
+ image = image.to(device)
652
+
653
+ with torch.no_grad():
654
+ if self.vision_encoder.__class__.__name__ == "TimmModel":
655
+ image_embeds = self.vision_encoder.trunk.forward_features(image)
656
+ elif self.vision_encoder.__class__.__name__ == 'CLIPVisionModel':
657
+ image_embeds = self.vision_encoder(image).last_hidden_state
658
+ else:
659
+ image_embeds = self.vision_encoder(image)[1] # OpenCLIP returns tuples
660
+
661
+ if isinstance(self.vision_encoder, CLIPVisionModel):
662
+ base_img_size = self.vision_encoder.config.image_size
663
+ else:
664
+ base_img_size = self.vision_encoder.image_size[0]
665
+
666
+ if self.vision_encoder.__class__.__name__ == "TimmModel":
667
+ grid_size = self.vision_encoder.trunk.patch_embed.grid_size
668
+ elif self.vision_encoder.__class__.__name__ == 'CLIPVisionModel':
669
+ grid_size_base = self.vision_encoder.config.image_size // self.vision_encoder.config.patch_size
670
+ grid_size = (grid_size_base, grid_size_base)
671
+ else:
672
+ grid_size = self.vision_encoder.grid_size
673
+ height, width = grid_size
674
+
675
+ if not image_embeds.shape[1] == height * width:
676
+ assert image_embeds.shape[1] == height * width + 1 # For vision encoders that have a [CLS] token.
677
+ image_embeds = image_embeds[:, 1:, :] # Drop the cls token for each patch.
678
+ n_vis_token_per_patch = image_embeds.shape[1]
679
+
680
+ # Split encoded patches and merge patch features
681
+ # 1. Get the raw sizes from samples, and split the image embeds [\sum_{B}(N_patch_i), N_tok(16*16), C]
682
+ split_sizes = [image.shape[0] for image in images]
683
+ image_embeds = torch.split(image_embeds, split_sizes, dim=0)
684
+ # 2. For each image (consisting of a list of patches), merge the patches spatially (of shape [C, n_patch_height, n_patch_width])
685
+ new_image_embeds = []
686
+ patch_attn_masks = []
687
+ max_n_img_token = -1
688
+ for idx, patch_embeds in enumerate(image_embeds):
689
+ if patch_embeds.shape[0] > 1:
690
+ # 3. Flatten the patch features and get [C, n_patch_height * (n_patch_width+1)]
691
+ base_patch_embeds = patch_embeds[0] # TODO: prepend the CLS token for the base patch embeds (of the resized entire image).
692
+ patch_embeds = patch_embeds[1:]
693
+
694
+ assert height * width == base_patch_embeds.shape[0]
695
+
696
+ num_patch_width, num_patch_height = get_anyres_image_grid_shape(image_sizes[idx],
697
+ [[base_img_size,base_img_size*2],
698
+ [base_img_size*2,base_img_size],
699
+ [base_img_size*2,base_img_size*2],
700
+ [base_img_size*3,base_img_size],
701
+ [base_img_size,base_img_size*3]],
702
+ base_img_size) # Hardcoded grid_pinpoints.
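+ # Note on the hardcoded grid_pinpoints: with a 336-px base encoder (e.g. CLIP ViT-L/14-336, used here
+ # purely as an illustrative number) these correspond to 336x672, 672x336, 672x672, 1008x336 and 336x1008
+ # target resolutions, i.e. 1x2, 2x1, 2x2, 3x1 and 1x3 grids of base-size patches plus the resized base image.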
703
+ patch_embeds = patch_embeds.view(num_patch_height, num_patch_width, height, width, -1)
704
+
705
+ patch_embeds = patch_embeds.permute(4, 0, 2, 1, 3).contiguous()
706
+ patch_embeds = patch_embeds.flatten(1, 2).flatten(2, 3)
707
+ # TODO: add an option that return masked patch_embeds instead of trimmed.
708
+ patch_embeds, patch_attn_mask = unpad_image(patch_embeds, image_sizes[idx], self.anyres_patch_sampling)
709
+ if hasattr(self, 'image_newline'):
710
+ patch_embeds = torch.cat((
711
+ patch_embeds,
712
+ self.image_newline[:, None, None].expand(*patch_embeds.shape[:-1], 1)
713
+ ), dim=-1)
714
+ if self.anyres_patch_sampling:
715
+ patch_embeds = patch_embeds.view(-1, num_patch_height, num_patch_width, height*width)
716
+ patch_embeds = patch_embeds.flatten(1, 2).permute(1, 2, 0)
717
+ assert patch_attn_mask is not None
718
+ patch_attn_mask = patch_attn_mask.view(num_patch_height, num_patch_width, height*width)
719
+ patch_attn_mask = patch_attn_mask.flatten(0, 1)
720
+ patch_embeds = torch.cat((base_patch_embeds.unsqueeze(0), patch_embeds), dim=0)
721
+ patch_attn_mask = torch.cat((torch.ones(n_vis_token_per_patch, device=patch_embeds.device).unsqueeze(0), patch_attn_mask), dim=0)
722
+ else:
723
+ patch_embeds = patch_embeds.flatten(1, 2).transpose(0, 1)
724
+ patch_embeds = torch.cat((base_patch_embeds, patch_embeds), dim=0)
725
+ else:
726
+ patch_embeds = patch_embeds[0].unsqueeze(0) if self.anyres_patch_sampling else patch_embeds[0]
727
+ patch_attn_mask = torch.ones(n_vis_token_per_patch, device=patch_embeds.device).unsqueeze(0) if self.anyres_patch_sampling else None
728
+ if hasattr(self, 'image_newline'):
729
+ patch_embeds = torch.cat((
730
+ patch_embeds,
731
+ self.image_newline[None]
732
+ ), dim=0)
733
+ if not self.anyres_patch_sampling:
734
+ max_n_img_token = max(patch_embeds.shape[0], max_n_img_token)
735
+
736
+ new_image_embeds.append(patch_embeds)
737
+ patch_attn_masks.append(patch_attn_mask)
738
+
739
+ if self.anyres_patch_sampling:
740
+ # Return individual patches for independent token downsampling.
741
+ return new_image_embeds, patch_attn_masks
742
+
743
+ # 4. Pad and concat the list of image_embeds [N_tok_i, C] together into a batch. Also modify the query attention mask.
744
+ image_embeds = []
745
+ image_atts = []
746
+ for image_embed in new_image_embeds:
747
+ n_img_token = image_embed.shape[0]
748
+ img_attn = torch.ones((max_n_img_token), dtype=torch.long, device=image_embed.device)
749
+ if n_img_token < max_n_img_token:
750
+ padded_embed = torch.zeros((max_n_img_token, image_embed.shape[-1]), dtype=image_embed.dtype, device=image_embed.device)
751
+ padded_embed[:n_img_token, :] = image_embed
752
+ img_attn[n_img_token:] = 0 # Mask out the padded entries.
753
+ else:
754
+ padded_embed = image_embed
755
+ image_embeds.append(padded_embed)
756
+ image_atts.append(img_attn)
757
+ image_embeds = torch.stack(image_embeds, dim=0) # Shape [B, N_tok_longest, C_dim]
758
+ image_atts = torch.stack(image_atts, dim=0) # Shape [B, N_tok_longest]
759
+ # TODO: reshape image_embeds and image_atts to "b T F v d"
760
+ image_embeds = image_embeds[:, None, None, :, :]
761
+ # image_atts = image_atts[:, None, None, :, :]
762
+
763
+ return image_embeds, image_atts
764
+
765
+ def _encode_vision_x(self, vision_x: torch.Tensor):
766
+ """
767
+ Compute media tokens from vision input by passing it through vision encoder and conditioning language model.
768
+ Args:
769
+ vision_x: Vision input
770
+ shape (B, T_img, F, C, H, W)
771
+ Images in the same chunk are collated along T_img, and frames are collated along F
772
+ Currently only F=1 is supported (single-frame videos)
773
+
774
+ rearrange code based on https://github.com/dhansmair/flamingo-mini
775
+ """
776
+ assert vision_x.ndim == 6, "vision_x should be of shape (b, T_img, F, C, H, W)"
777
+ b, T, F = vision_x.shape[:3]
778
+
779
+ vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
780
+ with torch.no_grad():
781
+ if self.vision_encoder.__class__.__name__ == "TimmModel":
782
+ vision_x = self.vision_encoder.trunk.forward_features(vision_x)
783
+ elif self.vision_encoder.__class__.__name__ == 'CLIPVisionModel':
784
+ vision_x = self.vision_encoder(vision_x).last_hidden_state
785
+ else:
786
+ vision_x = self.vision_encoder(vision_x)[1] # OpenCLIP returns tuples
787
+ vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
788
+ return vision_x
789
+
790
+ def _concat_vision_cache(
791
+ self, lang_x, vision_tokens, past_vision_tokens, past_media_locations, use_cache
792
+ ):
793
+ """
794
+ Helper function to include the past vision tokens and past media locations in the output.
795
+ """
796
+ if use_cache:
797
+ if past_media_locations is not None and past_vision_tokens is not None:
798
+ if vision_tokens is not None:
799
+ updated_vision_tokens = torch.cat(
800
+ [
801
+ past_vision_tokens,
802
+ vision_tokens,
803
+ ],
804
+ dim=1,
805
+ )
806
+ else:
807
+ updated_vision_tokens = past_vision_tokens
808
+ updated_media_locations = torch.cat(
809
+ [
810
+ past_media_locations,
811
+ lang_x == self.media_token_id,
812
+ ],
813
+ dim=1,
814
+ )
815
+ else:
816
+ updated_vision_tokens = vision_tokens
817
+ updated_media_locations = lang_x == self.media_token_id
818
+
819
+ else:
820
+ updated_vision_tokens = None
821
+ updated_media_locations = None
822
+
823
+ return updated_vision_tokens, updated_media_locations
824
+
825
+ def generate(
826
+ self,
827
+ vision_x: torch.Tensor,
828
+ lang_x: torch.Tensor,
829
+ attention_mask: torch.Tensor = None,
830
+ past_key_values: Optional[
831
+ List[Union[torch.Tensor, Tuple[torch.Tensor]]]
832
+ ] = None,
833
+ past_media_locations: Optional[torch.Tensor] = None,
834
+ past_vision_tokens: Optional[torch.Tensor] = None,
835
+ **kwargs,
836
+ ):
837
+ """
838
+ Generate text conditioned on vision and language inputs.
839
+ Args:
840
+ vision_x (torch.Tensor): Vision input
841
+ shape (B, T_img, F, C, H, W)
842
+ see documentation for forward
843
+ lang_x (torch.Tensor): Language input
844
+ shape (B, T_txt)
845
+ attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
846
+ **kwargs: see generate documentation in Hugging Face CausalLM models.
847
+ Returns:
848
+ torch.Tensor: lang_x with generated tokens appended to it
849
+ """
850
+ num_beams = kwargs.pop("num_beams", 1)
851
+
852
+ # convert pixels to vision tokens
853
+ if vision_x is not None:
854
+ vision_features = self._encode_vision_x(vision_x=vision_x)
855
+ vision_tokens = self.vision_tokenizer(vision_features)
856
+ else:
857
+ vision_tokens = None
858
+
859
+ # fuse the vision and language tokens
860
+ # for xattn, vision_x and media_location are repeat_interleaved s.t.
861
+ # the total batch size is B * num_beams
862
+ new_inputs = self._prepare_inputs_for_forward(
863
+ vision_tokens=vision_tokens,
864
+ lang_x=lang_x,
865
+ attention_mask=attention_mask,
866
+ past_key_values=past_key_values,
867
+ past_media_locations=past_media_locations,
868
+ past_vision_tokens=past_vision_tokens,
869
+ padding_side="left",
870
+ num_beams=num_beams,
871
+ )
872
+ output = self.lang_model.generate(
873
+ **new_inputs,
874
+ past_key_values=past_key_values,
875
+ num_beams=num_beams,
876
+ use_cache=True,
877
+ **kwargs,
878
+ )
879
+ self._post_forward_hook()
880
+ return output
881
+
882
+ @property
883
+ def num_trainable_params(self):
884
+ """Print the number of trainable parameters"""
885
+ return num_params(self, filter_to_trainable=True)
886
+
887
+ def set_trainable(self):
888
+ """
889
+ Freeze appropriate parameters in the model.
890
+ """
891
+ raise NotImplementedError
892
+
893
+ def group_params_by_weight_decay(self):
894
+ """
895
+ Return a tuple of (params to optimize w/ weight decay, params to optimize w/o weight decay)
896
+ """
897
+ params_with_wd, params_without_wd = [], []
898
+ for n, p in self.named_parameters():
899
+ if p.requires_grad:
900
+ if self._should_apply_weight_decay(n):
901
+ params_with_wd.append(p)
902
+ else:
903
+ params_without_wd.append(p)
904
+ return params_with_wd, params_without_wd
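+ # Typical use (sketch): the two groups feed separate optimizer param groups, e.g. with AdamW
+ # (the learning rate shown is illustrative; 0.01 matches the weight decay used by KosmosInstruct below):
+ #   wd_params, no_wd_params = model.group_params_by_weight_decay()
+ #   optimizer = torch.optim.AdamW(
+ #       [{"params": wd_params, "weight_decay": 0.01},
+ #        {"params": no_wd_params, "weight_decay": 0.0}],
+ #       lr=1e-4,
+ #   )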
905
+
906
+ def _should_apply_weight_decay(self, parameter_name):
907
+ """
908
+ Return whether weight decay should be applied to a parameter.
909
+ """
910
+ raise NotImplementedError
911
+
912
+ @property
913
+ def special_tokens(self):
914
+ """
915
+ Returns a dict mapping from the attribute name of a special token to its string format,
916
+ e.g. "media_token": "<image>"
917
+ """
918
+ assert (
919
+ "media_token" in self._special_tokens
920
+ ), "VLMs need to request that the tokenizer add a media_token and call set_special_token_ids to set self.media_token_id"
921
+ return self._special_tokens
922
+
923
+ @property
924
+ def special_token_ids(self):
925
+ """
926
+ Returns a list of the special token ids
927
+ """
928
+ return [getattr(self, f"{att_name}_id") for att_name in self.special_tokens]
929
+
930
+ def set_special_token_ids(self, string_to_ids):
931
+ """
932
+ Args:
933
+ string_to_ids (dict): mapping from token string to id
934
+ """
935
+ assert set(self.special_tokens.values()).issubset(set(string_to_ids.keys()))
936
+ for att_name, token_str in self.special_tokens.items():
937
+ token_id = string_to_ids[token_str]
938
+ setattr(self, f"{att_name}_id", token_id)
939
+ setattr(self.lang_model, f"{att_name}_id", token_id)
940
+
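+ # Example (sketch): with a Hugging Face tokenizer that already contains the special-token strings,
+ #   model.set_special_token_ids({s: tokenizer.convert_tokens_to_ids(s) for s in model.special_tokens.values()})
+ # sets e.g. model.media_token_id (and mirrors the id onto model.lang_model); the exact ids depend on the tokenizer.
+ 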
941
+ def init_gradient_checkpointing(self):
942
+ from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
943
+ checkpoint_wrapper,
944
+ CheckpointWrapper,
945
+ CheckpointImpl,
946
+ apply_activation_checkpointing,
947
+ )
948
+ from functools import partial
949
+
950
+ non_reentrant_wrapper = partial(
951
+ checkpoint_wrapper,
952
+ checkpoint_impl=CheckpointImpl.NO_REENTRANT,
953
+ )
954
+ apply_activation_checkpointing(
955
+ self,
956
+ checkpoint_wrapper_fn=non_reentrant_wrapper,
957
+ check_fn=lambda m: getattr(m, "_use_gradient_checkpointing", False)
958
+ and not isinstance(m, CheckpointWrapper),
959
+ )
960
+
961
+ @dataclass
962
+ class VLMOutputWithPast(CausalLMOutputWithPast):
963
+ """
964
+ VLMOutputWithPast is a wrapper around CausalLMOutputWithPast that adds the following attributes:
965
+ past_media_locations: Optional[torch.Tensor] = None,
966
+ past_vision_tokens: Optional[torch.Tensor] = None,
967
+ """
968
+
969
+ past_media_locations: Optional[torch.Tensor] = None
970
+ past_vision_tokens: Optional[torch.Tensor] = None
971
+
972
+
973
+ def exists(val):
974
+ return val is not None
975
+
976
+
977
+ def FeedForward(dim, mult=4):
978
+ inner_dim = int(dim * mult)
979
+ return nn.Sequential(
980
+ nn.LayerNorm(dim),
981
+ nn.Linear(dim, inner_dim, bias=False),
982
+ nn.GELU(),
983
+ nn.Linear(inner_dim, dim, bias=False),
984
+ )
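+ # e.g. FeedForward(dim=1024) builds LayerNorm(1024) -> Linear(1024, 4096, bias=False) -> GELU()
+ # -> Linear(4096, 1024, bias=False), the standard bias-free pre-norm transformer MLP block.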
985
+
986
+ class VLMWithLanguageStream(VLM):
987
+ """
988
+ VLM that fuses modalities by inserting vision tokens directly into the language stream.
989
+ """
990
+
991
+ def __init__(
992
+ self,
993
+ vision_encoder: nn.Module,
994
+ vision_tokenizer: nn.Module,
995
+ lang_model: nn.Module,
996
+ initial_tokenizer_len: int,
997
+ pad_token_id: int,
998
+ decoder_layers_attr_name: str = None,
999
+ gradient_checkpointing: bool = False,
1000
+ ):
1001
+ super().__init__(
1002
+ vision_encoder=vision_encoder,
1003
+ vision_tokenizer=vision_tokenizer,
1004
+ lang_model=lang_model,
1005
+ initial_tokenizer_len=initial_tokenizer_len,
1006
+ pad_token_id=pad_token_id,
1007
+ gradient_checkpointing=gradient_checkpointing,
1008
+ )
1009
+ self.decoder_layers_attr_name = decoder_layers_attr_name
1010
+ if decoder_layers_attr_name is not None:
1011
+ for block in getattr_recursive(self.lang_model, self.decoder_layers_attr_name):
1012
+ block._use_gradient_checkpointing = gradient_checkpointing
1013
+
1014
+ def _prepare_inputs_for_forward(
1015
+ self,
1016
+ vision_tokens: torch.Tensor,
1017
+ lang_x: torch.Tensor,
1018
+ attention_mask: torch.Tensor,
1019
+ labels: torch.Tensor = None,
1020
+ past_key_values=None,
1021
+ vision_attention_mask: Optional[torch.Tensor] = None,
1022
+ past_media_locations: torch.Tensor = None,
1023
+ past_vision_tokens: torch.Tensor = None,
1024
+ padding_side: str = "left",
1025
+ num_beams: int = 1,
1026
+ ):
1027
+ """
1028
+ Insert the vision tokens directly into the language stream.
1029
+ This requires us to modify the input_ids, attention_mask, and labels.
1030
+ """
1031
+ if past_key_values is not None:
1032
+ past_len = past_key_values[0][0].shape[2]
1033
+ assert attention_mask.shape[1] == past_len + lang_x.shape[1], (
1034
+ "Attention_mask must be as long as the entire past len (including image tokens) and current input IDs. "
1035
+ + "Check that you've expanded the attention mask to account for past image tokens."
1036
+ )
1037
+
1038
+ if vision_tokens is None:
1039
+ return {
1040
+ "input_ids": lang_x,
1041
+ "attention_mask": attention_mask,
1042
+ "labels": labels,
1043
+ }
1044
+
1045
+ # get the language embeddings
1046
+ lang_embeds = self.lang_model.get_input_embeddings()(lang_x)
1047
+
1048
+ # build up the multimodal embeddings
1049
+ B = lang_x.shape[0]
1050
+ has_labels = labels is not None
1051
+ multimodal_embeds = []
1052
+ multimodal_attention_mask = []
1053
+ multimodal_labels = [] if has_labels else None
1054
+ for i in range(B):
1055
+ # get index of <image> tokens in lang_x[i]
1056
+ image_token_idxs = torch.where(lang_x[i] == self.media_token_id)[0]
1057
+
1058
+ if len(image_token_idxs) == 0:
1059
+ multimodal_embeds.append(lang_embeds[i].clone())
1060
+ multimodal_attention_mask.append(attention_mask[i].clone())
1061
+ if has_labels:
1062
+ multimodal_labels.append(labels[i].clone())
1063
+ continue
1064
+
1065
+ # since an image is represented by self.num_tokens_per_vis tokens, we need to offset the image_token_idxs
1066
+ for j, img_idx in enumerate(image_token_idxs):
1067
+ image_token_idxs[j] += (self.num_tokens_per_vis - 1) * j # FIXME: the offset differs for any-resolution encoding when there are multiple images.
1068
+
1069
+ # loop through the image_token_idxs and insert the vision tokens
1070
+ new_embed = lang_embeds[i].clone()
1071
+ new_attention_mask = (
1072
+ attention_mask[i].clone() if attention_mask is not None else None
1073
+ )
1074
+ if has_labels:
1075
+ new_label = labels[i].clone()
1076
+
1077
+ for img_num, img_idx in enumerate(image_token_idxs):
1078
+ if img_num > 0:
1079
+ # FIXME: hardcoded as such to avoid an assertion error, but this only works for single-image samples.
1080
+ break
1081
+ # Get vision token attention mask for padded llava-style any resolution image tokens.
1082
+ if self.image_aspect_ratio =='anyres':
1083
+ num_vis_tokens = vision_tokens[i][img_num].shape[0]
1084
+ if vision_attention_mask is not None:
1085
+ vis_attention_mask = vision_attention_mask[i]
1086
+ else:
1087
+ vis_attention_mask = torch.ones(
1088
+ num_vis_tokens, dtype=torch.long
1089
+ ).to(attention_mask.device)
1090
+ else:
1091
+ assert (
1092
+ vision_tokens[i][img_num].shape[0] == self.num_tokens_per_vis
1093
+ ), f"vision token number mismatch: image embedding ({vision_tokens[i][img_num].shape[0]}) \
1094
+ vs. model.num_tokens_per_vis ({self.num_tokens_per_vis})"
1095
+ # By default, vision tokens are not padded.
1096
+ num_vis_tokens = self.num_tokens_per_vis
1097
+ vis_attention_mask = torch.ones(
1098
+ num_vis_tokens, dtype=torch.long
1099
+ ).to(attention_mask.device)
1100
+
1101
+
1102
+ new_embed = torch.cat(
1103
+ (
1104
+ new_embed[:img_idx],
1105
+ vision_tokens[i][img_num],
1106
+ new_embed[img_idx + 1 :],
1107
+ ),
1108
+ dim=0,
1109
+ )
1110
+ new_attention_mask = torch.cat(
1111
+ (
1112
+ new_attention_mask[:img_idx],
1113
+ vis_attention_mask,
1114
+ new_attention_mask[img_idx + 1 :],
1115
+ ),
1116
+ dim=0,
1117
+ )
1118
+ if has_labels:
1119
+ new_label = torch.cat(
1120
+ (
1121
+ new_label[:img_idx],
1122
+ torch.ones(num_vis_tokens, dtype=torch.long).to(
1123
+ labels.device
1124
+ )
1125
+ * -100,
1126
+ new_label[img_idx + 1 :],
1127
+ ),
1128
+ dim=0,
1129
+ )
1130
+ multimodal_embeds.append(new_embed)
1131
+ multimodal_attention_mask.append(new_attention_mask)
1132
+ if has_labels:
1133
+ multimodal_labels.append(new_label)
1134
+
1135
+ # stack
1136
+ multimodal_embeds = stack_with_padding(
1137
+ multimodal_embeds,
1138
+ padding_value=self.pad_token_id,
1139
+ padding_side=padding_side,
1140
+ )
1141
+ multimodal_attention_mask = stack_with_padding(
1142
+ multimodal_attention_mask,
1143
+ padding_value=0,
1144
+ padding_side=padding_side,
1145
+ )
1146
+ if has_labels:
1147
+ multimodal_labels = stack_with_padding(
1148
+ multimodal_labels,
1149
+ padding_value=-100,
1150
+ padding_side=padding_side,
1151
+ )
1152
+
1153
+ return {
1154
+ "inputs_embeds": multimodal_embeds,
1155
+ "attention_mask": multimodal_attention_mask,
1156
+ "labels": multimodal_labels,
1157
+ }
1158
+
1159
+ def _postprocess_outputs_from_forward(
1160
+ self,
1161
+ output: CausalLMOutputWithPast,
1162
+ lang_x: torch.Tensor,
1163
+ vision_tokens: torch.Tensor,
1164
+ past_vision_tokens: torch.Tensor,
1165
+ past_media_locations: torch.Tensor,
1166
+ use_cache: bool = False,
1167
+ ):
1168
+ # Include the past vision tokens and past media locations in the output
1169
+ updated_vision_tokens, updated_media_locations = self._concat_vision_cache(
1170
+ lang_x=lang_x,
1171
+ vision_tokens=vision_tokens,
1172
+ past_vision_tokens=past_vision_tokens,
1173
+ past_media_locations=past_media_locations,
1174
+ use_cache=use_cache,
1175
+ )
1176
+
1177
+ # return logits that are the same shape as the original input_ids
1178
+ logits = output.logits
1179
+ batch_logits = []
1180
+ B, T_txt = lang_x.shape
1181
+ for i in range(B):
1182
+ sequence_logits = []
1183
+ logits_j = 0
1184
+ for j in range(T_txt):
1185
+ if lang_x[i, j] != self.media_token_id:
1186
+ sequence_logits.append(logits[i, logits_j])
1187
+ logits_j += 1
1188
+ else:
1189
+ # append the logit for the first image token, then skip over the rest
1190
+ # note: the model actually learns to predict <im_patch>, not <image>
1191
+ sequence_logits.append(logits[i, logits_j])
1192
+ logits_j += self.num_tokens_per_vis
1193
+ sequence_logits = torch.stack(sequence_logits, dim=0) # (T_txt, vocab_size)
1194
+ batch_logits.append(sequence_logits)
1195
+
1196
+ batch_logits = torch.stack(batch_logits, dim=0) # (B, T_txt, vocab_size)
1197
+ # The final logits shape should be the same as the original input_ids shape
1198
+ assert batch_logits.shape[:2] == (B, T_txt)
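+ # e.g. with T_txt=5, one <image> at position 1, and num_tokens_per_vis=64, the language model produced
+ # 5 - 1 + 64 = 68 rows of logits; the loop keeps rows [0, 1, 65, 66, 67], restoring alignment with lang_x.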
1199
+
1200
+ # assemble the output
1201
+ output = VLMOutputWithPast(
1202
+ loss=output.loss,
1203
+ logits=batch_logits,
1204
+ past_key_values=output.past_key_values,
1205
+ hidden_states=output.hidden_states,
1206
+ attentions=output.attentions,
1207
+ past_media_locations=updated_media_locations,
1208
+ past_vision_tokens=updated_vision_tokens,
1209
+ )
1210
+
1211
+ return output
1212
+
1213
+ def _post_forward_hook(self):
1214
+ pass
1215
+
1216
+
1217
+ @property
1218
+ def num_params_per_module(self):
1219
+ """Print the number of parameters per module in the model"""
1220
+ return "\n".join(
1221
+ [
1222
+ f"Vision encoder: {num_params(self.vision_encoder):,} parameters",
1223
+ f"Vision tokenizer: {num_params(self.vision_tokenizer):,} parameters",
1224
+ f"Language model: {num_params(self.lang_model):,} parameters",
1225
+ ]
1226
+ )
1227
+
1228
+ @property
1229
+ def num_trainable_params_per_module(self):
1230
+ """Print the number of trainable parameters per module in the model"""
1231
+ return "\n".join(
1232
+ [
1233
+ f"Vision encoder: {num_params(self.vision_encoder, filter_to_trainable=True):,} trainable parameters",
1234
+ f"Vision tokenizer: {num_params(self.vision_tokenizer, filter_to_trainable=True):,} trainable parameters",
1235
+ f"Language model: {num_params(self.lang_model, filter_to_trainable=True):,} trainable parameters",
1236
+ ]
1237
+ )
1238
+
1239
+
1240
+ class KosmosInstruct(VLMWithLanguageStream):
1241
+ def __init__(
1242
+ self,
1243
+ vision_encoder: nn.Module,
1244
+ vision_tokenizer: nn.Module,
1245
+ lang_model: nn.Module,
1246
+ initial_tokenizer_len: int,
1247
+ pad_token_id: int,
1248
+ decoder_layers_attr_name: str = None,
1249
+ gradient_checkpointing: bool = False,
1250
+ image_aspect_ratio: str = 'pad',
1251
+ anyres_patch_sampling: bool = False
1252
+ ):
1253
+ """
1254
+ Args:
1255
+ vision_encoder (nn.Module): vision backbone, e.g. an HF CLIPVisionModel or a timm/OpenCLIP vision tower
1256
+ vision_tokenizer (nn.Module): module that converts vision features into visual tokens, e.g. a PerceiverResampler
1257
+ lang_model (nn.Module): HF causal language model
1258
+ initial_tokenizer_len (int): size of the tokenizer vocab
1259
+ pad_token_id (int): id of the padding token. None if no padding token; then a padding token
1260
+ will be inserted into self.special_tokens, which factory.py fills after creating new tokens
1261
+ decoder_layers_attr_name (str, optional): name of the decoder layers attribute. Defaults to None.
1262
+ gradient_checkpointing (bool, optional): whether to use gradient checkpointing. Defaults to False.
+ image_aspect_ratio (str, optional): image preprocessing mode, either 'pad' or 'anyres'. Defaults to 'pad'.
+ anyres_patch_sampling (bool, optional): if True, anyres image patches are tokenized individually and padded afterwards. Defaults to False.
1263
+ """
1264
+ self._special_tokens = {
1265
+ "media_token": "<image>",
1266
+ "image_placeholder_token": "<image placeholder>",
1267
+ "end_of_trunk_token": "<|endofchunk|>",
1268
+ }
1269
+ lang_embedding_dim = lang_model.get_input_embeddings().weight.shape[1]
1270
+ super().__init__(
1271
+ vision_encoder=vision_encoder,
1272
+ vision_tokenizer=vision_tokenizer,
1273
+ lang_model=lang_model,
1274
+ initial_tokenizer_len=initial_tokenizer_len,
1275
+ gradient_checkpointing=gradient_checkpointing,
1276
+ decoder_layers_attr_name=decoder_layers_attr_name,
1277
+ pad_token_id=pad_token_id,
1278
+ )
1279
+ self.image_aspect_ratio = image_aspect_ratio
1280
+ self.anyres_patch_sampling = anyres_patch_sampling
1281
+
1282
+ def set_trainable(self):
1283
+ """
1284
+ Unfreeze everything except the vision_encoder
1285
+ """
1286
+ self.requires_grad_(True)
1287
+ self.vision_encoder.requires_grad_(False)
1288
+
1289
+ def _should_apply_weight_decay(self, parameter_name):
1290
+ """
1291
+ Kosmos applies 0.01 weight decay to everything
1292
+ """
1293
+ return True
1294
+
1295
+ def forward(
1296
+ self,
1297
+ vision_x: Optional[torch.Tensor],
1298
+ lang_x: torch.Tensor,
1299
+ attention_mask: Optional[torch.Tensor] = None,
1300
+ labels: Optional[torch.Tensor] = None,
1301
+ image_size: Optional[Tuple] = None,
1302
+ past_key_values: Optional[
1303
+ List[Union[torch.Tensor, Tuple[torch.Tensor]]]
1304
+ ] = None,
1305
+ past_media_locations: Optional[torch.Tensor] = None,
1306
+ past_vision_tokens: Optional[torch.Tensor] = None,
1307
+ use_cache: Optional[bool] = False,
1308
+ **kwargs,
1309
+ ):
1310
+ """
1311
+ Args:
1312
+ vision_x: Vision input
1313
+ shape (B, T_img, F, C, H, W) with F=1
1314
+ only F = 1 is supported (single-frame videos)
1315
+ if T_img > the number of media tokens in the corresponding input_ids (lang_x),
1316
+ only as many images as there are media tokens in lang_x are used
1317
+ lang_x: Language input ids, with media tokens denoting where
1318
+ visual media should be inserted.
1319
+ shape (B, T_txt)
1320
+ attention_mask: Attention mask. Defaults to None.
1321
+ labels: Labels. Defaults to None.
1322
+ shape (B, T_txt)
1323
+ past_key_values (Tuple[torch.Tensor]], optional): Past key value pairs for each of the T_txt previous tokens in the language model. Defaults to None.
1324
+ list of length = number of decoder layers in the LM
1325
+ exact implementation depends on LM, see Hugging Face docs
1326
+ past_media_locations (torch.Tensor, optional): boolean mask denoting which of the previous T_txt tokens were media tokens. Defaults to None.
1327
+ shape (B, T_txt)
1328
+ past_vision_tokens (torch.Tensor, optional): Previous vision tokens. Defaults to None.
1329
+ use_cache (Optional[bool], optional): Whether to use cache. Defaults to False.
1330
+ If True, includes key_values, media_locations, and vision_tokens in the output.
1331
+ """
1332
+ assert not (past_vision_tokens is None) ^ (
1333
+ past_media_locations is None
1334
+ ), "past_vision_tokens and past_media_locations must both be None or both be not None"
1335
+
1336
+ # convert pixels to vision tokens
1337
+ vision_attention_mask = None
1338
+ if vision_x is not None:
1339
+ if self.image_aspect_ratio == 'anyres':
1340
+ input_dict = dict(image=vision_x, image_size=image_size)
1341
+ vision_features, vision_attn_masks = self._encode_vision_x_anyres(input_dict, lang_x.device)
1342
+ else:
1343
+ vision_features = self._encode_vision_x(vision_x=vision_x)
1344
+ vision_attn_masks = None
1345
+ if self.anyres_patch_sampling:
1346
+ split_sizes = [feature.shape[0] for feature in vision_features]
1347
+ vision_features = torch.cat(vision_features, dim=0)
1348
+ vision_features = vision_features[:, None, None, :, :] # Expand dimensions.
1349
+ vision_attn_masks = torch.cat(vision_attn_masks, dim=0)
1350
+ # Prepare text embeds for instruction-aware image query sampling.
1351
+ # FIXME: for debugging purposes, truncate the text input to the vision tokenizer to at most 256 tokens.
1352
+ lang_x_truncated = lang_x[:, :256]
1353
+ text_embeds = self.lang_model.get_input_embeddings()(lang_x_truncated)
1354
+ # TODO: repeat text_embeds to match the number of patches for each image patch group.
1355
+ if self.anyres_patch_sampling:
1356
+ repeated_text_embeds = []
1357
+ for i, np in enumerate(split_sizes):
1358
+ repeated_text_embeds.append(text_embeds[i].repeat(np, 1, 1))
1359
+ text_embeds = torch.cat(repeated_text_embeds, dim=0)
1360
+ vision_tokens = self.vision_tokenizer(vision_features, text_embeds)
1361
+
1362
+ # Post-processing: Split the batches into groups of patches and concatenate them together.
1363
+ if self.anyres_patch_sampling:
1364
+ vision_token_groups = torch.split(vision_tokens, split_sizes, dim=0)
1365
+ max_n_vis_token = max([vis.shape[0]*vis.shape[-2] for vis in vision_token_groups])
1366
+ # Padding.
1367
+ padded_vision_tokens = []
1368
+ padded_attn_masks = []
1369
+ for patch_vis_tokens in vision_token_groups:
1370
+ patch_vis_tokens = patch_vis_tokens.flatten(0, 2) # [Np, 1, v, d] -> [Np*v, d]
1371
+ n_vis_token = patch_vis_tokens.shape[0]
1372
+ patch_attn = torch.ones((max_n_vis_token), dtype=torch.long, device=patch_vis_tokens.device)
1373
+ if n_vis_token < max_n_vis_token:
1374
+ padded_vis_token = torch.zeros((max_n_vis_token, patch_vis_tokens.shape[-1]),
1375
+ dtype=patch_vis_tokens.dtype, device=patch_vis_tokens.device)
1376
+ padded_vis_token[:n_vis_token, :] = patch_vis_tokens
1377
+ patch_attn[n_vis_token:] = 0
1378
+ else:
1379
+ padded_vis_token = patch_vis_tokens
1380
+ padded_vision_tokens.append(padded_vis_token)
1381
+ padded_attn_masks.append(patch_attn)
1382
+ vision_tokens = torch.stack(padded_vision_tokens, dim=0)
1383
+ vision_attention_mask = torch.stack(padded_attn_masks, dim=0)
1384
+ vision_tokens = vision_tokens[:, None, :, :]
1385
+ else:
1386
+ vision_tokens = None
1387
+
1388
+ # fuse the vision and language tokens
1389
+ new_inputs = self._prepare_inputs_for_forward(
1390
+ vision_tokens=vision_tokens,
1391
+ lang_x=lang_x,
1392
+ attention_mask=attention_mask,
1393
+ vision_attention_mask=vision_attention_mask,
1394
+ labels=labels,
1395
+ past_key_values=past_key_values,
1396
+ past_media_locations=past_media_locations,
1397
+ padding_side="right",
1398
+ past_vision_tokens=past_vision_tokens,
1399
+ )
1400
+ output = self.lang_model(
1401
+ **new_inputs,
1402
+ use_cache=use_cache,
1403
+ past_key_values=past_key_values,
1404
+ **kwargs,
1405
+ )
1406
+
1407
+ # postprocessing may be needed, e.g. to remove extra tokens from logits that were inserted into the language stream
1408
+ # or to add the past_vision_tokens and past_media_locations to the output
1409
+ output = self._postprocess_outputs_from_forward(
1410
+ output=output,
1411
+ lang_x=lang_x,
1412
+ vision_tokens=vision_tokens,
1413
+ use_cache=use_cache,
1414
+ past_vision_tokens=past_vision_tokens,
1415
+ past_media_locations=past_media_locations,
1416
+ )
1417
+
1418
+ # postforward hooks
1419
+ self._post_forward_hook()
1420
+ return output
1421
+
1422
+ def generate(
1423
+ self,
1424
+ vision_x: torch.Tensor,
1425
+ lang_x: torch.Tensor,
1426
+ image_size: Optional[Tuple] = None,
1427
+ attention_mask: torch.Tensor = None,
1428
+ past_key_values: Optional[
1429
+ List[Union[torch.Tensor, Tuple[torch.Tensor]]]
1430
+ ] = None,
1431
+ past_media_locations: Optional[torch.Tensor] = None,
1432
+ past_vision_tokens: Optional[torch.Tensor] = None,
1433
+ **kwargs,
1434
+ ):
1435
+ """
1436
+ Generate text conditioned on vision and language inputs.
1437
+ Args:
1438
+ vision_x (torch.Tensor): Vision input
1439
+ shape (B, T_img, F, C, H, W)
1440
+ see documentation for forward
1441
+ lang_x (torch.Tensor): Language input
1442
+ shape (B, T_txt)
1443
+ attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
1444
+ **kwargs: see generate documentation in Hugging Face CausalLM models.
1445
+ Returns:
1446
+ torch.Tensor: lang_x with generated tokens appended to it
1447
+ """
1448
+ num_beams = kwargs.pop("num_beams", 1)
1449
+
1450
+ # convert pixels to vision tokens
1451
+ vision_attention_mask = None
1452
+ if vision_x is not None:
1453
+ if self.image_aspect_ratio == 'anyres':
1454
+ input_dict = dict(image=vision_x, image_size=image_size)
1455
+ vision_features, vision_attn_masks = self._encode_vision_x_anyres(input_dict, lang_x.device)
1456
+ else:
1457
+ vision_features = self._encode_vision_x(vision_x=vision_x)
1458
+ vision_attn_masks = None
1459
+ if self.anyres_patch_sampling:
1460
+ split_sizes = [feature.shape[0] for feature in vision_features]
1461
+ vision_features = torch.cat(vision_features, dim=0)
1462
+ vision_features = vision_features[:, None, None, :, :] # Expand dimensions.
1463
+ vision_attn_masks = torch.cat(vision_attn_masks, dim=0)
1464
+ # Prepare text embeds for instruction-aware image query sampling.
1465
+ lang_x_truncated = lang_x[:, :256]
1466
+ text_embeds = self.lang_model.get_input_embeddings()(lang_x_truncated) # FIXME: check function calling.
1467
+ # Repeat text_embeds to match the number of patches for each image patch group.
1468
+ if self.anyres_patch_sampling:
1469
+ repeated_text_embeds = []
1470
+ for i, np in enumerate(split_sizes):
1471
+ repeated_text_embeds.append(text_embeds[i].repeat(np, 1, 1))
1472
+ text_embeds = torch.cat(repeated_text_embeds, dim=0)
1473
+ vision_tokens = self.vision_tokenizer(vision_features, text_embeds)
1474
+
1475
+ # Post-processing: Split the batches into groups of patches and concatenate them together.
1476
+ if self.anyres_patch_sampling:
1477
+ vision_token_groups = torch.split(vision_tokens, split_sizes, dim=0)
1478
+ max_n_vis_token = max([vis.shape[0]*vis.shape[-2] for vis in vision_token_groups])
1479
+ # Padding.
1480
+ padded_vision_tokens = []
1481
+ padded_attn_masks = []
1482
+ for patch_vis_tokens in vision_token_groups:
1483
+ patch_vis_tokens = patch_vis_tokens.flatten(0, 2) # [Np, 1, v, d] -> [Np*v, d]
1484
+ n_vis_token = patch_vis_tokens.shape[0]
1485
+ patch_attn = torch.ones((max_n_vis_token), dtype=torch.long, device=patch_vis_tokens.device)
1486
+ if n_vis_token < max_n_vis_token:
1487
+ padded_vis_token = torch.zeros((max_n_vis_token, patch_vis_tokens.shape[-1]),
1488
+ dtype=patch_vis_tokens.dtype, device=patch_vis_tokens.device)
1489
+ padded_vis_token[:n_vis_token, :] = patch_vis_tokens
1490
+ patch_attn[n_vis_token:] = 0
1491
+ else:
1492
+ padded_vis_token = patch_vis_tokens
1493
+ padded_vision_tokens.append(padded_vis_token)
1494
+ padded_attn_masks.append(patch_attn)
1495
+ vision_tokens = torch.stack(padded_vision_tokens, dim=0)
1496
+ vision_attention_mask = torch.stack(padded_attn_masks, dim=0)
1497
+ vision_tokens = vision_tokens[:, None, :, :]
1498
+ else:
1499
+ vision_tokens = None
1500
+
1501
+ # fuse the vision and language tokens
1502
+ # for xattn, vision_x and media_location are repeat_interleaved s.t.
1503
+ # the total batch size is B * num_beams
1504
+ new_inputs = self._prepare_inputs_for_forward(
1505
+ vision_tokens=vision_tokens,
1506
+ lang_x=lang_x,
1507
+ attention_mask=attention_mask,
1508
+ vision_attention_mask=vision_attention_mask,
1509
+ past_key_values=past_key_values,
1510
+ past_media_locations=past_media_locations,
1511
+ past_vision_tokens=past_vision_tokens,
1512
+ padding_side="left",
1513
+ num_beams=num_beams,
1514
+ )
1515
+ if transformers.__version__ == '4.41.0.dev0':
1516
+ output = self.lang_model.generate(
1517
+ **new_inputs,
1518
+ num_beams=num_beams,
1519
+ use_cache=True,
1520
+ **kwargs,
1521
+ )
1522
+ else:
1523
+ output = self.lang_model.generate(
1524
+ **new_inputs,
1525
+ past_key_values=past_key_values,
1526
+ num_beams=num_beams,
1527
+ use_cache=True,
1528
+ **kwargs,
1529
+ )
1530
+ self._post_forward_hook()
1531
+ return output
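+ 
+ 
+ # End-to-end sketch (component construction is hypothetical here; the actual wiring lives in factory.py):
+ #   model = KosmosInstruct(vision_encoder=clip_vision_model, vision_tokenizer=resampler,
+ #                          lang_model=causal_lm, initial_tokenizer_len=len(tokenizer),
+ #                          pad_token_id=tokenizer.pad_token_id,
+ #                          image_aspect_ratio='anyres', anyres_patch_sampling=True)
+ #   model.set_special_token_ids({s: tokenizer.convert_tokens_to_ids(s) for s in model.special_tokens.values()})
+ #   out_ids = model.generate(vision_x=pixels, lang_x=input_ids, attention_mask=attention_mask,
+ #                            image_size=image_sizes, max_new_tokens=64)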