diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..55648570cc429c74f906ad65462196a8d4a439e3 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,29 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +assets/Snake_cn_Mushroom_en.gif filter=lfs diff=lfs merge=lfs -text +assets/demo_video.mp4 filter=lfs diff=lfs merge=lfs -text +assets/gif_cases/1-4.gif filter=lfs diff=lfs merge=lfs -text +assets/gif_cases/Mushroom_en_Snake_cn.gif filter=lfs diff=lfs merge=lfs -text +assets/gif_cases/Snake_en.gif filter=lfs diff=lfs merge=lfs -text +assets/gif_cases/english_menu.gif filter=lfs diff=lfs merge=lfs -text +assets/gif_cases/hong_kong_street.gif filter=lfs diff=lfs merge=lfs -text +assets/gif_cases/london_car.gif filter=lfs diff=lfs merge=lfs -text +assets/gif_cases/meal_plan.gif filter=lfs diff=lfs merge=lfs -text +assets/gif_cases/station.gif filter=lfs diff=lfs merge=lfs -text +assets/gif_cases/ticket.gif filter=lfs diff=lfs merge=lfs -text +assets/gif_cases/蘑菇_cn.gif filter=lfs diff=lfs merge=lfs -text +assets/gif_cases/蛇_cn.gif filter=lfs diff=lfs merge=lfs -text +assets/minicpmv-llama3-v2.5/case_OCR_en.png filter=lfs diff=lfs merge=lfs -text +assets/minicpmv-llama3-v2.5/case_complex_reasoning.png filter=lfs diff=lfs merge=lfs -text +assets/minicpmv-llama3-v2.5/case_information_extraction.png filter=lfs diff=lfs merge=lfs -text +assets/minicpmv-llama3-v2.5/case_long_img.png filter=lfs diff=lfs merge=lfs -text +assets/minicpmv-llama3-v2.5/case_markdown.png filter=lfs diff=lfs merge=lfs -text +assets/minicpmv-llama3-v2.5/cases_all.png filter=lfs diff=lfs merge=lfs -text +assets/minicpmv2-cases.png filter=lfs diff=lfs merge=lfs -text +assets/minicpmv2-cases_1.png filter=lfs diff=lfs merge=lfs -text +assets/minicpmv2-cases_2.png filter=lfs diff=lfs merge=lfs -text +assets/omnilmm-12b-examples.png filter=lfs diff=lfs merge=lfs -text +assets/omnilmm-12b-examples_2.png filter=lfs diff=lfs merge=lfs -text +assets/omnilmm-12b-examples_2_00.jpg filter=lfs diff=lfs merge=lfs -text +assets/omnilmm-12b-examples_3.png filter=lfs diff=lfs merge=lfs -text diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f547b1324030e2509df565ec2c0e0e779b696d3b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -0,0 +1,89 @@ +name: 🐞 Bug +description: 提交错误报告 | File a bug/issue +title: "[BUG] " +labels: [] + +body: + - type: checkboxes + attributes: + label: 是否已有关于该错误的issue或讨论? | Is there an existing issue / discussion for this? + description: | + 请先搜索您遇到的错误是否在已有的issues或讨论中提到过。 + Please search to see if an issue / discussion already exists for the bug you encountered. + [Issues](https://github.com/OpenBMB/MiniCPM-V/issues) + [Discussions](https://github.com/OpenBMB/MiniCPM-V/discussions) + options: + - label: 我已经搜索过已有的issues和讨论 | I have searched the existing issues / discussions + required: true + - type: checkboxes + attributes: + label: 该问题是否在FAQ中有解答? | Is there an existing answer for this in FAQ? + description: | + 请先搜索您遇到的错误是否已在FAQ中有相关解答。 + Please search to see if an answer already exists in FAQ for the bug you encountered. 
+ [FAQ-en](https://github.com/OpenBMB/MiniCPM-V/blob/main/FAQ.md) + [FAQ-zh](https://github.com/OpenBMB/MiniCPM-V/blob/main/FAQ_zh.md) + options: + - label: 我已经搜索过FAQ | I have searched FAQ + required: true + - type: textarea + attributes: + label: 当前行为 | Current Behavior + description: | + 准确描述遇到的行为。 + A concise description of what you're experiencing. + validations: + required: false + - type: textarea + attributes: + label: 期望行为 | Expected Behavior + description: | + 准确描述预期的行为。 + A concise description of what you expected to happen. + validations: + required: false + - type: textarea + attributes: + label: 复现方法 | Steps To Reproduce + description: | + 复现当前行为的详细步骤。 + Steps to reproduce the behavior. + placeholder: | + 1. In this environment... + 2. With this config... + 3. Run '...' + 4. See error... + validations: + required: false + - type: textarea + attributes: + label: 运行环境 | Environment + description: | + examples: + - **OS**: Ubuntu 20.04 + - **Python**: 3.8 + - **Transformers**: 4.31.0 + - **PyTorch**: 2.0.1 + - **CUDA**: 11.4 + value: | + - OS: + - Python: + - Transformers: + - PyTorch: + - CUDA (`python -c 'import torch; print(torch.version.cuda)'`): + render: Markdown + validations: + required: false + - type: textarea + attributes: + label: 备注 | Anything else? + description: | + 您可以在这里补充其他关于该问题背景信息的描述、链接或引用等。 + + 您可以通过点击高亮此区域然后拖动文件的方式上传图片或日志文件。 + + Links? References? Anything that will give us more context about the issue you are encountering! + + Tip: You can attach images or log files by clicking this area to highlight it and then dragging files in. + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/config.yaml b/.github/ISSUE_TEMPLATE/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0086358db1eb971c0cfa8739c27518bbc18a5ff4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yaml @@ -0,0 +1 @@ +blank_issues_enabled: true diff --git a/.github/ISSUE_TEMPLATE/feature_request.yaml b/.github/ISSUE_TEMPLATE/feature_request.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e677af83ac00162afab9318e10e5fc1b9c229fd6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yaml @@ -0,0 +1,78 @@ +name: "💡 Feature Request" +description: 创建新功能请求 | Create a new ticket for a new feature request +title: "💡 [REQUEST] - <title>" +labels: [ + "question" +] +body: + - type: input + id: start_date + attributes: + label: "起始日期 | Start Date" + description: | + 起始开发日期 + Start of development + placeholder: "month/day/year" + validations: + required: false + - type: textarea + id: implementation_pr + attributes: + label: "实现PR | Implementation PR" + description: | + 实现该功能的Pull request + Pull request used + placeholder: "#Pull Request ID" + validations: + required: false + - type: textarea + id: reference_issues + attributes: + label: "相关Issues | Reference Issues" + description: | + 与该功能相关的issues + Common issues + placeholder: "#Issues IDs" + validations: + required: false + - type: textarea + id: summary + attributes: + label: "摘要 | Summary" + description: | + 简要描述新功能的特点 + Provide a brief explanation of the feature + placeholder: | + Describe in a few lines your feature request + validations: + required: true + - type: textarea + id: basic_example + attributes: + label: "基本示例 | Basic Example" + description: Indicate here some basic examples of your feature. + placeholder: A few specific words about your feature request. 
+ validations: + required: true + - type: textarea + id: drawbacks + attributes: + label: "缺陷 | Drawbacks" + description: | + 该新功能有哪些缺陷/可能造成哪些影响? + What are the drawbacks/impacts of your feature request ? + placeholder: | + Identify the drawbacks and impacts while being neutral on your feature request + validations: + required: true + - type: textarea + id: unresolved_question + attributes: + label: "未解决问题 | Unresolved questions" + description: | + 有哪些尚未解决的问题? + What questions still remain unresolved ? + placeholder: | + Identify any unresolved issues. + validations: + required: false \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..988f840d44e6a358f3fe196ec8006bcef49217dc --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.bk +__pycache__ +.DS_Store diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..2756a35237431c301015ac7c326cc0120dd4965d --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2024 OpenBMB + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 9dbbce39f84cc1d7da668e72efb246b42da3688d..3a20b78f64bf2a2716abbf16c2a86356b2ad0fff 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,718 @@ ---- -title: Github Repo Test -emoji: 📚 -colorFrom: yellow -colorTo: purple -sdk: gradio -sdk_version: 4.32.2 -app_file: app.py -pinned: false ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +<div align="center"> + +<img src="./assets/minicpmv.png" width="300em" ></img> + +**A GPT-4V Level Multimodal LLM on Your Phone** + + <strong>[中文](./README_zh.md) | + English</strong> + +<p align="center"> + MiniCPM-Llama3-V 2.5 <a href="https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5/">🤗</a> <a href="https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5">🤖</a> | + MiniCPM-V 2.0 <a href="https://huggingface.co/openbmb/MiniCPM-V-2/">🤗</a> <a href="https://huggingface.co/spaces/openbmb/MiniCPM-V-2">🤖</a> | + <a href="https://openbmb.vercel.app/minicpm-v-2-en"> Technical Blog </a> +</p> + +</div> + + +**MiniCPM-V** is a series of end-side multimodal LLMs (MLLMs) designed for vision-language understanding. The models take image and text as inputs and provide high-quality text outputs. Since February 2024, we have released 4 versions of the model, aiming to achieve **strong performance and efficient deployment**. The most notable models in this series currently include: + +- **MiniCPM-Llama3-V 2.5**: 🔥🔥🔥 The latest and most capable model in the MiniCPM-V series. With a total of 8B parameters, the model **surpasses proprietary models such as GPT-4V-1106, Gemini Pro, Qwen-VL-Max and Claude 3** in overall performance. Equipped with the enhanced OCR and instruction-following capability, the model can also support multimodal conversation for **over 30 languages** including English, Chinese, French, Spanish, German etc. With help of quantization, compilation optimizations, and several efficient inference techniques on CPUs and NPUs, MiniCPM-Llama3-V 2.5 can be **efficiently deployed on end-side devices**. + +- **MiniCPM-V 2.0**: The lightest model in the MiniCPM-V series. With 2B parameters, it surpasses larger models such as Yi-VL 34B, CogVLM-Chat 17B, and Qwen-VL-Chat 10B in overall performance. 
It can accept image inputs of any aspect ratio and up to 1.8 million pixels (e.g., 1344x1344), achieving comparable performance with Gemini Pro in understanding scene-text and matching GPT-4V in low hallucination rates. + + +## News <!-- omit in toc --> + +#### 📌 Pinned + +* [2024.05.28] 🚀🚀🚀 MiniCPM-Llama3-V 2.5 is now fully supported in [llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-v2.5/examples/minicpmv/README.md) and [ollama](https://github.com/OpenBMB/ollama/tree/minicpm-v2.5)! Please pull the latest code for llama.cpp & ollama. We also release GGUF models in various sizes [here](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf/tree/main). An FAQ list for ollama usage is coming within a day. Please stay tuned! +* [2024.05.28] 💫 We now support LoRA fine-tuning for MiniCPM-Llama3-V 2.5, using only 2 V100 GPUs! See more statistics [here](https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune#model-fine-tuning-memory-usage-statistics). +* [2024.05.23] 🔍 We've released a comprehensive comparison between Phi-3-vision-128k-instruct and MiniCPM-Llama3-V 2.5, including benchmark evaluations, multilingual capabilities, and inference efficiency 🌟📊🌍🚀. Click [here](./docs/compare_with_phi-3_vision.md) to view more details. +* [2024.05.23] 🔥🔥🔥 MiniCPM-V tops GitHub Trending and Hugging Face Trending! Our demo, recommended by Hugging Face Gradio’s official account, is available [here](https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5). Come and try it out! + +<br> + + +* [2024.05.25] MiniCPM-Llama3-V 2.5 now supports streaming outputs and customized system prompts. Try it [here](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5#usage)! +* [2024.05.24] We release the MiniCPM-Llama3-V 2.5 [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf), which supports [llama.cpp](#inference-with-llamacpp) inference and provides smooth decoding at 6~8 tokens/s on mobile phones. Try it now! +* [2024.05.20] We open-source MiniCPM-Llama3-V 2.5. It has improved OCR capability and supports 30+ languages, representing the first end-side MLLM to achieve GPT-4V-level performance! We provide [efficient inference](#deployment-on-mobile-phone) and [simple fine-tuning](./finetune/readme.md). Try it now! +* [2024.04.23] MiniCPM-V-2.0 supports vLLM now! Click [here](#vllm) to view more details. +* [2024.04.18] We created a HuggingFace Space to host the demo of MiniCPM-V 2.0 [here](https://huggingface.co/spaces/openbmb/MiniCPM-V-2)! +* [2024.04.17] MiniCPM-V-2.0 now supports deploying a [WebUI Demo](#webui-demo)! +* [2024.04.15] MiniCPM-V-2.0 now also supports [fine-tuning](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v-2最佳实践.md) with the SWIFT framework! +* [2024.04.12] We open-source MiniCPM-V 2.0, which achieves comparable performance with Gemini Pro in understanding scene text and outperforms the strong Qwen-VL-Chat 9.6B and Yi-VL 34B on <a href="https://rank.opencompass.org.cn/leaderboard-multimodal">OpenCompass</a>, a comprehensive evaluation over 11 popular benchmarks. Click <a href="https://openbmb.vercel.app/minicpm-v-2">here</a> to view the MiniCPM-V 2.0 technical blog. +* [2024.03.14] MiniCPM-V now supports [fine-tuning](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v最佳实践.md) with the SWIFT framework. Thanks to [Jintao](https://github.com/Jintao-Huang) for the contribution! +* [2024.03.01] MiniCPM-V can now be deployed on Mac!
+* [2024.02.01] We open-source MiniCPM-V and OmniLMM-12B, which support efficient end-side deployment and powerful multimodal capabilities correspondingly. + + +## Contents <!-- omit in toc --> + + +- [MiniCPM-Llama3-V 2.5](#minicpm-llama3-v-25) +- [MiniCPM-V 2.0](#minicpm-v-20) +- [Online Demo](#online-demo) +- [Install](#install) +- [Inference](#inference) + - [Model Zoo](#model-zoo) + - [Multi-turn Conversation](#multi-turn-conversation) + - [Inference on Mac](#inference-on-mac) + - [Deployment on Mobile Phone](#deployment-on-mobile-phone) + - [WebUI Demo](#webui-demo) + - [Inference with llama.cpp](#inference-with-llamacpp) + - [Inference with vLLM](#inference-with-vllm) +- [Fine-tuning](#fine-tuning) +- [TODO](#todo) +- [🌟 Star History](#-star-history) +- [Citation](#citation) + +## MiniCPM-Llama3-V 2.5 + +**MiniCPM-Llama3-V 2.5** is the latest model in the MiniCPM-V series. The model is built on SigLip-400M and Llama3-8B-Instruct with a total of 8B parameters. It exhibits a significant performance improvement over MiniCPM-V 2.0. Notable features of MiniCPM-Llama3-V 2.5 include: + +- 🔥 **Leading Performance.** + MiniCPM-Llama3-V 2.5 has achieved an average score of 65.1 on OpenCompass, a comprehensive evaluation over 11 popular benchmarks. **With only 8B parameters, it surpasses widely used proprietary models like GPT-4V-1106, Gemini Pro, Claude 3 and Qwen-VL-Max** and greatly outperforms other Llama 3-based MLLMs. + +- 💪 **Strong OCR Capabilities.** + MiniCPM-Llama3-V 2.5 can process images with any aspect ratio and up to 1.8 million pixels (e.g., 1344x1344), achieving a **700+ score on OCRBench, surpassing proprietary models such as GPT-4o, GPT-4V-0409, Qwen-VL-Max and Gemini Pro**. Based on recent user feedback, MiniCPM-Llama3-V 2.5 has now enhanced full-text OCR extraction, table-to-markdown conversion, and other high-utility capabilities, and has further strengthened its instruction-following and complex reasoning abilities, enhancing multimodal interaction experiences. + +- 🏆 **Trustworthy Behavior.** + Leveraging the latest [RLAIF-V](https://github.com/RLHF-V/RLAIF-V/) method (the newest technique in the [RLHF-V](https://github.com/RLHF-V) [CVPR'24] series), MiniCPM-Llama3-V 2.5 exhibits more trustworthy behavior. It achieves a **10.3%** hallucination rate on Object HalBench, lower than GPT-4V-1106 (13.6%), achieving the best-level performance within the open-source community. [Data released](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset). + +- 🌏 **Multilingual Support.** + Thanks to the strong multilingual capabilities of Llama 3 and the cross-lingual generalization technique from [VisCPM](https://github.com/OpenBMB/VisCPM), MiniCPM-Llama3-V 2.5 extends its bilingual (Chinese-English) multimodal capabilities to **over 30 languages including German, French, Spanish, Italian, Korean etc.** [All Supported Languages](./assets/minicpm-llama-v-2-5_languages.md). + +- 🚀 **Efficient Deployment.** + MiniCPM-Llama3-V 2.5 systematically employs **model quantization, CPU optimizations, NPU optimizations and compilation optimizations**, achieving high-efficiency deployment on end-side devices. For mobile phones with Qualcomm chips, we have integrated the NPU acceleration framework QNN into llama.cpp for the first time. After systematic optimization, MiniCPM-Llama3-V 2.5 has realized a **150x acceleration in end-side MLLM image encoding** and a **3x speedup in language decoding**. 
+ +- 💫 **Easy Usage.** +MiniCPM-Llama3-V 2.5 can be easily used in various ways: (1) [llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-v2.5/examples/minicpmv/README.md) and [ollama](https://github.com/OpenBMB/ollama/tree/minicpm-v2.5/examples/minicpm-v2.5) support for efficient CPU inference on local devices, (2) [GGUF](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) format quantized models in 16 sizes, (3) efficient [LoRA](https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune#lora-finetuning) fine-tuning with only 2 V100 GPUs, (4) [streaming output](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5#usage), (5) quick local WebUI demo setup with [Gradio](https://github.com/OpenBMB/MiniCPM-V/blob/main/web_demo_2.5.py) and [Streamlit](https://github.com/OpenBMB/MiniCPM-V/blob/main/web_demo_streamlit-2_5.py), and (6) interactive demos on [HuggingFace Spaces](https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5). + +### Evaluation <!-- omit in toc --> + +<div align="center"> + <img src=assets/MiniCPM-Llama3-V-2.5-peformance.png width=66% /> +</div> +<details> +<summary>Click to view results on TextVQA, DocVQA, OCRBench, OpenCompass, MME, MMBench, MMMU, MathVista, LLaVA Bench, RealWorld QA, Object HalBench. </summary> +<div align="center"> + +<table style="margin: 0px auto;"> + <thead> + <tr> + <th align="left">Model</th> + <th>Size</th> + <th>OCRBench</th> + <th>TextVQA val</th> + <th>DocVQA test</th> + <th>Open-Compass</th> + <th>MME</th> + <th>MMB test (en)</th> + <th>MMB test (cn)</th> + <th>MMMU val</th> + <th>Math-Vista</th> + <th>LLaVA Bench</th> + <th>RealWorld QA</th> + <th>Object HalBench</th> + </tr> + </thead> + <tbody align="center"> + <tr> + <td colspan="14" align="left"><strong>Proprietary</strong></td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Gemini Pro</td> + <td>-</td> + <td>680</td> + <td>74.6</td> + <td>88.1</td> + <td>62.9</td> + <td>2148.9</td> + <td>73.6</td> + <td>74.3</td> + <td>48.9</td> + <td>45.8</td> + <td>79.9</td> + <td>60.4</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">GPT-4V (2023.11.06)</td> + <td>-</td> + <td>645</td> + <td>78.0</td> + <td>88.4</td> + <td>63.5</td> + <td>1771.5</td> + <td>77.0</td> + <td>74.4</td> + <td>53.8</td> + <td>47.8</td> + <td>93.1</td> + <td>63.0</td> + <td>86.4</td> + </tr> + <tr> + <td colspan="14" align="left"><strong>Open-source</strong></td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Mini-Gemini</td> + <td>2.2B</td> + <td>-</td> + <td>56.2</td> + <td>34.2*</td> + <td>-</td> + <td>1653.0</td> + <td>-</td> + <td>-</td> + <td>31.7</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Qwen-VL-Chat</td> + <td>9.6B</td> + <td>488</td> + <td>61.5</td> + <td>62.6</td> + <td>51.6</td> + <td>1860.0</td> + <td>61.8</td> + <td>56.3</td> + <td>37.0</td> + <td>33.8</td> + <td>67.7</td> + <td>49.3</td> + <td>56.2</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">DeepSeek-VL-7B</td> + <td>7.3B</td> + <td>435</td> + <td>64.7*</td> + <td>47.0*</td> + <td>54.6</td> + <td>1765.4</td> + <td>73.8</td> + <td>71.4</td> + <td>38.3</td> + <td>36.8</td> + <td>77.8</td> + <td>54.2</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Yi-VL-34B</td> + <td>34B</td> + <td>290</td> + <td>43.4*</td> + <td>16.9*</td> + <td>52.2</td> + <td><strong>2050.2</strong></td> + <td>72.4</td> + <td>70.7</td> + <td>45.1</td> + <td>30.7</td> + <td>62.3</td> + <td>54.8</td> + <td>79.3</td> + </tr> + <tr> + <td nowrap="nowrap" 
align="left">CogVLM-Chat</td> + <td>17.4B</td> + <td>590</td> + <td>70.4</td> + <td>33.3*</td> + <td>54.2</td> + <td>1736.6</td> + <td>65.8</td> + <td>55.9</td> + <td>37.3</td> + <td>34.7</td> + <td>73.9</td> + <td>60.3</td> + <td>73.6</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">TextMonkey</td> + <td>9.7B</td> + <td>558</td> + <td>64.3</td> + <td>66.7</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Idefics2</td> + <td>8.0B</td> + <td>-</td> + <td>73.0</td> + <td>74.0</td> + <td>57.2</td> + <td>1847.6</td> + <td>75.7</td> + <td>68.6</td> + <td>45.2</td> + <td>52.2</td> + <td>49.1</td> + <td>60.7</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Bunny-LLama-3-8B</td> + <td>8.4B</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>54.3</td> + <td>1920.3</td> + <td>77.0</td> + <td>73.9</td> + <td>41.3</td> + <td>31.5</td> + <td>61.2</td> + <td>58.8</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">LLaVA-NeXT Llama-3-8B</td> + <td>8.4B</td> + <td>-</td> + <td>-</td> + <td>78.2</td> + <td>-</td> + <td>1971.5</td> + <td>-</td> + <td>-</td> + <td>41.7</td> + <td>37.5</td> + <td>80.1</td> + <td>60.0</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Phi-3-vision-128k-instruct</td> + <td>4.2B</td> + <td>639*</td> + <td>70.9</td> + <td>-</td> + <td>-</td> + <td>1537.5*</td> + <td>-</td> + <td>-</td> + <td>40.4</td> + <td>44.5</td> + <td>64.2*</td> + <td>58.8*</td> + <td>-</td> + </tr> + <tr style="background-color: #e6f2ff;"> + <td nowrap="nowrap" align="left">MiniCPM-V 1.0</td> + <td>2.8B</td> + <td>366</td> + <td>60.6</td> + <td>38.2</td> + <td>47.5</td> + <td>1650.2</td> + <td>64.1</td> + <td>62.6</td> + <td>38.3</td> + <td>28.9</td> + <td>51.3</td> + <td>51.2</td> + <td>78.4</td> + </tr> + <tr style="background-color: #e6f2ff;"> + <td nowrap="nowrap" align="left">MiniCPM-V 2.0</td> + <td>2.8B</td> + <td>605</td> + <td>74.1</td> + <td>71.9</td> + <td>54.5</td> + <td>1808.6</td> + <td>69.1</td> + <td>66.5</td> + <td>38.2</td> + <td>38.7</td> + <td>69.2</td> + <td>55.8</td> + <td>85.5</td> + </tr> + <tr style="background-color: #e6f2ff;"> + <td nowrap="nowrap" align="left">MiniCPM-Llama3-V 2.5</td> + <td>8.5B</td> + <td><strong>725</strong></td> + <td><strong>76.6</strong></td> + <td><strong>84.8</strong></td> + <td><strong>65.1</strong></td> + <td>2024.6</td> + <td><strong>77.2</strong></td> + <td><strong>74.2</strong></td> + <td><strong>45.8</strong></td> + <td><strong>54.3</strong></td> + <td><strong>86.7</strong></td> + <td><strong>63.5</strong></td> + <td><strong>89.7</strong></td> + </tr> + </tbody> +</table> + + +</div> +* We evaluate the officially released checkpoint by ourselves. + +</details> + +<div align="center"> + <img src="assets/llavabench_compare_3.png" width="100%" /> + <br> + Evaluation results of multilingual LLaVA Bench +</div> + +### Examples <!-- omit in toc --> + +<table align="center" > + <p align="center" > + <img src="assets/minicpmv-llama3-v2.5/cases_all.png" /> + </p> +</table> + +We deploy MiniCPM-Llama3-V 2.5 on end devices. The demo video is the raw screen recording on a Xiaomi 14 Pro without edition. 
+ +<table align="center"> + <p align="center"> + <img src="assets/gif_cases/ticket.gif" width=32%/> + <img src="assets/gif_cases/meal_plan.gif" width=32%/> + </p> +</table> + +<table align="center"> + <p align="center"> + <img src="assets/gif_cases/1-4.gif" width=64%/> + </p> +</table> + +## MiniCPM-V 2.0 + +<details> +<summary>Click to view more details of MiniCPM-V 2.0</summary> + + +**MiniCPM-V 2.0** is an efficient version with promising performance for deployment. The model is built on SigLip-400M and [MiniCPM-2.4B](https://github.com/OpenBMB/MiniCPM/), connected by a perceiver resampler. Our latest version, MiniCPM-V 2.0, has several notable features. + +- 🔥 **State-of-the-art Performance.** + + MiniCPM-V 2.0 achieves **state-of-the-art performance** on multiple benchmarks (including OCRBench, TextVQA, MME, MMB, MathVista, etc.) among models under 7B parameters. It even **outperforms strong Qwen-VL-Chat 9.6B, CogVLM-Chat 17.4B, and Yi-VL 34B on OpenCompass, a comprehensive evaluation over 11 popular benchmarks**. Notably, MiniCPM-V 2.0 shows **strong OCR capability**, achieving **comparable performance to Gemini Pro in scene-text understanding**, and **state-of-the-art performance on OCRBench** among open-source models. + +- 🏆 **Trustworthy Behavior.** + + LMMs are known to suffer from hallucination, often generating text that is not factually grounded in images. MiniCPM-V 2.0 is **the first end-side LMM aligned via multimodal RLHF for trustworthy behavior** (using the recent [RLHF-V](https://rlhf-v.github.io/) [CVPR'24] series technique). This allows the model to **match GPT-4V in preventing hallucinations** on Object HalBench. + +- 🌟 **High-Resolution Images at Any Aspect Ratio.** + + MiniCPM-V 2.0 can accept **1.8 million pixels (e.g., 1344x1344) images at any aspect ratio**. This enables better perception of fine-grained visual information such as small objects and optical characters, which is achieved via a recent technique from [LLaVA-UHD](https://arxiv.org/pdf/2403.11703.pdf). + +- ⚡️ **High Efficiency.** + + MiniCPM-V 2.0 can be **efficiently deployed on most GPU cards and personal computers**, and **even on end devices such as mobile phones**. For visual encoding, we compress the image representations into much fewer tokens via a perceiver resampler. This allows MiniCPM-V 2.0 to operate with **favorable memory cost and speed during inference even when dealing with high-resolution images**. + +- 🙌 **Bilingual Support.** + + MiniCPM-V 2.0 **supports strong bilingual multimodal capabilities in both English and Chinese**. This is enabled by generalizing multimodal capabilities across languages, a technique from [VisCPM](https://arxiv.org/abs/2308.12038) [ICLR'24]. + +### Examples <!-- omit in toc --> + +<table align="center"> + <p align="center"> + <img src="assets/minicpmv2-cases_2.png" width=95%/> + </p> +</table> + +We deploy MiniCPM-V 2.0 on end devices. The demo video is a raw screen recording on a Xiaomi 14 Pro, without any editing.
+ +<table align="center"> + <p align="center"> + <img src="assets/gif_cases/station.gif" width=36%/> + <img src="assets/gif_cases/london_car.gif" width=36%/> + </p> +</table> + +</details> + +## Legacy Models <!-- omit in toc --> + +| Model | Introduction and Guidance | +|:----------------------|:-------------------:| +| MiniCPM-V 1.0 | [Document](./minicpm_v1.md) | +| OmniLMM-12B | [Document](./omnilmm_en.md) | + + + +## Online Demo +Try out the online demos of [MiniCPM-Llama3-V 2.5](https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5) | [MiniCPM-V 2.0](https://huggingface.co/spaces/openbmb/MiniCPM-V-2). + +## Install + +1. Clone this repository and navigate to the source folder + +```bash +git clone https://github.com/OpenBMB/MiniCPM-V.git +cd MiniCPM-V +``` + +2. Create a conda environment + +```Shell +conda create -n MiniCPM-V python=3.10 -y +conda activate MiniCPM-V +``` + +3. Install dependencies + +```shell +pip install -r requirements.txt +``` + +## Inference + + +### Model Zoo + +| Model | Device | Memory |          Description | Download | +|:-----------|:--:|:-----------:|:-------------------|:---------------:| +| MiniCPM-Llama3-V 2.5 | GPU | 19 GB | The latest version, achieving state-of-the-art end-side multimodal performance. | [🤗](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5/)    [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5) | +| MiniCPM-Llama3-V 2.5 gguf | CPU | 5 GB | The gguf version, with lower memory usage and faster inference. | [🤗](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf)   [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5-gguf) | +| MiniCPM-Llama3-V 2.5 int4 | GPU | 8 GB | The int4 quantized version, with lower GPU memory usage. | [🤗](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-int4/)    [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5-int4) | +| MiniCPM-V 2.0 | GPU | 8 GB | Light version, balancing performance and computation cost. | [🤗](https://huggingface.co/openbmb/MiniCPM-V-2)    [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2) | +| MiniCPM-V 1.0 | GPU | 7 GB | Lightest version, achieving the fastest inference. | [🤗](https://huggingface.co/openbmb/MiniCPM-V)    [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-V) | + +### Multi-turn Conversation + +Please refer to the following code to run a multi-turn conversation.
+ +<div align="center"> +<img src="assets/airplane.jpeg" width="500px"> +</div> + + +```python +from chat import MiniCPMVChat, img2base64 +import torch +import json + +torch.manual_seed(0) + +chat_model = MiniCPMVChat('openbmb/MiniCPM-Llama3-V-2_5') + +im_64 = img2base64('./assets/airplane.jpeg') + +# First round chat +msgs = [{"role": "user", "content": "Tell me the model of this aircraft."}] + +inputs = {"image": im_64, "question": json.dumps(msgs)} +answer = chat_model.chat(inputs) +print(answer) + +# Second round chat +# pass history context of multi-turn conversation +msgs.append({"role": "assistant", "content": answer}) +msgs.append({"role": "user", "content": "Introduce something about Airbus A380."}) + +inputs = {"image": im_64, "question": json.dumps(msgs)} +answer = chat_model.chat(inputs) +print(answer) +``` + +You will get the following output: + +``` +"The aircraft in the image is an Airbus A380, which can be identified by its large size, double-deck structure, and the distinctive shape of its wings and engines. The A380 is a wide-body aircraft known for being the world's largest passenger airliner, designed for long-haul flights. It has four engines, which are characteristic of large commercial aircraft. The registration number on the aircraft can also provide specific information about the model if looked up in an aviation database." + +"The Airbus A380 is a double-deck, wide-body, four-engine jet airliner made by Airbus. It is the world's largest passenger airliner and is known for its long-haul capabilities. The aircraft was developed to improve efficiency and comfort for passengers traveling over long distances. It has two full-length passenger decks, which can accommodate more passengers than a typical single-aisle airplane. The A380 has been operated by airlines such as Lufthansa, Singapore Airlines, and Emirates, among others. It is widely recognized for its unique design and significant impact on the aviation industry." +``` + + + +### Inference on Mac +<details> +<summary>Click to view an example, to run MiniCPM-Llama3-V 2.5 on 💻 Mac with MPS (Apple silicon or AMD GPUs). </summary> + +```python +# test.py Need more than 16GB memory. +import torch +from PIL import Image +from transformers import AutoModel, AutoTokenizer + +model = AutoModel.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True, low_cpu_mem_usage=True) +model = model.to(device='mps') + +tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True) +model.eval() + +image = Image.open('./assets/hk_OCR.jpg').convert('RGB') +question = 'Where is this photo taken?' +msgs = [{'role': 'user', 'content': question}] + +answer, context, _ = model.chat( + image=image, + msgs=msgs, + context=None, + tokenizer=tokenizer, + sampling=True +) +print(answer) +``` +Run with command: +```shell +PYTORCH_ENABLE_MPS_FALLBACK=1 python test.py +``` +</details> + +### Deployment on Mobile Phone +MiniCPM-V 2.0 can be deployed on mobile phones with Android operating systems. 🚀 Click [here](https://github.com/OpenBMB/mlc-MiniCPM) to install apk. MiniCPM-Llama3-V 2.5 coming soon. 
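+
+### Streaming Output and System Prompts <!-- omit in toc -->
+
+As noted in the News section, MiniCPM-Llama3-V 2.5 supports streaming outputs and customized system prompts through the Hugging Face `chat` interface. The snippet below is a minimal sketch based on the usage documented on the [model card](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5#usage); the `stream` and `system_prompt` arguments follow that documentation and may differ between model revisions. The int4 checkpoint from the Model Zoo can be substituted to reduce GPU memory; see its model card for the corresponding loading arguments.
+
+```python
+# Minimal sketch: streaming generation with a custom system prompt.
+# Assumes the `chat` interface documented on the Hugging Face model card;
+# argument names may vary across model revisions.
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoTokenizer
+
+model = AutoModel.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5',
+                                  trust_remote_code=True, torch_dtype=torch.float16)
+model = model.to(device='cuda')
+tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True)
+model.eval()
+
+image = Image.open('./assets/airplane.jpeg').convert('RGB')
+msgs = [{'role': 'user', 'content': 'Describe this image in detail.'}]
+
+# With stream=True, chat() returns a generator of text chunks;
+# streaming requires sampling=True.
+res = model.chat(
+    image=image,
+    msgs=msgs,
+    tokenizer=tokenizer,
+    sampling=True,
+    temperature=0.7,
+    stream=True,
+    system_prompt='You are a helpful assistant. Answer concisely.'
+)
+
+generated_text = ''
+for new_text in res:
+    generated_text += new_text
+    print(new_text, end='', flush=True)
+```
+
+Without `stream=True`, the same call returns the complete answer as a single string.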
+ +### WebUI Demo + +<details> +<summary>Click to see how to deploy the WebUI demo on different devices </summary> + +```shell +pip install -r requirements.txt +``` + +```shell +# For NVIDIA GPUs, run: +python web_demo_2.5.py --device cuda + +# For Mac with MPS (Apple silicon or AMD GPUs), run: +PYTORCH_ENABLE_MPS_FALLBACK=1 python web_demo_2.5.py --device mps +``` +</details> + +### Inference with llama.cpp<a id="inference-with-llamacpp"></a> +MiniCPM-Llama3-V 2.5 can now run with llama.cpp! See our fork of [llama.cpp](https://github.com/OpenBMB/llama.cpp/tree/minicpm-v2.5/examples/minicpmv) for more details. This implementation supports smooth inference at 6~8 tokens/s on mobile phones (test environment: Xiaomi 14 Pro + Snapdragon 8 Gen 3). + +### Inference with vLLM<a id="vllm"></a> + +<details> +<summary>Click to see how to run inference with vLLM </summary> +Because our pull request to vLLM is still under review, we forked the repository to build and test our vLLM demo. Here are the steps: + +1. Clone our version of vLLM: +```shell +git clone https://github.com/OpenBMB/vllm.git +``` +2. Install vLLM: +```shell +cd vllm +pip install -e . +``` +3. Install timm: +```shell +pip install timm==0.9.10 +``` +4. Run our demo: +```shell +python examples/minicpmv_example.py +``` +</details> + +## Fine-tuning + +### Simple Fine-tuning <!-- omit in toc --> + +We support simple fine-tuning with Hugging Face for MiniCPM-V 2.0 and MiniCPM-Llama3-V 2.5. + +[Reference Document](./finetune/readme.md) + +### With the SWIFT Framework <!-- omit in toc --> + +We now support MiniCPM-V series fine-tuning with the SWIFT framework. SWIFT supports the training, inference, evaluation and deployment of nearly 200 LLMs and MLLMs. It supports the lightweight training solutions provided by PEFT and a complete adapter library, including techniques such as NEFTune, LoRA+ and LLaMA-PRO. + +Best Practices: [MiniCPM-V 1.0](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v最佳实践.md), [MiniCPM-V 2.0](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v-2最佳实践.md) + + + +## TODO + +- [x] MiniCPM-V fine-tuning support +- [ ] Code release for real-time interactive assistant + +## Model License <!-- omit in toc --> + +The code in this repo is released under the [Apache-2.0](https://github.com/OpenBMB/MiniCPM/blob/main/LICENSE) license. + +The usage of MiniCPM-V's and OmniLMM's parameters is subject to the "[General Model License Agreement - Source Notes - Publicity Restrictions - Commercial License](https://github.com/OpenBMB/General-Model-License/blob/main/通用模型许可协议-来源说明-宣传限制-商业授权.md)". + +The parameters are fully open for academic research. + +Please contact cpm@modelbest.cn to obtain written authorization for commercial uses. Free commercial use is also allowed after registration. + +## Statement <!-- omit in toc --> + +As LMMs, MiniCPM-V models (including OmniLMM) generate content by learning from a large amount of multimodal corpora, but they cannot comprehend, express personal opinions or make value judgements. Anything generated by MiniCPM-V models does not represent the views and positions of the model developers. + +We will not be liable for any problems arising from the use of MiniCPM-V models, including but not limited to data security issues, risks of public opinion, or any risks and problems arising from the misdirection, misuse, dissemination or misapplication of the model.
+ + +## Institutions <!-- omit in toc --> + +This project is developed by the following institutions: + +- <img src="assets/thunlp.png" width="28px"> [THUNLP](https://nlp.csai.tsinghua.edu.cn/) +- <img src="assets/modelbest.png" width="28px"> [ModelBest](https://modelbest.cn/) +- <img src="assets/zhihu.webp" width="28px"> [Zhihu](https://www.zhihu.com/ ) + +## Other Multimodal Projects from Our Team <!-- omit in toc --> + +👏 Welcome to explore other multimodal projects of our team: + +[VisCPM](https://github.com/OpenBMB/VisCPM/tree/main) | [RLHF-V](https://github.com/RLHF-V/RLHF-V) | [LLaVA-UHD](https://github.com/thunlp/LLaVA-UHD) | [RLAIF-V](https://github.com/RLHF-V/RLAIF-V) + +## 🌟 Star History + +<div> +<img src="./assets/Star-History.png" width="500em" ></img> +</div> + +## Citation + +If you find our model/code/paper helpful, please consider cite our papers 📝 and star us ⭐️! + +```bib +@article{yu2023rlhf, + title={Rlhf-v: Towards trustworthy mllms via behavior alignment from fine-grained correctional human feedback}, + author={Yu, Tianyu and Yao, Yuan and Zhang, Haoye and He, Taiwen and Han, Yifeng and Cui, Ganqu and Hu, Jinyi and Liu, Zhiyuan and Zheng, Hai-Tao and Sun, Maosong and others}, + journal={arXiv preprint arXiv:2312.00849}, + year={2023} +} +@article{viscpm, + title={Large Multilingual Models Pivot Zero-Shot Multimodal Learning across Languages}, + author={Jinyi Hu and Yuan Yao and Chongyi Wang and Shan Wang and Yinxu Pan and Qianyu Chen and Tianyu Yu and Hanghao Wu and Yue Zhao and Haoye Zhang and Xu Han and Yankai Lin and Jiao Xue and Dahai Li and Zhiyuan Liu and Maosong Sun}, + journal={arXiv preprint arXiv:2308.12038}, + year={2023} +} +@article{xu2024llava-uhd, + title={{LLaVA-UHD}: an LMM Perceiving Any Aspect Ratio and High-Resolution Images}, + author={Xu, Ruyi and Yao, Yuan and Guo, Zonghao and Cui, Junbo and Ni, Zanlin and Ge, Chunjiang and Chua, Tat-Seng and Liu, Zhiyuan and Huang, Gao}, + journal={arXiv preprint arXiv:2403.11703}, + year={2024} +} +@article{yu2024rlaifv, + title={RLAIF-V: Aligning MLLMs through Open-Source AI Feedback for Super GPT-4V Trustworthiness}, + author={Yu, Tianyu and Zhang, Haoye and Yao, Yuan and Dang, Yunkai and Chen, Da and Lu, Xiaoman and Cui, Ganqu and He, Taiwen and Liu, Zhiyuan and Chua, Tat-Seng and Sun, Maosong}, + journal={arXiv preprint arXiv:2405.17220}, + year={2024} +} +``` diff --git a/README_en.md b/README_en.md new file mode 100644 index 0000000000000000000000000000000000000000..3a20b78f64bf2a2716abbf16c2a86356b2ad0fff --- /dev/null +++ b/README_en.md @@ -0,0 +1,718 @@ +<div align="center"> + +<img src="./assets/minicpmv.png" width="300em" ></img> + +**A GPT-4V Level Multimodal LLM on Your Phone** + + <strong>[中文](./README_zh.md) | + English</strong> + +<p align="center"> + MiniCPM-Llama3-V 2.5 <a href="https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5/">🤗</a> <a href="https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5">🤖</a> | + MiniCPM-V 2.0 <a href="https://huggingface.co/openbmb/MiniCPM-V-2/">🤗</a> <a href="https://huggingface.co/spaces/openbmb/MiniCPM-V-2">🤖</a> | + <a href="https://openbmb.vercel.app/minicpm-v-2-en"> Technical Blog </a> +</p> + +</div> + + +**MiniCPM-V** is a series of end-side multimodal LLMs (MLLMs) designed for vision-language understanding. The models take image and text as inputs and provide high-quality text outputs. Since February 2024, we have released 4 versions of the model, aiming to achieve **strong performance and efficient deployment**. 
The most notable models in this series currently include: + +- **MiniCPM-Llama3-V 2.5**: 🔥🔥🔥 The latest and most capable model in the MiniCPM-V series. With a total of 8B parameters, the model **surpasses proprietary models such as GPT-4V-1106, Gemini Pro, Qwen-VL-Max and Claude 3** in overall performance. Equipped with the enhanced OCR and instruction-following capability, the model can also support multimodal conversation for **over 30 languages** including English, Chinese, French, Spanish, German etc. With help of quantization, compilation optimizations, and several efficient inference techniques on CPUs and NPUs, MiniCPM-Llama3-V 2.5 can be **efficiently deployed on end-side devices**. + +- **MiniCPM-V 2.0**: The lightest model in the MiniCPM-V series. With 2B parameters, it surpasses larger models such as Yi-VL 34B, CogVLM-Chat 17B, and Qwen-VL-Chat 10B in overall performance. It can accept image inputs of any aspect ratio and up to 1.8 million pixels (e.g., 1344x1344), achieving comparable performance with Gemini Pro in understanding scene-text and matches GPT-4V in low hallucination rates. + + +## News <!-- omit in toc --> + +#### 📌 Pinned + +* [2024.05.28] 🚀🚀🚀 MiniCPM-Llama3-V 2.5 now fully supports its feature in [llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-v2.5/examples/minicpmv/README.md) and [ollama](https://github.com/OpenBMB/ollama/tree/minicpm-v2.5)! Please pull the latest code for llama.cpp & ollama. We also release GGUF in various sizes [here](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf/tree/main). FAQ list for ollama usage is comming within a day. Please stay tuned! +* [2024.05.28] 💫 We now support LoRA fine-tuning for MiniCPM-Llama3-V 2.5, using only 2 V100 GPUs! See more statistics [here](https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune#model-fine-tuning-memory-usage-statistics). +* [2024.05.23] 🔍 We've released a comprehensive comparison between Phi-3-vision-128k-instruct and MiniCPM-Llama3-V 2.5, including benchmarks evaluations, multilingual capabilities, and inference efficiency 🌟📊🌍🚀. Click [here](./docs/compare_with_phi-3_vision.md) to view more details. +* [2024.05.23] 🔥🔥🔥 MiniCPM-V tops GitHub Trending and Hugging Face Trending! Our demo, recommended by Hugging Face Gradio’s official account, is available [here](https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5). Come and try it out! + +<br> + + +* [2024.05.25] MiniCPM-Llama3-V 2.5 now supports streaming outputs and customized system prompts. Try it [here](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5#usage)! +* [2024.05.24] We release the MiniCPM-Llama3-V 2.5 [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf), which supports [llama.cpp](#inference-with-llamacpp) inference and provides a 6~8 token/s smooth decoding on mobile phones. Try it now! +* [2024.05.20] We open-soure MiniCPM-Llama3-V 2.5, it has improved OCR capability and supports 30+ languages, representing the first end-side MLLM achieving GPT-4V level performance! We provide [efficient inference](#deployment-on-mobile-phone) and [simple fine-tuning](./finetune/readme.md). Try it now! +* [2024.04.23] MiniCPM-V-2.0 supports vLLM now! Click [here](#vllm) to view more details. +* [2024.04.18] We create a HuggingFace Space to host the demo of MiniCPM-V 2.0 at [here](https://huggingface.co/spaces/openbmb/MiniCPM-V-2)! +* [2024.04.17] MiniCPM-V-2.0 supports deploying [WebUI Demo](#webui-demo) now! 
+* [2024.04.15] MiniCPM-V-2.0 now also supports [fine-tuning](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v-2最佳实践.md) with the SWIFT framework! +* [2024.04.12] We open-source MiniCPM-V 2.0, which achieves comparable performance with Gemini Pro in understanding scene text and outperforms strong Qwen-VL-Chat 9.6B and Yi-VL 34B on <a href="https://rank.opencompass.org.cn/leaderboard-multimodal">OpenCompass</a>, a comprehensive evaluation over 11 popular benchmarks. Click <a href="https://openbmb.vercel.app/minicpm-v-2">here</a> to view the MiniCPM-V 2.0 technical blog. +* [2024.03.14] MiniCPM-V now supports [fine-tuning](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v最佳实践.md) with the SWIFT framework. Thanks to [Jintao](https://github.com/Jintao-Huang) for the contribution! +* [2024.03.01] MiniCPM-V now can be deployed on Mac! +* [2024.02.01] We open-source MiniCPM-V and OmniLMM-12B, which support efficient end-side deployment and powerful multimodal capabilities correspondingly. + + +## Contents <!-- omit in toc --> + + +- [MiniCPM-Llama3-V 2.5](#minicpm-llama3-v-25) +- [MiniCPM-V 2.0](#minicpm-v-20) +- [Online Demo](#online-demo) +- [Install](#install) +- [Inference](#inference) + - [Model Zoo](#model-zoo) + - [Multi-turn Conversation](#multi-turn-conversation) + - [Inference on Mac](#inference-on-mac) + - [Deployment on Mobile Phone](#deployment-on-mobile-phone) + - [WebUI Demo](#webui-demo) + - [Inference with llama.cpp](#inference-with-llamacpp) + - [Inference with vLLM](#inference-with-vllm) +- [Fine-tuning](#fine-tuning) +- [TODO](#todo) +- [🌟 Star History](#-star-history) +- [Citation](#citation) + +## MiniCPM-Llama3-V 2.5 + +**MiniCPM-Llama3-V 2.5** is the latest model in the MiniCPM-V series. The model is built on SigLip-400M and Llama3-8B-Instruct with a total of 8B parameters. It exhibits a significant performance improvement over MiniCPM-V 2.0. Notable features of MiniCPM-Llama3-V 2.5 include: + +- 🔥 **Leading Performance.** + MiniCPM-Llama3-V 2.5 has achieved an average score of 65.1 on OpenCompass, a comprehensive evaluation over 11 popular benchmarks. **With only 8B parameters, it surpasses widely used proprietary models like GPT-4V-1106, Gemini Pro, Claude 3 and Qwen-VL-Max** and greatly outperforms other Llama 3-based MLLMs. + +- 💪 **Strong OCR Capabilities.** + MiniCPM-Llama3-V 2.5 can process images with any aspect ratio and up to 1.8 million pixels (e.g., 1344x1344), achieving a **700+ score on OCRBench, surpassing proprietary models such as GPT-4o, GPT-4V-0409, Qwen-VL-Max and Gemini Pro**. Based on recent user feedback, MiniCPM-Llama3-V 2.5 has now enhanced full-text OCR extraction, table-to-markdown conversion, and other high-utility capabilities, and has further strengthened its instruction-following and complex reasoning abilities, enhancing multimodal interaction experiences. + +- 🏆 **Trustworthy Behavior.** + Leveraging the latest [RLAIF-V](https://github.com/RLHF-V/RLAIF-V/) method (the newest technique in the [RLHF-V](https://github.com/RLHF-V) [CVPR'24] series), MiniCPM-Llama3-V 2.5 exhibits more trustworthy behavior. It achieves a **10.3%** hallucination rate on Object HalBench, lower than GPT-4V-1106 (13.6%), achieving the best-level performance within the open-source community. [Data released](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset). 
+ +- 🌏 **Multilingual Support.** + Thanks to the strong multilingual capabilities of Llama 3 and the cross-lingual generalization technique from [VisCPM](https://github.com/OpenBMB/VisCPM), MiniCPM-Llama3-V 2.5 extends its bilingual (Chinese-English) multimodal capabilities to **over 30 languages including German, French, Spanish, Italian, Korean etc.** [All Supported Languages](./assets/minicpm-llama-v-2-5_languages.md). + +- 🚀 **Efficient Deployment.** + MiniCPM-Llama3-V 2.5 systematically employs **model quantization, CPU optimizations, NPU optimizations and compilation optimizations**, achieving high-efficiency deployment on end-side devices. For mobile phones with Qualcomm chips, we have integrated the NPU acceleration framework QNN into llama.cpp for the first time. After systematic optimization, MiniCPM-Llama3-V 2.5 has realized a **150x acceleration in end-side MLLM image encoding** and a **3x speedup in language decoding**. + +- 💫 **Easy Usage.** +MiniCPM-Llama3-V 2.5 can be easily used in various ways: (1) [llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-v2.5/examples/minicpmv/README.md) and [ollama](https://github.com/OpenBMB/ollama/tree/minicpm-v2.5/examples/minicpm-v2.5) support for efficient CPU inference on local devices, (2) [GGUF](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) format quantized models in 16 sizes, (3) efficient [LoRA](https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune#lora-finetuning) fine-tuning with only 2 V100 GPUs, (4) [streaming output](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5#usage), (5) quick local WebUI demo setup with [Gradio](https://github.com/OpenBMB/MiniCPM-V/blob/main/web_demo_2.5.py) and [Streamlit](https://github.com/OpenBMB/MiniCPM-V/blob/main/web_demo_streamlit-2_5.py), and (6) interactive demos on [HuggingFace Spaces](https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5). + +### Evaluation <!-- omit in toc --> + +<div align="center"> + <img src=assets/MiniCPM-Llama3-V-2.5-peformance.png width=66% /> +</div> +<details> +<summary>Click to view results on TextVQA, DocVQA, OCRBench, OpenCompass, MME, MMBench, MMMU, MathVista, LLaVA Bench, RealWorld QA, Object HalBench. 
</summary> +<div align="center"> + +<table style="margin: 0px auto;"> + <thead> + <tr> + <th align="left">Model</th> + <th>Size</th> + <th>OCRBench</th> + <th>TextVQA val</th> + <th>DocVQA test</th> + <th>Open-Compass</th> + <th>MME</th> + <th>MMB test (en)</th> + <th>MMB test (cn)</th> + <th>MMMU val</th> + <th>Math-Vista</th> + <th>LLaVA Bench</th> + <th>RealWorld QA</th> + <th>Object HalBench</th> + </tr> + </thead> + <tbody align="center"> + <tr> + <td colspan="14" align="left"><strong>Proprietary</strong></td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Gemini Pro</td> + <td>-</td> + <td>680</td> + <td>74.6</td> + <td>88.1</td> + <td>62.9</td> + <td>2148.9</td> + <td>73.6</td> + <td>74.3</td> + <td>48.9</td> + <td>45.8</td> + <td>79.9</td> + <td>60.4</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">GPT-4V (2023.11.06)</td> + <td>-</td> + <td>645</td> + <td>78.0</td> + <td>88.4</td> + <td>63.5</td> + <td>1771.5</td> + <td>77.0</td> + <td>74.4</td> + <td>53.8</td> + <td>47.8</td> + <td>93.1</td> + <td>63.0</td> + <td>86.4</td> + </tr> + <tr> + <td colspan="14" align="left"><strong>Open-source</strong></td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Mini-Gemini</td> + <td>2.2B</td> + <td>-</td> + <td>56.2</td> + <td>34.2*</td> + <td>-</td> + <td>1653.0</td> + <td>-</td> + <td>-</td> + <td>31.7</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Qwen-VL-Chat</td> + <td>9.6B</td> + <td>488</td> + <td>61.5</td> + <td>62.6</td> + <td>51.6</td> + <td>1860.0</td> + <td>61.8</td> + <td>56.3</td> + <td>37.0</td> + <td>33.8</td> + <td>67.7</td> + <td>49.3</td> + <td>56.2</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">DeepSeek-VL-7B</td> + <td>7.3B</td> + <td>435</td> + <td>64.7*</td> + <td>47.0*</td> + <td>54.6</td> + <td>1765.4</td> + <td>73.8</td> + <td>71.4</td> + <td>38.3</td> + <td>36.8</td> + <td>77.8</td> + <td>54.2</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Yi-VL-34B</td> + <td>34B</td> + <td>290</td> + <td>43.4*</td> + <td>16.9*</td> + <td>52.2</td> + <td><strong>2050.2</strong></td> + <td>72.4</td> + <td>70.7</td> + <td>45.1</td> + <td>30.7</td> + <td>62.3</td> + <td>54.8</td> + <td>79.3</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">CogVLM-Chat</td> + <td>17.4B</td> + <td>590</td> + <td>70.4</td> + <td>33.3*</td> + <td>54.2</td> + <td>1736.6</td> + <td>65.8</td> + <td>55.9</td> + <td>37.3</td> + <td>34.7</td> + <td>73.9</td> + <td>60.3</td> + <td>73.6</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">TextMonkey</td> + <td>9.7B</td> + <td>558</td> + <td>64.3</td> + <td>66.7</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Idefics2</td> + <td>8.0B</td> + <td>-</td> + <td>73.0</td> + <td>74.0</td> + <td>57.2</td> + <td>1847.6</td> + <td>75.7</td> + <td>68.6</td> + <td>45.2</td> + <td>52.2</td> + <td>49.1</td> + <td>60.7</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Bunny-LLama-3-8B</td> + <td>8.4B</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>54.3</td> + <td>1920.3</td> + <td>77.0</td> + <td>73.9</td> + <td>41.3</td> + <td>31.5</td> + <td>61.2</td> + <td>58.8</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">LLaVA-NeXT Llama-3-8B</td> + <td>8.4B</td> + <td>-</td> + <td>-</td> + <td>78.2</td> + <td>-</td> + <td>1971.5</td> + <td>-</td> + <td>-</td> + <td>41.7</td> + <td>37.5</td> + <td>80.1</td> 
+ <td>60.0</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Phi-3-vision-128k-instruct</td> + <td>4.2B</td> + <td>639*</td> + <td>70.9</td> + <td>-</td> + <td>-</td> + <td>1537.5*</td> + <td>-</td> + <td>-</td> + <td>40.4</td> + <td>44.5</td> + <td>64.2*</td> + <td>58.8*</td> + <td>-</td> + </tr> + <tr style="background-color: #e6f2ff;"> + <td nowrap="nowrap" align="left">MiniCPM-V 1.0</td> + <td>2.8B</td> + <td>366</td> + <td>60.6</td> + <td>38.2</td> + <td>47.5</td> + <td>1650.2</td> + <td>64.1</td> + <td>62.6</td> + <td>38.3</td> + <td>28.9</td> + <td>51.3</td> + <td>51.2</td> + <td>78.4</td> + </tr> + <tr style="background-color: #e6f2ff;"> + <td nowrap="nowrap" align="left">MiniCPM-V 2.0</td> + <td>2.8B</td> + <td>605</td> + <td>74.1</td> + <td>71.9</td> + <td>54.5</td> + <td>1808.6</td> + <td>69.1</td> + <td>66.5</td> + <td>38.2</td> + <td>38.7</td> + <td>69.2</td> + <td>55.8</td> + <td>85.5</td> + </tr> + <tr style="background-color: #e6f2ff;"> + <td nowrap="nowrap" align="left">MiniCPM-Llama3-V 2.5</td> + <td>8.5B</td> + <td><strong>725</strong></td> + <td><strong>76.6</strong></td> + <td><strong>84.8</strong></td> + <td><strong>65.1</strong></td> + <td>2024.6</td> + <td><strong>77.2</strong></td> + <td><strong>74.2</strong></td> + <td><strong>45.8</strong></td> + <td><strong>54.3</strong></td> + <td><strong>86.7</strong></td> + <td><strong>63.5</strong></td> + <td><strong>89.7</strong></td> + </tr> + </tbody> +</table> + + +</div> +* We evaluate the officially released checkpoint by ourselves. + +</details> + +<div align="center"> + <img src="assets/llavabench_compare_3.png" width="100%" /> + <br> + Evaluation results of multilingual LLaVA Bench +</div> + +### Examples <!-- omit in toc --> + +<table align="center" > + <p align="center" > + <img src="assets/minicpmv-llama3-v2.5/cases_all.png" /> + </p> +</table> + +We deploy MiniCPM-Llama3-V 2.5 on end devices. The demo video is the raw screen recording on a Xiaomi 14 Pro without edition. + +<table align="center"> + <p align="center"> + <img src="assets/gif_cases/ticket.gif" width=32%/> + <img src="assets/gif_cases/meal_plan.gif" width=32%/> + </p> +</table> + +<table align="center"> + <p align="center"> + <img src="assets/gif_cases/1-4.gif" width=64%/> + </p> +</table> + +## MiniCPM-V 2.0 + +<details> +<summary>Click to view more details of MiniCPM-V 2.0</summary> + + +**MiniCPM-V 2.0** is an efficient version with promising performance for deployment. The model is built based on SigLip-400M and [MiniCPM-2.4B](https://github.com/OpenBMB/MiniCPM/), connected by a perceiver resampler. Our latest version, MiniCPM-V 2.0 has several notable features. + +- 🔥 **State-of-the-art Performance.** + + MiniCPM-V 2.0 achieves **state-of-the-art performance** on multiple benchmarks (including OCRBench, TextVQA, MME, MMB, MathVista, etc) among models under 7B parameters. It even **outperforms strong Qwen-VL-Chat 9.6B, CogVLM-Chat 17.4B, and Yi-VL 34B on OpenCompass, a comprehensive evaluation over 11 popular benchmarks**. Notably, MiniCPM-V 2.0 shows **strong OCR capability**, achieving **comparable performance to Gemini Pro in scene-text understanding**, and **state-of-the-art performance on OCRBench** among open-source models. + +- 🏆 **Trustworthy Behavior.** + + LMMs are known for suffering from hallucination, often generating text not factually grounded in images. 
MiniCPM-V 2.0 is **the first end-side LMM aligned via multimodal RLHF for trustworthy behavior** (using the recent [RLHF-V](https://rlhf-v.github.io/) [CVPR'24] series technique). This allows the model to **match GPT-4V in preventing hallucinations** on Object HalBench.
+
+- 🌟 **High-Resolution Images at Any Aspect Ratio.**
+
+  MiniCPM-V 2.0 can accept **1.8 million pixels (e.g., 1344x1344) images at any aspect ratio**. This enables better perception of fine-grained visual information such as small objects and optical characters, which is achieved via a recent technique from [LLaVA-UHD](https://arxiv.org/pdf/2403.11703.pdf).
+
+- ⚡️ **High Efficiency.**
+
+  MiniCPM-V 2.0 can be **efficiently deployed on most GPU cards and personal computers**, and **even on end devices such as mobile phones**. For visual encoding, we compress the image representations into much fewer tokens via a perceiver resampler. This allows MiniCPM-V 2.0 to operate with **favorable memory cost and speed during inference even when dealing with high-resolution images**.
+
+- 🙌 **Bilingual Support.**
+
+  MiniCPM-V 2.0 **supports strong bilingual multimodal capabilities in both English and Chinese**. This is enabled by generalizing multimodal capabilities across languages, a technique from [VisCPM](https://arxiv.org/abs/2308.12038) [ICLR'24].
+
+### Examples <!-- omit in toc -->
+
+<table align="center">
+  <p align="center">
+    <img src="assets/minicpmv2-cases_2.png" width=95%/>
+  </p>
+</table>
+
+We deploy MiniCPM-V 2.0 on end devices. The demo video is the raw screen recording on a Xiaomi 14 Pro without any editing.
+
+<table align="center">
+  <p align="center">
+    <img src="assets/gif_cases/station.gif" width=36%/>
+    <img src="assets/gif_cases/london_car.gif" width=36%/>
+  </p>
+</table>
+
+</details>
+
+## Legacy Models <!-- omit in toc -->
+
+| Model | Introduction and Guidance |
+|:----------------------|:-------------------:|
+| MiniCPM-V 1.0 | [Document](./minicpm_v1.md) |
+| OmniLMM-12B | [Document](./omnilmm_en.md) |
+
+
+
+## Online Demo
+Click here to try out the Demo of [MiniCPM-Llama3-V 2.5](https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5) | [MiniCPM-V 2.0](https://huggingface.co/spaces/openbmb/MiniCPM-V-2).
+
+## Install
+
+1. Clone this repository and navigate to the source folder
+
+```bash
+git clone https://github.com/OpenBMB/MiniCPM-V.git
+cd MiniCPM-V
+```
+
+2. Create a conda environment
+
+```Shell
+conda create -n MiniCPM-V python=3.10 -y
+conda activate MiniCPM-V
+```
+
+3. Install dependencies
+
+```shell
+pip install -r requirements.txt
+```
+
+## Inference
+
+
+### Model Zoo
+
+| Model | Device | Memory |          Description | Download |
+|:-----------|:--:|:-----------:|:-------------------|:---------------:|
+| MiniCPM-Llama3-V 2.5 | GPU | 19 GB | The latest version, achieving state-of-the-art end-side multimodal performance. | [🤗](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5/)    [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5) |
+| MiniCPM-Llama3-V 2.5 gguf | CPU | 5 GB | The gguf version, with lower memory usage and faster inference. | [🤗](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf)   [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5-gguf) |
+| MiniCPM-Llama3-V 2.5 int4 | GPU | 8 GB | The int4 quantized version, with lower GPU memory usage.
| [🤗](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-int4/)    [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5-int4) |
+| MiniCPM-V 2.0 | GPU | 8 GB | Light version, balancing performance and computation cost. | [🤗](https://huggingface.co/openbmb/MiniCPM-V-2)    [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2) |
+| MiniCPM-V 1.0 | GPU | 7 GB | Lightest version, achieving the fastest inference. | [🤗](https://huggingface.co/openbmb/MiniCPM-V)    [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-V) |
+
+### Multi-turn Conversation
+
+Please refer to the following code to run a multi-turn conversation.
+
+<div align="center">
+<img src="assets/airplane.jpeg" width="500px">
+</div>
+
+
+```python
+from chat import MiniCPMVChat, img2base64
+import torch
+import json
+
+torch.manual_seed(0)
+
+chat_model = MiniCPMVChat('openbmb/MiniCPM-Llama3-V-2_5')
+
+im_64 = img2base64('./assets/airplane.jpeg')
+
+# First round chat
+msgs = [{"role": "user", "content": "Tell me the model of this aircraft."}]
+
+inputs = {"image": im_64, "question": json.dumps(msgs)}
+answer = chat_model.chat(inputs)
+print(answer)
+
+# Second round chat
+# pass history context of multi-turn conversation
+msgs.append({"role": "assistant", "content": answer})
+msgs.append({"role": "user", "content": "Introduce something about Airbus A380."})
+
+inputs = {"image": im_64, "question": json.dumps(msgs)}
+answer = chat_model.chat(inputs)
+print(answer)
+```
+
+You will get the following output:
+
+```
+"The aircraft in the image is an Airbus A380, which can be identified by its large size, double-deck structure, and the distinctive shape of its wings and engines. The A380 is a wide-body aircraft known for being the world's largest passenger airliner, designed for long-haul flights. It has four engines, which are characteristic of large commercial aircraft. The registration number on the aircraft can also provide specific information about the model if looked up in an aviation database."
+
+"The Airbus A380 is a double-deck, wide-body, four-engine jet airliner made by Airbus. It is the world's largest passenger airliner and is known for its long-haul capabilities. The aircraft was developed to improve efficiency and comfort for passengers traveling over long distances. It has two full-length passenger decks, which can accommodate more passengers than a typical single-aisle airplane. The A380 has been operated by airlines such as Lufthansa, Singapore Airlines, and Emirates, among others. It is widely recognized for its unique design and significant impact on the aviation industry."
+```
+
+
+
+### Inference on Mac
+<details>
+<summary>Click to view an example of running MiniCPM-Llama3-V 2.5 on 💻 Mac with MPS (Apple silicon or AMD GPUs). </summary>
+
+```python
+# test.py  Requires more than 16 GB of memory.
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoTokenizer
+
+model = AutoModel.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True, low_cpu_mem_usage=True)
+model = model.to(device='mps')
+
+tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True)
+model.eval()
+
+image = Image.open('./assets/hk_OCR.jpg').convert('RGB')
+question = 'Where is this photo taken?'
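+# model.chat takes the conversation as a list of {'role': ..., 'content': ...} dicts;
+# sampling=True below enables sampling-based decoding for the generated answer.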
+msgs = [{'role': 'user', 'content': question}]
+
+answer, context, _ = model.chat(
+    image=image,
+    msgs=msgs,
+    context=None,
+    tokenizer=tokenizer,
+    sampling=True
+)
+print(answer)
+```
+Run with the command:
+```shell
+PYTORCH_ENABLE_MPS_FALLBACK=1 python test.py
+```
+</details>
+
+### Deployment on Mobile Phone
+MiniCPM-V 2.0 can be deployed on mobile phones with Android operating systems. 🚀 Click [here](https://github.com/OpenBMB/mlc-MiniCPM) to install the apk. MiniCPM-Llama3-V 2.5 is coming soon.
+
+### WebUI Demo
+
+<details>
+<summary>Click to see how to deploy the WebUI demo on different devices </summary>
+
+```shell
+pip install -r requirements.txt
+```
+
+```shell
+# For NVIDIA GPUs, run:
+python web_demo_2.5.py --device cuda
+
+# For Mac with MPS (Apple silicon or AMD GPUs), run:
+PYTORCH_ENABLE_MPS_FALLBACK=1 python web_demo_2.5.py --device mps
+```
+</details>
+
+### Inference with llama.cpp<a id="inference-with-llamacpp"></a>
+MiniCPM-Llama3-V 2.5 can now run with llama.cpp! See our fork of [llama.cpp](https://github.com/OpenBMB/llama.cpp/tree/minicpm-v2.5/examples/minicpmv) for more details. This implementation supports smooth inference at 6~8 tokens/s on mobile phones (test environment: Xiaomi 14 Pro + Snapdragon 8 Gen 3).
+
+### Inference with vLLM<a id="vllm"></a>
+
+<details>
+<summary>Click to see how to run inference with vLLM </summary>
+Because our pull request to vLLM is still under review, we maintain a fork of the repository to build and test our vLLM demo. Here are the steps:
+
+1. Clone our fork of vLLM:
+```shell
+git clone https://github.com/OpenBMB/vllm.git
+```
+2. Install vLLM:
+```shell
+cd vllm
+pip install -e .
+```
+3. Install timm:
+```shell
+pip install timm==0.9.10
+```
+4. Run our demo:
+```shell
+python examples/minicpmv_example.py
+```
+</details>
+
+## Fine-tuning
+
+### Simple Fine-tuning <!-- omit in toc -->
+
+We support simple fine-tuning with Hugging Face for MiniCPM-V 2.0 and MiniCPM-Llama3-V 2.5.
+
+[Reference Document](./finetune/readme.md)
+
+### With the SWIFT Framework <!-- omit in toc -->
+
+We now support fine-tuning the MiniCPM-V series with the SWIFT framework. SWIFT supports training, inference, evaluation and deployment of nearly 200 LLMs and MLLMs. It supports the lightweight training solutions provided by PEFT as well as a complete adapter library covering the latest techniques such as NEFTune, LoRA+ and LLaMA-PRO.
+
+Best practices: [MiniCPM-V 1.0](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v最佳实践.md), [MiniCPM-V 2.0](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v-2最佳实践.md)
+
+
+
+## TODO
+
+- [x] MiniCPM-V fine-tuning support
+- [ ] Code release for real-time interactive assistant
+
+## Model License <!-- omit in toc -->
+
+The code in this repo is released under the [Apache-2.0](https://github.com/OpenBMB/MiniCPM/blob/main/LICENSE) license.
+
+The usage of MiniCPM-V's and OmniLMM's parameters is subject to the "[General Model License Agreement - Source Notes - Publicity Restrictions - Commercial License](https://github.com/OpenBMB/General-Model-License/blob/main/通用模型许可协议-来源说明-宣传限制-商业授权.md)".
+
+The parameters are fully open for academic research.
+
+Please contact cpm@modelbest.cn to obtain written authorization for commercial use. Free commercial use is also allowed after registration.
+
+## Statement <!-- omit in toc -->
+
+As LMMs, MiniCPM-V models (including OmniLMM) generate content by learning from a large amount of multimodal corpora, but they cannot comprehend, express personal opinions or make value judgements. Anything generated by MiniCPM-V models does not represent the views and positions of the model developers.
+
+We will not be liable for any problems arising from the use of MiniCPM-V models, including but not limited to data security issues, public opinion risks, or any risks and problems arising from the misdirection, misuse, dissemination or improper utilization of the models.
+
+
+## Institutions <!-- omit in toc -->
+
+This project is developed by the following institutions:
+
+- <img src="assets/thunlp.png" width="28px"> [THUNLP](https://nlp.csai.tsinghua.edu.cn/)
+- <img src="assets/modelbest.png" width="28px"> [ModelBest](https://modelbest.cn/)
+- <img src="assets/zhihu.webp" width="28px"> [Zhihu](https://www.zhihu.com/ )
+
+## Other Multimodal Projects from Our Team <!-- omit in toc -->
+
+👏 Welcome to explore other multimodal projects of our team:
+
+[VisCPM](https://github.com/OpenBMB/VisCPM/tree/main) | [RLHF-V](https://github.com/RLHF-V/RLHF-V) | [LLaVA-UHD](https://github.com/thunlp/LLaVA-UHD) | [RLAIF-V](https://github.com/RLHF-V/RLAIF-V)
+
+## 🌟 Star History
+
+<div>
+<img src="./assets/Star-History.png" width="500em" ></img>
+</div>
+
+## Citation
+
+If you find our model/code/paper helpful, please consider citing our papers 📝 and starring us ⭐️!
+
+```bib
+@article{yu2023rlhf,
+  title={Rlhf-v: Towards trustworthy mllms via behavior alignment from fine-grained correctional human feedback},
+  author={Yu, Tianyu and Yao, Yuan and Zhang, Haoye and He, Taiwen and Han, Yifeng and Cui, Ganqu and Hu, Jinyi and Liu, Zhiyuan and Zheng, Hai-Tao and Sun, Maosong and others},
+  journal={arXiv preprint arXiv:2312.00849},
+  year={2023}
+}
+@article{viscpm,
+  title={Large Multilingual Models Pivot Zero-Shot Multimodal Learning across Languages},
+  author={Jinyi Hu and Yuan Yao and Chongyi Wang and Shan Wang and Yinxu Pan and Qianyu Chen and Tianyu Yu and Hanghao Wu and Yue Zhao and Haoye Zhang and Xu Han and Yankai Lin and Jiao Xue and Dahai Li and Zhiyuan Liu and Maosong Sun},
+  journal={arXiv preprint arXiv:2308.12038},
+  year={2023}
+}
+@article{xu2024llava-uhd,
+  title={{LLaVA-UHD}: an LMM Perceiving Any Aspect Ratio and High-Resolution Images},
+  author={Xu, Ruyi and Yao, Yuan and Guo, Zonghao and Cui, Junbo and Ni, Zanlin and Ge, Chunjiang and Chua, Tat-Seng and Liu, Zhiyuan and Huang, Gao},
+  journal={arXiv preprint arXiv:2403.11703},
+  year={2024}
+}
+@article{yu2024rlaifv,
+  title={RLAIF-V: Aligning MLLMs through Open-Source AI Feedback for Super GPT-4V Trustworthiness},
+  author={Yu, Tianyu and Zhang, Haoye and Yao, Yuan and Dang, Yunkai and Chen, Da and Lu, Xiaoman and Cui, Ganqu and He, Taiwen and Liu, Zhiyuan and Chua, Tat-Seng and Sun, Maosong},
+  journal={arXiv preprint arXiv:2405.17220},
+  year={2024}
+}
+```
diff --git a/README_zh.md b/README_zh.md
new file mode 100644
index 0000000000000000000000000000000000000000..3b59758dd584deaebe0620be9741aa081aa3fb93
--- /dev/null
+++ b/README_zh.md
@@ -0,0 +1,731 @@
+<div align="center">
+
+<!-- <h1 style="color: #33A6B8; font-family: Helvetica"> OmniLMM </h1> -->
+
+<img src="./assets/minicpmv.png" width="300em" ></img>
+
+**端侧可用的 GPT-4V 级多模态大模型**
+
+ <strong>中文 |
+  [English](./README_en.md)</strong>
+
+<p align="center">
+  MiniCPM-Llama3-V 2.5 <a href="https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5/">🤗</a> <a
href="https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5">🤖</a> | + MiniCPM-V 2.0 <a href="https://huggingface.co/openbmb/MiniCPM-V-2/">🤗</a> <a href="https://huggingface.co/spaces/openbmb/MiniCPM-V-2">🤖</a> | + <a href="https://openbmb.vercel.app/minicpm-v-2">MiniCPM-V 2.0 技术博客</a> +</p> + +</div> + + +**MiniCPM-V**是面向图文理解的端侧多模态大模型系列。该系列模型接受图像和文本输入,并提供高质量的文本输出。自2024年2月以来,我们共发布了4个版本模型,旨在实现**领先的性能和高效的部署**,目前该系列最值得关注的模型包括: + +- **MiniCPM-Llama3-V 2.5**:🔥🔥🔥 MiniCPM-V系列的最新、性能最佳模型。总参数量8B,多模态综合性能**超越 GPT-4V-1106、Gemini Pro、Claude 3、Qwen-VL-Max 等商用闭源模型**,OCR 能力及指令跟随能力进一步提升,并**支持超过30种语言**的多模态交互。通过系统使用模型量化、CPU、NPU、编译优化等高效推理技术,MiniCPM-Llama3-V 2.5 可以实现**高效的终端设备部署**。 + +- **MiniCPM-V 2.0**:MiniCPM-V系列的最轻量级模型。总参数量2B,多模态综合性能超越 Yi-VL 34B、CogVLM-Chat 17B、Qwen-VL-Chat 10B 等更大参数规模的模型,可接受 180 万像素的任意长宽比图像输入,实现了和 Gemini Pro 相近的场景文字识别能力以及和 GPT-4V 相匹的低幻觉率。 + + + +## 更新日志 <!-- omit in toc --> + +#### 📌 置顶 + +* [2024.05.28] 💥 MiniCPM-Llama3-V 2.5 现在在 [llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-v2.5/examples/minicpmv/README.md) 和 [ollama](https://github.com/OpenBMB/ollama/tree/minicpm-v2.5/examples/minicpm-v2.5) 中完全支持其功能!请拉取最新的 llama.cpp 和 ollama 代码。我们还发布了各种大小的 GGUF 版本,请点击[这里](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf/tree/main)查看。ollama 使用的 FAQ 将在一天内发布,敬请关注! +* [2024.05.28] 💫 我们现在支持 MiniCPM-Llama3-V 2.5 的 LoRA 微调,更多内存使用统计信息可以在[这里](https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune#model-fine-tuning-memory-usage-statistics)找到。 +* [2024.05.23] 🔍 我们添加了Phi-3-vision-128k-instruct 与 MiniCPM-Llama3-V 2.5的全面对比,包括基准测试评估、多语言能力和推理效率 🌟📊🌍🚀。点击[这里](./docs/compare_with_phi-3_vision.md)查看详细信息。 +* [2024.05.23] 🔥🔥🔥 MiniCPM-V 在 GitHub Trending 和 Hugging Face Trending 上登顶!MiniCPM-Llama3-V 2.5 Demo 被 Hugging Face 的 Gradio 官方账户推荐,欢迎点击[这里](https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5)体验! + +<br> + +* [2024.05.25] MiniCPM-Llama3-V 2.5 [支持流式输出和自定义系统提示词](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5#usage)了,欢迎试用! +* [2024.05.24] 我们开源了 MiniCPM-Llama3-V 2.5 [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf),支持 [llama.cpp](#llamacpp-部署) 推理!实现端侧 6-8 tokens/s 的流畅解码,欢迎试用! +* [2024.05.20] 我们开源了 MiniCPM-Llama3-V 2.5,增强了 OCR 能力,支持 30 多种语言,并首次在端侧实现了 GPT-4V 级的多模态能力!我们提供了[高效推理](#手机端部署)和[简易微调](./finetune/readme.md)的支持,欢迎试用! +* [2024.04.23] 我们增加了对 [vLLM](#vllm) 的支持,欢迎体验! +* [2024.04.18] 我们在 HuggingFace Space 新增了 MiniCPM-V 2.0 的 [demo](https://huggingface.co/spaces/openbmb/MiniCPM-V-2),欢迎体验! +* [2024.04.17] MiniCPM-V 2.0 现在支持用户部署本地 [WebUI Demo](#本地webui-demo部署) 了,欢迎试用! +* [2024.04.15] MiniCPM-V 2.0 现在可以通过 SWIFT 框架 [微调](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v-2最佳实践.md) 了,支持流式输出! +* [2024.04.12] 我们开源了 MiniCPM-V 2.0,该模型刷新了 OCRBench 开源模型最佳成绩,在场景文字识别能力上比肩 Gemini Pro,同时还在综合了 11 个主流多模态大模型评测基准的 <a href="https://rank.opencompass.org.cn/leaderboard-multimodal">OpenCompass</a> 榜单上超过了 Qwen-VL-Chat 10B、CogVLM-Chat 17B 和 Yi-VL 34B 等更大参数规模的模型!点击<a href="https://openbmb.vercel.app/minicpm-v-2">这里</a>查看 MiniCPM-V 2.0 技术博客。 +* [2024.03.14] MiniCPM-V 现在支持 SWIFT 框架下的[微调](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v最佳实践.md)了,感谢 [Jintao](https://github.com/Jintao-Huang) 的贡献! +* [2024.03.01] MiniCPM-V 现在支持在 Mac 电脑上进行部署! +* [2024.02.01] 我们开源了 MiniCPM-V 和 OmniLMM-12B,分别可以支持高效的端侧部署和同规模领先的多模态能力! 
+ + +## 目录 <!-- omit in toc --> + +- [MiniCPM-Llama3-V 2.5](#minicpm-llama3-v-25) +- [MiniCPM-V 2.0](#minicpm-v-20) +- [Online Demo](#online-demo) +- [安装](#安装) +- [推理](#推理) + - [模型库](#模型库) + - [多轮对话](#多轮对话) + - [Mac 推理](#mac-推理) + - [手机端部署](#手机端部署) + - [本地WebUI Demo部署](#本地webui-demo部署) + - [llama.cpp 部署](#llamacpp-部署) + - [vLLM 部署 ](#vllm-部署-) +- [微调](#微调) +- [未来计划](#未来计划) +- [🌟 Star History](#-star-history) +- [引用](#引用) + + +## MiniCPM-Llama3-V 2.5 +**MiniCPM-Llama3-V 2.5** 是 MiniCPM-V 系列的最新版本模型,基于 SigLip-400M 和 Llama3-8B-Instruct 构建,共 8B 参数量,相较于 MiniCPM-V 2.0 性能取得较大幅度提升。MiniCPM-Llama3-V 2.5 值得关注的特点包括: + +- 🔥 **领先的性能。** + MiniCPM-Llama3-V 2.5 在综合了 11 个主流多模态大模型评测基准的 OpenCompass 榜单上平均得分 65.1,**以 8B 量级的大小超过了 GPT-4V-1106、Gemini Pro、Claude 3、Qwen-VL-Max 等主流商用闭源多模态大模型**,大幅超越基于Llama 3构建的其他多模态大模型。 + +- 💪 **优秀的 OCR 能力。** + MiniCPM-Llama3-V 2.5 可接受 180 万像素的任意宽高比图像输入,**OCRBench 得分达到 725,超越 GPT-4o、GPT-4V、Gemini Pro、Qwen-VL-Max 等商用闭源模型**,达到最佳水平。基于近期用户反馈建议,MiniCPM-Llama3-V 2.5 增强了全文 OCR 信息提取、表格图像转 markdown 等高频实用能力,并且进一步加强了指令跟随、复杂推理能力,带来更好的多模态交互体感。 + +- 🏆 **可信行为。** + 借助最新的 [RLAIF-V](https://github.com/RLHF-V/RLAIF-V/) 对齐技术([RLHF-V](https://github.com/RLHF-V/) [CVPR'24]系列的最新技术),MiniCPM-Llama3-V 2.5 具有更加可信的多模态行为,在 Object HalBench 的幻觉率降低到了 **10.3%**,显著低于 GPT-4V-1106 (13.6%),达到开源社区最佳水平。[数据集已发布](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset)。 + +- 🌏 **多语言支持。** + 得益于 Llama 3 强大的多语言能力和 VisCPM 的跨语言泛化技术,MiniCPM-Llama3-V 2.5 在中英双语多模态能力的基础上,仅通过少量翻译的多模态数据的指令微调,高效泛化支持了**德语、法语、西班牙语、意大利语、韩语等 30+ 种语言**的多模态能力,并表现出了良好的多语言多模态对话性能。[查看所有支持语言](./assets/minicpm-llama-v-2-5_languages.md) + +- 🚀 **高效部署。** + MiniCPM-Llama3-V 2.5 较为系统地通过**模型量化、CPU、NPU、编译优化**等高效加速技术,实现高效的终端设备部署。对于高通芯片的移动手机,我们首次将 NPU 加速框架 QNN 整合进了 llama.cpp。经过系统优化后,MiniCPM-Llama3-V 2.5 实现了多模态大模型端侧**语言解码速度 3 倍加速**、**图像编码 150 倍加速**的巨大提升。 + +- 💫 **易于使用。** + MiniCPM-Llama3-V 2.5 可以通过多种方式轻松使用:(1)[llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-v2.5/examples/minicpmv/README.md) 和 [ollama](https://github.com/OpenBMB/ollama/tree/minicpm-v2.5/examples/minicpm-v2.5) 支持在本地设备上进行高效的 CPU 推理;(2)提供 16 种尺寸的 [GGUF](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) 格式量化模型;(3)仅需 2 张 V100 GPU 即可进行高效的 [LoRA](https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune#lora-finetuning) 微调;( 4)支持[流式输出](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5#usage);(5)快速搭建 [Gradio](https://github.com/OpenBMB/MiniCPM-V/blob/main/web_demo_2.5.py) 和 [Streamlit](https://github.com/OpenBMB/MiniCPM-V/blob/main/web_demo_streamlit-2_5.py) 本地 WebUI demo;( 6.)[HuggingFace Spaces](https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5) 交互式 demo。 + +### 性能评估 <!-- omit in toc --> + +<div align="center"> + <img src="assets/MiniCPM-Llama3-V-2.5-peformance.png" width="66%" /> +</div> +<details> +<summary>TextVQA, DocVQA, OCRBench, OpenCompass MultiModal Avg Score, MME, MMBench, MMMU, MathVista, LLaVA Bench, RealWorld QA, Object HalBench上的详细评测结果。 </summary> +<div align="center"> + +<table style="margin: 0px auto;"> + <thead> + <tr> + <th align="left">Model</th> + <th>Size</th> + <th>OCRBench</th> + <th>TextVQA val</th> + <th>DocVQA test</th> + <th>Open-Compass</th> + <th>MME</th> + <th>MMB test (en)</th> + <th>MMB test (cn)</th> + <th>MMMU val</th> + <th>Math-Vista</th> + <th>LLaVA Bench</th> + <th>RealWorld QA</th> + <th>Object HalBench</th> + </tr> + </thead> + <tbody align="center"> + <tr> + <td colspan="14" align="left"><strong>Proprietary</strong></td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Gemini Pro</td> + <td>-</td> + <td>680</td> + <td>74.6</td> + <td>88.1</td> 
+ <td>62.9</td> + <td>2148.9</td> + <td>73.6</td> + <td>74.3</td> + <td>48.9</td> + <td>45.8</td> + <td>79.9</td> + <td>60.4</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">GPT-4V (2023.11.06)</td> + <td>-</td> + <td>645</td> + <td>78.0</td> + <td>88.4</td> + <td>63.5</td> + <td>1771.5</td> + <td>77.0</td> + <td>74.4</td> + <td>53.8</td> + <td>47.8</td> + <td>93.1</td> + <td>63.0</td> + <td>86.4</td> + </tr> + <tr> + <td colspan="14" align="left"><strong>Open-source</strong></td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Mini-Gemini</td> + <td>2.2B</td> + <td>-</td> + <td>56.2</td> + <td>34.2*</td> + <td>-</td> + <td>1653.0</td> + <td>-</td> + <td>-</td> + <td>31.7</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Qwen-VL-Chat</td> + <td>9.6B</td> + <td>488</td> + <td>61.5</td> + <td>62.6</td> + <td>51.6</td> + <td>1860.0</td> + <td>61.8</td> + <td>56.3</td> + <td>37.0</td> + <td>33.8</td> + <td>67.7</td> + <td>49.3</td> + <td>56.2</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">DeepSeek-VL-7B</td> + <td>7.3B</td> + <td>435</td> + <td>64.7*</td> + <td>47.0*</td> + <td>54.6</td> + <td>1765.4</td> + <td>73.8</td> + <td>71.4</td> + <td>38.3</td> + <td>36.8</td> + <td>77.8</td> + <td>54.2</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Yi-VL-34B</td> + <td>34B</td> + <td>290</td> + <td>43.4*</td> + <td>16.9*</td> + <td>52.2</td> + <td><strong>2050.2</strong></td> + <td>72.4</td> + <td>70.7</td> + <td>45.1</td> + <td>30.7</td> + <td>62.3</td> + <td>54.8</td> + <td>79.3</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">CogVLM-Chat</td> + <td>17.4B</td> + <td>590</td> + <td>70.4</td> + <td>33.3*</td> + <td>54.2</td> + <td>1736.6</td> + <td>65.8</td> + <td>55.9</td> + <td>37.3</td> + <td>34.7</td> + <td>73.9</td> + <td>60.3</td> + <td>73.6</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">TextMonkey</td> + <td>9.7B</td> + <td>558</td> + <td>64.3</td> + <td>66.7</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Idefics2</td> + <td>8.0B</td> + <td>-</td> + <td>73.0</td> + <td>74.0</td> + <td>57.2</td> + <td>1847.6</td> + <td>75.7</td> + <td>68.6</td> + <td>45.2</td> + <td>52.2</td> + <td>49.1</td> + <td>60.7</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Bunny-LLama-3-8B</td> + <td>8.4B</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>54.3</td> + <td>1920.3</td> + <td>77.0</td> + <td>73.9</td> + <td>41.3</td> + <td>31.5</td> + <td>61.2</td> + <td>58.8</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">LLaVA-NeXT Llama-3-8B</td> + <td>8.4B</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>-</td> + <td>1971.5</td> + <td>-</td> + <td>-</td> + <td>41.7</td> + <td>-</td> + <td>80.1</td> + <td>60.0</td> + <td>-</td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Phi-3-vision-128k-instruct</td> + <td>4.2B</td> + <td>639*</td> + <td>70.9</td> + <td>-</td> + <td>-</td> + <td>1537.5*</td> + <td>-</td> + <td>-</td> + <td>40.4</td> + <td>44.5</td> + <td>64.2*</td> + <td>58.8*</td> + <td>-</td> + </tr> + <tr style="background-color: #e6f2ff;"> + <td nowrap="nowrap" align="left">MiniCPM-V 1.0</td> + <td>2.8B</td> + <td>366</td> + <td>60.6</td> + <td>38.2</td> + <td>47.5</td> + <td>1650.2</td> + <td>64.1</td> + <td>62.6</td> + <td>38.3</td> + <td>28.9</td> + <td>51.3</td> + <td>51.2</td> + <td>78.4</td> + </tr> + <tr 
style="background-color: #e6f2ff;"> + <td nowrap="nowrap" align="left">MiniCPM-V 2.0</td> + <td>2.8B</td> + <td>605</td> + <td>74.1</td> + <td>71.9</td> + <td>54.5</td> + <td>1808.6</td> + <td>69.1</td> + <td>66.5</td> + <td>38.2</td> + <td>38.7</td> + <td>69.2</td> + <td>55.8</td> + <td>85.5</td> + </tr> + <tr style="background-color: #e6f2ff;"> + <td nowrap="nowrap" align="left">MiniCPM-Llama3-V 2.5</td> + <td>8.5B</td> + <td><strong>725</strong></td> + <td><strong>76.6</strong></td> + <td><strong>84.8</strong></td> + <td><strong>65.1</strong></td> + <td>2024.6</td> + <td><strong>77.2</strong></td> + <td><strong>74.2</strong></td> + <td><strong>45.8</strong></td> + <td><strong>54.3</strong></td> + <td><strong>86.7</strong></td> + <td><strong>63.5</strong></td> + <td><strong>89.7</strong></td> + </tr> + </tbody> +</table> + +</div> +* 正式开源模型权重的评测结果。 +</details> + +<div align="center"> + <img src="assets/llavabench_compare_3.png" width="80%" /> + <br> + 多语言LLaVA Bench评测结果 +</div> + + +### 典型示例 <!-- omit in toc --> +<table align="center"> + <p align="center"> + <img src="assets/minicpmv-llama3-v2.5/cases_all.png" width=95%/> + </p> +</table> + +我们将 MiniCPM-Llama3-V 2.5 部署在小米 14 Pro 上,并录制了以下演示视频。 + +<table align="center"> + <p align="center"> + <img src="assets/gif_cases/ticket.gif" width=32%/> + <img src="assets/gif_cases/meal_plan.gif" width=32%/> + </p> +</table> + +<table align="center"> + <p align="center" width=80%> + <img src="assets/gif_cases/1-4.gif" width=72%/> + </p> +</table> + +## MiniCPM-V 2.0 + +<details> +<summary>查看 MiniCPM-V 2.0 的详细信息</summary> + +**MiniCPM-V 2.0**可以高效部署到终端设备。该模型基于 SigLip-400M 和 [MiniCPM-2.4B](https://github.com/OpenBMB/MiniCPM/)构建,通过perceiver resampler连接。其特点包括: + +- 🔥 **优秀的性能。** + + MiniCPM-V 2.0 在多个测试基准(如 OCRBench, TextVQA, MME, MMB, MathVista 等)中实现了 7B 以下模型的**最佳性能**。**在综合了 11 个主流多模态大模型评测基准的 OpenCompass 榜单上超过了 Qwen-VL-Chat 9.6B、CogVLM-Chat 17.4B 和 Yi-VL 34B 等更大参数规模的模型**。MiniCPM-V 2.0 还展现出**领先的 OCR 能力**,在场景文字识别能力上**接近 Gemini Pro**,OCRBench 得分达到**开源模型第一**。 + + +- 🏆 **可信行为。** + + 多模态大模型深受幻觉问题困扰,模型经常生成和图像中的事实不符的文本。MiniCPM-V 2.0 是 **第一个通过多模态 RLHF 对齐的端侧多模态大模型**(借助 [RLHF-V](https://rlhf-v.github.io/) [CVPR'24] 系列技术)。该模型在 [Object HalBench](https://arxiv.org/abs/2312.00849) 达到**和 GPT-4V 相仿**的性能。 + + +- 🌟 **高清图像高效编码。** + + MiniCPM-V 2.0 可以接受 **180 万像素的任意长宽比图像输入**(基于最新的[LLaVA-UHD](https://arxiv.org/pdf/2403.11703.pdf) 技术),这使得模型可以感知到小物体、密集文字等更加细粒度的视觉信息。 + + +- ⚡️ **高效部署。** + + MiniCPM-V 2.0 可以**高效部署在大多数消费级显卡和个人电脑上**,包括**移动手机等终端设备**。在视觉编码方面,我们通过perceiver resampler将图像表示压缩为更少的 token。这使得 MiniCPM-V 2.0 即便是**面对高分辨率图像,也能占用较低的存储并展现优秀的推理速度**。 + +- 🙌 **双语支持。** + + MiniCPM-V 2.0 **提供领先的中英双语多模态能力支持**。 + 该能力通过 [VisCPM](https://arxiv.org/abs/2308.12038) [ICLR'24] 论文中提出的多模态能力的跨语言泛化技术实现。 + +### 典型示例 <!-- omit in toc --> + + +<table align="center"> + <p align="center"> + <img src="assets/minicpmv2-cases_2.png" width=95%/> + </p> +</table> + +我们将 MiniCPM-V 2.0 部署在小米 14 Pro 上,并录制了以下演示视频,未经任何视频剪辑。 + +<table align="center"> + <p align="center"> + <img src="assets/gif_cases/station.gif" width=36%/> + <img src="assets/gif_cases/london_car.gif" width=36%/> + </p> +</table> + +</details> + + +<a id='legacy-models'></a> + +## 历史版本模型 <!-- omit in toc --> + + +| 模型 | 介绍信息和使用教程 | +|:----------------------|:-------------------:| +| MiniCPM-V 1.0 | [文档](./minicpm_v1.md) | +| OmniLMM-12B | [文档](./omnilmm.md) | + + +## Online Demo + +欢迎通过以下链接使用我们的网页端推理服务: [MiniCPM-Llama3-V 2.5](https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5) | [MiniCPM-V 
2.0](https://huggingface.co/spaces/openbmb/MiniCPM-V-2). + +## 安装 + +1. 克隆我们的仓库并跳转到相应目录 + +```bash +git clone https://github.com/OpenBMB/MiniCPM-V.git +cd MiniCPM-V +``` + +1. 创建 conda 环境 + +```Shell +conda create -n MiniCPMV python=3.10 -y +conda activate MiniCPMV +``` + +3. 安装依赖 + +```shell +pip install -r requirements.txt +``` + +## 推理 + +### 模型库 + +| 模型 | 设备 | 资源 |          简介 | 下载链接 | +|:--------------|:-:|:----------:|:-------------------|:---------------:| +| MiniCPM-Llama3-V 2.5| GPU | 19 GB | 最新版本,提供最佳的端侧多模态理解能力。 | [🤗](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5/)    [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5) | +| MiniCPM-Llama3-V 2.5 gguf| CPU | 5 GB | gguf 版本,更低的内存占用和更高的推理效率。 | [🤗](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf)    [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5-gguf) | +| MiniCPM-Llama3-V 2.5 int4 | GPU | 8 GB | int4量化版,更低显存占用。 | [🤗](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-int4/)    [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5-int4) | +| MiniCPM-V 2.0 | GPU | 8 GB | 轻量级版本,平衡计算开销和多模态理解能力。 | [🤗](https://huggingface.co/openbmb/MiniCPM-V-2)    [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2) | +| MiniCPM-V 1.0 | GPU | 7 GB | 最轻量版本, 提供最快的推理速度。 | [🤗](https://huggingface.co/openbmb/MiniCPM-V)    [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-V) | + +更多[历史版本模型](#legacy-models) + +### 多轮对话 + +请参考以下代码进行推理。 + +<div align="center"> +<img src="assets/airplane.jpeg" width="500px"> +</div> + + +```python +from chat import MiniCPMVChat, img2base64 +import torch +import json + +torch.manual_seed(0) + +chat_model = MiniCPMVChat('openbmb/MiniCPM-Llama3-V-2_5') + +im_64 = img2base64('./assets/airplane.jpeg') + +# First round chat +msgs = [{"role": "user", "content": "Tell me the model of this aircraft."}] + +inputs = {"image": im_64, "question": json.dumps(msgs)} +answer = chat_model.chat(inputs) +print(answer) + +# Second round chat +# pass history context of multi-turn conversation +msgs.append({"role": "assistant", "content": answer}) +msgs.append({"role": "user", "content": "Introduce something about Airbus A380."}) + +inputs = {"image": im_64, "question": json.dumps(msgs)} +answer = chat_model.chat(inputs) +print(answer) +``` + +可以得到以下输出: + +``` +"The aircraft in the image is an Airbus A380, which can be identified by its large size, double-deck structure, and the distinctive shape of its wings and engines. The A380 is a wide-body aircraft known for being the world's largest passenger airliner, designed for long-haul flights. It has four engines, which are characteristic of large commercial aircraft. The registration number on the aircraft can also provide specific information about the model if looked up in an aviation database." + +"The Airbus A380 is a double-deck, wide-body, four-engine jet airliner made by Airbus. It is the world's largest passenger airliner and is known for its long-haul capabilities. The aircraft was developed to improve efficiency and comfort for passengers traveling over long distances. It has two full-length passenger decks, which can accommodate more passengers than a typical single-aisle airplane. 
The A380 has been operated by airlines such as Lufthansa, Singapore Airlines, and Emirates, among others. It is widely recognized for its unique design and significant impact on the aviation industry." +``` + + + + +### Mac 推理 +<details> +<summary>点击查看 MiniCPM-Llama3-V 2.5 / MiniCPM-V 2.0 基于Mac MPS运行 (Apple silicon 或 AMD GPUs)的示例。 </summary> + +```python +# test.py Need more than 16GB memory to run. +import torch +from PIL import Image +from transformers import AutoModel, AutoTokenizer + +model = AutoModel.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True, low_cpu_mem_usage=True) +model = model.to(device='mps') + +tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True) +model.eval() + +image = Image.open('./assets/hk_OCR.jpg').convert('RGB') +question = 'Where is this photo taken?' +msgs = [{'role': 'user', 'content': question}] + +answer, context, _ = model.chat( + image=image, + msgs=msgs, + context=None, + tokenizer=tokenizer, + sampling=True +) +print(answer) +``` +运行: +```shell +PYTORCH_ENABLE_MPS_FALLBACK=1 python test.py +``` +</details> + + +### 手机端部署 +MiniCPM-V 2.0 可运行在Android手机上,点击[2.0](https://github.com/OpenBMB/mlc-MiniCPM)安装apk使用; MiniCPM-Llama3-V 2.5 将很快推出,敬请期待。 + +### 本地WebUI Demo部署 +<details> +<summary>点击查看本地WebUI demo 在 NVIDIA GPU、Mac等不同设备部署方法 </summary> + +```shell +pip install -r requirements.txt +``` + +```shell +# For NVIDIA GPUs, run: +python web_demo_2.5.py --device cuda + +# For Mac with MPS (Apple silicon or AMD GPUs), run: +PYTORCH_ENABLE_MPS_FALLBACK=1 python web_demo_2.5.py --device mps +``` +</details> + +### llama.cpp 部署<a id="llamacpp-部署"></a> +MiniCPM-Llama3-V 2.5 现在支持llama.cpp啦! 用法请参考我们的fork [llama.cpp](https://github.com/OpenBMB/llama.cpp/tree/minicpm-v2.5/examples/minicpmv), 在手机上可以支持 6~8 token/s 的流畅推理(测试环境:Xiaomi 14 pro + Snapdragon 8 Gen 3)。 + +### vLLM 部署 <a id='vllm'></a> +<details> +<summary>点击查看 vLLM 部署运行的方法</summary> +由于我们对 vLLM 提交的 PR 还在 review 中,因此目前我们 fork 了一个 vLLM 仓库以供测试使用。 + +1. 首先克隆我们 fork 的 vLLM 库: +```shell +git clone https://github.com/OpenBMB/vllm.git +``` +2. 安装 vLLM 库: +```shell +cd vllm +pip install -e . +``` +3. 安装 timm 库: +```shell +pip install timm=0.9.10 +``` +4. 
测试运行示例程序: +```shell +python examples/minicpmv_example.py +``` + + +</details> + + +## 微调 + +### 简易微调 <!-- omit in toc --> + +我们支持使用 Huggingface Transformers 库简易地微调 MiniCPM-V 2.0 和 MiniCPM-Llama3-V 2.5 模型。 + +[参考文档](./finetune/readme.md) + +### 使用 SWIFT 框架 <!-- omit in toc --> + +我们支持使用 SWIFT 框架微调 MiniCPM-V 系列模型。SWIFT 支持近 200 种大语言模型和多模态大模型的训练、推理、评测和部署。支持 PEFT 提供的轻量训练方案和完整的 Adapters 库支持的最新训练技术如 NEFTune、LoRA+、LLaMA-PRO 等。 + +参考文档:[MiniCPM-V 1.0](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v最佳实践.md),[MiniCPM-V 2.0](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v-2最佳实践.md) + +## 未来计划 + +- [x] 支持 MiniCPM-V 系列模型微调 +- [ ] 实时多模态交互代码开源 + + + +## 模型协议 <!-- omit in toc --> + +本仓库中代码依照 Apache-2.0 协议开源 + +本项目中模型权重的使用遵循 “[通用模型许可协议-来源说明-宣传限制-商业授权](https://github.com/OpenBMB/General-Model-License/blob/main/通用模型许可协议-来源说明-宣传限制-商业授权.md)”。 + +本项目中模型权重对学术研究完全开放。 + +如需将模型用于商业用途,请联系 cpm@modelbest.cn 来获取书面授权,登记后可以免费商业使用。 + + +## 声明 <!-- omit in toc --> + +作为多模态大模型,MiniCPM-V 系列模型(包括 OmniLMM)通过学习大量的多模态数据来生成内容,但它无法理解、表达个人观点或价值判断,它所输出的任何内容都不代表模型开发者的观点和立场。 + +因此用户在使用本项目的系列模型生成的内容时,应自行负责对其进行评估和验证。如果由于使用本项目的系列开源模型而导致的任何问题,包括但不限于数据安全问题、公共舆论风险,或模型被误导、滥用、传播或不当利用所带来的任何风险和问题,我们将不承担任何责任。 + + +## 机构 <!-- omit in toc --> + +本项目由以下机构共同开发: + +- <img src="assets/thunlp.png" width="28px"> [清华大学自然语言处理实验室](https://nlp.csai.tsinghua.edu.cn/) +- <img src="assets/modelbest.png" width="28px"> [面壁智能](https://modelbest.cn/) +- <img src="assets/zhihu.webp" width="28px"> [知乎](https://www.zhihu.com/ ) + +## 其他多模态项目 <!-- omit in toc --> + +👏 欢迎了解我们更多的多模态项目: + +[VisCPM](https://github.com/OpenBMB/VisCPM/tree/main) | [RLHF-V](https://github.com/RLHF-V/RLHF-V) | [LLaVA-UHD](https://github.com/thunlp/LLaVA-UHD) | [RLAIF-V](https://github.com/RLHF-V/RLAIF-V) + +## 🌟 Star History + +<div> +<img src="./assets/Star-History.png" width="500em" ></img> +</div> + + +## 引用 + +如果您觉得我们模型/代码/论文有帮助,请给我们 ⭐ 和 引用 📝,感谢! 
+ +```bib +@article{yu2023rlhf, + title={Rlhf-v: Towards trustworthy mllms via behavior alignment from fine-grained correctional human feedback}, + author={Yu, Tianyu and Yao, Yuan and Zhang, Haoye and He, Taiwen and Han, Yifeng and Cui, Ganqu and Hu, Jinyi and Liu, Zhiyuan and Zheng, Hai-Tao and Sun, Maosong and others}, + journal={arXiv preprint arXiv:2312.00849}, + year={2023} +} +@article{viscpm, + title={Large Multilingual Models Pivot Zero-Shot Multimodal Learning across Languages}, + author={Jinyi Hu and Yuan Yao and Chongyi Wang and Shan Wang and Yinxu Pan and Qianyu Chen and Tianyu Yu and Hanghao Wu and Yue Zhao and Haoye Zhang and Xu Han and Yankai Lin and Jiao Xue and Dahai Li and Zhiyuan Liu and Maosong Sun}, + journal={arXiv preprint arXiv:2308.12038}, + year={2023} +} +@article{xu2024llava-uhd, + title={{LLaVA-UHD}: an LMM Perceiving Any Aspect Ratio and High-Resolution Images}, + author={Xu, Ruyi and Yao, Yuan and Guo, Zonghao and Cui, Junbo and Ni, Zanlin and Ge, Chunjiang and Chua, Tat-Seng and Liu, Zhiyuan and Huang, Gao}, + journal={arXiv preprint arXiv:2403.11703}, + year={2024} +} +@article{yu2024rlaifv, + title={RLAIF-V: Aligning MLLMs through Open-Source AI Feedback for Super GPT-4V Trustworthiness}, + author={Yu, Tianyu and Zhang, Haoye and Yao, Yuan and Dang, Yunkai and Chen, Da and Lu, Xiaoman and Cui, Ganqu and He, Taiwen and Liu, Zhiyuan and Chua, Tat-Seng and Sun, Maosong}, + journal={arXiv preprint arXiv:2405.17220}, + year={2024} +} +``` diff --git a/assets/MiniCPM-Llama3-V-2.5-peformance.png b/assets/MiniCPM-Llama3-V-2.5-peformance.png new file mode 100644 index 0000000000000000000000000000000000000000..e8ef3b3ef2c8f38aa2b7b6e571eab57ee1a88c88 Binary files /dev/null and b/assets/MiniCPM-Llama3-V-2.5-peformance.png differ diff --git a/assets/Snake_cn_Mushroom_en.gif b/assets/Snake_cn_Mushroom_en.gif new file mode 100644 index 0000000000000000000000000000000000000000..b59c9f6578ca4695317c7e4639e6599501d9ea66 --- /dev/null +++ b/assets/Snake_cn_Mushroom_en.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41ddd9dabbbb0b244b43ea4c1ff75762f84f2b75c4a7568142653e294a527d8a +size 3587380 diff --git a/assets/Star-History.png b/assets/Star-History.png new file mode 100644 index 0000000000000000000000000000000000000000..4a304105aef1eddbd4bee4f4d0e7024bc684d24b Binary files /dev/null and b/assets/Star-History.png differ diff --git a/assets/airplane.jpeg b/assets/airplane.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..dcfdf43a10497f5b8f47503410cdb820c7b03bdd Binary files /dev/null and b/assets/airplane.jpeg differ diff --git a/assets/demo_video.mp4 b/assets/demo_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..c889d92341319c30dfb2c9f8f0295506c2c35a6d --- /dev/null +++ b/assets/demo_video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bc5bfa395f07b3dfba59e5142a88bba1219b3ff2aa124f8a551ee6867c4c3bb +size 2704175 diff --git a/assets/gif_cases/1-4.gif b/assets/gif_cases/1-4.gif new file mode 100644 index 0000000000000000000000000000000000000000..f284706070ce9d8415f8ee790c0d9e2248249445 --- /dev/null +++ b/assets/gif_cases/1-4.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a862ab0e495bedb4326d56989fb886bca671589810633657723121e7d71b8a6a +size 11393533 diff --git a/assets/gif_cases/Mushroom_en.gif b/assets/gif_cases/Mushroom_en.gif new file mode 100644 index 
0000000000000000000000000000000000000000..f45fe499a3154944ec083dbdd9dac0b865fe238e Binary files /dev/null and b/assets/gif_cases/Mushroom_en.gif differ diff --git a/assets/gif_cases/Mushroom_en_Snake_cn.gif b/assets/gif_cases/Mushroom_en_Snake_cn.gif new file mode 100644 index 0000000000000000000000000000000000000000..b7c68b6f7d739ac3e30616edd54ba311dc18d3b5 --- /dev/null +++ b/assets/gif_cases/Mushroom_en_Snake_cn.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96d591740d0624deacb08f8bec740ee7f76e0506dca00226ff48154af6e0c794 +size 3555151 diff --git a/assets/gif_cases/Snake_en.gif b/assets/gif_cases/Snake_en.gif new file mode 100644 index 0000000000000000000000000000000000000000..047104b1fc006f23b992d26aefbaee7488f2e985 --- /dev/null +++ b/assets/gif_cases/Snake_en.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6118fbac6bec46c54f2cded08bc8ccf9411cdee580d384b70908faee49a368e7 +size 2500076 diff --git a/assets/gif_cases/english_menu.gif b/assets/gif_cases/english_menu.gif new file mode 100644 index 0000000000000000000000000000000000000000..d4ca254425744ed113e82da7995776c4c4d1b692 --- /dev/null +++ b/assets/gif_cases/english_menu.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eb4ea1a179e8a17a5550806efa4fd2957a8d7226daa83a505f510aa3357cc73 +size 5606280 diff --git a/assets/gif_cases/hong_kong_street.gif b/assets/gif_cases/hong_kong_street.gif new file mode 100644 index 0000000000000000000000000000000000000000..c9f27d29a5c2f1dbdd0c3cfb65fb82f7b465fd76 --- /dev/null +++ b/assets/gif_cases/hong_kong_street.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f3f02e8f71445192e9389c9c68576e8131d5b354321997b72bdcce20c66d4e1 +size 5276217 diff --git a/assets/gif_cases/london_car.gif b/assets/gif_cases/london_car.gif new file mode 100644 index 0000000000000000000000000000000000000000..27b5fea9afe3c15bd1d6f9b1b6a476cd66f80c9a --- /dev/null +++ b/assets/gif_cases/london_car.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4556aa4b55a2175ba7e623cc36c2493e50c503d10251684e8f1b6ffb19db41ed +size 7637917 diff --git a/assets/gif_cases/meal_plan.gif b/assets/gif_cases/meal_plan.gif new file mode 100644 index 0000000000000000000000000000000000000000..a6ba06f3e1d2aa988f2b5680afd50d26a75d3106 --- /dev/null +++ b/assets/gif_cases/meal_plan.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69e0640a16d0f631ca9eca3b70ce8a0e262f011145bed52d668a3c6ef246dfea +size 13681181 diff --git a/assets/gif_cases/station.gif b/assets/gif_cases/station.gif new file mode 100644 index 0000000000000000000000000000000000000000..6adda160542707e194f32eaedc437f9f1d25dbf5 --- /dev/null +++ b/assets/gif_cases/station.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7b88efb0040f31992f4069148561d4aeb46ef0cee5e60de4aed4f4be7c3e1be +size 7421661 diff --git a/assets/gif_cases/ticket.gif b/assets/gif_cases/ticket.gif new file mode 100644 index 0000000000000000000000000000000000000000..064a654a82f473ef75208bc4e0c7256ac3c61e91 --- /dev/null +++ b/assets/gif_cases/ticket.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b642bfed88865e49e0a4a39685786fc7553a52edade86ba9e7287c895c9b33f1 +size 14479038 diff --git "a/assets/gif_cases/\350\230\221\350\217\207_cn.gif" "b/assets/gif_cases/\350\230\221\350\217\207_cn.gif" new file mode 100644 index 0000000000000000000000000000000000000000..03d3bf578f59b91b6f5d378ddfd0d85522341d4f --- /dev/null 
+++ "b/assets/gif_cases/\350\230\221\350\217\207_cn.gif" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52e2bda8a18a921ebde7186f33e667a2b9e5c6c33635ecb5e0e39c485051dad7 +size 1325964 diff --git "a/assets/gif_cases/\350\233\207_cn.gif" "b/assets/gif_cases/\350\233\207_cn.gif" new file mode 100644 index 0000000000000000000000000000000000000000..9a05171446b7c79660d52f8e43a3d6357a92cd64 --- /dev/null +++ "b/assets/gif_cases/\350\233\207_cn.gif" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:053d5617482f53c4f85e6a69bcc5569d66a80c3af48b2385b3582845ebb92be6 +size 2644671 diff --git a/assets/hk_OCR.jpg b/assets/hk_OCR.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d27d27cb3ab5bb16d268e68e195852d9b9af9636 Binary files /dev/null and b/assets/hk_OCR.jpg differ diff --git a/assets/llavabench_compare_3.png b/assets/llavabench_compare_3.png new file mode 100644 index 0000000000000000000000000000000000000000..1423333c034a137b7118eaea5625a30a362dcaac Binary files /dev/null and b/assets/llavabench_compare_3.png differ diff --git a/assets/llavabench_compare_phi3.png b/assets/llavabench_compare_phi3.png new file mode 100644 index 0000000000000000000000000000000000000000..b51405933d5e4507dd9ad465bec98417e5d6e75f Binary files /dev/null and b/assets/llavabench_compare_phi3.png differ diff --git a/assets/minicpm-llama-v-2-5_languages.md b/assets/minicpm-llama-v-2-5_languages.md new file mode 100644 index 0000000000000000000000000000000000000000..0eae344accac46e878f7370f3e755e41493032ed --- /dev/null +++ b/assets/minicpm-llama-v-2-5_languages.md @@ -0,0 +1,176 @@ +- English +- 中文 +- 한국어 +- 日本語 +- Deutsch +- Français +- Português +- Español +- မြန်မာဘာသာ +- ไทย +- Tiếng Việt +- Türkçe +- ܣܘܪܝܝܐ +- العربية +- हिन्दी +- বাংলা +- नेपाली +- Türkmençe +- Тоҷикӣ +- Кыргызча +- Русский +- Українська +- Беларуская +- ქართული +- Azərbaycanca +- Հայերեն +- Polski +- Lietuvių +- Eesti +- Latviešu +- Čeština +- Slovenčina +- Magyar +- Slovenščina +- Hrvatski +- Bosanski +- Crnogorski +- Српски +- Shqip +- Română +- Български +- Македонски + + +## 支持语言 + +英语 + +中文 + +韩语 + +日语 + +德语 + +法语 + +葡萄牙语 + +西班牙语 + +缅甸语 + +泰语 + +越南语 + +土耳其语 + +叙利亚语 + +阿拉伯语 + +印地语 + +孟加拉语 + +尼泊尔语 + +土库曼语 + +塔吉克语 + +吉尔吉斯语 + +俄语 + +乌克兰语 + +白俄罗斯语 + +格鲁吉亚语 + +阿塞拜疆语 + +亚美尼亚语 + +波兰语 + +立陶宛语 + +爱沙尼亚语 + +拉脱维亚语 + +捷克语 + +斯洛伐克语 + +匈牙利语 + +斯洛文尼亚语 + +克罗地亚语 + +波斯尼亚语 + +黑山语 + +塞尔维亚语 + +阿尔巴尼亚语 + +罗马尼亚语 + +保加利亚 + +马其顿语 + + + +## Supported Languages + +English +Chinese +Korean +Japanese +German +French +Portuguese +Spanish +Burmese +Thai +Vietnamese +Turkish +Syriac +Arabic +Hindi +Bengali +Nepali +Turkmen +Tajik +Kyrgyz +Russian +Ukrainian +Belarusian +Georgian +Azerbaijani +Armenian +Polish +Lithuanian +Estonian +Latvian +Czech +Slovak +Hungarian +Slovenian +Croatian +Bosnian +Montenegrin +Serbian +Albanian +Romanian +Bulgarian +Macedonian \ No newline at end of file diff --git a/assets/minicpmv-2-peformance.png b/assets/minicpmv-2-peformance.png new file mode 100644 index 0000000000000000000000000000000000000000..1ff226eaa9c1b74bbd7c8e043e2e8ce81804fb68 Binary files /dev/null and b/assets/minicpmv-2-peformance.png differ diff --git a/assets/minicpmv-llama3-v2.5/case_OCR_en.png b/assets/minicpmv-llama3-v2.5/case_OCR_en.png new file mode 100644 index 0000000000000000000000000000000000000000..2aae91bdcd6e60abec9839816a561ae7074e12de --- /dev/null +++ b/assets/minicpmv-llama3-v2.5/case_OCR_en.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:48895d41723873d46eb3c6ab966afc7dc41dd3cf9083c56e220eb38f19c85f92 +size 5872318 diff --git a/assets/minicpmv-llama3-v2.5/case_complex_reasoning.png b/assets/minicpmv-llama3-v2.5/case_complex_reasoning.png new file mode 100644 index 0000000000000000000000000000000000000000..f36f8eeb5299094e8976ab13ba4e50a342ba7a86 --- /dev/null +++ b/assets/minicpmv-llama3-v2.5/case_complex_reasoning.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59172f5487c203d0a112cde1c14f84a9db9776f37703b8c1db7b1df494cba85d +size 1730120 diff --git a/assets/minicpmv-llama3-v2.5/case_information_extraction.png b/assets/minicpmv-llama3-v2.5/case_information_extraction.png new file mode 100644 index 0000000000000000000000000000000000000000..fb0e5caf7c53f15482fc74534988624ea27a6671 --- /dev/null +++ b/assets/minicpmv-llama3-v2.5/case_information_extraction.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18f5cfecf5c997b4b6253dbbe1d85cd5851abd2e4108fce0fc8778fd8c2a207b +size 2650724 diff --git a/assets/minicpmv-llama3-v2.5/case_long_img.png b/assets/minicpmv-llama3-v2.5/case_long_img.png new file mode 100644 index 0000000000000000000000000000000000000000..741174566d84c768fefd10bfffb806370aa28c9a --- /dev/null +++ b/assets/minicpmv-llama3-v2.5/case_long_img.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9294bcc3f002c74046c7626a34d766959a19548d08b7408259e6fb9b126d51d +size 3398186 diff --git a/assets/minicpmv-llama3-v2.5/case_markdown.png b/assets/minicpmv-llama3-v2.5/case_markdown.png new file mode 100644 index 0000000000000000000000000000000000000000..78165c2f528be5ea4fa7521cfdc176c139111f42 --- /dev/null +++ b/assets/minicpmv-llama3-v2.5/case_markdown.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:745359d2a7779b3997311e08b4b90785c75af0a33d039273935661858c06614a +size 1803633 diff --git a/assets/minicpmv-llama3-v2.5/cases_all.png b/assets/minicpmv-llama3-v2.5/cases_all.png new file mode 100644 index 0000000000000000000000000000000000000000..14b886d8ffdcd7d32ceeeca4485117153c9d1edb --- /dev/null +++ b/assets/minicpmv-llama3-v2.5/cases_all.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f8f26e235dec760f4b0d7184462d17bfd095ffbe0b1dfee5c659a5aa8f9a4d7 +size 13152835 diff --git a/assets/minicpmv-llama3-v2.5/temp b/assets/minicpmv-llama3-v2.5/temp new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/assets/minicpmv-llama3-v2.5/temp @@ -0,0 +1 @@ + diff --git a/assets/minicpmv-omnilmm.png b/assets/minicpmv-omnilmm.png new file mode 100644 index 0000000000000000000000000000000000000000..c8da16509d4964798a6be07ff14d894044d89f75 Binary files /dev/null and b/assets/minicpmv-omnilmm.png differ diff --git a/assets/minicpmv.png b/assets/minicpmv.png new file mode 100644 index 0000000000000000000000000000000000000000..3f4cedb07d48f40b1325b1ff49336d5445a85101 Binary files /dev/null and b/assets/minicpmv.png differ diff --git a/assets/minicpmv2-cases.png b/assets/minicpmv2-cases.png new file mode 100644 index 0000000000000000000000000000000000000000..a04aab0b961de03efc53e902a71ac0b89d9ac404 --- /dev/null +++ b/assets/minicpmv2-cases.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3bf31d9884742309d9d88bc79593215648d4f351599f67822471917c9c7d85f +size 16536092 diff --git a/assets/minicpmv2-cases_1.png b/assets/minicpmv2-cases_1.png new file mode 100644 index 
0000000000000000000000000000000000000000..8bd841fbfa65b123bd44a28ff06f7bd6cebae616 --- /dev/null +++ b/assets/minicpmv2-cases_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9ef4cadfdf31c3506d4be1ca1340c66581e897a63b1cf44d1b5bf94be70e304 +size 16491313 diff --git a/assets/minicpmv2-cases_2.png b/assets/minicpmv2-cases_2.png new file mode 100644 index 0000000000000000000000000000000000000000..e6ae72d5cdc0dadf7fea1df2eb0a0e19c68bb894 --- /dev/null +++ b/assets/minicpmv2-cases_2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a42538366108371fa1b67db58e908345eee15511801a806b2f42071e398e0f8c +size 19156882 diff --git a/assets/modelbest.png b/assets/modelbest.png new file mode 100644 index 0000000000000000000000000000000000000000..5e7ee61f9721149d01d559310b7ba1f00d917ed4 Binary files /dev/null and b/assets/modelbest.png differ diff --git a/assets/modelscope_logo.png b/assets/modelscope_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..0286d28f488f6425c9671d2b0ec81db56aede2f8 Binary files /dev/null and b/assets/modelscope_logo.png differ diff --git a/assets/omnilmm-12b-examples.png b/assets/omnilmm-12b-examples.png new file mode 100644 index 0000000000000000000000000000000000000000..760d549569b476eed14185f8d41a80630695ec3b --- /dev/null +++ b/assets/omnilmm-12b-examples.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5371b074618b6b6c89c845831a02bf4591fc39dad16df483e15a1904ea266d1c +size 5314232 diff --git a/assets/omnilmm-12b-examples_2.pdf b/assets/omnilmm-12b-examples_2.pdf new file mode 100644 index 0000000000000000000000000000000000000000..366b4e03f770ba31637351f2974b33a5b639abd7 Binary files /dev/null and b/assets/omnilmm-12b-examples_2.pdf differ diff --git a/assets/omnilmm-12b-examples_2.png b/assets/omnilmm-12b-examples_2.png new file mode 100644 index 0000000000000000000000000000000000000000..1c4c792fbea05b230a8f35d9db6bce508fff630a --- /dev/null +++ b/assets/omnilmm-12b-examples_2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67ab0be2b9b8b5a2eff4ab7a2bc4482326560d189ac9196eb4bd314df18aa918 +size 5915606 diff --git a/assets/omnilmm-12b-examples_2_00.jpg b/assets/omnilmm-12b-examples_2_00.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d13830c2f92e64fad549388007f8152b54a11480 --- /dev/null +++ b/assets/omnilmm-12b-examples_2_00.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a5eca53f0c298edb6aa2dc783043002edcbc91caf7323df121cb31bb1cad49a +size 2787481 diff --git a/assets/omnilmm-12b-examples_3.png b/assets/omnilmm-12b-examples_3.png new file mode 100644 index 0000000000000000000000000000000000000000..62b14de26249fbd4271680daf63d0875eb05bbeb --- /dev/null +++ b/assets/omnilmm-12b-examples_3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b32941f757c5390206b2fd9ffbc99534bbf13bd05b225e157afc331139e3c2e +size 6542021 diff --git a/assets/phi3_vision_comparison.jpg b/assets/phi3_vision_comparison.jpg new file mode 100644 index 0000000000000000000000000000000000000000..00dfe2e8a010ad3a966cb2ef1f95fe39a86fb618 Binary files /dev/null and b/assets/phi3_vision_comparison.jpg differ diff --git a/assets/radar_omnilmm12b.png b/assets/radar_omnilmm12b.png new file mode 100644 index 0000000000000000000000000000000000000000..35a2601447773fcec586031c2b554e6759946754 Binary files /dev/null and b/assets/radar_omnilmm12b.png differ diff --git a/assets/thunlp.png b/assets/thunlp.png new 
file mode 100644 index 0000000000000000000000000000000000000000..85f512884e2b1441ec7617be2f0c35126e0dab0b Binary files /dev/null and b/assets/thunlp.png differ diff --git a/assets/title-1.png b/assets/title-1.png new file mode 100644 index 0000000000000000000000000000000000000000..1fb78d38cd7138580f4351079123f1cb48b7d2f4 Binary files /dev/null and b/assets/title-1.png differ diff --git a/assets/title-2.png b/assets/title-2.png new file mode 100644 index 0000000000000000000000000000000000000000..b7a9ea2df93b19cf20fff0b14adf1725cbddde9b Binary files /dev/null and b/assets/title-2.png differ diff --git a/assets/title-3.png b/assets/title-3.png new file mode 100644 index 0000000000000000000000000000000000000000..694f11dfdf60d6c1c8593427686e0a8da28f34b9 Binary files /dev/null and b/assets/title-3.png differ diff --git a/assets/worldmap_ck.jpg b/assets/worldmap_ck.jpg new file mode 100644 index 0000000000000000000000000000000000000000..dce3c0abb53e9b914cccfe621f767c8e83c87121 Binary files /dev/null and b/assets/worldmap_ck.jpg differ diff --git a/assets/zhihu.webp b/assets/zhihu.webp new file mode 100644 index 0000000000000000000000000000000000000000..33d3dffc86a04a4f97250223d57fe88fc367317c Binary files /dev/null and b/assets/zhihu.webp differ diff --git a/chat.py b/chat.py new file mode 100644 index 0000000000000000000000000000000000000000..8dbf8ef01a98a2afd6e77407653aec41fbb70fb9 --- /dev/null +++ b/chat.py @@ -0,0 +1,219 @@ +import os +import torch +import json +from PIL import Image +import base64 +import io +from accelerate import load_checkpoint_and_dispatch, init_empty_weights +from transformers import AutoTokenizer, AutoModel + +from omnilmm.utils import disable_torch_init +from omnilmm.model.omnilmm import OmniLMMForCausalLM +from omnilmm.model.utils import build_transform +from omnilmm.train.train_utils import omni_preprocess + +DEFAULT_IMAGE_TOKEN = "<image>" +DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" +DEFAULT_IM_START_TOKEN = "<im_start>" +DEFAULT_IM_END_TOKEN = "<im_end>" + + + +def init_omni_lmm(model_path): + torch.backends.cuda.matmul.allow_tf32 = True + disable_torch_init() + model_name = os.path.expanduser(model_path) + print(f'Load omni_lmm model and tokenizer from {model_name}') + tokenizer = AutoTokenizer.from_pretrained( + model_name, model_max_length=2048) + + if False: + # model on multiple devices for small size gpu memory (Nvidia 3090 24G x2) + with init_empty_weights(): + model = OmniLMMForCausalLM.from_pretrained(model_name, tune_clip=True, torch_dtype=torch.bfloat16) + model = load_checkpoint_and_dispatch(model, model_name, dtype=torch.bfloat16, + device_map="auto", no_split_module_classes=['Eva','MistralDecoderLayer', 'ModuleList', 'Resampler'] + ) + else: + model = OmniLMMForCausalLM.from_pretrained( + model_name, tune_clip=True, torch_dtype=torch.bfloat16 + ).to(device='cuda', dtype=torch.bfloat16) + + image_processor = build_transform( + is_train=False, input_size=model.model.config.image_size, std_mode='OPENAI_CLIP') + + mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) + assert mm_use_im_start_end + + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, + DEFAULT_IM_END_TOKEN], special_tokens=True) + + + vision_config = model.model.vision_config + vision_config.im_patch_token = tokenizer.convert_tokens_to_ids( + [DEFAULT_IMAGE_PATCH_TOKEN])[0] + vision_config.use_im_start_end = mm_use_im_start_end + vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids( + [DEFAULT_IM_START_TOKEN, 
DEFAULT_IM_END_TOKEN]) + image_token_len = model.model.config.num_query + + return model, image_processor, image_token_len, tokenizer + +def expand_question_into_multimodal(question_text, image_token_len, im_st_token, im_ed_token, im_patch_token): + if '<image>' in question_text[0]['content']: + question_text[0]['content'] = question_text[0]['content'].replace( + '<image>', im_st_token + im_patch_token * image_token_len + im_ed_token) + else: + question_text[0]['content'] = im_st_token + im_patch_token * \ + image_token_len + im_ed_token + '\n' + question_text[0]['content'] + return question_text + +def wrap_question_for_omni_lmm(question, image_token_len, tokenizer): + question = expand_question_into_multimodal( + question, image_token_len, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN) + + conversation = question + data_dict = omni_preprocess(sources=[conversation], + tokenizer=tokenizer, + generation=True) + + data_dict = dict(input_ids=data_dict["input_ids"][0], + labels=data_dict["labels"][0]) + return data_dict + + + +class OmniLMM12B: + def __init__(self, model_path) -> None: + model, img_processor, image_token_len, tokenizer = init_omni_lmm(model_path) + self.model = model + self.image_token_len = image_token_len + self.image_transform = img_processor + self.tokenizer = tokenizer + self.model.eval() + + def decode(self, image, input_ids): + with torch.inference_mode(): + output = self.model.generate_vllm( + input_ids=input_ids.unsqueeze(0).cuda(), + images=image.unsqueeze(0).half().cuda(), + temperature=0.6, + max_new_tokens=1024, + # num_beams=num_beams, + do_sample=True, + output_scores=True, + return_dict_in_generate=True, + repetition_penalty=1.1, + top_k=30, + top_p=0.9, + ) + + response = self.tokenizer.decode( + output.sequences[0], skip_special_tokens=True) + response = response.strip() + return response + + def chat(self, input): + try: + image = Image.open(io.BytesIO(base64.b64decode(input['image']))).convert('RGB') + except Exception as e: + return "Image decode error" + + msgs = json.loads(input['question']) + input_ids = wrap_question_for_omni_lmm( + msgs, self.image_token_len, self.tokenizer)['input_ids'] + input_ids = torch.as_tensor(input_ids) + #print('input_ids', input_ids) + image = self.image_transform(image) + + out = self.decode(image, input_ids) + + return out + + +def img2base64(file_name): + with open(file_name, 'rb') as f: + encoded_string = base64.b64encode(f.read()) + return encoded_string + +class MiniCPMV: + def __init__(self, model_path) -> None: + self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(dtype=torch.bfloat16) + self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + self.model.eval().cuda() + + def chat(self, input): + try: + image = Image.open(io.BytesIO(base64.b64decode(input['image']))).convert('RGB') + except Exception as e: + return "Image decode error" + + msgs = json.loads(input['question']) + + answer, context, _ = self.model.chat( + image=image, + msgs=msgs, + context=None, + tokenizer=self.tokenizer, + sampling=True, + temperature=0.7 + ) + return answer + +class MiniCPMV2_5: + def __init__(self, model_path) -> None: + self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(dtype=torch.float16) + self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + self.model.eval().cuda() + + def chat(self, input): + try: + image = Image.open(io.BytesIO(base64.b64decode(input['image']))).convert('RGB') + except 
Exception as e: + return "Image decode error" + + msgs = json.loads(input['question']) + + answer = self.model.chat( + image=image, + msgs=msgs, + tokenizer=self.tokenizer, + sampling=True, + temperature=0.7 + ) + return answer + + +class MiniCPMVChat: + def __init__(self, model_path) -> None: + if '12B' in model_path: + self.model = OmniLMM12B(model_path) + elif 'MiniCPM-Llama3-V' in model_path: + self.model = MiniCPMV2_5(model_path) + else: + self.model = MiniCPMV(model_path) + + def chat(self, input): + return self.model.chat(input) + + +if __name__ == '__main__': + + model_path = 'openbmb/OmniLMM-12B' + chat_model = MiniCPMVChat(model_path) + + im_64 = img2base64('./assets/worldmap_ck.jpg') + + # first round chat + msgs = [{"role": "user", "content": "What is interesting about this image?"}] + input = {"image": im_64, "question": json.dumps(msgs, ensure_ascii=True)} + answer = chat_model.chat(input) + print(msgs[-1]["content"]+'\n', answer) + + # second round chat + msgs.append({"role": "assistant", "content": answer}) + msgs.append({"role": "user", "content": "Where is China in the image"}) + input = {"image": im_64,"question": json.dumps(msgs, ensure_ascii=True)} + answer = chat_model.chat(input) + print(msgs[-1]["content"]+'\n', answer) + diff --git a/docs/compare_with_phi-3_vision.md b/docs/compare_with_phi-3_vision.md new file mode 100644 index 0000000000000000000000000000000000000000..1edab3e49f6996a0b34e5997e5df35c4ffc069ab --- /dev/null +++ b/docs/compare_with_phi-3_vision.md @@ -0,0 +1,27 @@ +## Phi-3-vision-128K-Instruct vs MiniCPM-Llama3-V 2.5 + +Comparison results of Phi-3-vision-128K-Instruct and MiniCPM-Llama3-V 2.5, regarding model size, hardware requirements, and performance. +With int4 quantization, MiniCPM-Llama3-V 2.5 delivers **smooth inference with only 8GB of GPU memory**. In most benchmarks, MiniCPM-Llama3-V 2.5 achieves **better performance** compared with Phi-3-vision-128K-Instruct. Moreover, MiniCPM-Llama3-V 2.5 also exhibits **lower latency and better throughput even without quantization**. + +我们提供了从模型参数、硬件需求、性能指标等方面对比 Phi-3-vision-128K-Instruct 和 MiniCPM-Llama3-V 2.5 的结果。通过 int4 量化,MiniCPM-Llama3-V 2.5 **仅需 8GB 显存即可推理**。在大多数评测集上, MiniCPM-Llama3-V 2.5 相比于 Phi-3-vision-128K-Instruct 都展现出了**更优的性能表现**。 即使未经量化,MiniCPM-Llama3-V 2.5 的**推理延迟和吞吐率也都更具优势**。 + +<div align="center"> + <img src="../assets/phi3_vision_comparison.jpg" width="85%" /> +</div> + + + +### Multilingual Capabilities(多语言能力对比) + + +MiniCPM-Llama3-V 2.5 exhibits **stronger multilingual** capabilities compared with Phi-3-vision-128K-Instruct on LLaVA Bench. + +MiniCPM-Llama3-V 2.5 在对话和推理评测榜单 LLaVA Bench 上展现出了比 Phi-3-vision-128K-Instruct **更强的多语言的性能**。 + +<div align="center"> + <img src="../assets/llavabench_compare_phi3.png" width="100%" /> + <br> + Evaluation results of multilingual LLaVA Bench + <br> + 多语言LLaVA Bench评测结果 +</div> diff --git a/eval_mm/README.md b/eval_mm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c094716dea2aa179437d6a2c9508c21c40472f85 --- /dev/null +++ b/eval_mm/README.md @@ -0,0 +1,177 @@ +# Evaluation + +## opencompass +First, enter the `vlmevalkit` directory and install all dependencies: +```bash +cd vlmevalkit +pip install -r requirements.txt +``` +<br /> + +Then, run `script/run_inference.sh`, which receives three input parameters in sequence: `MODELNAME`, `DATALIST`, and `MODE`. 
`MODELNAME` represents the name of the model, `DATALIST` represents the datasets used for inference, and `MODE` represents the evaluation mode: +```bash +chmod +x ./script/run_inference.sh +./script/run_inference.sh $MODELNAME $DATALIST $MODE +``` +<br /> + +The three available choices for `MODELNAME` are listed in `vlmeval/config.py`: +```bash +ungrouped = { + 'MiniCPM-V':partial(MiniCPM_V, model_path='openbmb/MiniCPM-V'), + 'MiniCPM-V-2':partial(MiniCPM_V, model_path='openbmb/MiniCPM-V-2'), + 'MiniCPM-Llama3-V-2_5':partial(MiniCPM_Llama3_V, model_path='openbmb/MiniCPM-Llama3-V-2_5'), +} +``` +<br /> + +All available choices for `DATALIST` are listed in `vlmeval/utils/dataset_config.py`. When evaluating on a single dataset, pass the dataset name directly without quotation marks; when evaluating on multiple datasets, separate the dataset names with spaces and wrap the whole list in quotation marks: +```bash +DATALIST="POPE ScienceQA_TEST ChartQA_TEST" +``` +<br /> + +To score each benchmark directly, set `MODE=all`. If only inference results are required, set `MODE=infer`. To reproduce the results in the table displayed on the homepage (columns from MME to RealWorldQA), run the script with the following settings: +```bash +# run on all 7 datasets +./script/run_inference.sh MiniCPM-Llama3-V-2_5 "MME MMBench_TEST_EN MMBench_TEST_CN MMMU_DEV_VAL MathVista_MINI LLaVABench RealWorldQA" all + +# The following are instructions for running on a single dataset +# MME +./script/run_inference.sh MiniCPM-Llama3-V-2_5 MME all +# MMBench_TEST_EN +./script/run_inference.sh MiniCPM-Llama3-V-2_5 MMBench_TEST_EN all +# MMBench_TEST_CN +./script/run_inference.sh MiniCPM-Llama3-V-2_5 MMBench_TEST_CN all +# MMMU_DEV_VAL +./script/run_inference.sh MiniCPM-Llama3-V-2_5 MMMU_DEV_VAL all +# MathVista_MINI +./script/run_inference.sh MiniCPM-Llama3-V-2_5 MathVista_MINI all +# LLaVABench +./script/run_inference.sh MiniCPM-Llama3-V-2_5 LLaVABench all +# RealWorldQA +./script/run_inference.sh MiniCPM-Llama3-V-2_5 RealWorldQA all +``` +<br /> + +## vqadataset +First, enter the `vqaeval` directory and install all dependencies. Then, create a `downloads` subdirectory to store the downloaded datasets for all tasks: +```bash +cd vqaeval +pip install -r requirements.txt +mkdir downloads +``` +<br /> + +Download the datasets from the following links and place them in the specified directories: +###### TextVQA +```bash +cd downloads +mkdir TextVQA && cd TextVQA +wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip +unzip train_val_images.zip && rm train_val_images.zip +mv train_val_images/train_images . && rm -rf train_val_images +wget https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json +cd ../.. +``` + +###### DocVQA / DocVQATest + +```bash +cd downloads +mkdir DocVQA && cd DocVQA && mkdir spdocvqa_images +# Download Images and Annotations from Task 1 - Single Page Document Visual Question Answering at https://rrc.cvc.uab.es/?ch=17&com=downloads +# Move the spdocvqa_images.tar.gz and spdocvqa_qas.zip to the DocVQA directory +tar -zxvf spdocvqa_images.tar.gz -C spdocvqa_images && rm spdocvqa_images.tar.gz +unzip spdocvqa_qas.zip && rm spdocvqa_qas.zip +cp spdocvqa_qas/val_v1.0_withQT.json . && cp spdocvqa_qas/test_v1.0.json . && rm -rf spdocvqa_qas +cd ../.. +``` +<br /> + +The `downloads` directory should be organized according to the following structure: +```bash +downloads +├── TextVQA +│ ├── train_images +│ │ ├── ... 
+│ ├── TextVQA_0.5.1_val.json +├── DocVQA +│ ├── spdocvqa_images +│ │ ├── ... +│ ├── val_v1.0_withQT.json +│ ├── test_v1.0.json +``` +<br /> + +Modify the parameters in `shell/run_inference.sh` and run inference: + +```bash +chmod +x ./shell/run_inference.sh +./shell/run_inference.sh +``` +<br /> + +All optional parameters are listed in `eval_utils/getargs.py`. The meanings of some major parameters are listed as follows: +```bash +# path to images and their corresponding questions +# TextVQA +--textVQA_image_dir +--textVQA_ann_path +# DocVQA +--docVQA_image_dir +--docVQA_ann_path +# DocVQATest +--docVQATest_image_dir +--docVQATest_ann_path + +# whether to eval on certain task +--eval_textVQA +--eval_docVQA +--eval_docVQATest +--eval_all + +# model name and model path +--model_name +--model_path +# load model from ckpt +--ckpt +# the way the model processes input data, "interleave" represents interleaved image-text form, while "old" represents non-interleaved. +--generate_method + +--batchsize + +# path to save the outputs +--answer_path +``` +<br /> + +While evaluating on different tasks, parameters need to be set as follows: +###### TextVQA +```bash +--eval_textVQA +--textVQA_image_dir ./downloads/TextVQA/train_images +--textVQA_ann_path ./downloads/TextVQA/TextVQA_0.5.1_val.json +``` + +###### DocVQA +```bash +--eval_docVQA +--docVQA_image_dir ./downloads/DocVQA/spdocvqa_images +--docVQA_ann_path ./downloads/DocVQA/val_v1.0_withQT.json +``` + +###### DocVQATest +```bash +--eval_docVQATest +--docVQATest_image_dir ./downloads/DocVQA/spdocvqa_images +--docVQATest_ann_path ./downloads/DocVQA/test_v1.0.json +``` + +<br /> + +For the DocVQATest task, in order to upload the inference results to the [official website](https://rrc.cvc.uab.es/?ch=17) for evaluation, run `shell/run_transform.sh` for format transformation after inference. 
`input_file_path` represents the path to the original output json, `output_file_path` represents the path to the transformed json: +```bash +chmod +x ./shell/run_transform.sh +./shell/run_transform.sh +``` \ No newline at end of file diff --git a/eval_mm/README_zh.md b/eval_mm/README_zh.md new file mode 100644 index 0000000000000000000000000000000000000000..58e0288a3a348f6ec34a8363b81d89106a98f620 --- /dev/null +++ b/eval_mm/README_zh.md @@ -0,0 +1,175 @@ +# Evaluation + +## opencompass +首先,进入 `vlmevalkit` 目录下,安装必要的依赖: +```bash +cd vlmevalkit +pip install -r requirements.txt +``` +<br /> + +然后,运行 `script/run_inference.sh`,该脚本依次接收三个输入参数:`MODELNAME`, `DATALIST`, `MODE`。`MODELNAME` 为模型名称,`DATALIST` 为目标数据集,`MODE` 为评测模式。 +```bash +chmod +x ./script/run_inference.sh +./script/run_inference.sh $MODELNAME $DATALIST $MODE +``` +<br /> + +`MODELNAME` 有三种选择,位于 `vlmeval/config.py` 中: +```bash +ungrouped = { + 'MiniCPM-V':partial(MiniCPM_V, model_path='openbmb/MiniCPM-V'), + 'MiniCPM-V-2':partial(MiniCPM_V, model_path='openbmb/MiniCPM-V-2'), + 'MiniCPM-Llama3-V-2_5':partial(MiniCPM_Llama3_V, model_path='openbmb/MiniCPM-Llama3-V-2_5'), +} +``` +<br /> + +可选的所有 `DATALIST` 位于 `vlmeval/utils/dataset_config.py` 中,评测单个数据集时,直接调用数据集名称,不加引号;评测多个数据集时,将不同数据集名称以空格隔开,两端加引号: +```bash +$DATALIST="POPE ScienceQA_TEST ChartQA_TEST" +``` +<br /> + +直接对各 benchmark 进行评分时,设置 `MODE=all`。如果仅需要推理结果,则设置 `MODE=infer` +为了复现出首页展示的表格中的各项结果(MME 到 RealWorldQA 之间的列),需要按照如下设置运行: +```bash +# 一次性运行 7 个数据集 +./script/run_inference.sh MiniCPM-Llama3-V-2_5 "MME MMBench_TEST_EN MMBench_TEST_CN MMMU_DEV_VAL MathVista_MINI LLaVABench RealWorldQA" all + +# 以下是单独运行 1 个数据集的指令 +# MME +./script/run_inference.sh MiniCPM-Llama3-V-2_5 MME all +# MMBench_TEST_EN +./script/run_inference.sh MiniCPM-Llama3-V-2_5 MMBench_TEST_EN all +# MMBench_TEST_CN +./script/run_inference.sh MiniCPM-Llama3-V-2_5 MMBench_TEST_CN all +# MMMU_DEV_VAL +./script/run_inference.sh MiniCPM-Llama3-V-2_5 MMMU_DEV_VAL all +# MathVista_MINI +./script/run_inference.sh MiniCPM-Llama3-V-2_5 MathVista_MINI all +# LLaVABench +./script/run_inference.sh MiniCPM-Llama3-V-2_5 LLaVABench all +# RealWorldQA +./script/run_inference.sh MiniCPM-Llama3-V-2_5 RealWorldQA all +``` +<br /> + +## vqadataset +首先,进入 `vqaeval` 目录下,安装必要的依赖,并创建 `downloads` 子目录,用于存储下载的数据集: +```bash +cd vqaeval +pip install -r requirements.txt +mkdir downloads +``` +<br /> + +然后,从下列各地址下载数据集并置于指定目录下: +###### TextVQA +```bash +cd downloads +mkdir TextVQA && cd TextVQA +wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip +unzip train_val_images.zip && rm train_val_images.zip +mv train_val_images/train_images . && rm -rf train_val_images +wget https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json +cd ../.. +``` + +###### DocVQA / DocVQATest +```bash +cd downloads +mkdir DocVQA && cd DocVQA && mkdir spdocvqa_images +# 在 https://rrc.cvc.uab.es/?ch=17&com=downloads 下载 Task 1 - Single Page Document Visual Question Answering 下的 Images 和 Annotations +# 将下载得到的 spdocvqa_images.tar.gz 以及 spdocvqa_qas.zip 置于 DocVQA 目录下 +tar -zxvf spdocvqa_images.tar.gz -C spdocvqa_images && rm spdocvqa_images.tar.gz +unzip spdocvqa_qas.zip && rm spdocvqa_qas.zip +cp spdocvqa_qas/val_v1.0_withQT.json . && cp spdocvqa_qas/test_v1.0.json . && rm -rf spdocvqa_qas +cd ../.. +``` +<br /> + +`downloads` 目录应当按照下列结构组织: +```bash +downloads +├── TextVQA +│ ├── train_images +│ │ ├── ... +│ ├── TextVQA_0.5.1_val.json +├── DocVQA +│ ├── spdocvqa_images +│ │ ├── ... 
+│ ├── val_v1.0_withQT.json +│ ├── test_v1.0.json +``` +<br /> + +准备好相应的数据集之后,修改 `shell/run_inference.sh` 的参数,运行推理: + +```bash +chmod +x ./shell/run_inference.sh +./shell/run_inference.sh +``` +<br /> + +可以传入的参数位于 `eval_utils/getargs.py` 中,各主要参数的含义如下: +```bash +# 指定 TextVQA 评测所有图片和问题的路径 +--textVQA_image_dir +--textVQA_ann_path +# 指定 DocVQA 评测所有图片和问题的路径 +--docVQA_image_dir +--docVQA_ann_path +# 指定 DocVQATest 评测所有图片和问题的路径 +--docVQATest_image_dir +--docVQATest_ann_path + +# 决定是否评测某个任务,eval_all 设置为 True 表示所有任务都评测 +--eval_textVQA +--eval_docVQA +--eval_docVQATest +--eval_all + +# 模型名称、模型路径(从指定路径加载模型) +--model_name +--model_path +# 从 checkpoint 加载模型 +--ckpt +# 模型处理输入数据的方式,interleave 表示图文交错式,old 表示非交错式 +--generate_method +# 推理时的批处理规模,建议推理时设置为 1 +--batchsize + +# 输出内容保存的路径 +--answer_path +``` +<br /> + +评测三个任务需要设置的参数如下: +###### TextVQA +```bash +--eval_textVQA +--textVQA_image_dir ./downloads/TextVQA/train_images +--textVQA_ann_path ./downloads/TextVQA/TextVQA_0.5.1_val.json +``` + +###### DocVQA +```bash +--eval_docVQA +--docVQA_image_dir ./downloads/DocVQA/spdocvqa_images +--docVQA_ann_path ./downloads/DocVQA/val_v1.0_withQT.json +``` + +###### DocVQATest +```bash +--eval_docVQATest +--docVQATest_image_dir ./downloads/DocVQA/spdocvqa_images +--docVQATest_ann_path ./downloads/DocVQA/test_v1.0.json +``` +<br /> + +对于 DocVQATest 任务,为了将推理结果上传到[官方网站](https://rrc.cvc.uab.es/?ch=17)进行评测,还需要运行 `shell/run_transform.sh` 进行格式转换。其中,`input_file_path` 对应原始输出的 json 的路径,`output_file_path` 为自定义的转换后的 json 的路径: +```bash +chmod +x ./shell/run_transform.sh +./shell/run_transform.sh +``` \ No newline at end of file diff --git a/eval_mm/vlmevalkit/requirements.txt b/eval_mm/vlmevalkit/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..5bebbeec3f596a819e9340126d73f82a0d9d25d3 --- /dev/null +++ b/eval_mm/vlmevalkit/requirements.txt @@ -0,0 +1,33 @@ +einops +gradio==4.15.0 +huggingface_hub +matplotlib +numpy>=1.23.4 +omegaconf +openai==1.3.5 +opencv-python>=4.4.0.46 +openpyxl +pandas>=1.5.3 +pillow +portalocker +protobuf +pycocoevalcap +python-dotenv +requests +rich +seaborn +sentencepiece +sty +tabulate +tiktoken +timeout-decorator +tqdm +typing_extensions==4.7.1 +validators +visual_genome +xlsxwriter +Pillow==10.1.0 +sentencepiece==0.1.99 +transformers==4.40.0 +torch==1.13.1 +torchvision diff --git a/eval_mm/vlmevalkit/run.py b/eval_mm/vlmevalkit/run.py new file mode 100644 index 0000000000000000000000000000000000000000..c30cbd044fd975a39bc88cffff77ecc7d9ea5e53 --- /dev/null +++ b/eval_mm/vlmevalkit/run.py @@ -0,0 +1,149 @@ +import torch +import torch.distributed as dist +from vlmeval.smp import * +from vlmeval.evaluate import * +from vlmeval.inference import infer_data_job +from vlmeval.config import supported_VLM +from vlmeval.utils import dataset_URLs, DATASET_TYPE, abbr2full, MMMU_result_transfer + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--data', type=str, nargs='+', required=True) + parser.add_argument('--model', type=str, nargs='+', required=True) + parser.add_argument('--work-dir', type=str, default='.', help='select the output directory') + parser.add_argument('--mode', type=str, default='all', choices=['all', 'infer']) + parser.add_argument('--nproc', type=int, default=4, help='Parallel API calling') + parser.add_argument('--retry', type=int, default=None, help='retry numbers for API VLMs') + parser.add_argument('--judge', type=str, default=None) + parser.add_argument('--ignore', action='store_true', help='Ignore failed indices. 
') + parser.add_argument('--verbose', action='store_true') + parser.add_argument('--rerun', action='store_true') + args = parser.parse_args() + return args + + +def main(): + logger = get_logger('RUN') + + args = parse_args() + assert len(args.data), '--data should be a list of data files' + + if args.retry is not None: + for k, v in supported_VLM.items(): + if hasattr(v, 'keywords') and 'retry' in v.keywords: + v.keywords['retry'] = args.retry + supported_VLM[k] = v + if hasattr(v, 'keywords') and 'verbose' in v.keywords: + v.keywords['verbose'] = args.verbose + supported_VLM[k] = v + + rank, world_size = get_rank_and_world_size() + if world_size > 1: + local_rank = os.environ.get('LOCAL_RANK', 0) + torch.cuda.set_device(int(local_rank)) + dist.init_process_group(backend='nccl', timeout=datetime.timedelta(seconds=10800)) + + for _, model_name in enumerate(args.model): + model = None + + pred_root = osp.join(args.work_dir, model_name) + os.makedirs(pred_root, exist_ok=True) + + for _, dataset_name in enumerate(args.data): + custom_flag = False + + if dataset_name not in dataset_URLs: + dataset_name = abbr2full(dataset_name) + + if dataset_name not in dataset_URLs: + logger.warning(f'Dataset {dataset_name} is not officially supported. ') + file_path = osp.join(LMUDataRoot(), f'{dataset_name}.tsv') + if not osp.exists(file_path): + logger.error(f'Cannot find the local dataset {dataset_name}. ') + continue + else: + custom_flag = True + + result_file = f'{pred_root}/{model_name}_{dataset_name}.xlsx' + if osp.exists(result_file) and args.rerun: + os.system(f'rm {pred_root}/{model_name}_{dataset_name}_*') + + if model is None: + model = model_name # which is only a name + + model = infer_data_job( + model, + work_dir=pred_root, + model_name=model_name, + dataset_name=dataset_name, + verbose=args.verbose, + api_nproc=args.nproc, + ignore_failed=args.ignore) + + if rank == 0: + if dataset_name in ['MMMU_TEST']: + result_json = MMMU_result_transfer(result_file) + logger.info(f'Transfer MMMU_TEST result to json for official evaluation, json file saved in {result_json}') # noqa: E501 + continue + + if dataset_name in [ + 'MMBench_TEST_CN', 'MMBench_TEST_EN', 'MMBench', 'MMBench_CN', + 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_V11', 'MMBench_CN_V11' + ]: + if not MMBenchOfficialServer(dataset_name): + logger.error( + f'Can not evaluate {dataset_name} on non-official servers, ' + 'will skip the evaluation. 
' + ) + continue + + judge_kwargs = { + 'nproc': args.nproc, + 'verbose': args.verbose, + } + if args.retry is not None: + judge_kwargs['retry'] = args.retry + if args.judge is not None: + judge_kwargs['model'] = args.judge + else: + if DATASET_TYPE(dataset_name) in ['multi-choice', 'Y/N']: + judge_kwargs['model'] = 'chatgpt-0613' + elif listinstr(['MMVet', 'MathVista', 'LLaVABench'], dataset_name): + judge_kwargs['model'] = 'gpt-4-turbo' + if 'OPENAI_API_KEY_JUDGE' in os.environ and len(os.environ['OPENAI_API_KEY_JUDGE']): + judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE'] + if 'OPENAI_API_BASE_JUDGE' in os.environ and len(os.environ['OPENAI_API_BASE_JUDGE']): + judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE'] + + if rank == 0 and args.mode == 'all': + if DATASET_TYPE(dataset_name) == 'multi-choice': + dataset_name = 'default' if custom_flag else dataset_name + multiple_choice_eval( + result_file, + dataset=dataset_name, + **judge_kwargs) + elif DATASET_TYPE(dataset_name) == 'Y/N': + YOrN_eval( + result_file, + dataset=dataset_name, + **judge_kwargs) + elif DATASET_TYPE(dataset_name) == 'Caption': + COCO_eval(result_file) + elif dataset_name == 'MMVet': + MMVet_eval(result_file, **judge_kwargs) + elif dataset_name == 'OCRBench': + OCRBench_eval(result_file) + elif listinstr(['OCRVQA', 'TextVQA', 'ChartQA', 'DocVQA', 'InfoVQA'], dataset_name): + VQAEval(result_file, dataset_name) + elif listinstr(['MathVista'], dataset_name): + MathVista_eval(result_file, **judge_kwargs) + elif listinstr(['LLaVABench'], dataset_name): + LLaVABench_eval(result_file, **judge_kwargs) + else: + logger.error(f'Dataset {dataset_name} is not handled by evaluator, will be skipped. ') + + +if __name__ == '__main__': + load_env() + main() diff --git a/eval_mm/vlmevalkit/script/run_inference.sh b/eval_mm/vlmevalkit/script/run_inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..f9c3a5a5e99c701e35ac8d3995c2b36aa3aee6c3 --- /dev/null +++ b/eval_mm/vlmevalkit/script/run_inference.sh @@ -0,0 +1,31 @@ +export PATH=/usr/local/cuda/bin:$PATH + +export HF_ENDPOINT=https://hf-mirror.com +export OMP_NUM_THREADS=1 +export timestamp=`date +"%Y%m%d%H%M%S"` +export OLD_VERSION='False' +export PYTHONPATH=$(dirname $SELF_DIR):$PYTHONPATH + +# gpu consumed +# fp16 17-18G +# int4 7-8G + +# model to be used +# Example: MODELNAME=MiniCPM-Llama3-V-2_5 +MODELNAME=$1 +# datasets to be tested +# Example: DATALIST="POPE ScienceQA_TEST ChartQA_TEST" +DATALIST=$2 +# test mode, all or infer +MODE=$3 + +echo "Starting inference with model $MODELNAME on datasets $DATALIST" +# run on multi gpus with torchrun command +# remember to run twice, the first run may fail +torchrun --nproc_per_node=8 run.py --data $DATALIST --model $MODELNAME --mode $MODE +torchrun --nproc_per_node=8 run.py --data $DATALIST --model $MODELNAME --mode $MODE +# run on single gpu with python command +# python run.py --data $DATALIST --model $MODELNAME --verbose --mode $MODE +# python run.py --data $DATALIST --model $MODELNAME --verbose --mode $MODE + +ls diff --git a/eval_mm/vlmevalkit/vlmeval/__init__.py b/eval_mm/vlmevalkit/vlmeval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..336a61a616f8051a193f87cf1548febd3903c7e7 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/__init__.py @@ -0,0 +1,13 @@ +try: + import torch +except ImportError: + pass + +from .smp import * +from .api import * +from .evaluate import * +from .utils import * +from .vlm import * +from .config import * + +load_env() diff 
--git a/eval_mm/vlmevalkit/vlmeval/api/__init__.py b/eval_mm/vlmevalkit/vlmeval/api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..050b1b666c67b6762cb3e33bb889a596a1acfe05 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/api/__init__.py @@ -0,0 +1,6 @@ +from .gpt import OpenAIWrapper, GPT4V +from .gpt_int import OpenAIWrapperInternal, GPT4V_Internal + +__all__ = [ + 'OpenAIWrapper', 'OpenAIWrapperInternal', 'GPT4V', 'GPT4V_Internal' +] diff --git a/eval_mm/vlmevalkit/vlmeval/api/base.py b/eval_mm/vlmevalkit/vlmeval/api/base.py new file mode 100644 index 0000000000000000000000000000000000000000..cd60c7d3d05fa2af352341538150c16e7a327d91 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/api/base.py @@ -0,0 +1,195 @@ +import time +import random as rd +from abc import abstractmethod +import os.path as osp +import copy as cp +from ..smp import get_logger, parse_file + + +class BaseAPI: + + allowed_types = ['text', 'image'] + INTERLEAVE = True + INSTALL_REQ = False + + def __init__(self, + retry=10, + wait=3, + system_prompt=None, + verbose=True, + fail_msg='Failed to obtain answer via API.', + **kwargs): + """Base Class for all APIs. + + Args: + retry (int, optional): The retry times for `generate_inner`. Defaults to 10. + wait (int, optional): The wait time after each failed retry of `generate_inner`. Defaults to 3. + system_prompt (str, optional): Defaults to None. + verbose (bool, optional): Defaults to True. + fail_msg (str, optional): The message to return when failed to obtain answer. + Defaults to 'Failed to obtain answer via API.'. + **kwargs: Other kwargs for `generate_inner`. + """ + + self.wait = wait + self.retry = retry + self.system_prompt = system_prompt + self.verbose = verbose + self.fail_msg = fail_msg + self.logger = get_logger('ChatAPI') + + if len(kwargs): + self.logger.info(f'BaseAPI received the following kwargs: {kwargs}') + self.logger.info('Will try to use them as kwargs for `generate`. ') + self.default_kwargs = kwargs + + @abstractmethod + def generate_inner(self, inputs, **kwargs): + """The inner function to generate the answer. + + Returns: + tuple(int, str, str): ret_code, response, log + """ + self.logger.warning('For APIBase, generate_inner is an abstract method. ') + assert 0, 'generate_inner not defined' + ret_code, answer, log = None, None, None + # if ret_code is 0, means succeed + return ret_code, answer, log + + def working(self): + """If the API model is working, return True, else return False. + + Returns: + bool: If the API model is working, return True, else return False. + """ + retry = 3 + while retry > 0: + ret = self.generate('hello') + if ret is not None and ret != '' and self.fail_msg not in ret: + return True + retry -= 1 + return False + + def check_content(self, msgs): + """Check the content type of the input. Four types are allowed: str, dict, liststr, listdict. + + Args: + msgs: Raw input messages. + + Returns: + str: The message type. + """ + if isinstance(msgs, str): + return 'str' + if isinstance(msgs, dict): + return 'dict' + if isinstance(msgs, list): + types = [self.check_content(m) for m in msgs] + if all(t == 'str' for t in types): + return 'liststr' + if all(t == 'dict' for t in types): + return 'listdict' + return 'unknown' + + def preproc_content(self, inputs): + """Convert the raw input messages to a list of dicts. + + Args: + inputs: raw input messages. + + Returns: + list(dict): The preprocessed input messages. Will return None if failed to preprocess the input. 
+ """ + if self.check_content(inputs) == 'str': + return [dict(type='text', value=inputs)] + elif self.check_content(inputs) == 'dict': + assert 'type' in inputs and 'value' in inputs + return [inputs] + elif self.check_content(inputs) == 'liststr': + res = [] + for s in inputs: + mime, pth = parse_file(s) + if mime is None or mime == 'unknown': + res.append(dict(type='text', value=s)) + else: + res.append(dict(type=mime.split('/')[0], value=pth)) + return res + elif self.check_content(inputs) == 'listdict': + for item in inputs: + assert 'type' in item and 'value' in item + mime, s = parse_file(item['value']) + if mime is None: + assert item['type'] == 'text', item['value'] + else: + assert mime.split('/')[0] == item['type'] + item['value'] = s + return inputs + else: + return None + + def generate(self, message, **kwargs1): + """The main function to generate the answer. Will call `generate_inner` with the preprocessed input messages. + + Args: + message: raw input messages. + + Returns: + str: The generated answer of the Failed Message if failed to obtain answer. + """ + assert self.check_content(message) in ['str', 'dict', 'liststr', 'listdict'], f'Invalid input type: {message}' + message = self.preproc_content(message) + assert message is not None and self.check_content(message) == 'listdict' + for item in message: + assert item['type'] in self.allowed_types, f'Invalid input type: {item["type"]}' + + # merge kwargs + kwargs = cp.deepcopy(self.default_kwargs) + kwargs.update(kwargs1) + + answer = None + # a very small random delay [0s - 0.5s] + T = rd.random() * 0.5 + time.sleep(T) + + for i in range(self.retry): + try: + ret_code, answer, log = self.generate_inner(message, **kwargs) + if ret_code == 0 and self.fail_msg not in answer and answer != '': + if self.verbose: + print(answer) + return answer + elif self.verbose: + if not isinstance(log, str): + try: + log = log.text + except: + self.logger.warning(f'Failed to parse {log} as an http response. ') + self.logger.info(f'RetCode: {ret_code}\nAnswer: {answer}\nLog: {log}') + except Exception as err: + if self.verbose: + self.logger.error(f'An error occured during try {i}:') + self.logger.error(err) + # delay before each retry + T = rd.random() * self.wait * 2 + time.sleep(T) + + return self.fail_msg if answer in ['', None] else answer + + def message_to_promptimg(self, message): + assert not self.INTERLEAVE + model_name = self.__class__.__name__ + import warnings + warnings.warn( + f'Model {model_name} does not support interleaved input. ' + 'Will use the first image and aggregated texts as prompt. 
') + num_images = len([x for x in message if x['type'] == 'image']) + if num_images == 0: + prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text']) + image = None + elif num_images == 1: + prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text']) + image = [x['value'] for x in message if x['type'] == 'image'][0] + else: + prompt = '\n'.join([x['value'] if x['type'] == 'text' else '<image>' for x in message]) + image = [x['value'] for x in message if x['type'] == 'image'][0] + return prompt, image diff --git a/eval_mm/vlmevalkit/vlmeval/api/gpt.py b/eval_mm/vlmevalkit/vlmeval/api/gpt.py new file mode 100644 index 0000000000000000000000000000000000000000..a8f19838905472d817719467606000c7738676d2 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/api/gpt.py @@ -0,0 +1,178 @@ +from ..smp import * +import os +import sys +from .base import BaseAPI + +APIBASES = { + 'OFFICIAL': 'https://api.openai.com/v1/chat/completions', +} + + +def GPT_context_window(model): + length_map = { + 'gpt-4-1106-preview': 128000, + 'gpt-4-vision-preview': 128000, + 'gpt-4': 8192, + 'gpt-4-32k': 32768, + 'gpt-4-0613': 8192, + 'gpt-4-32k-0613': 32768, + 'gpt-3.5-turbo-1106': 16385, + 'gpt-3.5-turbo': 4096, + 'gpt-3.5-turbo-16k': 16385, + 'gpt-3.5-turbo-instruct': 4096, + 'gpt-3.5-turbo-0613': 4096, + 'gpt-3.5-turbo-16k-0613': 16385, + } + if model in length_map: + return length_map[model] + else: + return 128000 + + +class OpenAIWrapper(BaseAPI): + + is_api: bool = True + + def __init__(self, + model: str = 'gpt-3.5-turbo-0613', + retry: int = 5, + wait: int = 5, + key: str = None, + verbose: bool = True, + system_prompt: str = None, + temperature: float = 0, + timeout: int = 60, + api_base: str = None, + max_tokens: int = 1024, + img_size: int = 512, + img_detail: str = 'low', + **kwargs): + + self.model = model + self.cur_idx = 0 + self.fail_msg = 'Failed to obtain answer via API. ' + self.max_tokens = max_tokens + self.temperature = temperature + + if 'step-1v' in model: + env_key = os.environ.get('STEPAI_API_KEY', '') + if key is None: + key = env_key + else: + env_key = os.environ.get('OPENAI_API_KEY', '') + if key is None: + key = env_key + assert isinstance(key, str) and key.startswith('sk-'), ( + f'Illegal openai_key {key}. ' + 'Please set the environment variable OPENAI_API_KEY to your openai key. ' + ) + self.key = key + assert img_size > 0 or img_size == -1 + self.img_size = img_size + assert img_detail in ['high', 'low'] + self.img_detail = img_detail + self.timeout = timeout + + super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + if api_base is None: + if 'OPENAI_API_BASE' in os.environ and os.environ['OPENAI_API_BASE'] != '': + self.logger.error('Environment variable OPENAI_API_BASE is set. Will use it as api_base. ') + api_base = os.environ['OPENAI_API_BASE'] + else: + api_base = 'OFFICIAL' + + assert api_base is not None + + if api_base in APIBASES: + self.api_base = APIBASES[api_base] + elif api_base.startswith('http'): + self.api_base = api_base + else: + self.logger.error('Unknown API Base. ') + sys.exit(-1) + self.logger.info(f'Using API Base: {self.api_base}; API Key: {self.key}') + + # inputs can be a lvl-2 nested list: [content1, content2, content3, ...] 
+ # content can be a string or a list of image & text + def prepare_inputs(self, inputs): + input_msgs = [] + if self.system_prompt is not None: + input_msgs.append(dict(role='system', content=self.system_prompt)) + has_images = np.sum([x['type'] == 'image' for x in inputs]) + if has_images: + content_list = [] + for msg in inputs: + if msg['type'] == 'text': + content_list.append(dict(type='text', text=msg['value'])) + elif msg['type'] == 'image': + from PIL import Image + img = Image.open(msg['value']) + b64 = encode_image_to_base64(img, target_size=self.img_size) + img_struct = dict(url=f'data:image/jpeg;base64,{b64}', detail=self.img_detail) + content_list.append(dict(type='image_url', image_url=img_struct)) + input_msgs.append(dict(role='user', content=content_list)) + else: + assert all([x['type'] == 'text' for x in inputs]) + text = '\n'.join([x['value'] for x in inputs]) + input_msgs.append(dict(role='user', content=text)) + return input_msgs + + def generate_inner(self, inputs, **kwargs) -> str: + input_msgs = self.prepare_inputs(inputs) + temperature = kwargs.pop('temperature', self.temperature) + max_tokens = kwargs.pop('max_tokens', self.max_tokens) + + context_window = GPT_context_window(self.model) + max_tokens = min(max_tokens, context_window - self.get_token_len(inputs)) + if 0 < max_tokens <= 100: + self.logger.warning( + 'Less than 100 tokens left, ' + 'may exceed the context window with some additional meta symbols. ' + ) + if max_tokens <= 0: + return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. ' + + headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'} + payload = dict( + model=self.model, + messages=input_msgs, + max_tokens=max_tokens, + n=1, + temperature=temperature, + **kwargs) + response = requests.post(self.api_base, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) + ret_code = response.status_code + ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code + answer = self.fail_msg + try: + resp_struct = json.loads(response.text) + answer = resp_struct['choices'][0]['message']['content'].strip() + except: + pass + return ret_code, answer, response + + def get_token_len(self, inputs) -> int: + import tiktoken + try: + enc = tiktoken.encoding_for_model(self.model) + except: + enc = tiktoken.encoding_for_model('gpt-4') + assert isinstance(inputs, list) + tot = 0 + for item in inputs: + if item['type'] == 'text': + tot += len(enc.encode(item['value'])) + elif item['type'] == 'image': + tot += 85 + if self.img_detail == 'high': + img = Image.open(item['value']) + npatch = np.ceil(img.size[0] / 512) * np.ceil(img.size[1] / 512) + tot += npatch * 170 + return tot + + +class GPT4V(OpenAIWrapper): + + def generate(self, message, dataset=None): + return super(GPT4V, self).generate(message) \ No newline at end of file diff --git a/eval_mm/vlmevalkit/vlmeval/api/gpt_int.py b/eval_mm/vlmevalkit/vlmeval/api/gpt_int.py new file mode 100644 index 0000000000000000000000000000000000000000..e5e941c47d184e21750f6b8d06412a2e72d99b04 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/api/gpt_int.py @@ -0,0 +1,90 @@ +import json +import warnings +import requests +from ..smp import * +from .gpt import GPT_context_window, OpenAIWrapper + +url = 'http://ecs.sv.us.alles-apin.openxlab.org.cn/v1/openai/v2/text/chat' +headers = { + 'Content-Type': 'application/json' +} + + +class OpenAIWrapperInternal(OpenAIWrapper): + + is_api: bool = True + + def __init__(self, + model: str = 'gpt-3.5-turbo-0613', + 
retry: int = 5, + wait: int = 3, + verbose: bool = True, + system_prompt: str = None, + temperature: float = 0, + timeout: int = 60, + max_tokens: int = 1024, + img_size: int = 512, + img_detail: str = 'low', + **kwargs): + + self.model = model + if 'KEYS' in os.environ and osp.exists(os.environ['KEYS']): + keys = load(os.environ['KEYS']) + headers['alles-apin-token'] = keys.get('alles-apin-token', '') + elif 'ALLES' in os.environ: + headers['alles-apin-token'] = os.environ['ALLES'] + self.headers = headers + self.temperature = temperature + self.timeout = timeout + self.max_tokens = max_tokens + + assert img_size > 0 or img_size == -1 + self.img_size = img_size + assert img_detail in ['high', 'low'] + self.img_detail = img_detail + + super(OpenAIWrapper, self).__init__( + wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + def generate_inner(self, inputs, **kwargs) -> str: + input_msgs = self.prepare_inputs(inputs) + + temperature = kwargs.pop('temperature', self.temperature) + max_tokens = kwargs.pop('max_tokens', self.max_tokens) + + # Held out 100 tokens as buffer + context_window = GPT_context_window(self.model) + max_tokens = min(max_tokens, context_window - self.get_token_len(inputs)) + if 0 < max_tokens <= 100: + print('Less than 100 tokens left, may exceed the context window with some additional meta symbols. ') + if max_tokens <= 0: + return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. ' + + payload = dict( + model=self.model, + messages=input_msgs, + max_tokens=max_tokens, + n=1, + stop=None, + timeout=self.timeout, + temperature=temperature, + **kwargs) + + response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) + ret_code = response.status_code + ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code + + answer = self.fail_msg + try: + resp_struct = json.loads(response.text) + assert resp_struct['msg'] == 'ok' and resp_struct['msgCode'] == '10000', resp_struct + answer = resp_struct['data']['choices'][0]['message']['content'].strip() + except: + pass + return ret_code, answer, response + + +class GPT4V_Internal(OpenAIWrapperInternal): + + def generate(self, message, dataset=None): + return super(GPT4V_Internal, self).generate(message) diff --git a/eval_mm/vlmevalkit/vlmeval/config.py b/eval_mm/vlmevalkit/vlmeval/config.py new file mode 100644 index 0000000000000000000000000000000000000000..2701f305571a4fad6a784426ab989e3ad1690e5b --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/config.py @@ -0,0 +1,19 @@ +from vlmeval.vlm import * +from vlmeval.api import * +from functools import partial + +ungrouped = { + 'MiniCPM-V':partial(MiniCPM_V, model_path='openbmb/MiniCPM-V'), + 'MiniCPM-V-2':partial(MiniCPM_V, model_path='openbmb/MiniCPM-V-2'), + 'MiniCPM-Llama3-V-2_5':partial(MiniCPM_Llama3_V, model_path='openbmb/MiniCPM-Llama3-V-2_5'), +} + +supported_VLM = {} + +model_groups = [ + ungrouped +] + +for grp in model_groups: + supported_VLM.update(grp) + diff --git a/eval_mm/vlmevalkit/vlmeval/evaluate/OCRBench.py b/eval_mm/vlmevalkit/vlmeval/evaluate/OCRBench.py new file mode 100644 index 0000000000000000000000000000000000000000..c37ad0872b8b92729001daf9db892dcfccbfffa8 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/evaluate/OCRBench.py @@ -0,0 +1,65 @@ +from vlmeval.smp import * + + +def OCRBench_eval(eval_file): + OCRBench_score = { + 'Regular Text Recognition': 0, + 'Irregular Text Recognition': 0, + 'Artistic Text Recognition': 0, + 'Handwriting Recognition': 0, + 
'Digit String Recognition': 0, + 'Non-Semantic Text Recognition': 0, + 'Scene Text-centric VQA': 0, + 'Doc-oriented VQA': 0, + 'Key Information Extraction': 0, + 'Handwritten Mathematical Expression Recognition': 0 + } + + logger = get_logger('Evaluation') + + data = load(eval_file) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + for i in tqdm(range(len(lines))): + line = lines[i] + predict = str(line['prediction']) + answers = eval(line['answer']) + category = line['category'] + if category == 'Handwritten Mathematical Expression Recognition': + for j in range(len(answers)): + answer = answers[j].strip().replace('\n', ' ').replace(' ', '') + predict = predict.strip().replace('\n', ' ').replace(' ', '') + if answer in predict: + OCRBench_score[category] += 1 + break + else: + for j in range(len(answers)): + answer = answers[j].lower().strip().replace('\n', ' ') + predict = predict.lower().strip().replace('\n', ' ') + if answer in predict: + OCRBench_score[category] += 1 + break + + final_score_dict = {} + final_score_dict['Text Recognition'] = ( + OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition'] + + OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition'] + + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'] + ) + final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA'] + final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA'] + final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction'] + final_score_dict['Handwritten Mathematical Expression Recognition'] = \ + OCRBench_score['Handwritten Mathematical Expression Recognition'] + final_score_dict['Final Score'] = ( + final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA'] + + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction'] + + final_score_dict['Handwritten Mathematical Expression Recognition'] + ) + final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10 + score_pth = eval_file.replace('.xlsx', '_score.json') + dump(final_score_dict, score_pth) + logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') + logger.info('Score: ') + for key, value in final_score_dict.items(): + logger.info('{}:{}'.format(key, value)) diff --git a/eval_mm/vlmevalkit/vlmeval/evaluate/__init__.py b/eval_mm/vlmevalkit/vlmeval/evaluate/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..10248c4b66d2daf7d3a5603990ec4b35c1194368 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/evaluate/__init__.py @@ -0,0 +1,9 @@ +from .yes_or_no import default_rating, MME_rating, YOrN_eval +from .mmvet_eval import MMVet_eval +from .multiple_choice import multiple_choice_eval +from .coco_eval import COCO_eval +from .vqa_eval import VQAEval +from .mathvista_eval import MathVista_eval +from .llavabench import LLaVABench_eval +from .misc import build_judge +from .OCRBench import OCRBench_eval diff --git a/eval_mm/vlmevalkit/vlmeval/evaluate/coco_eval.py b/eval_mm/vlmevalkit/vlmeval/evaluate/coco_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..9ae8f4f85bf50b8ede6c03c4ccdc573d8303203e --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/evaluate/coco_eval.py @@ -0,0 +1,74 @@ +from vlmeval.smp import * +from pycocoevalcap.bleu.bleu import Bleu +from pycocoevalcap.rouge.rouge import Rouge +from 
pycocoevalcap.cider.cider import Cider + + +class COCO_Caption_Scorer(): + def __init__(self, ref, gt): + self.ref = ref + self.gt = gt + print('setting up scorers...') + self.scorers = [ + (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']), + # (Meteor(), "METEOR"), # need java version 11.0.16+ + (Rouge(), 'ROUGE_L'), + (Cider(), 'CIDEr'), + # (Spice(), "SPICE"), # need java version 11.0.16+ + ] + + def compute_scores(self): + total_scores = {} + for scorer, method in self.scorers: + print('computing %s score...' % (scorer.method())) + score, scores = scorer.compute_score(self.gt, self.ref) + if type(method) == list: + for sc, scs, m in zip(score, scores, method): + print('%s: %0.3f' % (m, sc * 100)) + total_scores['Bleu'] = [x * 100 for x in score] + else: + print('%s: %0.3f' % (method, score * 100)) + total_scores[method] = score * 100 + + print('*****DONE*****') + for key, value in total_scores.items(): + print('{}:{}'.format(key, value)) + return total_scores + + +def COCO_eval(eval_file, nproc=4, verbose=False): + logger = get_logger('Evaluation') + + data = load(eval_file) + + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + ref = {} + gt = {} + for i, line in enumerate(lines): + ref[str(i)] = [str(line['prediction'])] + gt[str(i)] = eval(line['answer']) + + scorer = COCO_Caption_Scorer(ref, gt) + coco_caption_score_dict = scorer.compute_scores() + + score_pth = eval_file.replace('.xlsx', '_score.json') + dump(coco_caption_score_dict, score_pth) + logger.info(f'COCO_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') + logger.info('Score: ') + for key, value in coco_caption_score_dict.items(): + logger.info('{}:{}'.format(key, value)) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Inference LLM Answers. ') + parser.add_argument('--data', type=str, help='The question set for inference, in excel / tsv / json format. ') + parser.add_argument('--nproc', type=int, default=4) + parser.add_argument('--verbose', action='store_true') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + COCO_eval(eval_file=args.data, nproc=args.nproc, verbose=args.verbose) diff --git a/eval_mm/vlmevalkit/vlmeval/evaluate/llavabench.py b/eval_mm/vlmevalkit/vlmeval/evaluate/llavabench.py new file mode 100644 index 0000000000000000000000000000000000000000..1eac9cce0d1405427b00cb39336ae4e7091eb7df --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/evaluate/llavabench.py @@ -0,0 +1,120 @@ +import argparse +import numpy as np +import pandas as pd +import os.path as osp +from vlmeval.evaluate.misc import build_judge +from vlmeval.smp import * +from vlmeval.utils import track_progress_rich + +rule_dict = { + 'llava_bench_conv': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. 
The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'}, # noqa: E501 + 'llava_bench_detail': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'}, # noqa: E501 + 'llava_bench_complex': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. 
The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'} # noqa: E501 +} + + +def get_eval(judge, content): + return judge.generate(content) + + +def parse_score(review): + logger = get_logger('Evaluation') + try: + score_pair = review.split('\n')[0] + score_pair = score_pair.replace(',', ' ') + sp = score_pair.split(' ') + if len(sp) == 2: + return [float(sp[0]), float(sp[1])] + else: + logger.error('error', review) + return [-1, -1] + except Exception as e: + logger.error(e, 'error', review) + return [-1, -1] + + +def build_prompt(line): + cap_str = line['caption'] + question = line['question'] + ans1 = line['gpt4_ans'] + ans2 = line['prediction'] + category = 'llava_bench_' + line['category'] + rule = rule_dict[category] + role, prompt = rule['role'], rule['prompt'] + + content = (f'[Context]\n{cap_str}\n\n' + f'[Question]\n{question}\n\n' + f'[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n' + f'[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n' + f'[System]\n{prompt}\n\n') + return content + + +def LLaVABench_atomeval(model, prompt): + review = get_eval(model, prompt) + scores = parse_score(review) + return scores + + +def LLaVABench_score(data): + cates = ['overall'] + list(set(data['category'])) + ret = defaultdict(list) + + for c in cates: + ret['split'].append(c) + sub = data[data['category'] == c] if c != 'overall' else data + ret['Relative Score (main)'].append(np.mean(sub['score']) / np.mean(sub['gpt4_score']) * 100) + ret['VLM Score'].append(np.mean(sub['score']) * 10) + ret['GPT4 Score'].append(np.mean(sub['gpt4_score']) * 10) + return pd.DataFrame(ret) + + +def LLaVABench_eval(eval_file, **judge_kwargs): + suffix = '.' + eval_file.split('.')[-1] + record_file = eval_file.replace(suffix, '_openai_result' + suffix) + score_file = eval_file.replace(suffix, '_score.csv') + nproc = judge_kwargs.pop('nproc', 4) + + if not osp.exists(record_file): + data = load(eval_file) + lines = [data.iloc[i] for i in range(len(data))] + model = build_judge( + temperature=0.2, + system_prompt='You are a helpful and precise assistant for checking the quality of the answer.', + **judge_kwargs) + prompts = [build_prompt(line) for line in lines] + tups = [(model, prompt) for prompt in prompts] + scores = track_progress_rich(LLaVABench_atomeval, tups, nproc=nproc, chunksize=nproc) + data['gpt4_score'] = [x[0] for x in scores] + data['score'] = [x[1] for x in scores] + dump(data, record_file) + + data = load(record_file) + ret = LLaVABench_score(data).round(1) + print(ret) + dump(ret, score_file) + return ret + + +def parse_args(): + parser = argparse.ArgumentParser(description='LLaVABench Evaluation. ') + parser.add_argument('data', type=str, help='The question set for inference, in excel / tsv / json format. ') + parser.add_argument( + '--model', type=str, help='The LLM (GPT) used for inference. 
', default='gpt-4-turbo', + choices=['gpt-4-0613', 'gpt-4-turbo', 'chatgpt-1106', 'chatgpt-0613', 'gpt-4-0314']) + parser.add_argument('--nproc', type=int, default=4) + parser.add_argument('--verbose', action='store_true') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + load_env() + args = parse_args() + judge_kwargs = dict(model=args.model, nproc=args.nproc, verbose=args.verbose) + if 'OPENAI_API_KEY_JUDGE' in os.environ and os.environ['OPENAI_API_KEY_JUDGE']: + judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE'] + if 'OPENAI_API_BASE_JUDGE' in os.environ and os.environ['OPENAI_API_BASE_JUDGE']: + judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE'] + + LLaVABench_eval(eval_file=args.data, **judge_kwargs) diff --git a/eval_mm/vlmevalkit/vlmeval/evaluate/mathvista_eval.py b/eval_mm/vlmevalkit/vlmeval/evaluate/mathvista_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..67c5e8b2981ae8f19e7d143d54f5268d4e864a5a --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/evaluate/mathvista_eval.py @@ -0,0 +1,240 @@ +from vlmeval.evaluate.misc import build_judge +from vlmeval.smp import * +from vlmeval.utils import track_progress_rich +from vlmeval.utils.matching_util import can_infer + + +def get_gpt4_ICE(): + example_1 = """ +Hint: Please answer the question requiring an integer answer and provide the final value, +e.g., 1, 2, 3, at the end.\n +Question: Which number is missing?\n +Model response: The number missing in the sequence is 14.\n +Extracted answer: 14 +""" + + example_2 = """ +Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value, +e.g., 1.2, 1.3, 1.4, at the end.\n +Question: What is the fraction of females facing the camera?\n +Model response: The fraction of females facing the camera is 0.6, +which means that six out of ten females in the group are facing the camera.\n +Extracted answer: 0.6 +""" + + example_3 = """ +Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value, +e.g., 1.23, 1.34, 1.45, at the end.\n +Question: How much money does Luca need to buy a sour apple candy and a butter-scotch candy? (Unit: $)\n +Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.\n +Extracted answer: 1.45 +""" + + example_4 = """ +Hint: Please answer the question requiring a Python list as an answer and provide the final list, +e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end.\n +Question: Between which two years does the line graph saw its maximum peak?\n +Model response: The line graph saw its maximum peak between 2007 and 2008.\n +Extracted answer: [2007, 2008] +""" + + example_5 = """ +Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n +Question: What fraction of the shape is blue?\n +Choices: (A) 3/11 (B) 8/11 (C) 6/11 (D) 3/5\n +Model response: The correct answer is (B) 8/11.\n +Extracted answer: B +""" + + return [example_1, example_2, example_3, example_4, example_5] + + +def build_mathvista_gpt4_prompt(line): + task_description = """ +Please read the following example. 
+Then extract the answer from the model response and type it at the end of the prompt.\n +""" + question = line['question'] + prediction = str(line['prediction']) + prompt = task_description + examples = get_gpt4_ICE() + for example in examples: + prompt += example + '\n' + prompt += question + '\n' + prompt += 'Model respone: ' + prediction + prompt += 'Extracted answer:' + return prompt + + +def list_to_dict(lst): + return {chr(65 + i): val for i, val in enumerate(lst)} + + +def post_check(line, prefetch=False): + res = None + ans = line['answer'] + response = line['prediction'] if prefetch else line['res'] + try: + if line['question_type'] == 'multi_choice': + ans = line['answer_option'] + choices = list_to_dict(eval(line['choices'])) + res = can_infer(response, choices) + if prefetch: + return res + else: + if line['answer_type'] == 'integer': + res = int(response) + ans = int(line['answer']) + elif line['answer_type'] == 'float': + res = float(response) + ans = float(line['answer']) + else: + res = str(res) + ans = str(ans) + except ValueError: + pass + + if res == ans: + return res if prefetch else True + else: + return False + + +def MathVista_auxeval(model, line): + prompt = build_mathvista_gpt4_prompt(line) + log = '' + retry = 5 + if post_check(line, prefetch=True): + res = post_check(line, prefetch=True) + return dict(log='Prefetch succeed', res=res) + for i in range(retry): + prediction = line['prediction'] + res = model.generate(prompt, temperature=i * 0.5) + if res is None: + log += f'Try {i}: output is {prediction}, failed to parse.\n' + else: + log += 'Succeed' + return dict(log=log, res=res) + log += 'All 5 retries failed.\n' + return dict(log=log, res='') + + +def MathVista_acc(result_file): + data = load(result_file) + tot = defaultdict(lambda: 0) + fetch = defaultdict(lambda: 0) + hit = defaultdict(lambda: 0) + lt = len(data) + skill_list = [] + for i in range(lt): + item = data.iloc[i] + cate = item['task'] + tot['Overall'] += 1 + try: + skills = eval(item['skills']) + except SyntaxError: + skills = [item['skills']] + for skill in skills: + if skill not in skill_list: + skill_list.append(skill) + tot[skill] += 1 + tot[cate] += 1 + if item['log'] == 'Prefetch succeed': + fetch['Overall'] += 1 + fetch[cate] += 1 + for skill in skills: + fetch[skill] += 1 + if post_check(item, prefetch=False): + hit['Overall'] += 1 + hit[cate] += 1 + for skill in skills: + hit[skill] += 1 + + res = defaultdict(list) + for k in tot.keys(): + res['Task&Skill'].append(k) + res['tot'].append(tot[k]) + res['prefetch'].append(fetch[k]) + res['hit'].append(hit[k]) + res['prefetch_rate'].append(fetch[k] / tot[k] * 100) + res['acc'].append(hit[k] / tot[k] * 100) + res = pd.DataFrame(res) + return res + + +def MathVista_eval(eval_file, **judge_kwargs): + logger = get_logger('Evaluation') + model = judge_kwargs['model'] + + suffix = eval_file.split('.')[-1] + storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') + tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + nproc = judge_kwargs.pop('nproc', 4) + + if osp.exists(storage): + logger.warning(f'GPT scoring file {storage} already exists, will reuse it in MathVista_eval. 
') + else: + data = load(eval_file) + model = build_judge(max_tokens=128, **judge_kwargs) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = {} + if osp.exists(tmp_file): + ans = load(tmp_file) + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + MathVista_auxeval, tups, nproc=nproc, chunksize=nproc, + keys=indices, save=tmp_file) + ans = load(tmp_file) + for k, v in zip(indices, new_results): + assert k in ans + assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res'] + + log_map, res_map = {}, {} + all_inds = [line['index'] for line in lines] + for k in all_inds: + log_map[k] = ans[k]['log'] + res_map[k] = ans[k]['res'] + data['res'] = [res_map[idx] for idx in data['index']] + data['log'] = [log_map[idx] for idx in data['index']] + dump(data, storage) + + score = MathVista_acc(storage) + score_pth = storage.replace('.xlsx', '_score.csv') + + dump(score, score_pth) + logger.info(f'MathVista_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') + logger.info('Score: ') + logger.info(score) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Inference LLM Answers. ') + parser.add_argument('data', type=str, help='The question set for inference, in excel / tsv / json format. ') + parser.add_argument( + '--model', + type=str, + help='The LLM (GPT) used for inference. ', + default='gpt-4-turbo', + choices=['gpt-4-0613', 'gpt-4-turbo', 'chatgpt-1106', 'chatgpt-0613']) + parser.add_argument('--nproc', type=int, default=4) + parser.add_argument('--verbose', action='store_true') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + load_env() + args = parse_args() + judge_kwargs = dict(model=args.model, nproc=args.nproc, verbose=args.verbose) + if 'OPENAI_API_KEY_JUDGE' in os.environ and os.environ['OPENAI_API_KEY_JUDGE']: + judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE'] + if 'OPENAI_API_BASE_JUDGE' in os.environ and os.environ['OPENAI_API_BASE_JUDGE']: + judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE'] + MathVista_eval(eval_file=args.data, **judge_kwargs) diff --git a/eval_mm/vlmevalkit/vlmeval/evaluate/misc.py b/eval_mm/vlmevalkit/vlmeval/evaluate/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..ddf0c9e01843e04faeda43d93ebfffe901e2028f --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/evaluate/misc.py @@ -0,0 +1,29 @@ +import os +from vlmeval.api import OpenAIWrapper, OpenAIWrapperInternal +from vlmeval.smp import load_env + +INTERNAL = os.environ.get('INTERNAL', 0) + + +def build_judge(**kwargs): + model = kwargs.pop('model', None) + load_env() + LOCAL_LLM = os.environ.get('LOCAL_LLM', None) + if LOCAL_LLM is None: + model_map = { + 'gpt-4-turbo': 'gpt-4-1106-preview', + 'gpt-4-0613': 'gpt-4-0613', + 'gpt-4-0314': 'gpt-4-0314', + 'gpt-4-0125': 'gpt-4-0125-preview', + 'chatgpt-1106': 'gpt-3.5-turbo-1106', + 'chatgpt-0613': 'gpt-3.5-turbo-0613', + 'chatgpt-0125': 'gpt-3.5-turbo-0125' + } + model_version = model_map[model] + else: + model_version = LOCAL_LLM + if INTERNAL: + model = OpenAIWrapperInternal(model_version, **kwargs) + else: + model = OpenAIWrapper(model_version, **kwargs) + return model diff --git a/eval_mm/vlmevalkit/vlmeval/evaluate/mmvet_eval.py b/eval_mm/vlmevalkit/vlmeval/evaluate/mmvet_eval.py new file mode 100644 index 
0000000000000000000000000000000000000000..b0067e7c35015642249b91b21ac984f37a2028c0 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/evaluate/mmvet_eval.py @@ -0,0 +1,191 @@ +from vlmeval.evaluate.misc import build_judge +from vlmeval.smp import * +from vlmeval.utils import track_progress_rich + + +def build_mmvet_gpt4_prompt(line): + question = line['question'] + gt = str(line['answer']) + prediction = str(line['prediction']) + prompt = """ +Compare the ground truth and prediction from AI models, to give a correctness score for the prediction. +<AND> in the ground truth means it is totally right +only when all elements in the ground truth are present in the prediction, +and <OR> means it is totally right when any one element in the ground truth is present in the prediction. +The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). +Just complete the last space of the correctness score. + +Question | Ground truth | Prediction | Correctness +--- | --- | --- | --- +What is x in the equation? | -1 <AND> -5 | x = 3 | 0.0 +What is x in the equation? | -1 <AND> -5 | x = -1 | 0.5 +What is x in the equation? | -1 <AND> -5 | x = -5 | 0.5 +What is x in the equation? | -1 <AND> -5 | x = -5 or 5 | 0.5 +What is x in the equation? | -1 <AND> -5 | x = -1 or x = -5 | 1.0 +Can you explain this meme? | This meme is poking fun at the fact that the names of the countries +Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, +while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues +because the names of these countries do not accurately represent their landscapes. | +The meme talks about Iceland and Greenland. It's pointing out that despite their names, +Iceland is not very icy and Greenland isn't very green. | 0.4 +Can you explain this meme? | This meme is poking fun at the fact that the names of the countries +Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, +while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues +because the names of these countries do not accurately represent their landscapes. | +The meme is using humor to point out the misleading nature of Iceland's and Greenland's names. +Iceland, despite its name, has lush green landscapes while Greenland is mostly covered in ice and snow. +The text 'This is why I have trust issues' is a playful way to suggest +that these contradictions can lead to distrust or confusion. +The humor in this meme is derived from the unexpected contrast between the names of the countries +and their actual physical characteristics. 
| 1.0 +""" + gpt4_prompt = prompt + '\n' + ' | '.join( + [question, gt.replace('<AND>', ' <AND> ').replace('<OR>', ' <OR> '), prediction, '']) + return gpt4_prompt + + +def MMVet_auxeval(model, line): + def float_cvt(s): + try: + return float(s) + except ValueError: + return None + + prompt = build_mmvet_gpt4_prompt(line) + log = '' + retry = 5 + for i in range(retry): + output = model.generate(prompt, temperature=i * 0.5) + score = float_cvt(output) + if score is None: + log += f'Try {i}: output is {output}, failed to parse.\n' + elif score < 0 or score > 1: + log += f'Try {i}: output is {output}, invalid score: {score}.\n' + else: + log += 'Succeed' + return dict(log=log, score=score) + log += 'All 5 retries failed.\n' + return dict(log=log, score=0.0) + + +def MMVet_acc(result_file): + data = load(result_file) + tot = defaultdict(lambda: 0) + score = defaultdict(lambda: 0) + lt = len(data) + cate2_list = [] + for i in range(lt): + item = data.iloc[i] + cate = item['category'] + cate2 = cate.replace(',', '_') + if cate2 not in cate2_list: + cate2_list.append(cate2) + grade = float(item['score']) + cate_list = ['rec', 'ocr', 'know', 'gen', 'spat', 'math'] + for capa in cate_list: + if capa in cate: + tot[capa] += 1 + score[capa] += grade + tot['Overall'] += 1 + tot[cate2] += 1 + score['Overall'] += grade + score[cate2] += grade + + res = defaultdict(list) + res2 = defaultdict(list) + cate_list.append('Overall') + cate2_list.append('Overall') + for k in cate_list: + res['Category'].append(k) + res['tot'].append(tot[k]) + res['acc'].append(score[k] / tot[k] * 100) + for v in cate2_list: + res2['Category'].append(v) + res2['tot'].append(tot[v]) + res2['acc'].append(score[v] / tot[v] * 100) + res = pd.DataFrame(res) + res2 = pd.DataFrame(res2) + return res, res2 + + +def MMVet_eval(eval_file, **judge_kwargs): + logger = get_logger('Evaluation') + + suffix = eval_file.split('.')[-1] + model = judge_kwargs['model'] + storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') + tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + nproc = judge_kwargs.pop('nproc', 4) + if osp.exists(storage): + logger.warning(f'GPT scoring file {storage} already exists, will reuse it in MMVet_eval. 
') + else: + data = load(eval_file) + model = build_judge(max_tokens=3, **judge_kwargs) + + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = {} + if osp.exists(tmp_file): + ans = load(tmp_file) + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + MMVet_auxeval, tups, nproc=nproc, chunksize=nproc, + keys=indices, save=tmp_file) + ans = load(tmp_file) + for k, v in zip(indices, new_results): + assert k in ans + assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score'] + + log_map, score_map = {}, {} + all_inds = [line['index'] for line in lines] + for k in all_inds: + log_map[k] = ans[k]['log'] + score_map[k] = ans[k]['score'] + data['score'] = [score_map[idx] for idx in data['index']] + data['log'] = [log_map[idx] for idx in data['index']] + dump(data, storage) + + score, score_fine = MMVet_acc(storage) + score_pth = storage.replace('.xlsx', '_score.csv') + score_fine_pth = storage.replace('.xlsx', '_score_fine.csv') + + dump(score, score_pth) + dump(score_fine, score_fine_pth) + logger.info( + f'MMVet_eval successfully finished evaluating {eval_file}, ' + f'results saved in {score_pth} and {score_fine_pth}' + ) + logger.info('Score: ') + logger.info(score) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Inference LLM Answers. ') + parser.add_argument('data', type=str, help='The question set for inference, in excel / tsv / json format. ') + parser.add_argument( + '--model', + type=str, + help='The LLM (GPT) used for inference. ', + default='gpt-4-turbo', + choices=['gpt-4-0613', 'gpt-4-turbo', 'chatgpt-1106', 'chatgpt-0613']) + parser.add_argument('--nproc', type=int, default=4) + parser.add_argument('--verbose', action='store_true') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + load_env() + args = parse_args() + judge_kwargs = dict(model=args.model, nproc=args.nproc, verbose=args.verbose) + if 'OPENAI_API_KEY_JUDGE' in os.environ and os.environ['OPENAI_API_KEY_JUDGE']: + judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE'] + if 'OPENAI_API_BASE_JUDGE' in os.environ and os.environ['OPENAI_API_BASE_JUDGE']: + judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE'] + MMVet_eval(eval_file=args.data, **judge_kwargs) diff --git a/eval_mm/vlmevalkit/vlmeval/evaluate/multiple_choice.py b/eval_mm/vlmevalkit/vlmeval/evaluate/multiple_choice.py new file mode 100644 index 0000000000000000000000000000000000000000..50d52bf2edb3e42fd3ebd0d91730fa6dd553969f --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/evaluate/multiple_choice.py @@ -0,0 +1,399 @@ +import os.path as osp +import pandas as pd +from tqdm import tqdm +from vlmeval.evaluate.misc import build_judge +from vlmeval.utils import can_infer, track_progress_rich, TSVDataset +from vlmeval.smp import * +import numpy as np + +INTERNAL = os.environ.get('INTERNAL', 0) + +abbrs = { + 'coarse_perception': 'CP', + 'finegrained_perception (instance-level)': 'FP-S', + 'finegrained_perception (cross-instance)': 'FP-C', + 'logic_reasoning': 'LR', + 'relation_reasoning': 'RR', + 'attribute_reasoning': 'AR' +} + + +def MMMU_preproc(data): + logger = get_logger('Evaluation') + cnt = 0 + As, Bs, Ans = list(data['A']), list(data['B']), list(data['answer']) + lt = len(data) + for i in range(lt): + if pd.isna(As[i]): + As[i] = Ans[i] + Bs[i] = 'Other Answers' + cnt += 1 + 
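+ # Note (added descriptive comment): open-ended MMMU rows (those with no option 'A') get a two-way
+ # choice built here -- the ground-truth answer becomes option A and the placeholder 'Other Answers'
+ # becomes option B (written back to the dataframe just below); multiple_choice_eval later remaps the
+ # answer key of these rows to 'A'.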
logger.info(f'During MMMU_preproc in Evaluation, {cnt} open questions are re-formulated to multi-choice ones. ') + data['A'] = As + data['B'] = Bs + return data + + +def report_acc(df): + # assert group in [None, 'category', 'l2-category'] + res = defaultdict(list) + + if 'split' in df: + splits = list(set(df['split'])) + res['split'] = splits + else: + df['split'] = ['none'] * len(df) + res['split'] = ['none'] + + for group in [None, 'l2-category', 'category']: + if group is None: + res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']] + elif group not in df: + continue + else: + abilities = list(set(df[group])) + abilities.sort() + for ab in abilities: + ab_name = abbrs[ab] if ab in abbrs else ab + sub_df = df[df[group] == ab] + res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']] + return pd.DataFrame(res) + + +def build_prompt(question, options, prediction): + tmpl = ( + 'You are an AI assistant who will help me to match ' + 'an answer with several options of a single-choice question. ' + 'You are provided with a question, several options, and an answer, ' + 'and you need to find which option is most similar to the answer. ' + 'If the meaning of all options are significantly different from the answer, output Z. ' + 'Your should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n' + 'Example 1: \n' + 'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n' + 'Answer: a cute teddy bear\nYour output: A\n' + 'Example 2: \n' + 'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n' + 'Answer: Spider\nYour output: Z\n' + 'Example 3: \n' + 'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: ' + ) + return tmpl.format(question, options, prediction) + + +def build_prompt_cn(question, options, prediction): + tmpl = ( + '你是一个帮助我匹配答案与单选题中多个选项的 AI 助手。' + '你会被提供:一个问题,多个选项,一个答案。你的任务是找到与答案意义最相近的选项。' + '如果所有选项的意义都与答案显著不同,则输出 Z。' + '你应该输出一个单个的大写字母,例如 A, B, C, D(如果它们是有效选项),或 Z。' + '例 1:' + '问题: 图中最主要的物体是什么?\n选项: A. 泰迪熊 B. 兔子 C. 猫 D. 狗\n答案: 一只可爱的泰迪熊\n输出: A\n' + '例 2: \n' + '问题: 图中最主要的物体是什么?\n选项: A. 泰迪熊 B. 兔子 C. 猫 D. 狗\n答案: 蜘蛛\n输出: Z\n' + '例 3: \n' + '问题: {}?\n选项: {}\n答案: {}\n输出: ' + ) + return tmpl.format(question, options, prediction) + + +def build_choices(item): + ret = {} + for ch in string.ascii_uppercase: + if ch in item and (not pd.isna(item[ch])): + ret[ch] = item[ch] + return ret + + +def prefetch_answer(item): + choices = build_choices(item) + return can_infer(item['prediction'], choices) + + +def extract_answer_from_item(model, item): + logger = get_logger('Evaluation') + # It will return: (pred, raw, llm_time) + choices = build_choices(item) + option_str = build_option_str(choices) + + if cn_string(item['question']): + prompt = build_prompt_cn(item['question'], option_str, item['prediction']) + else: + prompt = build_prompt(item['question'], option_str, item['prediction']) + retry = 3 + + ret = can_infer(item['prediction'], choices) + if ret: + return dict(opt=ret, log=item['prediction']) + + while retry: + ans = model.generate(prompt) + if 'Failed to obtain answer via API' in ans: + logger.warning('GPT API failed to answer. 
') + else: + ret = can_infer(ans, choices) + if ret: + return dict(opt=ret, log=ans) + else: + logger.warning(f'Output includes 0 / > 1 letter among candidates {set(choices)} and Z: {ans}') + retry -= 1 + + if retry == 0: + options = list(choices) + ['Z'] if 'Z' not in choices else [] + return dict(opt=rd.choice(options), log='Failed to predict, thus randomly generate one. ') + + +def prefetch_sub_data(sub_data, answer_map, verbose=False): + lt = len(sub_data) + GT, PRED = [], [] + for i in range(lt): + item = sub_data.iloc[i] + idx = item['index'] + GT.append(answer_map[idx]) + PRED.append(prefetch_answer(item)) + if PRED[-1] and (GT[-1] != PRED[-1]): + log = ( + f'Failed in Prefetching Rolling {i}: Answer is {GT[-1]}, ' + f"Prediction is {item['prediction']}, Pre-fetched is {PRED[-1]}. " + ) + return dict(hit=0, log=log) + flag = True + for g, p in zip(GT, PRED): + if g != p: + flag = False + ret = (dict(hit=1, log='Succeed During Pre-fetching'), ) if flag else (None, ) + ret = ret + (GT, PRED) if verbose else ret + return ret if len(ret) > 1 else ret[0] + + +def eval_sub_data(model, sub_data, answer_map): + res, GT, PRED = prefetch_sub_data(sub_data, answer_map, verbose=True) + if res is not None: + return res + + lt = len(sub_data) + log = '' + for i in range(lt): + if PRED[i]: + log += f'Rolling {i} Matched.\n' + else: + res = extract_answer_from_item(model, sub_data.iloc[i]) + opt, match_log = res['opt'], res['log'] + PRED[i] = opt + if PRED[i] != GT[i]: + log += ( + f"Failed in Rolling {i}: Answer is {GT[i]}; Prediction is {sub_data.iloc[i]['prediction']}; " + f'Pre-fetched is {PRED[i]}; Match Log is {match_log}.\n' + ) + return dict(hit=0, log=log) + else: + log += ( + f"Rolling {i}: Answer is {GT[i]}, Prediction is {sub_data.iloc[i]['prediction']}, " + f'Pre-fetched is {PRED[i]}.\n' + ) + + return dict(hit=1, log=log) + + +def eval_data_groups(model, data_groups, answer_map, result, result_file, nproc=16): + prefetched = [prefetch_sub_data(g, answer_map, verbose=False) for g in data_groups] + remain = [] + for dg, pf in zip(data_groups, prefetched): + if pf: + result[dg.iloc[0]['index'] % 1e6] = pf + else: + remain.append(dg) + dump(result, result_file) + tups = [(model, x, answer_map) for x in remain] + keys = [x.iloc[0]['index'] % 1e6 for x in remain] + if len(tups) == 0: + return + + if model is None: + logger = get_logger('Evaluation') + logger.warning('Exact Matching mode, will not do GPT-based answer matching. 
') + for k in keys: + result[k] = dict( + hit=0, log='Failed in Prefetch, no GPT-based answer matching under `exact_matching` policy.') + dump(result, result_file) + return + + res = track_progress_rich( + eval_sub_data, + tups, + nproc=nproc, + chunksize=nproc, + save=result_file, + keys=keys) + result = load(result_file) + for k, v in zip(keys, res): + if k in result: + assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log'] + else: + result[k] = v + dump(result, result_file) + + +def multiple_choice_eval(eval_file, dataset='default', **judge_kwargs): + logger = get_logger('Evaluation') + + # assert dataset is not None + dataset_map = { + 'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11', + 'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11' + } + if dataset in dataset_map: + dataset = dataset_map[dataset] + nproc = judge_kwargs.pop('nproc', 4) + + if listinstr(['mmbench', 'ccbench'], dataset.lower()): + data = load(eval_file) + data['index'] = [int(x) for x in data['index']] + dump(data, eval_file) + + rd.seed(2680) + suffix = eval_file.split('.')[-1] + model = judge_kwargs['model'] + assert model in ['chatgpt-0613', 'exact_matching', 'gpt-4-0125'] + name_str_map = { + 'chatgpt-0613': 'openai', + 'gpt-4-0125': 'gpt4' + } + name_str = name_str_map[model] if model in name_str_map else model + + if model == 'exact_matching': + model = None + else: + if INTERNAL or gpt_key_set(): + model = build_judge(**judge_kwargs) + else: + logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None + + logger.info(f'Evaluating {eval_file}') + result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl') + result = {} + if osp.exists(result_file): + result = load(result_file) + + data = load(eval_file) + data = data.sort_values(by='index') + data['prediction'] = [str(x) for x in data['prediction']] + for k in data.keys(): + data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) + + if dataset != 'default': + meta = TSVDataset(dataset).data + else: + logger.warning('Dataset is not provided, try to use the original `eval_file` as meta data. ') + meta = load(eval_file) + assert 'index' in meta and 'answer' in meta, 'Essentail columns missing in the eval_file.' 
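+ # The meta sheet acts as the scoring key: prediction rows are joined to it on 'index', and the
+ # optional 'category' / 'l2-category' / 'split' columns (turned into lookup maps just below) only
+ # drive the per-group accuracy breakdown in report_acc; maps whose values are all NaN are dropped.
+ # Illustrative shape only, not a dataset spec: answer_map == {0: 'A', 1: 'C', ...}.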
+ + answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])} + cate_map = {i: c for i, c in zip(meta['index'], meta['category'])} if 'category' in meta else None + l2_cate_map = {i: c for i, c in zip(meta['index'], meta['l2-category'])} if 'l2-category' in meta else None + split_map = {i: c for i, c in zip(meta['index'], meta['split'])} if 'split' in meta else None + + if cate_map is not None and np.all([pd.isna(x) for x in cate_map.values()]): + cate_map = None + if l2_cate_map is not None and np.all([pd.isna(x) for x in l2_cate_map.values()]): + l2_cate_map = None + if split_map is not None and np.all([pd.isna(x) for x in split_map.values()]): + split_map = None + + if listinstr(['MMMU'], dataset): + data = MMMU_preproc(data) + answer_map = {k: (v if v in list(string.ascii_uppercase) else 'A') for k, v in answer_map.items()} + + data = data[data['index'].isin(answer_map)] + data_main = data[data['index'] < int(1e6)] + meta_idx_set = set(meta['index']) + data_main = data_main[data_main['index'].isin(meta_idx_set)] + + lt = len(data_main) + hit, tot = 0, 0 + + data_groups = [] + for i in tqdm(range(lt)): + # Dealing with the normal part + item_main = data_main.iloc[i] + idx = item_main['index'] + + if idx in result: + correct = result[idx]['hit'] + assert correct in [0, 1] + hit += correct + tot += 1 + continue + + sub_data = data[data['index'] % int(1e6) == idx] + data_groups.append(sub_data) + + if len(data_groups): + eval_data_groups( + model=model, + data_groups=data_groups, + answer_map=answer_map, + nproc=nproc, + result=result, + result_file=result_file) + + tmp_pth = f'/tmp/{timestr()}.xlsx' + dump(data_main, tmp_pth) + data_main = load(tmp_pth) + + res = load(result_file) + indices = data_main['index'] + + data_main['hit'] = [res[i]['hit'] for i in indices] + data_main['log'] = [res[i]['log'] for i in indices] + + main_idx = data_main['index'] + if cate_map is not None: + data_main['category'] = [cate_map[i] for i in main_idx] + if l2_cate_map is not None: + data_main['l2-category'] = [l2_cate_map[i] for i in main_idx] + if split_map is not None: + data_main['split'] = [split_map[i] for i in indices] + + # load split + dump(data_main, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) + data_main = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) + + acc = report_acc(data_main) + score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + dump(acc, score_file) + logger.info(f'multiple_choice_eval successfully finished evaluating {eval_file}, results saved in {score_file}') + logger.info('Score: ') + logger.info(acc) + return acc + + +def parse_args(): + parser = argparse.ArgumentParser(description='Inference LLM Answers. ') + parser.add_argument('data', type=str, help='The question set for inference, in excel / tsv / json format. ') + parser.add_argument( + '--model', + type=str, + help='The LLM (GPT) used for inference. 
', + default='chatgpt-0613', + choices=['chatgpt-0613', 'exact_matching', 'gpt-4-0125']) + parser.add_argument( + '--dataset', + type=str, + default='default', + help='The dataset to evaluate') + parser.add_argument('--nproc', type=int, default=6) + parser.add_argument('--verbose', action='store_true') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + load_env() + args = parse_args() + judge_kwargs = dict(model=args.model, nproc=args.nproc, verbose=args.verbose) + if 'OPENAI_API_KEY_JUDGE' in os.environ and os.environ['OPENAI_API_KEY_JUDGE']: + judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE'] + if 'OPENAI_API_BASE_JUDGE' in os.environ and os.environ['OPENAI_API_BASE_JUDGE']: + judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE'] + acc = multiple_choice_eval(eval_file=args.data, dataset=args.dataset, **judge_kwargs) diff --git a/eval_mm/vlmevalkit/vlmeval/evaluate/vqa_eval.py b/eval_mm/vlmevalkit/vlmeval/evaluate/vqa_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..33dc01d9662692cbf1c9b726feccdfd0f9ac9f84 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/evaluate/vqa_eval.py @@ -0,0 +1,340 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Partly adopted from https://github.com/GT-Vision-Lab/VQA +# Copyright (c) 2014, Aishwarya Agrawal + +import re +from vlmeval.smp import * +from typing import Optional +from functools import partial + + +def _process_digit_article(inText): + outText = [] + tempText = inText.lower().split() + articles = ['a', 'an', 'the'] + manualMap = { + 'none': '0', + 'zero': '0', + 'one': '1', + 'two': '2', + 'three': '3', + 'four': '4', + 'five': '5', + 'six': '6', + 'seven': '7', + 'eight': '8', + 'nine': '9', + 'ten': '10', + } + contractions = { + 'aint': "ain't", + 'arent': "aren't", + 'cant': "can't", + 'couldve': "could've", + 'couldnt': "couldn't", + "couldn'tve": "couldn't've", + "couldnt've": "couldn't've", + 'didnt': "didn't", + 'doesnt': "doesn't", + 'dont': "don't", + 'hadnt': "hadn't", + "hadnt've": "hadn't've", + "hadn'tve": "hadn't've", + 'hasnt': "hasn't", + 'havent': "haven't", + 'hed': "he'd", + "hed've": "he'd've", + "he'dve": "he'd've", + 'hes': "he's", + 'howd': "how'd", + 'howll': "how'll", + 'hows': "how's", + "Id've": "I'd've", + "I'dve": "I'd've", + 'Im': "I'm", + 'Ive': "I've", + 'isnt': "isn't", + 'itd': "it'd", + "itd've": "it'd've", + "it'dve": "it'd've", + 'itll': "it'll", + "let's": "let's", + 'maam': "ma'am", + 'mightnt': "mightn't", + "mightnt've": "mightn't've", + "mightn'tve": "mightn't've", + 'mightve': "might've", + 'mustnt': "mustn't", + 'mustve': "must've", + 'neednt': "needn't", + 'notve': "not've", + 'oclock': "o'clock", + 'oughtnt': "oughtn't", + "ow's'at": "'ow's'at", + "'ows'at": "'ow's'at", + "'ow'sat": "'ow's'at", + 'shant': "shan't", + "shed've": "she'd've", + "she'dve": "she'd've", + "she's": "she's", + 'shouldve': "should've", + 'shouldnt': "shouldn't", + "shouldnt've": "shouldn't've", + "shouldn'tve": "shouldn't've", + "somebody'd": 'somebodyd', + "somebodyd've": "somebody'd've", + "somebody'dve": "somebody'd've", + 'somebodyll': "somebody'll", + 'somebodys': "somebody's", + 'someoned': "someone'd", + "someoned've": "someone'd've", + "someone'dve": "someone'd've", + 'someonell': "someone'll", + 'someones': "someone's", + 'somethingd': "something'd", + "somethingd've": "something'd've", + "something'dve": "something'd've", + 'somethingll': "something'll", + 'thats': "that's", + 'thered': "there'd", + "thered've": "there'd've", + "there'dve": 
"there'd've", + 'therere': "there're", + 'theres': "there's", + 'theyd': "they'd", + "theyd've": "they'd've", + "they'dve": "they'd've", + 'theyll': "they'll", + 'theyre': "they're", + 'theyve': "they've", + 'twas': "'twas", + 'wasnt': "wasn't", + "wed've": "we'd've", + "we'dve": "we'd've", + 'weve': "we've", + 'werent': "weren't", + 'whatll': "what'll", + 'whatre': "what're", + 'whats': "what's", + 'whatve': "what've", + 'whens': "when's", + 'whered': "where'd", + 'wheres': "where's", + 'whereve': "where've", + 'whod': "who'd", + "whod've": "who'd've", + "who'dve": "who'd've", + 'wholl': "who'll", + 'whos': "who's", + 'whove': "who've", + 'whyll': "why'll", + 'whyre': "why're", + 'whys': "why's", + 'wont': "won't", + 'wouldve': "would've", + 'wouldnt': "wouldn't", + "wouldnt've": "wouldn't've", + "wouldn'tve": "wouldn't've", + 'yall': "y'all", + "yall'll": "y'all'll", + "y'allll": "y'all'll", + "yall'd've": "y'all'd've", + "y'alld've": "y'all'd've", + "y'all'dve": "y'all'd've", + 'youd': "you'd", + "youd've": "you'd've", + "you'dve": "you'd've", + 'youll': "you'll", + 'youre': "you're", + 'youve': "you've", + } + for word in tempText: + word = manualMap.setdefault(word, word) + if word not in articles: + outText.append(word) + for wordId, word in enumerate(outText): + if word in contractions: + outText[wordId] = contractions[word] + outText = ' '.join(outText) + return outText + + +def hit_calculate(result, dataset_name, anls_threshold=0.5): + if listinstr(['TextVQA'], dataset_name): + return [np.mean(x['match']) for x in result] + elif listinstr(['DocVQA', 'InfoVQA'], dataset_name): + # return [1 - np.min(x['match']) >= anls_threshold for x in result] + return [0.0 if 1 - np.min(x['match']) < anls_threshold else 1 - np.min(x['match']) for x in result] + elif listinstr(['ChartQA', 'OCRVQA'], dataset_name): + return [np.max(x['match']) for x in result] + else: # default using vqa_score to calculate score + return [np.mean(x['match']) for x in result] + + +# https://github.com/google-research/pix2struct/blob/main/pix2struct/metrics.py#L81 +def relaxed_correctness(target: str, + prediction: str, + max_relative_change: float = 0.05) -> bool: + """Calculates relaxed correctness. + + The correctness tolerates certain error ratio defined by max_relative_change. + See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1: + “Following Methani et al. (2020), we use a relaxed accuracy measure for the + numeric answers to allow a minor inaccuracy that may result from the automatic + data extraction process. We consider an answer to be correct if it is within + 5% of the gold answer. For non-numeric answers, we still need an exact match + to consider an answer to be correct.” + + Args: + target: Target string. + prediction: Predicted string. + max_relative_change: Maximum relative change. + + Returns: + Whether the prediction was correct given the specified tolerance. + """ + + def _to_float(text: str) -> Optional[float]: + try: + if text.endswith('%'): + # Convert percentages to floats. 
+ return float(text.rstrip('%')) / 100.0 + else: + return float(text) + except ValueError: + return None + prediction = str(prediction) + target = str(target) + prediction_float = _to_float(prediction) + target_float = _to_float(target) + if prediction_float is not None and target_float: + relative_change = abs(prediction_float - target_float) / abs(target_float) + return relative_change <= max_relative_change + else: + return prediction.lower() == target.lower() + + +def levenshtein_distance(s1, s2): + if len(s1) > len(s2): + s1, s2 = s2, s1 + + distances = range(len(s1) + 1) + for i2, c2 in enumerate(s2): + distances_ = [i2 + 1] + for i1, c1 in enumerate(s1): + if c1 == c2: + distances_.append(distances[i1]) + else: + distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1]))) + distances = distances_ + return distances[-1] + + +def anls_compute(groundtruth, prediction): + gt_answer = ' '.join(groundtruth.strip().lower().split()) + det_answer = ' '.join(prediction.strip().lower().split()) + dist = levenshtein_distance(gt_answer, det_answer) + length = max(len(groundtruth.upper()), len(prediction.upper())) + values = 0.0 if length == 0 else float(dist) / float(length) + return values + + +def process_answer(answer): + answer = answer.replace('\n', ' ') + answer = answer.replace('\t', ' ') + answer = answer.strip() + answer = process_punctuation(answer) + answer = _process_digit_article(answer) + return answer + + +def process_line(line, method='vqa_score'): + ret = {} + if istype(line['answer'], list): + answers = eval(line['answer']) + else: + answers = [line['answer']] + if method == 'vqa_score': + ret['gt'] = [process_answer(x) for x in answers] + ret['pred'] = process_answer(line['prediction']) + ret['match'] = [] + for current_idx, gtAnsDatum in enumerate(ret['gt']): + otherGTAns = [ + item for ret_gt_idx, item in enumerate(ret['gt']) + if ret_gt_idx != current_idx + ] + matchingAns = [ + item for item in otherGTAns if item == ret['pred'] + ] + acc = min(1, float(len(matchingAns)) / 3) + ret['match'].append(acc) + elif method == 'anls': + ret['gt'] = answers + ret['pred'] = line['prediction'] + ret['match'] = [anls_compute(x, ret['pred']) for x in ret['gt']] + elif method == 'relaxed_accuracy': + ret['gt'] = answers + ret['pred'] = line['prediction'].strip() + ret['match'] = [relaxed_correctness(ret['pred'], x) for x in ret['gt']] + elif method == 'accuracy': + ret['gt'] = answers + ret['pred'] = line['prediction'].strip() + ret['match'] = [(1.0 if (x.strip().lower() == ret['pred'].strip().lower()) else 0.0) for x in ret['gt']] + else: # default using vqa_score to calculate score + ret['gt'] = [process_answer(x) for x in answers] + ret['pred'] = process_answer(line['prediction']) + ret['match'] = [x == ret['pred'] for x in ret['gt']] + + return ret + + +def VQAEval(eval_file, dataset_name, **kwargs): + logger = get_logger('Evaluation') + data = load(eval_file) + assert 'answer' in data and 'prediction' in data + data['prediction'] = [str(x) for x in data['prediction']] + data['answer'] = [str(x) for x in data['answer']] + lt = len(data) + pool = mp.Pool(16) + lines = [data.iloc[i] for i in range(lt)] + if listinstr(['TextVQA'], dataset_name): + res = pool.map(partial(process_line, method='vqa_score'), lines) + elif listinstr(['ChartQA'], dataset_name): + res = pool.map(partial(process_line, method='relaxed_accuracy'), lines) + elif listinstr(['OCRVQA'], dataset_name): + res = pool.map(partial(process_line, method='accuracy'), lines) + elif listinstr(['DocVQA', 
'InfoVQA'], dataset_name): + res = pool.map(partial(process_line, method='anls'), lines) + else: # default using vqa_score to calculate score + res = pool.map(process_line, lines) + # [np.mean(x['match']) >= full_score_weight for x in res] + hit = hit_calculate(res, dataset_name) + ret = dict() + if 'split' in data: + splits = set(data['split']) + for sp in splits: + sub = [r for l, r in zip(lines, res) if l['split'] == sp] + # [np.mean(x['match']) >= full_score_weight for x in sub] + hit = hit_calculate(sub, dataset_name) + ret[sp] = np.mean(hit) * 100 + sub = [r for l, r in zip(lines, res)] + hit = hit_calculate(sub, dataset_name) + ret['Overall'] = np.mean(hit) * 100 + else: + ret['Overall'] = np.mean(hit) * 100 + if 'category' in data: + cates = list(set(data['category'])) + cates.sort() + for c in cates: + sub = [r for l, r in zip(lines, res) if l['category'] == c] + # [np.mean(x['match']) >= full_score_weight for x in sub] + hit = hit_calculate(sub, dataset_name) + ret[c] = np.mean(hit) * 100 + ret = d2df(ret) + ret.round(2) + + suffix = eval_file.split('.')[-1] + result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + logger.info(f'VQA Eval Finished. Saved to {result_file}. ') + logger.info(ret) + dump(ret, result_file) diff --git a/eval_mm/vlmevalkit/vlmeval/evaluate/yes_or_no.py b/eval_mm/vlmevalkit/vlmeval/evaluate/yes_or_no.py new file mode 100644 index 0000000000000000000000000000000000000000..6012c3745842c2b4d29674e05b13afb0aa9ff4b0 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/evaluate/yes_or_no.py @@ -0,0 +1,297 @@ +from vlmeval.evaluate.misc import build_judge +from vlmeval.smp import * +from vlmeval.utils import track_progress_rich + +INTERNAL = os.environ.get('INTERNAL', 0) + + +def MME_rating(data_file): + data = load(data_file) + stats = defaultdict(dict) + lt = len(data) + for i in range(lt): + item = data.iloc[i] + category = item['category'] + image_path = item['image_path'] + score = item['score'] + if image_path not in stats[category]: + stats[category][image_path] = [] + stats[category][image_path].append(score) + + def acc(key, mode='normal'): + res = stats[key] + values = [] + for val in res.values(): + if mode == 'normal': + values.extend(val) + elif mode == 'plus': + values.append(val[0] * val[1]) + return np.mean(values) * 100 + + scores = {} + for k in stats: + scores[k] = acc(k) + acc(k, 'plus') + + super_cates = dict( + perception=[ + 'OCR', 'artwork', 'celebrity', 'color', 'count', 'existence', + 'landmark', 'position', 'posters', 'scene' + ], + reasoning=['code_reasoning', 'commonsense_reasoning', 'numerical_calculation', 'text_translation'] + ) + + ret = {} + for sc, cate_list in super_cates.items(): + base = 0 + for c in cate_list: + base += scores[c] + ret[sc] = base + ret.update(scores) + ret = d2df(ret) + return ret + + +def Hallusion_rating(data_file): + def calc_fAcc(data): + res = defaultdict(list) + lt = len(data) + for i in range(lt): + line = data.iloc[i] + res[f"{line['l2-category']}_{line['set_id']}_{line['figure_id']}"].append(line['score']) + return np.mean([np.all(x) for x in res.values()]) * 100 + + def calc_qAcc(data): + res = defaultdict(list) + lt = len(data) + for i in range(lt): + line = data.iloc[i] + res[f"{line['l2-category']}_{line['set_id']}_{line['question_id']}"].append(line['score']) + return np.mean([np.all(x) for x in res.values()]) * 100 + + def calc_aAcc(data): + return np.mean(data['score']) * 100 + + data = load(data_file) + data['set_id'] = [x.split('_')[3] for x in data['index']] + data['figure_id'] = 
[x.split('_')[4] for x in data['index']] + data['question_id'] = [x.split('_')[5] for x in data['index']] + + res = dict(split=[], aAcc=[], fAcc=[], qAcc=[]) + res['split'].append('Overall') + res['aAcc'].append(calc_aAcc(data)) + res['fAcc'].append(calc_fAcc(data)) + res['qAcc'].append(calc_qAcc(data)) + + if 'category' in data: + cates = list(set(data['category'])) + for c in cates: + sub = data[data['category'] == c] + res['split'].append(c) + res['aAcc'].append(calc_aAcc(sub)) + res['fAcc'].append(calc_fAcc(sub)) + res['qAcc'].append(calc_qAcc(sub)) + + if 'l2-category' in data: + cates = list(set(data['l2-category'])) + for c in cates: + sub = data[data['l2-category'] == c] + res['split'].append(c) + res['aAcc'].append(calc_aAcc(sub)) + res['fAcc'].append(calc_fAcc(sub)) + res['qAcc'].append(calc_qAcc(sub)) + ret = pd.DataFrame(res) + return ret + + +def POPE_rating(data_file): + def cal_f1_score(y_true, y_pred): + tp = sum((y_true == 1) & (y_pred == 1)) + fp = sum((y_true == 0) & (y_pred == 1)) + fn = sum((y_true == 1) & (y_pred == 0)) + + precision = tp / (tp + fp) if (tp + fp) != 0 else 0 + recall = tp / (tp + fn) if (tp + fn) != 0 else 0 + f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0 + return f1_score, precision, recall + + data = load(data_file) + data = data.assign(category=data['category'].str.split(',')).explode('category') + data['index'] = range(len(data)) + res = dict(split=[], Overall=[], acc=[], precision=[], recall=[]) + y_true = np.array([1 if i == 'Yes' else 0 for i in data['answer']]) + y_pred = np.array([1 if i == 'Yes' else 0 for i in data['extracted']]) + f1_score, precision, recall = cal_f1_score(y_true, y_pred) + res['split'].append('Overall') + res['Overall'].append(f1_score * 100) + res['acc'].append(np.mean(data['score']) * 100) + res['precision'].append(precision * 100) + res['recall'].append(recall * 100) + + if 'category' in data: + cates = list(set(data['category'])) + cates = [c for c in cates if not pd.isna(c)] + for c in cates: + sub = data[data['category'] == c] + y_true = np.array([1 if i == 'Yes' else 0 for i in sub['answer']]) + y_pred = np.array([1 if i == 'Yes' else 0 for i in sub['extracted']]) + f1_score, precision, recall = cal_f1_score(y_true, y_pred) + res['split'].append(c) + res['Overall'].append(f1_score * 100) + res['acc'].append(np.mean(sub['score']) * 100) + res['precision'].append(precision * 100) + res['recall'].append(recall * 100) + + ret = pd.DataFrame(res) + return ret + + +def default_rating(data_file): + data = load(data_file) + res = {} + res['Overall'] = np.mean(data['score']) * 100 + if 'category' in data: + cates = list(set(data['category'])) + cates = [c for c in cates if not pd.isna(c)] + cates.sort() + for c in cates: + sub = data[data['category'] == c] + res[c] = np.mean(sub['score']) * 100 + if 'l2-category' in data: + cates = list(set(data['l2-category'])) + cates = [c for c in cates if not pd.isna(c)] + cates.sort() + for c in cates: + sub = data[data['l2-category'] == c] + res[c] = np.mean(sub['score']) * 100 + ret = d2df(res) + return ret + + +def YOrN_match_prompt(line): + tmpl = ( + 'You are an AI assistant who will help me to match an answer with two options of a question. ' + 'The options are only Yes / No. ' + 'You are provided with a question and an answer, ' + 'and you need to find which option (Yes / No) is most similar to the answer. ' + 'If the meaning of all options are significantly different from the answer, output Unknown. 
' + 'Your should output a single word among the following 3 choices: Yes, No, Unknown.\n' + 'Example 1: \n' + "Question: Is the word in this image 'Hello'?\nAnswer: The word in this image is 'Hello'.\nYour output: Yes\n" + 'Example 2: \n' + "Question: Is the word in this image 'Hello'?\n" + "Answer: The word in this image is not 'Hello'.\nYour output: No\n" + 'Example 3: \n' + 'Question: {}?\nAnswer: {}\nYour output: ' + ) + return tmpl.format(line['question'], line['prediction']) + + +def YOrN_Extraction(output): + s = output.lower() + words = process_punctuation(s).split() + if 'yes' in words and 'no' not in words: + return 'Yes' + if 'yes' not in words and 'no' in words: + return 'No' + return 'Unknown' + + +def YOrN_auxeval(model, line): + prompt = YOrN_match_prompt(line) + retry = 5 + for i in range(retry): + output = model.generate(prompt, temperature=0.5 * i) + ans = YOrN_Extraction(output) + if ans != 'Unknown': + return ans + return 'Unknown' + + +def YOrN_eval(eval_file, dataset=None, **judge_kwargs): + logger = get_logger('Evaluation') + data = load(eval_file) + data['prediction'] = [str(x) for x in data['prediction']] + storage = eval_file.replace('.xlsx', '_auxmatch.xlsx') + tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') + nproc = judge_kwargs.pop('nproc', 4) + + if not osp.exists(storage): + ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])} + if osp.exists(tmp_file): + tmp = load(tmp_file) + for k in tmp: + if ans_map[k] == 'Unknown' and tmp[k] != 'Unknown': + ans_map[k] = tmp[k] + + data['extracted'] = [ans_map[x] for x in data['index']] + unknown = data[data['extracted'] == 'Unknown'] + + if INTERNAL or gpt_key_set(): + model = build_judge(**judge_kwargs) + else: + logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None + + if model is not None: + lt = len(unknown) + lines = [unknown.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = list(unknown['index']) + if len(tups): + res = track_progress_rich( + YOrN_auxeval, tups, nproc=nproc, chunksize=nproc, keys=indices, save=tmp_file) + for k, v in zip(indices, res): + ans_map[k] = v + + data['extracted'] = [ans_map[x] for x in data['index']] + dump(data, storage) + else: + logger.warning(f'GPT matching file {storage} already exists, will reuse it in YOrN_eval. ') + + data = load(storage) + data['score'] = (data['answer'] == data['extracted']) + dump(data, storage) + + if dataset is not None and listinstr(['MME'], dataset): + score = MME_rating(storage) + elif dataset is not None and listinstr(['Hallusion'], dataset): + score = Hallusion_rating(storage) + elif dataset is not None and listinstr(['POPE'], dataset): + score = POPE_rating(storage) + else: + score = default_rating(storage) + + score_tgt = eval_file.replace('.xlsx', '_score.csv') + dump(score, score_tgt) + + logger.info(f'YOrN_eval successfully finished evaluating {eval_file}, results saved in {score_tgt}') + logger.info('Score: ') + logger.info(score) + return score + + +def parse_args(): + parser = argparse.ArgumentParser(description='Inference LLM Answers. ') + parser.add_argument('data', type=str, help='The question set for inference, in excel / tsv / json format. ') + parser.add_argument( + '--model', + type=str, + help='The LLM (GPT) used for inference. 
', + default='chatgpt-0613', + choices=['chatgpt-0613']) + parser.add_argument('--nproc', type=int, default=4) + parser.add_argument('--dataset', type=str, default=None) + parser.add_argument('--verbose', action='store_true') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + load_env() + args = parse_args() + judge_kwargs = dict(model=args.model, nproc=args.nproc, verbose=args.verbose) + if 'OPENAI_API_KEY_JUDGE' in os.environ and os.environ['OPENAI_API_KEY_JUDGE']: + judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE'] + if 'OPENAI_API_BASE_JUDGE' in os.environ and os.environ['OPENAI_API_BASE_JUDGE']: + judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE'] + acc = YOrN_eval(eval_file=args.data, dataset=args.dataset, **judge_kwargs) diff --git a/eval_mm/vlmevalkit/vlmeval/inference.py b/eval_mm/vlmevalkit/vlmeval/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..4719c2e42346c58ff449bd8f8ee3ba48590b78e2 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/inference.py @@ -0,0 +1,184 @@ +import torch +import torch.distributed as dist +import datetime +from vlmeval.config import supported_VLM, api_models +from vlmeval.utils import TSVDataset, track_progress_rich, split_MMMU +from vlmeval.smp import * + +FAIL_MSG = 'Failed to obtain answer via API.' + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--data', type=str, nargs='+', required=True) + parser.add_argument('--model', type=str, nargs='+', required=True) + parser.add_argument('--nproc', type=int, default=4, required=True) + parser.add_argument('--verbose', action='store_true') + args = parser.parse_args() + return args + + +# Only API model is accepted +def infer_data_api(work_dir, model_name, dataset_name, index_set=None, api_nproc=4, ignore_failed=False): + rank, world_size = get_rank_and_world_size() + assert rank == 0 and world_size == 1 + dataset = TSVDataset(dataset_name) + data = dataset.data + if index_set is not None: + data = data[data['index'].isin(index_set)] + + model = supported_VLM[model_name]() if isinstance(model_name, str) else model_name + assert getattr(model, 'is_api', False) + + lt, indices = len(data), list(data['index']) + structs = [dataset.build_prompt(data.iloc[i]) for i in range(lt)] + + # Corner Case + if listinstr(['MMMU'], dataset_name): + structs = [split_MMMU(s) for s in structs] + + out_file = f'{work_dir}/{model_name}_{dataset_name}_supp.pkl' + res = {} + if osp.exists(out_file): + res = load(out_file) + if ignore_failed: + res = {k: v for k, v in res.items() if FAIL_MSG not in v} + + structs = [s for i, s in zip(indices, structs) if i not in res] + indices = [i for i in indices if i not in res] + + gen_func = model.generate + # For now, we do not use split_MMMU for MMMU dataset + structs = [dict(message=struct, dataset=dataset_name) for struct in structs] + + if len(structs): + track_progress_rich(gen_func, structs, nproc=api_nproc, chunksize=api_nproc, save=out_file, keys=indices) + + res = load(out_file) + if index_set is not None: + res = {k: v for k, v in res.items() if k in index_set} + os.remove(out_file) + return res + + +def infer_data(model_name, work_dir, dataset_name, out_file, verbose=False, api_nproc=4): + prev_file = f'{work_dir}/{model_name}_{dataset_name}_PREV.pkl' + res = load(prev_file) if osp.exists(prev_file) else {} + if osp.exists(out_file): + res.update(load(out_file)) + + rank, world_size = get_rank_and_world_size() + if rank == 0: + dataset = TSVDataset(dataset_name) + if 
world_size > 1: + dist.barrier() + dataset = TSVDataset(dataset_name) + + sheet_indices = list(range(rank, len(dataset), world_size)) + lt = len(sheet_indices) + data = dataset.data.iloc[sheet_indices] + data_indices = [i for i in data['index']] + + # If finished, will exit without building the model + all_finished = True + for i in range(lt): + idx = data.iloc[i]['index'] + if idx not in res: + all_finished = False + if all_finished: + res = {k: res[k] for k in data_indices} + dump(res, out_file) + return + + # Data need to be inferred + data = data[~data['index'].isin(res)] + lt = len(data) + + model = supported_VLM[model_name]() if isinstance(model_name, str) else model_name + + is_api = getattr(model, 'is_api', False) + if is_api: + lt, indices = len(data), list(data['index']) + supp = infer_data_api( + work_dir=work_dir, + model_name=model_name, + dataset_name=dataset_name, + index_set=set(indices), + api_nproc=api_nproc) + for idx in indices: + assert idx in supp + res.update(supp) + res = {k: res[k] for k in data_indices} + dump(res, out_file) + return model_name + + for i in tqdm(range(lt)): + idx = data.iloc[i]['index'] + if idx in res: + continue + + if hasattr(model, 'use_custom_prompt') and model.use_custom_prompt(dataset_name): + struct = model.build_prompt(data.iloc[i], dataset=dataset_name) + else: + struct = dataset.build_prompt(data.iloc[i]) + + # Corner Case + if listinstr(['MMMU'], dataset_name): + struct = split_MMMU(struct) + + # For now, we do not use split_MMMU for MMMU dataset + response = model.generate(message=struct, dataset=dataset_name) + # torch.cuda.empty_cache() + + if verbose: + print(response, flush=True) + + res[idx] = response + if (i + 1) % 20 == 0: + dump(res, out_file) + + res = {k: res[k] for k in data_indices} + dump(res, out_file) + return model + + +# A wrapper for infer_data, do the pre & post processing +def infer_data_job(model, work_dir, model_name, dataset_name, verbose=False, api_nproc=4, ignore_failed=False): + rank, world_size = get_rank_and_world_size() + result_file = osp.join(work_dir, f'{model_name}_{dataset_name}.xlsx') + + prev_file = f'{work_dir}/{model_name}_{dataset_name}_PREV.pkl' + if osp.exists(result_file): + if rank == 0: + data = load(result_file) + results = {k: v for k, v in zip(data['index'], data['prediction'])} + if not ignore_failed: + results = {k: v for k, v in results.items() if FAIL_MSG not in str(v)} + dump(results, prev_file) + if world_size > 1: + dist.barrier() + + tmpl = osp.join(work_dir, '{}' + f'{world_size}_{dataset_name}.pkl') + out_file = tmpl.format(rank) + + model = infer_data( + model, work_dir=work_dir, dataset_name=dataset_name, out_file=out_file, verbose=verbose, api_nproc=api_nproc) + if world_size > 1: + dist.barrier() + + if rank == 0: + data_all = {} + for i in range(world_size): + data_all.update(load(tmpl.format(i))) + + data = TSVDataset(dataset_name).data + for x in data['index']: + assert x in data_all + data['prediction'] = [str(data_all[x]) for x in data['index']] + if 'image' in data: + data.pop('image') + + dump(data, result_file) + for i in range(world_size): + os.remove(tmpl.format(i)) + return model diff --git a/eval_mm/vlmevalkit/vlmeval/smp/__init__.py b/eval_mm/vlmevalkit/vlmeval/smp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..46e89687d469b83ec7dd7e3205841d35087108c7 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/smp/__init__.py @@ -0,0 +1,4 @@ +from .file import * +from .vlm import * +from .misc import * +from .log import * diff --git 
a/eval_mm/vlmevalkit/vlmeval/smp/file.py b/eval_mm/vlmevalkit/vlmeval/smp/file.py new file mode 100644 index 0000000000000000000000000000000000000000..c9aa693fc8f332981ceff44dd392a7d2ded2c9de --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/smp/file.py @@ -0,0 +1,230 @@ +import json +import pickle +import pandas as pd +import os +import csv +import hashlib +import os.path as osp +import time +import numpy as np +import validators +import mimetypes + + +def LMUDataRoot(): + if 'LMUData' in os.environ and osp.exists(os.environ['LMUData']): + return os.environ['LMUData'] + home = osp.expanduser('~') + root = osp.join(home, 'LMUData') + os.makedirs(root, exist_ok=True) + # root = './LMUData' + # os.makedirs(root, exist_ok=True) + return root + +def MMBenchOfficialServer(dataset_name): + root = LMUDataRoot() + + if dataset_name in ['MMBench', 'MMBench_V11', 'MMBench_CN', 'MMBench_CN_V11']: + ans_file = f'{root}/{dataset_name}.tsv' + if osp.exists(ans_file): + data = load(ans_file) + if 'answer' in data and sum([pd.isna(x) for x in data['answer']]) == 0: + return True + + if dataset_name in ['MMBench_TEST_EN', 'MMBench_TEST_CN', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11']: + ans_file1 = f'{root}/{dataset_name}.tsv' + mapp = { + 'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_CN': 'MMBench_CN', + 'MMBench_TEST_EN_V11': 'MMBench_V11', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11', + } + ans_file2 = f'{root}/{mapp[dataset_name]}.tsv' + for f in [ans_file1, ans_file2]: + if osp.exists(f): + data = load(f) + if 'answer' in data and sum([pd.isna(x) for x in data['answer']]) == 0: + return True + return False + + +class NumpyEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, (np.int_, np.intc, np.intp, np.int8, + np.int16, np.int32, np.int64, np.uint8, + np.uint16, np.uint32, np.uint64)): + return int(obj) + elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)): + return float(obj) + elif isinstance(obj, (np.complex_, np.complex64, np.complex128)): + return {'real': obj.real, 'imag': obj.imag} + elif isinstance(obj, (np.ndarray,)): + return obj.tolist() + elif isinstance(obj, (np.bool_)): + return bool(obj) + elif isinstance(obj, (np.void)): + return None + return json.JSONEncoder.default(self, obj) + + +# LOAD & DUMP +def dump(data, f, **kwargs): + def dump_pkl(data, pth, **kwargs): + pickle.dump(data, open(pth, 'wb')) + + def dump_json(data, pth, **kwargs): + json.dump(data, open(pth, 'w'), indent=4, ensure_ascii=False, cls=NumpyEncoder) + + def dump_jsonl(data, f, **kwargs): + lines = [json.dumps(x, ensure_ascii=False, cls=NumpyEncoder) for x in data] + with open(f, 'w', encoding='utf8') as fout: + fout.write('\n'.join(lines)) + + def dump_xlsx(data, f, **kwargs): + data.to_excel(f, index=False, engine='xlsxwriter') + + def dump_csv(data, f, quoting=csv.QUOTE_ALL): + data.to_csv(f, index=False, encoding='utf-8', quoting=quoting) + + def dump_tsv(data, f, quoting=csv.QUOTE_ALL): + data.to_csv(f, sep='\t', index=False, encoding='utf-8', quoting=quoting) + + handlers = dict(pkl=dump_pkl, json=dump_json, jsonl=dump_jsonl, xlsx=dump_xlsx, csv=dump_csv, tsv=dump_tsv) + suffix = f.split('.')[-1] + return handlers[suffix](data, f, **kwargs) + + +def load(f): + def load_pkl(pth): + return pickle.load(open(pth, 'rb')) + + def load_json(pth): + return json.load(open(pth, 'r', encoding='utf-8')) + + def load_jsonl(f): + lines = open(f, encoding='utf-8').readlines() + lines = [x.strip() for x in lines] + if lines[-1] == '': + lines = lines[:-1] + data = [json.loads(x) for x in lines] 
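+            # JSONL: each remaining non-empty line is parsed as an independent JSON object.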
+ return data + + def load_xlsx(f): + return pd.read_excel(f) + + def load_csv(f): + return pd.read_csv(f) + + def load_tsv(f): + return pd.read_csv(f, sep='\t') + + handlers = dict(pkl=load_pkl, json=load_json, jsonl=load_jsonl, xlsx=load_xlsx, csv=load_csv, tsv=load_tsv) + suffix = f.split('.')[-1] + return handlers[suffix](f) + + +def download_file(url, filename=None): + import urllib.request + from tqdm import tqdm + + class DownloadProgressBar(tqdm): + def update_to(self, b=1, bsize=1, tsize=None): + if tsize is not None: + self.total = tsize + self.update(b * bsize - self.n) + + if filename is None: + filename = url.split('/')[-1] + + with DownloadProgressBar(unit='B', unit_scale=True, + miniters=1, desc=url.split('/')[-1]) as t: + urllib.request.urlretrieve(url, filename=filename, reporthook=t.update_to) + return filename + + +def ls(dirname='.', match=[], mode='all', level=1): + if isinstance(level, str): + assert '+' in level + level = int(level[:-1]) + res = [] + for i in range(1, level + 1): + res.extend(ls(dirname, match=match, mode='file', level=i)) + return res + + if dirname == '.': + ans = os.listdir(dirname) + else: + ans = [osp.join(dirname, x) for x in os.listdir(dirname)] + assert mode in ['all', 'dir', 'file'] + assert level >= 1 and isinstance(level, int) + if level == 1: + if isinstance(match, str): + match = [match] + for m in match: + if len(m) == 0: + continue + if m[0] != '!': + ans = [x for x in ans if m in x] + else: + ans = [x for x in ans if m[1:] not in x] + if mode == 'dir': + ans = [x for x in ans if osp.isdir(x)] + elif mode == 'file': + ans = [x for x in ans if not osp.isdir(x)] + return ans + else: + dirs = [x for x in ans if osp.isdir(x)] + res = [] + for d in dirs: + res.extend(ls(d, match=match, mode=mode, level=level - 1)) + return res + + +def mrlines(fname, sp='\n'): + f = open(fname).read().split(sp) + while f != [] and f[-1] == '': + f = f[:-1] + return f + + +def mwlines(lines, fname): + with open(fname, 'w') as fout: + fout.write('\n'.join(lines)) + + +def md5(s): + hash = hashlib.new('md5') + if osp.exists(s): + with open(s, 'rb') as f: + for chunk in iter(lambda: f.read(2**20), b''): + hash.update(chunk) + else: + hash.update(s.encode('utf-8')) + return str(hash.hexdigest()) + + +def last_modified(pth): + stamp = osp.getmtime(pth) + m_ti = time.ctime(stamp) + t_obj = time.strptime(m_ti) + t = time.strftime('%Y%m%d%H%M%S', t_obj)[2:] + return t + + +def parse_file(s): + if osp.exists(s): + assert osp.isfile(s) + suffix = osp.splitext(s)[1].lower() + mime = mimetypes.types_map.get(suffix, 'unknown') + return (mime, s) + elif validators.url(s): + suffix = osp.splitext(s)[1].lower() + if suffix in mimetypes.types_map: + mime = mimetypes.types_map[suffix] + dname = osp.join(LMUDataRoot(), 'files') + os.makedirs(dname, exist_ok=True) + tgt = osp.join(dname, md5(s) + suffix) + download_file(s, tgt) + return (mime, tgt) + else: + return ('url', s) + else: + return (None, s) diff --git a/eval_mm/vlmevalkit/vlmeval/smp/log.py b/eval_mm/vlmevalkit/vlmeval/smp/log.py new file mode 100644 index 0000000000000000000000000000000000000000..95804d5ed4048e6aca6535e1e0fde64525c0fac3 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/smp/log.py @@ -0,0 +1,44 @@ +import logging + +logger_initialized = {} + + +def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): + logger = logging.getLogger(name) + if name in logger_initialized: + return logger + + for logger_name in logger_initialized: + if name.startswith(logger_name): + return logger + + 
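+    # If an already-initialized logger's name is a prefix of ``name``, the child
+    # logger is returned above without handlers and relies on propagation to that
+    # ancestor. Otherwise a stream handler is attached below, plus a file handler
+    # on rank 0 when ``log_file`` is given, so only the main process writes files.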
stream_handler = logging.StreamHandler() + handlers = [stream_handler] + + try: + import torch.distributed as dist + if dist.is_available() and dist.is_initialized(): + rank = dist.get_rank() + else: + rank = 0 + except ImportError: + rank = 0 + + if rank == 0 and log_file is not None: + file_handler = logging.FileHandler(log_file, file_mode) + handlers.append(file_handler) + + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s') + for handler in handlers: + handler.setFormatter(formatter) + handler.setLevel(log_level) + logger.addHandler(handler) + + if rank == 0: + logger.setLevel(log_level) + else: + logger.setLevel(logging.ERROR) + + logger_initialized[name] = True + return logger diff --git a/eval_mm/vlmevalkit/vlmeval/smp/misc.py b/eval_mm/vlmevalkit/vlmeval/smp/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..7640fad28849c325e0955fdb9f43d27a1c9ecc64 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/smp/misc.py @@ -0,0 +1,191 @@ +# flake8: noqa: F401, F403 +import abc +import argparse +import csv +import multiprocessing as mp +import os +import os.path as osp +import copy as cp +import random as rd +import requests +import shutil +import subprocess +import warnings +import logging +import pandas as pd +from collections import OrderedDict, defaultdict +from multiprocessing import Pool, current_process +from tqdm import tqdm +import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from tabulate import tabulate_formats, tabulate +from huggingface_hub import scan_cache_dir +from sty import fg, bg, ef, rs + +def process_punctuation(inText): + import re + outText = inText + punct = [ + ';', r'/', '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-', + '>', '<', '@', '`', ',', '?', '!' 
+ ] + commaStrip = re.compile('(\d)(,)(\d)') # noqa: W605 + periodStrip = re.compile('(?!<=\d)(\.)(?!\d)') # noqa: W605 + for p in punct: + if (p + ' ' in inText or ' ' + p in inText) or (re.search( + commaStrip, inText) is not None): + outText = outText.replace(p, '') + else: + outText = outText.replace(p, ' ') + outText = periodStrip.sub('', outText, re.UNICODE) + return outText + +def h2r(value): + if value[0] == '#': + value = value[1:] + assert len(value) == 6 + return tuple(int(value[i:i + 2], 16) for i in range(0, 6, 2)) + +def r2h(rgb): + return '#%02x%02x%02x' % rgb + +def colored(s, color): + if isinstance(color, str): + if hasattr(fg, color): + return getattr(fg, color) + s + fg.rs + color = h2r(color) + return fg(*color) + s + fg.rs + +def istype(s, type): + if isinstance(s, type): + return True + try: + return isinstance(eval(s), type) + except Exception as _: + return False + +def bincount(lst): + bins = defaultdict(lambda: 0) + for item in lst: + bins[item] += 1 + return bins + +def get_cache_path(repo_id): + hf_cache_info = scan_cache_dir() + repos = list(hf_cache_info.repos) + repo = None + for r in repos: + if r.repo_id == repo_id: + repo = r + break + if repo is None: + return None + revs = list(repo.revisions) + rev2keep, last_modified = None, 0 + for rev in revs: + if rev.last_modified > last_modified: + rev2keep, last_modified = rev, rev.last_modified + if rev2keep is None: + return None + return str(rev2keep.snapshot_path) + +def proxy_set(s): + import os + for key in ['http_proxy', 'HTTP_PROXY', 'https_proxy', 'HTTPS_PROXY']: + os.environ[key] = s + +def get_rank_and_world_size(): + rank = int(os.environ.get('RANK', 0)) + world_size = int(os.environ.get('WORLD_SIZE', 1)) + return rank, world_size + +def splitlen(s, sym='/'): + return len(s.split(sym)) + +def listinstr(lst, s): + assert isinstance(lst, list) + for item in lst: + if item in s: + return True + return False + +def d2df(D): + return pd.DataFrame({x: [D[x]] for x in D}) + +def cn_string(s): + import re + if re.search(u'[\u4e00-\u9fff]', s): + return True + return False + +try: + import decord +except ImportError: + pass + +def timestr(second=True, minute=False): + s = datetime.datetime.now().strftime('%Y%m%d%H%M%S')[2:] + if second: + return s + elif minute: + return s[:-2] + else: + return s[:-4] + +def dict_merge(dct, merge_dct): + for k, _ in merge_dct.items(): + if (k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], dict)): #noqa + dict_merge(dct[k], merge_dct[k]) + else: + dct[k] = merge_dct[k] + +def youtube_dl(idx): + cmd = f'youtube-dl -f best -f mp4 "{idx}" -o {idx}.mp4' + os.system(cmd) + +def run_command(cmd): + if isinstance(cmd, str): + cmd = cmd.split() + return subprocess.check_output(cmd).decode() + +def load_env(): + logger = logging.getLogger('LOAD_ENV') + try: + import vlmeval + except ImportError: + logger.error('VLMEval is not installed. Failed to import environment variables from .env file. ') + return + pth = osp.realpath(vlmeval.__path__[0]) + pth = osp.join(pth, '../.env') + pth = osp.realpath(pth) + if not osp.exists(pth): + logger.error(f'Did not detect the .env file at {pth}, failed to load. 
') + return + + from dotenv import dotenv_values + values = dotenv_values(pth) + for k, v in values.items(): + if v is not None and len(v): + os.environ[k] = v + logger.info(f'API Keys successfully loaded from {pth}') + +def pip_install_robust(package): + import sys + retry = 3 + while retry > 0: + try: + package_base = package.split('=')[0] + module = __import__(package) + return True + except ImportError: + subprocess.check_call([sys.executable, '-m', 'pip', 'install', package]) + retry -= 1 + return False + + +def version_cmp(v1, v2, op='eq'): + from packaging import version + import operator + op_func = getattr(operator, op) + return op_func(version.parse(v1), version.parse(v2)) diff --git a/eval_mm/vlmevalkit/vlmeval/smp/vlm.py b/eval_mm/vlmevalkit/vlmeval/smp/vlm.py new file mode 100644 index 0000000000000000000000000000000000000000..6577cec1a9292daa3a3dbe2a42b0f7bee892f9cf --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/smp/vlm.py @@ -0,0 +1,134 @@ +import os +import io +import pandas as pd +import numpy as np +import string +from uuid import uuid4 +import os.path as osp +import base64 +from PIL import Image +from .file import load, dump +Image.MAX_IMAGE_PIXELS = 1e9 + + +def mmqa_display(question, target_size=512): + question = {k.lower(): v for k, v in question.items()} + keys = list(question.keys()) + keys = [k for k in keys if k not in ['index', 'image']] + + images = question['image'] + if isinstance(images, str): + images = [images] + + idx = question.pop('index', 'XXX') + print(f'INDEX: {idx}') + + for im in images: + image = decode_base64_to_image(im, target_size=target_size) + display(image) # noqa: F821 + + for k in keys: + try: + if not pd.isna(question[k]): + print(f'{k.upper()}. {question[k]}') + except ValueError: + if False in pd.isna(question[k]): + print(f'{k.upper()}. {question[k]}') + + +def encode_image_to_base64(img, target_size=-1): + # if target_size == -1, will not do resizing + # else, will set the max_size ot (target_size, target_size) + if img.mode in ('RGBA', 'P'): + img = img.convert('RGB') + tmp = osp.join('/tmp', str(uuid4()) + '.jpg') + if target_size > 0: + img.thumbnail((target_size, target_size)) + img.save(tmp) + with open(tmp, 'rb') as image_file: + image_data = image_file.read() + ret = base64.b64encode(image_data).decode('utf-8') + os.remove(tmp) + return ret + + +def encode_image_file_to_base64(image_path, target_size=-1): + image = Image.open(image_path) + return encode_image_to_base64(image, target_size=target_size) + + +def decode_base64_to_image(base64_string, target_size=-1): + image_data = base64.b64decode(base64_string) + image = Image.open(io.BytesIO(image_data)) + if image.mode in ('RGBA', 'P'): + image = image.convert('RGB') + if target_size > 0: + image.thumbnail((target_size, target_size)) + return image + + +def decode_base64_to_image_file(base64_string, image_path, target_size=-1): + image = decode_base64_to_image(base64_string, target_size=target_size) + image.save(image_path) + + +def build_option_str(option_dict): + s = 'There are several options: \n' + for c, content in option_dict.items(): + if not pd.isna(content): + s += f'{c}. 
{content}\n' + return s + + +def isimg(s): + return osp.exists(s) or s.startswith('http') + + +def read_ok(img_path): + if not osp.exists(img_path): + return False + try: + im = Image.open(img_path) + assert im.size[0] > 0 and im.size[1] > 0 + return True + except: + return False + + +def gpt_key_set(): + openai_key = os.environ.get('OPENAI_API_KEY', None) + return isinstance(openai_key, str) and openai_key.startswith('sk-') + + +def apiok(wrapper): + s = wrapper.generate('Hello!') + return wrapper.fail_msg not in s + + +def circular_pred(df, extract_func=None): + if extract_func is None: + extract_func = lambda x: x # noqa: E731 + df = df.sort_values('index') + from vlmeval.utils import can_infer_option + shift = int(1e6) + + choices = [extract_func(x) for x in df['prediction']] + pred_map = {i: c for i, c in zip(df['index'], choices)} + flag_map = {i: True for i in pred_map if i < 1e6} + valid_map = {i: True for i in pred_map if i < 1e6} + for i in df['index']: + if i >= shift and pred_map[i] and pred_map[i - shift]: + if ( + pred_map[i] not in list(string.ascii_uppercase) or # noqa: W504 + pred_map[i - shift] not in list(string.ascii_uppercase) + ): + + valid_map[i % shift] = False + continue + if (ord(pred_map[i]) - ord(pred_map[i - shift])) % 4 == 1: + continue + else: + flag_map[i % shift] = False + flag_map = {k: v for k, v in flag_map.items() if valid_map[k]} + flags = list(flag_map.values()) + return np.mean(flags) diff --git a/eval_mm/vlmevalkit/vlmeval/utils/__init__.py b/eval_mm/vlmevalkit/vlmeval/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..930367891e263a35a6b537cf1c494c5dff299260 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/utils/__init__.py @@ -0,0 +1,12 @@ +from .matching_util import can_infer, can_infer_option, can_infer_text +from .mp_util import track_progress_rich +from .custom_prompt import CustomPrompt +from .dataset_config import dataset_URLs, img_root_map, DATASET_TYPE, abbr2full +from .dataset import TSVDataset, split_MMMU, MMMU_result_transfer + + +__all__ = [ + 'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich', + 'TSVDataset', 'dataset_URLs', 'img_root_map', 'DATASET_TYPE', 'CustomPrompt', + 'split_MMMU', 'abbr2full' +] diff --git a/eval_mm/vlmevalkit/vlmeval/utils/custom_prompt.py b/eval_mm/vlmevalkit/vlmeval/utils/custom_prompt.py new file mode 100644 index 0000000000000000000000000000000000000000..5841fbcec61217d4d2c61369455c3211a8ba8b45 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/utils/custom_prompt.py @@ -0,0 +1,33 @@ +from ..smp import * +from .dataset_config import img_root_map +from abc import abstractmethod + + +class CustomPrompt: + + @abstractmethod + def use_custom_prompt(self, dataset): + raise NotImplementedError + + @abstractmethod + def build_prompt(self, line, dataset): + raise NotImplementedError + + def dump_image(self, line, dataset): + ROOT = LMUDataRoot() + assert isinstance(dataset, str) + img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset) + os.makedirs(img_root, exist_ok=True) + if isinstance(line['image'], list): + tgt_path = [] + assert 'image_path' in line + for img, im_name in zip(line['image'], line['image_path']): + path = osp.join(img_root, im_name) + if not read_ok(path): + decode_base64_to_image_file(img, path) + tgt_path.append(path) + else: + tgt_path = osp.join(img_root, f"{line['index']}.jpg") + if not read_ok(tgt_path): + decode_base64_to_image_file(line['image'], tgt_path) + return tgt_path diff --git 
a/eval_mm/vlmevalkit/vlmeval/utils/dataset.py b/eval_mm/vlmevalkit/vlmeval/utils/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..dbbb440e1353f4ae69d1067e2172544235069af5 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/utils/dataset.py @@ -0,0 +1,190 @@ +import pandas as pd +import hashlib +from ..smp import * +from .dataset_config import dataset_URLs, dataset_md5_dict, DATASET_TYPE +from .custom_prompt import CustomPrompt +from .matching_util import can_infer + + +def isliststr(s): + return (s[0] == '[') and (s[-1] == ']') + + +def check_md5(data_path, dataset): + if dataset not in dataset_md5_dict: + warnings.warn(f'We do not have an md5 record for dataset {dataset}, skip the md5 check. ') + return True + assert osp.exists(data_path) + with open(data_path, 'rb') as f: + hash = hashlib.new('md5') + for chunk in iter(lambda: f.read(2**20), b''): + hash.update(chunk) + if str(hash.hexdigest()) == dataset_md5_dict[dataset]: + return True + else: + warnings.warn('this data file is incomplete, so it needs to be downloaded again.') + return False + + +def split_MMMU(msgs): + text, images = None, [] + for s in msgs: + if s['type'] == 'image': + images.append(s['value']) + elif s['type'] == 'text': + assert text is None + text = s['value'] + text_segs = text.split('<image ') + segs = [dict(type='text', value=text_segs[0])] + for i, seg in enumerate(text_segs): + if i == 0: + continue + assert istype(seg[0], int) and seg[1] == '>' + image_idx = int(seg[0]) - 1 + segs.append(dict(type='image', value=images[image_idx])) + segs.append(dict(type='text', value=seg[2:])) + return segs + + +def MMMU_result_transfer(result_path): + res = {} + result_data = load(result_path) + mcq = result_data['A'].notna() + lt = len(result_data) + for i in range(lt): + line = result_data.iloc[i] + if mcq[i]: + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + prediction = line['prediction'] + infer_prediction = can_infer(prediction, options) + res[line['id']] = infer_prediction + else: + res[line['id']] = line['prediction'] + result_json = result_path.replace('.xlsx', '.json') + dump(res, result_json) + return result_json + + +class TSVDataset(CustomPrompt): + + def __init__(self, dataset='MMBench', skip_noimg=True): + + self.data_root = LMUDataRoot() + assert osp.exists(self.data_root) + + self.dataset = dataset + self.dataset_type = DATASET_TYPE(dataset) + + if dataset in dataset_URLs: + url = dataset_URLs[dataset] + file_name = url.split('/')[-1] + data_path = osp.join(self.data_root, file_name) + + if osp.exists(data_path) and check_md5(data_path, dataset): + pass + elif osp.isfile(url): + # If url is actually a file path, use it directly + data_path = url + else: + warnings.warn('The dataset tsv is not downloaded') + download_file(url, data_path) + else: + data_path = osp.join(self.data_root, dataset + '.tsv') + assert osp.exists(data_path) + + data = load(data_path) + self.skip_noimg = skip_noimg + if skip_noimg and 'image' in data: + data = data[~pd.isna(data['image'])] + + # Prompt for Captioning + if listinstr(['COCO'], dataset): + data['question'] = [( + 'Please describe this image in general. Directly provide the description, ' + 'do not include prefix like "This image depicts". 
' + )] * len(data) + + data['index'] = [str(x) for x in data['index']] + + self.meta_only = True + if 'image' in data: + data['image'] = [str(x) for x in data['image']] + + image_map = {x: y for x, y in zip(data['index'], data['image'])} + for k in image_map: + if len(image_map[k]) <= 64: + idx = image_map[k] + assert idx in image_map and len(image_map[idx]) > 64 + image_map[k] = image_map[idx] + + data['image'] = [ + eval(image_map[k]) if isliststr(image_map[k]) else image_map[k] + for k in data['index'] + ] + self.meta_only = False + + if 'image_path' in data: + data['image_path'] = [ + eval(pths) if isliststr(pths) else pths for pths in data['image_path'] + ] + + if np.all([istype(x, int) for x in data['index']]): + data['index'] = [int(x) for x in data['index']] + + self.data = data + + def __len__(self): + return len(self.data) + + def build_prompt(self, line, dataset=None): + if dataset is None: + dataset = self.dataset + + if isinstance(line, int): + line = self.data.iloc[line] + + if self.meta_only: + tgt_path = line['image_path'] + else: + tgt_path = self.dump_image(line, dataset) + + prompt = line['question'] + if DATASET_TYPE(dataset) == 'multi-choice': + question = line['question'] + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = 'Options:\n' + for key, item in options.items(): + options_prompt += f'{key}. {item}\n' + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + prompt = '' + if hint is not None: + prompt += f'Hint: {hint}\n' + prompt += f'Question: {question}\n' + if len(options): + prompt += options_prompt + prompt += 'Please select the correct answer from the options above. \n' + elif DATASET_TYPE(dataset) == 'VQA': + if listinstr(['ocrvqa', 'textvqa', 'chartqa', 'docvqa'], dataset.lower()): + prompt += '\nPlease try to answer the question with short words or phrases if possible\n.' 
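+        # The prompt text is final at this point; it is now packed together with the
+        # image path(s) into the interleaved message format used across the toolkit,
+        # roughly (illustrative):
+        #   [{'type': 'image', 'value': '/path/to/img.jpg'},
+        #    {'type': 'text',  'value': 'Question: ...\nOptions:\n...'}]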
+ + msgs = [] + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + msgs.append(dict(type='text', value=prompt)) + + return msgs + + def display(self, line): + if isinstance(line, int): + line = self.data.iloc[line] + mmqa_display(line) diff --git a/eval_mm/vlmevalkit/vlmeval/utils/dataset_config.py b/eval_mm/vlmevalkit/vlmeval/utils/dataset_config.py new file mode 100644 index 0000000000000000000000000000000000000000..933665317a6ff3b386822bac80791b3325dacf25 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/utils/dataset_config.py @@ -0,0 +1,158 @@ +from ..smp import listinstr + +dataset_URLs = { + # MMBench v1.0 + 'MMBench_DEV_EN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_EN.tsv', + 'MMBench_TEST_EN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_EN.tsv', + 'MMBench_DEV_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_CN.tsv', + 'MMBench_TEST_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_CN.tsv', + 'MMBench': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench.tsv', # Internal Only + 'MMBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN.tsv', # Internal Only + # MMBench v1.1 + 'MMBench_DEV_EN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_EN_V11.tsv', + 'MMBench_TEST_EN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_EN_V11.tsv', + 'MMBench_DEV_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_CN_V11.tsv', + 'MMBench_TEST_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_CN_V11.tsv', + 'MMBench_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_V11.tsv', # Internal Only + 'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN_V11.tsv', # Internal Only + # CCBench + 'CCBench': 'https://opencompass.openxlab.space/utils/VLMEval/CCBench.tsv', + 'MME': 'https://opencompass.openxlab.space/utils/VLMEval/MME.tsv', + 'SEEDBench_IMG': 'https://opencompass.openxlab.space/utils/VLMEval/SEEDBench_IMG.tsv', + 'CORE_MM': 'https://opencompass.openxlab.space/utils/VLMEval/CORE_MM.tsv', + 'MMVet': 'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv', + 'COCO_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/COCO_VAL.tsv', + 'OCRVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TEST.tsv', + 'OCRVQA_TESTCORE': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TESTCORE.tsv', + 'TextVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/TextVQA_VAL.tsv', + 'MMMU_DEV_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv', + 'MMMU_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_TEST.tsv', + 'MathVista_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVista_MINI.tsv', + 'ScienceQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/ScienceQA_VAL.tsv', + 'ScienceQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ScienceQA_TEST.tsv', + 'HallusionBench': 'https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv', + 'DocVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv', + 'DocVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_TEST.tsv', + 'InfoVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_VAL.tsv', + 'InfoVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_TEST.tsv', + 'AI2D_TEST': 
'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv', + 'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv', + 'OCRBench': 'https://opencompass.openxlab.space/utils/VLMEval/OCRBench.tsv', + 'ChartQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ChartQA_TEST.tsv', + 'MMStar': 'https://opencompass.openxlab.space/utils/VLMEval/MMStar.tsv', + 'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv', + 'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv', +} + +dataset_md5_dict = { + # MMBench v1.0 + 'MMBench_DEV_EN': 'b6caf1133a01c6bb705cf753bb527ed8', + 'MMBench_TEST_EN': '6939fadb0ce626fefc0bdc9c64efc528', + 'MMBench_DEV_CN': '08b8fc3324a5ed74155350f57be69fbd', + 'MMBench_TEST_CN': '7e1239baf0ee4c8b513e19705a0f317e', + 'MMBench': '4115aea3383f3dd0083be6a633e0f820', # Internal Only + 'MMBench_CN': '2e053ffc90ea598b1feae13c36dc13ee', # Internal Only + # MMBench v1.1 + 'MMBench_DEV_EN_V11': '30c05be8f2f347a50be25aa067248184', + 'MMBench_TEST_EN_V11': '26f0f15381a21720255091d3e0316ce6', + 'MMBench_DEV_CN_V11': '593f9b5f6bea453d870a798b34ae4f37', + 'MMBench_TEST_CN_V11': '74bbe4556dac745613c7cbe5ad787050', + 'MMBench_V11': 'b9276414f57af1308dcc4d0cd9b42e7c', # Internal Only + 'MMBench_CN_V11': '95f6980dd1b4de38e3cbffe0305a3f25', # Internal Only + # CCBench + 'CCBench': '1de88b4257e7eee3f60b18d45eda6f07', + 'MME': 'b36b43c3f09801f5d368627fb92187c3', + 'SEEDBench_IMG': '68017231464752261a2526d6ca3a10c0', + 'CORE_MM': '8a8da2f2232e79caf98415bfdf0a202d', + 'MMVet': '748aa6d4aa9d4de798306a63718455e3', + 'COCO_VAL': '72a5079dead060269ac222c5aa5128af', + 'OCRVQA_TEST': 'ca46a6d74b403e9d6c0b670f6fc00db9', + 'OCRVQA_TESTCORE': 'c5239fe77db8bdc1f2ad8e55e0d1fe97', + 'TextVQA_VAL': 'b233b31f551bbf4056f2f955da3a92cd', + 'MMMU_DEV_VAL': '521afc0f3bf341e6654327792781644d', + 'MMMU_TEST': 'c19875d11a2d348d07e5eb4bdf33166d', + 'MathVista_MINI': 'f199b98e178e5a2a20e7048f5dcb0464', + 'ScienceQA_VAL': '96320d05e142e585e7204e72affd29f3', + 'ScienceQA_TEST': 'e42e9e00f9c59a80d8a5db35bc32b71f', + 'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c', + 'DocVQA_VAL': 'd5ee77e1926ff10690d469c56b73eabf', + 'DocVQA_TEST': '6a2f28cac26ef2d3447374e8c6f6c8e9', + 'InfoVQA_VAL': '2342e9c225222f0ef4dec545ebb126fe', + 'InfoVQA_TEST': 'df535bf51b88dc9718252c34131a6227', + 'AI2D_TEST': '0f593e0d1c7df9a3d69bf1f947e71975', + 'LLaVABench': 'd382a093f749a697820d3dadd61c8428', + 'OCRBench': 'e953d98a987cc6e26ef717b61260b778', + 'ChartQA_TEST': 'c902e0aa9be5582a7aad6dcf52734b42', + 'MMStar': 'e1ecd2140806c1b1bbf54b43372efb9e', + 'RealWorldQA': '92321028d2bc29040284b6674721e48f', + 'POPE': 'c12f5acb142f2ef1f85a26ba2fbe41d5', +} + +img_root_map = {k: k for k in dataset_URLs} +img_root_map.update({ + # MMBench v1.0 + 'MMBench_DEV_EN': 'MMBench', + 'MMBench_TEST_EN': 'MMBench', + 'MMBench_DEV_CN': 'MMBench', + 'MMBench_TEST_CN': 'MMBench', + 'MMBench': 'MMBench', # Internal Only + 'MMBench_CN': 'MMBench', # Internal Only + # MMBench v1.1 + 'MMBench_DEV_EN_V11': 'MMBench_V11', + 'MMBench_TEST_EN_V11': 'MMBench_V11', + 'MMBench_DEV_CN_V11': 'MMBench_V11', + 'MMBench_TEST_CN_V11': 'MMBench_V11', + 'MMBench_V11': 'MMBench_V11', # Internal Only + 'MMBench_CN_V11': 'MMBench_V11', # Internal Only + 'COCO_VAL': 'COCO', + 'OCRVQA_TEST': 'OCRVQA', + 'OCRVQA_TESTCORE': 'OCRVQA', + 'TextVQA_VAL': 'TextVQA', + 'MMMU_DEV_VAL': 'MMMU', + 'MMMU_TEST': 'MMMU', + 'MathVista_MINI': 'MathVista', + 'HallusionBench': 'Hallusion', + 'DocVQA_VAL': 'DocVQA', + 'DocVQA_TEST': 
'DocVQA_TEST', + 'OCRBench': 'OCRBench', + 'ChartQA_TEST': 'ChartQA_TEST', + 'InfoVQA_VAL': 'InfoVQA_VAL', + 'InfoVQA_TEST': 'InfoVQA_TEST', + 'MMStar': 'MMStar', + 'RealWorldQA': 'RealWorldQA', + 'POPE': 'POPE', +}) + +assert set(dataset_URLs) == set(img_root_map) + + +def DATASET_TYPE(dataset): + # Dealing with Custom Dataset + dataset = dataset.lower() + if listinstr(['mmbench', 'seedbench', 'ccbench', 'mmmu', 'scienceqa', 'ai2d', 'mmstar', 'realworldqa'], dataset): + return 'multi-choice' + elif listinstr(['mme', 'hallusion', 'pope'], dataset): + return 'Y/N' + elif 'coco' in dataset: + return 'Caption' + elif listinstr(['ocrvqa', 'textvqa', 'chartqa', 'mathvista', 'docvqa', 'infovqa', 'llavabench', + 'mmvet', 'ocrbench'], dataset): + return 'VQA' + else: + if dataset not in dataset_URLs: + import warnings + warnings.warn(f"Dataset {dataset} not found in dataset_URLs, will use 'multi-choice' as the default TYPE.") + return 'multi-choice' + else: + return 'QA' + + +def abbr2full(s): + datasets = [x for x in img_root_map] + ins = [s in d for d in datasets] + if sum(ins) == 1: + for d in datasets: + if s in d: + return d + else: + return s diff --git a/eval_mm/vlmevalkit/vlmeval/utils/matching_util.py b/eval_mm/vlmevalkit/vlmeval/utils/matching_util.py new file mode 100644 index 0000000000000000000000000000000000000000..ccdf1b7d563125a4763724b81cf0164b06b94185 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/utils/matching_util.py @@ -0,0 +1,69 @@ +import string +import copy as cp +import os +from ..smp import * + + +def can_infer_option(answer, choices): + verbose = os.environ.get('VERBOSE', 0) + # Choices is a dictionary + if 'Failed to obtain answer via API' in answer: + return False + + reject_to_answer = [ + "Sorry, I can't help with images of people yet.", + "I can't process this file.", + "I'm sorry, but without the image provided", + 'Cannot determine the answer' + ] + for err in reject_to_answer: + if err in answer: + return 'Z' + + def count_choice(splits, choices, prefix='', suffix=''): + cnt = 0 + for c in choices: + if prefix + c + suffix in splits: + cnt += 1 + return cnt + + answer_mod = cp.copy(answer) + chars = '.()[],:;!*#{}' + for c in chars: + answer_mod = answer_mod.replace(c, ' ') + + splits = [x.strip() for x in answer_mod.split()] + count = count_choice(splits, choices) + + if count == 1: + for ch in choices: + if 'A' in splits and len(splits) > 3 and verbose: + logger = get_logger('Evaluation') + logger.info(f'A might be a quantifier in the string: {answer}.') + return False + if ch in splits: + return ch + elif count == 0 and count_choice(splits, {'Z', ''}) == 1: + return 'Z' + return False + + +def can_infer_text(answer, choices): + answer = answer.lower() + assert isinstance(choices, dict) + for k in choices: + assert k in string.ascii_uppercase + choices[k] = str(choices[k]).lower() + cands = [] + for k in choices: + if choices[k] in answer: + cands.append(k) + if len(cands) == 1: + return cands[0] + return False + + +def can_infer(answer, choices): + answer = str(answer) + copt = can_infer_option(answer, choices) + return copt if copt else can_infer_text(answer, choices) diff --git a/eval_mm/vlmevalkit/vlmeval/utils/mp_util.py b/eval_mm/vlmevalkit/vlmeval/utils/mp_util.py new file mode 100644 index 0000000000000000000000000000000000000000..2322e2d0e404cb632f4954e6a08ac386c6d38a11 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/utils/mp_util.py @@ -0,0 +1,191 @@ +from multiprocessing import Pool +import os +from typing import Callable, Iterable, Sized + +from 
rich.progress import (BarColumn, MofNCompleteColumn, Progress, Task, + TaskProgressColumn, TextColumn, TimeRemainingColumn) +from rich.text import Text +import os.path as osp +import portalocker +from ..smp import load, dump + + +class _Worker: + """Function wrapper for ``track_progress_rich``""" + + def __init__(self, func) -> None: + self.func = func + + def __call__(self, inputs): + inputs, idx = inputs + if not isinstance(inputs, (tuple, list, dict)): + inputs = (inputs, ) + + if isinstance(inputs, dict): + return self.func(**inputs), idx + else: + return self.func(*inputs), idx + + +class _SkipFirstTimeRemainingColumn(TimeRemainingColumn): + """Skip calculating remaining time for the first few times. + + Args: + skip_times (int): The number of times to skip. Defaults to 0. + """ + + def __init__(self, *args, skip_times=0, **kwargs): + super().__init__(*args, **kwargs) + self.skip_times = skip_times + + def render(self, task: Task) -> Text: + """Show time remaining.""" + if task.completed <= self.skip_times: + return Text('-:--:--', style='progress.remaining') + return super().render(task) + + +def _tasks_with_index(tasks): + """Add index to tasks.""" + for idx, task in enumerate(tasks): + yield task, idx + + +def track_progress_rich(func: Callable, + tasks: Iterable = tuple(), + task_num: int = None, + nproc: int = 1, + chunksize: int = 1, + description: str = 'Processing', + save=None, keys=None, + color: str = 'blue') -> list: + """Track the progress of parallel task execution with a progress bar. The + built-in :mod:`multiprocessing` module is used for process pools and tasks + are done with :func:`Pool.map` or :func:`Pool.imap_unordered`. + + Args: + func (callable): The function to be applied to each task. + tasks (Iterable or Sized): A tuple of tasks. There are several cases + for different format tasks: + - When ``func`` accepts no arguments: tasks should be an empty + tuple, and ``task_num`` must be specified. + - When ``func`` accepts only one argument: tasks should be a tuple + containing the argument. + - When ``func`` accepts multiple arguments: tasks should be a + tuple, with each element representing a set of arguments. + If an element is a ``dict``, it will be parsed as a set of + keyword-only arguments. + Defaults to an empty tuple. + task_num (int, optional): If ``tasks`` is an iterator which does not + have length, the number of tasks can be provided by ``task_num``. + Defaults to None. + nproc (int): Process (worker) number, if nuproc is 1, + use single process. Defaults to 1. + chunksize (int): Refer to :class:`multiprocessing.Pool` for details. + Defaults to 1. + description (str): The description of progress bar. + Defaults to "Process". + color (str): The color of progress bar. Defaults to "blue". + + Examples: + >>> import time + + >>> def func(x): + ... time.sleep(1) + ... return x**2 + >>> track_progress_rich(func, range(10), nproc=2) + + Returns: + list: The task results. 
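+
+    Note:
+        ``save`` and ``keys`` (not listed above) enable incremental result
+        caching: when ``save`` is given, each finished result is written to that
+        file (format chosen by its suffix, e.g. ``.pkl``) under the matching
+        entry of ``keys``, which must have the same length as ``tasks``; the
+        file is guarded with a ``portalocker`` lock while being updated.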
+ """ + if save is not None: + assert osp.exists(osp.dirname(save)) or osp.dirname(save) == '' + if not osp.exists(save): + dump({}, save) + if keys is not None: + assert len(keys) == len(tasks) + + if not callable(func): + raise TypeError('func must be a callable object') + if not isinstance(tasks, Iterable): + raise TypeError( + f'tasks must be an iterable object, but got {type(tasks)}') + if isinstance(tasks, Sized): + if len(tasks) == 0: + if task_num is None: + raise ValueError('If tasks is an empty iterable, ' + 'task_num must be set') + else: + tasks = tuple(tuple() for _ in range(task_num)) + else: + if task_num is not None and task_num != len(tasks): + raise ValueError('task_num does not match the length of tasks') + task_num = len(tasks) + + if nproc <= 0: + raise ValueError('nproc must be a positive number') + + skip_times = nproc * chunksize if nproc > 1 else 0 + prog_bar = Progress( + TextColumn('{task.description}'), + BarColumn(), + _SkipFirstTimeRemainingColumn(skip_times=skip_times), + MofNCompleteColumn(), + TaskProgressColumn(show_speed=True), + ) + + worker = _Worker(func) + task_id = prog_bar.add_task( + total=task_num, color=color, description=description) + tasks = _tasks_with_index(tasks) + + # Use single process when nproc is 1, else use multiprocess. + with prog_bar: + if nproc == 1: + results = [] + for task in tasks: + result, idx = worker(task) + results.append(worker(task)[0]) + if save is not None: + with portalocker.Lock(save, timeout=5) as fh: + ans = load(save) + ans[keys[idx]] = result + + if os.environ.get('VERBOSE', True): + print(keys[idx], result, flush=True) + + dump(ans, save) + fh.flush() + os.fsync(fh.fileno()) + + prog_bar.update(task_id, advance=1, refresh=True) + else: + with Pool(nproc) as pool: + results = [] + unordered_results = [] + gen = pool.imap_unordered(worker, tasks, chunksize) + try: + for result in gen: + result, idx = result + unordered_results.append((result, idx)) + + if save is not None: + with portalocker.Lock(save, timeout=5) as fh: + ans = load(save) + ans[keys[idx]] = result + + if os.environ.get('VERBOSE', False): + print(keys[idx], result, flush=True) + + dump(ans, save) + fh.flush() + os.fsync(fh.fileno()) + + results.append(None) + prog_bar.update(task_id, advance=1, refresh=True) + except Exception as e: + prog_bar.stop() + raise e + for result, idx in unordered_results: + results[idx] = result + return results diff --git a/eval_mm/vlmevalkit/vlmeval/vlm/__init__.py b/eval_mm/vlmevalkit/vlmeval/vlm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..51b2b2b6ac2106839421526daf74a15970ab38c2 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/vlm/__init__.py @@ -0,0 +1,7 @@ +import torch + +torch.set_grad_enabled(False) +torch.manual_seed(1234) +from .base import BaseModel +from .minicpm_llama3_v_2_5 import MiniCPM_Llama3_V +from .minicpm_v import MiniCPM_V \ No newline at end of file diff --git a/eval_mm/vlmevalkit/vlmeval/vlm/base.py b/eval_mm/vlmevalkit/vlmeval/vlm/base.py new file mode 100644 index 0000000000000000000000000000000000000000..70a5b7bfd10a9f1a2797b8e9b194297c6cf0da8a --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/vlm/base.py @@ -0,0 +1,150 @@ +from ..smp import * +from ..utils.dataset_config import img_root_map +from abc import abstractmethod + + +class BaseModel: + + INTERLEAVE = False + allowed_types = ['text', 'image'] + + def use_custom_prompt(self, dataset): + """Whether to use custom prompt for the given dataset. + + Args: + dataset (str): The name of the dataset. 
+ + Returns: + bool: Whether to use custom prompt. If True, will call `build_prompt` of the VLM to build the prompt. + Default to False. + """ + return False + + @abstractmethod + def build_prompt(self, line, dataset): + """Build custom prompts for a specific dataset. Called only if `use_custom_prompt` returns True. + + Args: + line (line of pd.DataFrame): The raw input line. + dataset (str): The name of the dataset. + + Returns: + str: The built message. + """ + raise NotImplementedError + + def dump_image(self, line, dataset): + """Dump the image(s) of the input line to the corresponding dataset folder. + + Args: + line (line of pd.DataFrame): The raw input line. + dataset (str): The name of the dataset. + + Returns: + str | list[str]: The paths of the dumped images. + """ + ROOT = LMUDataRoot() + assert isinstance(dataset, str) + img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset) + os.makedirs(img_root, exist_ok=True) + if isinstance(line['image'], list): + tgt_path = [] + assert 'image_path' in line + for img, im_name in zip(line['image'], line['image_path']): + path = osp.join(img_root, im_name) + if not read_ok(path): + decode_base64_to_image_file(img, path) + tgt_path.append(path) + else: + tgt_path = osp.join(img_root, f"{line['index']}.jpg") + if not read_ok(tgt_path): + decode_base64_to_image_file(line['image'], tgt_path) + tgt_path = [tgt_path] + return tgt_path + + @abstractmethod + def generate_inner(self, message, dataset=None): + raise NotImplementedError + + def check_content(self, msgs): + """Check the content type of the input. Four types are allowed: str, dict, liststr, listdict. + """ + if isinstance(msgs, str): + return 'str' + if isinstance(msgs, dict): + return 'dict' + if isinstance(msgs, list): + types = [self.check_content(m) for m in msgs] + if all(t == 'str' for t in types): + return 'liststr' + if all(t == 'dict' for t in types): + return 'listdict' + return 'unknown' + + def preproc_content(self, inputs): + """Convert the raw input messages to a list of dicts. + + Args: + inputs: raw input messages. + + Returns: + list(dict): The preprocessed input messages. Will return None if failed to preprocess the input. + """ + if self.check_content(inputs) == 'str': + return [dict(type='text', value=inputs)] + elif self.check_content(inputs) == 'dict': + assert 'type' in inputs and 'value' in inputs + return [inputs] + elif self.check_content(inputs) == 'liststr': + res = [] + for s in inputs: + mime, pth = parse_file(s) + if mime is None or mime == 'unknown': + res.append(dict(type='text', value=s)) + else: + res.append(dict(type=mime.split('/')[0], value=pth)) + return res + elif self.check_content(inputs) == 'listdict': + for item in inputs: + assert 'type' in item and 'value' in item + mime, s = parse_file(item['value']) + if mime is None: + assert item['type'] == 'text' + else: + assert mime.split('/')[0] == item['type'] + item['value'] = s + return inputs + else: + return None + + def generate(self, message, dataset=None): + """Generate the output message. + + Args: + message (list[dict]): The input message. + dataset (str, optional): The name of the dataset. Defaults to None. + + Returns: + str: The generated message. 
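+
+        Example (illustrative; the image path is a placeholder):
+            >>> model.generate([
+            ...     dict(type='image', value='/path/to/image.jpg'),
+            ...     dict(type='text', value='Describe the image.'),
+            ... ])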
+ """ + assert self.check_content(message) in ['str', 'dict', 'liststr', 'listdict'], f'Invalid input type: {message}' + message = self.preproc_content(message) + assert message is not None and self.check_content(message) == 'listdict' + for item in message: + assert item['type'] in self.allowed_types, f'Invalid input type: {item["type"]}' + return self.generate_inner(message, dataset) + + def message_to_promptimg(self, message): + assert not self.INTERLEAVE + model_name = self.__class__.__name__ + warnings.warn( + f'Model {model_name} does not support interleaved input. ' + 'Will use the first image and aggregated texts as prompt. ') + num_images = len([x for x in message if x['type'] == 'image']) + if num_images == 0: + prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text']) + image = None + else: + prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text']) + image = [x['value'] for x in message if x['type'] == 'image'][0] + return prompt, image diff --git a/eval_mm/vlmevalkit/vlmeval/vlm/minicpm_llama3_v_2_5.py b/eval_mm/vlmevalkit/vlmeval/vlm/minicpm_llama3_v_2_5.py new file mode 100644 index 0000000000000000000000000000000000000000..a0d06adaeec55c76794b000f35167e146a0ed8de --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/vlm/minicpm_llama3_v_2_5.py @@ -0,0 +1,155 @@ +import torch +from PIL import Image +from transformers import AutoModel, AutoTokenizer + +from ..smp import * +from ..utils import DATASET_TYPE +from .base import BaseModel + + +class MiniCPM_Llama3_V(BaseModel): + INSTALL_REQ = False + INTERLEAVE = True + + def __init__(self, model_path='openbmb/MiniCPM-V', **kwargs): + assert model_path is not None + self.model_path = model_path + self.ckpt = model_path + + print(f'load from {self.model_path}') + self.model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True) + if '.pt' in model_path: + print(f'load from {model_path}') + self.state_dict = torch.load(self.ckpt, map_location='cpu') + self.model.load_state_dict(self.state_dict, strict=False) + self.model = self.model.to(dtype=torch.float16) + self.model.eval().cuda() + self.kwargs = kwargs + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + torch.cuda.empty_cache() + self.num_beams = 1 if self.model_path == 'openbmb/MiniCPM-V' else 3 + self.options_system_prompt = ('Carefully read the following question and select the letter corresponding ' + 'to the correct answer. Highlight the applicable choices without giving ' + 'explanations.') + self.wo_options_system_prompt = 'Carefully read the following question Answer the question directly.' + self.detail_system_prompt = 'Answer this question in detail.' + self.vqa_prompt = 'Answer the question using a single word or phrase.' + + + def use_custom_prompt(self, dataset): + if listinstr(['multi-choice', 'VQA'], DATASET_TYPE(dataset)): + return True + elif dataset is not None and listinstr(['HallusionBench'], dataset): + return True + return False + + def build_prompt(self, line, dataset=None): + if dataset is None: + dataset = self.dataset + + if isinstance(line, int): + line = self.data.iloc[line] + + tgt_path = self.dump_image(line, dataset) + system_prompt = '' + + question = line['question'] + if DATASET_TYPE(dataset) == 'multi-choice': + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = 'Options:\n' + for key, item in options.items(): + options_prompt += f'{key}. 
{item}\n' + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + prompt = '' + if hint is not None: + prompt += f'Hint: {hint}\n' + prompt += f'Question: {question}\n' + if len(options): + prompt += options_prompt + system_prompt = self.options_system_prompt + "\nPlease just indicate your choice." + else: + system_prompt = self.wo_options_system_prompt + + if 'MMMU' in dataset: # Corner Case + prompt = system_prompt + '\n' + prompt + system_prompt = '' + elif dataset is not None and listinstr(['HallusionBench'], dataset): + question = line['question'] + " Yes or No?" + prompt = question + + elif dataset is not None and listinstr(['OCRBench'], dataset): + system_prompt = self.vqa_prompt + question = line['question'] + prompt = question + elif DATASET_TYPE(dataset) == 'VQA': + if listinstr(['LLaVABench'], dataset): + system_prompt = "" + prompt = question + elif listinstr(['MMVet'], dataset): + system_prompt = self.detail_system_prompt + prompt = question + else: + system_prompt = self.vqa_prompt + prompt = question + + msgs = [] + if system_prompt: + msgs.append(dict(type='text', value=system_prompt)) + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + msgs.append(dict(type='text', value=prompt)) + + return msgs + + def generate_inner(self, message, dataset=None): + if DATASET_TYPE(dataset) == 'multi-choice': + max_new_tokens = 200 + elif DATASET_TYPE(dataset) == 'Y/N': + max_new_tokens = 3 + else: + max_new_tokens = 1024 + + ''' + nums_beams = 3 + ''' + default_kwargs = dict( + max_new_tokens=max_new_tokens, + sampling=False, + num_beams=self.num_beams, + ) + default_kwargs.update(self.kwargs) + + content = [] + + # message = [ + # {'type': 'text', 'value': 'sys prompt'}, + # {'type': 'image', 'value': '/path/to/image1.jpg'}, + # {'type': 'text', 'value': 'Here is an image:'}, + # ] + + for x in message: + if x['type'] == 'text': + content.append(x['value']) + elif x['type'] == 'image': + image = Image.open(x['value']).convert('RGB') + content.append(image) + msgs = [{'role': 'user', 'content': content}] + + res = self.model.chat( + image = None, + msgs=msgs, + context=None, + tokenizer=self.tokenizer, + **default_kwargs + ) + + if isinstance(res, tuple) and len(res) > 0: + res = res[0] + # print(f"content: {content}, res: {res}") + return res diff --git a/eval_mm/vlmevalkit/vlmeval/vlm/minicpm_v.py b/eval_mm/vlmevalkit/vlmeval/vlm/minicpm_v.py new file mode 100644 index 0000000000000000000000000000000000000000..7605482a45de6707598e066984a5c0cb4833287b --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/vlm/minicpm_v.py @@ -0,0 +1,85 @@ +import torch +from PIL import Image +from transformers import AutoModel, AutoTokenizer + +from .base import BaseModel +from ..smp import * +from ..utils import DATASET_TYPE + + +class MiniCPM_V(BaseModel): + + INSTALL_REQ = False + INTERLEAVE = False + + def __init__(self, model_path='openbmb/MiniCPM-V', **kwargs): + assert model_path is not None + self.model_path = model_path + print(f'load from {self.model_path}') + self.model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True) + self.model = self.model.to(dtype=torch.bfloat16) + self.model.eval().cuda() + self.kwargs = kwargs + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + torch.cuda.empty_cache() + self.num_beams = 1 if self.model_path == 'openbmb/MiniCPM-V' else 3 + + def use_custom_prompt(self, dataset): + assert 
dataset is not None + if listinstr(['MMMU'], dataset): + return True + return False + + def build_prompt(self, line, dataset=None): + assert dataset is None or isinstance(dataset, str) + assert self.use_custom_prompt(dataset) + tgt_path = self.dump_image(line, dataset) + + question = line['question'] + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = 'Options:\n' + for key, item in options.items(): + options_prompt += f'{key}. {item}\n' + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + prompt = '' + if hint is not None: + prompt += f'Hint: {hint}\n' + prompt += f'{question}\n' + if len(options): + prompt += options_prompt + prompt = 'Study the image carefully and pick the option associated with the correct answer. \ + Focus solely on selecting the option and avoid including any other content.\n' + prompt + message = [dict(type='text', value=prompt)] + message.extend([dict(type='image', value=p) for p in tgt_path]) + + return message + + def generate_inner(self, message, dataset=None): + prompt, image_path = self.message_to_promptimg(message) + image = Image.open(image_path).convert('RGB') + msgs = [{'role': 'user', 'content': prompt}] + if DATASET_TYPE(dataset) == 'multi-choice': + max_new_tokens = 20 + elif DATASET_TYPE(dataset) == 'Y/N': + max_new_tokens = 100 + else: + max_new_tokens = 1024 + + default_kwargs = dict( + max_new_tokens=max_new_tokens, + sampling=False, + num_beams=self.num_beams + ) + default_kwargs.update(self.kwargs) + res, _, _ = self.model.chat( + image=image, + msgs=msgs, + context=None, + tokenizer=self.tokenizer, + **default_kwargs + ) + return res diff --git a/eval_mm/vqaeval/README.md b/eval_mm/vqaeval/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b6fd3f4f3886751f838c8708e029e6ff717ab4fe --- /dev/null +++ b/eval_mm/vqaeval/README.md @@ -0,0 +1,3 @@ +# vqa-eval + +contains vqa_eval kit from the server. diff --git a/eval_mm/vqaeval/datasets/__init__.py b/eval_mm/vqaeval/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/eval_mm/vqaeval/datasets/vqa_dataset.py b/eval_mm/vqaeval/datasets/vqa_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..83c9e9aef5359ebed23b85448b382119d8488736 --- /dev/null +++ b/eval_mm/vqaeval/datasets/vqa_dataset.py @@ -0,0 +1,116 @@ +import json +import os +import re +from torch.utils.data import Dataset + +def prompt_processor(prompt): + if prompt.startswith('OCR tokens: '): + pattern = r"Question: (.*?) 
Short answer:" + match = re.search(pattern, prompt, re.DOTALL) + question = match.group(1) + elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: + if prompt.startswith('Reference OCR token:'): + question = prompt.split('\n')[1] + else: + question = prompt.split('\n')[0] + elif len(prompt.split('\n')) == 2: + question = prompt.split('\n')[0] + else: + assert False + + return question.lower() + +class textVQADataset(Dataset): + def __init__( + self, + image_dir="./downloads/TextVQA/train_images", + ann_path="./downloads/TextVQA/TextVQA_0.5.1_val.json", + ): + self.data = json.load(open(ann_path, "r"))["data"] + self.image_dir = image_dir + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + question = self.data[idx]['question'] + answers = self.data[idx]['answers'] + img_id = self.data[idx]['image_id'] + qid = self.data[idx]['question_id'] + img_path = os.path.join(self.image_dir, f"{img_id}.jpg") + + item = { + "question_id": qid, + "image_path": img_path, + "question": question, + "gt_answers": answers + } + + return item + +class docVQADataset(Dataset): + def __init__( + self, + image_dir= "./downloads/DocVQA/spdocvqa_images", + ann_path= "./downloads/DocVQA/val_v1.0_withQT.json", + ocr_token_path=None + ): + + self.data = json.load(open(ann_path, "r"))["data"] + self.image_dir = image_dir + self.ann_path = ann_path + if ocr_token_path: + self.ocr_token_data = {item['image_id']: item for item in json.load(open(ocr_token_path, "r"))["data"]} + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + question_id = self.data[idx]['questionId'] + relative_img_path = self.data[idx]['image'] + corrected_relative_img_path = relative_img_path.replace("documents", "images") + img_path = os.path.join(self.image_dir, corrected_relative_img_path) + question = self.data[idx]['question'] + answers = self.data[idx]['answers'] + + question_type = self.data[idx]['question_types'] + + return { + "question_id": question_id, + "image_path": img_path, + "question": question, + "gt_answers": answers, + 'question_type': question_type, + } + + +class docVQATESTDataset(Dataset): + def __init__( + self, + image_dir= "./downloads/DocVQA/spdocvqa_images", + ann_path= "./downloads/DocVQA/test_v1.0.json", + ocr_token_path=None + ): + + self.data = json.load(open(ann_path, "r"))["data"] + self.image_dir = image_dir + self.ann_path = ann_path + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + question_id = self.data[idx]['questionId'] + relative_img_path = self.data[idx]['image'] + corrected_relative_img_path = relative_img_path.replace("documents", "images") + img_path = os.path.join(self.image_dir, corrected_relative_img_path) + question = self.data[idx]['question'] + + + return { + "question_id": question_id, + "image_path": img_path, + "question": question, + "gt_answers": "", + 'question_type': "", + } diff --git a/eval_mm/vqaeval/eval.py b/eval_mm/vqaeval/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..ee2bf68ef6c668bb865c277a935b1cef4753400e --- /dev/null +++ b/eval_mm/vqaeval/eval.py @@ -0,0 +1,97 @@ +import sys +import datetime +import json +import os +import torch + +script_dir = os.path.dirname(os.path.realpath(__file__)) + +sys.path.append(os.path.join(script_dir, '..')) + +from datasets.vqa_dataset import docVQADataset, docVQATESTDataset, textVQADataset + + +print(torch.__version__) + +import numpy as np + +from eval_utils.getargs import parse_args +from eval_utils.vqa_evaluate 
import * + + +def get_model(args): + if args.model_name=='': + raise Exception('Model name cannot be empty str!') + from models.MiniCPM.minicpmv import MiniCPM_V + model_path = args.model_path + ckpt = args.ckpt + model = MiniCPM_V(model_path=model_path, ckpt=ckpt, device=args.device) + + return model + + +def main(args): + np.random.seed(0) + max_sample_num = None + + torch.distributed.init_process_group( + backend='nccl', + world_size=int(os.getenv('WORLD_SIZE', '1')), + rank=int(os.getenv('RANK', '0')), + ) + torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) + print(f'Init Rank-{torch.distributed.get_rank()}') + if torch.distributed.is_initialized(): + args.device = torch.device(f"cuda:{torch.cuda.current_device()}") + + model = get_model(args) + + result = {} + time = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + + if args.eval_textVQA or args.eval_all: + dataset = textVQADataset(args.textVQA_image_dir, args.textVQA_ann_path) + if max_sample_num is not None: + dataset = torch.utils.data.Subset(dataset, range(max_sample_num)) + acc = evaluate_VQA(model, dataset, args.model_name, 'textVQA', time, \ + batch_size=args.batchsize, generate_method=args.generate_method, answer_path=args.answer_path) + result['textVQA'] = acc + + if args.eval_docVQA or args.eval_all: + dataset = docVQADataset(args.docVQA_image_dir, args.docVQA_ann_path) + if max_sample_num is not None: + dataset = torch.utils.data.Subset(dataset, range(max_sample_num)) + acc = evaluate_VQA(model, dataset, args.model_name, 'docVQA', time, batch_size=args.batchsize, generate_method=args.generate_method, answer_path=args.answer_path) + result['docVQA'] = acc + + if args.eval_docVQATest or args.eval_all: + target_dataset = "docVQATest" + dataset = docVQATESTDataset(args.docVQATest_image_dir, args.docVQATest_ann_path) + if max_sample_num is not None: + dataset = torch.utils.data.Subset(dataset, range(max_sample_num)) + acc = evaluate_VQA(model, dataset, args.model_name, target_dataset, time, batch_size=args.batchsize, generate_method=args.generate_method, answer_path=args.answer_path) + result['docVQATest'] = acc + + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + if torch.distributed.is_initialized() and torch.distributed.get_rank() != 0: + return None + + result_path = os.path.join(os.path.join(args.answer_path, args.model_name), 'result.json') + + output_flag = False + for k, v in result.items(): + if v > 0.0: + output_flag = True + break + + if output_flag: + with open(result_path, "w") as f: + f.write(json.dumps(result, indent=4)) + + +if __name__ == "__main__": + args = parse_args() + + main(args) \ No newline at end of file diff --git a/eval_mm/vqaeval/eval_utils/cal_metric.py b/eval_mm/vqaeval/eval_utils/cal_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..e5fb43986c47c40014eb47275dff927b49f21d44 --- /dev/null +++ b/eval_mm/vqaeval/eval_utils/cal_metric.py @@ -0,0 +1,40 @@ +import json +import glob +import re + +def has_word(sentence, word): + pattern = r"\b" + re.escape(word) + r"\b" + match = re.search(pattern, sentence) + if match: + return True + else: + return False +def remove_special_chars(s): + pattern = r"[^a-zA-Z0-9\s]" + s = re.sub(pattern, "", s) + return s + +for model in glob.glob('./answer_save/*'): + print(model, ':') + result_list = sorted(glob.glob(f'{model}/*.json')) + for task_result_path in result_list: + taskname = task_result_path.split('/')[-1] + taskname = taskname.split('.')[0] + if taskname not in ['IIIT5K', 'svt', 'IC13_857', 
'IC15_1811', 'svtp', 'ct80', + 'cocotext', 'ctw', 'totaltext', 'HOST']: + continue + + correct = 0 + num = 0 + with open(task_result_path, 'r') as f: + dict = json.load(f)[:100] + for i in range(len(dict)): + gt_answers = dict[i]['gt_answers'] + answer = dict[i]['answer'] + gt_answers = remove_special_chars(gt_answers).lower() + answer = remove_special_chars(answer).lower() + if has_word(answer, gt_answers): + correct+=1 + num+=1 + print(f'{taskname:10s}:{float(correct)/num*100:.2f}') + print('=' * 32) diff --git a/eval_mm/vqaeval/eval_utils/getargs.py b/eval_mm/vqaeval/eval_utils/getargs.py new file mode 100644 index 0000000000000000000000000000000000000000..6df6a2d0f366de4f411095df432ff9f22c8cbf2d --- /dev/null +++ b/eval_mm/vqaeval/eval_utils/getargs.py @@ -0,0 +1,62 @@ +import argparse + + +def parse_args(): + parser = argparse.ArgumentParser(description="Demo") + + parser.add_argument('--local-rank', type=int, default=0, help='Local rank for distributed training') + + # textVQA + parser.add_argument("--textVQA_image_dir", type=str, default="") + parser.add_argument("--textVQA_ann_path", type=str, default="") + + # docVQA + parser.add_argument("--docVQA_image_dir", type=str, default="") + parser.add_argument("--docVQA_ann_path", type=str, default="") + + # docVQATest + parser.add_argument("--docVQATest_image_dir", type=str, default="") + parser.add_argument("--docVQATest_ann_path", type=str, default="") + + # result path + parser.add_argument("--answer_path", type=str, default="./answers-new") + + # eval + parser.add_argument( + "--eval_textVQA", + action="store_true", + default=False, + help="Whether to evaluate on textVQA." + ) + parser.add_argument( + "--eval_docVQA", + action="store_true", + default=False, + help="Whether to evaluate on docVQA." + ) + parser.add_argument( + "--eval_docVQATest", + action="store_true", + default=False, + help="Whether to evaluate on docVQA." 
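+        # NOTE: the help text above is copied from --eval_docVQA; this flag presumably
+        # targets the DocVQA *test* split, whose predictions are exported for external
+        # submission (see transform_docvqatest_for_submission.py / run_transform.sh).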
+ ) + + parser.add_argument( + "--eval_all", + action="store_true", + default=False, + help="Whether to evaluate all datasets" + ) + + parser.add_argument("--model_name", type=str, default="") + parser.add_argument("--model_path", type=str, default="") + + parser.add_argument("--generate_method", type=str, default="", help="generate with interleave or not.") + + parser.add_argument("--device", type=str, default="cuda:0") + parser.add_argument('--batchsize', type=int, default=1, help='Batch size for processing.') + + parser.add_argument("--ckpt", type=str, default="") + + args = parser.parse_args() + return args \ No newline at end of file diff --git a/eval_mm/vqaeval/eval_utils/vqa_evaluate.py b/eval_mm/vqaeval/eval_utils/vqa_evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..78888949ce171b972fd459d4e3a8fa37ec173692 --- /dev/null +++ b/eval_mm/vqaeval/eval_utils/vqa_evaluate.py @@ -0,0 +1,446 @@ +import itertools +import json +import os +import re +from collections import namedtuple + +import torch +from tqdm import tqdm + + +class InferenceSampler(torch.utils.data.sampler.Sampler): + + def __init__(self, size): + self._size = int(size) + assert size > 0 + self._rank = torch.distributed.get_rank() + self._world_size = torch.distributed.get_world_size() + self._local_indices = self._get_local_indices(size, self._world_size, + self._rank) + + @staticmethod + def _get_local_indices(total_size, world_size, rank): + shard_size = total_size // world_size + left = total_size % world_size + shard_sizes = [shard_size + int(r < left) for r in range(world_size)] + + begin = sum(shard_sizes[:rank]) + end = min(sum(shard_sizes[:rank + 1]), total_size) + return range(begin, end) + + def __iter__(self): + yield from self._local_indices + + def __len__(self): + return len(self._local_indices) + +def collate_fn_vqa(batches): + ''' + ''' + image_paths = [_['image_path'] for _ in batches] + questions = [_['question'] for _ in batches] + gt_answers = [_['gt_answers'] for _ in batches] + ocr_tokens = [_['ocr_tokens'] if 'ocr_tokens' in _ else None for _ in batches] + question_ids = [_['question_id'] if 'question_id' in _ else None for _ in batches] + question_type = [_['question_type'] if 'question_type' in _ else None for _ in batches] + + return image_paths, questions, gt_answers, ocr_tokens, question_ids, question_type + +def has_word(sentence, word): + if word[0].isalnum(): + start_pattern = r"\b" + else: + start_pattern = r"" + + if word[-1].isalnum(): + end_pattern = r"\b" + else: + end_pattern = r"" + + pattern = start_pattern + re.escape(word) + end_pattern + match = re.search(pattern, sentence) + return bool(match) + +def remove_special_chars(s): + pattern = r"[^a-zA-Z0-9\s]" + s = re.sub(pattern, "", s) + return s + +def levenshtein_distance(s1, s2): + if len(s1) > len(s2): + s1, s2 = s2, s1 + + distances = range(len(s1) + 1) + for i2, c2 in enumerate(s2): + distances_ = [i2+1] + for i1, c1 in enumerate(s1): + if c1 == c2: + distances_.append(distances[i1]) + else: + distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1]))) + distances = distances_ + return distances[-1] + +class VQAEval: + def __init__(self): + self.contractions = { + "aint": "ain't", + "arent": "aren't", + "cant": "can't", + "couldve": "could've", + "couldnt": "couldn't", + "couldn'tve": "couldn't've", + "couldnt've": "couldn't've", + "didnt": "didn't", + "doesnt": "doesn't", + "dont": "don't", + "hadnt": "hadn't", + "hadnt've": "hadn't've", + "hadn'tve": "hadn't've", + "hasnt": 
"hasn't", + "havent": "haven't", + "hed": "he'd", + "hed've": "he'd've", + "he'dve": "he'd've", + "hes": "he's", + "howd": "how'd", + "howll": "how'll", + "hows": "how's", + "Id've": "I'd've", + "I'dve": "I'd've", + "Im": "I'm", + "Ive": "I've", + "isnt": "isn't", + "itd": "it'd", + "itd've": "it'd've", + "it'dve": "it'd've", + "itll": "it'll", + "let's": "let's", + "maam": "ma'am", + "mightnt": "mightn't", + "mightnt've": "mightn't've", + "mightn'tve": "mightn't've", + "mightve": "might've", + "mustnt": "mustn't", + "mustve": "must've", + "neednt": "needn't", + "notve": "not've", + "oclock": "o'clock", + "oughtnt": "oughtn't", + "ow's'at": "'ow's'at", + "'ows'at": "'ow's'at", + "'ow'sat": "'ow's'at", + "shant": "shan't", + "shed've": "she'd've", + "she'dve": "she'd've", + "she's": "she's", + "shouldve": "should've", + "shouldnt": "shouldn't", + "shouldnt've": "shouldn't've", + "shouldn'tve": "shouldn't've", + "somebody'd": "somebodyd", + "somebodyd've": "somebody'd've", + "somebody'dve": "somebody'd've", + "somebodyll": "somebody'll", + "somebodys": "somebody's", + "someoned": "someone'd", + "someoned've": "someone'd've", + "someone'dve": "someone'd've", + "someonell": "someone'll", + "someones": "someone's", + "somethingd": "something'd", + "somethingd've": "something'd've", + "something'dve": "something'd've", + "somethingll": "something'll", + "thats": "that's", + "thered": "there'd", + "thered've": "there'd've", + "there'dve": "there'd've", + "therere": "there're", + "theres": "there's", + "theyd": "they'd", + "theyd've": "they'd've", + "they'dve": "they'd've", + "theyll": "they'll", + "theyre": "they're", + "theyve": "they've", + "twas": "'twas", + "wasnt": "wasn't", + "wed've": "we'd've", + "we'dve": "we'd've", + "weve": "we've", + "werent": "weren't", + "whatll": "what'll", + "whatre": "what're", + "whats": "what's", + "whatve": "what've", + "whens": "when's", + "whered": "where'd", + "wheres": "where's", + "whereve": "where've", + "whod": "who'd", + "whod've": "who'd've", + "who'dve": "who'd've", + "wholl": "who'll", + "whos": "who's", + "whove": "who've", + "whyll": "why'll", + "whyre": "why're", + "whys": "why's", + "wont": "won't", + "wouldve": "would've", + "wouldnt": "wouldn't", + "wouldnt've": "wouldn't've", + "wouldn'tve": "wouldn't've", + "yall": "y'all", + "yall'll": "y'all'll", + "y'allll": "y'all'll", + "yall'd've": "y'all'd've", + "y'alld've": "y'all'd've", + "y'all'dve": "y'all'd've", + "youd": "you'd", + "youd've": "you'd've", + "you'dve": "you'd've", + "youll": "you'll", + "youre": "you're", + "youve": "you've", + } + self.manualMap = { + "none": "0", + "zero": "0", + "one": "1", + "two": "2", + "three": "3", + "four": "4", + "five": "5", + "six": "6", + "seven": "7", + "eight": "8", + "nine": "9", + "ten": "10", + } + self.articles = ["a", "an", "the"] + + self.periodStrip = re.compile("(?!<=\d)(\.)(?!\d)") + self.commaStrip = re.compile("(\d)(\,)(\d)") + self.punct = [ + ";", + r"/", + "[", + "]", + '"', + "{", + "}", + "(", + ")", + "=", + "+", + "\\", + "_", + "-", + ">", + "<", + "@", + "`", + ",", + "?", + "!", + ] + def clean_text(self, text): + text = text.replace("\n", " ").replace("\t", " ").strip() + text = self.processPunctuation(text) + text = self.processDigitArticle(text) + return text + + def evaluate_vqa_human(self, answer, gt_answers): + '''TextVQA, VQAv2, OKVQA, vizwiz''' + answer = answer.replace("\n", " ").replace("\t", " ").strip() + answer = self.processPunctuation(answer) + answer = self.processDigitArticle(answer) + gt_answers = 
[self.processPunctuation(ans) for ans in gt_answers] + gt_answers = [self.processDigitArticle(ans) for ans in gt_answers] + + gtAcc = [] + + for idx, gtAnsDatum in enumerate(gt_answers): + otherGTAns = gt_answers[:idx] + gt_answers[idx+1:] + + matchingAns = [item for item in otherGTAns if answer == item] + + acc = min(1, float(len(matchingAns)) / 3) + gtAcc.append(acc) + + avgGTAcc = float(sum(gtAcc)) / len(gtAcc) if gtAcc else 0 + + return avgGTAcc + + def evaluate_anls(self, answer, gt_answers, threshold=0.5): + '''DOcVQA, InfographicsVQA, STVQA''' + answer = ' '.join(answer.strip().lower().split()) + if not isinstance(gt_answers, list): + gt_answers = [gt_answers] + gt_answers = [' '.join(gt_answer.strip().lower().split()) for gt_answer in gt_answers] + + values = [] + for gt_answer in gt_answers: + dist = levenshtein_distance(answer, gt_answer) + length = max(len(answer), len(gt_answer)) + values.append(0.0 if length == 0 else float(dist) / float(length)) + + score = 1 - min(values) + + score = 0 if score < threshold else score + + return score + + def processPunctuation(self, inText): + outText = inText + for p in self.punct: + if (p + " " in inText or " " + p in inText) or ( + re.search(self.commaStrip, inText) != None + ): + outText = outText.replace(p, "") + else: + outText = outText.replace(p, " ") + outText = self.periodStrip.sub("", outText, re.UNICODE) + return outText + + def processDigitArticle(self, inText): + outText = [] + tempText = inText.lower().split() + for word in tempText: + word = self.manualMap.setdefault(word, word) + if word not in self.articles: + outText.append(word) + else: + pass + for wordId, word in enumerate(outText): + if word in self.contractions: + outText[wordId] = self.contractions[word] + outText = " ".join(outText) + return outText + + +def evaluate_dataset(dataset_name, answer_file_path, model_name, method = None): + with open(answer_file_path, 'r', encoding='utf-8') as f: + predictions = json.load(f) + + eval = VQAEval() + total_accuracy = 0 + num = 0 + Entry = namedtuple('Entry', ['text', 'bbox']) + + for item in predictions: + gt_answers = item['gt_answers'] + answer = item['answer'] + if method is not None: + pass + if dataset_name in ["textVQA"]: + if num == 0: + print(f"evaluating vqa...") + accuracy = eval.evaluate_vqa_human(answer, gt_answers) + elif dataset_name in ['docVQA']: + if num == 0: + print(f"evaluating anls...") + accuracy = eval.evaluate_anls(answer, gt_answers) + else: + accuracy = eval.evaluate_has(answer, gt_answers) + item['accuracy'] = accuracy + + total_accuracy += accuracy + num += 1 + + average_accuracy = total_accuracy / num + print(f'{dataset_name}:{average_accuracy}') + + answer_model_method_path = answer_file_path.replace('.json', f'_{model_name}_{method}.json') + with open(answer_model_method_path, "w", encoding='utf-8') as f: + json.dump(predictions, f, indent=4, ensure_ascii=False) + + return average_accuracy + + +def evaluate_VQA( + model, + dataset, + model_name, + dataset_name, + time, + batch_size=1, + generate_method="interleave", + answer_path='./answers', +): + print(f"answer path:{answer_path}") + + sampler = None + if torch.distributed.is_initialized(): + sampler=InferenceSampler(len(dataset)) + + dataloader = torch.utils.data.DataLoader( + dataset=dataset, + batch_size=batch_size, + sampler=sampler, + collate_fn=collate_fn_vqa + ) + + now_rank = torch.distributed.get_rank() + + answer_dir = os.path.join(answer_path, model_name, time) + os.makedirs(answer_dir, exist_ok=True) + + image_list = [] + for 
item in dataset: + image_list.append(item["image_path"]) + + predictions = [] + + for batch in tqdm(dataloader, desc="Running inference"): + image_paths, questions, gt_answers, ocr_tokens_list, question_ids, question_type = batch + + with torch.no_grad(): + if model_name != "minicpm": + if model_name != "codellama": + outputs = model.generate(images=image_paths, questions=questions, datasetname=dataset_name) + else: + outputs = model.generate() + elif model_name == "minicpm": + if generate_method == "old": + outputs = model.generate(images=image_paths, questions=questions, datasetname=dataset_name) + elif generate_method == "interleave": + outputs = model.generate_with_interleaved(images=image_paths, questions=questions, datasetname=dataset_name) + else: + raise Exception(f"Wrong generate paradigm {generate_method}!") + + for i in range(len(outputs)): + answer_dict = { + 'question_id': question_ids[i], + 'question': questions[i], + 'answer': outputs[i], + 'gt_answers': gt_answers[i], + 'image_path': image_paths[i], + 'model_name': model_name, + 'question_type': question_type[i] + } + predictions.append(answer_dict) + + if torch.distributed.is_initialized(): + torch.distributed.barrier() + if torch.distributed.is_initialized(): + world_size = torch.distributed.get_world_size() + merged_predictions = [None for _ in range(world_size)] + torch.distributed.all_gather_object(merged_predictions, predictions) + predictions = [_ for _ in itertools.chain.from_iterable(merged_predictions)] + + if torch.distributed.is_initialized() and torch.distributed.get_rank() != 0: + return None + + answer_file_path = os.path.join(answer_dir, f"{dataset_name}.json") + print(f"answer_file_path:{answer_file_path}") + + with open(answer_file_path, "w", encoding='utf-8') as f: + json.dump(predictions, f, indent=4, ensure_ascii=False) + + if dataset_name in ["docVQATest"]: + return -1.0 + + return evaluate_dataset(answer_file_path=answer_file_path, dataset_name=dataset_name, model_name=model_name) diff --git a/eval_mm/vqaeval/models/MiniCPM/minicpmv.py b/eval_mm/vqaeval/models/MiniCPM/minicpmv.py new file mode 100644 index 0000000000000000000000000000000000000000..f3016f1beda851825252994ed971b02e679dc423 --- /dev/null +++ b/eval_mm/vqaeval/models/MiniCPM/minicpmv.py @@ -0,0 +1,96 @@ + +import torch +from PIL import Image +from transformers import AutoModel, AutoTokenizer + +Image.MAX_IMAGE_PIXELS = 1000000000 + +max_token = { + 'docVQA': 100, + 'textVQA': 100, + "docVQATest": 100 +} + +class MiniCPM_V: + + def __init__(self, model_path, ckpt, device=None)->None: + self.model_path = model_path + self.ckpt = ckpt + self.model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True).eval() + if self.ckpt is not None: + self.ckpt = ckpt + self.state_dict = torch.load(self.ckpt, map_location=torch.device('cpu')) + self.model.load_state_dict(self.state_dict) + + self.model = self.model.to(dtype=torch.float16) + self.model.to(device) + + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + torch.cuda.empty_cache() + + def generate(self, images, questions, datasetname): + image = Image.open(images[0]).convert('RGB') + try: + max_new_tokens = max_token[datasetname] + except: + max_new_tokens = 1024 + if (datasetname == 'docVQA') or (datasetname == "docVQATest") : + prompt = "Answer the question directly with single word." + "\n" + questions[0] + elif (datasetname == 'textVQA') : + prompt = "Answer the question directly with single word." 
+ '\n'+ questions[0] + + msgs = [{'role': 'user', 'content': prompt}] + default_kwargs = dict( + max_new_tokens=max_new_tokens, + sampling=False, + num_beams=3 + ) + res = self.model.chat( + image=image, + msgs=msgs, + context=None, + tokenizer=self.tokenizer, + **default_kwargs + ) + + return [res] + + def generate_with_interleaved(self, images, questions, datasetname): + try: + max_new_tokens = max_token[datasetname] + except: + max_new_tokens = 1024 + + prompt = "Answer the question directly with single word." + + default_kwargs = dict( + max_new_tokens=max_new_tokens, + sampling=False, + num_beams=3 + ) + + content = [] + message = [ + {'type': 'text', 'value': prompt}, + {'type': 'image', 'value': images[0]}, + {'type': 'text', 'value': questions[0]} + ] + for x in message: + if x['type'] == 'text': + content.append(x['value']) + elif x['type'] == 'image': + image = Image.open(x['value']).convert('RGB') + content.append(image) + msgs = [{'role': 'user', 'content': content}] + + res = self.model.chat( + msgs=msgs, + context=None, + tokenizer=self.tokenizer, + **default_kwargs + ) + + if isinstance(res, tuple) and len(res) > 0: + res = res[0] + print(f"Q: {content}, \nA: {res}") + return [res] diff --git a/eval_mm/vqaeval/requirements.txt b/eval_mm/vqaeval/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..5eb942567d8848695989821165de9381d851e34e --- /dev/null +++ b/eval_mm/vqaeval/requirements.txt @@ -0,0 +1,49 @@ +accelerate +aiohttp==3.8.4 +aiosignal==1.3.1 +async-timeout==4.0.2 +attrs==22.2.0 +bitsandbytes==0.37.0 +cchardet==2.1.7 +chardet==5.1.0 +contourpy==1.0.7 +cycler==0.11.0 +filelock==3.9.0 +fonttools==4.38.0 +frozenlist==1.3.3 +huggingface-hub==0.13.4 +importlib-resources==5.12.0 +kiwisolver==1.4.4 +matplotlib==3.7.0 +multidict==6.0.4 +openai==0.27.0 +packaging==23.0 +psutil==5.9.4 +pycocotools==2.0.6 +pyparsing==3.0.9 +python-dateutil==2.8.2 +pyyaml==6.0 +regex==2022.10.31 +tokenizers==0.13.2 +tqdm==4.64.1 +transformers +timm==0.6.13 +spacy==3.5.1 +webdataset==0.2.48 +scikit-learn==1.2.2 +scipy==1.10.1 +yarl==1.8.2 +zipp==3.14.0 +omegaconf==2.3.0 +opencv-python==4.7.0.72 +iopath==0.1.10 +decord==0.6.0 +tenacity==8.2.2 +peft +pycocoevalcap +sentence-transformers +umap-learn +notebook +gradio==3.24.1 +gradio-client==0.0.8 +wandb diff --git a/eval_mm/vqaeval/shell/run_inference.sh b/eval_mm/vqaeval/shell/run_inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..f255ff18b50e4734aad8431535962019ad2e842d --- /dev/null +++ b/eval_mm/vqaeval/shell/run_inference.sh @@ -0,0 +1,15 @@ +export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +python -m torch.distributed.launch \ + --nproc_per_node=${NPROC_PER_NODE:-8} \ + --nnodes=${WORLD_SIZE:-1} \ + --node_rank=${RANK:-0} \ + --master_addr=${MASTER_ADDR:-127.0.0.1} \ + --master_port=${MASTER_PORT:-12345} \ + ./eval.py \ + --model_name minicpm \ + --model_path \ + --generate_method interleave \ + --eval_textVQA \ + --eval_docVQA \ + --answer_path ./answers \ + --batchsize 1 \ No newline at end of file diff --git a/eval_mm/vqaeval/shell/run_transform.sh b/eval_mm/vqaeval/shell/run_transform.sh new file mode 100644 index 0000000000000000000000000000000000000000..a19a83fba836a366d4893635aee57852b77f2d8b --- /dev/null +++ b/eval_mm/vqaeval/shell/run_transform.sh @@ -0,0 +1,3 @@ +python ./transform_docvqatest_for_submission.py \ + --input_file_path \ + --output_file_path \ No newline at end of file diff --git a/eval_mm/vqaeval/transform_docvqatest_for_submission.py 
b/eval_mm/vqaeval/transform_docvqatest_for_submission.py new file mode 100644 index 0000000000000000000000000000000000000000..44048fef9b2c8bc673d449096504fbc708a24762 --- /dev/null +++ b/eval_mm/vqaeval/transform_docvqatest_for_submission.py @@ -0,0 +1,16 @@ +import argparse +import json + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input_file_path", type=str, default="", help="path to the originial output json.") + parser.add_argument("--output_file_path", type=str, default="", help="path to where you want to save the processed json.") + args = parser.parse_args() + + with open(args.input_file_path , 'r') as f: + data = json.load(f) + + transformed_data = [{"questionId": item["question_id"], "answer": item["answer"].replace("</s>", "")} for item in data] + + with open(args.output_file_path, 'w') as f: + json.dump(transformed_data, f) diff --git a/finetune/__init__.py b/finetune/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/finetune/dataset.py b/finetune/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..c2dbfda6a694e9317b0e9b2c1e2c086ba985b68e --- /dev/null +++ b/finetune/dataset.py @@ -0,0 +1,457 @@ +import copy +import json +import logging +import math +import os +from dataclasses import dataclass, field +from typing import Dict, List, Optional + +import numpy as np +import torch +from PIL import Image +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import Dataset +from transformers import AutoProcessor, AutoTokenizer + + +class SupervisedDataset(Dataset): + """Dataset for supervised fine-tuning.""" + + def __init__( + self, + raw_data, + transform, + tokenizer, + slice_config, + llm_type="minicpm", + patch_size=14, + query_nums=64, + batch_vision=False, + ): + super(SupervisedDataset, self).__init__() + self.raw_data = raw_data + self.tokenizer = tokenizer + self.transform = transform + self.slice_config = slice_config + self.llm_type = llm_type + self.patch_size = patch_size + self.query_nums=query_nums + self.batch_vision = batch_vision + + def __len__(self): + return len(self.raw_data) + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + image = Image.open(self.raw_data[i]["image"]).convert("RGB") + ret = preprocess( + image, + self.raw_data[i]["conversations"], + self.tokenizer, + self.transform, + query_nums=self.query_nums, + slice_config=self.slice_config, + llm_type=self.llm_type, + patch_size=self.patch_size, + batch_vision=self.batch_vision, + ) + ret = dict( + input_ids=ret["input_ids"], + position_ids=ret["position_ids"], + labels=ret["target"], + attention_mask=torch.ones_like(ret["input_ids"], dtype=torch.bool), + pixel_values=ret["pixel_values"], + tgt_sizes=ret["tgt_sizes"], + image_bound=ret["image_bound"], + ) + + return ret + +def data_collator(examples, padding_value=0, max_length=2048): + def trim_and_pad(seq, batch_first, padding_value): + return pad_sequence([s[:max_length] for s in seq], batch_first=True, padding_value=padding_value) + + input_ids = trim_and_pad( + [example["input_ids"] for example in examples], + batch_first=True, + padding_value=padding_value, + ) + position_ids = trim_and_pad( + [example["position_ids"] for example in examples], + batch_first=True, + padding_value=padding_value, + ) + targets = trim_and_pad( + [example["labels"] for example in examples], + batch_first=True, + padding_value=-100, + ) + attention_mask = trim_and_pad( + 
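+        # Padded positions receive `padding_value` (0 by default); since the per-sample
+        # attention masks are boolean ones, the padded positions end up masked out downstream.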
[example["attention_mask"] for example in examples], + batch_first=True, + padding_value=padding_value, + ) + pixel_values = [example["pixel_values"] for example in examples] + image_bound = [example["image_bound"] for example in examples] + tgt_sizes = [example["tgt_sizes"] for example in examples] + return { + "input_ids": input_ids, + "position_ids": position_ids, + "labels": targets, + "attention_mask": attention_mask, + "image_bound": image_bound, + "tgt_sizes": tgt_sizes, + "pixel_values": pixel_values, + } + + +def conversation_to_ids(conversation, tokenizer, llm_type=None): + """ + for single image multi-turn conversation + conversation: [{'role': 'user', 'content': 'Describe this image'}, + {'role': 'assistant', 'content': 'This is a cat.'}] + """ + if llm_type == "llama3": + input_ids, context, raw_msg = conversation_to_ids_llama3( + conversation, tokenizer + ) + else: + input_ids, context, raw_msg = conversation_to_ids_minicpm( + conversation, tokenizer + ) + + ids = torch.from_numpy(np.hstack(input_ids, dtype=np.int32)) + context = torch.from_numpy(np.hstack(context, dtype=np.int8)) + + # build target + target = torch.full_like(ids, -100, dtype=torch.int32) + for i in range(1, len(ids)): + if context[i] == 0: + target[i - 1] = ids[i] + if context[i] == 1 and context[i - 1] == 0: + if hasattr(tokenizer, "eot_id"): + target[i - 1] = tokenizer.eot_id + else: + target[i - 1] = tokenizer.eos_id + + # build image bound + image_start_tokens = torch.where(ids == tokenizer.im_start_id)[0] + image_start_tokens += 1 + image_end_tokens = torch.where(ids == tokenizer.im_end_id)[0] + if len(image_start_tokens) != len(image_end_tokens): + print("image start token != image end tokens") + + if len(image_start_tokens) > 0: + image_bound = torch.hstack( + [image_start_tokens.unsqueeze(-1), image_end_tokens.unsqueeze(-1)] + ) + else: + image_bound = [] + + position_ids = torch.arange(ids.size(0)).long() + return { + "input_ids": ids, + "target": target, + "image_bound": image_bound, + "raw_msg": raw_msg, + "position_ids": position_ids + } + + +def conversation_to_ids_minicpm(conversation, tokenizer): + raw_msg = "" + input_ids = [] + context = [] + for idx, msg in enumerate(conversation): + role = msg["role"] + message = msg["content"] + assert role in ["user", "assistant"] + if role == "user": + prefix = "<用户>" + else: + prefix = "<AI>" + # append eos + if idx == len(conversation) - 1: + message = message + tokenizer.eos_token + prefix_ids = tokenizer.encode(prefix)[1:] # remove bos + message_ids = tokenizer.encode(message)[1:] + + input_ids.append(prefix_ids) + input_ids.append(message_ids) + + context.append(np.ones((len(prefix_ids),), dtype=np.int8)) + if role == "assistant": + context.append(np.zeros((len(message_ids),), dtype=np.int8)) + else: + context.append(np.ones((len(message_ids),), dtype=np.int8)) + + raw_msg += prefix + message + + return input_ids, context, raw_msg + + +def conversation_to_ids_llama3(conversation, tokenizer): + raw_msg = "" + input_ids = [] + context = [] + raw_msg = tokenizer.apply_chat_template( + conversation, tokenize=False, add_generation_prompt=False + ) + input_ids = tokenizer.apply_chat_template( + conversation, tokenize=True, add_generation_prompt=False + ) + input_ids = np.array(input_ids) + + start_header_idxs = np.where( + input_ids == tokenizer.convert_tokens_to_ids("<|start_header_id|>") + )[0] + assistant_idxs = np.where( + input_ids == tokenizer.convert_tokens_to_ids("assistant") + )[0] + end_header_idxs = np.where( + input_ids == 
tokenizer.convert_tokens_to_ids("<|end_header_id|>") + )[0] + eot_idxs = np.where( + input_ids == tokenizer.convert_tokens_to_ids("<|eot_id|>"))[0] + + context = np.ones_like(input_ids, dtype=np.int8) + + for assistant_idx in assistant_idxs: + if assistant_idx in set((start_header_idxs + end_header_idxs) / 2): + st = assistant_idx + 3 # assistant<|end_header_id|>\n\n + for eot_idx in eot_idxs: + if eot_idx > st: + context[st: eot_idx + 1] = 0 + break + + input_ids = np.hstack(input_ids) + context = np.hstack(context) + + return input_ids, context, raw_msg + + +def preprocess( + image, + conversation, + tokenizer, + transform, + query_nums=64, + slice_config=None, + llm_type=None, + patch_size=14, + batch_vision=False, +): + """ + single image preprocess, the image will be placed at the top of the conversation + """ + conversation = copy.deepcopy(conversation) + assert len(conversation) > 1, "conversation length must large than 2" + assert conversation[0]["role"] == "user", "the first role must be user" + + if slice_config is not None: + assert isinstance(slice_config, Dict) + assert "patch_size" in slice_config + assert "max_slice_nums" in slice_config + assert "scale_resolution" in slice_config + default_image_placeholder = ( + tokenizer.im_start + tokenizer.unk_token * query_nums + tokenizer.im_end + ) + if slice_config: + images = [] + source_image, patches, best_grid = slice_image( + image, + slice_config["max_slice_nums"], + slice_config["scale_resolution"], + slice_config["patch_size"], + ) + images.append(source_image) + image_placeholder = default_image_placeholder + if len(patches) > 0: + for i in range(len(patches)): + for j in range(len(patches[0])): + images.append(patches[i][j]) + + image_placeholder += get_grid_placeholder( + tokenizer, best_grid, query_nums) + images = [transform(i) for i in images] + else: + images = [transform(image)] + image_placeholder = default_image_placeholder + if "<image>" in conversation[0]["content"]: + conversation[0]["content"] = conversation[0]["content"].replace( + "<image>", image_placeholder + ) + else: + conversation[0]["content"] = ( + image_placeholder + "\n" + conversation[0]["content"] + ) + + input_dict = conversation_to_ids(conversation, tokenizer, llm_type) + + if batch_vision: + tgt_sizes = [] + reshape_images = [] + for image in images: + H, W = image.shape[1:] + reshape_image = reshape_by_patch(image, patch_size) + reshape_images.append(reshape_image) + tgt_sizes.append([H // patch_size, W // patch_size]) + if tgt_sizes: + tgt_sizes = torch.Tensor(tgt_sizes).type(torch.int32) + + input_dict["pixel_values"] = reshape_images + input_dict["tgt_sizes"] = tgt_sizes + + else: + input_dict["pixel_values"] = images + input_dict["tgt_sizes"] = [] + + return input_dict + + +def slice_image( + image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False +): + original_size = image.size + original_width, original_height = original_size + log_ratio = math.log(original_width / original_height) + ratio = original_width * original_height / \ + (scale_resolution * scale_resolution) + multiple = min(math.ceil(ratio), max_slice_nums) + + source_image = None + best_grid = None + patches = [] + + if multiple <= 1 or never_split: + # dont need to slice, upsample + best_size = find_best_resize( + original_size, scale_resolution, patch_size, allow_upscale=True + ) + source_image = image.resize(best_size, Image.Resampling.BICUBIC) + else: + candidate_split_grids_nums = [] + for i in [multiple - 1, multiple, multiple + 1]: + if i == 1 or 
i > max_slice_nums: + continue + candidate_split_grids_nums.append(i) + + # source image, down-sampling and ensure divided by patch_size + best_resize = find_best_resize( + original_size, scale_resolution, patch_size) + source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) + candidate_grids = [] + + # find best grid + for split_grids_nums in candidate_split_grids_nums: + m = 1 + while m <= split_grids_nums: + if split_grids_nums % m == 0: + candidate_grids.append([m, split_grids_nums // m]) + m += 1 + + best_grid = [1, 1] + min_error = float("inf") + for grid in candidate_grids: + error = abs(log_ratio - math.log(grid[0] / grid[1])) + if error < min_error: + best_grid = grid + min_error = error + + refine_size = get_refine_size( + original_size, best_grid, scale_resolution, patch_size, allow_upscale=True + ) + + refine_image = image.resize(refine_size, Image.Resampling.BICUBIC) + patches = split_to_patches(refine_image, best_grid) + + return source_image, patches, best_grid + + +def ensure_divide(length, patch_size): + return max(round(length / patch_size) * patch_size, patch_size) + + +def find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=False): + width, height = original_size + if (width * height > scale_resolution * scale_resolution) or allow_upscale: + r = width / height + height = int(scale_resolution / math.sqrt(r)) + width = int(height * r) + best_width = ensure_divide(width, patch_size) + best_height = ensure_divide(height, patch_size) + return (best_width, best_height) + + +def get_refine_size( + original_size, grid, scale_resolution, patch_size, allow_upscale=False +): + width, height = original_size + grid_x, grid_y = grid + + refine_width = ensure_divide(width, grid_x) + refine_height = ensure_divide(height, grid_y) + + grid_width = refine_width / grid_x + grid_height = refine_height / grid_y + + best_grid_size = find_best_resize( + (grid_width, grid_height), + scale_resolution, + patch_size, + allow_upscale=allow_upscale, + ) + + refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y) + + return refine_size + + +def split_to_patches(image, grid): + patches = [] + width, height = image.size + grid_x = int(width / grid[0]) + grid_y = int(height / grid[1]) + + for i in range(0, height, grid_y): + images = [] + for j in range(0, width, grid_x): + box = (j, i, j + grid_x, i + grid_y) + patch = image.crop(box) + images.append(patch) + patches.append(images) + + return patches + + +def get_grid_placeholder(tokenizer, grid, query_num): + image_placeholder = ( + tokenizer.im_start + tokenizer.unk_token * query_num + tokenizer.im_end + ) + + cols = grid[0] + rows = grid[1] + slices = [] + for i in range(rows): + lines = [] + for j in range(cols): + lines.append(image_placeholder) + slices.append("".join(lines)) + slice_placeholder = tokenizer.slice_start + \ + "\n".join(slices) + tokenizer.slice_end + return slice_placeholder + + +def reshape_by_patch(image_tensor, patch_size): + """ + :param image_tensor: shape [3, H, W] + :param patch_size: + :return: [3, patch_size, HW/patch_size] + """ + patches = torch.nn.functional.unfold( + image_tensor, (patch_size, patch_size), stride=(patch_size, patch_size) + ) + + patches = patches.reshape(image_tensor.size(0), patch_size, patch_size, -1) + patches = patches.permute(0, 1, 3, 2).reshape( + image_tensor.size(0), patch_size, -1) + return patches diff --git a/finetune/ds_config_zero2.json b/finetune/ds_config_zero2.json new file mode 100644 index 
0000000000000000000000000000000000000000..4d42d440b4b2b55dba2e54863109773b1ee3fee9 --- /dev/null +++ b/finetune/ds_config_zero2.json @@ -0,0 +1,54 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "bf16": { + "enabled": "auto" + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 100, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/finetune/ds_config_zero3.json b/finetune/ds_config_zero3.json new file mode 100644 index 0000000000000000000000000000000000000000..5c3cd9c9274dee4f57449196edbc07920b3b94ec --- /dev/null +++ b/finetune/ds_config_zero3.json @@ -0,0 +1,61 @@ + +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 100, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} + diff --git a/finetune/finetune.py b/finetune/finetune.py new file mode 100644 index 0000000000000000000000000000000000000000..27385559ab6a1a4c8c6ac70a730286505b1adf08 --- /dev/null +++ b/finetune/finetune.py @@ -0,0 +1,312 @@ +import glob +import json +import logging +import os +from dataclasses import dataclass, field +from functools import partial +from typing import Dict, List, Optional, Union, Literal, Tuple +from types import MethodType +import torch +import transformers +from accelerate.utils import DistributedType +from deepspeed import zero +from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus + +from transformers import AutoModel, AutoTokenizer +from transformers.integrations import deepspeed +from transformers import AutoModel, AutoTokenizer + +from dataset import SupervisedDataset, data_collator +from trainer import CPMTrainer + +from peft import LoraConfig, get_peft_model + +@dataclass +class ModelArguments: + 
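+    # Hugging Face model id or local path of the base MiniCPM-V checkpoint to finetune.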
model_name_or_path: Optional[str] = field(default="openbmb/MiniCPM-V-2") + + +@dataclass +class DataArguments: + data_path: str = field( + default=None, metadata={"help": "Path to the training data."} + ) + eval_data_path: str = field( + default=None, metadata={"help": "Path to the evaluation data."} + ) + + +@dataclass +class TrainingArguments(transformers.TrainingArguments): + cache_dir: Optional[str] = field(default=None) + optim: str = field(default="adamw_torch") + model_max_length: int = field( + default=2048, + metadata={ + "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)." + }, + ) + tune_vision: Optional[bool] = field(default=True) + tune_llm: Optional[bool] = field(default=False) + llm_type: str = field(default="minicpm") + use_lora: Optional[bool] = field(default=False) + + +@dataclass +class LoraArguments: + lora_r: int = 64 + lora_alpha: int = 64 + lora_dropout: float = 0.05 + lora_target_modules: str = r"llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj|v_proj)" + lora_weight_path: str = "" + lora_bias: str = "none" + q_lora: bool = False + lora_modules_to_save: str = "" + lora_layer_replication: Optional[List[Tuple[int, int]]] = None + lora_layers_to_transform: Optional[List[int]] = None + lora_layers_pattern: Optional[str] = None + +def maybe_zero_3(param): + if hasattr(param, "ds_id"): + assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE + with zero.GatheredParameters([param]): + param = param.data.detach().cpu().clone() + else: + param = param.detach().cpu().clone() + return param + + +# Borrowed from peft.utils.get_peft_model_state_dict +def get_peft_state_maybe_zero_3(named_params, bias): + if bias == "none": + to_return = {k: t for k, t in named_params if "lora_" in k} + elif bias == "all": + to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k} + elif bias == "lora_only": + to_return = {} + maybe_lora_bias = {} + lora_bias_names = set() + for k, t in named_params: + if "lora_" in k: + to_return[k] = t + bias_name = k.split("lora_")[0] + "bias" + lora_bias_names.add(bias_name) + elif "bias" in k: + maybe_lora_bias[k] = t + for k, t in maybe_lora_bias: + if bias_name in lora_bias_names: + to_return[bias_name] = t + else: + raise NotImplementedError + to_return = {k: maybe_zero_3(v) for k, v in to_return.items()} + return to_return + + +local_rank = None +def rank0_print(*args): + if local_rank == 0: + print(*args) + + +def safe_save_model_for_hf_trainer(trainer, output_dir: str, bias="none"): + """Collects the state dict and dump to disk.""" + # check if zero3 mode enabled + if deepspeed.is_deepspeed_zero3_enabled(): + state_dict = trainer.model_wrapped._zero3_consolidated_16bit_state_dict() + else: + if trainer.args.use_lora: + state_dict = get_peft_state_maybe_zero_3( + trainer.model.named_parameters(), bias + ) + else: + state_dict = trainer.model.state_dict() + if trainer.args.should_save and trainer.args.local_rank == 0: + trainer._save(output_dir, state_dict=state_dict) + + +def make_supervised_data_module( + tokenizer: transformers.PreTrainedTokenizer, + data_args, + transform, + data_collator=None, + llm_type="minicpm", + slice_config=None, + patch_size=14, + query_nums=64, + batch_vision=False, + max_length=2048, +) -> Dict: + """Make dataset and collator for supervised fine-tuning.""" + dataset_cls = SupervisedDataset + + rank0_print("Loading data...") + + train_json = json.load(open(data_args.data_path, "r")) + train_dataset = dataset_cls( + train_json, + transform, + tokenizer, + 
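+        # slice_config, patch_size, query_nums and batch_vision below are read from the
+        # loaded model's config in train() and forwarded here unchanged.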
slice_config=slice_config, + llm_type=llm_type, + patch_size=patch_size, + query_nums=query_nums, + batch_vision=batch_vision, + ) + + if data_args.eval_data_path: + eval_json = json.load(open(data_args.eval_data_path, "r")) + eval_dataset = dataset_cls( + eval_json, + transform, + tokenizer, + slice_config=slice_config, + llm_type=llm_type, + patch_size=patch_size, + query_nums=query_nums, + batch_vision=batch_vision, + ) + else: + eval_dataset = None + + return dict( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + data_collator= partial(data_collator, max_length=max_length), + ) + + +def get_parameter_number(model): + trainable_params, all_param = 0, 0 + for param in model.parameters(): + num_params = param.numel() + # if using DS Zero 3 and the weights are initialized empty + if num_params == 0 and hasattr(param, "ds_numel"): + num_params = param.ds_numel + + all_param += num_params + if param.requires_grad: + trainable_params += num_params + + return {'Total': all_param, 'Trainable': trainable_params} + + +local_rank = 0 + + +def train(): + global local_rank + parser = transformers.HfArgumentParser( + (ModelArguments, DataArguments, TrainingArguments, LoraArguments) + ) + + ( + model_args, + data_args, + training_args, + lora_args, + ) = parser.parse_args_into_dataclasses() + + if getattr(training_args, "deepspeed", None) : + training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED + + compute_dtype = ( + torch.float16 + if training_args.fp16 + else (torch.bfloat16 if training_args.bf16 else torch.float32) + ) + + local_rank = training_args.local_rank + world_size = int(os.environ.get("WORLD_SIZE", 1)) + ddp = world_size != 1 + device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else None + + model = AutoModel.from_pretrained( + model_args.model_name_or_path, + trust_remote_code=True, + torch_dtype=compute_dtype, + device_map=device_map, + ) + + tokenizer = AutoTokenizer.from_pretrained( + model_args.model_name_or_path, trust_remote_code=True + ) + + if not training_args.tune_vision: + model.vpm.requires_grad_(False) + if not training_args.tune_llm: + model.llm.requires_grad_(False) + + if training_args.use_lora: + if training_args.use_lora and training_args.tune_llm: + raise ValueError("The model cannot simultaneously adjust LLM parameters and apply LoRA.") + + rank0_print("Currently using LoRA for fine-tuning the MiniCPM-V model.") + for name, param in model.llm.named_parameters(): + param.requires_grad = False + lora_config = LoraConfig( + r=lora_args.lora_r, + lora_alpha=lora_args.lora_alpha, + target_modules=lora_args.lora_target_modules, + lora_dropout=lora_args.lora_dropout, + bias=lora_args.lora_bias, + layers_to_transform=lora_args.lora_layers_to_transform, + task_type="CAUSAL_LM", + ) + if training_args.gradient_checkpointing: + def get_input_embeddings(self): + return self.llm.get_input_embeddings() + model.get_input_embeddings = MethodType(get_input_embeddings, model) + model = get_peft_model(model, lora_config) + model.base_model.llm.model.embed_tokens.weight.requires_grad_(True) + if training_args.gradient_checkpointing: + model.enable_input_require_grads() + + rank0_print(get_parameter_number(model)) + + llm_type = training_args.llm_type + if llm_type == "llama3": + tokenizer.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = 
bos_token + content %}{% endif %}{{ content }}{% endfor %}" + + rank0_print(f'llm_type={llm_type}') + + # Load data + if hasattr(model.config, "slice_config"): + slice_config = model.config.slice_config.to_dict() + else: + slice_config = model.config.to_dict() + if hasattr(model.config, "batch_vision_input"): + batch_vision = model.config.batch_vision_input + else: + batch_vision = False + + data_module = make_supervised_data_module( + tokenizer=tokenizer, + data_args=data_args, + transform=model.transform, + data_collator=data_collator, + slice_config=slice_config, + llm_type=llm_type, + patch_size=model.config.patch_size, + query_nums=model.config.query_num, + batch_vision=batch_vision, + max_length=training_args.model_max_length, + ) + + trainer = CPMTrainer( + model=model, + tokenizer=tokenizer, + args=training_args, + **data_module, + ) + + trainer.train() + trainer.save_state() + + safe_save_model_for_hf_trainer( + trainer=trainer, + output_dir=training_args.output_dir, + bias=lora_args.lora_bias) + + +if __name__ == "__main__": + train() diff --git a/finetune/finetune_ds.sh b/finetune/finetune_ds.sh new file mode 100644 index 0000000000000000000000000000000000000000..ecc6cd656d8d1ef493d56b9e3d7d65083fbad3af --- /dev/null +++ b/finetune/finetune_ds.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +GPUS_PER_NODE=8 +NNODES=1 +NODE_RANK=0 +MASTER_ADDR=localhost +MASTER_PORT=6001 + +MODEL="openbmb/MiniCPM-Llama3-V-2_5" # or openbmb/MiniCPM-V-2 +# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. +# See the section for finetuning in README for more information. +DATA="path/to/trainging_data" +EVAL_DATA="path/to/test_data" +LLM_TYPE="llama3" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" +torchrun $DISTRIBUTED_ARGS finetune.py \ + --model_name_or_path $MODEL \ + --llm_type $LLM_TYPE \ + --data_path $DATA \ + --eval_data_path $EVAL_DATA \ + --remove_unused_columns false \ + --label_names "labels" \ + --prediction_loss_only false \ + --bf16 true \ + --bf16_full_eval true \ + --fp16 false \ + --fp16_full_eval false \ + --do_train \ + --do_eval \ + --tune_vision true \ + --tune_llm true \ + --model_max_length 2048 \ + --max_steps 10000 \ + --eval_steps 1000 \ + --output_dir output/output_minicpmv2 \ + --logging_dir output/output_minicpmv2 \ + --logging_strategy "steps" \ + --per_device_train_batch_size 2 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "steps" \ + --save_strategy "steps" \ + --save_steps 1000 \ + --save_total_limit 10 \ + --learning_rate 1e-6 \ + --weight_decay 0.1 \ + --adam_beta2 0.95 \ + --warmup_ratio 0.01 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --gradient_checkpointing true \ + --deepspeed ds_config_zero2.json \ + --report_to "tensorboard" diff --git a/finetune/finetune_lora.sh b/finetune/finetune_lora.sh new file mode 100644 index 0000000000000000000000000000000000000000..deba0d5a16421dac982394f53de15f08ae729d04 --- /dev/null +++ b/finetune/finetune_lora.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +GPUS_PER_NODE=8 +NNODES=1 +NODE_RANK=0 +MASTER_ADDR=localhost +MASTER_PORT=6001 + +MODEL="openbmb/MiniCPM-Llama3-V-2_5" # or openbmb/MiniCPM-V-2 +# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 
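+# For example (illustrative paths only, adjust to your setup):
+#   DATA="data/vl_finetune_data.json"
+#   EVAL_DATA="data/vl_eval_data.json"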
+# See the section for finetuning in README for more information.
+DATA="path/to/training_data"
+EVAL_DATA="path/to/test_data"
+LLM_TYPE="llama3" # if using openbmb/MiniCPM-V-2, set LLM_TYPE=minicpm
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+torchrun $DISTRIBUTED_ARGS finetune.py \
+    --model_name_or_path $MODEL \
+    --llm_type $LLM_TYPE \
+    --data_path $DATA \
+    --eval_data_path $EVAL_DATA \
+    --remove_unused_columns false \
+    --label_names "labels" \
+    --prediction_loss_only false \
+    --bf16 true \
+    --bf16_full_eval true \
+    --do_train \
+    --do_eval \
+    --tune_vision true \
+    --tune_llm false \
+    --use_lora true \
+    --lora_target_modules "llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj)" \
+    --model_max_length 2048 \
+    --max_steps 10000 \
+    --eval_steps 1000 \
+    --output_dir output/output_minicpmv2_lora \
+    --logging_dir output/output_minicpmv2_lora \
+    --logging_strategy "steps" \
+    --per_device_train_batch_size 2 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 1 \
+    --evaluation_strategy "steps" \
+    --save_strategy "steps" \
+    --save_steps 1000 \
+    --save_total_limit 10 \
+    --learning_rate 1e-6 \
+    --weight_decay 0.1 \
+    --adam_beta2 0.95 \
+    --warmup_ratio 0.01 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --gradient_checkpointing true \
+    --deepspeed ds_config_zero2.json \
+    --report_to "tensorboard" # wandb
diff --git a/finetune/readme.md b/finetune/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..6bf6d8d11a0a1549d3b302689fdcd6a8f31ddc1a
--- /dev/null
+++ b/finetune/readme.md
@@ -0,0 +1,149 @@
+# MiniCPM-V Finetuning
+
+
+We offer official scripts for easy finetuning of the pretrained **MiniCPM-Llama3-V 2.5** and **MiniCPM-V 2.0** on downstream tasks. Our finetune scripts use the transformers Trainer and DeepSpeed by default.
+
+### Data preparation
+
+To prepare your finetuning data, formulate each sample as a dictionary consisting of an id, an image path, and a list of conversations, then save the samples in a JSON file.
+
+For vision-language samples, you are required to provide **\<image\>** to define the position where the image embeddings are inserted. If you don't provide \<image\>, the image is placed at the front of the conversation.
+
+<details>
+  <summary>
+    <b>vision-language example (vl_finetune_data.json) with 1 sample.</b>
+  </summary>
+
+```json
+[
+    {
+        "id": "0",
+        "image": "path/to/image_0.jpg",
+        "conversations": [
+            {
+                "role": "user",
+                "content": "<image>\nHow many desserts are on the white plate?"
+            },
+            {
+                "role": "assistant",
+                "content": "There are three desserts on the white plate."
+            },
+            {
+                "role": "user",
+                "content": "What type of desserts are they?"
+            },
+            {
+                "role": "assistant",
+                "content": "The desserts are cakes with bananas and pecans on top. They share similarities with donuts, but the presence of bananas and pecans differentiates them."
+            },
+            {
+                "role": "user",
+                "content": "What is the setting of the image?"
+            },
+            {
+                "role": "assistant",
+                "content": "The image is set on a table top with a plate containing the three desserts."
+            }
+        ]
+    }
+]
+```
+
+</details>
+
+### Full-parameter finetuning
+
+Full-parameter finetuning updates all LLM parameters throughout training. Please specify the correct MODEL path, DATA path and LLM_TYPE in the shell scripts.
+
+```shell
+MODEL="openbmb/MiniCPM-Llama3-V-2_5" # or openbmb/MiniCPM-V-2
+DATA="path/to/training_data" # json file
+EVAL_DATA="path/to/test_data" # json file
+LLM_TYPE="llama3" # if using openbmb/MiniCPM-V-2, set LLM_TYPE=minicpm
+```
+
+To launch your training, run the following script:
+
+```
+sh finetune_ds.sh
+```
+
+Note that Llama3 uses different chat templates for training and inference: we modify the chat_template for training, so please restore the original chat_template when running inference on a checkpoint produced by this training.
+
+### LoRA finetuning
+
+LoRA allows lightweight model tuning in which only a small subset of parameters is updated. We provide a LoRA implementation based on `peft`. To launch your training, run the following script:
+
+```
+sh finetune_lora.sh
+```
+
+After training, you can load the model by passing the path to the adapter. We advise you to use an absolute path for your pretrained model, because LoRA saves only the adapter, and the absolute path recorded in the adapter configuration json file is used to locate the pretrained base model.
+
+```python
+from peft import AutoPeftModelForCausalLM
+
+model = AutoPeftModelForCausalLM.from_pretrained(
+    # path to the output directory
+    path_to_adapter,
+    device_map="auto",
+    trust_remote_code=True
+).eval()
+```
+
+
+### Model Fine-tuning Memory Usage Statistics
+
+The following table presents the memory usage of the model when fine-tuning on NVIDIA A100 (80 GiB) GPUs under different numbers of GPUs. The fine-tuning was performed with DeepSpeed ZeRO-2 optimization and gradient checkpointing, with the maximum length set to 2048 and batch size set to 1.
+
+| Fine-tuning Method | GPUs: 2 | GPUs: 4 | GPUs: 8 |
+|--------------------|---------|---------|---------|
+| LoRA Fine-tuning | 31.2 GiB | 29.3 GiB | 28.4 GiB |
+| Full Parameters Fine-tuning | Out of memory | 75.0 GiB | 51.2 GiB |
+
+### Notes
+- **Fine-tuning Method**: Two strategies are shown, LoRA fine-tuning and full-parameter fine-tuning.
+- **Number of GPUs**: The table lists the memory usage for configurations with 2, 4, and 8 GPUs.
+- **Memory Usage**: Expressed in GiB, this shows the memory required for each fine-tuning method under the corresponding GPU configuration.
+- **Out of memory**: Indicates that the memory was insufficient for full-parameter fine-tuning under the current GPU configuration.
+
+### Finetuning FAQs
+<details>
+<summary>Q: How do I use the `flash_attention_2` implementation when loading a pretrained model?</summary>
+
+A: If your environment supports `flash_attn2`, you can add the argument `_attn_implementation="flash_attention_2"` when loading a model with `AutoModel.from_pretrained`. For example:
+
+```python
+model = AutoModel.from_pretrained('model_name', _attn_implementation="flash_attention_2")
+```
+</details>
+
+<details>
+<summary>Q: What if our data is resized to 512? Can we use the original image size instead?</summary>
+
+A: Our model supports up to 1344x1344 lossless encoding. If you are currently resizing your images to 512, you might want to try using the original image sizes instead; a high-resolution image encoding scheme is applied automatically by default.
+
+</details>
+
+<details>
+<summary>Q: What should we do if we encounter out-of-memory (OOM) errors?</summary>
+
+A: If you experience OOM issues, consider reducing the batch size (`bs`). To maintain an equivalent total batch size, you can adjust the `gradient_accumulation_steps` setting.
+
+
+### Model Fine-tuning Memory Usage Statistics
+
+The following table presents the memory usage of the model when fine-tuning on NVIDIA A100 (80 GiB) GPUs with different numbers of GPUs. Fine-tuning was performed with DeepSpeed ZeRO-2 optimization and gradient checkpointing, with the maximum length set to 2048 and the batch size set to 1.
+
+| Fine-tuning Method | GPUs: 2 | GPUs: 4 | GPUs: 8 |
+|--------------------|---------|---------|---------|
+| LoRA Fine-tuning | 31.2 GiB | 29.3 GiB | 28.4 GiB |
+| Full-parameter Fine-tuning | Out of memory | 75.0 GiB | 51.2 GiB |
+
+### Notes
+- **Fine-tuning Method**: The two fine-tuning strategies compared, LoRA fine-tuning and full-parameter fine-tuning.
+- **Number of GPUs**: The table lists the memory usage for configurations with 2, 4, and 8 GPUs.
+- **Memory Usage**: Expressed in GiB, this shows the memory required by each fine-tuning method under the corresponding GPU configuration.
+- **Out of memory**: Indicates that the memory was insufficient for full-parameter fine-tuning under the current GPU configuration.
+
+### Finetuning FAQs
+<details>
+<summary>Q: How do I use the `flash_attention_2` implementation when loading a pretrained model?</summary>
+
+A: If your environment supports `flash_attn2`, you can add the argument `_attn_implementation="flash_attention_2"` when loading a model with `AutoModel.from_pretrained`. For example:
+
+```python
+model = AutoModel.from_pretrained('model_name', _attn_implementation="flash_attention_2")
+```
+</details>
+
+<details>
+<summary>Q: What if our data is resized to 512? Can we use the original image size instead?</summary>
+
+A: Our model supports up to 1344x1344 lossless encoding. If you are currently resizing your images to 512, you might want to try using the original image sizes instead; a high-definition image encoding scheme is applied automatically by default.
+
+</details>
+
+<details>
+<summary>Q: What should we do if we encounter out-of-memory (OOM) errors?</summary>
+
+A: If you experience OOM issues, consider reducing the per-device batch size. To keep the effective total batch size unchanged, increase `gradient_accumulation_steps` accordingly; for example, halving `per_device_train_batch_size` from 2 to 1 while doubling `gradient_accumulation_steps` from 1 to 2 processes the same amount of data per optimizer step while using less memory.
+</details>
+
+<details>
+<summary>Q: How can we determine the maximum length for our training data, and what if we do not want to train the vision encoder?</summary>
+
+A: We recommend using the function [here](https://github.com/OpenBMB/MiniCPM-V/blob/main/finetune/dataset.py#L220) to sample the length of your training data. Note that the `input_ids` length includes the image portion. Once you determine the maximum length, you can specify it in the startup command with `--model_max_length xxx`.
+
+Additionally, if you prefer not to train the vision encoder, you can add `--tune_vision false` to your command.
+
+</details>
+
+<details>
+<summary>Q: How can we adjust training hyperparameters when using LoRA to train our model?</summary>
+
+A: You can refer to the [LoRA documentation](https://huggingface.co/docs/peft/en/package_reference/lora#peft.LoraConfig) for guidance on adjusting your training hyperparameters when using LoRA. It provides detailed information on the parameters specific to the LoRA adaptation technique.
+</details>
+
+#### Customizing Hyperparameters
+To tailor the training process to your specific requirements, you can adjust various hyperparameters. For comprehensive documentation of the available options, refer to the [official Transformers documentation](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) and the [LoRA documentation](https://huggingface.co/docs/peft/en/package_reference/lora#peft.LoraConfig). Experimenting with these parameters is essential for achieving optimal performance on your specific task and dataset.
diff --git a/finetune/trainer.py b/finetune/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..773e3587bff2f5ec1c77b02042d0a2635b9adbac
--- /dev/null
+++ b/finetune/trainer.py
@@ -0,0 +1,167 @@
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from transformers import Trainer
+from transformers.trainer_pt_utils import nested_detach
+from transformers.utils import is_sagemaker_mp_enabled
+
+
+class CPMTrainer(Trainer):
+    def compute_loss(self, model, inputs, return_outputs=False):
+        if "labels" in inputs:
+            labels = inputs.pop("labels")
+        else:
+            labels = None
+        if not self.args.use_lora:
+            outputs = self.model(data=inputs, use_cache=False)
+        else:
+            outputs = self.model.base_model(data=inputs, use_cache=False)
+
+        if labels is not None:
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            logits = outputs.logits.view(-1,
+                                         self.model.config.vocab_size).contiguous()
+            labels = labels.view(-1).long().contiguous()
+            # Enable model parallelism
+            labels = labels.to(logits.device)
+            loss = loss_fct(logits, labels)
+        else:
+            if isinstance(outputs, dict) and "loss" not in outputs:
+                raise ValueError(
+                    "The model did not return a loss from the inputs, only the following keys: "
+                    f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
+                )
+            # We don't use .loss here since the model may return tuples instead of ModelOutput.
+ loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + + return (loss, outputs) if return_outputs else loss + + def prediction_step( + self, + model: nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on `model` using `inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Module`): + The model to evaluate. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (`bool`): + Whether or not to return the loss only. + ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + + Return: + Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, + logits and labels (each being optional). + """ + has_labels = ( + False + if len(self.label_names) == 0 + else all(inputs.get(k) is not None for k in self.label_names) + ) + # For CLIP-like models capable of returning loss values. + # If `return_loss` is not specified or being `None` in `inputs`, we check if the default value of `return_loss` + # is `True` in `model.forward`. + return_loss = inputs.get("return_loss", None) + if return_loss is None: + return_loss = self.can_return_loss + loss_without_labels = ( + True if len(self.label_names) == 0 and return_loss else False + ) + + inputs = self._prepare_inputs(inputs) + if ignore_keys is None: + if hasattr(self.model, "config"): + ignore_keys = getattr( + self.model.config, "keys_to_ignore_at_inference", [] + ) + else: + ignore_keys = [] + + # labels may be popped when computing the loss (label smoothing for instance) so we grab them first. 
+ if has_labels or loss_without_labels: + labels = nested_detach(tuple(inputs.get(name) + for name in self.label_names)) + if len(labels) == 1: + labels = labels[0] + else: + labels = None + + with torch.no_grad(): + if is_sagemaker_mp_enabled(): + raw_outputs = smp_forward_only(model, inputs) + if has_labels or loss_without_labels: + if isinstance(raw_outputs, dict): + loss_mb = raw_outputs["loss"] + logits_mb = tuple( + v + for k, v in raw_outputs.items() + if k not in ignore_keys + ["loss"] + ) + else: + loss_mb = raw_outputs[0] + logits_mb = raw_outputs[1:] + + loss = loss_mb.reduce_mean().detach().cpu() + logits = smp_nested_concat(logits_mb) + else: + loss = None + if isinstance(raw_outputs, dict): + logits_mb = tuple( + v for k, v in raw_outputs.items() if k not in ignore_keys + ) + else: + logits_mb = raw_outputs + logits = smp_nested_concat(logits_mb) + else: + if has_labels or loss_without_labels: + with self.compute_loss_context_manager(): + loss, outputs = self.compute_loss( + model, inputs, return_outputs=True + ) + loss = loss.mean().detach() + + if isinstance(outputs, dict): + logits = tuple( + v + for k, v in outputs.items() + if k not in ignore_keys + ["loss"] + ) + else: + logits = outputs[1:] + else: + loss = None + with self.compute_loss_context_manager(): + outputs = model(**inputs) + if isinstance(outputs, dict): + logits = tuple( + v for k, v in outputs.items() if k not in ignore_keys + ) + else: + logits = outputs + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index - 1] + + if prediction_loss_only: + return (loss, None, None) + + logits = nested_detach(logits) + if len(logits) == 1: + logits = logits[0] + + return (loss, logits, labels) diff --git a/minicpm_v1.md b/minicpm_v1.md new file mode 100644 index 0000000000000000000000000000000000000000..af345c2fb780343f37aeea03fd4c1e7ecb80a16a --- /dev/null +++ b/minicpm_v1.md @@ -0,0 +1,214 @@ +## MiniCPM-V 1.0 + + +> Archive at:2024-05-19 + +MiniCPM-V 1.0 is an efficient version with promising performance for deployment. The model is built based on SigLip-400M and [MiniCPM-2.4B](https://github.com/OpenBMB/MiniCPM/), connected by a perceiver resampler. Notable features of MiniCPM-V 1.0 include: + +- ⚡️ **High Efficiency.** + + MiniCPM-V 1.0 can be **efficiently deployed on most GPU cards and personal computers**, and **even on end devices such as mobile phones**. In terms of visual encoding, we compress the image representations into 64 tokens via a perceiver resampler, which is significantly fewer than other LMMs based on MLP architecture (typically > 512 tokens). This allows MiniCPM-V 1.0 to operate with **much less memory cost and higher speed during inference**. + +- 🔥 **Promising Performance.** + + MiniCPM-V 1.0 achieves **state-of-the-art performance** on multiple benchmarks (including MMMU, MME, and MMbech, etc) among models with comparable sizes, surpassing existing LMMs built on Phi-2. It even **achieves comparable or better performance than the 9.6B Qwen-VL-Chat**. + +- 🙌 **Bilingual Support.** + + MiniCPM-V 1.0 is **the first end-deployable LMM supporting bilingual multimodal interaction in English and Chinese**. This is achieved by generalizing multimodal capabilities across languages, a technique from the ICLR 2024 spotlight [paper](https://arxiv.org/abs/2308.12038). 
+ +### Evaluation + +<div align="center"> + +<table style="margin: 0px auto;"> +<thead> + <tr> + <th align="left">Model</th> + <th>Size</th> + <th nowrap="nowrap" >Visual Tokens</th> + <th>MME</th> + <th nowrap="nowrap" >MMB dev (en)</th> + <th nowrap="nowrap" >MMB dev (zh)</th> + <th nowrap="nowrap" >MMMU val</th> + <th nowrap="nowrap" >CMMMU val</th> + </tr> +</thead> +<tbody align="center"> + <tr> + <td align="left">LLaVA-Phi</td> + <td align="right">3B</td> + <td>576</td> + <td>1335</td> + <td>59.8</td> + <td>- </td> + <td>- </td> + <td>- </td> + </tr> + <tr> + <td nowrap="nowrap" align="left">MobileVLM</td> + <td align="right">3B</td> + <td>144</td> + <td>1289</td> + <td>59.6</td> + <td>- </td> + <td>- </td> + <td>- </td> + </tr> + <tr> + <td nowrap="nowrap" align="left" >Imp-v1</td> + <td align="right">3B</td> + <td>576</td> + <td>1434</td> + <td>66.5</td> + <td>- </td> + <td>- </td> + <td>- </td> + </tr> + <tr> + <td nowrap="nowrap" align="left" >Qwen-VL-Chat</td> + <td align="right" >9.6B</td> + <td>256</td> + <td>1487</td> + <td>60.6 </td> + <td>56.7 </td> + <td>35.9 </td> + <td>30.7 </td> + </tr> + <tr> + <td nowrap="nowrap" align="left" >CogVLM</td> + <td align="right">17.4B </td> + <td>1225</td> + <td>1438 </td> + <td>63.7 </td> + <td>53.8 </td> + <td>32.1 </td> + <td>- </td> + </tr> + <tr> + <td nowrap="nowrap" align="left" ><b>MiniCPM-V 1.0</b></td> + <td align="right">3B </td> + <td>64</td> + <td>1452 </td> + <td>67.9 </td> + <td>65.3 </td> + <td>37.2 </td> + <td>32.1 </td> + </tr> +</tbody> +</table> + +</div> + +### Examples + +We deploy MiniCPM-V 1.0 on end devices. The demo video is the raw screen recording on a OnePlus 9R without edition. + +<table align="center"> + <p align="center"> + <img src="assets/gif_cases/蛇_cn.gif" width=36%/> + <img src="assets/gif_cases/Mushroom_en.gif" width=36%/> + </p> +</table> + +## Install + +1. Clone this repository and navigate to the source folder + +```bash +git clone https://github.com/OpenBMB/OmniLMM.git +cd OmniLMM +``` + +2. Create conda environment + +```Shell +conda create -n OmniLMM python=3.10 -y +conda activate OmniLMM +``` + +3. Install dependencies + +```shell +pip install -r requirements.txt +``` + +## Inference + +### Model Zoo +| Model | Description | Download Link | +|:----------------------|:-------------------|:---------------:| +| MiniCPM-V 1.0 | The efficient version for end device deployment. | [🤗](https://huggingface.co/openbmb/MiniCPM-V)    [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-V/files) | + + +### Multi-turn Conversation +Please refer to the following codes to run `MiniCPM-V 1.0`. 
+
+<div align="center">
+<img src="assets/worldmap_ck.jpg" width="500px">
+</div>
+
+
+```python
+import json
+
+from chat import OmniLMMChat, img2base64
+
+chat_model = OmniLMMChat('openbmb/MiniCPM-V')
+
+im_64 = img2base64('./assets/worldmap_ck.jpg')
+
+# First round chat
+msgs = [{"role": "user", "content": "What is interesting about this image?"}]
+
+inputs = {"image": im_64, "question": json.dumps(msgs)}
+answer = chat_model.chat(inputs)
+print(answer)
+
+# Second round chat
+# pass the history context of the multi-turn conversation
+msgs.append({"role": "assistant", "content": answer})
+msgs.append({"role": "user", "content": "Where is China in the image"})
+
+inputs = {"image": im_64, "question": json.dumps(msgs)}
+answer = chat_model.chat(inputs)
+print(answer)
+```
+
+
+### Inference on Mac
+<details>
+<summary>Click to view an example of running MiniCPM-V 1.0 on a Mac with MPS (Apple silicon or AMD GPUs). </summary>
+
+```python
+# test.py
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoTokenizer
+
+model = AutoModel.from_pretrained('openbmb/MiniCPM-V', trust_remote_code=True, torch_dtype=torch.bfloat16)
+model = model.to(device='mps', dtype=torch.float16)
+
+tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V', trust_remote_code=True)
+model.eval()
+
+image = Image.open('./assets/worldmap_ck.jpg').convert('RGB')
+question = 'What is interesting about this image?'
+msgs = [{'role': 'user', 'content': question}]
+
+answer, context, _ = model.chat(
+    image=image,
+    msgs=msgs,
+    context=None,
+    tokenizer=tokenizer,
+    sampling=True
+)
+print(answer)
+```
+Run with the command:
+```shell
+PYTORCH_ENABLE_MPS_FALLBACK=1 python test.py
+```
+</details>
+
+### Deployment on Mobile Phone
+
+Currently MiniCPM-V 1.0 can be deployed on mobile phones running Android and HarmonyOS. 🚀 Try it out [here](https://github.com/OpenBMB/mlc-MiniCPM).
diff --git a/omnilmm.md b/omnilmm.md new file mode 100644 index 0000000000000000000000000000000000000000..3c1753c0b669f514a398fa28056f530f979d21f4 --- /dev/null +++ b/omnilmm.md @@ -0,0 +1,183 @@ +## OmniLMM-12B + +> OmniLMM-12B 发布于本项目早期。推荐您使用我们[最新发布的模型](./README_zh.md),以获得更高效的推理和更强大的性能体验。 + +> 归档时间:2024-05-19 + +**OmniLMM-12B** 是当前系列中性能最佳的版本。该模型基于EVA02-5B和Zephyr-7B-β初始化构建,并使用perceiver resampler连接,采用了课程学习的方法在多模态数据上进行训练。该模型具有三个特点: + +- 🔥 **性能领先。** + + OmniLMM-12B 相比其他同规模模型在多个基准测试中取得**领先的性能**(包括 MME、MMBench、SEED-Bench 等),模型掌握了较为丰富的多模态世界知识。 + +- 🏆 **行为可信。** + + 多模态大模型的幻觉问题备受关注,模型经常生成和图像中的事实不符的文本(例如,确信地描述图片中并不存在的物体)。OmniLMM-12B是 **第一个通过多模态 RLHF 对齐的综合能力优秀的开源多模态大模型**(借助 [RLHF-V](https://rlhf-v.github.io/) [CVPR'24] 系列技术)。该模型在 [MMHal-Bench](https://huggingface.co/datasets/Shengcao1006/MMHal-Bench) 幻觉评测基准上达到**开源模型最佳水平**,并在 [Object HalBench](https://arxiv.org/abs/2312.00849) 中**优于GPT-4V**。 + +- 🕹 **实时多模态交互。** + + 我们尝试结合OmniLMM-12B和GPT-3.5 (纯文本模型) ,实现**实时多模态交互助手**。该模型接受来自摄像头的视频流,并借助工具处理语音输入输出。虽然还很初步,我们发现该模型无需视频编辑可以**复现Gemini演示视频中的一些有趣例子**。 + +### 评测结果 <!-- omit in toc --> + +<div align="center"> + <img src=assets/radar_omnilmm12b.png width=66% /> +</div> +<details> +<summary> MME, MMBench, MMMU, MMBench, MMHal-Bench, Object HalBench, SeedBench, LLaVA Bench W, MathVista 上的详细评测结果。 </summary> + +<table> +<thead> + <tr> + <th align="left">Model</th> + <th>Size</th> + <th>MME</th> + <th nowrap="nowrap">MMB dev (en)</th> + <th nowrap="nowrap" >MMMU val</th> + <th nowrap="nowrap" >MMHal-Bench</th> + <th nowrap="nowrap" >Object HalBench</th> + <th nowrap="nowrap" >SeedBench-I</th> + <th>MathVista</th> + <th nowrap="nowrap" >LLaVA Bench</th> + </tr> +</thead> +<tbody align="center"> + <tr> + <td align="left">GPT-4V†</td> + <td>-</td> + <td>1771.5</td> + <td>75.1 </td> + <td>56.8</td> + <td>3.53 / 70.8</td> + <td>86.4 / 92.7</td> + <td>71.6 </td> + <td>47.8 </td> + <td>93.1 </td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Qwen-VL-Plus†</td> + <td>-</td> + <td>2183.4</td> + <td>66.2 </td> + <td>45.2</td> + <td>- </td> + <td>- </td> + <td>65.7 </td> + <td>36.0 </td> + <td>73.7 </td> + </tr> + <tr> + <td align="left">Yi-VL 6B</td> + <td align="right">6.7B </td> + <td>1915.1 </td> + <td>68.6 </td> + <td>40.3 </td> + <td>- </td> + <td>- </td> + <td>67.5 </td> + <td>28.8 </td> + <td>51.9 </td> + </tr> + <tr> + <td nowrap="nowrap" align="left" >Qwen-VL-Chat</td> + <td align="right">9.6B</td> + <td>1860.0</td> + <td>60.6 </td> + <td>35.9</td> + <td>2.93 / 59.4</td> + <td>56.2 / 80.0</td> + <td>64.8 </td> + <td>33.8 </td> + <td>67.7 </td> + </tr> + <tr> + <td align="left" >CogVLM-Chat</td> + <td align="right">17.4B</td> + <td>1736.6</td> + <td>63.7 </td> + <td>32.1 </td> + <td>2.68 / 52.1 </td> + <td>73.6 / 87.4 </td> + <td>68.8 </td> + <td>34.7 </td> + <td>73.9 </td> + </tr> + <tr> + <td align="left" >LLaVA 1.5</td> + <td align="right">13.6B </td> + <td>1808.4 </td> + <td>68.2 </td> + <td>36.4 </td> + <td>2.71 / 51.0 </td> + <td>53.7 / 77.4 </td> + <td>68.1 </td> + <td>26.4 </td> + <td>64.6 </td> + </tr> + <tr> + <td nowrap="nowrap" align="left" ><b>OmniLMM-12B</b></td> + <td align="right">11.6B </td> + <td>1935.8 </td> + <td>71.6 </td> + <td>40.7 </td> + <td>3.45 / 68.8 </td> + <td>90.3 / 95.5 </td> + <td>71.1 </td> + <td>34.9 </td> + <td>72.0 </td> + </tr> +</tbody> +</table> +<small>†: 闭源模型</small> +<br> +</details> + +### 典型示例 <!-- omit in toc --> + +<table align="center" > + <p align="center" > + <img src="assets/omnilmm-12b-examples_2.png" /> + </p> +</table> + + +我们结合 OmniLMM-12B 和 ChatGPT-3.5 (纯文本模型) 
尝试构建 **实时多模态交互助手**. OmniLMM-12B 将视频帧转为对应的图像描述并输入给ChatGPT-3.5来生成对用户指令的响应。演示视频未经编辑。 + +<div align="center" > + <video controls src="https://github.com/OpenBMB/OmniLMM/assets/157115220/8fec13bf-bb47-4bf8-8f8c-d0b716a964ec" type="video/mp4" width=80%/> +</div> + +## Online Demo + +欢迎通过以下链接使用我们的网页端推理服务: [OmniLMM-12B](http://120.92.209.146:8081) | [MiniCPM-V 2.0](http://120.92.209.146:80). + +## 安装 + +1. 克隆我们的仓库并跳转到相应目录 + +```bash +git clone https://github.com/OpenBMB/MiniCPM-V.git +cd MiniCPM-V +``` + +1. 创建 conda 环境 + +```Shell +conda create -n MiniCPMV python=3.10 -y +conda activate MiniCPMV +``` + +3. 安装依赖 + +```shell +pip install -r requirements.txt +``` + +## 推理 + +### 模型库 + +| 模型 | 简介 | 下载链接 | +|:----------------------|:-------------------|:---------------:| +| OmniLMM-12B | 性能最强的版本 | [🤗](https://huggingface.co/openbmb/OmniLMM-12B)    [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/OmniLMM-12B/files) | + diff --git a/omnilmm/__init__.py b/omnilmm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/omnilmm/constants.py b/omnilmm/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..a1ac41d363a634f45a8ba6ef681d4844a2824f5e --- /dev/null +++ b/omnilmm/constants.py @@ -0,0 +1,4 @@ +CONTROLLER_HEART_BEAT_EXPIRATION = 30 +WORKER_HEART_BEAT_INTERVAL = 15 + +LOGDIR = "." diff --git a/omnilmm/conversation.py b/omnilmm/conversation.py new file mode 100644 index 0000000000000000000000000000000000000000..8e997f01f0cbe5caab5c48112189947f5392b9d5 --- /dev/null +++ b/omnilmm/conversation.py @@ -0,0 +1,320 @@ +import dataclasses +from enum import auto, Enum +from typing import List, Tuple + + +class SeparatorStyle(Enum): + """Different separator style.""" + SINGLE = auto() + TWO = auto() + + +@dataclasses.dataclass +class Conversation: + """A class that keeps all conversation history.""" + system: str + roles: List[str] + messages: List[List[str]] + offset: int + sep_style: SeparatorStyle = SeparatorStyle.SINGLE + sep: str = "###" + sep2: str = None + version: str = "Unknown" + + skip_next: bool = False + + def get_prompt(self): + if self.sep_style == SeparatorStyle.SINGLE: + ret = self.system + self.sep + for role, message in self.messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + self.sep + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.TWO: + seps = [self.sep, self.sep2] + ret = self.system + seps[0] + for i, (role, message) in enumerate(self.messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + return ret + else: + raise ValueError(f"Invalid style: {self.sep_style}") + + def append_message(self, role, message): + self.messages.append([role, message]) + + def get_images(self, return_pil=False): + images = [] + for i, (role, msg) in enumerate(self.messages[self.offset:]): + if i % 2 == 0: + if type(msg) is tuple: + import base64 + from io import BytesIO + from PIL import Image + msg, image, image_process_mode = msg + if image_process_mode == "Pad": + def expand2square(pil_img, background_color=(122, 116, 104)): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new( + pil_img.mode, (width, width), background_color) + result.paste( + pil_img, (0, (width - height) // 2)) + return result + else: + 
result = Image.new( + pil_img.mode, (height, height), background_color) + result.paste( + pil_img, ((height - width) // 2, 0)) + return result + image = expand2square(image) + elif image_process_mode == "Crop": + pass + elif image_process_mode == "Resize": + image = image.resize((224, 224)) + else: + raise ValueError( + f"Invalid image_process_mode: {image_process_mode}") + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + max_len, min_len = 800, 400 + shortest_edge = int( + min(max_len / aspect_ratio, min_len, min_hw)) + longest_edge = int(shortest_edge * aspect_ratio) + W, H = image.size + if H > W: + H, W = longest_edge, shortest_edge + else: + H, W = shortest_edge, longest_edge + image = image.resize((W, H)) + if return_pil: + images.append(image) + else: + buffered = BytesIO() + image.save(buffered, format="JPEG") + img_b64_str = base64.b64encode( + buffered.getvalue()).decode() + images.append(img_b64_str) + return images + + def to_gradio_chatbot(self): + ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset:]): + if i % 2 == 0: + if type(msg) is tuple: + import base64 + from io import BytesIO + msg, image, image_process_mode = msg + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + max_len, min_len = 800, 400 + shortest_edge = int( + min(max_len / aspect_ratio, min_len, min_hw)) + longest_edge = int(shortest_edge * aspect_ratio) + W, H = image.size + if H > W: + H, W = longest_edge, shortest_edge + else: + H, W = shortest_edge, longest_edge + image = image.resize((W, H)) + # image = image.resize((224, 224)) + buffered = BytesIO() + image.save(buffered, format="JPEG") + img_b64_str = base64.b64encode( + buffered.getvalue()).decode() + img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />' + msg = msg.replace('<image>', img_str) + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def copy(self): + return Conversation( + system=self.system, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2) + + def dict(self): + if len(self.get_images()) > 0: + return { + "system": self.system, + "roles": self.roles, + "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages], + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + return { + "system": self.system, + "roles": self.roles, + "messages": self.messages, + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + + +conv_v1 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + messages=( + ("Human", "Give three tips for staying healthy."), + ("Assistant", + "Sure, here are three tips for staying healthy:\n" + "1. Exercise regularly: Regular physical activity can help improve your overall health and wellbeing. " + "It can also help reduce your risk of chronic conditions such as obesity, diabetes, heart disease, " + "and certain cancers. Aim for at least 150 minutes of moderate-intensity aerobic exercise or " + "75 minutes of vigorous-intensity aerobic exercise per week, along with muscle-strengthening " + "activities at least two days per week.\n" + "2. 
Eat a balanced diet: Eating a balanced diet that is rich in fruits, " + "vegetables, whole grains, lean proteins, and healthy fats can help support " + "your overall health. Try to limit your intake of processed and high-sugar foods, " + "and aim to drink plenty of water throughout the day.\n" + "3. Get enough sleep: Getting enough quality sleep is essential for your physical " + "and mental health. Adults should aim for seven to nine hours of sleep per night. " + "Establish a regular sleep schedule and try to create a relaxing bedtime routine to " + "help improve the quality of your sleep.") + ), + offset=2, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_v1_2 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + messages=( + ("Human", "What are the key differences between renewable and non-renewable energy sources?"), + ("Assistant", + "Renewable energy sources are those that can be replenished naturally in a relatively " + "short amount of time, such as solar, wind, hydro, geothermal, and biomass. " + "Non-renewable energy sources, on the other hand, are finite and will eventually be " + "depleted, such as coal, oil, and natural gas. Here are some key differences between " + "renewable and non-renewable energy sources:\n" + "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable " + "energy sources are finite and will eventually run out.\n" + "2. Environmental impact: Renewable energy sources have a much lower environmental impact " + "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, " + "and other negative effects.\n" + "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically " + "have lower operational costs than non-renewable sources.\n" + "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote " + "locations than non-renewable sources.\n" + "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different " + "situations and needs, while non-renewable sources are more rigid and inflexible.\n" + "6. Sustainability: Renewable energy sources are more sustainable over the long term, while " + "non-renewable sources are not, and their depletion can lead to economic and social instability.\n") + ), + offset=2, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_vicuna_v1_1 = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + version="v1", + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="</s>", +) + +conv_bair_v1 = Conversation( + system="BEGINNING OF CONVERSATION:", + roles=("USER", "GPT"), + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="</s>", +) + +simple_conv = Conversation( + system="You are LLaVA, a large language model trained by UW Madison WAIV Lab, based on LLaMA architecture." + "You are designed to assist human with a variety of tasks using natural language." + "Follow the instructions carefully.", + roles=("Human", "Assistant"), + messages=( + ("Human", "Hi!"), + ("Assistant", "Hi there! 
How can I help you today?\n") + ), + offset=2, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +simple_conv_multimodal = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("Human", "Assistant"), + messages=( + ), + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +simple_conv_legacy = Conversation( + system="You are LLaVA, a large language model trained by UW Madison WAIV Lab." + "You are designed to assist human with a variety of tasks using natural language." + "Follow the instructions carefully.", + roles=("Human", "Assistant"), + messages=( + ("Human", "Hi!\n\n### Response:"), + ("Assistant", "Hi there! How can I help you today?\n") + ), + offset=2, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_llava_v1 = Conversation( + system="You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab." + "You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." + "Follow the instructions carefully and explain your answers in detail.", + roles=("USER", "ASSISTANT"), + version="v1", + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="</s>", +) + +default_conversation = conv_v1_2 +conv_templates = { + "default": conv_v1_2, + "simple": simple_conv, + "simple_legacy": simple_conv_legacy, + "multimodal": simple_conv_multimodal, + "llava_v1": conv_llava_v1, + + # fastchat + "v1": conv_v1_2, + "bair_v1": conv_bair_v1, + "vicuna_v1_1": conv_vicuna_v1_1, +} + + +if __name__ == "__main__": + print(default_conversation.get_prompt()) diff --git a/omnilmm/model/__init__.py b/omnilmm/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35b9845c8726de2a6b82fb05ab11071115f6b09c --- /dev/null +++ b/omnilmm/model/__init__.py @@ -0,0 +1 @@ +from .omnilmm import OmniLMMForCausalLM \ No newline at end of file diff --git a/omnilmm/model/omnilmm.py b/omnilmm/model/omnilmm.py new file mode 100644 index 0000000000000000000000000000000000000000..e720675292e9d8e14e25a057624ae5c290f82d5d --- /dev/null +++ b/omnilmm/model/omnilmm.py @@ -0,0 +1,457 @@ + +import gc +import math +import timm +import torch +from torch import Tensor +import torch.nn as nn +from torch.nn import CrossEntropyLoss +from typing import List, Optional, Tuple, Union + +from transformers import AutoConfig, AutoModelForCausalLM +from transformers import MistralForCausalLM, MistralModel, MistralConfig +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast + +from omnilmm.model.utils import build_transform +from omnilmm.model.resampler import Resampler + +DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" +DEFAULT_IM_START_TOKEN = "<im_start>" +DEFAULT_IM_END_TOKEN = "<im_end>" + + +class OmniLMMConfig(MistralConfig): + model_type = "omnilmm" + + +class Identity(torch.nn.Identity): + def forward(self, input: Tensor, **kwargs) -> Tensor: + return super().forward(input) + + +def create_vision_module(config): + vision_tower = timm.create_model('eva02_enormous_patch14_clip_224.laion2b_plus', + pretrained=False, + num_classes=0, + dynamic_img_size=True, + dynamic_img_pad=True) + + if isinstance(vision_tower, timm.models.VisionTransformer): + if vision_tower.attn_pool is not None: + vision_tower.attn_pool = Identity() + + # use 2nd last layer's output + vision_tower.blocks[-1] = Identity() + + embed_dim = 
config.hidden_size + resampler = Resampler( + grid_size=int(math.sqrt(config.num_query)), + embed_dim=embed_dim, + num_heads=embed_dim // 128, + kv_dim=vision_tower.embed_dim, + ) + return vision_tower, resampler + + +class OmniLMMModel(MistralModel): + config_class = OmniLMMConfig + + def __init__(self, config: OmniLMMConfig, mm_vision_tower=None, mm_hidden_size=None, tune_clip=True): + super(OmniLMMModel, self).__init__(config) + + if hasattr(config, "mm_vision_tower"): + vision_tower, resampler = create_vision_module(config) + + # print(__file__, 'skip loading vision tower weights') + + # HACK: for FSDP + self.vision_tower = [vision_tower] + self.resampler = resampler + if tune_clip: + self.vision_tower = self.vision_tower[0] + + self.vision_config = lambda x: None + + def initialize_vision_modules(self, vision_tower, no_randaug, num_query, image_size, tune_clip=False): + self.config.mm_vision_tower = vision_tower + self.config.use_mm_proj = True + self.config.num_query = num_query + self.config.image_size = image_size + + if not hasattr(self, 'vision_tower'): + vision_tower, resampler = create_vision_module(self.config) + state_dict = torch.load( + '/tt/data/public/multimodal/multimodal_model_ckpts/timm/eva02_enormous_patch14_clip_224.laion2b_plus.pt') + vision_tower.load_state_dict(state_dict, strict=False) + del state_dict + gc.collect() + else: + if isinstance(self.vision_tower, list): + vision_tower = self.vision_tower[0] + else: + vision_tower = self.vision_tower + resampler = self.resampler + self.vision_tower = vision_tower if tune_clip else [vision_tower] + self.resampler = resampler + + train_img_transform = build_transform( + is_train=True, randaug=not no_randaug, input_size=self.config.image_size, std_mode='OPENAI_CLIP') + eval_img_transform = build_transform( + is_train=False, input_size=self.config.image_size, std_mode='OPENAI_CLIP') + + return dict( + image_processor=(train_img_transform, eval_img_transform), + image_token_len=num_query, + vision_config=self.vision_config + ) + + def get_vision_embedding(self, pixel_values): + if isinstance(self.vision_tower, list): + vision_tower = self.vision_tower[0] # HACK: for FSDP + else: + vision_tower = self.vision_tower + + dtype = vision_tower.pos_embed.data.dtype + vision_embedding = vision_tower.forward_features( + pixel_values.type(dtype)) + if hasattr(vision_tower, 'num_prefix_tokens') and vision_tower.num_prefix_tokens > 0: + vision_embedding = vision_embedding[:, + vision_tower.num_prefix_tokens:] + res = self.resampler(vision_embedding) + return res + + def get_vllm_embedding(self, data): + + if 'vision_hidden_states' not in data: + pixel_values_list = data['pixel_values'] + vision_hidden_states = [] + for pixel_values in pixel_values_list: + if len(pixel_values) > 0: + vision_hidden_states.append(self.get_vision_embedding(pixel_values.unsqueeze(0))[0]) + else: + vision_hidden_states.append([]) + else: + vision_hidden_states = data['vision_hidden_states'] + + #vllm_embedding = self.llm.model.embed_tokens(data['input_ids']) * self.llm.config.scale_emb + inputs_embeds = self.embed_tokens(data['input_ids']) + vision_hidden_states = [i.type(inputs_embeds.dtype) + if isinstance(i, torch.Tensor) else i for i in vision_hidden_states + ] + + + # HACK: replace back original embeddings for LLaVA pretraining + orig_embeds_params = getattr(self, 'orig_embeds_params', None) + + new_input_embeds = [] + cur_image_idx = 0 + for cur_input_ids, cur_input_embeds in zip(data['input_ids'], inputs_embeds): + if (cur_input_ids == 
self.vision_config.im_patch_token).sum() == 0: + # multimodal LLM, but the current sample is not multimodal + cur_input_embeds = cur_input_embeds + (0. * dummy_image_features).sum() + new_input_embeds.append(cur_input_embeds) + continue + + if self.vision_config.use_im_start_end: + cur_image_features = vision_hidden_states[cur_image_idx] + num_patches = cur_image_features.shape[0] + if (cur_input_ids == self.vision_config.im_start_token).sum() != (cur_input_ids == self.vision_config.im_end_token).sum(): + raise ValueError( + "The number of image start tokens and image end tokens should be the same.") + image_start_tokens = torch.where( + cur_input_ids == self.vision_config.im_start_token)[0] + for image_start_token_pos in image_start_tokens: + cur_image_features = vision_hidden_states[cur_image_idx].to( + device=cur_input_embeds.device) + num_patches = cur_image_features.shape[0] + if cur_input_ids[image_start_token_pos + num_patches + 1] != self.vision_config.im_end_token: + raise ValueError( + "The image end token should follow the image start token.") + if orig_embeds_params is not None: + cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos].detach(), cur_input_embeds[image_start_token_pos:image_start_token_pos+1], cur_image_features, + cur_input_embeds[image_start_token_pos + num_patches + 1:image_start_token_pos + num_patches + 2], cur_input_embeds[image_start_token_pos + num_patches + 2:].detach()), dim=0) + else: + cur_new_input_embeds = torch.cat( + (cur_input_embeds[:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:]), dim=0) + cur_image_idx += 1 + new_input_embeds.append(cur_new_input_embeds) + else: + raise NotImplementedError + inputs_embeds = torch.stack(new_input_embeds, dim=0) + + return inputs_embeds, vision_hidden_states + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + return_dict: Optional[bool] = None, + **kwargs + ) -> Union[Tuple, BaseModelOutputWithPast]: + + # HACK: replace back original embeddings for LLaVA pretraining + orig_embeds_params = getattr(self, 'orig_embeds_params', None) + + if inputs_embeds is None and past_key_values is None: + inputs_embeds = self.embed_tokens(input_ids) + + vision_tower = getattr(self, 'vision_tower', None) + if vision_tower is not None and (input_ids.shape[1] != 1 or self.training) and images is not None: + + if type(images) is list: + image_features = [] + for image in images: + image_forward_out = self.get_vision_embedding(image.unsqueeze(0))[ + 0] + image_features.append(image_forward_out) + else: + image_features = self.get_vision_embedding(images) + + dummy_image_features = torch.zeros( + self.config.num_query, + self.config.hidden_size, + device=inputs_embeds.device, + dtype=inputs_embeds.dtype) + + new_input_embeds = [] + cur_image_idx = 0 + for cur_input_ids, cur_input_embeds in zip(input_ids, inputs_embeds): + if (cur_input_ids == self.vision_config.im_patch_token).sum() == 0: + # multimodal LLM, but the current sample is not multimodal + cur_input_embeds = cur_input_embeds + \ + (0. 
* dummy_image_features).sum() + new_input_embeds.append(cur_input_embeds) + continue + + if self.vision_config.use_im_start_end: + cur_image_features = image_features[cur_image_idx] + num_patches = cur_image_features.shape[0] + if (cur_input_ids == self.vision_config.im_start_token).sum() != (cur_input_ids == self.vision_config.im_end_token).sum(): + raise ValueError( + "The number of image start tokens and image end tokens should be the same.") + image_start_tokens = torch.where( + cur_input_ids == self.vision_config.im_start_token)[0] + for image_start_token_pos in image_start_tokens: + cur_image_features = image_features[cur_image_idx].to( + device=cur_input_embeds.device) + num_patches = cur_image_features.shape[0] + if cur_input_ids[image_start_token_pos + num_patches + 1] != self.vision_config.im_end_token: + raise ValueError( + "The image end token should follow the image start token.") + if orig_embeds_params is not None: + cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos].detach(), cur_input_embeds[image_start_token_pos:image_start_token_pos+1], cur_image_features, + cur_input_embeds[image_start_token_pos + num_patches + 1:image_start_token_pos + num_patches + 2], cur_input_embeds[image_start_token_pos + num_patches + 2:].detach()), dim=0) + else: + cur_new_input_embeds = torch.cat( + (cur_input_embeds[:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:]), dim=0) + cur_image_idx += 1 + new_input_embeds.append(cur_new_input_embeds) + else: + raise NotImplementedError + inputs_embeds = torch.stack(new_input_embeds, dim=0) + input_ids = None + + return super(OmniLMMModel, self).forward( + input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values, + inputs_embeds=inputs_embeds, use_cache=use_cache, + output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs + ) + + +class OmniLMMForCausalLM(MistralForCausalLM): + config_class = OmniLMMConfig + + def __init__(self, config, mm_vision_tower=None, tune_clip=True): + super(MistralForCausalLM, self).__init__(config) + self.model = OmniLMMModel( + config, mm_vision_tower=mm_vision_tower, tune_clip=tune_clip) + + self.lm_head = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + return_dict: Optional[bool] = None, + **kwargs + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # print(f'@@@ At forward, labels: {labels.shape}-{labels}', flush=True) + # print(f'@@@ At forward, input_ids: {input_ids.shape}-{input_ids}', flush=True) + # print(f'@@@ At forward, input_ids: {attention_mask.shape}-{attention_mask}', flush=True) + + # decoder outputs 
consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + images=images, + **kwargs + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model/pipeline parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + # TODO could be removed for generate_vllm() + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values: + input_ids = input_ids[:, -1:] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "images": kwargs.get("images", None), + } + ) + return model_inputs + + def generate_vllm( + self, + input_ids: torch.LongTensor = None, + images: Optional[torch.FloatTensor] = None, + vision_hidden_states=None, + return_vision_hidden_states=False, + **kwargs + ): + model_inputs = {'input_ids': input_ids} + if vision_hidden_states is None: + model_inputs['pixel_values'] = images + else: + model_inputs['vision_hidden_states'] = vision_hidden_states + + with torch.inference_mode(): + inputs_embeds, vision_hidden_states = self.model.get_vllm_embedding(model_inputs) + + result = self.generate( + inputs_embeds=inputs_embeds, + **kwargs + ) + + if return_vision_hidden_states: + return result, vision_hidden_states + + return result + + + def initialize_vision_tokenizer(self, mm_use_im_start_end, tokenizer, device, + tune_mm_mlp_adapter=False): + self.model.vision_config.use_im_start_end = mm_use_im_start_end + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) + self.resize_token_embeddings(len(tokenizer)) + + if mm_use_im_start_end: + num_new_tokens = tokenizer.add_tokens( + [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) + self.resize_token_embeddings(len(tokenizer)) + self.model.vision_config.im_start_token, self.model.vision_config.im_end_token = tokenizer.convert_tokens_to_ids( + [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN]) + + if num_new_tokens > 0: + input_embeddings = self.get_input_embeddings().weight.data + output_embeddings = self.get_output_embeddings().weight.data + + input_embeddings_avg = input_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + output_embeddings_avg = output_embeddings[:-num_new_tokens].mean( + dim=0, 
keepdim=True) + + input_embeddings[-num_new_tokens:] = input_embeddings_avg + output_embeddings[-num_new_tokens:] = output_embeddings_avg + + # for new sft data + num_new_tokens = tokenizer.add_tokens( + ['<box>', '</box>', '<ref>', '</ref>', '<quad>', '</quad>'], special_tokens=True) + self.resize_token_embeddings(len(tokenizer)) + + if num_new_tokens > 0: + input_embeddings = self.get_input_embeddings().weight.data + output_embeddings = self.get_output_embeddings().weight.data + + input_embeddings_avg = input_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + output_embeddings_avg = output_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + + input_embeddings[-num_new_tokens:] = input_embeddings_avg + output_embeddings[-num_new_tokens:] = output_embeddings_avg + + if tune_mm_mlp_adapter: + self.model.orig_embeds_params = [ + self.get_input_embeddings().weight.data.clone().to(device=device)] + for p in self.get_input_embeddings().parameters(): + p.requires_grad = True + for p in self.get_output_embeddings().parameters(): + p.requires_grad = False + + self.model.vision_config.im_patch_token = tokenizer.convert_tokens_to_ids( + [DEFAULT_IMAGE_PATCH_TOKEN])[0] + print(f'Tokenizer: {tokenizer}\n patch_token_id: {self.model.vision_config.im_patch_token}, visoin_config: {self.model.vision_config}', flush=True) + # exit() + + +AutoConfig.register("omnilmm", OmniLMMConfig) +AutoModelForCausalLM.register(OmniLMMConfig, OmniLMMForCausalLM) diff --git a/omnilmm/model/resampler.py b/omnilmm/model/resampler.py new file mode 100644 index 0000000000000000000000000000000000000000..0d46f55f460384ab23d0b4ff00768417281110dc --- /dev/null +++ b/omnilmm/model/resampler.py @@ -0,0 +1,171 @@ +# Copyright (c) Alibaba Cloud. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +from collections import OrderedDict +import math +import requests +from io import BytesIO +from functools import partial +from PIL import Image +from typing import Callable, Optional, Sequence, Tuple, List, Union +import numpy as np + +import torch +from torch import nn +from torch.nn import functional as F +from torch.nn.init import trunc_normal_ +from torchvision import transforms +from torchvision.transforms import InterpolationMode + + +def get_abs_pos(abs_pos, tgt_size): + # abs_pos: L, C + # tgt_size: M + # return: M, C + src_size = int(math.sqrt(abs_pos.size(0))) + tgt_size = int(math.sqrt(tgt_size)) + dtype = abs_pos.dtype + + if src_size != tgt_size: + return F.interpolate( + abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2), + size=(tgt_size, tgt_size), + mode="bicubic", + align_corners=False, + ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype) + else: + return abs_pos + + +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + grid_h = np.arange(grid_size, dtype=np.float32) + grid_w = np.arange(grid_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size, grid_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate( + [np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid( + embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid( + embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. 
/ 10000 ** omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +class Resampler(nn.Module): + """ + A 2D perceiver-resampler network with one cross attention layers by + (grid_size**2) learnable queries and 2d sincos pos_emb + Outputs: + A tensor with the shape of (grid_size**2, embed_dim) + """ + + def __init__( + self, + grid_size, + embed_dim, + num_heads, + kv_dim=None, + norm_layer=partial(nn.LayerNorm, eps=1e-6) + ): + super().__init__() + self.num_queries = grid_size ** 2 + self.embed_dim = embed_dim + self.num_heads = num_heads + + self.pos_embed = nn.Parameter( + torch.from_numpy(get_2d_sincos_pos_embed( + embed_dim, grid_size)).float() + ).requires_grad_(False) + + self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim)) + trunc_normal_(self.query, std=.02) + + if kv_dim is not None and kv_dim != embed_dim: + self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False) + else: + self.kv_proj = nn.Identity() + + self.attn = nn.MultiheadAttention(embed_dim, num_heads) + self.ln_q = norm_layer(embed_dim) + self.ln_kv = norm_layer(embed_dim) + + self.ln_post = norm_layer(embed_dim) + self.proj = nn.Parameter( + (embed_dim ** -0.5) * torch.randn(embed_dim, embed_dim)) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x, attn_mask=None): + + pos_embed = get_abs_pos(self.pos_embed, x.size(1)) + + x = self.kv_proj(x) + x = self.ln_kv(x).permute(1, 0, 2) + + N = x.shape[1] + q = self.ln_q(self.query) + # print((self._repeat(q, N) + self.pos_embed.unsqueeze(1)).dtype, (x + pos_embed.unsqueeze(1)).dtype, x.dtype) + out = self.attn( + self._repeat(q, N) + self.pos_embed.unsqueeze(1), + x + pos_embed.unsqueeze(1), + x, + attn_mask=attn_mask)[0] + x = out.permute(1, 0, 2) + + x = self.ln_post(x) + x = x @ self.proj + return x + + def _repeat(self, query, N: int): + return query.unsqueeze(1).repeat(1, N, 1) diff --git a/omnilmm/model/utils.py b/omnilmm/model/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fcb8eba1f74b111aa68be3aa0211c831cf243cb9 --- /dev/null +++ b/omnilmm/model/utils.py @@ -0,0 +1,555 @@ +from torchvision import transforms +from timm.data.transforms import RandomResizedCropAndInterpolation +from timm.data.constants import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD +from transformers import AutoConfig +from PIL import Image +from io import BytesIO +import torch.distributed as dist +import numpy as np +import pickle +import base64 +import cv2 +import os +import torch +from transformers import AutoConfig, StoppingCriteria + +try: + from timm.data.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD +except ImportError: + OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073) + OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711) + + +def auto_upgrade(config): + cfg = AutoConfig.from_pretrained(config) + if 'llava' in config and cfg.model_type != 'llava': + print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") + print("You must upgrade the checkpoint to the new code base (this can be done 
automatically).") + confirm = input( + "Please confirm that you want to upgrade the checkpoint. [Y/N]") + if confirm.lower() in ["y", "yes"]: + print("Upgrading checkpoint...") + assert len(cfg.architectures) == 1 + setattr(cfg.__class__, "model_type", "llava") + cfg.architectures[0] = 'LlavaLlamaForCausalLM' + cfg.save_pretrained(config) + print("Checkpoint upgraded.") + else: + print("Checkpoint upgrade aborted.") + exit(1) + + +class KeywordsStoppingCriteria(StoppingCriteria): + def __init__(self, keywords, tokenizer, input_ids): + self.keywords = keywords + self.tokenizer = tokenizer + self.start_len = None + self.input_ids = input_ids + + def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: + if self.start_len is None: + self.start_len = self.input_ids.shape[1] + else: + outputs = self.tokenizer.batch_decode( + output_ids[:, self.start_len:], skip_special_tokens=True)[0] + for keyword in self.keywords: + if keyword in outputs: + return True + return False + + +def auto_upgrade(config): + cfg = AutoConfig.from_pretrained(config) + if 'llava' in config and cfg.model_type != 'llava': + print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") + print("You must upgrade the checkpoint to the new code base (this can be done automatically).") + confirm = input( + "Please confirm that you want to upgrade the checkpoint. [Y/N]") + if confirm.lower() in ["y", "yes"]: + print("Upgrading checkpoint...") + assert len(cfg.architectures) == 1 + setattr(cfg.__class__, "model_type", "llava") + cfg.architectures[0] = 'LlavaLlamaForCausalLM' + cfg.save_pretrained(config) + print("Checkpoint upgraded.") + else: + print("Checkpoint upgrade aborted.") + exit(1) + +# aug functions + + +def identity_func(img): + return img + + +def autocontrast_func(img, cutoff=0): + ''' + same output as PIL.ImageOps.autocontrast + ''' + n_bins = 256 + + def tune_channel(ch): + n = ch.size + cut = cutoff * n // 100 + if cut == 0: + high, low = ch.max(), ch.min() + else: + hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins]) + low = np.argwhere(np.cumsum(hist) > cut) + low = 0 if low.shape[0] == 0 else low[0] + high = np.argwhere(np.cumsum(hist[::-1]) > cut) + high = n_bins - 1 if high.shape[0] == 0 else n_bins - 1 - high[0] + if high <= low: + table = np.arange(n_bins) + else: + scale = (n_bins - 1) / (high - low) + table = np.arange(n_bins) * scale - low * scale + table[table < 0] = 0 + table[table > n_bins - 1] = n_bins - 1 + table = table.clip(0, 255).astype(np.uint8) + return table[ch] + + channels = [tune_channel(ch) for ch in cv2.split(img)] + out = cv2.merge(channels) + return out + + +def equalize_func(img): + ''' + same output as PIL.ImageOps.equalize + PIL's implementation is different from cv2.equalize + ''' + n_bins = 256 + + def tune_channel(ch): + hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins]) + non_zero_hist = hist[hist != 0].reshape(-1) + step = np.sum(non_zero_hist[:-1]) // (n_bins - 1) + if step == 0: + return ch + n = np.empty_like(hist) + n[0] = step // 2 + n[1:] = hist[:-1] + table = (np.cumsum(n) // step).clip(0, 255).astype(np.uint8) + return table[ch] + + channels = [tune_channel(ch) for ch in cv2.split(img)] + out = cv2.merge(channels) + return out + + +def rotate_func(img, degree, fill=(0, 0, 0)): + ''' + like PIL, rotate by degree, not radians + ''' + H, W = img.shape[0], img.shape[1] + center = W / 2, H / 2 + M = cv2.getRotationMatrix2D(center, degree, 1) + out = cv2.warpAffine(img, M, (W, H), 
borderValue=fill) + return out + + +def solarize_func(img, thresh=128): + ''' + same output as PIL.ImageOps.posterize + ''' + table = np.array([el if el < thresh else 255 - el for el in range(256)]) + table = table.clip(0, 255).astype(np.uint8) + out = table[img] + return out + + +def color_func(img, factor): + ''' + same output as PIL.ImageEnhance.Color + ''' + # implementation according to PIL definition, quite slow + # degenerate = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)[:, :, np.newaxis] + # out = blend(degenerate, img, factor) + # M = ( + # np.eye(3) * factor + # + np.float32([0.114, 0.587, 0.299]).reshape(3, 1) * (1. - factor) + # )[np.newaxis, np.newaxis, :] + M = ( + np.float32([ + [0.886, -0.114, -0.114], + [-0.587, 0.413, -0.587], + [-0.299, -0.299, 0.701]]) * factor + + np.float32([[0.114], [0.587], [0.299]]) + ) + out = np.matmul(img, M).clip(0, 255).astype(np.uint8) + return out + + +def contrast_func(img, factor): + """ + same output as PIL.ImageEnhance.Contrast + """ + mean = np.sum(np.mean(img, axis=(0, 1)) * np.array([0.114, 0.587, 0.299])) + table = np.array([( + el - mean) * factor + mean + for el in range(256) + ]).clip(0, 255).astype(np.uint8) + out = table[img] + return out + + +def brightness_func(img, factor): + ''' + same output as PIL.ImageEnhance.Contrast + ''' + table = (np.arange(256, dtype=np.float32) * + factor).clip(0, 255).astype(np.uint8) + out = table[img] + return out + + +def sharpness_func(img, factor): + ''' + The differences the this result and PIL are all on the 4 boundaries, the center + areas are same + ''' + kernel = np.ones((3, 3), dtype=np.float32) + kernel[1][1] = 5 + kernel /= 13 + degenerate = cv2.filter2D(img, -1, kernel) + if factor == 0.0: + out = degenerate + elif factor == 1.0: + out = img + else: + out = img.astype(np.float32) + degenerate = degenerate.astype(np.float32)[1:-1, 1:-1, :] + out[1:-1, 1:-1, :] = degenerate + factor * \ + (out[1:-1, 1:-1, :] - degenerate) + out = out.astype(np.uint8) + return out + + +def shear_x_func(img, factor, fill=(0, 0, 0)): + H, W = img.shape[0], img.shape[1] + M = np.float32([[1, factor, 0], [0, 1, 0]]) + out = cv2.warpAffine(img, M, (W, H), borderValue=fill, + flags=cv2.INTER_LINEAR).astype(np.uint8) + return out + + +def translate_x_func(img, offset, fill=(0, 0, 0)): + ''' + same output as PIL.Image.transform + ''' + H, W = img.shape[0], img.shape[1] + M = np.float32([[1, 0, -offset], [0, 1, 0]]) + out = cv2.warpAffine(img, M, (W, H), borderValue=fill, + flags=cv2.INTER_LINEAR).astype(np.uint8) + return out + + +def translate_y_func(img, offset, fill=(0, 0, 0)): + ''' + same output as PIL.Image.transform + ''' + H, W = img.shape[0], img.shape[1] + M = np.float32([[1, 0, 0], [0, 1, -offset]]) + out = cv2.warpAffine(img, M, (W, H), borderValue=fill, + flags=cv2.INTER_LINEAR).astype(np.uint8) + return out + + +def posterize_func(img, bits): + ''' + same output as PIL.ImageOps.posterize + ''' + out = np.bitwise_and(img, np.uint8(255 << (8 - bits))) + return out + + +def shear_y_func(img, factor, fill=(0, 0, 0)): + H, W = img.shape[0], img.shape[1] + M = np.float32([[1, 0, 0], [factor, 1, 0]]) + out = cv2.warpAffine(img, M, (W, H), borderValue=fill, + flags=cv2.INTER_LINEAR).astype(np.uint8) + return out + + +def cutout_func(img, pad_size, replace=(0, 0, 0)): + replace = np.array(replace, dtype=np.uint8) + H, W = img.shape[0], img.shape[1] + rh, rw = np.random.random(2) + pad_size = pad_size // 2 + ch, cw = int(rh * H), int(rw * W) + x1, x2 = max(ch - pad_size, 0), min(ch + pad_size, H) + y1, y2 = max(cw 
- pad_size, 0), min(cw + pad_size, W) + out = img.copy() + out[x1:x2, y1:y2, :] = replace + return out + + +# level to args +def enhance_level_to_args(MAX_LEVEL): + def level_to_args(level): + return ((level / MAX_LEVEL) * 1.8 + 0.1,) + return level_to_args + + +def shear_level_to_args(MAX_LEVEL, replace_value): + def level_to_args(level): + level = (level / MAX_LEVEL) * 0.3 + if np.random.random() > 0.5: + level = -level + return (level, replace_value) + + return level_to_args + + +def translate_level_to_args(translate_const, MAX_LEVEL, replace_value): + def level_to_args(level): + level = (level / MAX_LEVEL) * float(translate_const) + if np.random.random() > 0.5: + level = -level + return (level, replace_value) + + return level_to_args + + +def cutout_level_to_args(cutout_const, MAX_LEVEL, replace_value): + def level_to_args(level): + level = int((level / MAX_LEVEL) * cutout_const) + return (level, replace_value) + + return level_to_args + + +def solarize_level_to_args(MAX_LEVEL): + def level_to_args(level): + level = int((level / MAX_LEVEL) * 256) + return (level, ) + return level_to_args + + +def none_level_to_args(level): + return () + + +def posterize_level_to_args(MAX_LEVEL): + def level_to_args(level): + level = int((level / MAX_LEVEL) * 4) + return (level, ) + return level_to_args + + +def rotate_level_to_args(MAX_LEVEL, replace_value): + def level_to_args(level): + level = (level / MAX_LEVEL) * 30 + if np.random.random() < 0.5: + level = -level + return (level, replace_value) + + return level_to_args + + +func_dict = { + 'Identity': identity_func, + 'AutoContrast': autocontrast_func, + 'Equalize': equalize_func, + 'Rotate': rotate_func, + 'Solarize': solarize_func, + 'Color': color_func, + 'Contrast': contrast_func, + 'Brightness': brightness_func, + 'Sharpness': sharpness_func, + 'ShearX': shear_x_func, + 'TranslateX': translate_x_func, + 'TranslateY': translate_y_func, + 'Posterize': posterize_func, + 'ShearY': shear_y_func, +} + +translate_const = 10 +MAX_LEVEL = 10 +replace_value = (128, 128, 128) +arg_dict = { + 'Identity': none_level_to_args, + 'AutoContrast': none_level_to_args, + 'Equalize': none_level_to_args, + 'Rotate': rotate_level_to_args(MAX_LEVEL, replace_value), + 'Solarize': solarize_level_to_args(MAX_LEVEL), + 'Color': enhance_level_to_args(MAX_LEVEL), + 'Contrast': enhance_level_to_args(MAX_LEVEL), + 'Brightness': enhance_level_to_args(MAX_LEVEL), + 'Sharpness': enhance_level_to_args(MAX_LEVEL), + 'ShearX': shear_level_to_args(MAX_LEVEL, replace_value), + 'TranslateX': translate_level_to_args( + translate_const, MAX_LEVEL, replace_value + ), + 'TranslateY': translate_level_to_args( + translate_const, MAX_LEVEL, replace_value + ), + 'Posterize': posterize_level_to_args(MAX_LEVEL), + 'ShearY': shear_level_to_args(MAX_LEVEL, replace_value), +} + + +class RandomAugment(object): + + def __init__(self, N=2, M=10, isPIL=False, augs=[]): + self.N = N + self.M = M + self.isPIL = isPIL + if augs: + self.augs = augs + else: + self.augs = list(arg_dict.keys()) + + def get_random_ops(self): + sampled_ops = np.random.choice(self.augs, self.N) + return [(op, 0.5, self.M) for op in sampled_ops] + + def __call__(self, img): + if self.isPIL: + img = np.array(img) + ops = self.get_random_ops() + for name, prob, level in ops: + if np.random.random() > prob: + continue + args = arg_dict[name](level) + img = func_dict[name](img, *args) + return img + + +def build_transform(is_train, randaug=True, input_size=224, interpolation='bicubic', std_mode='IMAGENET_INCEPTION'): + if std_mode 
== 'IMAGENET_INCEPTION': + mean = IMAGENET_INCEPTION_MEAN + std = IMAGENET_INCEPTION_STD + elif std_mode == 'OPENAI_CLIP': + mean = OPENAI_CLIP_MEAN + std = OPENAI_CLIP_STD + else: + raise NotImplementedError + + if is_train: + crop_scale = float(os.environ.get('TRAIN_CROP_SCALE', 0.9999)) + t = [ + RandomResizedCropAndInterpolation( + input_size, scale=(crop_scale, 1.0), interpolation='bicubic'), + # transforms.RandomHorizontalFlip(), + ] + if randaug and os.environ.get('TRAIN_DO_AUG', 'False') == 'True': + print(f'@@@@@ Do random aug during training', flush=True) + t.append( + RandomAugment( + 2, 7, isPIL=True, + augs=[ + 'Identity', 'AutoContrast', 'Equalize', 'Brightness', 'Sharpness', + 'ShearX', 'ShearY', 'TranslateX', 'TranslateY', 'Rotate', + ])) + else: + print(f'@@@@@ Skip random aug during training', flush=True) + t += [ + transforms.ToTensor(), + transforms.Normalize(mean=mean, std=std), + ] + t = transforms.Compose(t) + else: + t = transforms.Compose([ + transforms.Resize((input_size, input_size), + interpolation=transforms.InterpolationMode.BICUBIC), + transforms.ToTensor(), + transforms.Normalize(mean=mean, std=std) + ]) + + return t + + +def img2b64(img_path): + img = Image.open(img_path) # path to file + img_buffer = BytesIO() + img.save(img_buffer, format=img.format) + byte_data = img_buffer.getvalue() + base64_str = base64.b64encode(byte_data) # bytes + base64_str = base64_str.decode("utf-8") # str + return base64_str + + +def str2b64(str): + return base64.b64encode(str.encode('utf-8')).decode('utf-8') + + +def b642str(b64): + return base64.b64decode(b64).decode('utf-8') + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + + # serialized to a Tensor + buffer = pickle.dumps(data) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to("cuda") + + # obtain Tensor size of each rank + local_size = torch.LongTensor([tensor.numel()]).to("cuda") + size_list = [torch.LongTensor([0]).to("cuda") for _ in range(world_size)] + dist.all_gather(size_list, local_size) + size_list = [int(size.item()) for size in size_list] + max_size = max(size_list) + + # receiving Tensor from all ranks + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + tensor_list = [] + for _ in size_list: + tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) + if local_size != max_size: + padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") + tensor = torch.cat((tensor, padding), dim=0) + dist.all_gather(tensor_list, tensor) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def mean(lst): + return sum(lst) / len(lst) + + +def stop_gradient_by_name(name: str): + def apply_fn(module): + if hasattr(module, name): + getattr(module, name).requires_grad_(False) + + return apply_fn diff --git 
a/omnilmm/train/train_utils.py b/omnilmm/train/train_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..48af610d369071f7ec28784ed436d0f4a4e7b1c0 --- /dev/null +++ b/omnilmm/train/train_utils.py @@ -0,0 +1,153 @@ +import os +import gc +import copy +import time + +import torch +import warnings +import transformers + +import numpy as np + +from typing import Dict, Optional, Sequence +from omnilmm import conversation as conversation_lib + +IGNORE_INDEX = -100 +DEFAULT_IMAGE_TOKEN = "<image>" +DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" +DEFAULT_IM_START_TOKEN = "<im_start>" +DEFAULT_IM_END_TOKEN = "<im_end>" + + +def _tokenize_fn(strings: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer) -> Dict: + """Tokenize a list of strings.""" + tokenized_list = [ + tokenizer( + text, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ) for text in strings + ] + input_ids = labels = [ + tokenized.input_ids[0] for tokenized in tokenized_list + ] + input_ids_lens = labels_lens = [ + tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() + for tokenized in tokenized_list + ] + return dict( + input_ids=input_ids, + labels=labels, + input_ids_lens=input_ids_lens, + labels_lens=labels_lens, + ) + + + +def omni_preprocess(sources, + tokenizer: transformers.PreTrainedTokenizer, + generation=False): + system_content = 'You are an artificial intelligence assistant, which gives helpful, detailed, and polite answers to the human\'s questions.' + ignore_index = -100 + + response_template = '\n<|assistant|>\n' + instruction_template = '\n<|user|>\n' + response_token_ids = tokenizer.encode( + response_template, add_special_tokens=False) + instruction_token_ids = tokenizer.encode( + instruction_template, add_special_tokens=False) + + batch_input_ids = [] + batch_labels = [] + for i in range(len(sources)): + new_source = [] + prev_role = 'unexpect' + for conv_turn in sources[i]: + role = conv_turn['from'] if 'from' in conv_turn else conv_turn['role'] + content = conv_turn['value'] if 'value' in conv_turn else conv_turn['content'] + + role = 'user' if role == 'human' else role + role = 'assistant' if role == 'gpt' else role + + assert role in ['user', 'assistant'] + assert role != prev_role, f'role={role}, prev_role={prev_role}' + prev_role = role + + new_turn = { + 'role': role, + 'content': content + } + new_source.append(new_turn) + if new_source[0]['role'] != 'system': + new_source.insert(0, {'role': 'system', 'content': system_content}) + + # TODO: this automatically add '\n' to the end + res_text = tokenizer.apply_chat_template( + new_source, tokenize=False, add_generation_prompt=generation) + if not generation: + res_text = res_text.strip() + + conversations_tokenized = _tokenize_fn([res_text], tokenizer) + res_input_ids = conversations_tokenized["input_ids"][0] + + # since labels and input_ids are reference towards the same object + res_labels = copy.deepcopy(conversations_tokenized["labels"][0]) + + response_token_ids_idxs = [] + human_token_ids_idxs = [] + + for assistant_idx in np.where(res_labels == response_token_ids[0])[0]: + # find the indexes of the start of a response. 
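+            # np.where above only matched the first token of the response template,
+            # so confirm that the full template follows before treating this position
+            # as a response start; the index recorded below points just past the
+            # template, i.e. at the first token of the assistant answer.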
+ if (response_token_ids == res_labels[assistant_idx: assistant_idx + len( + response_token_ids)].tolist() + ): + response_token_ids_idxs.append( + assistant_idx + len(response_token_ids)) + + if len(response_token_ids_idxs) == 0: + warnings.warn( + f"Could not find response key `{response_template}` in the " + f'following instance: @===>{tokenizer.decode(res_input_ids)}<===@ ' + f'Raw text is @===>{res_text}<===@' + f'Raw source is @===>{new_source}<===@' + f"This instance will be ignored in loss calculation. " + f"Note, if this happens often, consider increasing the `max_seq_length`." + ) + res_labels[:] = ignore_index + + human_token_ids = instruction_token_ids + for human_idx in np.where(res_labels == human_token_ids[0])[0]: + # find the indexes of the start of a human answer. + if human_token_ids == res_labels[human_idx: human_idx + len(human_token_ids)].tolist(): + human_token_ids_idxs.append(human_idx) + + if len(human_token_ids_idxs) == 0: + warnings.warn( + f"Could not find instruction key `{instruction_template}` in the " + f'following instance: @===>{tokenizer.decode(res_input_ids)}<===@ ' + f'Raw text is @===>{res_text}<===@' + f'Raw source is @===>{new_source}<===@' + f"This instance will be ignored in loss calculation. " + f"Note, if this happens often, consider increasing the `max_seq_length`." + ) + res_labels[:] = ignore_index + + for idx, (start, end) in enumerate(zip(human_token_ids_idxs, response_token_ids_idxs)): + # Make pytorch loss function ignore all non response tokens + if idx != 0: + res_labels[start:end] = ignore_index + else: + res_labels[:end] = ignore_index + + if len(response_token_ids_idxs) < len(human_token_ids_idxs): + res_labels[human_token_ids_idxs[-1]:] = ignore_index + + batch_input_ids.append(res_input_ids) + batch_labels.append(res_labels) + + return dict(input_ids=batch_input_ids, labels=batch_labels) + + diff --git a/omnilmm/utils.py b/omnilmm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..784dabe80320119b6998d79e2840303bdee3ed04 --- /dev/null +++ b/omnilmm/utils.py @@ -0,0 +1,127 @@ +import datetime +import logging +import logging.handlers +import os +import sys + +import requests + +from omnilmm.constants import LOGDIR + +server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" +moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 
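The label-masking scheme implemented by `omni_preprocess` above is easier to follow on toy data. The following is a minimal, self-contained sketch of the same idea using made-up token ids and hypothetical single-token chat templates (the real function works on the tokenizer's actual template strings): everything outside the assistant answers is set to the ignore index, so only answer tokens contribute to the loss.

```python
import numpy as np

IGNORE_INDEX = -100

# Hypothetical single-token templates standing in for the tokenized
# '\n<|assistant|>\n' and '\n<|user|>\n' strings used by omni_preprocess.
response_template = [2]
instruction_template = [1]

# Toy sequence: system prompt (5), user turn (1, 7, 7), answer (2, 8, 8),
# second user turn (1, 6), second answer (2, 9, 9).
input_ids = np.array([5, 1, 7, 7, 2, 8, 8, 1, 6, 2, 9, 9])
labels = input_ids.copy()

# Positions just past each response template and at each instruction template.
response_starts = [int(i) + len(response_template)
                   for i in np.where(labels == response_template[0])[0]
                   if labels[i:i + len(response_template)].tolist() == response_template]
instruction_starts = [int(i)
                      for i in np.where(labels == instruction_template[0])[0]
                      if labels[i:i + len(instruction_template)].tolist() == instruction_template]

for idx, (start, end) in enumerate(zip(instruction_starts, response_starts)):
    if idx == 0:
        labels[:end] = IGNORE_INDEX           # mask everything before the first answer
    else:
        labels[start:end] = IGNORE_INDEX      # mask each later user turn
if len(response_starts) < len(instruction_starts):
    labels[instruction_starts[-1]:] = IGNORE_INDEX  # trailing question with no answer

print(labels)  # only the answer tokens 8, 8 and 9, 9 keep their original ids
```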
+ +handler = None + + +def build_logger(logger_name, logger_filename): + global handler + + formatter = logging.Formatter( + fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + # Set the format of root handlers + if not logging.getLogger().handlers: + logging.basicConfig(level=logging.INFO) + logging.getLogger().handlers[0].setFormatter(formatter) + + # Redirect stdout and stderr to loggers + stdout_logger = logging.getLogger("stdout") + stdout_logger.setLevel(logging.INFO) + sl = StreamToLogger(stdout_logger, logging.INFO) + sys.stdout = sl + + stderr_logger = logging.getLogger("stderr") + stderr_logger.setLevel(logging.ERROR) + sl = StreamToLogger(stderr_logger, logging.ERROR) + sys.stderr = sl + + # Get logger + logger = logging.getLogger(logger_name) + logger.setLevel(logging.INFO) + + # Add a file handler for all loggers + if handler is None: + os.makedirs(LOGDIR, exist_ok=True) + filename = os.path.join(LOGDIR, logger_filename) + handler = logging.handlers.TimedRotatingFileHandler( + filename, when='D', utc=True) + handler.setFormatter(formatter) + + for name, item in logging.root.manager.loggerDict.items(): + if isinstance(item, logging.Logger): + item.addHandler(handler) + + return logger + + +class StreamToLogger(object): + """ + Fake file-like stream object that redirects writes to a logger instance. + """ + + def __init__(self, logger, log_level=logging.INFO): + self.terminal = sys.stdout + self.logger = logger + self.log_level = log_level + self.linebuf = '' + + def __getattr__(self, attr): + return getattr(self.terminal, attr) + + def write(self, buf): + temp_linebuf = self.linebuf + buf + self.linebuf = '' + for line in temp_linebuf.splitlines(True): + # From the io.TextIOWrapper docs: + # On output, if newline is None, any '\n' characters written + # are translated to the system default line separator. + # By default sys.stdout.write() expects '\n' newlines and then + # translates them so this is still cross platform. + if line[-1] == '\n': + self.logger.log(self.log_level, line.rstrip()) + else: + self.linebuf += line + + def flush(self): + if self.linebuf != '': + self.logger.log(self.log_level, self.linebuf.rstrip()) + self.linebuf = '' + + +def disable_torch_init(): + """ + Disable the redundant torch default initialization to accelerate model creation. + """ + import torch + setattr(torch.nn.Linear, "reset_parameters", lambda self: None) + setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) + + +def violates_moderation(text): + """ + Check whether the text violates OpenAI moderation API. 
+ """ + url = "https://api.openai.com/v1/moderations" + headers = {"Content-Type": "application/json", + "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} + text = text.replace("\n", "") + data = "{" + '"input": ' + f'"{text}"' + "}" + data = data.encode("utf-8") + try: + ret = requests.post(url, headers=headers, data=data, timeout=5) + flagged = ret.json()["results"][0]["flagged"] + except requests.exceptions.RequestException as e: + flagged = False + except KeyError as e: + flagged = False + + return flagged + + +def pretty_print_semaphore(semaphore): + if semaphore is None: + return "None" + return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" diff --git a/omnilmm_en.md b/omnilmm_en.md new file mode 100644 index 0000000000000000000000000000000000000000..62f799222df555fb8d36d46649813280383a1048 --- /dev/null +++ b/omnilmm_en.md @@ -0,0 +1,155 @@ +## OmniLMM-12B + +> OmniLMM-12B is released at early time of this project. We recommond you to use our [recently released models](./README_en.md), for better performance and efficiency. + +> Archieve at: 2024-05-19 + + +**OmniLMM-12B** is the most capable version. The model is built based on EVA02-5B and Zephyr-7B-β, connected with a perceiver resampler layer, and trained on multimodal data in a curriculum fashion. The model has three notable features: + +- 🔥 **Strong Performance.** + + OmniLMM-12B achieves **leading performance** among models with comparable sizes, surpassing established LMMs on multiple benchmarks (including MME, MMBench, SEED-Bench, etc). The model also endows rich multi-modal world knowledge. + +- 🏆 **Trustworthy Behavior.** + + LMMs are known for suffering from hallucination, often generating text that is not factually grounded in images (e.g., faithfully describing non-existing objects in images). OmniLMM-12B is **the first state-of-the-art open-source LMM aligned via multimodal RLHF for trustworthy behavior** (using the recent [RLHF-V](https://rlhf-v.github.io/) technique). It **ranks #1** among open-source models on [MMHal-Bench](https://huggingface.co/datasets/Shengcao1006/MMHal-Bench), and **outperforms GPT-4V** on [Object HalBench](https://arxiv.org/abs/2312.00849). + +- 🕹 **Real-time Multimodal Interaction.** + + We combine the OmniLMM-12B and GPT-3.5 (text-only) into a **real-time multimodal interactive assistant**. The assistant accepts video streams from the camera and speech streams from the microphone and emits speech output. While still primary, we find the model can **replicate some of the fun cases shown in the Gemini Demo video, without any video edition**. + + +### Evaluation <!-- omit in toc --> +<div align="center"> + <img src=assets/radar_omnilmm12b.png width=66% /> +</div> +<details> +<summary>Click to view results on MME, MMBench, MMMU, MMBench, MMHal-Bench, Object HalBench, SeedBench, LLaVA Bench, MathVista. 
</summary> + +<table> +<thead> + <tr> + <th align="left">Model</th> + <th>Size</th> + <th>MME</th> + <th nowrap="nowrap">MMB dev (en)</th> + <th nowrap="nowrap" >MMMU val</th> + <th nowrap="nowrap" >MMHal-Bench</th> + <th nowrap="nowrap" >Object HalBench</th> + <th nowrap="nowrap" >SeedBench-I</th> + <th>MathVista</th> + <th nowrap="nowrap" >LLaVA Bench</th> + </tr> +</thead> +<tbody align="center"> + <tr> + <td align="left">GPT-4V†</td> + <td>-</td> + <td>1771.5</td> + <td>75.1 </td> + <td>56.8</td> + <td>3.53 / 70.8</td> + <td>86.4 / 92.7</td> + <td>71.6 </td> + <td>47.8 </td> + <td>93.1 </td> + </tr> + <tr> + <td nowrap="nowrap" align="left">Qwen-VL-Plus†</td> + <td>-</td> + <td>2183.4</td> + <td>66.2 </td> + <td>45.2</td> + <td>- </td> + <td>- </td> + <td>65.7 </td> + <td>36.0 </td> + <td>73.7 </td> + </tr> + <tr> + <td align="left">Yi-VL 6B</td> + <td align="right">6.7B </td> + <td>1915.1 </td> + <td>68.6 </td> + <td>40.3 </td> + <td>- </td> + <td>- </td> + <td>67.5 </td> + <td>28.8 </td> + <td>51.9 </td> + </tr> + <tr> + <td nowrap="nowrap" align="left" >Qwen-VL-Chat</td> + <td align="right">9.6B</td> + <td>1860.0</td> + <td>60.6 </td> + <td>35.9</td> + <td>2.93 / 59.4</td> + <td>56.2 / 80.0</td> + <td>64.8 </td> + <td>33.8 </td> + <td>67.7 </td> + </tr> + <tr> + <td align="left" >CogVLM-Chat</td> + <td align="right">17.4B</td> + <td>1736.6</td> + <td>63.7 </td> + <td>32.1 </td> + <td>2.68 / 52.1 </td> + <td>73.6 / 87.4 </td> + <td>68.8 </td> + <td>34.7 </td> + <td>73.9 </td> + </tr> + <tr> + <td align="left" >LLaVA 1.5</td> + <td align="right">13.6B </td> + <td>1808.4 </td> + <td>68.2 </td> + <td>36.4 </td> + <td>2.71 / 51.0 </td> + <td>53.7 / 77.4 </td> + <td>68.1 </td> + <td>26.4 </td> + <td>64.6 </td> + </tr> + <tr> + <td nowrap="nowrap" align="left" ><b>OmniLMM-12B</b></td> + <td align="right">11.6B </td> + <td>1935.8 </td> + <td>71.6 </td> + <td>40.7 </td> + <td>3.45 / 68.8 </td> + <td>90.3 / 95.5 </td> + <td>71.1 </td> + <td>34.9 </td> + <td>72.0 </td> + </tr> +</tbody> +</table> +<small>†: Proprietary models</small> +<br> +</details> + +### Examples <!-- omit in toc --> + +<table align="center" > + <p align="center" > + <img src="assets/omnilmm-12b-examples_2.png" /> + </p> +</table> + + +We combine the OmniLMM-12B and GPT-3.5 (text-only) into a **real-time multimodal interactive assistant**. Video frames are described in text using OmniLMM-12B, and ChatGPT 3.5 (text-only) is employed to generate response according to the descriptions and user prompts. The demo video is a raw recording without edition. + +<div align="center" > + <video controls src="https://github.com/OpenBMB/OmniLMM/assets/157115220/485a8f52-fb4d-4eca-8fee-506347efcfc6" type="video/mp4" width=80%/> +</div> + +### Model Zoo + +| Model | Description | Download Link | +|:----------------------|:-------------------|:---------------:| +| OmniLMM-12B | The most capable version with leading performance. 
| [🤗](https://huggingface.co/openbmb/OmniLMM-12B)    [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/OmniLMM-12B/files) | diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..dce85f998432987caa403e86c2f56245cdbacc47 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,33 @@ +packaging==23.2 +addict==2.4.0 +editdistance==0.6.2 +einops==0.7.0 +fairscale==0.4.0 +jsonlines==4.0.0 +markdown2==2.4.10 +matplotlib==3.7.4 +more_itertools==10.1.0 +nltk==3.8.1 +numpy==1.24.4 +opencv_python_headless==4.5.5.64 +openpyxl==3.1.2 +Pillow==10.1.0 +sacrebleu==2.3.2 +seaborn==0.13.0 +shortuuid==1.0.11 +spacy==3.7.2 +timm==0.9.10 +torch==2.1.2 +torchvision==0.16.2 +tqdm==4.66.1 +protobuf==4.25.0 +transformers==4.40.0 +typing_extensions==4.8.0 +uvicorn==0.24.0.post1 +#xformers==0.0.22.post7 +#flash_attn==2.3.4 +sentencepiece==0.1.99 +accelerate==0.30.1 +socksio==1.0.0 +gradio +gradio_client diff --git a/web_demo.py b/web_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..ad9c54a5606ec03425fae04c9e13938aa49caf6d --- /dev/null +++ b/web_demo.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python +# encoding: utf-8 +import gradio as gr +from PIL import Image +import traceback +import re +import torch +import argparse +from transformers import AutoModel, AutoTokenizer + +# README, How to run demo on different devices +# For Nvidia GPUs support BF16 (like A100, H100, RTX3090) +# python web_demo.py --device cuda --dtype bf16 + +# For Nvidia GPUs do NOT support BF16 (like V100, T4, RTX2080) +# python web_demo.py --device cuda --dtype fp16 + +# For Mac with MPS (Apple silicon or AMD GPUs). +# PYTORCH_ENABLE_MPS_FALLBACK=1 python web_demo.py --device mps --dtype fp16 + +# Argparser +parser = argparse.ArgumentParser(description='demo') +parser.add_argument('--device', type=str, default='cuda', help='cuda or mps') +parser.add_argument('--dtype', type=str, default='bf16', help='bf16 or fp16') +args = parser.parse_args() +device = args.device +assert device in ['cuda', 'mps'] +if args.dtype == 'bf16': + dtype = torch.bfloat16 +else: + dtype = torch.float16 + +# Load model +model_path = 'openbmb/MiniCPM-V-2' +model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(dtype=torch.bfloat16) +tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + +model = model.to(device=device, dtype=dtype) +model.eval() + + + +ERROR_MSG = "Error, please retry" +model_name = 'MiniCPM-V 2.0' + +form_radio = { + 'choices': ['Beam Search', 'Sampling'], + #'value': 'Beam Search', + 'value': 'Sampling', + 'interactive': True, + 'label': 'Decode Type' +} +# Beam Form +num_beams_slider = { + 'minimum': 0, + 'maximum': 5, + 'value': 3, + 'step': 1, + 'interactive': True, + 'label': 'Num Beams' +} +repetition_penalty_slider = { + 'minimum': 0, + 'maximum': 3, + 'value': 1.2, + 'step': 0.01, + 'interactive': True, + 'label': 'Repetition Penalty' +} +repetition_penalty_slider2 = { + 'minimum': 0, + 'maximum': 3, + 'value': 1.05, + 'step': 0.01, + 'interactive': True, + 'label': 'Repetition Penalty' +} +max_new_tokens_slider = { + 'minimum': 1, + 'maximum': 4096, + 'value': 1024, + 'step': 1, + 'interactive': True, + 'label': 'Max New Tokens' +} + +top_p_slider = { + 'minimum': 0, + 'maximum': 1, + 'value': 0.8, + 'step': 0.05, + 'interactive': True, + 'label': 'Top P' +} +top_k_slider = { + 'minimum': 0, + 'maximum': 200, + 'value': 100, + 'step': 1, + 'interactive': True, + 'label': 'Top K' 
+} +temperature_slider = { + 'minimum': 0, + 'maximum': 2, + 'value': 0.7, + 'step': 0.05, + 'interactive': True, + 'label': 'Temperature' +} + + +def create_component(params, comp='Slider'): + if comp == 'Slider': + return gr.Slider( + minimum=params['minimum'], + maximum=params['maximum'], + value=params['value'], + step=params['step'], + interactive=params['interactive'], + label=params['label'] + ) + elif comp == 'Radio': + return gr.Radio( + choices=params['choices'], + value=params['value'], + interactive=params['interactive'], + label=params['label'] + ) + elif comp == 'Button': + return gr.Button( + value=params['value'], + interactive=True + ) + + +def chat(img, msgs, ctx, params=None, vision_hidden_states=None): + default_params = {"num_beams":3, "repetition_penalty": 1.2, "max_new_tokens": 1024} + if params is None: + params = default_params + if img is None: + return -1, "Error, invalid image, please upload a new image", None, None + try: + image = img.convert('RGB') + answer, context, _ = model.chat( + image=image, + msgs=msgs, + context=None, + tokenizer=tokenizer, + **params + ) + res = re.sub(r'(<box>.*</box>)', '', answer) + res = res.replace('<ref>', '') + res = res.replace('</ref>', '') + res = res.replace('<box>', '') + answer = res.replace('</box>', '') + return -1, answer, None, None + except Exception as err: + print(err) + traceback.print_exc() + return -1, ERROR_MSG, None, None + + +def upload_img(image, _chatbot, _app_session): + image = Image.fromarray(image) + + _app_session['sts']=None + _app_session['ctx']=[] + _app_session['img']=image + _chatbot.append(('', 'Image uploaded successfully, you can talk to me now')) + return _chatbot, _app_session + + +def respond(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature): + if _app_cfg.get('ctx', None) is None: + _chat_bot.append((_question, 'Please upload an image to start')) + return '', _chat_bot, _app_cfg + + _context = _app_cfg['ctx'].copy() + if _context: + _context.append({"role": "user", "content": _question}) + else: + _context = [{"role": "user", "content": _question}] + print('<User>:', _question) + + if params_form == 'Beam Search': + params = { + 'sampling': False, + 'num_beams': num_beams, + 'repetition_penalty': repetition_penalty, + "max_new_tokens": 896 + } + else: + params = { + 'sampling': True, + 'top_p': top_p, + 'top_k': top_k, + 'temperature': temperature, + 'repetition_penalty': repetition_penalty_2, + "max_new_tokens": 896 + } + code, _answer, _, sts = chat(_app_cfg['img'], _context, None, params) + print('<Assistant>:', _answer) + + _context.append({"role": "assistant", "content": _answer}) + _chat_bot.append((_question, _answer)) + if code == 0: + _app_cfg['ctx']=_context + _app_cfg['sts']=sts + return '', _chat_bot, _app_cfg + + +def regenerate_button_clicked(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature): + if len(_chat_bot) <= 1: + _chat_bot.append(('Regenerate', 'No question for regeneration.')) + return '', _chat_bot, _app_cfg + elif _chat_bot[-1][0] == 'Regenerate': + return '', _chat_bot, _app_cfg + else: + _question = _chat_bot[-1][0] + _chat_bot = _chat_bot[:-1] + _app_cfg['ctx'] = _app_cfg['ctx'][:-2] + return respond(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature) + + + +with gr.Blocks() as demo: + with gr.Row(): + with gr.Column(scale=1, min_width=300): + 
params_form = create_component(form_radio, comp='Radio') + with gr.Accordion("Beam Search") as beams_according: + num_beams = create_component(num_beams_slider) + repetition_penalty = create_component(repetition_penalty_slider) + with gr.Accordion("Sampling") as sampling_according: + top_p = create_component(top_p_slider) + top_k = create_component(top_k_slider) + temperature = create_component(temperature_slider) + repetition_penalty_2 = create_component(repetition_penalty_slider2) + regenerate = create_component({'value': 'Regenerate'}, comp='Button') + with gr.Column(scale=3, min_width=500): + app_session = gr.State({'sts':None,'ctx':None,'img':None}) + bt_pic = gr.Image(label="Upload an image to start") + chat_bot = gr.Chatbot(label=f"Chat with {model_name}") + txt_message = gr.Textbox(label="Input text") + + regenerate.click( + regenerate_button_clicked, + [txt_message, chat_bot, app_session, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature], + [txt_message, chat_bot, app_session] + ) + txt_message.submit( + respond, + [txt_message, chat_bot, app_session, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature], + [txt_message, chat_bot, app_session] + ) + bt_pic.upload(lambda: None, None, chat_bot, queue=False).then(upload_img, inputs=[bt_pic,chat_bot,app_session], outputs=[chat_bot,app_session]) + +# launch +demo.launch(share=False, debug=True, show_api=False, server_port=8080, server_name="0.0.0.0") + diff --git a/web_demo_2.5.py b/web_demo_2.5.py new file mode 100644 index 0000000000000000000000000000000000000000..cc80a8235567e54a6f5e5089fbe8f9edbb14b4ae --- /dev/null +++ b/web_demo_2.5.py @@ -0,0 +1,257 @@ +#!/usr/bin/env python +# encoding: utf-8 +import gradio as gr +from PIL import Image +import traceback +import re +import torch +import argparse +from transformers import AutoModel, AutoTokenizer + +# README, How to run demo on different devices + +# For Nvidia GPUs. +# python web_demo_2.5.py --device cuda + +# For Mac with MPS (Apple silicon or AMD GPUs). 
+# PYTORCH_ENABLE_MPS_FALLBACK=1 python web_demo_2.5.py --device mps + +# Argparser +parser = argparse.ArgumentParser(description='demo') +parser.add_argument('--device', type=str, default='cuda', help='cuda or mps') +args = parser.parse_args() +device = args.device +assert device in ['cuda', 'mps'] + +# Load model +model_path = 'openbmb/MiniCPM-Llama3-V-2_5' +if 'int4' in model_path: + if device == 'mps': + print('Error: running int4 model with bitsandbytes on Mac is not supported right now.') + exit() + model = AutoModel.from_pretrained(model_path, trust_remote_code=True) +else: + model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(dtype=torch.float16) + model = model.to(device=device) +tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) +model.eval() + + + +ERROR_MSG = "Error, please retry" +model_name = 'MiniCPM-V 2.5' + +form_radio = { + 'choices': ['Beam Search', 'Sampling'], + #'value': 'Beam Search', + 'value': 'Sampling', + 'interactive': True, + 'label': 'Decode Type' +} +# Beam Form +num_beams_slider = { + 'minimum': 0, + 'maximum': 5, + 'value': 3, + 'step': 1, + 'interactive': True, + 'label': 'Num Beams' +} +repetition_penalty_slider = { + 'minimum': 0, + 'maximum': 3, + 'value': 1.2, + 'step': 0.01, + 'interactive': True, + 'label': 'Repetition Penalty' +} +repetition_penalty_slider2 = { + 'minimum': 0, + 'maximum': 3, + 'value': 1.05, + 'step': 0.01, + 'interactive': True, + 'label': 'Repetition Penalty' +} +max_new_tokens_slider = { + 'minimum': 1, + 'maximum': 4096, + 'value': 1024, + 'step': 1, + 'interactive': True, + 'label': 'Max New Tokens' +} + +top_p_slider = { + 'minimum': 0, + 'maximum': 1, + 'value': 0.8, + 'step': 0.05, + 'interactive': True, + 'label': 'Top P' +} +top_k_slider = { + 'minimum': 0, + 'maximum': 200, + 'value': 100, + 'step': 1, + 'interactive': True, + 'label': 'Top K' +} +temperature_slider = { + 'minimum': 0, + 'maximum': 2, + 'value': 0.7, + 'step': 0.05, + 'interactive': True, + 'label': 'Temperature' +} + + +def create_component(params, comp='Slider'): + if comp == 'Slider': + return gr.Slider( + minimum=params['minimum'], + maximum=params['maximum'], + value=params['value'], + step=params['step'], + interactive=params['interactive'], + label=params['label'] + ) + elif comp == 'Radio': + return gr.Radio( + choices=params['choices'], + value=params['value'], + interactive=params['interactive'], + label=params['label'] + ) + elif comp == 'Button': + return gr.Button( + value=params['value'], + interactive=True + ) + + +def chat(img, msgs, ctx, params=None, vision_hidden_states=None): + default_params = {"num_beams":3, "repetition_penalty": 1.2, "max_new_tokens": 1024} + if params is None: + params = default_params + if img is None: + return -1, "Error, invalid image, please upload a new image", None, None + try: + image = img.convert('RGB') + answer = model.chat( + image=image, + msgs=msgs, + tokenizer=tokenizer, + **params + ) + res = re.sub(r'(<box>.*</box>)', '', answer) + res = res.replace('<ref>', '') + res = res.replace('</ref>', '') + res = res.replace('<box>', '') + answer = res.replace('</box>', '') + return -1, answer, None, None + except Exception as err: + print(err) + traceback.print_exc() + return -1, ERROR_MSG, None, None + + +def upload_img(image, _chatbot, _app_session): + image = Image.fromarray(image) + + _app_session['sts']=None + _app_session['ctx']=[] + _app_session['img']=image + _chatbot.append(('', 'Image uploaded successfully, you can talk to me now')) + return _chatbot, 
_app_session + + +def respond(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature): + if _app_cfg.get('ctx', None) is None: + _chat_bot.append((_question, 'Please upload an image to start')) + return '', _chat_bot, _app_cfg + + _context = _app_cfg['ctx'].copy() + if _context: + _context.append({"role": "user", "content": _question}) + else: + _context = [{"role": "user", "content": _question}] + print('<User>:', _question) + + if params_form == 'Beam Search': + params = { + 'sampling': False, + 'num_beams': num_beams, + 'repetition_penalty': repetition_penalty, + "max_new_tokens": 896 + } + else: + params = { + 'sampling': True, + 'top_p': top_p, + 'top_k': top_k, + 'temperature': temperature, + 'repetition_penalty': repetition_penalty_2, + "max_new_tokens": 896 + } + code, _answer, _, sts = chat(_app_cfg['img'], _context, None, params) + print('<Assistant>:', _answer) + + _context.append({"role": "assistant", "content": _answer}) + _chat_bot.append((_question, _answer)) + if code == 0: + _app_cfg['ctx']=_context + _app_cfg['sts']=sts + return '', _chat_bot, _app_cfg + + +def regenerate_button_clicked(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature): + if len(_chat_bot) <= 1: + _chat_bot.append(('Regenerate', 'No question for regeneration.')) + return '', _chat_bot, _app_cfg + elif _chat_bot[-1][0] == 'Regenerate': + return '', _chat_bot, _app_cfg + else: + _question = _chat_bot[-1][0] + _chat_bot = _chat_bot[:-1] + _app_cfg['ctx'] = _app_cfg['ctx'][:-2] + return respond(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature) + + + +with gr.Blocks() as demo: + with gr.Row(): + with gr.Column(scale=1, min_width=300): + params_form = create_component(form_radio, comp='Radio') + with gr.Accordion("Beam Search") as beams_according: + num_beams = create_component(num_beams_slider) + repetition_penalty = create_component(repetition_penalty_slider) + with gr.Accordion("Sampling") as sampling_according: + top_p = create_component(top_p_slider) + top_k = create_component(top_k_slider) + temperature = create_component(temperature_slider) + repetition_penalty_2 = create_component(repetition_penalty_slider2) + regenerate = create_component({'value': 'Regenerate'}, comp='Button') + with gr.Column(scale=3, min_width=500): + app_session = gr.State({'sts':None,'ctx':None,'img':None}) + bt_pic = gr.Image(label="Upload an image to start") + chat_bot = gr.Chatbot(label=f"Chat with {model_name}") + txt_message = gr.Textbox(label="Input text") + + regenerate.click( + regenerate_button_clicked, + [txt_message, chat_bot, app_session, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature], + [txt_message, chat_bot, app_session] + ) + txt_message.submit( + respond, + [txt_message, chat_bot, app_session, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature], + [txt_message, chat_bot, app_session] + ) + bt_pic.upload(lambda: None, None, chat_bot, queue=False).then(upload_img, inputs=[bt_pic,chat_bot,app_session], outputs=[chat_bot,app_session]) + +# launch +demo.launch(share=False, debug=True, show_api=False, server_port=8080, server_name="0.0.0.0") + diff --git a/web_demo_streamlit-2_5.py b/web_demo_streamlit-2_5.py new file mode 100644 index 0000000000000000000000000000000000000000..bf100ce569e87c8934d95e620bd0d92be024caf9 
--- /dev/null +++ b/web_demo_streamlit-2_5.py @@ -0,0 +1,108 @@ +import streamlit as st +from PIL import Image +import torch +from transformers import AutoModel, AutoTokenizer + +# Model path +model_path = "openbmb/MiniCPM-Llama3-V-2_5" + +# User and assistant names +U_NAME = "User" +A_NAME = "Assistant" + +# Set page configuration +st.set_page_config( + page_title="MiniCPM-Llama3-V-2_5 Streamlit", + page_icon=":robot:", + layout="wide" +) + + +# Load model and tokenizer +@st.cache_resource +def load_model_and_tokenizer(): + print(f"load_model_and_tokenizer from {model_path}") + model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16).to(device="cuda") + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + return model, tokenizer + + +# Initialize session state +if 'model' not in st.session_state: + st.session_state.model, st.session_state.tokenizer = load_model_and_tokenizer() + st.session_state.model.eval() + print("model and tokenizer had loaded completed!") + +# Initialize session state +if 'chat_history' not in st.session_state: + st.session_state.chat_history = [] + +# Sidebar settings +sidebar_name = st.sidebar.title("MiniCPM-Llama3-V-2_5 Streamlit") +max_length = st.sidebar.slider("max_length", 0, 4096, 2048, step=2) +repetition_penalty = st.sidebar.slider("repetition_penalty", 0.0, 2.0, 1.05, step=0.01) +top_p = st.sidebar.slider("top_p", 0.0, 1.0, 0.8, step=0.01) +top_k = st.sidebar.slider("top_k", 0, 100, 100, step=1) +temperature = st.sidebar.slider("temperature", 0.0, 1.0, 0.7, step=0.01) + +# Clear chat history button +buttonClean = st.sidebar.button("Clear chat history", key="clean") +if buttonClean: + st.session_state.chat_history = [] + st.session_state.response = "" + if torch.cuda.is_available(): + torch.cuda.empty_cache() + st.rerun() + +# Display chat history +for i, message in enumerate(st.session_state.chat_history): + if message["role"] == "user": + with st.chat_message(name="user", avatar="user"): + if message["image"] is not None: + st.image(message["image"], caption='User uploaded image', width=448, use_column_width=False) + continue + elif message["content"] is not None: + st.markdown(message["content"]) + else: + with st.chat_message(name="model", avatar="assistant"): + st.markdown(message["content"]) + +# Select mode +selected_mode = st.sidebar.selectbox("Select mode", ["Text", "Image"]) +if selected_mode == "Image": + # Image mode + uploaded_image = st.sidebar.file_uploader("Upload image", key=1, type=["jpg", "jpeg", "png"], + accept_multiple_files=False) + if uploaded_image is not None: + st.image(uploaded_image, caption='User uploaded image', width=468, use_column_width=False) + # Add uploaded image to chat history + st.session_state.chat_history.append({"role": "user", "content": None, "image": uploaded_image}) + +# User input box +user_text = st.chat_input("Enter your question") +if user_text: + with st.chat_message(U_NAME, avatar="user"): + st.session_state.chat_history.append({"role": "user", "content": user_text, "image": None}) + st.markdown(f"{U_NAME}: {user_text}") + + # Generate reply using the model + model = st.session_state.model + tokenizer = st.session_state.tokenizer + + with st.chat_message(A_NAME, avatar="assistant"): + # If the previous message contains an image, pass the image to the model + if len(st.session_state.chat_history) > 1 and st.session_state.chat_history[-2]["image"] is not None: + uploaded_image = st.session_state.chat_history[-2]["image"] + imagefile = 
Image.open(uploaded_image).convert('RGB') + + msgs = [{"role": "user", "content": user_text}] + res = model.chat(image=imagefile, msgs=msgs, context=None, tokenizer=tokenizer, + sampling=True, top_p=top_p, top_k=top_k, repetition_penalty=repetition_penalty, + temperature=temperature, stream=True) + + # Collect the generated_text str + generated_text = st.write_stream(res) + + st.session_state.chat_history.append({"role": "model", "content": generated_text, "image": None}) + + st.divider() diff --git a/web_demo_streamlit.py b/web_demo_streamlit.py new file mode 100644 index 0000000000000000000000000000000000000000..1b893e6e7579db1cb17caa84a4cc132ff65a3d3a --- /dev/null +++ b/web_demo_streamlit.py @@ -0,0 +1,99 @@ +import streamlit as st +from PIL import Image +import torch +from transformers import AutoModel, AutoTokenizer + +# Model path +model_path = "openbmb/MiniCPM-V-2" + +# User and assistant names +U_NAME = "User" +A_NAME = "Assistant" + +# Set page configuration +st.set_page_config( + page_title="Minicpm-V-2 Streamlit", + page_icon=":robot:", + layout="wide" +) + +# Load model and tokenizer +@st.cache_resource +def load_model_and_tokenizer(): + print(f"load_model_and_tokenizer from {model_path}") + model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16).to( + device="cuda:0", dtype=torch.bfloat16) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + return model, tokenizer + +# Initialize session state +if 'model' not in st.session_state: + st.session_state.model, st.session_state.tokenizer = load_model_and_tokenizer() + print("model and tokenizer had loaded completed!") + +# Initialize session state +if 'chat_history' not in st.session_state: + st.session_state.chat_history = [] + +# Sidebar settings +sidebar_name = st.sidebar.title("Minicpm-V-2 Streamlit") +max_length = st.sidebar.slider("max_length", 0, 4096, 2048, step=2) +top_p = st.sidebar.slider("top_p", 0.0, 1.0, 0.8, step=0.01) +temperature = st.sidebar.slider("temperature", 0.0, 1.0, 0.7, step=0.01) + +# Clear chat history button +buttonClean = st.sidebar.button("Clear chat history", key="clean") +if buttonClean: + st.session_state.chat_history = [] + st.session_state.response = "" + if torch.cuda.is_available(): + torch.cuda.empty_cache() + st.rerun() + +# Display chat history +for i, message in enumerate(st.session_state.chat_history): + if message["role"] == "user": + with st.chat_message(name="user", avatar="user"): + if message["image"] is not None: + st.image(message["image"], caption='User uploaded image', width=468, use_column_width=False) + continue + elif message["content"] is not None: + st.markdown(message["content"]) + else: + with st.chat_message(name="model", avatar="assistant"): + st.markdown(message["content"]) + +# Select mode +selected_mode = st.sidebar.selectbox("Select mode", ["Text", "Image"]) +if selected_mode == "Image": + # Image mode + uploaded_image = st.sidebar.file_uploader("Upload image", key=1, type=["jpg", "jpeg", "png"], accept_multiple_files=False) + if uploaded_image is not None: + st.image(uploaded_image, caption='User uploaded image', width=468, use_column_width=False) + # Add uploaded image to chat history + st.session_state.chat_history.append({"role": "user", "content": None, "image": uploaded_image}) + +# User input box +user_text = st.chat_input("Enter your question") +if user_text: + with st.chat_message(U_NAME, avatar="user"): + st.session_state.chat_history.append({"role": "user", "content": user_text, 
"image": None}) + st.markdown(f"{U_NAME}: {user_text}") + + # Generate reply using the model + model = st.session_state.model + tokenizer = st.session_state.tokenizer + + with st.chat_message(A_NAME, avatar="assistant"): + # If the previous message contains an image, pass the image to the model + if len(st.session_state.chat_history) > 1 and st.session_state.chat_history[-2]["image"] is not None: + uploaded_image = st.session_state.chat_history[-2]["image"] + imagefile = Image.open(uploaded_image).convert('RGB') + + msgs = [{"role": "user", "content": user_text}] + res, context, _ = model.chat(image=imagefile, msgs=msgs, context=None, tokenizer=tokenizer, + sampling=True,top_p=top_p,temperature=temperature) + st.markdown(f"{A_NAME}: {res}") + st.session_state.chat_history.append({"role": "model", "content": res, "image": None}) + + st.divider()