Upload 17 files

Browse files

Files changed (16) hide show

LICENSE +240 -0
ModelsCommunityLicenseAgreement +167 -0
README.md +345 -0
README_cn.md +346 -0
config.json +31 -0
configuration_orion.py +82 -0
generation_config.json +13 -0
generation_utils.py +52 -0
model.safetensors.index.json +451 -0
modeling_orion.py +1117 -0
output-00001-of-00002.safetensors +3 -0
output-00002-of-00002.safetensors +3 -0
special_tokens_map.json +30 -0
tokenization_orion.py +255 -0
tokenizer.model +3 -0
tokenizer_config.json +46 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,240 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright (C) 2023 ORION STAR Robotics
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+Apache License Version 2.0(简体中文)
+1.　定义
+“许可证”是指根据本文档第1到第9部分关于使用、重生成和分发的术语和条件。
+“许可证颁发者”是指版权所有者或者由版权所有者批准的授权许可证的实体。
+“法律实体”是指实施实体和进行控制的所有其它实体受该实体控制，或者受该实体集中控制。根据此定义，”控制”是指(i)让无论是否签订协议的上述实体，进行指导或管理的直接权利或间接权利，或者(ii)拥有百分之五十(50%)或以上已发行股票的所有者，或者(iii)上述实体的实权所有者。
+“用户”(或“用户的”)是指行使本许可证所授予权限的个人或法律实体。
+“源程序”形式是指对包含但不限制软件源代码、文档源程序和配置文件进行修改的首选形式。
+“目标”形式是指对源程序形式进行机械转换或翻译的任何形式，包括但不限于对编译的目标代码，生成的文件以及转换为其它媒体类型。
+“作品”是指根据本许可证所制作的源程序形式或目标形式的著作，在著作中包含的或附加的版权通知(在下面附录中提供了一个示例)。
+“衍生作品”是指基于作品(或从作品衍生而来)的源程序形式或目标形式的任何作品，以及编辑修订、注释、详细描述或其它修订等构成原创著作作品的整体。根据本许可证，衍生作品不得包括与作品及其衍生作品分离之作品，或仅与作品及其衍生作品的接口相链接(或按名称结合)之作品。
+“贡献”是指任何著作作品，包括作品的原始版本和对该作品或衍生作品所做的任何修订或补充，意在提交给许可证颁发者以让版权所有者或代表版权所有者的授权个人或法律实体包含在其作品中。根据此定义，“提交”一词表示发送给许可证颁发者或其代表人，任何电子的、口头的或书面的交流信息形式，包括但不限于在由许可证颁发者或者代表其管理的电子邮件清单、源代码控制系统、以及发布跟踪系统上为讨论和提高作品的交流，但不包括由版权所有者以书面形式明显标注或指定为”非贡献”的交流活动。
+“贡献者”是指许可证颁发者和代表从许可证颁发者接受之贡献的并随后包含在作品之贡献中的任何个人或法律实体。
+2.　版权许可证的授予
+根据本许可证的条款，每个贡献者授予用户永久性的、全球性的、非专有性的、免费的、无版权费的、不可撤销的版权许可证以源程序形式或目标形式复制、准备衍生作品、公开显示、公开执行、授予分许可证、以及分发作品和这样的衍生作品。
+3.　专利许可证的授予
+根据本许可证的条款，每个贡献者授予用户永久性的、全球性的、非专有性的、免费的、无版权费的、不可撤销的(除在本部分进行说明)专利许可证对作品进行制作、让人制作、使用、提供销售、销售、进口和其它转让，且这样的许可证仅适用于在所递交作品的贡献中因可由单一的或多个这样的贡献者授予而必须侵犯的申请专利。如果用户对任何实体针对作品或作品中所涉及贡献提出因直接性或贡献性专利侵权而提起专利法律诉讼(包括交互诉讼请求或反索赔)，那么根据本许可证，授予用户针对作品的任何专利许可证将在提起上述诉讼之日起终止。
+4.　重新分发
+用户可在任何媒介中复制和分发作品或衍生作品之副本，无论是否修订，还是以源程序形式或目标形式，条件是用户需满足下列条款：
+用户必须为作品或衍生作品的任何其他接收者提供本许可证的副本；
+并且用户必须让任何修改过的文件附带明显的通知，声明用户已更改文件；
+并且用户必须从作品的源程序形式中保留衍生作品源程序形式的用户所分发的所有版权、专利、商标和属性通知，但不包括不属于衍生作品任何部分的类似通知；
+并且如果作品将”通知”文本文件包括为其分发作品的一部分，那么用户分发的任何衍生作品中须至少在下列地方之一包括，在这样的通知文件中所包含的属性通知的可读副本，但不包括那些不属于衍生作品任何部分的通知：在作为衍生作品一部分而分发的通知文本文件中；如果与衍生作品一起提供则在源程序形式或文件中；或者通常作为第三方通知出现的时候和地方，在衍生作品中产生的画面中。通知文件的内容仅供信息提供，并未对许可证进行修改。用户可在其分发的衍生作品中在作品的通知文本后或作为附录添加自己的属性通知，条件是附加的属性通知不得构成修改本许可证。
+用户可以为自身所做出的修订添加自己的版权声明并可对自身所做出修订内容或为这样的衍生作品作为整体的使用、复制或分发提供附加或不同的条款，条件是用户对作品的使用、复制和分发必须符合本许可证中声明的条款。
+5.　贡献的提交
+除非用户明确声明，在作品中由用户向许可证颁发者的提交若要包含在贡献中，必须在无任何附加条款下符合本许可证的条款。尽管上面如此规定，执行许可证颁发者有关贡献的条款时，任何情况下均不得替代或修改任何单独许可证协议的条款。
+6.　商标
+本许可证并未授予用户使用许可证颁发者的商号、商标、服务标记或产品名称，除非将这些名称用于合理性和惯例性描述作品起源和复制通知文件的内容时。
+7.　保证否认条款
+除非因适用法律需要或书面同意，许可证颁发者以”按原样”基础提供作品(并且每个贡献者提供其贡献)，无任何明示的或暗示的保证或条件，包括但不限于关于所有权、不侵权、商品适销性、或适用性的保证或条件。用户仅对使用或重新分发作品的正确性负责，并需承担根据本许可证行使权限时的任何风险。
+8.　责任限制条款
+在任何情况下并根据任何法律，无论是因侵权(包括过失)或根据合同，还是其它原因，除非根据适用法律需要(例如故意行为和重大过失行为)或经书面同意，即使贡献者事先已被告知发生损害的可能性，任何贡献者不就用户因使用本许可证或不能使用或无法使用作品(包括但不限于商誉损失、停工、计算机失效或故障，或任何商业损坏或损失)而造成的损失，包括直接的、非直接的、特殊的、意外的或间接的字符损坏而负责。
+9.　接受保证或附加责任
+重新分发作品或及其衍生作品时，用户可选择提供或为符合本许可证承担之支持、担保、赔偿或其它职责义务和/或权利而收取费用。但是，在承担上述义务时，用户只可代表用户本身和用户本身责任来执行，无需代表任何其它贡献者，并且用户仅可保证、防护并保持每个贡献者不受任何因此而产生的责任或对因用户自身承担这样的保证或附加责任而对这样的贡献者所提出的索赔。
+条款结束

ModelsCommunityLicenseAgreement ADDED Viewed

	@@ -0,0 +1,167 @@

+【Orion-14B Series】 Models Community License Agreement
+                                 Version: 1.0
+                       Date of Release:
+1. Definition
+“Agreement” refers to the terms and conditions defined in this 【Orion-14B Series】 Models Community License Agreement for the use, reproduction, and distribution of the 【Orion-14B Series】
+Models.
+“Model” refers to associated components (including checkpoints) developed based on machine learning, including learned weights and parameters (including the
+status of optimizer).
+“【Orion-14B Series】 Models” refers to open-source models with different specifications and capabilities provided by the Licensor, including:
+【Orion-14B-Base】Base model
+【Orion-14B-Chat】Chat model
+【Orion-14B-LongChat】Long context chat model
+【Orion-14B-Chat-RAG】Retrieval augmented generation chat model
+【Orion-14B-Chat-Plugin】Chat model with plugin capability
+【Orion-14B-Base-Int4】4-bit integer quantized base model
+【Orion-14B-Chat-Int4】4-bit integer quantized chat model
+“Derivatives” refers to all modifications to 【Orion-14B Series】 Models, work based on 【Orion-14B Series】 Models, or any other models created or initialized by transferring the weights, parameters, activations, or output patterns of 【Orion-14B Series】 Models to other models to achieve similar performance, including but not limited to methods that require using intermediate data representations or generating synthetic data based on 【Orion-14B Series】 Models to train other models.
+“Licensor” refers to Beijing Orionstar Technology Co., Ltd.
+“you” refers to an individual or legal entity that exercises the license granted by this Agreement and/or uses the 【Orion-14B Series】 Models for any purpose and in any field of use.
+“Third Party” refers to any individuals, legal entities, or non-legal organizations other than you.
+“Distribute” refers to transmitting, copying, publishing, or otherwise sharing the 【Orion-14B Series】 Models with third parties, including providing the 【Orion-14B Series】Models through electronic or other remote means (such as any SaaS software or PaaS software accessed via API or web access).
+“Commercial Purposes” refers to the use of the 【Orion-14B Series】 Models, directly or indirectly, for the operation, promotion, revenue generation, or any other profit-making purposes for entities or individuals.
+“Laws and Regulations” refers to the laws and administrative regulations of the mainland of the People's Republic of China (for the purposes of this Agreement only, excluding Hong Kong, Macau, and Taiwan).
+“Personal Information” refers to various information related to identified or identifiable natural persons recorded electronically or by other means, excluding information that has been anonymized.
+“Logo” refers to any trademark, service mark, trade name, domain name, website name, or other distinctive branding marks.
+2. License and License Restrictions
+The Licensor hereby grants you a non-exclusive, global, non-transferable, non-sub-licensable, revocable, and royalty-free copyright license. You must adhere to the following license restrictions:
+1) Your use of the 【Orion-14B Series】 Models must comply with the Laws and Regulations as well as applicable legal requirements of other countries/regions, and respect social ethics and moral standards, including but not limited to, not using the【Orion-14B Series】 Models for purposes prohibited by Laws and Regulations as well as applicable legal requirements of other countries/regions, such as harming national security, promoting terrorism, extremism, inciting ethnic or racial hatred, discrimination, violence, or pornography, and spreading false harmful information.
+2) You shall not, for military or unlawful purposes or in ways not allowed by Laws and Regulations as well as applicable legal requirements of other countries/regions, a) use, copy, or Distribute the【Orion-14B Series】 Models, or b) create complete or partial Derivatives of the 【Orion-14B Series】 Models.
+3) Your use of the 【Orion-14B Series】 Models (including using the output of the 【Orion-14B Series】 Models) and the creation of Derivatives must not infringe upon the legitimate rights of any Third Party, including but not limited to personal rights such as the rights to likeness, reputation, and privacy, as well as intellectual property rights such as copyrights, patents, trade secrets, and other property rights.
+4) You must clearly attribute the source of the 【Orion-14B Series】 Models to the Licensor and provide a copy of this Agreement to any Third-Party users of the 【Orion-14B Series】 Models and Derivatives.
+5) If you modify the 【Orion-14B Series】 Models to create Derivatives, you must clearly indicate the substantial modifications made, and these modifications shall not violate the license restrictions of this Agreement. You shall not enable, assist, or in any way facilitate Third Parties to violate the license restrictions of this Agreement.
+If you plan to use the 【Orion-14B Series】 Models and Derivatives for Commercial Purposes, please refer to the Registration Form of 【Orion-14B Series】 Models for Commercial Purposes (“Registration Form”, available at 【https://test.orionstar.com/llm-license.html】), complete the registration, and obtain the license for Commercial Purposes. If you obtained the license for Commercial Purposes and use the 【Orion-14B Series】 Models and Derivatives for Commercial Purposes, you must comply with the afore-mentioned license restrictions.
+3. Intellectual Property
+The ownership of the 【Orion-14B Series】 Models and their related intellectual property rights is solely held by the Licensor.
+In any circumstance, without the prior written consent of the Licensor, you are not allowed to use any Logo associated with the Licensor. If your use of the Licensor's Logo in violation of this Agreement causes any losses to the Licensor or others, you will bear full legal responsibility.
+Within the scope of the granted license, you are authorized to modify the Orion-14B series models to create derivative works. You may assert intellectual property rights over the portions of the derivative works that are the product of your creative labor.
+4. Disclaimer and Limitation of Liability
+The 【Orion-14B Series】 Models are provided "AS IS." The Licensor does not provide any express or implied warranties for the 【Orion-14B Series】 Models, including but not limited to stability, ownership, merchantability, non-infringement, or fitness for a specific purpose of the 【Orion-14B Series】 Models and their output results. You assume all responsibilities for the risks and consequences arising from the use, reproduction, and distribution of the 【Orion-14B Series】 Models, and the creation of Derivatives.
+The Licensor complies with Laws and Regulations at all stages of model training, maintaining the legality, authenticity, accuracy, objectivity, and diversity of data and algorithms. The Licensor is not liable for any direct, indirect, incidental consequences, and other losses or damages related to your use, reproduction, and distribution of the 【Orion-14B Series】 Models, and the creation of Derivatives under this Agreement. This includes but is not limited to:
+1) The Licensor is not responsible for data security risks resulting from your use of the 【Orion-14B Series】 Models.
+2) The 【Orion-14B Series】 Models may contain Personal Information. When you use 【Orion-14B Series】 Models, you acknowledge that you are the data processing entity as defined under the Laws and Regulations responsible for determining the processing methods and purposes of Personal Information. You must comply with legal requirements for processing any Personal Information that may be contained in the 【Orion-14B Series】 Models and assume the associated legal responsibilities, as well as the risks
+and consequences of processing Personal Information.
+3) The Licensor is not liable for reputation risks arising from your use of the 【Orion-14B Series】 Models or the output results of the 【Orion-14B Series】 Models.
+4) The Licensor is not liable for intellectual property risks associated with your use of the 【Orion-14B Series】 Models’ output results.
+If your use, reproduction, distribution of the 【Orion-14B Series】 Models, or the creation of Derivatives result in losses to the Licensor, the Licensor has the right to seek compensation from you. For any claims made by Third Parties against the Licensor related to your use, reproduction, and distribution of the 【Orion-14B Series】 Models, or the creation of Derivatives, the Licensor has the right to demand that you defend, compensate, and indemnify the Licensor and protect the Licensor from harm.
+5. Dispute Resolution
+The stipulation, effectiveness, interpretation, performance, modification, and termination of the Agreement, the use, copy, and Distribute of the 【Orion-14B Series】 Models, and dispute resolution associated with your use, copy, and distribution shall be governed by the laws of the mainland of the People's Republic of China (for the purposes of this agreement only, excluding Hong Kong, Macau, and Taiwan), and the application of conflict of laws is excluded.
+ Any disputes arising from the use, copy, or distribution of the 【Orion-14B Series】 Models should first be resolved through amicable negotiations. If negotiations fail, legal proceedings should be initiated in the People's Court at the location of the Licensor.
+6. Effectiveness and Termination of the Agreement
+Your use of the 【Orion-14B Series】 Models signifies that you have read and agreed to be bound by the terms of the Agreement. The Agreement becomes effective from the date of your use of the 【Orion-14B Series】 Models and will terminate from the date you cease using the 【Orion-14B Series】 Models. If you violate any terms or restrictions in the Agreement, the Licensor reserves the right to terminate the Agreement.
+Upon termination of the Agreement, you must immediately cease using the 【Orion-14B Series】Models. Section 4, "Disclaimer and Limitation of Liability," and Section 5, "Dispute Resolution," of this Agreement remain in effect after the termination of this Agreement.
+7. Updates to the Agreement and Contact Information
+The Licensor reserves the right to update the Agreement from time to time.
+【Orion-14B系列】 模型社区许可协议
+版本：1.0
+发布日期：
+    一、 定义
+“许可”是指本协议中定义的使用、复制和分发的条款和条件。
+“模型”是指任何附带的基于机器学习的组件（包括检查点），包括学习的权重、参数（包括 优化器状态）。
+“【Orion-14B系列】 模型”是指基于【Orion-14B-Base】模型构建的一系列具备领域特色的模型，包含 ：
+【Orion-14B-Base】基座模型
+【Orion-14B-Chat】对话模型
+【Orion-14B-LongChat】长上下文模型
+【Orion-14B-Chat-RAG】检索增强模型
+【Orion-14B-Chat-Plugin】插件模型
+【Orion-14B-Base-Int4】基座Int4量化模型
+【Orion-14B-Chat-Int4】对话Int4量化模型
+“数据”是指从与模型一起使用的数据集中提取的信息和/或内容的集合，包括用于训练、预 训练或以其他方式评估模型的数据。数据集中提取的信息和/或内容的集合，可能包含个人 信息或非个人信息。
+“个人信息”是指以电子或者其他方式记录的与已识别或者可识别的自然人有关的各种信息， 不包括匿名化处理后的信息。个人信息的处理包括个人信息的收集、存储、使用、加工、 传输、提供、公开、删除等。
+“输出”是指运行模型的结果，体现在由此产生的信息内容中。
+“训练”是指为模型提供训练数据，以增强模型的预测能力。
+“模型衍生品”是指对【Orion-14B系列】模型的所有修改、基于【Orion-14B系列】模型的工作，或通过将 【Orion-14B系列】模型的权重、参数、激活或输出模式转移到其他模型而创建或初始化的任何其他 模型，以使其他模型的性能与【Orion-14B系列】模型类似，包括但不限于需要使用中间数据表示的 提取方法或基于【Orion-14B系列】模型生成合成数据来训练其他模型的方法。
+ “分发”是指向第三方传输、复制、发布或以其他方式共享模型或模型衍生品，包括将模型作为通过电子或其他远程方式（例如基于 API 或 Web 访问的任何 SaaS 软件或 PaaS 软件） 提供的托管服务。
+ “许可方”是指授予许可的版权所有者或版权所有者实体，包括可能对模型和/或被分发模型拥有权利的个人或实体。本协议下的许可方是：【北京猎户星空科技有限公司】，或其授权可 对任何第三方进行许可的实体或个人。“您”（或“您的”）是指行使本许可授予的权限和/或出于任何目的和在任何使用领域使用模 型的个人或法人实体，属于本协议的被许可人。
+“第三方”是指您之外的任何个人、法人实体或非法人组织。
+“商业用途”是指使用 【Orion-14B系列】模型，直接或间接为实体或个人进行运营、推广或产生收入，或用于任何其他盈利目的。
+    二、 许可及许可限制
+根据本许可协议的条款和条件，许可方特此授予您一个非排他性、全球性、不可转让、不可再许可、可撤销、免版税的版权许可。您可以出于非商业用途使用此许可。许可方对您使用【Orion-14B系列】模型的输出或基于【Orion-14B系列】模型得到的模型衍生品不主张任何权利，但您必须满足如下许可限制条件：
+    1． 您不得出于任何军事或非法目的使用、复制、修改、合并、发布、分发、复制或创建【Orion-14B系列】 模型的全部或部分衍生品。
+    2． 如果您计划将【Orion-14B系列】模型及模型衍生品用作商业用途，应当按照本协议提供的联络方式，事先向许可方登记并获得许可方的书面授权。请点击以下链接进行登记：https://test.orionstar.com/llm-license.html
+    3． 您对【Orion-14B系列】模型的使用和修改（包括使用【Orion-14B系列】 模型的输出或者基于【Orion-14B系列】 模型得到的模型衍生品）不得违反任何国家的法律法规，尤其是中华人民共和国的法律法规，不得侵犯任何第三方的合法权益，包括但不限于肖像权、名誉权、隐私权等 人格权，著作权、专利权、商业秘密等知识产权，或者其他财产权益。
+    4． 您必须向【Orion-14B系列】模型或其模型衍生品的任何第三方使用者提供【Orion-14B系列】模型的来源以及本协议的副本。
+    5． 您修改【Orion-14B系列】 模型得到模型衍生品，必须以显著的方式说明修改的内容，且上述修改不得违反本协议的许可限制条件，也不能允许、协助或以其他方式使得第三方违反本协议中的许可限制条件。
+    三、 知识产权
+    1. 【Orion-14B系列】模型的所有权及其相关知识产权，由许可方单独所有。
+    2. 在任何情况下，未经许可方事先书面同意，您不得使用许可方任何商标、服务标记、 商号、域名、网站名称或其他显著品牌特征（以下统称为"标识"），包括但不限于明示或暗示您自身为“许可方”。未经许可方事先书面同意，您不得将本条款前述标识以单独或结合的任何方式展示、使用或申请注册商标、进行域名注册等，也不得向他人明示或暗示有权展示、使用、或以其他方式处理这些标识的权利。由于您违反本协议使用许可方上述标识 等给许可方或他人造成损失的，由您承担全部法律责任。
+    3. 在许可范围内，您可以对【Orion-14B系列】模型进行修改以得到模型衍生品，对于模型衍生品中您付出创造性劳动的部分，您可以主张该部分的知识产权。
+    四、 免责声明及责任限制
+    1. 在任何情况下，许可方不对您根据本协议使用【Orion-14B系列】模型而产生或与之相关的任何直接、间接、附带的后果、以及其他损失或损害承担责任。若由此导致许可方遭受损失，您应当向许可方承担全部赔偿责任。
+    2. 模型中的模型参数仅仅是一种示例，如果您需要满足其他要求，需自行训练，并遵守相应数据集的许可协议。您将对【Orion-14B系列】模型的输出及模型衍生品所涉及的知识产权风险或与之相关的任何直接、间接、附带的后果、以及其他损失或损害负责。
+    3. 尽管许可方在【Orion-14B系列】模型训练的所有阶段，都坚持努力维护数据的合规性和准确 性，但受限于【Orion-14B系列】模型的规模及其概率固有的随机性因素影响，其输出结果的准确性无法得到保证，模型存在被误导的可能。因此，许可方在此声明，许可方不承担您因使用【Orion-14B系列】模型及其源代码而导致的数据安全问题、声誉风险，或任何涉及【Orion-14B系列】模型被误导、误用、传播或不正当使用而产生的任何风险和责任。
+    4. 本协议所称损失或损害包括但不限于下列任何损失或损害（无论此类损失或损害是不可预见的、可预见的、已知的或其他的）:(i)收入损失；(ii)实际或预期利润损失；(iii)货币使用损失；(iv)预期节约的损失；(v)业务损失；(vi)机会损失；(vii)商誉、声誉损失；(viii)软件的使用损失；或(ix)任何间接、附带的特殊或间接损害损失。
+    5. 除非适用的法律另有要求或经过许可方书面同意，否则许可方将按“现状”授予【Orion-14B系列】模型的许可。针对本协议中的【Orion-14B系列】模型，许可方不提供任何明示、暗示的保证，包括但不限于：关于所有权的任何保证或条件、关于适销性的保证或条件、适用于任何特定目的的保证或条件、过去、现在或未来关于【Orion-14B系列】模型不侵权的任何类型的保证、以及因任何交易过程、贸易使用（如建议书、规范或样品）而产生的任何保证。您将对其通过使用、复制或再分发等方式利用【Orion-14B系列】模型所产生的风险与后果，独自承担责任。
+    6. 您充分知悉并理解同意，【Orion-14B系列】模型中可能包含个人信息。您承诺将遵守所有适用的法律法规进行个人信息的处理，特别是遵守《中华人民共和国个人信息保护法》的相关规定。请注意，许可方给予您使用【Orion-14B系列】模型的授权，并不意味着您已经获得处理相关个人信息的合法性基础。您作为独立的个人信息处理者，需要保证在处理【Orion-14B系列】模型中可能包含的个人信息时，完全符合相关法律法规的要求，包括但不限于获得个人信息主体的授权同意等，并愿意独自承担由此可能产生的任何风险和后果。
+    7. 您充分理解并同意，许可方有权依合理判断对违反有关法律法规或本协议规定的行为进行处理，对您的违法违规行为采取适当的法律行动，并依据法律法规保存有关信息向有关部门报告等，您应独自承担由此而产生的一切法律责任。
+    五、 研究、教育和学术目的
+    1. 根据本许可协议的条款和条件，本着对学术界做出贡献的精神，许可方鼓励非营利性学术机构的师生将【Orion-14B系列】模型用于研究、教育和学术目的。
+    2. 进一步的，如您以研究、教育和学术目的使用【Orion-14B系列】模型，您可以在开展相关研究、教育前，将您的机构名称、使用情况以及联系方式以邮件方式向我们进行提前告知，我们的联系邮箱为【ai@orionstar.com】，我们将可能基于您的联系方式，向您推送【Orion-14B系列】模型的相关更新资讯，以便您更好地开展研究、教育和学术工作。
+    六、 品牌曝光与显著标识
+    1. 您同意并理解，如您将您基于【Orion-14B系列】模型二次开发的模型衍生品在国内外的开源社区提供开源许可的，您需要在该开源社区以显著方式标注该模型衍生品系基于【Orion-14B系列】模型进行的二次开发，标注内容包括但不限于“【Orion-14B  Series】 Inside”以及与【Orion-14B系列】模型相关的品牌的其他元素。
+    2. 您同意并理解，如您将【Orion-14B系列】模型二次开发的模型衍生品参加国内外任何组织和个人举行的排名活动，包括但不限于针对模型性能、准确度、算法、算力等任何维度的排名活动，您均需在模型说明中以显著方式标注该模型衍生品系基于【Orion-14B系列】模型进行的二次开发，标注内容包括但不限于“【Orion-14B  Series】Inside”以及与【Orion-14B系列】模型相关的品牌的其他元素。
+    七、 其他
+    1. 许可方在法律法规许可的范围内对协议条款享有最终解释权。
+    2. 本协议的订立、效力、解释、履行、修改和终止，使用【Orion-14B系列】模型以及争议的解 决均适用中华人民共和国大陆地区（仅为本协议之目的，不包括香港、澳门和台湾）法律，并排除冲突法的适用。
+    3. 因使用【Orion-14B系列】模型而发生的任何争议，各方应首先通过友好协商的方式加以解决。协商不成时，向许可方所在地人民法院提起诉讼。

README.md ADDED Viewed

	@@ -0,0 +1,345 @@

+---
+license: other
+license_name: orion
+license_link: https://huggingface.co/OrionStarAI/Orion-14B-LongChat/blob/main/ModelsCommunityLicenseAgreement
+widget:
+  - text: "Hi!"
+    output:
+      text: "Hello! How can I help you today?"
+pipeline_tag: text-generation
+---
+<!-- markdownlint-disable first-line-h1 -->
+<!-- markdownlint-disable html -->
+![](./assets/imgs/assets_imgs_orion_start.PNG)
+<div align="center">
+<h1>
+  Orion-14B-LongChat
+</h1>
+</div>
+<div align="center">
+<h4 align="center">
+    <p>
+        <b>🌐English</b> |
+        <a href="https://huggingface.co/OrionStarAI/Orion-14B-LongChat/blob/main/README_cn.md">🇨🇳中文</a><br><br>
+        🤗 <a href="https://huggingface.co/OrionStarAI" target="_blank">HuggingFace Mainpage</a> | 🤖 <a href="https://modelscope.cn/organization/OrionStarAI" target="_blank">ModelScope Mainpage</a><br>🎬 <a href="https://huggingface.co/spaces/OrionStarAI/Orion-14B-App-Demo" target="_blank">HuggingFace Demo</a> | 🎫 <a href="https://modelscope.cn/studios/OrionStarAI/Orion-14B-App-Demo/summary" target="_blank">ModelScope Demo</a><br>📖 <a href="https://github.com/OrionStarAI/Orion/blob/master/doc/Orion14B_v3.pdf" target="_blank">Tech Report</a>
+    </p>
+</h4>
+</div>
+# Table of Contents
+- [📖 Model Introduction](#model-introduction)
+- [🔗 Model Download](#model-download)
+- [🔖 Model Benchmark](#model-benchmark)
+- [📊 Model Inference](#model-inference)
+- [🥇 Company Introduction](#company-introduction)
+- [📜 Declarations & License](#declarations-license)
+# 1. Model Introduction
+- Orion-14B-LongChat is built on Orion-14B and further trained on a longer text corpus. Orion-14B-LongChat can handle contexts of over 200K tokens and performs well at that length.
+- The Orion-14B series models exhibit the following features:
+  - Among models at the 20B-parameter scale, the Orion-14B-Base model shows outstanding performance in comprehensive evaluations.
+  - Strong multilingual capabilities, significantly outperforming in Japanese and Korean testsets.
+  - The fine-tuned models demonstrate strong adaptability, excelling in human-annotated blind tests.
+  - The long-chat version supports extremely long texts, performing exceptionally well at a token length of 200k and can support up to a maximum of 320k.
+  - The quantized versions reduce model size by 70%, improve inference speed by 30%, with performance loss less than 1%.
+ <table style="border-collapse: collapse; width: 100%;">
+   <tr>
+     <td style="border: none; padding: 10px; box-sizing: border-box;">
+       <img src="./assets/imgs/opencompass_en.png" alt="opencompass" style="width: 100%; height: auto;">
+     </td>
+     <td style="border: none; padding: 10px; box-sizing: border-box;">
+       <img src="./assets/imgs/model_cap_en.png" alt="modelcap" style="width: 100%; height: auto;">
+     </td>
+   </tr>
+ </table>
+- The Orion-14B series includes the following models:
+  - **Orion-14B-Base:**  A multilingual large language foundational model with 14 billion parameters, pretrained on a diverse dataset of 2.5 trillion tokens.
+  - **Orion-14B-Chat:**  A chat model fine-tuned on a high-quality corpus, aiming to provide an excellent interactive experience for users in the large model community.
+  - **Orion-14B-LongChat:**  The long-context version excels at handling extremely lengthy texts, performing exceptionally well at a token length of 200k and supporting up to a maximum of 320k.
+  - **Orion-14B-Chat-RAG:**  A chat-model fine-tuned on a custom retrieval augmented generation dataset, achieving superior performance in retrieval augmented generation tasks.
+  - **Orion-14B-Chat-Plugin:**  A chat-model specifically tailored for plugin and function calling tasks, ideal for agent-related scenarios where the LLM acts as a plugin and function call system.
+  - **Orion-14B-Base-Int4:**  A quantized base model utilizing 4-bit integer weights. It significantly reduces the model size by 70% and increases the inference speed by 30% while incurring a minimal performance loss of only 1%.
+  - **Orion-14B-Chat-Int4:**  A quantized chat model utilizing 4-bit integer weights.
+# 2. Model Download
+Model release and download links are provided in the table below:
+| Model Name              | HuggingFace Download Links                                                        | ModelScope Download Links                                                                       |
+|-------------------------|-----------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------|
+| ⚾Orion-14B-Base        | [Orion-14B-Base](https://huggingface.co/OrionStarAI/Orion-14B-Base)               | [Orion-14B-Base](https://modelscope.cn/models/OrionStarAI/Orion-14B-Base/summary)               |
+| 😛Orion-14B-Chat        | [Orion-14B-Chat](https://huggingface.co/OrionStarAI/Orion-14B-Chat)               | [Orion-14B-Chat](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat/summary)               |
+| 📃Orion-14B-LongChat    | [Orion-14B-LongChat](https://huggingface.co/OrionStarAI/Orion-14B-LongChat)       | [Orion-14B-LongChat](https://modelscope.cn/models/OrionStarAI/Orion-14B-LongChat/summary)       |
+| 🔎Orion-14B-Chat-RAG    | [Orion-14B-Chat-RAG](https://huggingface.co/OrionStarAI/Orion-14B-Chat-RAG)       | [Orion-14B-Chat-RAG](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat-RAG/summary)       |
+| 🔌Orion-14B-Chat-Plugin | [Orion-14B-Chat-Plugin](https://huggingface.co/OrionStarAI/Orion-14B-Chat-Plugin) | [Orion-14B-Chat-Plugin](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat-Plugin/summary) |
+| 💼Orion-14B-Base-Int4   | [Orion-14B-Base-Int4](https://huggingface.co/OrionStarAI/Orion-14B-Base-Int4)     | [Orion-14B-Base-Int4](https://modelscope.cn/models/OrionStarAI/Orion-14B-Base-Int4/summary)     |
+| 📦Orion-14B-Chat-Int4   | [Orion-14B-Chat-Int4](https://huggingface.co/OrionStarAI/Orion-14B-Chat-Int4)     | [Orion-14B-Chat-Int4](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat-Int4/summary)     |
+# 3. Model Benchmarks
+## 3.1. Base Model Orion-14B-Base Benchmarks
+### 3.1.1. LLM evaluation results on examination and professional knowledge
+| Model              | C-Eval   | CMMLU    | MMLU     | AGIEval  | Gaokao   | BBH      |
+|--------------------|----------|----------|----------|----------|----------|----------|
+| LLaMA2-13B         |   41.4   |   38.4   |   55.0   |   30.9   |   18.2   |   45.6   |
+| Skywork-13B        |   59.1   |   61.4   |   62.7   |   43.6   |   56.1   |   48.3   |
+| Baichuan2-13B      |   59.0   |   61.3   |   59.5   |   37.4   |   45.6   |   49.0   |
+| QWEN-14B           |   71.7   |   70.2   |   67.9   |   51.9   | **62.5** |   53.7   |
+| InternLM-20B       |   58.8   |   59.0   |   62.1   |   44.6   |   45.5   |   52.5   |
+| **Orion-14B-Base** | **72.9** | **70.6** | **69.9** | **54.7** |   62.1   | **56.5** |
+### 3.1.2. LLM evaluation results on language understanding and common knowledge
+| Model             |RACE-middle|RACE-high |HellaSwag | PIQA     | Lambada  | WSC      |
+|--------------------|----------|----------|----------|----------|----------|----------|
+| LLaMA 2-13B        |   63.0   |   58.9   |   77.5   |   79.8   |   76.5   |   66.3   |
+| Skywork-13B        |   87.6   |   84.1   |   73.7   |   78.3   |   71.8   |   66.3   |
+| Baichuan 2-13B     |   68.9   |   67.2   |   70.8   |   78.1   |   74.1   |   66.3   |
+| QWEN-14B           |   93.0   |   90.3   | **80.2** |   79.8   |   71.4   |   66.3   |
+| InternLM-20B       |   86.4   |   83.3   |   78.1   | **80.3** |   71.8   |   68.3   |
+| **Orion-14B-Base** | **93.2** | **91.3** |   78.5   |   79.5   | **78.8** | **70.2** |
+### 3.1.3. LLM evaluation results of OpenCompass testsets
+| Model | Average  | Examination | Language | Knowledge | Understanding | Reasoning |
+|------------------|----------|----------|----------|----------|----------|----------|
+| LLaMA 2-13B      |   47.3   |   45.2   |   47.0   |   58.3   |   50.9   |   43.6   |
+| Skywork-13B      |   53.6   |   61.1   |   51.3   |   52.7   |   64.5   |   45.2   |
+| Baichuan 2-13B   |   49.4   |   51.8   |   47.5   |   48.9   |   58.1   |   44.2   |
+| QWEN-14B         |   62.4   |   71.3   |   52.67  |   56.1   |   68.8   |   60.1   |
+| InternLM-20B     |   59.4   |   62.5   |   55.0   | **60.1** |   67.3   |   54.9   |
+|**Orion-14B-Base**| **64.3** | **71.4** | **55.0** |   60.0   | **71.9** | **61.6** |
+### 3.1.4. Comparison of LLM performances on Japanese testsets
+| Model             |**Average**|  JCQA    |  JNLI    |  MARC    |  JSQD    |  JQK     |  XLS     |  XWN     |  MGSM    |
+|--------------------|----------|----------|----------|----------|----------|----------|----------|----------|----------|
+| PLaMo-13B          |   52.3   |   56.7   |   42.8   |   95.8   |   70.6   |   71.0   |   8.70   |   70.5   |   2.40   |
+| WebLab-10B         |   50.7   |   66.6   |   53.7   |   82.1   |   62.9   |   56.2   |   10.0   |   72.0   |   2.40   |
+| ELYZA-jp-7B        |   48.8   |   71.7   |   25.3   |   86.6   |   70.8   |   64.1   |   2.50   |   62.1   |   7.20   |
+| StableLM-jp-7B     |   51.1   |   33.4   |   43.3   | **96.7** |   70.6   |   78.1   |   10.7   |   72.8   |   2.80   |
+| LLaMA 2-13B        |   46.3   |   75.0   |   47.6   |   38.8   |   76.1   |   67.7   |   18.1   |   63.2   |   10.4   |
+| Baichuan 2-13B     |   57.1   |   73.7   |   31.3   |   91.6   |   80.5   |   63.3   |   18.6   |   72.2   |   25.2   |
+| QWEN-14B           |   65.8   |   85.9   |   60.7   |   97.0   |   83.3   |   71.8   |   18.8   |   70.6   |   38.0   |
+| Yi-34B             |   67.1   |   83.8   |   61.2   |   95.2   | **86.1** |   78.5   | **27.2** |   69.2   |   35.2   |
+| **Orion-14B-Base** | **69.1** | **88.2** | **75.8** |   94.1   |   75.7   | **85.1** |   17.3   | **78.8** | **38.0** |
+### 3.1.5. Comparison of LLM performances on Korean testsets. n = 0 and n = 5 stand for n-shot prompts used in the evaluation
+|Model      | **Average**<br>n=0&nbsp;&nbsp;n=5 | HellaSwag<br>n=0&nbsp;&nbsp;n=5 | COPA<br> n=0&nbsp;&nbsp;n=5 | BoolQ<br>n=0&nbsp;&nbsp;n=5 | SentiNeg<br>n=0&nbsp;&nbsp;n=5|
+|------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
+| KoGPT            |  53.0   &nbsp;&nbsp;   70.1  |  55.9   &nbsp;&nbsp;   58.3  |  73.5   &nbsp;&nbsp;   72.9  |  45.1   &nbsp;&nbsp;   59.8  |  37.5   &nbsp;&nbsp;   89.4  |
+| Polyglot-ko-13B  |  69.6   &nbsp;&nbsp;   73.7  |**59.5** &nbsp;&nbsp; **63.1**|**79.4** &nbsp;&nbsp; **81.1**|  48.2   &nbsp;&nbsp;   60.4  |  91.2   &nbsp;&nbsp;   90.2  |
+| LLaMA 2-13B      |  46.7   &nbsp;&nbsp;   63.7  |  41.3   &nbsp;&nbsp;   44.0  |  59.3   &nbsp;&nbsp;   63.8  |  34.9   &nbsp;&nbsp;   73.8  |  51.5   &nbsp;&nbsp;   73.4  |
+| Baichuan 2-13B   |  52.1   &nbsp;&nbsp;   58.7  |  39.2   &nbsp;&nbsp;   39.6  |  60.6   &nbsp;&nbsp;   60.6  |  58.4   &nbsp;&nbsp;   61.5  |  50.3   &nbsp;&nbsp;   72.9  |
+| QWEN-14B         |  53.8   &nbsp;&nbsp;   73.7  |  45.3   &nbsp;&nbsp;   46.8  |  64.9   &nbsp;&nbsp;   68.9  |  33.4   &nbsp;&nbsp;   83.5  |  71.5   &nbsp;&nbsp;   95.7  |
+| Yi-34B           |  54.2   &nbsp;&nbsp;   72.1  |  44.6   &nbsp;&nbsp;   44.7  |  58.0   &nbsp;&nbsp;   60.6  |  65.9   &nbsp;&nbsp;   90.2  |  48.3   &nbsp;&nbsp;   92.9  |
+|**Orion-14B-Base**|**74.5** &nbsp;&nbsp; **79.6**|  47.0   &nbsp;&nbsp;   49.6  |  77.7   &nbsp;&nbsp;   79.4  |**81.6** &nbsp;&nbsp; **90.7**|**92.4** &nbsp;&nbsp; **98.7**|
+### 3.1.6. Multilingual evaluation
+| Model              | Train Lang | Japanese | Korean   | Chinese  |  English |
+|--------------------|------------|----------|----------|----------|----------|
+| PLaMo-13B          |  En,Jp     |   52.3   |   *      |   *      |   *      |
+| Weblab-10B         |  En,Jp     |   50.7   |   *      |   *      |   *      |
+| ELYZA-jp-7B        |  En,Jp     |   48.8   |   *      |   *      |   *      |
+| StableLM-jp-7B     |  En,Jp     |   51.1   |   *      |   *      |   *      |
+| KoGPT-6B           |  En,Ko     |   *      |   70.1   |   *      |   *      |
+| Polyglot-ko-13B    |  En,Ko     |   *      |   70.7   |   *      |   *      |
+| Baichuan2-13B      |  Multi     |   57.1   |   58.7   |   50.8   |   57.1   |
+| Qwen-14B           |  Multi     |   65.8   |   73.7   |   64.5   |   65.4   |
+| Llama2-13B         |  Multi     |   46.3   |   63.7   |   41.4   |   55.3   |
+| Yi-34B             |  Multi     |   67.1   |   72.2   |   58.7   | **68.8** |
+| **Orion-14B-Base** |  Multi     | **69.1** | **79.5** | **67.9** |   67.3   |
+## 3.2. Chat Model Orion-14B-Chat Benchmarks
+### 3.2.1. Chat model subjective evaluation of MTBench
+| Model        | First-Turn | Second-Turn | **Average** |
+|----------------------|----------|----------|----------|
+| Baichuan2-13B-Chat   |   7.05   |   6.47   |   6.76   |
+| Qwen-14B-Chat        |   7.30   |   6.62   |   6.96   |
+| Llama2-13B-Chat      |   7.10   |   6.20   |   6.65   |
+| InternLM-20B-Chat    |   7.03   |   5.93   |   6.48   |
+| **Orion-14B-Chat**   | **7.68** | **7.07** | **7.37** |
+\* use vllm for inference
+### 3.2.2. Chat model subjective evaluation of AlignBench
+| Model              | Math.  |  Logi. | Basic. | Chi.   | Comp.  | Writ.  | Role.  | Prof.  |**Avg.**|
+|--------------------|--------|--------|--------|--------|--------|--------|--------|--------|--------|
+| Baichuan2-13B-Chat |  3.76  |  4.07  |  6.22  |  6.05  |  7.11  |  6.97  |  6.75  |  6.43  |  5.25  |
+| Qwen-14B-Chat      |**4.91**|**4.71**|**6.90**|  6.36  |  6.74  |  6.64  |  6.59  |  6.56  |**5.72**|
+| Llama2-13B-Chat    |  3.05  |  3.79  |  5.43  |  4.40  |  6.76  |  6.63  |  6.99  |  5.65  |  4.70  |
+| InternLM-20B-Chat  |  3.39  |  3.92  |  5.96  |  5.50  |**7.18**|  6.19  |  6.49  |  6.22  |  4.96  |
+| **Orion-14B-Chat** |  4.00  |  4.24  |  6.18  |**6.57**|  7.16  |**7.36**|**7.16**|**6.99**|  5.51  |
+\* use vllm for inference
+## 3.3. LongChat Model Orion-14B-LongChat Benchmarks
+### 3.3.1. LongChat evaluation of LongBench
+| Model           | NarrativeQA|MultiFieldQA-en|MultiFieldQA-zh| DuReader  | QMSum     | VCSUM     | TREC      | TriviaQA  | LSHT      |RepoBench-P|
+|--------------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
+| GPT-3.5-Turbo-16k        | **23.60** | **52.30** | **61.20** |   28.70   |   23.40   | **16.00** |   68.00   | **91.40** |   29.20   |   53.60   |
+| LongChat-v1.5-7B-32k     |   16.90   |   41.40   |   29.10   |   19.50   |   22.70   |    9.90   |   63.50   |   82.30   |   23.20   |   55.30   |
+| Vicuna-v1.5-7B-16k       |   19.40   |   38.50   |   43.00   |   19.30   |   22.80   |   15.10   |   71.50   |   86.20   |   28.80   |   43.50   |
+| Yi-6B-200K               |   14.11   |   36.74   |   22.68   |   14.01   |   20.44   |    8.08   |   72.00   |   86.61   |   38.00   | **63.29** |
+| Orion-14B-LongChat       |   19.47   |   48.11   |   55.84   | **37.02** | **24.87** |   15.44   | **77.00** |   89.12   | **45.50** |   54.31   |
+## 3.4. Chat RAG Model Benchmarks
+### 3.4.1. LLM evaluation results of self-built RAG testsets
+|Model|Effectiveness of Response (Keyword)|*Effectiveness of Response (subjective evaluation)|Quoting Ability|Fallback Ability|*AutoQA|*Data Extraction|
+|---------------------|------|------|------|------|------|------|
+| Baichuan2-13B-Chat  |  85  |  76  |  1   |  0   |  69  |  51  |
+| Qwen-14B-Chat       |  79  |  77  |  75  |  47  |  68  |  72  |
+| Qwen-72B-Chat(Int4) |  87  |  89  |  90  |  32  |  67  |  76  |
+| GPT-4               |  91  |  94  |  96  |  95  |  75  |  86  |
+| Orion-14B-Chat-RAG  |  86  |  87  |  91  |  97  |  73  |  71  |
+ \* means manual assessment
+## 3.5. Chat Plugin Model Orion-14B-Chat-Plugin Benchmarks
+### 3.5.1. LLM evaluation results of self-built plugin testsets
+|Model |Intent Recognition with Full Params |Intent Recognition with Missing Params |Non-Plugin Invocation Recognition |
+|-----------------------|--------|-----------|--------|
+| Baichuan2-13B-Chat    |   25   |   0       |   0    |
+| Qwen-14B-Chat         |   55   |   0       |   50   |
+| GPT-4                 | **95** |   52.38   |   70   |
+| Orion-14B-Chat-Plugin |  92.5  | **60.32** | **90** |
+## 3.6. Quantized Model Orion-14B-Base-Int4 Benchmarks
+### 3.6.1. Comparison of before and after quantization
+|Model |Size(GB)|Inference Speed(tokens/s)|C-Eval|CMMLU|MMLU|RACE|HellaSwag|
+|-------------------------|-------|-----|------|------|------|------|------|
+| Orion-14B-Base          |  28.0 | 135 | 72.8 | 70.6 | 70.0 | 93.3 | 78.5 |
+| Orion-14B-Base-Int4     |  8.3  | 178 | 71.8 | 69.8 | 69.2 | 93.1 | 78.0 |
+# 4. Model Inference
+Model weights, source code, and configuration needed for inference are published on Hugging Face, and the download link
+is available in the table at the beginning of this document. We demonstrate various inference methods here, and the
+program will automatically download the necessary resources from Hugging Face.
+## 4.1. Python Code
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.generation.utils import GenerationConfig
+tokenizer = AutoTokenizer.from_pretrained("OrionStarAI/Orion-14B", use_fast=False, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained("OrionStarAI/Orion-14B", device_map="auto",
+                                             torch_dtype=torch.bfloat16, trust_remote_code=True)
+model.generation_config = GenerationConfig.from_pretrained("OrionStarAI/Orion-14B")
+messages = [{"role": "user", "content": "Hello, what is your name? "}]
+response = model.chat(tokenizer, messages, streaming=False)
+print(response)
+```
+In the above Python code, the model is loaded with `device_map='auto'` to utilize all available GPUs. To specify the
+device, you can use something like `export CUDA_VISIBLE_DEVICES=0,1` (using GPUs 0 and 1).
+## 4.2. Command Line Tool
+```shell
+CUDA_VISIBLE_DEVICES=0 python cli_demo.py
+```
+This command-line tool is designed for chat scenarios, and thus, it does not support calling the base model.
+## 4.3. Direct Script Inference
+```shell
+# base model
+CUDA_VISIBLE_DEVICES=0 python demo/text_generation_base.py --model OrionStarAI/Orion-14B --tokenizer OrionStarAI/Orion-14B --prompt hello
+# chat model
+CUDA_VISIBLE_DEVICES=0 python demo/text_generation.py --model OrionStarAI/Orion-14B-Chat --tokenizer OrionStarAI/Orion-14B-Chat --prompt hi
+```
+## 4.4. Example Output
+### 4.4.1. Casual Chat
+`````
+User: Hello
+Orion-14B: Hello! How can I help you today? If you have any questions or need assistance, feel free to ask.
+User: What functions do you have?
+Orion-14B: As an AI language model, I have a wide range of functions and capabilities, including but not limited to:
+  Answering general knowledge questions: I can provide information on various topics such as history, science, geography, and more.
+  Providing definitions and explanations: I can explain complex concepts and provide definitions for words and phrases.
+  Assisting with tasks: I can help with simple tasks like setting reminders, converting units of measurement, and even ordering food from restaurants.
+  Generating text: I can write short stories, essays, articles, and other types of content based on prompts given to me.
+  Translating languages: I can translate text from one language to another.
+  Chatting: I can engage in conversation with users and respond to their queries in a conversational manner.
+  Summarizing long texts: I can summarize longer texts like articles, books, and research papers.
+  Generating random numbers and names: I can generate random numbers within a specified range and also suggest names based on certain criteria.
+  Playing games: I can play simple games like Hangman, Trivia, and more.
+  Providing recommendations: I can recommend movies, music, books, and other forms of entertainment based on user preferences.
+  These are just some of the many functions that I am capable of performing. If you have a specific task in mind, feel free to ask me about it.
+User: Tell me a joke.
+Orion-14B: Sure, here's a classic one-liner: Why don't scientists trust atoms? Because they make up everything.
+`````
+### 4.4.2. Japanese & Korean Chat
+`````
+User：自己を紹介してください
+Orion-14B：こんにちは、私の名前はChatMaxで、OrionStarによって開発されたAIアシスタントです。私はユーザーに正確で、客観的で、有用な回答を提供し、彼らが問題を解決し、情報を取得するのを助けることができます。
+User：자기소개를 해주세요.
+Orion-14B：안녕하세요, 제 이름은 ChatMax입니다. OrionStar에서 개발한 AI 어시스턴트입니다. 저는 사용자들에게 정확하고 객관적이며 유용한 답변을 제공하여 문제를 해결하고 정보를 얻는 데 도움을 줄 수 있습니다.
+`````
+# 5. Declarations, License
+## 5.1. Declarations
+We strongly urge all users not to use the Orion-14B model for any activities that may harm national or social security or violate the law.
+Additionally, we request users not to use the Orion-14B model for internet services without proper security review and filing.
+We hope all users abide by this principle to ensure that technological development takes place in a regulated and legal environment.
+We have done our best to ensure the compliance of the data used in the model training process. However, despite our
+significant efforts, unforeseen issues may still arise due to the complexity of the model and data. Therefore, if any
+problems arise due to the use of the Orion-14B open-source model, including but not limited to data security
+issues, public opinion risks, or any risks and issues arising from the model being misled, abused, disseminated, or
+improperly utilized, we will not assume any responsibility.
+## 5.2. License
+Community use of the Orion-14B series models
+- For code, please comply with  [Apache License Version 2.0](./LICENSE)<br>
+- For model, please comply with [【Orion-14B Series】 Models Community License Agreement](./ModelsCommunityLicenseAgreement)
+# 6. Company Introduction
+OrionStar is a leading global service robot solutions company, founded in September 2016. OrionStar is dedicated to
+using artificial intelligence technology to create the next generation of revolutionary robots, allowing people to break
+free from repetitive physical labor and making human work and life more intelligent and enjoyable. Through technology,
+OrionStar aims to make society and the world a better place.
+OrionStar possesses fully self-developed end-to-end artificial intelligence technologies, such as voice interaction and
+visual navigation. It integrates product development capabilities and technological application capabilities. Based on
+the Orion robotic arm platform, it has launched products such as OrionStar AI Robot Greeting, AI Robot Greeting Mini,
+Lucki, Coffee Master, and established the open platform OrionOS for Orion robots. Following the philosophy of "Born for
+Truly Useful Robots", OrionStar empowers more people through AI technology.
+**The core strength of OrionStar lies in possessing end-to-end AI application capabilities,** including big data preprocessing, large model pretraining, fine-tuning, prompt engineering, agent, etc.  With comprehensive end-to-end model training capabilities, including systematic data processing workflows and the parallel model training capability of hundreds of GPUs, it has been successfully applied in various industry scenarios such as government affairs, cloud services, international e-commerce, and fast-moving consumer goods.
+Companies with demands for deploying large-scale model applications are welcome to contact us.
+**Enquiry Hotline: 400-898-7779**<br>
+**E-mail: ai@orionstar.com**
+<div align="center">
+  <img src="./assets/imgs/assets_imgs_wechat_group.jpg" alt="wechat" width="40%" />
+</div>

README_cn.md ADDED Viewed

	@@ -0,0 +1,346 @@

+---
+license: other
+license_name: orion
+license_link: https://huggingface.co/OrionStarAI/Orion-14B-LongChat/blob/main/ModelsCommunityLicenseAgreement
+widget:
+  - text: "你好,你叫什么名字?"
+    output:
+      text: "你好!我是一个人工智能助手,没有固定的名字。你可以随意称呼我。有什么我可以帮助你的吗?"
+pipeline_tag: text-generation
+---
+<!-- markdownlint-disable first-line-h1 -->
+<!-- markdownlint-disable html -->
+<div align="center">
+  <img src="./assets/imgs/assets_imgs_orion_start.PNG" alt="logo" width="50%" />
+</div>
+<div align="center">
+<h1>
+  Orion-14B-LongChat
+</h1>
+</div>
+<div align="center">
+<h4 align="center">
+    <p>
+        <b>🇨🇳中文</b> |
+        <a href="https://huggingface.co/OrionStarAI/Orion-14B-LongChat/blob/main/README.md">🌐English</a><br><br>
+        🤗 <a href="https://huggingface.co/OrionStarAI" target="_blank">HuggingFace Mainpage</a> | 🤖 <a href="https://modelscope.cn/organization/OrionStarAI" target="_blank">ModelScope Mainpage</a><br>🎬 <a href="https://huggingface.co/spaces/OrionStarAI/Orion-14B-App-Demo" target="_blank">HuggingFace Demo</a> | 🎫 <a href="https://modelscope.cn/studios/OrionStarAI/Orion-14B-App-Demo/summary" target="_blank">ModelScope Demo</a><br>📖 <a href="https://github.com/OrionStarAI/Orion/blob/master/doc/Orion14B_v3.pdf" target="_blank">Tech Report</a>
+    </p>
+</h4>
+</div>
+# 目录
+- [📖 模型介绍](#模型介绍)
+- [🔗 下载路径](#下载路径)
+- [🔖 评估结果](#评估结果)
+- [📊 模型推理](#模型推理)
+- [🥇 企业介绍](#企业介绍)
+- [📜 声明协议](#声明协议)
+# 1. 模型介绍
+- Orion-14B-LongChat是在Orion-14B的基础上使用更长的文本语料进行优化训练得到的模型。Orion-14B-LongChat可以处理超过200K token的上下文并表现出色。
+- Orion-14B系列大模型有以下几个特点：
+  - 基座20B参数级别大模型综合评测效果表现优异
+  - 多语言能力强，在日语、韩语测试集上显著领先
+  - 微调模型适应性强，在人类标注盲测中，表现突出
+  - 长上下文版本支持超长文本，在200k token长度上效果优异，最长可支持320k
+  - 量化版本模型大小缩小70%，推理速度提升30%，性能损失小于1%
+ <table style="border-collapse: collapse; width: 100%;">
+   <tr>
+     <td style="border: none; padding: 10px; box-sizing: border-box;">
+       <img src="./assets/imgs/opencompass_en.png" alt="opencompass" style="width: 100%; height: auto;">
+     </td>
+     <td style="border: none; padding: 10px; box-sizing: border-box;">
+       <img src="./assets/imgs/model_cap_en.png" alt="modelcap" style="width: 100%; height: auto;">
+     </td>
+   </tr>
+ </table>
+- 具体而言，Orion-14B系列大语言模型包含:
+  - **Orion-14B-Base:**  基于2.5万亿令牌多样化数据集训练出的140亿参数量级的多语言基座模型。
+  - **Orion-14B-Chat:**  基于高质量语料库微调的对话类模型，旨在为大模型社区提供更好的用户交互体验。
+  - **Orion-14B-LongChat:**  在200k token长度上效果优异，最长可支持320k，在长文本评估集上性能比肩专有模型。
+  - **Orion-14B-Chat-RAG:**  在一个定制的检索增强生成数据集上进行微调的聊天模型，在检索增强生成任务中取得了卓越的性能。
+  - **Orion-14B-Chat-Plugin:**  专门针对插件和函数调用任务定制的聊天模型，非常适用于使用代理的相关场景，其中大语言模型充当插件和函数调用系统。
+  - **Orion-14B-Base-Int4:**  一个使用4位整数进行量化的基座模型。它将模型大小显著减小了70%，同时提高了推理速度30%，仅引入了1%的最小性能损失。
+  - **Orion-14B-Chat-Int4:**  一个使用4位整数进行量化的对话模型。
+# 2. 下载路径
+发布模型和下载链接见下表：
+| 模型名称              | HuggingFace下载链接                                                                | ModelScope下载链接                                                                               |
+|---------------------|-----------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------|
+| ⚾ 基座模型           | [Orion-14B-Base](https://huggingface.co/OrionStarAI/Orion-14B-Base)               | [Orion-14B-Base](https://modelscope.cn/models/OrionStarAI/Orion-14B-Base/summary)              |
+| 😛 对话模型           | [Orion-14B-Chat](https://huggingface.co/OrionStarAI/Orion-14B-Chat)               | [Orion-14B-Chat](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat/summary)              |
+| 📃 长上下文模型        | [Orion-14B-LongChat](https://huggingface.co/OrionStarAI/Orion-14B-LongChat)       | [Orion-14B-LongChat](https://modelscope.cn/models/OrionStarAI/Orion-14B-LongChat/summary)      |
+| 🔎 检索增强模型        | [Orion-14B-Chat-RAG](https://huggingface.co/OrionStarAI/Orion-14B-Chat-RAG)       | [Orion-14B-Chat-RAG](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat-RAG/summary)      |
+| 🔌 插件模型           | [Orion-14B-Chat-Plugin](https://huggingface.co/OrionStarAI/Orion-14B-Chat-Plugin) | [Orion-14B-Chat-Plugin](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat-Plugin/summary)|
+| 💼 基座Int4量化模型    | [Orion-14B-Base-Int4](https://huggingface.co/OrionStarAI/Orion-14B-Base-Int4)     | [Orion-14B-Base-Int4](https://modelscope.cn/models/OrionStarAI/Orion-14B-Base-Int4/summary)    |
+| 📦 对话Int4量化模型    | [Orion-14B-Chat-Int4](https://huggingface.co/OrionStarAI/Orion-14B-Chat-Int4)     | [Orion-14B-Chat-Int4](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat-Int4/summary)    |
+# 3. 评估结果
+## 3.1. 基座模型Orion-14B-Base评估
+### 3.1.1. 专业知识与试题评估结果
+| 模型名称            | C-Eval   | CMMLU    | MMLU     | AGIEval  | Gaokao   | BBH      |
+|--------------------|----------|----------|----------|----------|----------|----------|
+| LLaMA2-13B         |   41.4   |   38.4   |   55.0   |   30.9   |   18.2   |   45.6   |
+| Skywork-13B        |   59.1   |   61.4   |   62.7   |   43.6   |   56.1   |   48.3   |
+| Baichuan2-13B      |   59.0   |   61.3   |   59.5   |   37.4   |   45.6   |   49.0   |
+| QWEN-14B           |   71.7   |   70.2   |   67.9   |   51.9   | **62.5** |   53.7   |
+| InternLM-20B       |   58.8   |   59.0   |   62.1   |   44.6   |   45.5   |   52.5   |
+| **Orion-14B-Base** | **72.9** | **70.6** | **69.9** | **54.7** |   62.1   | **56.5** |
+### 3.1.2. 理解与通识评估结果
+| 模型名称            |RACE-middle|RACE-high| HellaSwag| PIQA     | Lambada  | WSC      |
+|--------------------|----------|----------|----------|----------|----------|----------|
+| LLaMA 2-13B        |   63.0   |   58.9   |   77.5   |   79.8   |   76.5   |   66.3   |
+| Skywork-13B        |   87.6   |   84.1   |   73.7   |   78.3   |   71.8   |   66.3   |
+| Baichuan 2-13B     |   68.9   |   67.2   |   70.8   |   78.1   |   74.1   |   66.3   |
+| QWEN-14B           |   93.0   |   90.3   | **80.2** |   79.8   |   71.4   |   66.3   |
+| InternLM-20B       |   86.4   |   83.3   |   78.1   | **80.3** |   71.8   |   68.3   |
+| **Orion-14B-Base** | **93.2** | **91.3** |   78.5   |   79.5   | **78.8** | **70.2** |
+### 3.1.3. OpenCompass评测集评估结果
+| 模型名称 | Average | Examination | Language | Knowledge | Understanding | Reasoning |
+|------------------|----------|----------|----------|----------|----------|----------|
+| LLaMA 2-13B      |   47.3   |   45.2   |   47.0   |   58.3   |   50.9   |   43.6   |
+| Skywork-13B      |   53.6   |   61.1   |   51.3   |   52.7   |   64.5   |   45.2   |
+| Baichuan 2-13B   |   49.4   |   51.8   |   47.5   |   48.9   |   58.1   |   44.2   |
+| QWEN-14B         |   62.4   |   71.3   |   52.67  |   56.1   |   68.8   |   60.1   |
+| InternLM-20B     |   59.4   |   62.5   |   55.0   | **60.1** |   67.3   |   54.9   |
+|**Orion-14B-Base**| **64.3** | **71.4** | **55.0** |   60.0   | **71.9** | **61.6** |
+### 3.1.4. 日语测试集评估结果
+|   模型名称         |**Average**|  JCQA    |  JNLI    |  MARC    |  JSQD   |  JQK     |  XLS     |  XWN     |  MGSM    |
+|--------------------|----------|----------|----------|----------|----------|----------|----------|----------|----------|
+| PLaMo-13B          |   52.3   |   56.7   |   42.8   |   95.8   |   70.6   |   71.0   |   8.70   |   70.5   |   2.40   |
+| WebLab-10B         |   50.7   |   66.6   |   53.7   |   82.1   |   62.9   |   56.2   |   10.0   |   72.0   |   2.40   |
+| ELYZA-jp-7B        |   48.8   |   71.7   |   25.3   |   86.6   |   70.8   |   64.1   |   2.50   |   62.1   |   7.20   |
+| StableLM-jp-7B     |   51.1   |   33.4   |   43.3   | **96.7** |   70.6   |   78.1   |   10.7   |   72.8   |   2.80   |
+| LLaMA 2-13B        |   46.3   |   75.0   |   47.6   |   38.8   |   76.1   |   67.7   |   18.1   |   63.2   |   10.4   |
+| Baichuan 2-13B     |   57.1   |   73.7   |   31.3   |   91.6   |   80.5   |   63.3   |   18.6   |   72.2   |   25.2   |
+| QWEN-14B           |   65.8   |   85.9   |   60.7   |   97.0   |   83.3   |   71.8   |   18.8   |   70.6   |   38.0   |
+| Yi-34B             |   67.1   |   83.8   |   61.2   |   95.2   | **86.1** |   78.5   | **27.2** |   69.2   |   35.2   |
+| **Orion-14B-Base** | **69.1** | **88.2** | **75.8** |   94.1   |   75.7   | **85.1** |   17.3   | **78.8** | **38.0** |
+### 3.1.5. 韩语测试集n-shot评估结果
+| 模型名称  | **Average**<br>n=0&nbsp;&nbsp;n=5 | HellaSwag<br>n=0&nbsp;&nbsp;n=5 | COPA<br> n=0&nbsp;&nbsp;n=5 | BoolQ<br>n=0&nbsp;&nbsp;n=5 | SentiNeg<br>n=0&nbsp;&nbsp;n=5|
+|------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
+| KoGPT            |  53.0   &nbsp;&nbsp;   70.1  |  55.9   &nbsp;&nbsp;   58.3  |  73.5   &nbsp;&nbsp;   72.9  |  45.1   &nbsp;&nbsp;   59.8  |  37.5   &nbsp;&nbsp;   89.4  |
+| Polyglot-ko-13B  |  69.6   &nbsp;&nbsp;   73.7  |**59.5** &nbsp;&nbsp; **63.1**|**79.4** &nbsp;&nbsp; **81.1**|  48.2   &nbsp;&nbsp;   60.4  |  91.2   &nbsp;&nbsp;   90.2  |
+| LLaMA 2-13B      |  46.7   &nbsp;&nbsp;   63.7  |  41.3   &nbsp;&nbsp;   44.0  |  59.3   &nbsp;&nbsp;   63.8  |  34.9   &nbsp;&nbsp;   73.8  |  51.5   &nbsp;&nbsp;   73.4  |
+| Baichuan 2-13B   |  52.1   &nbsp;&nbsp;   58.7  |  39.2   &nbsp;&nbsp;   39.6  |  60.6   &nbsp;&nbsp;   60.6  |  58.4   &nbsp;&nbsp;   61.5  |  50.3   &nbsp;&nbsp;   72.9  |
+| QWEN-14B         |  53.8   &nbsp;&nbsp;   73.7  |  45.3   &nbsp;&nbsp;   46.8  |  64.9   &nbsp;&nbsp;   68.9  |  33.4   &nbsp;&nbsp;   83.5  |  71.5   &nbsp;&nbsp;   95.7  |
+| Yi-34B           |  54.2   &nbsp;&nbsp;   72.1  |  44.6   &nbsp;&nbsp;   44.7  |  58.0   &nbsp;&nbsp;   60.6  |  65.9   &nbsp;&nbsp;   90.2  |  48.3   &nbsp;&nbsp;   92.9  |
+|**Orion-14B-Base**|**74.5** &nbsp;&nbsp; **79.6**|  47.0   &nbsp;&nbsp;   49.6  |  77.7   &nbsp;&nbsp;   79.4  |**81.6** &nbsp;&nbsp; **90.7**|**92.4** &nbsp;&nbsp; **98.7**|
+### 3.1.6. 多语言评估结果
+| 模型名称            | Train Lang | Japanese | Korean   | Chinese  |  English |
+|--------------------|------------|----------|----------|----------|----------|
+| PLaMo-13B          |  En,Jp     |   52.3   |   *      |   *      |   *      |
+| Weblab-10B         |  En,Jp     |   50.7   |   *      |   *      |   *      |
+| ELYZA-jp-7B        |  En,Jp     |   48.8   |   *      |   *      |   *      |
+| StableLM-jp-7B     |  En,Jp     |   51.1   |   *      |   *      |   *      |
+| KoGPT-6B           |  En,Ko     |   *      |   70.1   |   *      |   *      |
+| Polyglot-ko-13B    |  En,Ko     |   *      |   70.7   |   *      |   *      |
+| Baichuan2-13B      |  Multi     |   57.1   |   58.7   |   50.8   |   57.1   |
+| Qwen-14B           |  Multi     |   65.8   |   73.7   |   64.5   |   65.4   |
+| Llama2-13B         |  Multi     |   46.3   |   63.7   |   41.4   |   55.3   |
+| Yi-34B             |  Multi     |   67.1   |   72.2   |   58.7   | **68.8** |
+| **Orion-14B-Base** |  Multi     | **69.1** | **79.5** | **67.9** |   67.3   |
+## 3.2. 对话模型Orion-14B-Chat评估
+### 3.2.1. 对话模型MTBench主观评估
+| 模型名称              |   第一轮  |  第二轮   |  **平均** |
+|----------------------|----------|----------|----------|
+| Baichuan2-13B-Chat   |   7.05   |   6.47   |   6.76   |
+| Qwen-14B-Chat        |   7.30   |   6.62   |   6.96   |
+| Llama2-13B-Chat      |   7.10   |   6.20   |   6.65   |
+| InternLM-20B-Chat    |   7.03   |   5.93   |   6.48   |
+| **Orion-14B-Chat**   | **7.68** | **7.07** | **7.37** |
+\*这里评测使用vllm进行推理
+### 3.2.2. 对话模型AlignBench主观评估
+| 模型名称             | 数学能力  | 逻辑推理  | 基本能力   | 中文理解  | 综合问答   | 写作能力  | 角色扮演   | 专业知识  | **平均**  |
+|--------------------|----------|----------|----------|----------|----------|----------|----------|----------|----------|
+| Baichuan2-13B-Chat |   3.76   |   4.07   |   6.22   |   6.05   |   7.11   |   6.97   |   6.75   |   6.43   |   5.25   |
+| Qwen-14B-Chat      | **4.91** | **4.71** | **6.90** |   6.36   |   6.74   |   6.64   |   6.59   |   6.56   | **5.72** |
+| Llama2-13B-Chat    |   3.05   |   3.79   |   5.43   |   4.40   |   6.76   |   6.63   |   6.99   |   5.65   |   4.70   |
+| InternLM-20B-Chat  |   3.39   |   3.92   |   5.96   |   5.50   | **7.18** |   6.19   |   6.49   |   6.22   |   4.96   |
+| **Orion-14B-Chat** |   4.00   |   4.24   |   6.18   | **6.57** |   7.16   | **7.36** | **7.16** | **6.99** |   5.51   |
+\*这里评测使用vllm进行推理
+## 3.3. 长上下文模型Orion-14B-LongChat评估
+### 3.3.1. 长上下文模型LongBench评估
+| 模型名称              | NarrativeQA| MultiFieldQA-en| MultiFieldQA-zh | DuReader  | QMSum     | VCSUM  | TREC   | TriviaQA | LSHT   | RepoBench-P |
+|--------------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
+| GPT-3.5-Turbo-16k        | **23.60** | **52.30** | **61.20** |   28.70   |   23.40   | **16.00** |   68.00   | **91.40** |   29.20   |   53.60   |
+| LongChat-v1.5-7B-32k     |   16.90   |   41.40   |   29.10   |   19.50   |   22.70   |    9.90   |   63.50   |   82.30   |   23.20   |   55.30   |
+| Vicuna-v1.5-7B-16k       |   19.40   |   38.50   |   43.00   |   19.30   |   22.80   |   15.10   |   71.50   |   86.20   |   28.80   |   43.50   |
+| Yi-6B-200K               |   14.11   |   36.74   |   22.68   |   14.01   |   20.44   |    8.08   |   72.00   |   86.61   |   38.00   | **63.29** |
+| Orion-14B-LongChat       |   19.47   |   48.11   |   55.84   | **37.02** | **24.87** |   15.44   | **77.00** |   89.12   | **45.50** |   54.31   |
+## 3.4. 检索增强模型Orion-14B-Chat-RAG评估
+### 3.4.1. 自建检索增强测试集评估结果
+|模型名称|回复效果(关键字)|*回复效果(主观打分)|引用能力|兜底能力|*AutoQA|*抽取数据|
+|---------------------|------|------|------|------|------|------|
+| Baichuan2-13B-Chat  |  85  |  76  |  1   |  0   |  69  |  51  |
+| Qwen-14B-Chat       |  79  |  77  |  75  |  47  |  68  |  72  |
+| Qwen-72B-Chat(Int4) |  87  |  89  |  90  |  32  |  67  |  76  |
+| GPT-4               |  91  |  94  |  96  |  95  |  75  |  86  |
+| Orion-14B-Chat-RAG  |  86  |  87  |  91  |  97  |  73  |  71  |
+ \* 表示人工评判结果
+## 3.5. 插件模型Orion-14B-Chat-Plugin评估
+### 3.5.1. 自建插件测试集评估结果
+| 模型名称  | 全参数意图识别 | 缺参数意图识别 | 非插件调用识别 |
+|-----------------------|--------|-----------|--------|
+| Baichuan2-13B-Chat    |   25   |   0       |   0    |
+| Qwen-14B-Chat         |   55   |   0       |   50   |
+| GPT-4                 | **95** |   52.38   |   70   |
+| Orion-14B-Chat-Plugin |   92.5 | **60.32** | **90** |
+## 3.6. 量化模型Orion-14B-Base-Int4评估
+### 3.6.1. 量化前后整体对比
+|模型名称|模型大小(GB)|推理速度(令牌数/秒)|C-Eval |CMMLU |MMLU |RACE | HellaSwag|
+|-------------------------|------|-----|------|------|------|------|------|
+| Orion-14B-Base          | 28.0 | 135 | 72.8 | 70.6 | 70.0 | 93.3 | 78.5 |
+| Orion-14B-Base-Int4     |  8.3 | 178 | 71.8 | 69.8 | 69.2 | 93.1 | 78.0 |
+# 4. 模型推理
+推理所需的模型权重、源码、配置已发布在 Hugging Face，下载链接见本文档最开始的表格。我们在此示范多种推理方式。程序会自动从
+Hugging Face 下载所需资源。
+## 4.1. Python 代码方式
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.generation.utils import GenerationConfig
+tokenizer = AutoTokenizer.from_pretrained("OrionStarAI/Orion-14B", use_fast=False, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained("OrionStarAI/Orion-14B", device_map="auto",
+                                             torch_dtype=torch.bfloat16, trust_remote_code=True)
+model.generation_config = GenerationConfig.from_pretrained("OrionStarAI/Orion-14B")
+messages = [{"role": "user", "content": "你好! 你叫什么名字!"}]
+response = model.chat(tokenizer, messages, streaming=False)
+print(response)
+```
+在上述代码中，模型加载指定 `device_map='auto'`
+，会使用所有可用显卡。如需指定使用的设备，可以使用类似 `export CUDA_VISIBLE_DEVICES=0,1`（使用了0、1号显卡）的方式控制。
+## 4.2. 命令行工具方式
+```shell
+CUDA_VISIBLE_DEVICES=0 python cli_demo.py
+```
+本命令行工具是为 Chat 场景设计，因此我们不支持使用该工具调用 Base 模型。
+## 4.3. 脚本直接推理
+```shell
+# base model
+CUDA_VISIBLE_DEVICES=0 python demo/text_generation_base.py --model OrionStarAI/Orion-14B --tokenizer OrionStarAI/Orion-14B --prompt 你好,你叫什么名字
+# chat model
+CUDA_VISIBLE_DEVICES=0 python demo/text_generation.py --model OrionStarAI/Orion-14B-Chat --tokenizer OrionStarAI/Orion-14B-Chat --prompt 你好,你叫什么名字
+```
+## 4.4. 示例输出
+### 4.4.1. 闲聊
+`````
+用户：你好,你叫什么名字
+Orion-14B：你好!我是一个人工智能助手,没有固定的名字。你可以随意称呼我。有什么我可以帮助你的吗?
+用户：讲个故事
+Orion-14B：从前有一个小男孩叫杰克,他住在一个小村庄里。杰克非常喜欢冒险和探索,总是梦想着能够发现一些神奇的事情。
+  有一天,杰克听说在附近的森林深处有一座传说中的魔法城堡,里面住着一个邪恶的巨人。这个巨人的名字叫格里姆,他被困在城堡中,无法离开。
+  杰克决定要去解救这个巨人,因为他相信格里姆一定有自己的原因被困在那里。于是,他带着勇气和决心,踏上了前往魔法城堡的旅程。
+  经过长时间的跋涉,杰克终于来到了城堡的大门前。他小心翼翼地推开大门,进入了一个充满神秘气息的房间。
+  房间里弥漫着黑暗的气息,但杰克并没有被吓倒。他勇敢地向城堡深处走去,最终找到了格里姆。
+  格里姆告诉杰克,他是被一个邪恶的女巫诅咒的,只有通过解开咒语才能获得自由。杰克决定要帮助格里姆,他开始寻找线索和解开咒语的方法。
+  经过一番努力,杰克终于找到了一条通往女巫的秘密实验室的道路。他在那里发现了女巫留下的魔法书,书中记载了解开咒语的方法。
+  杰克按照书上所写的步骤进行操作,成功地解开了格里姆身上的诅咒。巨人感激不已,他答应帮助杰克实现他的愿望。
+  从此以后,杰克和格里姆成为了好朋友,他们一起经历了许多惊险刺激的冒险,并用他们的勇气和智慧战胜了各种困难。
+  这个故事告诉我们,只要有勇气和决心,我们就能克服一切困难,实现自己的梦想。
+`````
+### 4.4.2. 日韩
+`````
+用户：自己を紹介してください
+Orion-14B：こんにちは、私の名前はChatMaxで、OrionStarによって開発されたAIアシスタントです。私はユーザーに正確で、客観的で、有用な回答を提供し、彼らが問題を解決し、情報を取得するのを助けることができます。
+用户：자기소개를 해주세요.
+Orion-14B：안녕하세요, 제 이름은 ChatMax입니다. OrionStar에서 개발한 AI 어시스턴트입니다. 저는 사용자들에게 정확하고 객관적이며 유용한 답변을 제공하여 문제를 해결하고 정보를 얻는 데 도움을 줄 수 있습니다.
+`````
+# 5. 声明、协议
+## 5.1. 声明
+我们强烈呼吁所有使用者，不要利用 Orion-14B 模型进行任何危害国家社会安全或违法的活动。另外，我们也要求使用者不要将
+Orion-14B 模型用于未经适当安全审查和备案的互联网服务。
+我们希望所有的使用者都能遵守这个原则，确保科技的发展能在规范和合法的环境下进行。
+我们已经尽我们所能，来确保模型训练过程中使用的数据的合规性。然而，尽管我们已经做出了巨大的努力，但由于模型和数据的复杂性，仍有可能存在一些无法预见的问题。因此，如果由于使用
+Orion-14B 开源模型而导致的任何问题，包括但不限于数据安全问题、公共舆论风险，或模型被误导、滥用、传播或不当利用所带来的任何风险和问题，我们将不承担任何责任。
+## 5.2. 协议
+社区使用Orion-14B系列模型
+- 代码请遵循 [Apache License Version 2.0](./LICENSE)<br>
+- 模型请遵循 [Orion-14B系列模型社区许可协议](./ModelsCommunityLicenseAgreement)
+# 6. 企业介绍
+猎户星空（OrionStar）是一家全球领先的服务机器人解决方案公司，成立于2016年9月。猎户星空致力于基于人工智能技术打造下一代革命性机器人，使人们能够摆脱重复的体力劳动，使人类的工作和生活更加智能和有趣，通过技术使社会和世界变得更加美好。
+猎户星空拥有完全自主开发的全链条人工智能技术，如语音交互和视觉导航。它整合了产品开发能力和技术应用能力。基于Orion机械臂平台，它推出了ORION
+STAR AI Robot Greeting、AI Robot Greeting Mini、Lucki、Coffee
+Master等产品，并建立了Orion机器人的开放平台OrionOS。通过为 **真正有用的机器人而生** 的理念实践，它通过AI技术为更多人赋能。
+凭借7年AI经验积累，猎户星空已推出大模型深度应用“聚言”，并陆续面向行业客户提供定制化AI大模型咨询与服务解决方案，真正帮助客户实现企业经营效率领先同行的目标。
+**猎户星空具备全链条大模型应用能力的核心优势**，包括拥有从海量数据处理、大模型预训练、二次预训练、微调(Fine-tune)、Prompt
+Engineering 、Agent开发的全链条能力和经验积累；拥有完整的端到端模型训练能力，包括系统化的数据处理流程和数百张GPU的并行模型训练能力，现已在大政务、云服务、出海电商、快消等多个行业场景落地。
+***欢迎有大模型应用落地需求的企业联系我们进行商务合作***<br>
+**咨询电话:** 400-898-7779<br>
+**电子邮箱:** ai@orionstar.com
+<div align="center">
+  <img src="./assets/imgs/assets_imgs_wechat_group.jpg" alt="wechat" width="40%" />
+</div>

config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "architectures": [
+    "OrionForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_orion.OrionConfig",
+    "AutoModelForCausalLM": "modeling_orion.OrionForCausalLM"
+  },
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "model_type": "orion",
+  "initializer_range": 0.02,
+  "intermediate_size": 15360,
+  "max_position_embeddings": 200000,
+  "max_sequence_length": 200000,
+  "num_attention_heads": 40,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 40,
+  "pad_token_id": 0,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 50000000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.34.0",
+  "use_cache": true,
+  "vocab_size": 84608
+}

configuration_orion.py ADDED Viewed

	@@ -0,0 +1,82 @@

+# Copyright (c) 2024, OrionStar Inc. All rights reserved.
+from transformers import PretrainedConfig
+class OrionConfig(PretrainedConfig):
+    model_type = "orion"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        vocab_size=84608,
+        hidden_size=4096,
+        intermediate_size=15360,
+        num_hidden_layers=40,
+        num_attention_heads=40,
+        num_key_value_heads=40,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+        self.attention_bias = attention_bias
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")

generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "max_new_tokens": 1024,
+  "temperature": 0.3,
+  "top_k": 5,
+  "top_p": 0.90,
+  "repetition_penalty": 1.05,
+  "do_sample": true,
+  "transformers_version": "4.34.0"
+}

generation_utils.py ADDED Viewed

	@@ -0,0 +1,52 @@

+from typing import List
+from queue import Queue
+# build chat input prompt
+def build_chat_input(tokenizer, messages: List[dict]):
+    prompt = "<s>"
+    for msg in messages:
+        role = msg["role"]
+        message = msg["content"]
+        if message is None :
+            continue
+        if role == "user":
+            prompt += "Human: " + message + "\n\nAssistant: "
+        if role == "assistant":
+            prompt += message + "</s>"
+    input_tokens = tokenizer.encode(prompt)
+    return input_tokens
+class TextIterStreamer:
+    def __init__(self, tokenizer, skip_prompt=False, skip_special_tokens=False):
+        self.tokenizer = tokenizer
+        self.skip_prompt = skip_prompt
+        self.skip_special_tokens = skip_special_tokens
+        self.tokens = []
+        self.text_queue = Queue()
+        self.next_tokens_are_prompt = True
+    def put(self, value):
+        if self.skip_prompt and self.next_tokens_are_prompt:
+            self.next_tokens_are_prompt = False
+        else:
+            if len(value.shape) > 1:
+                value = value[0]
+            self.tokens.extend(value.tolist())
+            self.text_queue.put(
+                self.tokenizer.decode(self.tokens, skip_special_tokens=self.skip_special_tokens))
+    def end(self):
+        self.text_queue.put(None)
+    def __iter__(self):
+        return self
+    def __next__(self):
+        value = self.text_queue.get()
+        if value is None:
+            raise StopIteration()
+        else:
+            return value

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,451 @@

+{
+    "metadata": {
+        "total_size": 28837498880
+    },
+    "weight_map": {
+        "lm_head.weight": "model-00003-of-00003.safetensors",
+        "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+        "model.layers.0.input_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.0.post_attention_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.1.input_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.1.post_attention_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.10.input_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.10.post_attention_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.11.input_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.11.post_attention_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.12.input_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.12.post_attention_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.13.input_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.13.post_attention_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.14.input_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.14.post_attention_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.15.input_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.15.post_attention_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.16.input_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.16.post_attention_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.17.input_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.17.post_attention_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.18.input_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.18.post_attention_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.19.input_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.19.post_attention_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.2.input_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.2.post_attention_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.20.input_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.20.post_attention_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.21.input_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.21.post_attention_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.22.input_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.22.post_attention_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.23.input_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.23.post_attention_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.24.input_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.24.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.24.post_attention_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.25.input_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.25.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.25.post_attention_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.26.input_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.26.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.26.post_attention_layernorm.bias": "model-00002-of-00003.safetensors",
+        "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+        "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.27.input_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.27.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.27.post_attention_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+        "model.layers.28.input_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.28.post_attention_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.29.input_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.29.post_attention_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.3.input_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.3.post_attention_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.30.input_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.30.post_attention_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.31.input_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.31.post_attention_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.32.input_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.32.input_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.32.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.32.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.32.post_attention_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.33.input_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.33.input_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.33.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.33.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.33.post_attention_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.34.input_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.34.input_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.34.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.34.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.34.post_attention_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.35.input_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.35.input_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.35.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.35.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.35.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.35.post_attention_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.35.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.35.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.36.input_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.36.input_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.36.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.36.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.36.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.36.post_attention_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.36.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.36.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.36.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.36.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.36.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.37.input_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.37.input_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.37.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.37.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.37.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.37.post_attention_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.37.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.37.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.37.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.37.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.37.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.38.input_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.38.input_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.38.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.38.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.38.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.38.post_attention_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.38.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.38.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.38.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.38.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.38.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.39.input_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.39.input_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.39.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.39.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.39.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.39.post_attention_layernorm.bias": "model-00003-of-00003.safetensors",
+        "model.layers.39.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+        "model.layers.39.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.39.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.39.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.39.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+        "model.layers.4.input_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.4.post_attention_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.5.input_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.5.post_attention_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.6.input_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.6.post_attention_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.7.input_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.7.post_attention_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.8.input_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.8.post_attention_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.9.input_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.9.post_attention_layernorm.bias": "model-00001-of-00003.safetensors",
+        "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+        "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+        "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+        "model.norm.bias": "model-00003-of-00003.safetensors",
+        "model.norm.weight": "model-00003-of-00003.safetensors"
+    }
+}

modeling_orion.py ADDED Viewed

	@@ -0,0 +1,1117 @@

+# Copyright 2024 OrionStar Inc. team. All rights reserved.
+# Copied and adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
+from transformers import AutoConfig, AutoModel
+from .configuration_orion import OrionConfig
+import numbers
+import importlib
+import math
+from typing import List, Optional, Tuple, Union
+import torch
+import torch.nn.functional as F
+from torch.nn.parameter import Parameter
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from torch.nn import init
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_available,
+    logging,
+    replace_return_docstrings,
+)
+from .generation_utils import build_chat_input, TextIterStreamer
+from transformers.generation.utils import GenerationConfig
+from threading import Thread
+# flash-attn is optional: its kernels and padding helpers are imported only when
+# transformers reports the package as available.
+if is_flash_attn_available():
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+# Module-level logger obtained through the transformers logging utilities.
+logger = logging.get_logger(__name__)
+# Name of the config class — presumably consumed by the docstring decorators imported above; confirm further down the file.
+_CONFIG_FOR_DOC = "OrionConfig"
+def _get_unpad_data(padding_mask):
+    seqlens_in_batch = padding_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(padding_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+    """
+    Make causal mask used for bi-directional self-attention.
+    """
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+    mask_cond = torch.arange(mask.size(-1), device=device)
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+    mask = mask.to(dtype)
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+    inverted_mask = 1.0 - expanded_mask
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+class OrionRotaryEmbedding(nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+        return (
+            self.cos_cached[:seq_len].to(dtype=x.dtype),
+            self.sin_cached[:seq_len].to(dtype=x.dtype),
+        )
+class OrionLinearScalingRotaryEmbedding(OrionRotaryEmbedding):
+    """OrionRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        t = t / self.scaling_factor
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+class OrionDynamicNTKScalingRotaryEmbedding(OrionRotaryEmbedding):
+    """OrionRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        if seq_len > self.max_position_embeddings:
+            base = self.base * (
+                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+            ) ** (self.dim / (self.dim - 2))
+            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+            self.register_buffer("inv_freq", inv_freq, persistent=False)
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+# Copied from transformers.models.gpt_neox.modeling_gpt_neox.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+    cos = cos[position_ids].unsqueeze(1)  # [seq_len, dim] -> [batch_size, 1, seq_len, head_dim]
+    sin = sin[position_ids].unsqueeze(1)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+class OrionMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        if self.config.pretraining_tp > 1:
+            slice = self.intermediate_size // self.config.pretraining_tp
+            gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
+            up_proj_slices = self.up_proj.weight.split(slice, dim=0)
+            down_proj_slices = self.down_proj.weight.split(slice, dim=1)
+            gate_proj = torch.cat(
+                [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
+            )
+            up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
+            intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
+            down_proj = [
+                F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
+            ]
+            down_proj = sum(down_proj)
+        else:
+            down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+class OrionAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config: OrionConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+        self._init_rope()
+    def _init_rope(self):
+        if self.config.rope_scaling is None:
+            self.rotary_emb = OrionRotaryEmbedding(
+                self.head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                base=self.rope_theta,
+            )
+        else:
+            scaling_type = self.config.rope_scaling["type"]
+            scaling_factor = self.config.rope_scaling["factor"]
+            if scaling_type == "linear":
+                self.rotary_emb = OrionLinearScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                )
+            elif scaling_type == "dynamic":
+                self.rotary_emb = OrionDynamicNTKScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                )
+            else:
+                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        padding_mask: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+        if self.config.pretraining_tp > 1:
+            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+            query_slices = self.q_proj.weight.split(
+                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
+            )
+            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
+            query_states = torch.cat(query_states, dim=-1)
+            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
+            key_states = torch.cat(key_states, dim=-1)
+            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
+            value_states = torch.cat(value_states, dim=-1)
+        else:
+            query_states = self.q_proj(hidden_states)
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        past_key_value = (key_states, value_states) if use_cache else None
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_output = torch.matmul(attn_weights, value_states)
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        if self.config.pretraining_tp > 1:
+            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
+            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
+            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+        else:
+            attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+class OrionFlashAttention2(OrionAttention):
+    """
+    Orion flash attention module. This module inherits from `OrionAttention` as the weights of the module stays
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        padding_mask: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        # OrionFlashAttention2 attention does not support output_attentions
+        output_attentions = False
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x head_dime x hidden_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        past_key_value = (key_states, value_states) if use_cache else None
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+        # TODO: llama does not have dropout in the config??
+        # It is recommended to use dropout with FA according to the docs
+        # when training.
+        dropout_rate = 0.0  # if not self.training else self.attn_dropout
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in float16 just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (LlamaRMSNorm handles it correctly)
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            logger.warning_once(
+                "The input hidden states seems to be silently casted in float32, this might be related to"
+                " the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                " float16."
+            )
+            query_states = query_states.to(torch.float16)
+            key_states = key_states.to(torch.float16)
+            value_states = value_states.to(torch.float16)
+        attn_output = self._flash_attention_forward(
+            query_states, key_states, value_states, padding_mask, q_len, dropout=dropout_rate
+        )
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+        attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+    def _flash_attention_forward(
+        self, query_states, key_states, value_states, padding_mask, query_length, dropout=0.0, softmax_scale=None
+    ):
+        """
+        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+        first unpad the input, then computes the attention scores and pad the final attention scores.
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            padding_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            dropout (`int`, *optional*):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+        """
+        # Contains at least one padding token in the sequence
+        if padding_mask is not None:
+            batch_size = query_states.shape[0]
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, padding_mask, query_length
+            )
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+            attn_output_unpad = flash_attn_varlen_func(
+                query_states,
+                key_states,
+                value_states,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_in_batch_q,
+                max_seqlen_k=max_seqlen_in_batch_k,
+                dropout_p=dropout,
+                softmax_scale=softmax_scale,
+                causal=True,
+            )
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            attn_output = flash_attn_func(
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=True
+            )
+        return attn_output
+    def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_length):
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask)
+        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+        key_layer = index_first_axis(
+            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        value_layer = index_first_axis(
+            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        if query_length == kv_seq_len:
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            padding_mask = padding_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, padding_mask)
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+class OrionDecoderLayer(nn.Module):
+    def __init__(self, config: OrionConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = (
+            OrionAttention(config=config)
+            if not getattr(config, "_flash_attn_2_enabled", False)
+            else OrionFlashAttention2(config=config)
+        )
+        self.mlp = OrionMLP(config)
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        padding_mask: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+        """
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            padding_mask=padding_mask,
+        )
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if use_cache:
+            outputs += (present_key_value,)
+        return outputs
+class OrionPreTrainedModel(PreTrainedModel):
+    config_class = OrionConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["OrionDecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, OrionModel):
+            module.gradient_checkpointing = value
+class OrionModel(OrionPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OrionDecoderLayer`]
+    Args:
+        config: OrionConfig
+    """
+    def __init__(self, config: OrionConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList([OrionDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.norm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        if input_shape[-1] > 1:
+            combined_attention_mask = _make_causal_mask(
+                input_shape,
+                inputs_embeds.dtype,
+                device=inputs_embeds.device,
+                past_key_values_length=past_key_values_length,
+            )
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+                inputs_embeds.device
+            )
+            combined_attention_mask = (
+                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+            )
+        return combined_attention_mask
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+        if past_key_values is not None:
+            past_key_values_length = past_key_values[0][0].shape[2]
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0)
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # embed positions
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+            )
+            padding_mask = None
+        else:
+            if 0 in attention_mask:
+                padding_mask = attention_mask
+            else:
+                padding_mask = None
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+        )
+        hidden_states = inputs_embeds
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+        for idx, decoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+            if self.gradient_checkpointing and self.training:
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, past_key_value, output_attentions, padding_mask=padding_mask)
+                    return custom_forward
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer), hidden_states, attention_mask, position_ids
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    padding_mask=padding_mask,
+                )
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+class OrionForCausalLM(OrionPreTrainedModel):
+    model_type = "orion"
+    _tied_weights_keys = ["lm_head.weight"]
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = OrionModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def get_decoder(self):
+        return self.model
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import AutoTokenizer, OrionForCausalLM
+        >>> model = OrionForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        if self.config.pretraining_tp > 1:
+            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
+            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
+            logits = torch.cat(logits, dim=-1)
+        else:
+            logits = self.lm_head(hidden_states)
+        logits = logits.float()
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def chat(self, tokenizer, messages: List[dict], streaming=False,generation_config: Optional[GenerationConfig]=None):
+        generation_config = generation_config or self.generation_config
+        input_tokens = build_chat_input(tokenizer,messages)
+        input_ids = torch.LongTensor([input_tokens]).to(self.device)
+        if streaming:
+            streamer = TextIterStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+            Thread(target=self.generate, kwargs=dict(
+                inputs=input_ids, streamer=streamer,
+                generation_config=generation_config,
+            )).start()
+            return streamer
+        else:
+            outputs = self.generate(input_ids, generation_config=generation_config)
+            response = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
+            return response
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        if past_key_values:
+            input_ids = input_ids[:, -1:]
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -1].unsqueeze(-1)
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past
+class OrionForSequenceClassification(OrionPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = OrionModel(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1
+        else:
+            if input_ids is not None:
+                sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
+                    logits.device
+                )
+            else:
+                sequence_lengths = -1
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )

output-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5aa36a9c49adb769bc76383121365745d9512256dac72cec8011aa9563ecf5f5
+size 8589913896

output-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0155bf1a71ce68c4369a963991457a4e050289bdbc84543d8a2d4852a572b4ef
+size 2851954216

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenization_orion.py ADDED Viewed

	@@ -0,0 +1,255 @@

+# Copyright (c) 2024, OrionStar Inc. All rights reserved.
+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+import sentencepiece as spm
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+# File name used for the SentencePiece model; `save_vocabulary` writes its output under this name.
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+# No pretrained file locations are registered in this module; both maps are empty.
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {},
+    "tokenizer_file": {},
+}
+# No per-checkpoint maximum input sizes are registered in this module.
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
+class OrionTokenizer(PreTrainedTokenizer):
+    """
+    Construct a Orion tokenizer. Based on byte-level Byte-Pair-Encoding.
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token=None,
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        bos_token = (
+            AddedToken(bos_token, lstrip=False, rstrip=False)
+            if isinstance(bos_token, str)
+            else bos_token
+        )
+        eos_token = (
+            AddedToken(eos_token, lstrip=False, rstrip=False)
+            if isinstance(eos_token, str)
+            else eos_token
+        )
+        unk_token = (
+            AddedToken(unk_token, lstrip=False, rstrip=False)
+            if isinstance(unk_token, str)
+            else unk_token
+        )
+        pad_token = (
+            AddedToken(pad_token, lstrip=False, rstrip=False)
+            if isinstance(pad_token, str)
+            else pad_token
+        )
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            sp_model_kwargs=self.sp_model_kwargs,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+    def __setstate__(self, d):
+        self.__dict__ = d
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(self.vocab_file)
+    @property
+    def vocab_size(self):
+        """Returns vocab size"""
+        return self.sp_model.get_piece_size()
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+    def _tokenize(self, text):
+        """Returns a tokenized string."""
+        return self.sp_model.encode(text, out_type=str)
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.sp_model.piece_to_id(token)
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        return token
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for i, token in enumerate(tokens):
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self.all_special_tokens:
+                if not prev_is_special and i != 0:
+                    out_string += " "
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string
+    def save_vocabulary(
+        self, save_directory, filename_prefix: Optional[str] = None
+    ) -> Tuple[str]:
+        """
+        Save the vocabulary and special tokens file to a directory.
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "")
+            + VOCAB_FILES_NAMES["vocab_file"],
+        )
+        if os.path.abspath(self.vocab_file) != os.path.abspath(
+            out_vocab_file
+        ) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+        return (out_vocab_file,)
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+        output = bos_token_id + token_ids_0 + eos_token_id
+        if token_ids_1 is not None:
+            output = output + bos_token_id + token_ids_1 + eos_token_id
+        return output
+    def get_special_tokens_mask(
+        self,
+        token_ids_0: List[int],
+        token_ids_1: Optional[List[int]] = None,
+        already_has_special_tokens: bool = False,
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0,
+                token_ids_1=token_ids_1,
+                already_has_special_tokens=True,
+            )
+        bos_token_id = [1] if self.add_bos_token else []
+        eos_token_id = [1] if self.add_eos_token else []
+        if token_ids_1 is None:
+            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+        return (
+            bos_token_id
+            + ([0] * len(token_ids_0))
+            + eos_token_id
+            + bos_token_id
+            + ([0] * len(token_ids_1))
+            + eos_token_id
+        )
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
+        sequence pair mask has the following format:
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+        if token_ids_1 is None, only returns the first portion of the mask (0s).
+        Args:
+            token_ids_0 (`List[int]`):
+                List of ids.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+        if token_ids_1 is not None:
+            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+        return output

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ded43118b7418f56db97a4eed08a5c265c03120158229ddd4fbcc9658241d5f0
+size 1520600

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_orion.OrionTokenizer",
+      null
+    ]
+  },
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": true
+  },
+  "clean_up_tokenization_spaces": false,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": true
+  },
+  "model_max_length": 4096,
+  "pad_token": {
+    "__type": "AddedToken",
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": true
+  },
+  "sp_model_kwargs": {},
+  "tokenizer_class": "OrionTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": true
+  }
+}