yibolu committed on Jun 21, 2023

Commit

693dde8

0 Parent(s):

Feat: Add support for cuda 11.x and faster model load speed

Files changed (18) hide show

.gitattributes +37 -0
CHANGES.rst +10 -0
LISENCE +420 -0
README.md +157 -0
demo.py +22 -0
lyraChatGLM/__init__.py +1 -0
lyraChatGLM/config.py +31 -0
lyraChatGLM/ftlib/libth_transformer_sm70_cu11.so +3 -0
lyraChatGLM/ftlib/libth_transformer_sm70_cu12.so +3 -0
lyraChatGLM/ftlib/libth_transformer_sm80_cu11.so +3 -0
lyraChatGLM/ftlib/libth_transformer_sm80_cu12.so +3 -0
lyraChatGLM/lyra_glm.py +174 -0
lyraChatGLM/model.py +194 -0
models/config.ini +13 -0
models/ice_text.model +3 -0
models/tokenization_chatglm.py +443 -0
models/tokenizer_config.json +20 -0
requirements.txt +9 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,37 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+models/glm6b-kv-cache-dy-bs8.ftm filter=lfs diff=lfs merge=lfs -text
+models/glm6b-bs8.ftm filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text

CHANGES.rst ADDED Viewed

	@@ -0,0 +1,10 @@

+Changelog (lyraChatGLM)
+## 2.0
+- rebuild the whole system using a modified FasterTransformer
+- add dynamic library & models for Volta architecture.
+- further acceleration, remove token generation limits.
+## 1.0
+- add lyraChatGLM model, from original weights

LISENCE ADDED Viewed

	@@ -0,0 +1,420 @@

+MIT License
+Copyright (c) 2023 Tencent Music Entertainment
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+Other dependencies and licenses:
+Open Source Software Licensed under The ChatGLM-6B License and the Apache License Version 2.0 :
+--------------------------------------------------------------------
+1. chatglm-6b
+File：https://github.com/THUDM/ChatGLM-6B
+License：The ChatGLM-6B License and Apache License Version 2.0
+For details：https://github.com/THUDM/ChatGLM-6B/blob/main/MODEL_LICENSE
+             https://github.com/THUDM/ChatGLM-6B/blob/main/LICENSE
+APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+Copyright Zhengxiao Du
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+A copy of the Apache License Version 2.0 is included in this file.
+Terms of The ChatGLM-6B License:
+--------------------------------------------------------------------
+一、定义
+“许可方”是指分发其软件的 ChatGLM-6B 模型团队。
+“软件”是指根据本许可提供的 ChatGLM-6B 模型参数。
+2. 许可授予
+根据本许可的条款和条件，许可方特此授予您非排他性、全球性、不可转让、不可再许可、可撤销、免版税的版权许可，仅用于您的非商业研究目的。
+上述版权声明和本许可声明应包含在本软件的所有副本或重要部分中。
+3.限制
+您不得出于任何商业、军事或非法目的使用、复制、修改、合并、发布、分发、复制或创建本软件的全部或部分衍生作品。
+您不得利用本软件从事任何危害国家安全和国家统一、危害社会公共利益、侵犯人身权益的行为。
+4.免责声明
+本软件“按原样”提供，不提供任何明示或暗示的保证，包括但不限于对适销性、特定用途的适用性和非侵权性的保证。 在任何情况下，作者或版权持有人均不对任何索赔、损害或其他责任负责，无论是在合同诉讼、侵权行为还是其他方面，由软件或软件的使用或其他交易引起、由软件引起或与之相关 软件。
+5. 责任限制
+除适用法律禁止的范围外，在任何情况下且根据任何法律理论，无论是基于侵权行为、疏忽、合同、责任或其他原因，任何许可方均不对您承担任何直接、间接、特殊、偶然、示范性、 或间接损害，或任何其他商业损失，即使许可人已被告知此类损害的可能性。
+6.争议解决
+本许可受中华人民共和国法律管辖并按其解释。 因本许可引起的或与本许可有关的任何争议应提交北京市海淀区人民法院。
+请注意，许可证可能会更新到更全面的版本。 有关许可和版权的任何问题，请通过 glm-130b@googlegroups.com 与我们联系。
+1. Definitions
+“Licensor” means the ChatGLM-6B Model Team that distributes its Software.
+“Software” means the ChatGLM-6B model parameters made available under this license.
+2. License Grant
+Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes.
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+3. Restriction
+You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes.
+You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings.
+4. Disclaimer
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+5. Limitation of Liability
+EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+6. Dispute Resolution
+This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.
+Note that the license is subject to update to a more comprehensive version.  For any questions related to the license and copyright, please contact us at glm-130b@googlegroups.com.
+Open Source Software Licensed under the Apache License Version 2.0:
+--------------------------------------------------------------------
+1. huggingface/transformers
+Copyright 2018- The Hugging Face team. All rights reserved.
+Terms of the Apache License Version 2.0:
+--------------------------------------------------------------------
+Apache License
+Version 2.0, January 2004
+http://www.apache.org/licenses/
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+1. Definitions.
+"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
+"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
+"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
+"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
+"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
+"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
+"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
+"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
+"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
+"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
+2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
+3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
+4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
+You must give any other recipients of the Work or Derivative Works a copy of this License; and
+You must cause any modified files to carry prominent notices stating that You changed the files; and
+You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
+If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
+You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
+5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
+6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
+7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
+8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
+9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
+END OF TERMS AND CONDITIONS
+Open Source Software Licensed under the Modified BSD License:
+--------------------------------------------------------------------
+1. pytorch
+From PyTorch:
+Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
+Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
+Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
+Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
+Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
+Copyright (c) 2011-2013 NYU                      (Clement Farabet)
+Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
+Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
+Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
+From Caffe2:
+Copyright (c) 2016-present, Facebook Inc. All rights reserved.
+All contributions by Facebook:
+Copyright (c) 2016 Facebook Inc.
+All contributions by Google:
+Copyright (c) 2015 Google Inc.
+All rights reserved.
+All contributions by Yangqing Jia:
+Copyright (c) 2015 Yangqing Jia
+All rights reserved.
+All contributions by Kakao Brain:
+Copyright 2019-2020 Kakao Brain
+All contributions by Cruise LLC:
+Copyright (c) 2022 Cruise LLC.
+All rights reserved.
+All contributions from Caffe:
+Copyright(c) 2013, 2014, 2015, the respective contributors
+All rights reserved.
+All other contributions:
+Copyright(c) 2015, 2016 the respective contributors
+All rights reserved.
+Caffe2 uses a copyright model similar to Caffe: each contributor holds
+copyright over their contributions to Caffe2. The project versioning records
+all such contribution and copyright details. If a contributor wants to further
+mark their specific copyright on a particular contribution, they should
+indicate their copyright solely in the commit message of the change when it is
+committed.
+All rights reserved.
+Terms of the Modified BSD License:
+-------------------------------------------------------------------
+This project is licensed under the terms of the Modified BSD License, as follows:
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
+   and IDIAP Research Institute nor the names of its contributors may be
+   used to endorse or promote products derived from this software without
+   specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+Open Source Software Licensed under the Python Software Foundation License Version 2:
+--------------------------------------------------------------------------
+1. Python/cpython
+Copyright © 2001-2023 Python Software Foundation. All rights reserved
+A. HISTORY OF THE SOFTWARE
+==========================
+Python was created in the early 1990s by Guido van Rossum at Stichting
+Mathematisch Centrum (CWI, see https://www.cwi.nl) in the Netherlands
+as a successor of a language called ABC.  Guido remains Python's
+principal author, although it includes many contributions from others.
+In 1995, Guido continued his work on Python at the Corporation for
+National Research Initiatives (CNRI, see https://www.cnri.reston.va.us)
+in Reston, Virginia where he released several versions of the
+software.
+In May 2000, Guido and the Python core development team moved to
+BeOpen.com to form the BeOpen PythonLabs team.  In October of the same
+year, the PythonLabs team moved to Digital Creations, which became
+Zope Corporation.  In 2001, the Python Software Foundation (PSF, see
+https://www.python.org/psf/) was formed, a non-profit organization
+created specifically to own Python-related Intellectual Property.
+Zope Corporation was a sponsoring member of the PSF.
+All Python releases are Open Source (see https://opensource.org for
+the Open Source Definition).  Historically, most, but not all, Python
+releases have also been GPL-compatible; the table below summarizes
+the various releases.
+    Release         Derived     Year        Owner       GPL-
+                    from                                compatible? (1)
+    0.9.0 thru 1.2              1991-1995   CWI         yes
+    1.3 thru 1.5.2  1.2         1995-1999   CNRI        yes
+    1.6             1.5.2       2000        CNRI        no
+    2.0             1.6         2000        BeOpen.com  no
+    1.6.1           1.6         2001        CNRI        yes (2)
+    2.1             2.0+1.6.1   2001        PSF         no
+    2.0.1           2.0+1.6.1   2001        PSF         yes
+    2.1.1           2.1+2.0.1   2001        PSF         yes
+    2.1.2           2.1.1       2002        PSF         yes
+    2.1.3           2.1.2       2002        PSF         yes
+    2.2 and above   2.1.1       2001-now    PSF         yes
+Footnotes:
+(1) GPL-compatible doesn't mean that we're distributing Python under
+    the GPL.  All Python licenses, unlike the GPL, let you distribute
+    a modified version without making your changes open source.  The
+    GPL-compatible licenses make it possible to combine Python with
+    other software that is released under the GPL; the others don't.
+(2) According to Richard Stallman, 1.6.1 is not GPL-compatible,
+    because its license has a choice of law clause.  According to
+    CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1
+    is "not incompatible" with the GPL.
+Thanks to the many outside volunteers who have worked under Guido's
+direction to make these releases possible.
+B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON
+===============================================================
+Python software and documentation are licensed under the
+Python Software Foundation License Version 2.
+Starting with Python 3.8.6, examples, recipes, and other code in
+the documentation are dual licensed under the PSF License Version 2
+and the Zero-Clause BSD license.
+Some software incorporated into Python is under different licenses.
+The licenses are listed with code falling under that license.
+PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
+--------------------------------------------
+1. This LICENSE AGREEMENT is between the Python Software Foundation
+("PSF"), and the Individual or Organization ("Licensee") accessing and
+otherwise using this software ("Python") in source or binary form and
+its associated documentation.
+2. Subject to the terms and conditions of this License Agreement, PSF hereby
+grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
+analyze, test, perform and/or display publicly, prepare derivative works,
+distribute, and otherwise use Python alone or in any derivative version,
+provided, however, that PSF's License Agreement and PSF's notice of copyright,
+i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation;
+All Rights Reserved" are retained in Python alone or in any derivative version
+prepared by Licensee.
+3. In the event Licensee prepares a derivative work that is based on
+or incorporates Python or any part thereof, and wants to make
+the derivative work available to others as provided herein, then
+Licensee hereby agrees to include in any such work a brief summary of
+the changes made to Python.
+4. PSF is making Python available to Licensee on an "AS IS"
+basis.  PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
+IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
+DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
+FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
+INFRINGE ANY THIRD PARTY RIGHTS.
+5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
+FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
+A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
+OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
+6. This License Agreement will automatically terminate upon a material
+breach of its terms and conditions.
+7. Nothing in this License Agreement shall be deemed to create any
+relationship of agency, partnership, or joint venture between PSF and
+Licensee.  This License Agreement does not grant permission to use PSF
+trademarks or trade name in a trademark sense to endorse or promote
+products or services of Licensee, or any third party.
+8. By copying, installing or otherwise using Python, Licensee
+agrees to be bound by the terms and conditions of this License
+Agreement.
+Open Source Software：
+--------------------------------------------------------------------
+1. icetk
+File：https://github.com/THUDM/icetk

README.md ADDED Viewed

	@@ -0,0 +1,157 @@

+---
+license: mit
+language: en
+tags:
+- LLM
+- ChatGLM6B
+---
+## New Features (2023-06-20)
+- We now support both CUDA 11.X and 12.X.
+- lyraChatGLM has been further optimized: model load time drops from a few minutes to less than 10s in non-int8 mode, and to around 1 min in int8 mode!
+## Breakings!
+**We know what you want, and here you go!**
+- Newly released lyraChatGLM model, suitable for Ampere (A100/A10) as well as Volta (V100)
+- lyraChatGLM has been further optimized, reaching **9000 tokens/s** on A100 and **3900 tokens/s** on V100, about **5.5x** faster than the up-to-date official version (2023/6/1).
+- The memory usage was optimized too, now we can set batch_size up to **256** on A100!
+- INT8 weight only PTQ is supported
+**Note that the code was fully updated too, you need to use the new API, see `Uses` below**
+If you like our work and are considering joining us, feel free to drop a line to benbinwu@tencent.com.
+P.S. Recently we have received a lot of inquiries on accelerating customized models. Actually, we **do not have a plan** to release the conversion tool at this moment, nor do we think it would be possible to apply your customized models based on our current release.
+****
+## Model Card for lyraChatGLM
+lyraChatGLM is currently the **fastest ChatGLM-6B** available. To the best of our knowledge, it is the **first accelerated version of ChatGLM-6B**.
+The inference speed of lyraChatGLM has achieved a **300x** speedup over the early original version. We are still working hard to further improve the performance.
+Among its main features are:
+- weights: original ChatGLM-6B weights released by THUDM.
+- device: Nvidia GPU with Ampere architecture or Volta architecture (A100, A10, V100...).
+- batch_size: compiled with dynamic batch size, maximum depends on device.
+## Speed
+- original version (fixed batch infer): commit id 1d240ba
+### test on A100 40G
+1. The maximum batch size and maximum speed table for each version of the model.
+|version|max_batch_size|max_speed|
+|:-:|:-:|:-:|
+|original|1|30 tokens/s|
+|original(fixed batch infer)|192|1638.52 tokens/s|
+|lyraChatGLM(current)|256|9082.60 tokens/s|
+2. The speed table for the same batch size.
+|version|1 batch_size|8 batch_size| 64 batch_size | 128 batch_size |
+|:-:|:-:|:-:|:-:|:-:|
+|original|30 tokens/s| - | - | - |
+|original(fixed batch infer)|34.48 tokens/s|356.29 tokens/s|1638.52 tokens/s|1338.45 tokens/s|
+|lyraChatGLM(current)|110.05 tokens/s|843.60 tokens/s|4926.92 tokens/s|7235.04 tokens/s|
+### test on V100
+1. The maximum batch size and maximum speed table for each version of the model.
+|version|max_batch_size|max_speed|
+|:-:|:-:|:-:|
+|original|1|17.83 tokens/s|
+|original(fixed batch infer)|128|992.20 tokens/s|
+|lyraChatGLM(current)|192|3958.39 tokens/s|
+2. The speed table for the same batch size.
+|version|1 batch_size|8 batch_size| 64 batch_size | 128 batch_size |
+|:-:|:-:|:-:|:-:|:-:|
+|original|17.83 tokens/s| - | - | - |
+|original(fixed batch infer)|17.83 tokens/s|228.95 tokens/s|889.7 tokens/s|922.20 tokens/s|
+|lyraChatGLM(current)|59.33 tokens/s|514.15 tokens/s|2849.88 tokens/s|3958.39 tokens/s|
+## Model Sources
+- **Repository:** https://huggingface.co/THUDM/chatglm-6b
+## Docker Environment Recommendation
+- For Cuda 11.X: we recommend ```nvcr.io/nvidia/pytorch:22.12-py3```
+- For Cuda 12.0: we recommend ```nvcr.io/nvidia/pytorch:23.02-py3```
+```bash
+docker pull nvcr.io/nvidia/pytorch:23.02-py3
+docker run --rm -it --gpus all -v ./:/lyraChatGLM nvcr.io/nvidia/pytorch:23.02-py3
+pip install -r requirements.txt
+python demo.py
+```
+## Uses
+```python
+from lyraChatGLM import LyraChatGLM6B
+model_path = "./models/1-gpu-fp16.h5"
+tokenizer_path = "./models"
+data_type = "fp16"
+int8_mode = 0   # 1 for INT8 WEIGHT ONLY PTQ
+max_output_length = 150
+arch = "Ampere" # Ampere or Volta
+cuda_version = 12
+model = LyraChatGLM6B(model_path, tokenizer_path, data_type, int8_mode, arch, cuda_version)
+prompt = "列出3个不同的机器学习算法，并说明它们的适用范围."
+test_batch_size = 256
+prompts = [prompt, ]
+# If you want to get different output in same batch, you can set do_sample to True
+output_texts = model.generate(prompts, output_length=max_output_length,top_k=30, top_p=0.85, temperature=0.35, repetition_penalty=1.2, do_sample=False)
+print(output_texts)
+```
+## Demo output
+### input
+列出3个不同的机器学习算法，并说明它们的适用范围.
+### output
+以下是三个常见的机器学习算法及其适用范围:
+1. 决策树(Decision Tree):决策树是一种基于分类和回归问题的朴素贝叶斯模型。它通过构建一系列逐步分裂的分支来预测结果。适用于那些具有简单特征、大量数据且数据集大小在可接受范围内的情况。
+2. 随机森林(Random Forest):随机森林是一种集成学习算法,由多个决策树组成。它的优点是能够处理大规模数据和高维度的特征。适用于需要对多个变量进行建模的场景,例如医疗诊断、金融风险评估等。
+3. 支持向量机(Support Vector Machine):支持向量机是一种监督学习方法,通常用于分类问题。它可以处理高维数据,并且具有较高的准确性。适用于需要对高维数据进行分类或回归的问题,例如图像识别、自然语言处理等。
+## INT8
+**Int8 usage**:
+Our current version supports INT8 weight only PTQ. To enable this mode, simply modify the `int8_mode` to `1` in the demo.py file.
+**In this mode, gpu memory can be further reduced by about half and the speed can be doubled.**
+This solves the issue mentioned in https://github.com/THUDM/ChatGLM-6B/issues/1042.
+However, the speed gain is best achieved with a batch size of no more than 128. If you are not using an A100 GPU, reduce the
+batch size to get the benefits. We recommend a batch size of 64. This mode is very suitable for GPUs with
+limited VRAM or scenarios where it is difficult to use larger batch sizes in real-time services.
+It should be noted that although we have aligned the accuracy in our test cases, there may be slight differences
+in accuracy in some untested scenarios with int8. Please be aware of this.
+## Citation
+``` bibtex
+@Misc{lyraChatGLM2023,
+  author =       {Kangjian Wu and Zhengtao Wang and Yibo Lu and Bin Wu},
+  title =        {lyraChatGLM: Accelerating ChatGLM to 9000+ tokens/s},
+  howpublished = {\url{https://huggingface.co/TMElyralab/lyraChatGLM}},
+  year =         {2023}
+}
+```
+## Report bug
+- Start a discussion to report any bugs: https://huggingface.co/TMElyralab/lyraChatGLM/discussions
+- Mark bug reports with `[bug]` in the title.

demo.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from lyraChatGLM import LyraChatGLM6B
+import numpy as np
+model_path = "./models/1-gpu-fp16.bin"
+tokenizer_path = "./models"
+data_type = "fp16"
+int8_mode = 0
+max_output_length = 150
+arch = "Ampere" # Ampere or Volta
+cuda_version = 12 # cuda version, we currently support 11 and 12
+model = LyraChatGLM6B(model_path, tokenizer_path, data_type, int8_mode, arch, cuda_version)
+prompt = "今天天气大概 25度，有点小雨，吹着风，我想去户外散步，应该穿什么样的衣服裤子鞋子搭配。"
+# test_batch_size = 256
+prompts = [prompt, ]
+# # If you want to get different output in same batch, you can set do_sample to True
+output_texts = model.generate(prompts, output_length=max_output_length,top_k=30, top_p=0.85, temperature=0.35, repetition_penalty=1.2, do_sample=False)
+print(output_texts)

lyraChatGLM/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .lyra_glm import LyraChatGLM6B

lyraChatGLM/config.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import dataclasses
+from typing import Optional
+@dataclasses.dataclass
+class ChatGLM6BParam:
+    num_heads: int = 32
+    size_per_head: int = 128
+    inter_size: int = 16384
+    num_layers: int = 28
+    vocab_size: int = 130528
+    start_id: Optional[int] = 130004
+    end_id: Optional[int] = 130005
+    tensor_para_size: int = 1
+    pipeline_para_size: int = 1
+    remove_padding: bool = True
+    shared_contexts_ratio: float = 0.0
+    layernorm_eps: float = 1e-5
+    weights_data_type: str = "fp16"
+    def __post_init__(self):
+        if not 0.0 <= self.shared_contexts_ratio <= 1.0:
+            raise ValueError(
+                f'Got an invalid value of shared_context_ratio '
+                f'{self.shared_contexts_ratio} - range: [0.0, 1.0]')
+    def asdict(self):
+        return dataclasses.asdict(self)
+CHATGLM_6B_PARAM = ChatGLM6BParam()

lyraChatGLM/ftlib/libth_transformer_sm70_cu11.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a4a778897f6c5f77b0ea1cb14bb63732da9c3cc4e16ff16d9f911dcc8b6f6be5
+size 114267536

lyraChatGLM/ftlib/libth_transformer_sm70_cu12.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:99ac80b2f4c161bbacbf64a7607f323c612c7c5f26b83eaec7f559425f3a818b
+size 114186112

lyraChatGLM/ftlib/libth_transformer_sm80_cu11.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a1d6cd03321b671275fcabb4136562845233875564047ccde20401fca4df45c2
+size 200834616

lyraChatGLM/ftlib/libth_transformer_sm80_cu12.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2da10aad8e92bcdf45b15884cee63e845f582cd28bcc0f7f1c2a4f6a101e9646
+size 200916960

lyraChatGLM/lyra_glm.py ADDED Viewed

	@@ -0,0 +1,174 @@

+from __future__ import annotations
+import configparser
+import pathlib
+import typing
+import torch
+import transformers
+from .config import CHATGLM_6B_PARAM
+from .model import ChatGLM6BModel
+class LyraChatGLM6B:
+    def __init__(self, model_path, tokenizer_path=None, dtype='fp16', int8_mode=0, arch="Ampere", cuda_version="11") -> None:
+        self.model_path = model_path
+        self.tokenizer_path = tokenizer_path
+        self.dtype = dtype
+        self.arch=arch
+        # if dtype != 'int8':
+        #     int8_mode = 0
+        self.cuda_version = cuda_version
+        self.int8_mode = int8_mode
+        self.model, self.tokenizer = self.load_model_and_tokenizer()
+        if not (arch in ["Ampere", "Volta"]):
+            raise ValueError("Only support GPU device Ampere(A100,A10) or Volta(V100)")
+        print("Got model and tokenizer")
+    def load_model_and_tokenizer(self):
+        if self.tokenizer_path is None:
+            tokenizer_path = self.model_path
+        else:
+            tokenizer_path = self.tokenizer_path
+        print(f'Loading tokenizer from {pathlib.Path(tokenizer_path).parent}')
+        tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
+        checkpoint_path = pathlib.Path(self.model_path)
+        config_path = checkpoint_path.parent / 'config.ini'
+        if config_path.exists():
+            # Read model params from config.
+            cfg = configparser.ConfigParser()
+            cfg.read(config_path)
+            model_name = 'glm6b'
+            inference_data_type = self.dtype
+            if inference_data_type == None:
+                inference_data_type = cfg.get(model_name, "weight_data_type")
+            model_args = dict(
+                head_num=cfg.getint(model_name, 'head_num'),
+                size_per_head=cfg.getint(model_name, "size_per_head"),
+                layer_num=cfg.getint(model_name, "num_layer"),
+                tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
+                vocab_size=cfg.getint(model_name, "vocab_size"),
+                start_id=cfg.getint(model_name, "start_id"),
+                end_id=cfg.getint(model_name, "end_id"),
+                weights_data_type=cfg.get(model_name, "weight_data_type"),
+                layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
+                inference_data_type=inference_data_type)
+        else:
+            inference_data_type = self.dtype
+            if inference_data_type == None:
+                inference_data_type = CHATGLM_6B_PARAM.weights_data_type
+            model_args = dict(head_num=CHATGLM_6B_PARAM.num_heads,
+                              size_per_head=CHATGLM_6B_PARAM.size_per_head,
+                              vocab_size=CHATGLM_6B_PARAM.vocab_size,
+                              start_id=CHATGLM_6B_PARAM.start_id or tokenizer.bos_token_id,
+                              end_id=CHATGLM_6B_PARAM.end_id or tokenizer.eos_token_id,
+                              layer_num=CHATGLM_6B_PARAM.num_layers,
+                              tensor_para_size=CHATGLM_6B_PARAM.tensor_para_size,
+                              weights_data_type=CHATGLM_6B_PARAM.weights_data_type,
+                              layernorm_eps=CHATGLM_6B_PARAM.layernorm_eps,
+                              inference_data_type=inference_data_type,
+                              )
+        # update common parameters
+        model_args.update(dict(
+            rotary_embedding_dim=64,
+            max_seq_len=0,  # for position seq embedding
+            pipeline_para_size=CHATGLM_6B_PARAM.pipeline_para_size,
+            shared_contexts_ratio=CHATGLM_6B_PARAM.shared_contexts_ratio,
+            int8_mode=self.int8_mode,
+            model_path=self.model_path,
+            cuda_version=self.cuda_version,
+        ))
+        print('[INFO] Load Our Highly Optimized LyraChatGLM6B model')
+        for k, v in model_args.items():
+            print(f' - {k.ljust(25, ".")}: {v}')
+        # Check sanity and consistency between the model and tokenizer.
+        checklist = ['head_num', 'size_per_head', 'vocab_size', 'layer_num',
+                     'tensor_para_size', 'tensor_para_size', 'weights_data_type']
+        if None in [model_args[k] for k in checklist]:
+            none_params = [p for p in checklist if model_args[p] is None]
+            print(f'[WARNING] Found None parameters {none_params}. They must '
+                  f'be provided either by config file or CLI arguments.')
+        if model_args['start_id'] != tokenizer.bos_token_id:
+            print('[WARNING] Given start_id is not matched with the bos token '
+                  'id of the pretrained tokenizer.')
+        if model_args['end_id'] not in (tokenizer.pad_token_id, tokenizer.eos_token_id):
+            print('[WARNING] Given end_id is not matched with neither pad '
+                  'token id nor eos token id of the pretrained tokenizer.')
+        print(f'Loading tokenizer from {self.model_path}')
+        model = ChatGLM6BModel(arch=self.arch,**model_args)
+        return model, tokenizer
+    def generate(self, prompts: typing.List[str] | str,
+                 output_length: int = 512,
+                 beam_width: int = 1,
+                 top_k: typing.Optional[torch.IntTensor] = 1,
+                 top_p: typing.Optional[torch.FloatTensor] = 1.0,
+                 beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = 0.0,
+                 temperature: typing.Optional[torch.FloatTensor] = 1.0,
+                 len_penalty: typing.Optional[torch.FloatTensor] = 0.0,
+                 repetition_penalty: typing.Optional[torch.FloatTensor] = 1.0,
+                 presence_penalty: typing.Optional[torch.FloatTensor] = None,
+                 min_length: typing.Optional[torch.IntTensor] = None,
+                 bad_words_list: typing.Optional[torch.IntTensor] = None,
+                 do_sample: bool = False,
+                 return_output_length: bool = False,
+                 return_cum_log_probs: int = 0):
+        #
+        if isinstance(prompts, str):
+            prompts = [prompts, ]
+        inputs = prompts
+        batch_size = len(inputs)
+        ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
+        ones_float = torch.ones(size=[batch_size], dtype=torch.float32)
+        input_token_ids = self.tokenizer(prompts, return_tensors="pt", padding=True).input_ids.int()
+        input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
+        mask_positions = torch.IntTensor([seq.index(130001) for seq in input_token_ids.tolist()])
+        random_seed = None
+        if do_sample:
+            random_seed = torch.randint(0, 262144, (batch_size,), dtype=torch.long)
+        outputs = self.model(start_ids=input_token_ids,
+                             start_lengths=input_lengths,
+                             mask_positions=mask_positions,
+                             output_len=output_length,
+                             beam_width=beam_width,
+                             top_k=top_k*ones_int,
+                             top_p=top_p*ones_float,
+                             beam_search_diversity_rate=beam_search_diversity_rate*ones_float,
+                             temperature=temperature*ones_float,
+                             len_penalty=len_penalty*ones_float,
+                             repetition_penalty=repetition_penalty*ones_float,
+                             presence_penalty=presence_penalty,
+                             min_length=min_length,
+                             random_seed=random_seed,
+                             bad_words_list=bad_words_list,
+                             return_output_length=return_output_length,
+                             return_cum_log_probs=return_cum_log_probs)
+        if return_cum_log_probs > 0:
+            outputs = outputs[0]  # output_token_ids.
+        # Slice the generated token ids of the 1st beam result.
+        # output = input tokens + generated tokens.
+        output_token_ids = [out[0, length:].cpu()
+                            for out, length in zip(outputs, input_lengths)]
+        output_texts = self.tokenizer.batch_decode(
+            output_token_ids, skip_special_tokens=False)
+        return output_texts

lyraChatGLM/model.py ADDED Viewed

	@@ -0,0 +1,194 @@

+import os
+import h5py
+import pathlib
+import typing
+import numpy as np
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+class ChatGLM6BModel(nn.Module):
+    def __init__(self,
+                 head_num, size_per_head,
+                 vocab_size,
+                 rotary_embedding_dim,
+                 start_id, end_id, layer_num,
+                 arch,
+                 max_seq_len: int,
+                 tensor_para_size: int,
+                 pipeline_para_size: int,
+                 inference_data_type: str,
+                 model_path,
+                 cuda_version,
+                 inter_size: int = 0,
+                 # glm_variant_params
+                 layernorm_eps: float = 1e-5,
+                 layernorm_type: typing.Literal['pre_layernorm', 'post_layernorm'] = "pre_layernorm",
+                 activation_type: str = "Gelu",
+                 gpt_with_moe: bool = False,
+                 expert_num: int = 0,
+                 moe_k: int = 0,
+                 moe_layer_index: typing.List = [],
+                 has_positional_encoding: bool = False,
+                 has_pre_decoder_layernorm: bool = False,
+                 has_post_decoder_layernorm: bool = True,
+                 has_adapters: bool = False,
+                 adapter_inter_size: int = 0,
+                 use_attention_linear_bias: bool = False,
+                 int8_mode: int = 0,
+                 weights_data_type: typing.Union[str, np.dtype] = np.float32,
+                 shared_contexts_ratio: float = 1.0):
+        super().__init__()
+        self.head_num = head_num
+        self.size_per_head = size_per_head
+        self.vocab_size = vocab_size
+        self.rotary_embedding_dim = rotary_embedding_dim
+        self.start_id = start_id
+        self.end_id = end_id
+        self.layer_num = layer_num
+        self.inter_size = inter_size if inter_size != 0 else 4 * self.head_num * self.size_per_head
+        self.arch = arch
+        self.model_path = model_path
+        # gpt_variant_params
+        self.layernorm_eps = layernorm_eps
+        self.layernorm_type = layernorm_type
+        self.activation_type = activation_type
+        self.gpt_with_moe = gpt_with_moe
+        self.expert_num = expert_num
+        self.moe_k = moe_k
+        self.moe_layer_index = moe_layer_index
+        self.has_positional_encoding = has_positional_encoding
+        self.has_pre_decoder_layernorm = has_pre_decoder_layernorm
+        self.has_post_decoder_layernorm = has_post_decoder_layernorm
+        self.has_adapters = has_adapters
+        self.adapter_inter_size = adapter_inter_size
+        self.use_attention_linear_bias = use_attention_linear_bias
+        # multi-gpu params
+        self.tensor_para_size = tensor_para_size
+        self.pipeline_para_size = pipeline_para_size
+        self.use_sparse_gemm = False
+        self.build_model = False
+        self.int8_mode = int8_mode
+        self.weights_data_type = weights_data_type
+        self.shared_contexts_ratio = shared_contexts_ratio
+        assert torch.cuda.is_available(), "CUDA is required for this model."
+        assert head_num % tensor_para_size == 0, "head_num must be a multiple of tensor_para_size."
+        assert layer_num % pipeline_para_size == 0, "layer_num must be a multiple of pipeline_para_size."
+        self.device = 0
+        # Load the C++ model into Pytorch model.
+        sm = "sm80"
+        if arch == "Ampere":
+            sm = "sm80"
+        elif arch == "Volta":
+            sm = "sm70"
+        else:
+            raise Exception(f"unsupported arch: {arch}")
+        cu = 'cu11'
+        if cuda_version == 11:
+            cu = 'cu11'
+        elif cuda_version == 12:
+            cu = 'cu12'
+        else:
+            raise Exception(f"unsupported cuda version: {cuda_version}")
+        lib_path = pathlib.Path(__file__).parent / "ftlib" / f"libth_transformer_{sm}_{cu}.so"
+        torch.classes.load_library(os.path.abspath(lib_path))
+        self.model = torch.classes.FasterTransformer.GlmOp(
+            self.head_num, self.size_per_head, self.inter_size,
+            self.layer_num,
+            self.expert_num,
+            self.moe_k,
+            self.moe_layer_index,
+            self.vocab_size,
+            self.rotary_embedding_dim,
+            self.start_id, self.end_id,
+            self.tensor_para_size, self.pipeline_para_size, self.int8_mode,
+            # GLM variant parameters
+            self.layernorm_eps,
+            self.layernorm_type,
+            self.activation_type,
+            self.has_positional_encoding,
+            self.has_pre_decoder_layernorm,
+            self.has_post_decoder_layernorm,
+            self.has_adapters,
+            self.adapter_inter_size,
+            self.use_attention_linear_bias,
+            self.model_path,
+            inference_data_type,
+            self.shared_contexts_ratio)
+        self.build_model = True
+    def forward(self,
+                start_ids: torch.IntTensor,
+                start_lengths: torch.IntTensor,
+                mask_positions: torch.IntTensor,
+                output_len: int,
+                beam_width: int = 1,
+                top_k: typing.Optional[torch.IntTensor] = None,
+                top_p: typing.Optional[torch.FloatTensor] = None,
+                beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = None,
+                temperature: typing.Optional[torch.FloatTensor] = None,
+                len_penalty: typing.Optional[torch.FloatTensor] = None,
+                repetition_penalty: typing.Optional[torch.FloatTensor] = None,
+                presence_penalty: typing.Optional[torch.FloatTensor] = None,
+                min_length: typing.Optional[torch.IntTensor] = None,
+                random_seed: typing.Optional[torch.LongTensor] = None,
+                bad_words_list: typing.Optional[torch.IntTensor] = None,
+                return_output_length: bool = False,
+                return_cum_log_probs: int = 0):
+        input_len = start_ids.size(1)
+        assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."
+        # Inputs to device
+        start_ids = start_ids.cuda(self.device)
+        start_lengths = start_lengths.cuda(self.device)
+        mask_positions = mask_positions.cuda(self.device)
+        # outputs: output_ids, output_lengths, output_cum_log_probs (optional)
+        outputs = self.model.forward(start_ids,
+                                     start_lengths,
+                                     mask_positions,
+                                     output_len,
+                                     beam_width,  # optional, can be None
+                                     top_k,  # optional, can be None
+                                     top_p,  # optional, can be None
+                                     beam_search_diversity_rate,  # optional, can be None
+                                     temperature,  # optional, can be None
+                                     len_penalty,  # optional, can be None
+                                     repetition_penalty,  # optional, can be None
+                                     presence_penalty,  # optional, can be None
+                                     min_length,  # optional, can be None
+                                     random_seed,  # optional, can be None
+                                     bad_words_list,  # optional, can be None
+                                     return_cum_log_probs)  # optional, can be None
+        if return_cum_log_probs == 0:
+            output_ids, output_lengths = outputs
+        else:
+            output_ids, output_lengths, output_cum_log_probs = outputs
+        if return_output_length:
+            if return_cum_log_probs > 0:
+                return output_ids, output_lengths, output_cum_log_probs
+            else:
+                return output_ids, output_lengths
+        else:
+            return output_ids
+    def set_input_tensor(self, input_tensor):
+        """Set input tensor to be used instead of forward()'s input.
+        When doing pipeline parallelism the input from the previous
+        stage comes from communication, not from the input, so the
+        model's forward_step_func won't have it. This function is thus
+        used by internal code to bypass the input provided by the
+        forward_step_func"""
+        self.input_tensor = input_tensor

models/config.ini ADDED Viewed

	@@ -0,0 +1,13 @@

+[glm6b]
+model_name = chatglm-6b
+head_num = 32
+size_per_head = 128
+inter_size = 16384
+max_pos_seq_len = 2048
+num_layer = 28
+vocab_size = 130528
+start_id = 130004
+end_id = 130005
+weight_data_type = fp16
+tensor_para_size = 1
+layernorm_eps = 1e-5

models/ice_text.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5e974d9a69c242ce014c88c2b26089270f6198f3c0b700a887666cd3e816f17e
+size 2706249

models/tokenization_chatglm.py ADDED Viewed

	@@ -0,0 +1,443 @@

+"""Tokenization classes for ChatGLM."""
+from typing import List, Optional, Union
+import os
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.utils import logging, PaddingStrategy
+from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
+from typing import Dict
+import sentencepiece as spm
+import numpy as np
+logger = logging.get_logger(__name__)
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "THUDM/chatglm-6b": 2048,
+}
+class TextTokenizer:
+    def __init__(self, model_path):
+        self.sp = spm.SentencePieceProcessor()
+        self.sp.Load(model_path)
+        self.num_tokens = self.sp.vocab_size()
+    def encode(self, text):
+        return self.sp.EncodeAsIds(text)
+    def decode(self, ids: List[int]):
+        return self.sp.DecodeIds(ids)
+    def tokenize(self, text):
+        return self.sp.EncodeAsPieces(text)
+    def convert_tokens_to_string(self, tokens):
+        return self.sp.DecodePieces(tokens)
+    def convert_tokens_to_ids(self, tokens):
+        return [self.sp.PieceToId(token) for token in tokens]
+    def convert_token_to_id(self, token):
+        return self.sp.PieceToId(token)
+    def convert_id_to_token(self, idx):
+        return self.sp.IdToPiece(idx)
+    def __len__(self):
+        return self.num_tokens
+class SPTokenizer:
+    def __init__(
+            self,
+            vocab_file,
+            num_image_tokens=20000,
+            max_blank_length=80,
+            byte_fallback=True,
+    ):
+        assert vocab_file is not None
+        self.vocab_file = vocab_file
+        self.num_image_tokens = num_image_tokens
+        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
+        self.max_blank_length = max_blank_length
+        self.byte_fallback = byte_fallback
+        self.text_tokenizer = TextTokenizer(vocab_file)
+    def _get_text_tokenizer(self):
+        return self.text_tokenizer
+    @staticmethod
+    def get_blank_token(length: int):
+        assert length >= 2
+        return f"<|blank_{length}|>"
+    @staticmethod
+    def get_tab_token():
+        return f"<|tab|>"
+    @property
+    def num_text_tokens(self):
+        return self.text_tokenizer.num_tokens
+    @property
+    def num_tokens(self):
+        return self.num_image_tokens + self.num_text_tokens
+    @staticmethod
+    def _encode_whitespaces(text: str, max_len: int = 80):
+        text = text.replace("\t", SPTokenizer.get_tab_token())
+        for i in range(max_len, 1, -1):
+            text = text.replace(" " * i, SPTokenizer.get_blank_token(i))
+        return text
+    def _preprocess(self, text: str, linebreak=True, whitespaces=True):
+        if linebreak:
+            text = text.replace("\n", "<n>")
+        if whitespaces:
+            text = self._encode_whitespaces(text, max_len=self.max_blank_length)
+        return text
+    def encode(
+            self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
+    ) -> List[int]:
+        """
+        @param text: Text to encode.
+        @param linebreak: Whether to encode newline (\n) in text.
+        @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
+        @param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
+        @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
+        """
+        text = self._preprocess(text, linebreak, whitespaces)
+        if not add_dummy_prefix:
+            text = "<n>" + text
+        tmp = self._get_text_tokenizer().encode(text)
+        tokens = [x + self.num_image_tokens for x in tmp]
+        return tokens if add_dummy_prefix else tokens[2:]
+    def postprocess(self, text):
+        text = text.replace("<n>", "\n")
+        text = text.replace(SPTokenizer.get_tab_token(), "\t")
+        for i in range(2, self.max_blank_length + 1):
+            text = text.replace(self.get_blank_token(i), " " * i)
+        return text
+    def decode(self, text_ids: List[int]) -> str:
+        ids = [int(_id) - self.num_image_tokens for _id in text_ids]
+        ids = [_id for _id in ids if _id >= 0]
+        text = self._get_text_tokenizer().decode(ids)
+        text = self.postprocess(text)
+        return text
+    def decode_tokens(self, tokens: List[str]) -> str:
+        text = self._get_text_tokenizer().convert_tokens_to_string(tokens)
+        text = self.postprocess(text)
+        return text
+    def tokenize(
+            self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
+    ) -> List[str]:
+        """
+        @param text: Text to encode.
+        @param linebreak: Whether to encode newline (\n) in text.
+        @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
+        @param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
+        @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
+        """
+        text = self._preprocess(text, linebreak, whitespaces)
+        if not add_dummy_prefix:
+            text = "<n>" + text
+        tokens = self._get_text_tokenizer().tokenize(text)
+        return tokens if add_dummy_prefix else tokens[2:]
+    def __getitem__(self, x: Union[int, str]):
+        if isinstance(x, int):
+            if x < self.num_image_tokens:
+                return "<image_{}>".format(x)
+            else:
+                return self.text_tokenizer.convert_id_to_token(x - self.num_image_tokens)
+        elif isinstance(x, str):
+            if x.startswith("<image_") and x.endswith(">") and x[7:-1].isdigit():
+                return int(x[7:-1])
+            else:
+                return self.text_tokenizer.convert_token_to_id(x) + self.num_image_tokens
+        else:
+            raise ValueError("The key should be str or int.")
+class ChatGLMTokenizer(PreTrainedTokenizer):
+    """
+    Construct a ChatGLM tokenizer. Based on byte-level Byte-Pair-Encoding.
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+    """
+    vocab_files_names = {"vocab_file": "ice_text.model"}
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["input_ids", "attention_mask", "position_ids"]
+    def __init__(
+            self,
+            vocab_file,
+            do_lower_case=False,
+            remove_space=False,
+            bos_token='<sop>',
+            eos_token='<eop>',
+            end_token='</s>',
+            mask_token='[MASK]',
+            gmask_token='[gMASK]',
+            padding_side="left",
+            pad_token="<pad>",
+            unk_token="<unk>",
+            num_image_tokens=20000,
+            **kwargs
+    ) -> None:
+        super().__init__(
+            do_lower_case=do_lower_case,
+            remove_space=remove_space,
+            padding_side=padding_side,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            end_token=end_token,
+            mask_token=mask_token,
+            gmask_token=gmask_token,
+            pad_token=pad_token,
+            unk_token=unk_token,
+            num_image_tokens=num_image_tokens,
+            **kwargs
+        )
+        self.do_lower_case = do_lower_case
+        self.remove_space = remove_space
+        self.vocab_file = vocab_file
+        self.bos_token = bos_token
+        self.eos_token = eos_token
+        self.end_token = end_token
+        self.mask_token = mask_token
+        self.gmask_token = gmask_token
+        self.sp_tokenizer = SPTokenizer(vocab_file, num_image_tokens=num_image_tokens)
+        """ Initialisation """
+    @property
+    def gmask_token_id(self) -> Optional[int]:
+        if self.gmask_token is None:
+            return None
+        return self.convert_tokens_to_ids(self.gmask_token)
+    @property
+    def end_token_id(self) -> Optional[int]:
+        """
+        `Optional[int]`: Id of the end of context token in the vocabulary. Returns `None` if the token has not been
+        set.
+        """
+        if self.end_token is None:
+            return None
+        return self.convert_tokens_to_ids(self.end_token)
+    @property
+    def vocab_size(self):
+        """ Returns vocab size """
+        return self.sp_tokenizer.num_tokens
+    def get_vocab(self):
+        """ Returns vocab as a dict """
+        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+    def preprocess_text(self, inputs):
+        if self.remove_space:
+            outputs = " ".join(inputs.strip().split())
+        else:
+            outputs = inputs
+        if self.do_lower_case:
+            outputs = outputs.lower()
+        return outputs
+    def _tokenize(self, text, **kwargs):
+        """ Returns a tokenized string. """
+        text = self.preprocess_text(text)
+        seq = self.sp_tokenizer.tokenize(text)
+        return seq
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        return self.sp_tokenizer.decode_tokens(tokens)
+    def _decode(
+            self,
+            token_ids: Union[int, List[int]],
+            **kwargs
+    ) -> str:
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+        if len(token_ids) == 0:
+            return ""
+        if self.pad_token_id in token_ids:  # remove pad
+            token_ids = list(filter((self.pad_token_id).__ne__, token_ids))
+        return super()._decode(token_ids, **kwargs)
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str) in an id using the vocab. """
+        return self.sp_tokenizer[token]
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.sp_tokenizer[index]
+    def save_vocabulary(self, save_directory, filename_prefix=None):
+        """
+        Save the vocabulary and special tokens file to a directory.
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+            filename_prefix (`str`, *optional*):
+                An optional prefix to add to the named of the saved files.
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory, self.vocab_files_names["vocab_file"]
+            )
+        else:
+            vocab_file = save_directory
+        with open(self.vocab_file, 'rb') as fin:
+            proto_str = fin.read()
+        with open(vocab_file, "wb") as writer:
+            writer.write(proto_str)
+        return (vocab_file,)
+    def build_inputs_with_special_tokens(
+            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A BERT sequence has the following format:
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        gmask_id = self.sp_tokenizer[self.gmask_token]
+        eos_id = self.sp_tokenizer[self.eos_token]
+        token_ids_0 = token_ids_0 + [gmask_id, self.sp_tokenizer[self.bos_token]]
+        if token_ids_1 is not None:
+            token_ids_0 = token_ids_0 + token_ids_1 + [eos_id]
+        return token_ids_0
+    def _pad(
+            self,
+            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+            max_length: Optional[int] = None,
+            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+            pad_to_multiple_of: Optional[int] = None,
+            return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Left-pad one encoded input and, when a target length is known, build its attention mask and position ids.
+        `encoded_inputs` is modified in place and also returned.
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+                The entry named by `self.model_input_names[0]` is the sequence that gets padded.
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Will truncate by taking into account the special tokens.
+            padding_strategy: PaddingStrategy to use for padding.
+                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length
+                - PaddingStrategy.DO_NOT_PAD: Do not pad (default)
+                Only `self.padding_side == "left"` is supported; any other value fails the assertion below.
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            return_attention_mask:
+                (optional) Accepted for signature compatibility; this implementation does not read it.
+        Returns:
+            `dict`: The same `encoded_inputs` object, possibly with `attention_mask` and `position_ids` added
+            and with every present field padded on the left.
+        """
+        # Resolve the special-token ids through the SPTokenizer.
+        bos_token_id = self.sp_tokenizer[self.bos_token]
+        mask_token_id = self.sp_tokenizer[self.mask_token]
+        gmask_token_id = self.sp_tokenizer[self.gmask_token]
+        # Everything below prepends padding, so only left padding is implemented.
+        assert self.padding_side == "left"
+        required_input = encoded_inputs[self.model_input_names[0]]
+        seq_length = len(required_input)
+        # LONGEST: the target is this input's own length.
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+        # Round the target length up to the next multiple of `pad_to_multiple_of`.
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+        # Initialize attention mask if not present.
+        # Note: mask and position ids are only synthesised when a target length is known.
+        if max_length is not None:
+            if "attention_mask" not in encoded_inputs:
+                # The context is everything before the first bos token (the whole input if there is none).
+                if bos_token_id in required_input:
+                    context_length = required_input.index(bos_token_id)
+                else:
+                    context_length = seq_length
+                # Shape (1, seq, seq): lower-triangular ones, with the context columns set to 1 in every row.
+                attention_mask = np.ones((1, seq_length, seq_length))
+                attention_mask = np.tril(attention_mask)
+                attention_mask[:, :, :context_length] = 1
+                # Invert to booleans: True exactly where the matrix above is 0.
+                attention_mask = np.bool_(attention_mask < 0.5)
+                encoded_inputs["attention_mask"] = attention_mask
+            if "position_ids" not in encoded_inputs:
+                if bos_token_id in required_input:
+                    context_length = required_input.index(bos_token_id)
+                else:
+                    context_length = seq_length
+                position_ids = np.arange(seq_length, dtype=np.int64)
+                # Prefer the mask token when it occurs in the input, otherwise fall back to the gmask token.
+                mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
+                if mask_token in required_input:
+                    # Every position from the bos token onwards reuses the index of the first mask token.
+                    mask_position = required_input.index(mask_token)
+                    position_ids[context_length:] = mask_position
+                # Second row: 0 across the context, then 1, 2, ... for the positions after it.
+                block_position_ids = np.concatenate(
+                    [np.zeros(context_length, dtype=np.int64),
+                     np.arange(1, seq_length - context_length + 1, dtype=np.int64)])
+                # Result has shape (2, seq_length): [position_ids, block_position_ids].
+                encoded_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+            if "attention_mask" in encoded_inputs:
+                # Prepend `difference` rows and columns filled with True.
+                encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
+                                                          pad_width=[(0, 0), (difference, 0), (difference, 0)],
+                                                          mode='constant', constant_values=True)
+            if "token_type_ids" in encoded_inputs:
+                encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+                    "token_type_ids"
+                ]
+            if "special_tokens_mask" in encoded_inputs:
+                encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+            if "position_ids" in encoded_inputs:
+                # np.pad defaults to constant zero padding; both rows get `difference` leading zeros.
+                encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
+                                                        pad_width=[(0, 0), (difference, 0)])
+            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+        return encoded_inputs

models/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "name_or_path": "THUDM/chatglm-6b",
+  "bos_token": "<sop>",
+  "eos_token": "<eop>",
+  "end_token": "</s>",
+  "gmask_token": "[gMASK]",
+  "mask_token": "[MASK]",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>",
+  "remove_space": false,
+  "do_lower_case": false,
+  "tokenizer_class": "ChatGLMTokenizer",
+  "num_image_tokens": 0,
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_chatglm.ChatGLMTokenizer",
+      null
+      ]
+  }
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+icetk
+cpm_kernels
+transformers
+huggingface_hub
+numpy
+setuptools
+torch
+h5py
+protobuf==3.20.3