Bo1015 committed on
Commit
0dce0bd
1 Parent(s): a5de5b9

Upload 27 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ resources/demo.gif filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,3 +1,109 @@
1
- ---
2
- license: apache-2.0
3
- ---
1
+ # MSAGPT
2
+
3
+ <table>
4
+ <tr>
5
+ <td>
6
+ <h2>MSAGPT</h2>
7
+ <p>📖 Paper: <a href="https://arxiv.org/abs/2406.05347">MSAGPT: Neural Prompting Protein Structure Prediction via MSA Generative Pre-Training</a></p>
8
+ <p><b>MSAGPT</b> is a powerful protein language model (PLM) with 3 billion parameters, released in three versions, MSAGPT, MSAGPT-SFT, and MSAGPT-DPO, <b>supporting zero-shot and few-shot MSA generation</b>.</p>
9
+ <p><b>MSAGPT achieves state-of-the-art structure prediction performance in scenarios where natural MSAs are scarce</b>.</p>
10
+ </td>
11
+ </tr>
12
+ </table>
13
+
14
+
15
+ ## Overall Framework
16
+ <p align="center">
17
+ <img src="resources/overall_frame.png" alt="Overall framework of MSAGPT" style="display: block; margin: auto; width: 90%;">
18
+ </p>
19
+
20
+ ## Visualized Cases
21
+ Visualization of improved structure prediction compared with natural MSA.
22
+ <font color=orange>Yellow</font>: Ground truth;
23
+ <font color=purple>Purple</font>: Predictions based on MSA generated by MSAGPT;
24
+ <font color=cyan>Cyan</font>: Predictions based on natural MSA.
25
+
26
+ <p align="center">
27
+ <img src="resources/app_case.png" alt="Visualized structure prediction cases" style="display: block; margin: auto; width: 90%;">
28
+ </p>
29
+
30
+
31
+ ## Get Started
32
+
33
+ ### Option 1: Deploy MSAGPT by yourself
34
+
35
+ We currently provide a command-line interface (CLI) for model inference; a web demo is planned (see below).
36
+
37
+ First, we need to install the dependencies.
38
+
39
+ ```bash
40
+ # CUDA >= 11.8
41
+ pip install -r requirements.txt
42
+ ```
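+ 
+ To confirm the environment is ready, you can optionally run a quick sanity check (a minimal sketch, assuming a CUDA-capable GPU is visible to PyTorch):
+ 
+ ```bash
+ # optional: verify the installed PyTorch build and that it can see the GPU
+ python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
+ ```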
43
+
44
+ #### Model List
45
+ You can manually download the necessary weights, then unzip them and place them in the **checkpoints** folder (a short download sketch follows the table).
46
+
47
+ | Model | Type | Seq Length | Download |
48
+ |------------------|------|------------|-----------------------------------------------------------------------------------------------------------------------------------------|
49
+ | MSAGPT | Base | 16K | [🤗 Huggingface](https://cloud.tsinghua.edu.cn/f/ebfc954a4cd24cef9243/?dl=1) [🔨 SwissArmyTransformer](https://cloud.tsinghua.edu.cn/f/ebfc954a4cd24cef9243/?dl=1) |
50
+ | MSAGPT-SFT | SFT | 16K | [🤗 Huggingface](https://cloud.tsinghua.edu.cn/f/ebfc954a4cd24cef9243/?dl=1) [🔨 SwissArmyTransformer](https://cloud.tsinghua.edu.cn/f/32da3eadf6e042aab2fa/?dl=1) |
51
+ | MSAGPT-DPO | RLHF | 16K | [🤗 Huggingface](https://cloud.tsinghua.edu.cn/f/ebfc954a4cd24cef9243/?dl=1) [🔨 SwissArmyTransformer](https://cloud.tsinghua.edu.cn/f/ebfc954a4cd24cef9243/?dl=1) |
52
+
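+ 
+ A minimal download sketch (the download URL comes from the table above; the local file name `MSAGPT-DPO.zip` and target paths are illustrative):
+ 
+ ```bash
+ mkdir -p checkpoints
+ # replace <download-link> with the corresponding link from the Model List table
+ wget -O MSAGPT-DPO.zip "<download-link>"
+ unzip MSAGPT-DPO.zip -d checkpoints/
+ ```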
53
+
54
+ #### Situation 1.1 CLI (SAT version)
55
+
56
+ Run CLI demo via:
57
+
58
+ ```bash
59
+ # Online Chat
60
+ bash scripts/cli_sat.sh --from_pretrained ./checkpoints/MSAGPT-DPO --input-source chat --stream_chat --max-gen-length 1024
61
+ ```
62
+
63
+ The program runs interactively in the command line. To generate virtual MSAs, enter the query protein sequence (optionally followed by a few MSA sequences as few-shot prompts, joined by "\<M\>") and press Enter. For example, in "PEGKQGDPGIPGEPGPPGPPGPQGARGPPG\<M\>VTVEFVNSCLIGDMGVDGPPGQQGQPGPPG", "PEGKQGDPGIPGEPGPPGPPGPQGARGPPG" is the main sequence and "VTVEFVNSCLIGDMGVDGPPGQQGQPGPPG" is an MSA prompt. Enter `stop` to end the program. The chat CLI looks like:
64
+ <p align="center">
65
+ <img src="resources/demo.gif" alt="MSAGPT chat CLI demo" style="display: block; margin: auto; width: 90%;">
66
+ </p>
67
+
68
+
69
+ You can also enable offline generation by setting **--input-source \<your input file\>** and **--output-path \<your output path\>**.
+ An example input file, *msa_input*, is provided; a minimal end-to-end sketch follows the command below.
71
+ ```bash
72
+ # Offline Generation
73
+ bash scripts/cli_sat.sh --from_pretrained ./checkpoints/MSAGPT-DPO --input-source <your input file> --output-path <your output path> --max-gen-length 1024
74
+ ```
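+ 
+ Each line of the input file is one query: a main sequence, optionally followed by few-shot MSA prompts joined by "\<M\>". A minimal end-to-end sketch (the file name `my_msa_input` and the output directory are illustrative; the sequences are taken from the provided *msa_input* example):
+ 
+ ```bash
+ # build a two-line input file: line 1 is zero-shot, line 2 carries two MSA prompts
+ cat > my_msa_input <<'EOF'
+ PPGPPGPPGKPGANGLSGERGPPGPPGPPG
+ SYEDQNSLLKMICQQVEAIKKEMQELKLNS<M>-AEDHKTILQMICQQVEALKNEMQEMKLNS<M>-AEDQKSLLQMICQQVEALKNEMHEMKLNS
+ EOF
+ bash scripts/cli_sat.sh --from_pretrained ./checkpoints/MSAGPT-DPO --input-source my_msa_input --output-path ./outputs --max-gen-length 1024
+ ```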
75
+
76
+ #### Situation 1.2 CLI (Huggingface version)
77
+ (TODO)
78
+
79
+ #### Situation 1.3 Web Demo
80
+ (TODO)
81
+
82
+ ### Option 2: Fine-tuning MSAGPT
83
+
84
+ (TODO)
85
+
86
+ ### Hardware requirements
+ 
+ * Model Inference:
+     For BF16: 1 * A100 (80G)
+ 
+ * Fine-tuning:
+     For BF16: 4 * A100 (80G) *[Recommended]*
94
+
95
+
96
+ ## License
97
+
98
+ The code in this repository is open source under the [Apache-2.0 license](./LICENSE).
99
+
100
+ If you find our work helpful, please consider citing our paper:
101
+
102
+ ```
103
+ @article{chen2024msagpt,
104
+ title={MSAGPT: Neural Prompting Protein Structure Prediction via MSA Generative Pre-Training},
105
+ author={Chen, Bo and Bei, Zhilei and Cheng, Xingyi and Li, Pan and Tang, Jie and Song, Le},
106
+ journal={arXiv preprint arXiv:2406.05347},
107
+ year={2024}
108
+ }
109
+ ```
README_zh.md ADDED
File without changes
checkpoints/MSAGPT-DPO/1/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3507871a00564c0be3a697678f521eccee2efb2d77577b0bc009d766b8f02a4
3
+ size 5721204666
checkpoints/MSAGPT-DPO/latest ADDED
@@ -0,0 +1 @@
1
+ 1
checkpoints/MSAGPT-DPO/model_config.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "model_class": "MSAGPT",
3
+ "tokenizer_type": "ProteinTokenizer",
4
+ "num_layers": 36,
5
+ "hidden_size": 2560,
6
+ "inner_hidden_size": 6832,
7
+ "num_attention_heads": 40,
8
+ "vocab_size": 128,
9
+ "layernorm_order": "post",
10
+ "model_parallel_size": 1,
11
+ "max_sequence_length": 2048,
12
+ "untie_head": true,
13
+ "head_num": 2,
14
+ "moe": false,
15
+ "expert": 1
16
+ }
checkpoints/MSAGPT-SFT/1/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19b7a79194615affec18617b2854602f2b77f053b80b44b31f6fd79bfb38ae68
3
+ size 5721204666
checkpoints/MSAGPT-SFT/latest ADDED
@@ -0,0 +1 @@
1
+ 1
checkpoints/MSAGPT-SFT/model_config.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "model_class": "MSAGPT",
3
+ "tokenizer_type": "ProteinTokenizer",
4
+ "num_layers": 36,
5
+ "hidden_size": 2560,
6
+ "inner_hidden_size": 6832,
7
+ "num_attention_heads": 40,
8
+ "vocab_size": 128,
9
+ "layernorm_order": "post",
10
+ "model_parallel_size": 1,
11
+ "max_sequence_length": 2048,
12
+ "untie_head": true,
13
+ "head_num": 2,
14
+ "moe": false,
15
+ "expert": 1
16
+ }
checkpoints/MSAGPT/1/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:daaec07dca52dda4eaee8442d02c9c0f821a5e8ad81cbd280490f50f8f16e205
3
+ size 5721204666
checkpoints/MSAGPT/latest ADDED
@@ -0,0 +1 @@
1
+ 1
checkpoints/MSAGPT/model_config.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "model_class": "MSAGPT",
3
+ "tokenizer_type": "ProteinTokenizer",
4
+ "num_layers": 36,
5
+ "hidden_size": 2560,
6
+ "inner_hidden_size": 6832,
7
+ "num_attention_heads": 40,
8
+ "vocab_size": 128,
9
+ "layernorm_order": "post",
10
+ "model_parallel_size": 1,
11
+ "max_sequence_length": 2048,
12
+ "untie_head": true,
13
+ "head_num": 2,
14
+ "moe": false,
15
+ "expert": 1
16
+ }
cli_sat.py ADDED
@@ -0,0 +1,136 @@
1
+ import os
2
+ import torch
3
+ import stat
4
+ import re
5
+ import time
6
+ import argparse
7
+ import numpy as np
8
+
9
+ from functools import partial
10
+ from typing import List, Tuple
11
+
12
+ import torch.distributed as dist
13
+ from sat.helpers import print_rank0
14
+ from sat import mpu, get_args, get_tokenizer
15
+ from utils import AdvancedBaseStrategy, BeamSearchStrategy
16
+ from model_utils import MSAGPT, FineTuneMSAGPT
17
+ from utils import chat_api
18
+
19
+
20
+
21
+ if __name__ == "__main__":
22
+ py_parser = argparse.ArgumentParser(add_help=False)
23
+ py_parser.add_argument("--sampling-strategy", type=str, default="BaseStrategy", help="Type of sampling strategy.")
24
+ py_parser.add_argument("--min-gen-length", type=int, default=0, help="The minimum length each blank should generate.")
25
+ py_parser.add_argument("--max-gen-length", type=int, default=512, help="The maximum length each blank should generate.")
26
+ py_parser.add_argument("--is-valid", action="store_true", help="Print all output generated by beam search strategy.")
27
+ py_parser.add_argument("--print-all-beams", action="store_true", help="Print all output generated by beam search strategy.")
28
+ py_parser.add_argument("--multiline_stream", action="store_true", help="streaming multiline output.")
29
+ py_parser.add_argument("--no-gap", action="store_true", help="do not generate gaps.")
30
+ py_parser.add_argument("--from_pretrained", type=str, default="./checkpoints/MSAGPT", help='pretrained ckpt')
31
+ py_parser.add_argument("--chinese", action='store_true', help='Chinese interface')
32
+ py_parser.add_argument("--stream_chat", action='store_true', help='streaming output')
33
+
34
+
35
+ py_parser = MSAGPT.add_model_specific_args(py_parser)
36
+ known, args_list = py_parser.parse_known_args()
37
+ args = get_args(args_list)
38
+ args = argparse.Namespace(**vars(args), **vars(known))
39
+ model, args = MSAGPT.from_pretrained(args.from_pretrained, args, overwrite_args={'model_parallel_size': args.model_parallel_size} if args.model_parallel_size != 1 else {})
40
+ model.eval()
41
+ rank = int(os.environ.get('RANK', 0))
42
+ world_size = int(os.environ.get('WORLD_SIZE', 1))
43
+ if torch.cuda.is_available():
44
+ model = model.to('cuda')
45
+ from utils import proteinglm_tokenizer
46
+ tokenizer = proteinglm_tokenizer()
47
+
48
+ end_tokens = [tokenizer.get_command("eop"), tokenizer.get_command("eos")]
49
+ # Get rid of all invalid tokens
50
+ invalid_slices = [0,26,28,29,30,31,32]
51
+ if args.no_gap:
52
+ invalid_slices.append(tokenizer.TokenToId('-'))
53
+ if args.sampling_strategy == "BaseStrategy":
54
+ assert not args.print_all_beams, "BaseStrategy does not support printing all beams."
55
+ strategy = AdvancedBaseStrategy(
56
+ batch_size=1, invalid_slices = invalid_slices, temperature=args.temperature, top_k=args.top_k, top_p=args.top_p, min_gen_length=args.min_gen_length, no_repeat_ngram_size=args.no_repeat_ngram_size, end_tokens=end_tokens
57
+ )
58
+ elif args.sampling_strategy == "BeamSearchStrategy":
59
+ strategy = BeamSearchStrategy(
60
+ 1,
61
+ args.num_beams,
62
+ length_penalty=args.length_penalty,
63
+ consider_end=True,
64
+ end_tokens=end_tokens,
65
+ invalid_slices=invalid_slices,
66
+ no_repeat_ngram_size=args.no_repeat_ngram_size,
67
+ min_gen_length=args.min_gen_length,
68
+ deterministic=True
69
+ )
70
+ else:
71
+ raise ValueError(f"unknown strategy {args.sampling_strategy}")
72
+
73
+
74
+
75
+ if args.input_source == 'chat':
76
+ if args.chinese:
77
+ if rank == 0:
78
+ print('欢迎使用 MSAGPT-CLI ,输入需要生成虚拟MSA的蛋白序列(或加上少量MSA作为prompt,以"<M>"相连),例如:"PEGKQGDPGIPGEPGPPGPPGPQGARGPPG<M>VTVEFVNSCLIGDMGVDGPPGQQGQPGPPG",其中"PEGKQGDPGIPGEPGPPGPPGPQGARGPPG"为主序列,"VTVEFVNSCLIGDMGVDGPPGQQGQPGPPG"为MSA prompt。 stop 终止程序'.center(20, "*"))
79
+ else:
80
+ if rank == 0:
81
+ print('Welcome to MSAGPT-CLI. Enter the protein sequence you need to generate virtual MSAs (or add a few MSAs as a prompt, connected by "<M>"), for example: "PEGKQGDPGIPGEPGPPGPPGPQGARGPPG<M>VTVEFVNSCLIGDMGVDGPPGQQGQPGPPG", where "PEGKQGDPGIPGEPGPPGPPGPQGARGPPG" is the main sequence, and "VTVEFVNSCLIGDMGVDGPPGQQGQPGPPG" are MSA prompts. Type "stop" to end the program.'.center(20,"*"))
82
+ with torch.no_grad():
83
+ while True:
84
+ if args.chinese:
85
+ if rank == 0:
86
+ protein_input = input("请输入需要生成虚拟MSA的蛋白序列(或加上少量MSA作为prompt,以'<M>'相连):")
87
+ else:
88
+ protein_input = None
89
+ else:
90
+ if rank == 0:
91
+ protein_input = input("Enter the protein sequence you need to generate virtual MSAs (or add a few MSAs as a prompt, connected by '<M>': ")
92
+ else:
93
+ protein_input = None
94
+ if world_size > 1:
+     # share the prompt entered on rank 0 with all other ranks
+     obj_list = [protein_input]
+     torch.distributed.broadcast_object_list(obj_list, src=0)
+     protein_input = obj_list[0]
+ assert protein_input is not None
+ protein_input = protein_input.strip()
98
+
99
+ if protein_input == 'stop':
100
+ break
101
+
102
+ try:
103
+ response = chat_api(
104
+ args=args,
105
+ query=protein_input,
106
+ model=model,
107
+ tokenizer=tokenizer,
108
+ strategy=strategy
109
+ )
110
+ except Exception as e:
111
+ print(e)
112
+ break
113
+ if rank == 0 and not args.stream_chat:
114
+ if args.chinese:
115
+ print(f"{'生成的MSA'.center(20, '*')}")
116
+ else:
117
+ print(f"{'Virtual MSA'.center(20, '*')}")
118
+ if args.print_all_beams:
119
+ for idx, gen in enumerate(response):
120
+ out_str = f"Beam: {idx}".center(11,'@')
121
+ print(out_str)
122
+ for _ in gen:
123
+ print(_)
124
+ print()
125
+ else:
126
+ response = response[0]
127
+ for _ in response:
128
+ print(_)
129
+ print()
130
+ else:
131
+ chat_api(
132
+ args=args,
133
+ model=model,
134
+ tokenizer=tokenizer,
135
+ strategy=strategy
136
+ )
model_utils/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .model_proteinglm_clm import ProteinGLMForGeneration
2
+ from .model_msagpt import MSAGPT, FineTuneMSAGPT
model_utils/model_msagpt.py ADDED
@@ -0,0 +1,30 @@
1
+ import math
2
+ import copy
3
+ import torch
4
+ from torch.nn import functional as F
5
+ import torch.nn as nn
6
+
7
+ from .model_proteinglm_clm import ProteinGLMForGeneration
8
+
9
+
10
+ class MSAGPT(ProteinGLMForGeneration):
11
+ def __init__(self, args, transformer=None, **kwargs):
12
+ super().__init__(
13
+ args,
14
+ transformer=transformer,
15
+ **kwargs
16
+ )
17
+
18
+ @classmethod
19
+ def add_model_specific_args(cls, parser):
20
+ group = parser.add_argument_group('MSAGPT-inference', 'MSAGPT inference Configurations')
21
+ return super().add_model_specific_args(parser)
22
+
23
+ class FineTuneMSAGPT(MSAGPT):
24
+ def __init__(self, args, transformer=None, **kwargs):
25
+ super().__init__(
26
+ args,
27
+ transformer=transformer,
28
+ **kwargs
29
+ )
30
+ pass
model_utils/model_proteinglm_clm.py ADDED
@@ -0,0 +1,428 @@
1
+ import math
2
+ import copy
3
+ import torch
4
+ from torch.nn import functional as F
5
+ import torch.nn as nn
6
+ import contextlib
7
+
8
+ from sat import mpu
9
+ from sat.transformer_defaults import standard_attention, attention_fn_default
10
+ from sat.mpu.utils import split_tensor_along_last_dim, divide
11
+ from sat.mpu.layers import ColumnParallelLinear
12
+ from sat.model.base_model import BaseModel, BaseMixin
13
+ from sat.model.position_embedding import RotaryEmbedding
14
+ from sat.model.position_embedding import apply_rotary_pos_emb_index
15
+ from sat.ops import LayerNorm
16
+
17
+
18
+ class RotaryEmbeddingMixin(BaseMixin):
19
+ def __init__(
20
+ self,
21
+ fp16,
22
+ hidden_size,
23
+ num_attention_heads,
24
+ model_parallel_size,
25
+ rotary_embedding_2d=True,
26
+ ):
27
+ super().__init__()
28
+ hidden_size_per_attention_head = divide(hidden_size, num_attention_heads)
29
+ self.hidden_size_per_attention_head = hidden_size_per_attention_head
30
+ self.rotary_embedding_2d = rotary_embedding_2d
31
+ self.num_attention_heads_per_partition = divide(num_attention_heads, model_parallel_size)
32
+ self.rotary_emb = RotaryEmbedding(
33
+ # hidden_size_per_attention_head,
34
+ hidden_size_per_attention_head // 2
35
+ if rotary_embedding_2d
36
+ else hidden_size_per_attention_head,
37
+ base=10000,
38
+ precision=torch.half if fp16 else torch.bfloat16,
39
+ learnable=False,
40
+ device=torch.cuda.current_device(),
41
+ )
42
+
43
+
44
+ def attention_forward(self, hidden_states, mask, **kw_args):
45
+ attn = self.transformer.layers[kw_args["layer_id"]].attention
46
+ attention_fn = attention_fn_default
47
+ if "attention_fn" in attn.hooks:
48
+ attention_fn = attn.hooks["attention_fn"]
49
+
50
+ # [seq, b, 3 * hn * np]
51
+ mixed_raw_layer = attn.query_key_value(hidden_states)
52
+
53
+ # [seq, b, (np * 3 * hn)] --> [seq, b, np, 3 * hn]
54
+ new_tensor_shape = mixed_raw_layer.size()[:-1] + (
55
+ self.num_attention_heads_per_partition,
56
+ 3 * self.hidden_size_per_attention_head,
57
+ )
58
+ mixed_raw_layer = mixed_raw_layer.view(*new_tensor_shape)
59
+
60
+ # [sq, b, np, hn]
61
+ (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_raw_layer, 3)
62
+ # print(key_layer.shape)
63
+ dropout_fn = attn.attention_dropout if attn.training else None
64
+ if self.rotary_embedding_2d:
65
+ q1, q2 = query_layer.chunk(2, dim=(query_layer.ndim - 1))
66
+ k1, k2 = key_layer.chunk(2, dim=(key_layer.ndim - 1))
67
+ cos, sin = self.rotary_emb(q1, seq_len=kw_args["position_ids"].max() + 1)
68
+ position_ids, block_position_ids = \
69
+ kw_args["position_ids"][:, 0, :].transpose(0, 1).contiguous(), \
70
+ kw_args["position_ids"][:, 1, :].transpose(0, 1).contiguous()
71
+ q1, k1 = apply_rotary_pos_emb_index(q1, k1, cos, sin, position_ids)
72
+ q2, k2 = apply_rotary_pos_emb_index(q2, k2, cos, sin, block_position_ids)
73
+ query_layer = torch.concat([q1, q2], dim=(q1.ndim - 1))
74
+ key_layer = torch.concat([k1, k2], dim=(k1.ndim - 1))
75
+ else:
76
+ kw_args["position_ids"] = kw_args["position_ids"].transpose(0, 1)
77
+ cos, sin = self.rotary_emb(value_layer, seq_len=kw_args["position_ids"].max() + 1)
78
+ query_layer, key_layer = apply_rotary_pos_emb_index(query_layer, key_layer, cos, sin, kw_args["position_ids"])
79
+
80
+ context_layer = attention_fn(query_layer, key_layer, value_layer, mask, dropout_fn, **kw_args)
81
+ output = attn.dense(context_layer)
82
+
83
+ if attn.training:
84
+ output = attn.output_dropout(output)
85
+
86
+ return output
87
+
88
+
89
+ class GEGLU(torch.nn.Module):
90
+ def __init__(self):
91
+ super().__init__()
92
+ self.activation_fn = F.gelu
93
+
94
+ def forward(self, x):
95
+ # dim=-1 breaks in jit for pt<1.10
96
+ x1, x2 = x.chunk(2, dim=(x.ndim - 1))
97
+ return x1 * self.activation_fn(x2)
98
+
99
+
100
+ class DeepNormWithGLUMixin(BaseMixin):
101
+ def __init__(self, num_layers, hidden_size, inner_hidden_size=None):
102
+ super().__init__()
103
+ self.num_layers = num_layers
104
+ self.hidden_size = hidden_size
105
+ if inner_hidden_size is None:
106
+ inner_hidden_size = 4 * hidden_size * 2 // 3
107
+ self.inner_hidden_size = inner_hidden_size
108
+
109
+ def reinit(self):
110
+ for layer in self.transformer.layers:
111
+ del layer.mlp.dense_h_to_4h
112
+ layer.mlp.dense_h_to_4h = ColumnParallelLinear(
113
+ self.hidden_size,
114
+ 2 * self.inner_hidden_size,
115
+ gather_output=False,
116
+ bias=True,
117
+ params_dtype=torch.half,
118
+ module=self,
119
+ name="dense_h_to_4h",
120
+ skip_init=True,
121
+ )
122
+ del layer.mlp.activation_func
123
+ layer.mlp.activation_func = GEGLU()
124
+
125
+ def layer_forward(self, hidden_states, mask, *args, **kw_args):
126
+ """
127
+ hidden_states: [seq_len, batch, hidden_size]
128
+ mask: [(1, 1), seq_len, seq_len]
129
+ """
130
+ layer = self.transformer.layers[kw_args["layer_id"]]
131
+ # Layer norm at the begining of the transformer layer.
132
+
133
+ attention_input = layer.input_layernorm(hidden_states)
134
+
135
+ # Self attention.
136
+ attention_output = layer.attention(attention_input, mask, **kw_args)
137
+
138
+ # Residual connection.
139
+ alpha = (2 * self.num_layers) ** 0.5
140
+ hidden_states = attention_input * alpha + attention_output
141
+
142
+ mlp_input = layer.post_attention_layernorm(hidden_states)
143
+
144
+ # MLP.
145
+ mlp_output = layer.mlp(mlp_input, **kw_args)
146
+
147
+ # Second residual connection.
148
+ output = mlp_input * alpha + mlp_output
149
+
150
+ return output
151
+
152
+
153
+ class SelfAttentionWithFP32SoftmaxMixin(BaseMixin):
154
+ def __init__(self, fp16, hidden_size, num_attention_heads, model_parallel_size):
155
+ super().__init__()
156
+ self.hidden_size_per_attention_head = divide(hidden_size, num_attention_heads)
157
+ self.hidden_size_per_partition = divide(hidden_size, model_parallel_size)
158
+ self.scale_mask_softmax = None
159
+ self.fp16 = fp16
160
+
161
+ @staticmethod
162
+ def attention_mask_func(attention_scores, attention_mask):
163
+ attention_scores.masked_fill_(attention_mask, -10000.0)
164
+ return attention_scores
165
+
166
+ def attention_fn(
167
+ self,
168
+ query_layer,
169
+ key_layer,
170
+ value_layer,
171
+ attention_mask,
172
+ attention_dropout=None,
173
+ log_attention_weights=None,
174
+ scaling_attention_score=True,
175
+ mems=None,
176
+ **kwargs
177
+ ):
178
+
179
+ mem = mems[kwargs["layer_id"]] if mems is not None else None
180
+
181
+ # seqlen, batch, head, hidden_size
182
+ seq_len, b, nh, hidden_size = key_layer.shape
183
+
184
+ # stack, seqlen, b, head, hidden
185
+ # b, seqlen, stack, head, hidden
186
+ cache_kv = (
187
+ torch.stack((key_layer, value_layer))
188
+ .permute(2, 1, 0, 3, 4)
189
+ .detach()
190
+ .contiguous()
191
+ .view(b, seq_len, nh * hidden_size * 2)
192
+ )
193
+ kwargs["output_this_layer"]["mem_kv"] = cache_kv
194
+
195
+ if mem is not None: # the first time, mem is None
196
+ # might change batch_size
197
+ # b, seqlen, stack, head, hidden -> stack, seqlen, b, head, hidden
198
+ mem = mem.expand(b, -1, -1).reshape(b, mem.shape[1], 2, nh, hidden_size).permute(2, 1, 0, 3, 4)
199
+ memk, memv = mem[0], mem[1]
200
+ key_layer = torch.cat((memk, key_layer), dim=0)
201
+ value_layer = torch.cat((memv, value_layer), dim=0)
202
+
203
+
204
+ # check if use flash attention
205
+ is_low_triangle = (attention_mask == ~torch.ones_like(attention_mask, dtype=torch.bool).tril()).all()
206
+ is_full = (attention_mask is None) or (attention_mask == 0).all()
207
+ if int(torch.__version__.split('.')[0]) >= 2 and (is_full or is_low_triangle):
208
+ # Pytorch 2.0 attention uses very much memory if attention_mask is float, and has NaN bug if attention_mask is None.
209
+ dropout_p = 0. if attention_dropout is None or not attention_dropout.training else attention_dropout.p
210
+ #[b, np, sq, hn]
211
+ query_layer, key_layer, value_layer = query_layer.permute(1,2,0,3).contiguous(), key_layer.permute(1,2,0,3).contiguous(), value_layer.permute(1,2,0,3).contiguous()
212
+ batch_size, num_query_heads = query_layer.shape[:2] # [b, np, s, hn]
213
+ num_kv_heads = key_layer.shape[1] # [b, np, s, hn]
214
+ key_layer = key_layer.unsqueeze(2).expand(-1, -1, num_query_heads//num_kv_heads, -1, -1).contiguous().view(batch_size, num_query_heads, *key_layer.shape[2:])
215
+ value_layer = value_layer.unsqueeze(2).expand(-1, -1, num_query_heads//num_kv_heads, -1, -1).contiguous().view(batch_size, num_query_heads, *value_layer.shape[2:])
216
+
217
+ if dropout_p > 0 and mpu.get_cuda_rng_tracker is not None:
218
+ context = mpu.get_cuda_rng_tracker().fork()
219
+ else:
220
+ context = contextlib.nullcontext()
221
+
222
+ with context:
223
+ context_layer = torch.nn.functional.scaled_dot_product_attention(
224
+ query_layer, key_layer, value_layer,
225
+ attn_mask=None,
226
+ dropout_p=dropout_p,
227
+ is_causal=not is_full
228
+ )
229
+
230
+
231
+ #[sq, b, np, hn]
232
+ context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
233
+
234
+ # [sq, b, np, hn] --> [sq, b, hp]
235
+ new_context_layer_shape = context_layer.size()[:-2] + (-1,)
236
+ context_layer = context_layer.view(*new_context_layer_shape)
237
+ return context_layer
238
+
239
+ else:
240
+ # standard attention
241
+ # [b, np, sq, sk]
242
+ output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
243
+
244
+ query_key_layer_scaling_coeff = float(kwargs["layer_id"] + 1)
245
+
246
+
247
+ if scaling_attention_score:
248
+ query_layer = query_layer / (math.sqrt(self.hidden_size_per_attention_head) * query_key_layer_scaling_coeff)
249
+ # ===================================
250
+ # Raw attention scores. [b, np, s, s]
251
+ # ===================================
252
+ # [sq, b, np, hn] -> [sq, b * np, hn]
253
+ query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
254
+ # [sk, b, np, hn] -> [sk, b * np, hn]
255
+ key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
256
+
257
+ matmul_result = torch.empty(
258
+ output_size[0] * output_size[1],
259
+ output_size[2],
260
+ output_size[3],
261
+ dtype=query_layer.dtype,
262
+ device=torch.cuda.current_device(),
263
+ )
264
+
265
+ matmul_result = torch.baddbmm(
266
+ matmul_result,
267
+ query_layer.transpose(0, 1), # [b * np, sq, hn]
268
+ key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk]
269
+ beta=0.0,
270
+ alpha=1.0,
271
+ )
272
+
273
+ # change view to [b, np, sq, sk]
274
+ attention_scores = matmul_result.view(*output_size)
275
+
276
+ if not (attention_mask.shape[-2] == 1 and (attention_mask > 0).all()):
277
+ # if auto-regressive, skip
278
+ attention_scores.masked_fill_(attention_mask.bool(), -float("inf"))
279
+
280
+ attention_scores = attention_scores.float()
281
+ attention_scores = attention_scores * query_key_layer_scaling_coeff
282
+
283
+
284
+ attention_probs = F.softmax(attention_scores, dim=-1)
285
+
286
+ if self.fp16:
287
+ attention_probs = attention_probs.half()
288
+ else:
289
+ attention_probs = attention_probs.bfloat16()
290
+
291
+ if attention_dropout is not None:
292
+ if mpu.get_cuda_rng_tracker() is not None:
293
+ with mpu.get_cuda_rng_tracker().fork():
294
+ attention_probs = attention_dropout(attention_probs)
295
+ else:
296
+ attention_probs = attention_dropout(attention_probs)
297
+
298
+ # =========================
299
+ # Context layer. [sq, b, hp]
300
+ # =========================
301
+
302
+ # value_layer -> context layer.
303
+ # [sk, b, np, hn] --> [b, np, sq, hn]
304
+
305
+ # context layer shape: [b, np, sq, hn]
306
+ output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
307
+
308
+ # change view [sk, b * np, hn]
309
+ value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
310
+
311
+ # change view [b * np, sq, sk]
312
+ attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
313
+ # matmul: [b * np, sq, hn]
314
+
315
+ context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
316
+
317
+ # change view [b, np, sq, hn]
318
+ context_layer = context_layer.view(*output_size)
319
+
320
+ # [b, np, sq, hn] --> [sq, b, np, hn]
321
+ context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
322
+
323
+ # [sq, b, np, hn] --> [sq, b, hp]
324
+ new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
325
+ context_layer = context_layer.view(*new_context_layer_shape)
326
+ return context_layer
327
+
328
+
329
+
330
+ class FinalForwardMixin(BaseMixin):
331
+ def __init__(self):
332
+ super().__init__()
333
+
334
+ def final_forward(self, logits, **kw_args):
335
+ return F.linear(logits, self.transformer.word_embeddings.weight).transpose(0, 1).contiguous()
336
+
337
+
338
+ class UntieFinalForwardMixin(BaseMixin):
339
+ def __init__(self, hidden_size, vocab_size, untie_head_num, layernorm_epsilon=1.0e-5):
340
+ super().__init__()
341
+
342
+ self.lm_head = nn.ModuleList()
343
+ for i in range(untie_head_num):
344
+ self.lm_head.append(
345
+ ColumnParallelLinear(
346
+ hidden_size,
347
+ 2 * hidden_size,
348
+ gather_output=True,
349
+ bias=False,
350
+ module=self,
351
+ name=f"lm_head.{i}",
352
+ )
353
+ ) # Setting bias to False always to keep it consistent with embedding tying that also does not have a bias.
354
+
355
+ self.head_layernorm = nn.ModuleList()
356
+ for i in range(untie_head_num):
357
+ self.head_layernorm.append(
358
+ LayerNorm(
359
+ hidden_size,
360
+ eps=layernorm_epsilon
361
+ )
362
+ )
363
+ self.activation_func=GEGLU()
364
+
365
+
366
+ def final_forward(self, logits, **kwargs):
367
+ logits = self.lm_head[1](logits)
368
+ logits = self.activation_func(logits)
369
+ logits = self.head_layernorm[1](logits)
370
+ return F.linear(logits, self.transformer.word_embeddings.weight).transpose(0, 1).contiguous()
371
+
372
+
373
+ class NonePositionEmbedding(BaseMixin):
374
+ def __init__(self):
375
+ super().__init__()
376
+
377
+ def position_embedding_forward(self, position_ids, output_cross_layer, **kw_args):
378
+ return None
379
+
380
+
381
+ class WordEmbedding(BaseMixin):
382
+ def __init__(self):
383
+ super().__init__()
384
+
385
+ def word_embedding_forward(self, input_ids, output_cross_layer, **kw_args):
386
+ return self.transformer.word_embeddings(input_ids).transpose(0, 1)
387
+
388
+
389
+ class ProteinGLMForGeneration(BaseModel):
390
+ def __init__(self, args, transformer=None, **kwargs):
391
+ super().__init__(
392
+ args,
393
+ transformer=transformer,
394
+ **kwargs
395
+ )
396
+ self.add_mixin("glu-deepnorm", DeepNormWithGLUMixin(args.num_layers, args.hidden_size, args.inner_hidden_size))
397
+ self.add_mixin(
398
+ "fp32-softmax",
399
+ SelfAttentionWithFP32SoftmaxMixin(args.fp16, args.hidden_size, args.num_attention_heads, args.model_parallel_size),
400
+ )
401
+ if args.untie_head:
402
+ self.add_mixin("final-forward", UntieFinalForwardMixin(args.hidden_size, args.vocab_size, args.head_num))
403
+ else:
404
+ self.add_mixin("final-forward", FinalForwardMixin())
405
+ self.add_mixin("non-position-embedding", NonePositionEmbedding())
406
+ del self.transformer.position_embeddings
407
+ self.add_mixin("word-embedding", WordEmbedding())
408
+ self.add_mixin(
409
+ "rotary-embedding",
410
+ RotaryEmbeddingMixin(
411
+ args.fp16,
412
+ args.hidden_size,
413
+ args.num_attention_heads,
414
+ args.model_parallel_size,
415
+ args.rotary_embedding_2d
416
+ ),
417
+ )
418
+ self.get_mixin("glu-deepnorm").reinit()
419
+
420
+ @classmethod
421
+ def add_model_specific_args(cls, parser):
422
+ group = parser.add_argument_group('ProteinGLMForGeneration', 'ProteinGLMForGeneration Configurations')
423
+ group.add_argument('--untie-head', action='store_true', help='untie-heads')
424
+ group.add_argument('--head-num', default=1, type=int, help='head>1')
425
+ group.add_argument('--infer-type', default=1, type=int, help='1 for Generation')
426
+ group.add_argument('--rotary-embedding-2d', action='store_true',
427
+ help='If set, use 2D rotary embedding for ProteinGLM.')
428
+ return super().add_model_specific_args(parser)
msa_input ADDED
@@ -0,0 +1,4 @@
1
+ PPGPPGPPGKPGANGLSGERGPPGPPGPPG
2
+ SYEDQNSLLKMICQQVEAIKKEMQELKLNS<M>-AEDHKTILQMICQQVEALKNEMQEMKLNS<M>-AEDQKSLLQMICQQVEALKNEMHEMKLNS
3
+ MGSSHHHHHHSSGLVPRGSHMGAATPAERDAILLDLVRGQVAAVLGHASGEDIEPGRAFKNLGFDSLTAVELRDRLGAATGHKLPATIVFDYPNPTALAQHLRAAVL
4
+ MGSSHHHHHHSSGLVPRGSHMGAATPAERDAILLDLVRGQVAAVLGHASGEDIEPGRAFKNLGFDSLTAVELRDRLGAATGHKLPATIVFDYPNPTALAQHLRAAVL<M>-------------ITPSVESLRDLPRSERREALETLVVTEFKTALLMTEQDDLPLDESYFDLGLTSLTVNDLKQRLESLLSREIDGTLLFNSPTVQRLLDHLEEDV-
requirements.txt ADDED
@@ -0,0 +1,3 @@
1
+ numpy==1.24.1
2
+ SwissArmyTransformer==0.4.11
3
+ torch==2.1.0.dev20230822+cu118
resources/app_case.png ADDED
resources/demo.gif ADDED

Git LFS Details

  • SHA256: 499be1fc8c44d53b5f717176630c60525202ac7367c3f5e3e94a3ab61b0d7da0
  • Pointer size: 132 Bytes
  • Size of remote file: 1.58 MB
resources/overall_frame.png ADDED
scripts/cli_sat.sh ADDED
@@ -0,0 +1,62 @@
1
+ #!/bin/bash
2
+
3
+ script_path=$(realpath $0)
4
+ script_dir=$(dirname $script_path)
5
+ main_dir=$(dirname $script_dir)
6
+
7
+ MP_SIZE=1
8
+ # MODEL_NAME="MSAGPT-"
9
+ # MODEL_NAME="MSAGPT-dpo"
10
+
11
+
12
+ SEED=12345
13
+ MAX_GEN_LENGTH=128
14
+ MIN_GEN_LENGTH=0
15
+
16
+ # BeamSearchStrategy args
17
+ NUM_BEAMS=4
18
+ LENGTH_PENALTY=1.0
19
+ NO_REPEAT_NGRAM=0
20
+
21
+ # BaseStrategy args
22
+ TEMP=0.8
23
+ TOPK=0
24
+ TOPP=0.9
25
+
26
+
27
+ PORT=19865
28
+
29
+ MODEL_ARGS="--bf16 \
30
+ --skip-init \
31
+ --mode finetune \
32
+ --rotary-embedding-2d"
33
+
34
+ # --mode inference \ TODO: sat ds_config bug?
35
+
36
+ GENERATION_ARGS="--seed $SEED \
37
+ --sampling-strategy BaseStrategy \
38
+ --max-gen-length $MAX_GEN_LENGTH \
39
+ --min-gen-length $MIN_GEN_LENGTH \
40
+ --num-beams $NUM_BEAMS \
41
+ --length-penalty $LENGTH_PENALTY \
42
+ --no-repeat-ngram-size $NO_REPEAT_NGRAM \
43
+ --multiline_stream \
44
+ --temperature $TEMP \
45
+ --top_k $TOPK \
46
+ --top_p $TOPP
47
+ "
48
+ # --sampling-strategy BeamSearchStrategy \
49
+ # --no-gap
50
+
51
+
52
+ OPTIONS_NCCL="NCCL_DEBUG=VERSION NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2 CUDA_LAUNCH_BLOCKING=0"
53
+
54
+ ARGS="${main_dir}/cli_sat.py \
55
+ $MODEL_ARGS \
56
+ $GENERATION_ARGS \
57
+ $*"
58
+
59
+ run_cmd="${OPTIONS_NCCL} torchrun --nproc_per_node $MP_SIZE --master_port=$PORT ${ARGS}"
60
+ echo ${run_cmd}
61
+ eval ${run_cmd}
62
+ set +x
utils/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .strategies import AdvancedBaseStrategy, BeamSearchStrategy
2
+ from .tokenization import proteinglm_tokenizer
3
+ from .chat import chat_api
4
+ from .utils import move_cursor_up
utils/chat.py ADDED
@@ -0,0 +1,371 @@
1
+ import os
2
+ import torch
3
+ import stat
4
+ import re
5
+ import time
6
+ import argparse
7
+ import numpy as np
8
+
9
+ from functools import partial
10
+ from typing import List, Tuple
11
+
12
+ import torch.distributed as dist
13
+ from sat.helpers import print_rank0
14
+ from sat import mpu, get_args, get_tokenizer
15
+ from sat.generation.utils import timed_name, generate_continually
16
+ from sat.generation.autoregressive_sampling import update_mems, get_masks_and_position_ids_default
17
+
18
+ from .utils import move_cursor_up, move_cursor_down
19
+
20
+
21
+ def get_masks_and_position_ids(seq, msa_len, max_gen_length, gmask=False):
22
+ context_length = seq.shape[1]
23
+ query_len = msa_len
24
+ max_msa_num = (max_gen_length - 2) // query_len
25
+ max_gen_length = max_msa_num * query_len + 2
26
+ tokens = torch.nn.functional.pad(seq, (0, max_gen_length - context_length), mode="constant", value=-1)
27
+ attention_mask = torch.ones((1, tokens.shape[-1], tokens.shape[-1]), device=tokens.device)
28
+ attention_mask.tril_()
29
+ attention_mask.unsqueeze_(1)
30
+ attention_mask = (attention_mask < 0.5).bool()
31
+ # <gMASK> + <SOP>
32
+ position_ids = np.zeros(max_gen_length, dtype=int)
33
+ block_position_ids = np.zeros(max_gen_length, dtype=int)
34
+ pre = 0
35
+ for msa_idx in range(max_msa_num):
36
+ position_ids[(1 + pre): (1 + pre + query_len)] = np.arange(query_len, dtype = int)
37
+ block_position_ids[(1 + pre): (1 + pre + query_len)] = msa_idx
38
+ pre += query_len
39
+ position_ids = np.stack((position_ids, block_position_ids), axis=0)
40
+ position_ids = torch.from_numpy(position_ids).to(tokens.device)
41
+ position_ids = position_ids.unsqueeze(0)
42
+ return tokens, attention_mask, position_ids
43
+
44
+
45
+
46
+ def generation_sequence(
47
+ model,
48
+ seqs,
49
+ strategy,
50
+ max_memory_length=100000,
51
+ get_masks_and_position_ids=get_masks_and_position_ids,
52
+ stream=False,
53
+ mems=None,
54
+ **kw_args
55
+ ):
56
+ '''
57
+ seq: [2, 3, 5, ..., -1(to be generated), -1, ...]
58
+ mems: [num_layers, batch_size, len_mems(index), mem_hidden_size]
59
+ cache, should be first mems.shape[1] parts of context_tokens.
60
+ mems are the first-level citizens here, but we don't assume what is memorized.
61
+ input mems are used when multi-phase generation.
62
+ '''
63
+ assert len(seqs.shape) == 2
64
+ # building the initial tokens, attention_mask, and position_ids
65
+ batch_size, context_length = seqs.shape
66
+ seqs, attention_mask, position_ids = get_masks_and_position_ids(seqs)
67
+ tokens = seqs[..., :context_length]
68
+ # initialize generation
69
+ counter = context_length # Last fixed index is ``counter''
70
+ index = 0 if mems is None else mems.shape[2] # Next forward starting index, also the length of cache.
71
+ num_beams = 1
72
+ # step-by-step generation
73
+ while counter < seqs.shape[1] - 1:
74
+ # Now, we want to generate seq[counter + 1],
75
+ # token[:, index: counter+1] needs forwarding.
76
+ # forward
77
+ tokens = tokens.reshape(batch_size * num_beams, -1)
78
+ mems = mems.reshape(mems.shape[0], batch_size * num_beams, mems.shape[-2], mems.shape[-1]) if mems is not None else None
79
+ model.eval()
80
+ with torch.no_grad():
81
+ logits, *output_per_layers = model(
82
+ tokens[:, index:],
83
+ position_ids[..., index: counter],
84
+ attention_mask[..., index: counter, :counter], # TODO memlen
85
+ mems=mems,
86
+ **kw_args
87
+ )
88
+ mem_kv = [o['mem_kv'] for o in output_per_layers]
89
+ mems = update_mems(mem_kv, mems, max_memory_length=max_memory_length)
90
+ logits = logits[:, -1]
91
+ index = counter
92
+ counter += 1
93
+ logits = logits.reshape(batch_size, num_beams, -1)
94
+ tokens = tokens.reshape(batch_size, num_beams, -1)
95
+ mems = mems.reshape(mems.shape[0], batch_size, num_beams, mems.shape[-2], mems.shape[-1])
96
+ tokens, mems = strategy.forward(logits, tokens, mems)
97
+ if len(tokens.shape) == 3 and num_beams == 1:
98
+ num_beams = tokens.shape[1]
99
+ position_ids = position_ids.unsqueeze(1).expand(batch_size, num_beams, 2, -1).reshape(batch_size * num_beams, 2, -1)
100
+ attention_mask_shape = attention_mask.shape[-3:]
101
+ attention_mask = attention_mask.unsqueeze(1).expand(batch_size, num_beams, -1, -1, -1).reshape(
102
+ batch_size * num_beams, *attention_mask_shape)
103
+ if strategy.is_done:
104
+ break
105
+ return strategy.finalize(tokens, mems)
106
+
107
+
108
+ def stream_generation_sequence(
109
+ model,
110
+ seqs,
111
+ strategy,
112
+ max_memory_length=100000,
113
+ get_masks_and_position_ids=get_masks_and_position_ids,
114
+ stream=False,
115
+ mems=None,
116
+ **kw_args
117
+ ):
118
+ '''
119
+ seq: [2, 3, 5, ..., -1(to be generated), -1, ...]
120
+ mems: [num_layers, batch_size, len_mems(index), mem_hidden_size]
121
+ cache, should be first mems.shape[1] parts of context_tokens.
122
+ mems are the first-level citizens here, but we don't assume what is memorized.
123
+ input mems are used when multi-phase generation.
124
+ '''
125
+ assert len(seqs.shape) == 2
126
+ # building the initial tokens, attention_mask, and position_ids
127
+ batch_size, context_length = seqs.shape
128
+ seqs, attention_mask, position_ids = get_masks_and_position_ids(seqs)
129
+ tokens = seqs[..., :context_length]
130
+ # initialize generation
131
+ counter = context_length # Last fixed index is ``counter''
132
+ index = 0 if mems is None else mems.shape[2] # Next forward starting index, also the length of cache.
133
+ num_beams = 1
134
+ # step-by-step generation
135
+ while counter < seqs.shape[1] - 1:
136
+ # Now, we want to generate seq[counter + 1],
137
+ # token[:, index: counter+1] needs forwarding.
138
+ # forward
139
+ tokens = tokens.reshape(batch_size * num_beams, -1)
140
+ mems = mems.reshape(mems.shape[0], batch_size * num_beams, mems.shape[-2], mems.shape[-1]) if mems is not None else None
141
+ model.eval()
142
+ with torch.no_grad():
143
+ logits, *output_per_layers = model(
144
+ tokens[:, index:],
145
+ position_ids[..., index: counter],
146
+ attention_mask[..., index: counter, :counter], # TODO memlen
147
+ mems=mems,
148
+ **kw_args
149
+ )
150
+ mem_kv = [o['mem_kv'] for o in output_per_layers]
151
+ mems = update_mems(mem_kv, mems, max_memory_length=max_memory_length)
152
+ logits = logits[:, -1]
153
+ index = counter
154
+ counter += 1
155
+ logits = logits.reshape(batch_size, num_beams, -1)
156
+ tokens = tokens.reshape(batch_size, num_beams, -1)
157
+ mems = mems.reshape(mems.shape[0], batch_size, num_beams, mems.shape[-2], mems.shape[-1])
158
+ tokens, mems = strategy.forward(logits, tokens, mems, is_first=False)
159
+ if len(tokens.shape) == 3 and num_beams == 1:
160
+ num_beams = tokens.shape[1]
161
+ position_ids = position_ids.unsqueeze(1).expand(batch_size, num_beams, 2, -1).reshape(batch_size * num_beams, 2, -1)
162
+ attention_mask_shape = attention_mask.shape[-3:]
163
+ attention_mask = attention_mask.unsqueeze(1).expand(batch_size, num_beams, -1, -1, -1).reshape(
164
+ batch_size * num_beams, *attention_mask_shape)
165
+ yield tokens, mems
166
+ if strategy.is_done:
167
+ break
168
+
169
+
170
+
171
+ def autoregressive_sampling(args, raw_text: str, model, tokenizer, strategy, stream=False) -> Tuple[List[str], List[str], List[List[str]]]:
172
+ # add MASK
173
+ generation_mask = "[gMASK]"
174
+ seq = []
175
+ msa_len = len(raw_text[0]) + 1
176
+ seq += [tokenizer.get_command(generation_mask)] + [tokenizer.get_command("sop")]
177
+ for each in raw_text:
178
+ seq += tokenizer.tokenize(each) + [tokenizer.get_command('<M>')]
179
+
180
+ output_list = [seq]
181
+ num_output = args.num_beams if args.sampling_strategy == "BeamSearchStrategy" else 1
182
+ seq = output_list[0]
183
+ # detect mask position
184
+ mask_token = tokenizer.get_command(generation_mask)
185
+ mask_position = seq.index(mask_token)
186
+
187
+ last_pos, answers, blanks, output_list = (
188
+ [0] * num_output,
189
+ ["" for _ in range(num_output)],
190
+ [[] for _ in range(num_output)],
191
+ []
192
+ )
193
+ icl_msas = len(raw_text)
194
+ input_seq = torch.tensor(
195
+ [seq],
196
+ dtype = torch.long,
197
+ device=args.device,
198
+ )
199
+ if args.stream_chat:
200
+ if args.chinese:
201
+ print(f"{'生成的MSA'.center(20, '*')}", flush=True)
202
+ else:
203
+ print(f"{'Virtual MSA'.center(20, '*')}", flush=True)
204
+ output_stream = stream_generation_sequence(
205
+ model = model,
206
+ seqs = input_seq,
207
+ strategy=strategy,
208
+ get_masks_and_position_ids=partial(
209
+ get_masks_and_position_ids,
210
+ msa_len = msa_len,
211
+ max_gen_length=args.max_gen_length,
212
+ gmask=True
213
+ )
214
+ )
215
+ offset = -1
216
+ for tmp_res, mems in output_stream:
217
+ if isinstance(tmp_res, torch.Tensor):
218
+ output = tmp_res.tolist()
219
+ output_list = output[0]
220
+ for i in range(len(output_list)):
221
+ output = output_list[i].tolist() if isinstance(output_list[i], torch.Tensor) else output_list[i]
222
+ bog = output.index(tokenizer.get_command("sop"))
223
+ try:
224
+ unfinished = output.index(-1)
225
+ except ValueError:
226
+ unfinished = len(output)
227
+ output_list[i] = output[:mask_position] + output[bog + 1 : unfinished]
228
+ for i, output in enumerate(output_list):
229
+ if output[-1] == tokenizer.get_command("eos"):
230
+ output = output[:-1]
231
+ answers[i] = tokenizer.detokenize(output)
232
+ tmp_ret = answers[0] # only support streaming output first line.
233
+ if mpu.get_model_parallel_rank() == 0:
234
+ if not args.multiline_stream:
235
+ vit_msa = tmp_ret[offset if offset>0 else -1:]
236
+ print(vit_msa, end='', flush=True)
237
+ offset = len(tmp_ret)
238
+ else:
239
+ print_len = 0
240
+ vit_msa = tmp_ret.split('[<M>]')[icl_msas:]
241
+ vit_msa = [_ for _ in vit_msa if len(_) > 0]
242
+ for _ in vit_msa:
243
+ print(_)
244
+ print_len += 1
245
+ move_cursor_up(print_len)
246
+
247
+ move_cursor_down(print_len)
248
+ print('\n')
249
+ output = strategy.finalize(tmp_res, mems)[0]
250
+ else:
251
+ output, _ = generation_sequence(
252
+ model = model,
253
+ seqs = input_seq,
254
+ strategy=strategy,
255
+ get_masks_and_position_ids=partial(
256
+ get_masks_and_position_ids,
257
+ msa_len = msa_len,
258
+ max_gen_length=args.max_gen_length,
259
+ gmask=True
260
+ )
261
+ )
262
+ last_pos, answers, blanks, output_list = (
263
+ [0] * num_output,
264
+ ["" for _ in range(num_output)],
265
+ [[] for _ in range(num_output)],
266
+ []
267
+ )
268
+ if isinstance(output, torch.Tensor): # different strategies
269
+ output = output.tolist()
270
+ output = output[0] # batch_size = 1
271
+ output_list.extend(output)
272
+ # clip -1s and fill back generated things into seq
273
+ for i in range(len(output_list)):
274
+ output = output_list[i].tolist() if isinstance(output_list[i], torch.Tensor) else output_list[i]
275
+ try:
276
+ unfinished = output.index(-1)
277
+ except ValueError:
278
+ unfinished = len(output)
279
+ # if output[unfinished - 1] in strategy.end_tokens:
280
+ # unfinished -= 1
281
+ bog = output.index(tokenizer.get_command("sop"))
282
+
283
+ prefix = tokenizer.detokenize(output[last_pos[i] : mask_position])
284
+ blank = tokenizer.detokenize(output[bog + 1 : unfinished])
285
+ blanks[i].append(blank)
286
+ last_pos[i] = mask_position + unfinished - (bog + 1)
287
+ output_list[i] = output[:mask_position] + output[bog + 1 : unfinished]
288
+
289
+
290
+ for i, output in enumerate(output_list):
291
+ if output[-1] == tokenizer.get_command("eos"):
292
+ output = output[:-1]
293
+ answers[i] = tokenizer.detokenize(output)
294
+ return answers
295
+
296
+
297
+ def offline_generation(args, temp, top_p, top_k, func):
298
+ os.makedirs(args.output_path, exist_ok=True)
299
+ with open(args.input_source, 'r', encoding="utf-8") as fin:
300
+ inputs = fin.readlines()
301
+ output_path = os.path.join(args.output_path, f"tmp_{temp}_p_{top_p}_k_{top_k}")
302
+ fout = open(output_path, 'w')
303
+ start_time = time.time()
304
+ for line_no, raw_text in enumerate(inputs):
305
+ if line_no % mpu.get_data_parallel_world_size() != mpu.get_data_parallel_rank():
306
+ continue
307
+ rk = dist.get_rank()
308
+ raw_text = raw_text.strip()
309
+ raw_text = raw_text.split('<M>')
310
+ main_seq = raw_text[0]
311
+
312
+ msa_len = len(main_seq) + 1
313
+ icl_msas = len(raw_text)
314
+ require_min_gen_length = msa_len * (icl_msas + 1) + 2
315
+ if args.max_gen_length < require_min_gen_length:
316
+ args.max_gen_length = require_min_gen_length # at least generate 1 msa.
317
+
318
+ if mpu.get_model_parallel_rank() == 0:
319
+ print(f'Processing No. {line_no} on model group {rk} input main seq: "{main_seq}" few-shot prompt: "{"<M>".join(raw_text[1:])}"')
320
+ if len(raw_text) == 0:
321
+ continue
322
+ ret = func(raw_text)
323
+ if mpu.get_model_parallel_rank() == 0:
324
+ if args.print_all_beams:
325
+ for idx, vit_msa in enumerate(ret):
326
+ vit_msa = vit_msa.split('[<M>]')[icl_msas:]
327
+ vit_msa = [_ for _ in vit_msa if len(_) > 0]
328
+ vit_msa_len = len(vit_msa)
329
+ vit_msa_str = '<M>'.join(vit_msa)
330
+ print('Beam: {} #Virtual Length:{} | MSA: "{}" | (Temp, P, K)=({}, {}, {}) | Taken time {:.2f}'.format(idx, vit_msa_len, vit_msa_str, temp, top_p, top_k, time.time() - start_time), flush=True)
331
+ else:
332
+ vit_msa = ret[0]
333
+ vit_msa = vit_msa.split('[<M>]')[icl_msas:]
334
+ vit_msa = [_ for _ in vit_msa if len(_) > 0]
335
+ vit_msa_len = len(vit_msa)
336
+ vit_msa_str = '<M>'.join(vit_msa)
337
+ fout.write(vit_msa_str + '\n')
338
+ print('#Virtual Length:{} | MSA: "{}" | (Temp, P, K)=({}, {}, {}) | Taken time {:.2f}'.format(vit_msa_len, vit_msa_str, temp, top_p, top_k, time.time() - start_time), flush=True)
339
+ print()
340
+ fout.flush()
341
+ dist.barrier()
342
+ fout.close()
343
+
344
+
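For reference, a hypothetical input file for this offline path (not part of the commit), inferred from the parsing above: each line holds the query sequence, optionally followed by few-shot homolog rows of the same length, all joined by '<M>'; one output file named tmp_{temp}_p_{top_p}_k_{top_k} is written under args.output_path.

    # example_input.txt (sequences invented purely for illustration)
    MKTAYIAKQRQISFVKSHFSRQLEERLGL
    MKTAYIAKQRQISFVKSHFSRQLEERLGL<M>MKTAYIARQRQISFVKSHFSRQLEERLGL<M>MKTGYIAKQRQISFVKSHFSRQLEERLGL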
345
+ def online_generation(args, query, temp, top_p, top_k, func):
346
+ raw_text = query.strip()
347
+ raw_text = raw_text.split('<M>')
348
+ main_seq = raw_text[0]
349
+ msa_len = len(main_seq) + 1
350
+ icl_msas = len(raw_text)
351
+ require_min_gen_length = msa_len * (icl_msas + 1) + 2
352
+ if args.max_gen_length < require_min_gen_length:
353
+ args.max_gen_length = require_min_gen_length # at least generate 1 msa.
354
+ ret = func(raw_text)
355
+ response = []
356
+ if mpu.get_model_parallel_rank() == 0:
357
+ for idx, vit_msa in enumerate(ret):
358
+ vit_msa = vit_msa.split('[<M>]')[icl_msas:]
359
+ vit_msa = [_ for _ in vit_msa if len(_) > 0]
360
+ response.append(vit_msa)
361
+ return response
362
+
363
+
364
+ def chat_api(args, model, tokenizer, strategy, query=None): # TODO: stream chat
365
+ if args.input_source == 'chat':
366
+ assert query is not None
367
+ ret = online_generation(args, query, temp=args.temperature, top_p = args.top_p, top_k = args.top_k, func = partial(autoregressive_sampling, args, model = model, tokenizer = tokenizer, strategy = strategy))
368
+ return ret
369
+ else:
370
+ assert not args.stream_chat, "Offline generation doesn't support streaming output."
371
+ offline_generation(args, temp=args.temperature, top_p = args.top_p, top_k = args.top_k, func = partial(autoregressive_sampling, args, model = model, tokenizer = tokenizer, strategy = strategy))
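A minimal, hypothetical driver for chat_api above (not part of this commit). It assumes `model`, `tokenizer` and `strategy` have already been built by the repository's loading code, and the Namespace fields below mirror the args attributes this file actually reads.

    from argparse import Namespace

    # Hypothetical args; every field is one the functions above access.
    args = Namespace(
        input_source='chat',               # 'chat' routes to online_generation
        temperature=1.0, top_p=0.0, top_k=40,
        max_gen_length=1024,               # bumped automatically if too small for one MSA row
        sampling_strategy='BaseStrategy', num_beams=1,
        stream_chat=False, multiline_stream=False, chinese=False,
        print_all_beams=False, device='cuda',
    )

    # Query format: main sequence plus optional few-shot homologs joined by '<M>'.
    query = 'MKTAYIAKQRQISFVKSHFSRQ<M>MKTAYIARQRQISFVKSHFSRQ'
    beams = chat_api(args, model, tokenizer, strategy, query=query)  # model/tokenizer/strategy assumed to exist
    for virtual_msa in beams:              # one list of generated rows per returned beam
        print('<M>'.join(virtual_msa))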
utils/strategies.py ADDED
@@ -0,0 +1,229 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from sat.generation.sampling_strategies.base_strategy import top_k_logits
5
+ from sat.mpu.initialize import get_model_parallel_world_size, get_model_parallel_src_rank, get_model_parallel_group
6
+
7
+ class AdvancedBaseStrategy:
8
+ def __init__(self, batch_size, invalid_slices=[], temperature=1., no_repeat_ngram_size = 0, top_k=200, eps=1e-4, top_p=0.0, min_gen_length=1, end_tokens=None):
9
+ self.batch_size = batch_size
10
+ self.invalid_slices = invalid_slices
11
+ self.temperature = temperature
12
+ self.topk = top_k
13
+ self.top_p = top_p
14
+ self.eps = eps
15
+ self.min_gen_length = min_gen_length
16
+ self.ngram=no_repeat_ngram_size
17
+ if end_tokens is None:
18
+ end_tokens = []
19
+ self.end_tokens = end_tokens
20
+ self.length_generated = 0
21
+ self.cached_beam_ngram_bans = [{} for _ in range(self.batch_size)]
22
+ self._is_done = np.zeros(self.batch_size, dtype=np.bool_)
23
+ self._init_cache()
24
+
25
+ @property
26
+ def is_done(self) -> bool:
27
+ return self._is_done.all()
28
+
29
+ def _init_cache(self):
30
+ self.length_generated = 0
31
+ self.cached_beam_ngram_bans = [[{}] for _ in range(self.batch_size)]
32
+ self._is_done = np.zeros(self.batch_size, dtype=bool)
33
+
34
+
35
+ def forward(self, logits, tokens, mems, is_first = False, temperature=None):
36
+ # print(is_first)
37
+ batch_size, num_beam, seq_len = tokens.shape
38
+ seq_len = tokens.shape[-1]
39
+ if temperature is None:
40
+ temperature = self.temperature
41
+ logits = logits / temperature
42
+ if self.min_gen_length > self.length_generated:
43
+ for end_token in self.end_tokens:
44
+ logits[..., end_token] = -65504
45
+ for invalid_slice in self.invalid_slices:
46
+ logits[..., invalid_slice] = -65504
47
+ if self.ngram > 0 and seq_len > self.ngram:
48
+ for batch_idx in range(batch_size):
49
+ for i in range(num_beam):
50
+ ngram_prefix = tokens[batch_idx, i, -(self.ngram - 1) :].tolist() # TODO ngram=1
51
+ for banned_index in self.cached_beam_ngram_bans[batch_idx][i].get(tuple(ngram_prefix), []):
52
+ logits[batch_idx, i, banned_index] = -65504
53
+ logits = logits.view(-1, logits.size(-1))
54
+ logits = top_k_logits(logits, self.topk, self.top_p)
55
+ probs = F.softmax(logits.float(), dim=-1) # float is essential, due to a bug in PyTorch
56
+
57
+ pred = torch.multinomial(probs, num_samples=1)
58
+ for i in range(self.batch_size):
59
+ if i >= batch_size:
60
+ self._is_done[i] = True
61
+ elif self._is_done[i]:
62
+ pred[i] = -1
63
+ elif pred[i].item() in self.end_tokens:
64
+ self._is_done[i] = True
65
+
66
+ if self.ngram > 0:
67
+ for batch_idx in range(batch_size):
68
+ bans_continue = []
69
+ for i in range(num_beam):
70
+ bans = self.cached_beam_ngram_bans[batch_idx][i].copy()
71
+ ngram_prefix = tuple(tokens[batch_idx, i, -(self.ngram - 1):].tolist())
72
+ bans[ngram_prefix] = bans.get(ngram_prefix, tuple()) + (pred[batch_idx],)
73
+ bans_continue.append(bans)
74
+ self.cached_beam_ngram_bans[batch_idx] = bans_continue
75
+ tokens = torch.cat((tokens, pred.view(tokens.shape[:-1] + (1,))), dim=-1)
76
+ self.length_generated += 1
77
+
78
+ return tokens, mems
79
+
80
+ def finalize(self, tokens, mems):
81
+ self._is_done = np.zeros(self.batch_size, dtype=np.bool_)
82
+ self._init_cache()
83
+ return tokens, mems
84
+
85
+
86
+ class BeamSearchStrategy:
87
+ def __init__(
88
+ self,
89
+ batch_size,
90
+ num_beams,
91
+ length_penalty=1.0,
92
+ consider_end=False,
93
+ end_tokens=[],
94
+ invalid_slices=[],
95
+ no_repeat_ngram_size=0,
96
+ min_gen_length=0,
97
+ deterministic=False,
98
+ ):
99
+ self.batch_size = batch_size
100
+ self.num_beams = num_beams
101
+ self.length_penalty = length_penalty
102
+ self.end_tokens = end_tokens
103
+ self.ngram = no_repeat_ngram_size
104
+ self.min_gen_length = min_gen_length
105
+ self.invalid_slices = invalid_slices
106
+ self.consider_end = consider_end
107
+ self.deterministic = deterministic
108
+ self._init_cache()
109
+
110
+ def _init_cache(self):
111
+ self.end_beams = [[] for _ in range(self.batch_size)] # list of LongTensors
112
+ self.end_beams_penalized_scores = [[] for _ in range(self.batch_size)] # list of LongTensors
113
+ self.cached_beam_scores = 0 # [batch_size]
114
+ self.cached_beam_ngram_bans = [[{} for _ in range(self.num_beams)] for _ in range(self.batch_size)]
115
+ self.length_generated = 0
116
+ self._is_done = np.zeros(self.batch_size, dtype=np.bool_)
117
+
118
+ def _add_end_beams(self, score, beam, batch_idx):
119
+ score = score / ((5.0 + len(beam)) / 6) ** self.length_penalty # Magic number for OpenNMT
120
+ for i in range(len(self.end_beams[batch_idx]), -1, -1):
121
+ if i == 0 or score < self.end_beams_penalized_scores[batch_idx][i - 1]:
122
+ break
123
+ self.end_beams[batch_idx].insert(i, beam)
124
+ self.end_beams_penalized_scores[batch_idx].insert(i, score)
125
+
126
+ self.end_beams[batch_idx] = self.end_beams[batch_idx][: self.num_beams]
127
+ self.end_beams_penalized_scores[batch_idx] = self.end_beams_penalized_scores[batch_idx][: self.num_beams]
128
+
129
+ @property
130
+ def is_done(self) -> bool:
131
+ return self._is_done.all()
132
+
133
+ def forward(self, logits, tokens, mems):
134
+ batch_size, num_beams, vocab_size = logits.shape
135
+ seq_len = tokens.shape[-1]
136
+ logits = logits.float()
137
+ for invalid_slice in self.invalid_slices:
138
+ logits[..., invalid_slice] = -65504
139
+ if self.min_gen_length > self.length_generated:
140
+ for end_token in self.end_tokens:
141
+ logits[..., end_token] = -65504
142
+ if self.ngram > 0 and seq_len > self.ngram:
143
+ for batch_idx in range(batch_size):
144
+ for i in range(num_beams):
145
+ ngram_prefix = tokens[batch_idx, i, -(self.ngram - 1) :].tolist() # TODO ngram=1
146
+ for banned_index in self.cached_beam_ngram_bans[batch_idx][i].get(tuple(ngram_prefix), []):
147
+ logits[batch_idx, i, banned_index] = -65504
148
+
149
+ next_token_scores = F.log_softmax(logits, dim=-1) # [batch_size, num_beams, vocab_size]
150
+ prev_scores = self.cached_beam_scores
151
+ if isinstance(prev_scores, torch.Tensor):
152
+ prev_scores = prev_scores[..., None].expand_as(next_token_scores)
153
+ next_token_scores = next_token_scores + prev_scores
154
+
155
+ next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size)
156
+
157
+ probs = F.softmax(next_token_scores, dim=-1)
158
+ if num_beams < self.num_beams: # First token
159
+ probs = probs[..., :vocab_size]
160
+ if self.deterministic:
161
+ next_tokens = torch.topk(probs, k=(max(1, len(self.end_tokens)) + 1) * self.num_beams).indices # [2*nb]
162
+ else:
163
+ next_tokens = torch.multinomial(
164
+ probs, num_samples=(max(1, len(self.end_tokens)) + 1) * self.num_beams
165
+ ) # [2*nb]
166
+ next_token_scores = next_token_scores[torch.arange(batch_size).unsqueeze(1), next_tokens]
167
+ next_token_scores, _indices = torch.sort(next_token_scores, descending=True, dim=1)
168
+ next_tokens = next_tokens[torch.arange(batch_size).unsqueeze(1), _indices]
169
+
170
+ next_indices = torch.div(next_tokens, vocab_size, rounding_mode="trunc")
171
+ next_tokens = next_tokens % vocab_size
172
+
173
+ # select out end beams or continue beams
174
+ beam_continue_batch, score_continue_batch, mems_continue_batch = [], [], []
175
+ for batch_idx in range(batch_size):
176
+ beam_continue = []
177
+ scores_continue = []
178
+ bans_continue = []
179
+ mems_continue = []
180
+ for i in range(len(next_tokens[batch_idx])):
181
+ beam = torch.cat((tokens[batch_idx, next_indices[batch_idx, i]], next_tokens[batch_idx, i : i + 1]))
182
+ if not self._is_done[batch_idx] and int(next_tokens[batch_idx, i]) in self.end_tokens:
183
+ self._add_end_beams(next_token_scores[batch_idx, i], beam, batch_idx)
184
+ elif len(beam_continue) < self.num_beams:
185
+ beam_continue.append(beam)
186
+ mems_continue.append(mems[:, batch_idx, next_indices[batch_idx, i]])
187
+ # update caches
188
+ scores_continue.append(next_token_scores[batch_idx, i])
189
+ if self.ngram > 0:
190
+ bans = self.cached_beam_ngram_bans[batch_idx][next_indices[batch_idx, i]].copy()
191
+ # TODO ngram=1
192
+ ngram_prefix = tuple(tokens[batch_idx, next_indices[batch_idx, i], -(self.ngram - 1):].tolist())
193
+ bans[ngram_prefix] = bans.get(ngram_prefix, tuple()) + (next_tokens[batch_idx, i],)
194
+ bans_continue.append(bans)
195
+ else:
196
+ break
197
+ beam_continue_batch.append(torch.stack(beam_continue))
198
+ mems_continue_batch.append(torch.stack(mems_continue, dim=1))
199
+ score_continue_batch.append(scores_continue)
200
+ self.cached_beam_ngram_bans[batch_idx] = bans_continue
201
+ tokens = torch.stack(beam_continue_batch)
202
+ mems = torch.stack(mems_continue_batch, dim=1)
203
+ self.cached_beam_scores = torch.tensor(score_continue_batch, device=logits.device)
204
+ self.length_generated += 1
205
+ for batch_idx in range(self.batch_size):
206
+ if batch_idx >= batch_size:
207
+ self._is_done[batch_idx] = True
208
+ elif (
209
+ len(self.end_beams[batch_idx]) == self.num_beams
210
+ and self.end_beams_penalized_scores[batch_idx][-1]
211
+ >= self.cached_beam_scores[batch_idx].max() / ((5.0 + (seq_len + 1)) / 6) ** self.length_penalty
212
+ ): # We're done if no current beam can score better than the worst finished beam in end_beams
213
+ self._is_done[batch_idx] = True
214
+
215
+ return tokens, mems
216
+
217
+ def finalize(self, tokens, mems):
218
+ if self.consider_end:
219
+ batch_size, num_beams = tokens.shape[:2]
220
+ for batch_idx in range(batch_size):
221
+ if not self._is_done[batch_idx]:
222
+ for i in range(num_beams):
223
+ self._add_end_beams(self.cached_beam_scores[batch_idx, i], tokens[batch_idx, i], batch_idx)
224
+ mems = None
225
+ ret = self.end_beams[:batch_size]
226
+ else:
227
+ ret = tokens
228
+ self._init_cache()
229
+ return ret, mems
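A hypothetical construction sketch (not part of this commit) showing how the two strategies above would be instantiated by the generation code; `eos_id` is assumed to come from the tokenizer's 'eos' command token.

    eos_id = tokenizer.get_command('eos')   # assumed proteinglm_tokenizer instance

    sampling = AdvancedBaseStrategy(
        batch_size=1,
        temperature=0.8,
        top_k=40,
        top_p=0.9,
        end_tokens=[eos_id],
    )

    beam_search = BeamSearchStrategy(
        batch_size=1,
        num_beams=4,
        length_penalty=1.0,
        consider_end=True,
        end_tokens=[eos_id],
        no_repeat_ngram_size=3,
        deterministic=True,
    )

    # Both expose the interface the generation loop relies on: forward(logits, tokens, mems)
    # consumes logits shaped [batch_size, num_beams, vocab_size] and returns the extended
    # tokens, and finalize(tokens, mems) is called once is_done becomes True.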
utils/tokenization.py ADDED
@@ -0,0 +1,213 @@
1
+ from typing import Sequence, Tuple, List, Union
2
+ import itertools
3
+
4
+ class ResidueLevelTokenizer:
5
+ """
6
+ Tokenizer for Protein Residue Level Tokenization.
7
+ """
8
+
9
+ def __init__(self, **kwargs):
10
+ super(ResidueLevelTokenizer, self).__init__()
11
+ self.pad_tok = ['[pad]']
12
+ self.all_toks = self.pad_tok
13
+ self._tokens = ['L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D', 'P', 'K', 'Q', 'N', 'F', 'Y', 'M', 'H', 'W', 'C', 'X', 'B', 'U', 'Z', 'O', '.', '-']
14
+ self.all_toks.extend(self._tokens)
15
+ self._special_tokens = ['MASK', 'gMASK', 'sMASK', 'eod', 'sop', 'eop', '</s>', '<M>']
16
+ self.set_special_tokens(self._special_tokens)
17
+ self.special_tokens['eos']=self.special_tokens['</s>']
18
+ self.special_tokens['tMASK']=self.special_tokens['MASK']
19
+
20
+ self.all_toks.extend(self._special_tokens)
21
+ self._vocab = {t: i for i, t in enumerate(self.all_toks)}
22
+ self.command_token = {'[tMASK]': 'tMASK', '[MASK]':'MASK', '[gMASK]': 'gMASK', '[sMASK]':'sMASK'}
23
+ # print('Building vocab.: {}'.format(self._vocab))
24
+ # print('Special_tokens: {}'.format(self.special_tokens))
25
+ # print('All tokens: {}'.format(self.all_toks))
26
+
27
+ def pad_id(self):
28
+ return self._vocab['[pad]']
29
+
30
+ def set_special_tokens(self, special_tokens):
31
+ """Add a list of additional tokens to the encoder.
32
+ The additional tokens are indexed starting from the last index of the
33
+ current vocabulary in the order of the `special_tokens` list.
34
+ """
35
+ if not special_tokens:
36
+ self.special_tokens = {}
37
+ self.special_tokens_decoder = {}
38
+ return
39
+ self.special_tokens = dict((tok, len(self.all_toks) + i) for i, tok in enumerate(special_tokens))
40
+ self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
41
+
42
+
43
+ def __len__(self):
44
+ return len(self._vocab)
45
+
46
+
47
+ def EncodeAsIds(self, text, process_fn=None):
48
+ """convert sequence to idx"""
49
+ processed_text = text
50
+ if process_fn is not None:
51
+ processed_text = process_fn(processed_text)
52
+ processed_text = str(processed_text)
53
+ tokens = [self.TokenToId(c) for c in processed_text]
54
+ return tokens
55
+
56
+ def IdToToken(self, idx):
57
+ if idx == 0:
58
+ return '[pad]'
59
+ elif idx in self.special_tokens_decoder:
60
+ return f"[{self.special_tokens_decoder[idx]}]"
61
+ else:
62
+ try:
63
+ tok = self.all_toks[idx]
64
+ except IndexError:
65
+ tok = '*'
66
+ return tok
67
+ def TokenToId(self, token):
68
+ if token == '[pad]':
69
+ return 0
70
+ elif token in self.special_tokens:
71
+ return self.special_tokens[token]
72
+ else:
73
+ return self._vocab[token]
74
+
75
+ def DecodeIds(self, Ids):
76
+ return ''.join([self.IdToToken(tok) for tok in Ids])
77
+
78
+ def _tokenize(self, text) -> List[str]:
79
+ return text.split()
80
+
81
+ def tokenize(self, text, **kwargs) -> List[str]:
82
+ """
83
+ Inspired by https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_utils.py
84
+ Converts a string into a sequence of tokens, using the tokenizer.
85
+
86
+ Args:
87
+ text (:obj:`str`):
88
+ The sequence to be encoded.
89
+
90
+ Returns:
91
+ :obj:`List[str]`: The list of tokens.
92
+ """
93
+
94
+ def split_on_token(tok, text):
95
+ result = []
96
+ split_text = text.split(tok)
97
+ for i, sub_text in enumerate(split_text):
98
+ # AddedToken can control whitespace stripping around them.
99
+ # We use them for GPT2 and Roberta to have different behavior depending on the special token
100
+ # Cf. https://github.com/huggingface/transformers/pull/2778
101
+ # and https://github.com/huggingface/transformers/issues/3788
102
+ # We strip left and right by default
103
+ if i < len(split_text) - 1:
104
+ sub_text = sub_text.rstrip()
105
+ if i > 0:
106
+ sub_text = sub_text.lstrip()
107
+
108
+ if i == 0 and not sub_text:
109
+ result.append(tok)
110
+ elif i == len(split_text) - 1:
111
+ if sub_text:
112
+ result.append(sub_text)
113
+ else:
114
+ pass
115
+ else:
116
+ if sub_text:
117
+ result.append(sub_text)
118
+ result.append(tok)
119
+ return result
120
+
121
+ def split_on_tokens(tok_list, text):
122
+ if not text.strip():
123
+ return []
124
+
125
+ tokenized_text = []
126
+ text_list = [text]
127
+ for tok in tok_list:
128
+ tokenized_text = []
129
+ for sub_text in text_list:
130
+ if sub_text not in self._tokens:
131
+ tokenized_text.extend(split_on_token(tok, sub_text))
132
+ else:
133
+ tokenized_text.append(sub_text)
134
+ text_list = tokenized_text
135
+
136
+ return list(
137
+ itertools.chain.from_iterable(
138
+ (
139
+ self._tokenize(token)
140
+ if token not in self.all_toks
141
+ else [token]
142
+ for token in tokenized_text
143
+ )
144
+ )
145
+ )
146
+ no_split_token = self.all_toks
147
+ tokenized_text = split_on_tokens(no_split_token, text)
148
+ return self.convert_tokens_to_ids(tokenized_text)
149
+
150
+ def convert_tokens_to_ids(self, tokens):
151
+ """Converts a sequence of tokens into ids using the vocab."""
152
+ ids = []
153
+ # print_rank_0(tokens)
154
+ # print_rank_0(self.vocab)
155
+ for token in tokens:
156
+ ids.append(self.TokenToId(token))
157
+ return ids
158
+
159
+
160
+ class proteinglm_tokenizer:
161
+ """
162
+ Protein Tokenizer based on Residue level tokenizer
163
+ """
164
+
165
+ def __init__(self):
166
+ self.name = 'ProteinTokenizer'
167
+ self.tokenizer = ResidueLevelTokenizer()
168
+ self.special_tokens = self.tokenizer.special_tokens
169
+
170
+
171
+ def IdToToken(self, idx):
172
+ return self.tokenizer.IdToToken(idx)
173
+
174
+ def TokenToId(self, token):
175
+ return self.tokenizer.TokenToId(token)
176
+
177
+ @property
178
+ def vocab_size(self):
179
+ return len(self.tokenizer)
180
+
181
+ def decode(self, token_ids):
182
+ return self.tokenizer.DecodeIds([token_ids])
183
+
184
+ @property
185
+ def eod(self):
186
+ return self.tokenizer.special_tokens['eos']
187
+
188
+ def detokenize(self, Ids, type_token=False):
189
+ new_tokens = self.tokenizer.DecodeIds(Ids)
190
+ return new_tokens
191
+
192
+ def tokenize(self, text):
193
+ ids = self.tokenizer.tokenize(text)
194
+ return ids
195
+
196
+ @property
197
+ def vocab(self):
198
+ return self.tokenizer._vocab
199
+
200
+ @property
201
+ def inv_vocab(self):
202
+ return {v:k for k, v in self.tokenizer._vocab.items()}
203
+
204
+ @property
205
+ def get_pad_id(self):
206
+ return self.tokenizer.pad_id()
207
+
208
+
209
+ def get_command(self, token):
210
+ tok = token
211
+ if token in self.tokenizer.command_token:
212
+ tok = self.tokenizer.command_token[token]
213
+ return self.tokenizer.special_tokens[tok]
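A quick, hypothetical round-trip through the residue-level tokenizer above (not part of this commit), to make the id layout concrete.

    tokenizer = proteinglm_tokenizer()

    ids = tokenizer.tokenize('MKTAYIAKQR')   # one id per residue
    print(ids)                               # e.g. [17, 12, 8, 2, 16, ...] given the vocab order above
    print(tokenizer.detokenize(ids))         # 'MKTAYIAKQR'

    # Command tokens are looked up by name; the MASK family also accepts the
    # bracketed spelling used by the generation code.
    gmask_id = tokenizer.get_command('[gMASK]')
    sop_id = tokenizer.get_command('sop')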
utils/utils.py ADDED
@@ -0,0 +1,7 @@
1
+ def move_cursor_up(n):
2
+ # ANSI escape code to move cursor up by n lines
3
+ print(f"\033[{n}A", end='')
4
+
5
+ def move_cursor_down(n):
6
+ # ANSI escape code to move cursor down by n lines
7
+ print(f"\033[{n}B", end='')