SyrWin committed on
Commit
95f97c5
1 Parent(s): a281fc1
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ figures/frameworks.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,18 @@
+ kvplm_pretrained
+ __pycache__
+ test*
+ log*
+ *.sh.e*
+ *.sh.o*
+ .d*
+ llms
+ Text2graph
+ fig/
+ results/
+ *.out
+ *.err
+ debug*
+ data
+ scripts
+ conda_env
+ tmp*
README.md CHANGED
@@ -1,13 +1,49 @@
- ---
- title: ReactXT
- emoji: 🏆
- colorFrom: green
- colorTo: indigo
- sdk: gradio
- sdk_version: 4.36.0
- app_file: app.py
- pinned: false
- license: cc-by-sa-4.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # ReactXT: Understanding Molecular “Reaction-ship” via Reaction-Contextualized Molecule-Text Pretraining
+
+ ## Comparison to previous molecule-text generative modeling methods
+
+ ![comparison](./figures/comparison.pdf)
+
+
+ ## Framework of ReactXT
+
+ ![framework](./figures/frameworks.jpg)
+
+
+ ## Requirements
+
+ Our environment is detailed in `environment.yml`. To create a new environment `reactxt`, run the following command:
+
+ ```bash
+ conda env create -f environment.yml
+ ```
+
+
+ ## Reproduce the results
+
+ ### Reaction-Contextualized Molecule-Text Pretraining
+
+ ```bash
+ bash scripts/run_pretrain.sh
+ ```
+
+ ### Finetuning on downstream tasks
+
+ 1. Experimental Procedure Prediction on OpenExp
+
+ ```bash
+ bash scripts/run_action.sh
+ ```
+
+ 2. Molecule Captioning on PubChem324k and ChEBI-20
+
+ ```bash
+ bash scripts/run_caption.sh
+ bash scripts/run_chebi.sh
+ ```
+
+ 3. Retrosynthesis Prediction on USPTO-50k
+
+ ```bash
+ bash scripts/run_retro.sh
+ ```
all_checkpoints/.gitignore ADDED
@@ -0,0 +1,2 @@
+ *
+ !.gitignore
app.py ADDED
@@ -0,0 +1,309 @@
1
+ import os
2
+ import torch
3
+ import argparse
4
+ import warnings
5
+ from rdkit import Chem
6
+ from rdkit.Chem import CanonSmiles
7
+ from rdkit.Chem import MolFromSmiles, MolToSmiles
8
+ from data_provider.pretrain_dm import PretrainDM
9
+ from data_provider.tune_dm import *
10
+ from model.opt_flash_attention import replace_opt_attn_with_flash_attn
11
+ from model.blip2_model import Blip2Model
12
+ from data_provider.data_utils import json_read, json_write
13
+ from data_provider.data_utils import smiles2data, reformat_smiles
14
+ import gradio as gr
15
+ from datetime import datetime
16
+
17
+ ## for pyg bug
18
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
19
+ ## for A5000 gpus
20
+ torch.set_float32_matmul_precision('medium') # can be medium (bfloat16), high (tensorfloat32), highest (float32)
21
+
22
+ def smiles_split(string, separator='.'):
23
+ string = str(string)
24
+ mols = []
25
+ for smi in string.split(separator):
26
+ mol = MolFromSmiles(smi)
27
+ if mol is None:
28
+ continue # Skip invalid SMILES strings
29
+ mols.append(mol)
30
+
31
+ parts = []
32
+ current_part = []
33
+ charge_count = 0
34
+
35
+ for mol in mols:
36
+ charge = Chem.GetFormalCharge(mol)
37
+ if charge==0:
38
+ if current_part:
39
+ smiles = '.'.join([MolToSmiles(m) for m in current_part])
40
+ smiles = CanonSmiles(smiles)
41
+ parts.append(smiles)
42
+ current_part = []
43
+ charge_count = 0
44
+ parts.append(MolToSmiles(mol))
45
+ else:
46
+ charge_count += charge
47
+ current_part.append(mol)
48
+ if charge_count == 0:
49
+ smiles = '.'.join([MolToSmiles(m) for m in current_part])
50
+ smiles = CanonSmiles(smiles)
51
+ parts.append(smiles)
52
+ current_part = []
53
+ charge_count = 0
54
+ if current_part:
55
+ smiles = '.'.join([MolToSmiles(m) for m in current_part])
56
+ smiles = CanonSmiles(smiles)
57
+ parts.append(smiles)
58
+
59
+ return parts
60
+
61
+ def get_args():
62
+ parser = argparse.ArgumentParser()
63
+ parser.add_argument('--filename', type=str, default="main")
64
+ parser.add_argument('--seed', type=int, default=42, help='random seed')
65
+ # MM settings
66
+ parser.add_argument('--mode', type=str, default='pretrain', choices=['pretrain', 'ft', 'eval', 'pretrain_eval'])
67
+ parser.add_argument('--strategy_name', type=str, default='mydeepspeed')
68
+ parser.add_argument('--iupac_prediction', action='store_true', default=False)
69
+ parser.add_argument('--ckpt_path', type=str, default=None)
70
+ # parser = Trainer.add_argparse_args(parser)
71
+ parser = Blip2Model.add_model_specific_args(parser) # add model args
72
+ parser = PretrainDM.add_model_specific_args(parser)
73
+ parser.add_argument('--accelerator', type=str, default='gpu')
74
+ parser.add_argument('--devices', type=str, default='0,1,2,3')
75
+ parser.add_argument('--precision', type=str, default='bf16-mixed')
76
+ parser.add_argument('--downstream_task', type=str, default='action', choices=['action', 'synthesis', 'caption', 'chebi'])
77
+ parser.add_argument('--max_epochs', type=int, default=10)
78
+ parser.add_argument('--enable_flash', action='store_true', default=False)
79
+ parser.add_argument('--disable_graph_cache', action='store_true', default=False)
80
+ parser.add_argument('--generate_restrict_tokens', action='store_true', default=False)
81
+ parser.add_argument('--train_restrict_tokens', action='store_true', default=False)
82
+ parser.add_argument('--smiles_type', type=str, default='default', choices=['default', 'canonical', 'restricted', 'unrestricted', 'r_smiles'])
83
+ parser.add_argument('--accumulate_grad_batches', type=int, default=1)
84
+ parser.add_argument('--tqdm_interval', type=int, default=50)
85
+ parser.add_argument('--check_val_every_n_epoch', type=int, default=1)
86
+ args = parser.parse_args()
87
+
88
+ if args.enable_flash:
89
+ replace_opt_attn_with_flash_attn()
90
+ return args
91
+
92
+ app_config = {
93
+ "init_checkpoint": "all_checkpoints/ckpt_tune_hybridFeb11_May31/last_converted.ckpt",
94
+ "filename": "app",
95
+ "opt_model": "facebook/galactica-1.3b",
96
+ "num_workers": 4,
97
+ "rxn_max_len": 512,
98
+ "text_max_len": 512,
99
+ "precision": "bf16-mixed",
100
+ "max_inference_len": 512,
101
+ }
102
+
103
+ class InferenceRunner:
104
+ def __init__(self, model, tokenizer, rxn_max_len, smi_max_len,
105
+ smiles_type='default', device='cuda', args=None):
106
+ self.model = model
107
+ self.rxn_max_len = rxn_max_len
108
+ self.smi_max_len = smi_max_len
109
+ self.tokenizer = tokenizer
110
+ self.collater = Collater([], [])
111
+ self.mol_ph = '<mol>' * args.num_query_token
112
+ self.mol_token_id = tokenizer.mol_token_id
113
+ self.is_gal = args.opt_model.find('galactica') >= 0
114
+ self.collater = Collater([], [])
115
+ self.device = device
116
+ self.smiles_type = smiles_type
117
+ self.args = args
118
+ time_stamp = datetime.now().strftime("%Y.%m.%d-%H:%M")
119
+ self.cache_dir = f'results/{self.args.filename}/{time_stamp}'
120
+ os.makedirs(self.cache_dir, exist_ok=True)
121
+
122
+ def make_query_dict(self, rxn_string):
123
+ try:
124
+ reactant, solvent, product = rxn_string.split('>')
125
+ reactant = smiles_split(reactant)
126
+ product = smiles_split(product)
127
+ solvent = smiles_split(solvent) if solvent else []
128
+ assert reactant and product
129
+ except:
130
+ raise KeyError('Please input a valid reaction string')
131
+
132
+ extracted_molecules = {product[0]: "$-1$"}
133
+ for mol in reactant+solvent:
134
+ extracted_molecules[mol] = f"${len(extracted_molecules)}$"
135
+
136
+ result_dict = {}
137
+ result_dict['time_stamp'] = datetime.now().strftime("%Y.%m.%d %H:%M:%S.%f")[:-3]
138
+ result_dict['reaction_string'] = rxn_string
139
+ result_dict['REACTANT'] = reactant
140
+ result_dict['SOLVENT'] = solvent
141
+ result_dict['CATALYST'] = []
142
+ result_dict['PRODUCT'] = product
143
+ result_dict['extracted_molecules'] = extracted_molecules
144
+ return result_dict
145
+
146
+ def save_prediction(self, result_dict):
147
+ os.makedirs(self.cache_dir, exist_ok=True)
148
+ result_id = result_dict['time_stamp']
149
+ result_path = os.path.join(self.cache_dir, f'{result_id}.json')
150
+ json_write(result_path, result_dict)
151
+
152
+ def make_prompt(self, param_dict, smi_max_len=128):
153
+ smiles_list = []
154
+ prompt = ''
155
+ prompt += 'Reactants: '
156
+ smiles_wrapper = lambda x: reformat_smiles(x, smiles_type=self.smiles_type)[:smi_max_len]
157
+ for smi in param_dict['REACTANT']:
158
+ prompt += f'{param_dict["extracted_molecules"][smi]}: [START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
159
+ smiles_list.append(smi)
160
+
161
+ prompt += 'Product: '
162
+ for smi in param_dict['PRODUCT']:
163
+ prompt += f'{param_dict["extracted_molecules"][smi]}: [START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
164
+ smiles_list.append(smi)
165
+
166
+ if param_dict['CATALYST']:
167
+ prompt += 'Catalysts: '
168
+ for smi in param_dict['CATALYST']:
169
+ if smi in param_dict["extracted_molecules"]:
170
+ prompt += f'{param_dict["extracted_molecules"][smi]}: [START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
171
+ else:
172
+ prompt += f'[START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
173
+ smiles_list.append(smi)
174
+
175
+ if param_dict['SOLVENT']:
176
+ prompt += 'Solvents: '
177
+ for smi in param_dict['SOLVENT']:
178
+ if smi in param_dict["extracted_molecules"]:
179
+ prompt += f'{param_dict["extracted_molecules"][smi]}: [START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
180
+ else:
181
+ prompt += f'[START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
182
+ smiles_list.append(smi)
183
+
184
+ prompt += 'Action Squence: '
185
+ return prompt, smiles_list
186
+
187
+ def get_action_elements(self, rxn_dict):
188
+ input_text, smiles_list = self.make_prompt(rxn_dict, self.smi_max_len)
189
+
190
+ graph_list = []
191
+ for smiles in smiles_list:
192
+ graph_item = smiles2data(smiles)
193
+ graph_list.append(graph_item)
194
+ return graph_list, input_text
195
+
196
+ @torch.no_grad()
197
+ def predict(self, rxn_dict, temperature=1):
198
+ graphs, prompt_tokens = self.tokenize(rxn_dict)
199
+ result_dict = rxn_dict
200
+ samples = {'graphs': graphs, 'prompt_tokens': prompt_tokens}
201
+ prediction = self.model.blip2opt.generate(
202
+ samples,
203
+ do_sample=self.args.do_sample,
204
+ num_beams=self.args.num_beams,
205
+ max_length=self.args.max_inference_len,
206
+ min_length=self.args.min_inference_len,
207
+ num_captions=self.args.num_generate_captions,
208
+ temperature=temperature,
209
+ use_graph=True
210
+ )[0]
211
+ for k, v in result_dict['extracted_molecules'].items():
212
+ prediction = prediction.replace(v, k)
213
+ result_dict['prediction'] = prediction
214
+ return result_dict
215
+
216
+ def tokenize(self, rxn_dict):
217
+ graph_list, input_text = self.get_action_elements(rxn_dict)
218
+ if graph_list:
219
+ graphs = self.collater(graph_list).to(self.device)
220
+ input_prompt = smiles_handler(input_text, self.mol_ph, self.is_gal)[0]
221
+
222
+ ## deal with prompt
223
+ self.tokenizer.padding_side = 'left'
224
+ input_prompt_tokens = self.tokenizer(input_prompt,
225
+ truncation=True,
226
+ padding='max_length',
227
+ add_special_tokens=True,
228
+ max_length=self.rxn_max_len,
229
+ return_tensors='pt',
230
+ return_attention_mask=True).to(self.device)
231
+ is_mol_token = input_prompt_tokens.input_ids == self.mol_token_id
232
+ input_prompt_tokens['is_mol_token'] = is_mol_token
233
+ return graphs, input_prompt_tokens
234
+
235
+ def main(args):
236
+ device = torch.device('cuda')
237
+ # model
238
+ if args.init_checkpoint:
239
+ model = Blip2Model(args).to(device)
240
+ ckpt = torch.load(args.init_checkpoint, map_location='cpu')
241
+ model.load_state_dict(ckpt['state_dict'], strict=False)
242
+ print(f"loaded model from {args.init_checkpoint}")
243
+ else:
244
+ model = Blip2Model(args).to(device)
245
+ model.eval()
246
+
247
+ print('total params:', sum(p.numel() for p in model.parameters()))
248
+
249
+ if args.opt_model.find('galactica') >= 0 or args.opt_model.find('t5') >= 0:
250
+ tokenizer = model.blip2opt.opt_tokenizer
251
+ elif args.opt_model.find('llama') >= 0 or args.opt_model.find('vicuna') >= 0:
252
+ tokenizer = model.blip2opt.llm_tokenizer
253
+ else:
254
+ raise NotImplementedError
255
+
256
+ infer_runner = InferenceRunner(
257
+ model=model,
258
+ tokenizer=tokenizer,
259
+ rxn_max_len=args.rxn_max_len,
260
+ smi_max_len=args.smi_max_len,
261
+ device=device,
262
+ args=args
263
+ )
264
+ example_inputs = json_read('demo.json')
265
+ example_inputs = [[e] for e in example_inputs]
266
+
267
+ def online_chat(reaction_string, temperature=1):
268
+ data_item = infer_runner.make_query_dict(reaction_string)
269
+ result = infer_runner.predict(data_item, temperature=temperature)
270
+ infer_runner.save_prediction(result)
271
+ prediction = result['prediction'].replace(' ; ', ' ;\n')
272
+ return prediction
273
+
274
+ with gr.Blocks(css="""
275
+ .center { display: flex; justify-content: center; }
276
+ """) as demo:
277
+ gr.HTML(
278
+ """
279
+ <center><h1><b>ReactXT</b></h1></center>
280
+ <p style="font-size:20px; font-weight:bold;">This is the demo page of our ACL 2024 paper
281
+ <i>ReactXT: Understanding Molecular “Reaction-ship” via Reaction-Contextualized Molecule-Text Pretraining.</i></p>
282
+ """)
283
+ with gr.Row(elem_classes="center"):
284
+ gr.Image(value="./figures/frameworks.jpg", elem_classes="center", width=800, label="Framework of ReactXT")
285
+ gr.HTML(
286
+ """
287
+ <p style="font-size:16px;"> Please input one chemical reaction below, and we will generate the predicted experimental procedure.</p>
288
+ <p style="font-size:16px;"> The reaction should be in form of <b>Reactants>Reagents>Product</b>.</p>
289
+ """)
290
+
291
+ reaction_string = gr.Textbox(placeholder="Input one reaction", label='Input Reaction')
292
+ gr.Examples(example_inputs, [reaction_string,], fn=online_chat, label='Example Reactions')
293
+ with gr.Row():
294
+ btn = gr.Button("Submit")
295
+ clear_btn = gr.Button("Clear")
296
+ temperature = gr.Slider(0.1, 1, value=1, label='Temperature')
297
+ with gr.Row():
298
+ out = gr.Textbox(label="ReactXT's Output", placeholder="Predicted experimental procedure")
299
+ btn.click(fn=online_chat, inputs=[reaction_string, temperature], outputs=[out])
300
+ clear_btn.click(fn=lambda:("", ""), inputs=[], outputs=[reaction_string, out])
301
+
302
+ demo.launch(share=True)
303
+
304
+
305
+
306
+ if __name__ == '__main__':
307
+ args = get_args()
308
+ vars(args).update(app_config)
309
+ main(args)
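The demo above takes a single reaction written as `Reactants>Reagents>Product`; `smiles_split` then breaks each side into fragments, keeping charged fragments grouped until their formal charges cancel. Below is a minimal sketch of that input convention. The reaction string is an invented example (not taken from `demo.json`), and for neutral fragments the splitting shown reduces to what `smiles_split` does.

```python
# Hypothetical input: acetyl chloride + ethanol -> ethyl acetate, in Reactants>Reagents>Product form.
from rdkit.Chem import MolFromSmiles, MolToSmiles

rxn_string = "CC(=O)Cl.OCC>O>CC(=O)OCC"
reactants, reagents, product = rxn_string.split('>')

# For neutral fragments, smiles_split (defined in app.py) amounts to splitting on '.'
# and canonicalizing each fragment; charged fragments are merged until charges cancel.
for smi in reactants.split('.'):
    mol = MolFromSmiles(smi)
    if mol is not None:
        print(MolToSmiles(mol))
```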
average_ckpt.py ADDED
@@ -0,0 +1,33 @@
+ import argparse
+ import torch
+
+ def average_checkpoints(checkpoint_paths):
+     averaged_ckpt = torch.load(checkpoint_paths[-1], map_location=torch.device('cpu'))
+     param_sum_dict = {}
+     for key, value in averaged_ckpt['state_dict'].items():
+         param_sum_dict[key] = value.clone()
+
+     num_checkpoints = len(checkpoint_paths)
+     for ckpt_path in checkpoint_paths[:-1]:
+         checkpoint = torch.load(ckpt_path, map_location=torch.device('cpu'))
+         for key, value in checkpoint['state_dict'].items():
+             param_sum_dict[key] += value
+
+     for key in param_sum_dict.keys():
+         param_sum_dict[key] = param_sum_dict[key] / num_checkpoints
+     averaged_ckpt['state_dict'] = param_sum_dict
+
+     return averaged_ckpt
+
+ def parse_arguments():
+     parser = argparse.ArgumentParser(description="Averages the weights of multiple transformer model checkpoints.")
+     parser.add_argument('--checkpoint_paths', nargs='+', required=True,
+                         help='List of paths to the checkpoints to be averaged. Example: --checkpoint_paths path1 path2 path3')
+     parser.add_argument('--output_path', type=str, required=True)
+     return parser.parse_args()
+
+ if __name__ == "__main__":
+     args = parse_arguments()
+     averaged_state_dict = average_checkpoints(args.checkpoint_paths)
+     torch.save(averaged_state_dict, args.output_path)
+     print(f"Averaged checkpoint saved to {args.output_path}")
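A minimal sketch of calling `average_checkpoints` directly from Python instead of the CLI; the checkpoint file names are placeholders for Lightning checkpoints whose `state_dict` keys match.

```python
import torch
from average_ckpt import average_checkpoints

# Placeholder paths; any set of compatible Lightning checkpoints works.
ckpt_paths = ["epoch=08.ckpt", "epoch=09.ckpt", "epoch=10.ckpt"]
averaged = average_checkpoints(ckpt_paths)   # element-wise mean over 'state_dict'
torch.save(averaged, "averaged.ckpt")
```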
convert.py ADDED
@@ -0,0 +1,14 @@
+ import argparse
+ from pathlib import Path
+ from pytorch_lightning.utilities.deepspeed import convert_zero_checkpoint_to_fp32_state_dict
+
+ if __name__ == '__main__':
+     ## read a path using argparse and pass it to convert_zero_checkpoint_to_fp32_state_dict
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--input', type=str, default=None, help='path to the desired checkpoint folder')
+     parser.add_argument('--output', type=str, default=None, help='path to the pytorch fp32 state_dict output file')
+     # parser.add_argument('--tag', type=str, help='checkpoint tag used as a unique identifier for checkpoint')
+     args = parser.parse_args()
+     if args.output is None:
+         args.output = Path(args.input) / 'converted.ckpt'
+     convert_zero_checkpoint_to_fp32_state_dict(args.input, args.output)
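`convert.py` is a thin wrapper around Lightning's DeepSpeed ZeRO converter, which consolidates sharded parameter/optimizer states into a single fp32 checkpoint. A sketch of the equivalent direct call; the directory and output names below are assumptions, not paths shipped with the repo.

```python
from pytorch_lightning.utilities.deepspeed import convert_zero_checkpoint_to_fp32_state_dict

# Assumed paths: a DeepSpeed checkpoint folder produced during training and the
# consolidated output file that downstream scripts load with torch.load.
convert_zero_checkpoint_to_fp32_state_dict(
    "all_checkpoints/ckpt_pretrain/last.ckpt",
    "all_checkpoints/ckpt_pretrain/last_converted.ckpt",
)
```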
data_provider/__init__.py ADDED
File without changes
data_provider/caption_dataset.py ADDED
@@ -0,0 +1,93 @@
1
+ import torch
2
+ from torch_geometric.data import Dataset
3
+ import os
4
+ from torch_geometric.data import InMemoryDataset
5
+ from .data_utils import reformat_smiles
6
+ import random
7
+ import json
8
+
9
+ class PubChemDataset(InMemoryDataset):
10
+ def __init__(self, path):
11
+ super(PubChemDataset, self).__init__()
12
+ self.data, self.slices = torch.load(path)
13
+
14
+ def __getitem__(self, idx):
15
+ return self.get(idx)
16
+
17
+ class CaptionDataset(Dataset):
18
+ def __init__(self, root, mode, smi_max_len=128, use_graph=True, disable_graph_cache=False, smiles_type='default'):
19
+ super(CaptionDataset, self).__init__(root)
20
+ self.root = root
21
+ self.file_path = os.path.join(root, f'{mode}.pt')
22
+ self.smi_max_len = smi_max_len
23
+ self.tokenizer = None
24
+ self.use_graph = use_graph
25
+ self.smiles_type = smiles_type
26
+
27
+ self.data = PubChemDataset(self.file_path)
28
+
29
+ def get(self, index):
30
+ return self.__getitem__(index)
31
+
32
+ def len(self):
33
+ return len(self)
34
+
35
+ def __len__(self):
36
+ return len(self.data)
37
+
38
+ def __getitem__(self, index):
39
+ data = self.data[index]
40
+ smiles = reformat_smiles(data.smiles, smiles_type=self.smiles_type)
41
+ smiles_prompt = f'[START_I_SMILES]{smiles[:self.smi_max_len]}[END_I_SMILES]. '
42
+
43
+ text_list = []
44
+ count = 0
45
+ for line in data.text.split('\n'):
46
+ count += 1
47
+ text_list.append(line.strip())
48
+ if count > 100:
49
+ break
50
+ text = ' '.join(text_list) + '\n'
51
+ graph_list = [data] if self.use_graph else []
52
+
53
+ return index, graph_list, text, smiles_prompt
54
+
55
+ class PretrainCaptionDataset(Dataset):
56
+ def __init__(self, root, smi_max_len=128, use_graph=True, disable_graph_cache=False):
57
+ super(PretrainCaptionDataset, self).__init__(root)
58
+ self.pre_train_data = CaptionDataset(
59
+ root,
60
+ 'pretrain',
61
+ smi_max_len=smi_max_len,
62
+ use_graph=use_graph,
63
+ )
64
+ self.train_data = CaptionDataset(
65
+ root,
66
+ 'train',
67
+ smi_max_len=smi_max_len,
68
+ use_graph=use_graph,
69
+ )
70
+
71
+ def get(self, index):
72
+ return self.__getitem__(index)
73
+
74
+ def len(self):
75
+ return len(self)
76
+
77
+ def __len__(self):
78
+ return len(self.pre_train_data) + len(self.train_data)
79
+
80
+ def __getitem__(self, index):
81
+ if index < len(self.pre_train_data):
82
+ index, graph_list, text, smiles_prompt = self.pre_train_data[index]
83
+ else:
84
+ index, graph_list, text, smiles_prompt = self.train_data[index - len(self.pre_train_data)]
85
+ graph_item = graph_list[0]
86
+ if hasattr(graph_item, 'iupac'):
87
+ del graph_item.iupac
88
+ if hasattr(graph_item, 'cid'):
89
+ del graph_item.cid
90
+ del graph_item.text
91
+ del graph_item.smiles
92
+
93
+ return graph_item, text, smiles_prompt
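A sketch of how `CaptionDataset` items look when iterated. The data root is an assumption based on the defaults used elsewhere in the repo, and it requires the preprocessed PubChem324k split files (e.g. `train.pt`) to exist.

```python
from data_provider.caption_dataset import CaptionDataset

# Assumed data root; requires the preprocessed PubChem324k .pt files.
dataset = CaptionDataset(root="data/caption_data", mode="train", smi_max_len=128)
index, graph_list, text, smiles_prompt = dataset[0]
# smiles_prompt has the form "[START_I_SMILES]...[END_I_SMILES]. " and text is the caption.
print(smiles_prompt, text[:80])
```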
data_provider/chebi_dataset.py ADDED
@@ -0,0 +1,42 @@
+ import torch
+ from torch_geometric.data import Dataset
+ import os
+ from torch_geometric.data import InMemoryDataset
+ import random
+ import json
+ from .data_utils import reformat_smiles
+
+ class ChEBI_dataset(Dataset):
+     def __init__(self, root, mode, smi_max_len=128, use_graph=True, disable_graph_cache=False, smiles_type='default'):
+         super(ChEBI_dataset, self).__init__(root)
+         self.root = root
+         self.file_path = os.path.join(root, f'{mode}.txt')
+         self.smi_max_len = smi_max_len
+         self.tokenizer = None
+         self.use_graph = use_graph
+         self.smiles_type = smiles_type
+         if self.use_graph:
+             self.idx_graph_map = torch.load(os.path.join(root, 'cid_graph_map.pt'))
+         with open(self.file_path) as f:
+             lines = f.readlines()
+         self.data = [line.split('\t', maxsplit=2) for line in lines[1:]]
+
+
+     def get(self, index):
+         return self.__getitem__(index)
+
+     def len(self):
+         return len(self)
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, index):
+         cid, smiles, text = self.data[index]
+         smiles = reformat_smiles(smiles, smiles_type=self.smiles_type)
+         smiles_prompt = f'[START_I_SMILES]{smiles[:self.smi_max_len]}[END_I_SMILES]. '
+         text = text.strip() + '\n'
+         if self.use_graph:
+             graph_list = [self.idx_graph_map[cid]]
+         else:
+             graph_list = []  # keep graph_list defined when graphs are disabled
+
+         return index, graph_list, text, smiles_prompt
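For reference, `ChEBI_dataset` expects a tab-separated file with a header row and three columns (compound id, SMILES, description); the row below is an invented example of that layout.

```python
# Invented example row in the three-column, tab-separated layout parsed above.
row = "702\tCCO\tEthanol is a primary alcohol and a widely used solvent."
cid, smiles, text = row.split('\t', maxsplit=2)
print(cid, smiles, text)
```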
data_provider/context_gen.py ADDED
@@ -0,0 +1,207 @@
1
+ import random
2
+ import os
3
+ import numpy as np
4
+ import argparse
5
+ import json
6
+ from collections import defaultdict
7
+ from matplotlib import pyplot as plt
8
+ from collections import Counter
9
+ from .data_utils import json_read
10
+
11
+ def set_random_seed(seed):
12
+ random.seed(seed)
13
+ os.environ['PYTHONHASHSEED'] = str(seed)
14
+ np.random.seed(seed)
15
+
16
+ class Reaction_Cluster:
17
+ def __init__(self, root, reaction_filename, reverse_ratio=0.5):
18
+ self.root = root
19
+ self.reaction_data = json_read(os.path.join(self.root, reaction_filename))
20
+ self.property_data = json_read(os.path.join(self.root, 'Abstract_property.json'))
21
+ self.mol_property_map = {d['canon_smiles']: d for d in self.property_data}
22
+ self.reverse_ratio = reverse_ratio
23
+ self.rxn_mols_attr = defaultdict(lambda:{
24
+ 'freq': 0,
25
+ 'occurrence': 0,
26
+ 'in_caption': False,
27
+ })
28
+
29
+ self._read_reaction_mols() # add `valid_mols` in each rxn_dict
30
+ self.mol_counter = Counter(mol for rxn_dict in self.reaction_data for mol in rxn_dict['valid_mols'])
31
+ self._calculate_Pr() # calculate P(r), add `weight` in each rxn_dict
32
+ self._calculate_Pir() # calculate P(i|r), add `mol_weight` in each rxn_dict
33
+
34
+ def _read_reaction_mols(self):
35
+ self.valid_rxn_indices = []
36
+ for rxn_id, rxn_dict in enumerate(self.reaction_data):
37
+ mol_role_map = {}
38
+ for key in ['REACTANT', 'CATALYST', 'SOLVENT', 'PRODUCT']:
39
+ for m in rxn_dict[key]:
40
+ if m in mol_role_map:
41
+ continue
42
+ if m in self.mol_property_map:
43
+ mol_role_map[m] = key
44
+ valid_mols = []
45
+ for mol in mol_role_map:
46
+ assert mol in self.mol_property_map # this is guaranteed by the above if statement
47
+ if 'abstract' not in self.mol_property_map[mol]:
48
+ continue
49
+ valid_mols.append(mol) # here the molecules should be in the R, C, S, P order.
50
+ if len(valid_mols) > 0:
51
+ self.valid_rxn_indices.append(rxn_id)
52
+ rxn_dict['valid_mols'] = valid_mols
53
+ rxn_dict['mol_role_map'] = mol_role_map
54
+
55
+ def _calculate_Pr(self):
56
+ total_weights = 0
57
+ for rxn_dict in self.reaction_data:
58
+ rxn_weight = sum([1/self.mol_counter[mol] for mol in rxn_dict['valid_mols']])
59
+ rxn_dict['weight'] = rxn_weight
60
+ total_weights += rxn_weight
61
+ for rxn_dict in self.reaction_data:
62
+ rxn_dict['weight'] = rxn_dict['weight'] / total_weights
63
+
64
+ def _calculate_Pir(self):
65
+ for rxn_dict in self.reaction_data:
66
+ mol_weight = {}
67
+ for mol in rxn_dict['valid_mols']:
68
+ mol_weight[mol] = 1/self.mol_counter[mol]
69
+ total_weight = sum(mol_weight.values())
70
+ rxn_dict['mol_weight'] = {m:w/total_weight for m, w in mol_weight.items()}
71
+
72
+ def choose_mol(self, valid_mols, k=4, weights=None):
73
+ if k>=len(valid_mols):
74
+ sampled_indices = list(range(len(valid_mols)))
75
+ else:
76
+ sampled_indices = np.random.choice(len(valid_mols), k, replace=False, p=weights)
77
+ sampled_indices = list(sampled_indices)
78
+ sampled_indices = sorted(sampled_indices)
79
+ if random.random() < self.reverse_ratio: # reverse the indices with reverse_ratio chance.
80
+ sampled_indices.reverse()
81
+ sampled_mols = [valid_mols[i] for i in sampled_indices]
82
+ return sampled_mols
83
+
84
+ def sample_mol_batch(self, index=None, k=4):
85
+ if index is None:
86
+ index = self.sample_rxn_index(1)[0]
87
+ assert index < len(self.reaction_data)
88
+ rxn = self.reaction_data[index]
89
+ valid_mols, weights = zip(*rxn['mol_weight'].items())
90
+
91
+ sampled_mols = self.choose_mol(valid_mols, k=k, weights=weights)
92
+ mol_property_batch = []
93
+ for mol in sampled_mols:
94
+ mol_property = self.mol_property_map[mol]
95
+ mol_role = rxn['mol_role_map'][mol]
96
+ mol_property['role'] = mol_role
97
+ mol_property_batch.append(mol_property)
98
+ if 'rsmiles_map' in rxn:
99
+ rsmiles_map = random.choice(rxn['rsmiles_map'])
100
+ for mol_property in mol_property_batch:
101
+ canon_smiles = mol_property['canon_smiles']
102
+ if canon_smiles in rsmiles_map:
103
+ mol_property['r_smiles'] = rsmiles_map[canon_smiles]
104
+ return mol_property_batch
105
+
106
+ def sample_rxn_index(self, num_samples):
107
+ indices = range(len(self.reaction_data))
108
+ weights = [d['weight'] for d in self.reaction_data]
109
+ return np.random.choice(indices, num_samples, replace=False, p=weights)
110
+
111
+ def __call__(self, rxn_num=1000, k=4):
112
+ sampled_indices = self.sample_rxn_index(rxn_num)
113
+ sampled_batch = [self.sample_mol_batch(idx, k=k) for idx in sampled_indices]
114
+ return sampled_batch
115
+
116
+ def generate_batch_uniform_rxn(self, rxn_num=1000, k=4):
117
+ assert rxn_num <= len(self.valid_rxn_indices)
118
+ sampled_rxn_indices = random.sample(self.valid_rxn_indices, rxn_num)
119
+ sampled_batch = []
120
+ for rxn_id in sampled_rxn_indices:
121
+ rxn = self.reaction_data[rxn_id]
122
+ sampled_mols = self.choose_mol(rxn['valid_mols'], k=k, weights=None)
123
+ mol_property_batch = []
124
+ for mol in sampled_mols:
125
+ mol_property = self.mol_property_map[mol]
126
+ mol_role = rxn['mol_role_map'][mol]
127
+ mol_property['role'] = mol_role
128
+ mol_property_batch.append(mol_property)
129
+ sampled_batch.append(mol_property_batch)
130
+ return sampled_batch
131
+
132
+ def generate_batch_uniform_mol(self, rxn_num=1000, k=4):
133
+ valid_mols = list(self.mol_counter.elements())
134
+ assert rxn_num*k <= len(valid_mols)
135
+ sampled_batch = []
136
+ sampled_mol_ids = random.sample(range(len(valid_mols)), rxn_num*k)
137
+ for i in range(rxn_num):
138
+ sampled_batch.append([self.mol_property_map[valid_mols[mol_id]] for mol_id in sampled_mol_ids[i*k:(i+1)*k]])
139
+ return sampled_batch
140
+
141
+ def generate_batch_single(self, rxn_num=1000):
142
+ valid_mols = list(self.mol_counter.elements())
143
+ sampled_mols = random.sample(valid_mols, rxn_num)
144
+ total_valid_mols = [[self.mol_property_map[mol]] for mol in sampled_mols]
145
+ return total_valid_mols
146
+
147
+ # visualize the probability for molecules in the caption dataset.
148
+ def visualize_mol_distribution(self):
149
+ prob_dict = {mol:0.0 for mol in self.mol_property_map.keys()}
150
+ N = len(prob_dict)
151
+ M = len(self.reaction_data)
152
+ assert N == len(self.mol_property_map)
153
+ print(f'Number of molecules in Caption Dataset: {N}')
154
+ print(f'Number of Reactions in Reaction Dataset: {M}')
155
+
156
+ # prob distribution for molecules
157
+ for rxn_dict in self.reaction_data:
158
+ for mol, weight in rxn_dict['mol_weight'].items():
159
+ prob_dict[mol] += weight * rxn_dict['weight']
160
+ # sum of prob_dict.values() should already be 1.
161
+ prob_values = np.array(list(prob_dict.values()))
162
+ prob_values *= N
163
+
164
+ # prob distribution for reactions
165
+ rxn_weights = np.array([d['weight'] for d in self.reaction_data])
166
+ # sum of rxn_weights should already be 1.
167
+ rxn_weights *= M
168
+
169
+ return prob_values, rxn_weights
170
+
171
+ # visualize the frequency for molecules in the caption dataset.
172
+ def visualize_mol_frequency(self, rxn_num=1000, k=4, epochs=100):
173
+ sampled_mols_counter = Counter()
174
+ sampled_rxns_counter = Counter()
175
+ for _ in range(epochs):
176
+ rxn_indices = self.sample_rxn_index(rxn_num)
177
+ sampled_rxns_counter.update(rxn_indices)
178
+ for index in rxn_indices:
179
+ rxn = self.reaction_data[index]
180
+ if len(rxn['valid_mols']) ==0:
181
+ continue
182
+ valid_mols, weights = zip(*rxn['mol_weight'].items())
183
+ mol_batch = self.choose_mol(valid_mols, k=k, weights=weights)
184
+ sampled_mols_counter.update(mol_batch)
185
+ sampled_mols_count = np.array([c for _, c in sorted(sampled_mols_counter.items())])
186
+ sampled_rxns_count = np.array([c for _, c in sorted(sampled_rxns_counter.items())])
187
+ return sampled_mols_count, sampled_rxns_count
188
+
189
+ def _randomly(self, func, *args, **kwargs):
190
+ # make fake weights and backup the weights
191
+ for rxn_dict in self.reaction_data:
192
+ rxn_dict['weight_bak'] = rxn_dict['weight']
193
+ rxn_dict['weight'] = 1/len(self.reaction_data)
194
+ rxn_dict['mol_weight_bak'] = rxn_dict['mol_weight']
195
+ rxn_dict['mol_weight'] = {m:1/len(rxn_dict['mol_weight']) for m in rxn_dict['mol_weight']}
196
+
197
+ # run the function
198
+ result = func(*args, **kwargs)
199
+
200
+ # weights recovery
201
+ for rxn_dict in self.reaction_data:
202
+ rxn_dict['weight'] = rxn_dict['weight_bak']
203
+ del rxn_dict['weight_bak']
204
+ rxn_dict['mol_weight'] = rxn_dict['mol_weight_bak']
205
+ del rxn_dict['mol_weight_bak']
206
+
207
+ return result
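`Reaction_Cluster` samples reactions with probability P(r) proportional to the summed inverse frequency of their annotated molecules, and molecules within a reaction with P(i|r) proportional to 1/frequency, so rare molecules are not drowned out by common reagents. A toy numeric sketch of that weighting (data invented):

```python
from collections import Counter

reactions = [["A", "B"], ["B", "C"], ["B"]]            # toy valid_mols per reaction
counts = Counter(m for rxn in reactions for m in rxn)  # A:1, B:3, C:1

raw = [sum(1 / counts[m] for m in rxn) for rxn in reactions]        # unnormalized P(r)
P_r = [w / sum(raw) for w in raw]                                   # [4/9, 4/9, 1/9]
P_i_given_r0 = {m: (1 / counts[m]) / raw[0] for m in reactions[0]}  # {'A': 0.75, 'B': 0.25}
print(P_r, P_i_given_r0)
```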
data_provider/data_utils.py ADDED
@@ -0,0 +1,144 @@
1
+ import torch
2
+ from torch_geometric.data import Data
3
+ from ogb.utils import smiles2graph
4
+ from rdkit import Chem
5
+ import random
6
+ import os
7
+ import json
8
+ from rdkit import RDLogger
9
+ RDLogger.DisableLog('rdApp.*')
10
+ from .r_smiles import multi_process
11
+ import multiprocessing
12
+
13
+ def reformat_smiles(smiles, smiles_type='default'):
14
+ if not smiles:
15
+ return None
16
+ if smiles_type == 'default':
17
+ return smiles
18
+ elif smiles_type=='canonical':
19
+ mol = Chem.MolFromSmiles(smiles)
20
+ return Chem.MolToSmiles(mol, canonical=True, isomericSmiles=False)
21
+ elif smiles_type=='restricted':
22
+ mol = Chem.MolFromSmiles(smiles)
23
+ new_atom_order = list(range(mol.GetNumAtoms()))
24
+ random.shuffle(new_atom_order)
25
+ random_mol = Chem.RenumberAtoms(mol, newOrder=new_atom_order)
26
+ return Chem.MolToSmiles(random_mol, canonical=False, isomericSmiles=False)
27
+ elif smiles_type=='unrestricted':
28
+ mol = Chem.MolFromSmiles(smiles)
29
+ return Chem.MolToSmiles(mol, canonical=False, doRandom=True, isomericSmiles=False)
30
+ elif smiles_type=='r_smiles':
31
+ # the implementation of root-aligned smiles is in r_smiles.py
32
+ return smiles
33
+ else:
34
+ raise NotImplementedError(f"smiles_type {smiles_type} not implemented")
35
+
36
+ def json_read(path):
37
+ with open(path, 'r') as f:
38
+ data = json.load(f)
39
+ return data
40
+
41
+ def json_write(path, data):
42
+ with open(path, 'w') as f:
43
+ json.dump(data, f, indent=4, ensure_ascii=False)
44
+
45
+ def format_float_from_string(s):
46
+ try:
47
+ float_value = float(s)
48
+ return f'{float_value:.2f}'
49
+ except ValueError:
50
+ return s
51
+
52
+ def make_abstract(mol_dict, abstract_max_len=256, property_max_len=256):
53
+ prompt = ''
54
+ if 'abstract' in mol_dict:
55
+ abstract_string = mol_dict['abstract'][:abstract_max_len]
56
+ prompt += f'[Abstract] {abstract_string} '
57
+
58
+ property_string = ''
59
+ property_dict = mol_dict['property'] if 'property' in mol_dict else {}
60
+ for property_key in ['Experimental Properties', 'Computed Properties']:
61
+ if not property_key in property_dict:
62
+ continue
63
+ for key, value in property_dict[property_key].items():
64
+ if isinstance(value, float):
65
+ key_value_string = f'{key}: {value:.2f}; '
66
+ elif isinstance(value, str):
67
+ float_value = format_float_from_string(value)
68
+ key_value_string = f'{key}: {float_value}; '
69
+ else:
70
+ key_value_string = f'{key}: {value}; '
71
+ if len(property_string+key_value_string) > property_max_len:
72
+ break
73
+ property_string += key_value_string
74
+ if property_string:
75
+ property_string = property_string[:property_max_len]
76
+ prompt += f'[Properties] {property_string}. '
77
+ return prompt
78
+
79
+ def smiles2data(smiles):
80
+ graph = smiles2graph(smiles)
81
+ x = torch.from_numpy(graph['node_feat'])
82
+ edge_index = torch.from_numpy(graph['edge_index'], )
83
+ edge_attr = torch.from_numpy(graph['edge_feat'])
84
+ data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
85
+ return data
86
+
87
+ import re
88
+ SPLIT_MARKER = f"SPL{1}T-TH{1}S-Pl3A5E"
89
+
90
+ CUSTOM_SEQ_RE = re.compile(r"(\[START_(DNA|SMILES|I_SMILES|AMINO)])(.*?)(\[END_\2])")
91
+
92
+
93
+ def _insert_split_marker(m: re.Match):
94
+ """
95
+ Applies split marker based on a regex match of special tokens such as
96
+ [START_DNA].
97
+
98
+ Parameters
99
+ ----------
100
+ m : re.Match
+ Regex match of a special-token sequence such as [START_DNA] ... [END_DNA]
102
+
103
+ Returns
104
+ ----------
105
+ str - the text with the split token added
106
+ """
107
+ start_token, _, sequence, end_token = m.groups()
108
+ sequence = re.sub(r"(.)", fr"{SPLIT_MARKER}\1", sequence, flags=re.DOTALL)
109
+ return f"{start_token}{sequence}{SPLIT_MARKER}{end_token}"
110
+
111
+ def escape_custom_split_sequence(text):
112
+ """
113
+ Applies custom splitting to the text for GALILEO's tokenization
114
+
115
+ Parameters
116
+ ----------
117
+ text : str
118
+ Input text to split
119
+
120
+ Returns
121
+ ----------
122
+ str - the text with the split token added
123
+ """
124
+ return CUSTOM_SEQ_RE.sub(_insert_split_marker, text)
125
+
126
+ def generate_rsmiles(reactants, products, augmentation=20):
127
+ """
128
+ reactants: list of N, reactant smiles
129
+ products: list of N, product smiles
130
+ augmentation: int, number of augmentations
131
+
132
+ return: list of N x augmentation
133
+ """
134
+ data = [{
135
+ 'reactant': r.strip().replace(' ', ''),
136
+ 'product': p.strip().replace(' ', ''),
137
+ 'augmentation': augmentation,
138
+ 'root_aligned': True,
139
+ } for r, p in zip(reactants, products)]
140
+ pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
141
+ results = pool.map(func=multi_process,iterable=data)
142
+ product_smiles = [smi for r in results for smi in r['src_data']]
143
+ reactant_smiles = [smi for r in results for smi in r['tgt_data']]
144
+ return reactant_smiles, product_smiles
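The `smiles_type` options in `reformat_smiles` correspond to standard RDKit rewrites: canonical output, a 'restricted' randomization that shuffles the atom numbering, and an 'unrestricted' random traversal. A short sketch with an arbitrary input molecule:

```python
import random
from rdkit import Chem

mol = Chem.MolFromSmiles("c1ccccc1O")   # arbitrary example molecule (phenol)

canonical = Chem.MolToSmiles(mol, canonical=True, isomericSmiles=False)

order = list(range(mol.GetNumAtoms()))
random.shuffle(order)                   # 'restricted': random atom numbering, non-canonical output
restricted = Chem.MolToSmiles(Chem.RenumberAtoms(mol, newOrder=order), canonical=False, isomericSmiles=False)

unrestricted = Chem.MolToSmiles(mol, canonical=False, doRandom=True, isomericSmiles=False)
print(canonical, restricted, unrestricted)
```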
data_provider/molecule_abstract_dataset.py ADDED
@@ -0,0 +1,222 @@
1
+ import torch
2
+ from torch_geometric.data import Dataset
3
+ import os
4
+ from .context_gen import Reaction_Cluster
5
+ import json
6
+ from .data_utils import smiles2data, reformat_smiles
7
+ from collections import defaultdict
8
+ import random
9
+ from data_provider.caption_dataset import PretrainCaptionDataset
10
+ from data_provider.synthesis_dataset import SynthesisDataset
11
+
12
+ def format_float_from_string(s):
13
+ try:
14
+ float_value = float(s)
15
+ return f'{float_value:.2f}'
16
+ except ValueError:
17
+ return s
18
+
19
+ class MoleculeAbstract(Dataset):
20
+ def __init__(self,
21
+ root,
22
+ rxn_num=1000,
23
+ rxn_batch_size=4,
24
+ smi_max_len=128,
25
+ prompt=None,
26
+ disable_graph_cache=False,
27
+ disable_graphs=False,
28
+ context_style='weighted_rxn',
29
+ use_caption_dataset=False,
30
+ caption_batch_num=10000,
31
+ synthesis_datasetpath=None,
32
+ synthesis_batch_num=10000,
33
+ reverse_ratio=0.5,
34
+ enable_abstract=True,
35
+ enable_property=True,
36
+ smiles_type='default',
37
+ mode='train'
38
+ ):
39
+ super(MoleculeAbstract, self).__init__(root)
40
+ self.root = root
41
+ self.rxn_num = rxn_num
42
+ self.rxn_batch_size = rxn_batch_size
43
+ self.smi_max_len = smi_max_len
44
+ self.context_style = context_style
45
+ self.tokenizer = None
46
+ self.disable_graph_cache = disable_graph_cache
47
+ self.disable_graphs = disable_graphs
48
+ self.use_caption_dataset = use_caption_dataset
49
+ self.smiles_type = smiles_type
50
+ if use_caption_dataset:
51
+ self.caption_dataset = PretrainCaptionDataset(
52
+ os.path.join(root, '../caption_data'),
53
+ smi_max_len=smi_max_len,
54
+ use_graph=not self.disable_graphs,
55
+ disable_graph_cache=disable_graph_cache,
56
+ smiles_type=smiles_type,
57
+ )
58
+ self.caption_batch_num = caption_batch_num
59
+ self.use_synthesis_dataset = bool(synthesis_datasetpath)
60
+ if self.use_synthesis_dataset:
61
+ self.synthesis_dataset = SynthesisDataset(
62
+ synthesis_datasetpath,
63
+ 'train',
64
+ smi_max_len,
65
+ roundrobin_train=True,
66
+ use_graph=not disable_graphs,
67
+ disable_graph_cache=disable_graph_cache,
68
+ smiles_type='default',
69
+ )
70
+ self.synthesis_batch_num = synthesis_batch_num
71
+ if not self.disable_graphs:
72
+ self.mol_graph_map = torch.load(os.path.join(self.root, 'mol_graph_map.pt'))
73
+ reaction_filename = 'reactions/reactions_test.json' if (mode=='test') else 'reactions/reactions.json'
74
+ if smiles_type=='r_smiles':
75
+ reaction_filename = 'reactions/reactions_wRSMILES.json'
76
+ self.cluster = Reaction_Cluster(self.root, reaction_filename=reaction_filename, reverse_ratio=reverse_ratio)
77
+ self.reload_data_list()
78
+ self.abstract_max_len = 10240
79
+ self.property_max_len = 10240
80
+ self.enable_abstract = enable_abstract
81
+ self.enable_property = enable_property
82
+
83
+ def get(self, index):
84
+ return self.__getitem__(index)
85
+
86
+ def len(self):
87
+ return len(self)
88
+
89
+ def __len__(self):
90
+ data_len = len(self.data_list)
91
+ if self.use_caption_dataset:
92
+ data_len += len(self.caption_index_list)
93
+ if self.use_synthesis_dataset:
94
+ data_len += len(self.synthesis_index_list)
95
+ return data_len
96
+
97
+ def reload_data_list(self):
98
+ k = self.rxn_batch_size
99
+ if self.context_style == 'weighted_rxn':
100
+ self.data_list = self.cluster(self.rxn_num, k=k)
101
+ elif self.context_style == 'uniform_rxn':
102
+ self.data_list = self.cluster.generate_batch_uniform_rxn(self.rxn_num, k=k)
103
+ elif self.context_style == 'uniform_mol':
104
+ self.data_list = self.cluster.generate_batch_uniform_mol(self.rxn_num, k=k)
105
+ elif self.context_style == 'single_mol':
106
+ self.data_list = self.cluster.generate_batch_single(self.rxn_num)
107
+ elif self.context_style == 'hybrid':
108
+ self.data_list = self.cluster(self.rxn_num//2, k=k)
109
+ self.data_list += self.cluster.generate_batch_uniform_mol(self.rxn_num//2, k=k)
110
+ else:
111
+ raise NotImplementedError
112
+ if self.use_caption_dataset:
113
+ assert self.caption_batch_num*k <= len(self.caption_dataset)
114
+ caption_index_list = random.sample(range(len(self.caption_dataset)), self.caption_batch_num*k)
115
+ self.caption_index_list = [caption_index_list[i*k:(i+1)*k] for i in range(self.caption_batch_num)]
116
+ else:
117
+ self.caption_index_list = []
118
+ if self.use_synthesis_dataset:
119
+ if self.synthesis_dataset.roundrobin_train:
120
+ self.synthesis_dataset.reload_data()
121
+ assert self.synthesis_batch_num <= len(self.synthesis_dataset)
122
+ self.synthesis_index_list = random.sample(range(len(self.synthesis_dataset)), self.synthesis_batch_num)
123
+ else:
124
+ self.synthesis_index_list = []
125
+
126
+ def make_prompt(self, mol_batch, smi_max_len=128):
127
+ mol_prompt_list, text_prompt_list = [], []
128
+ last_role = None
129
+ for mol_dict in mol_batch:
130
+ smiles = mol_dict['canon_smiles']
131
+ if self.smiles_type=='r_smiles':
132
+ if 'r_smiles' in mol_dict:
133
+ smiles = mol_dict['r_smiles']
134
+ # else:
135
+ # smiles = reformat_smiles(smiles, smiles_type='restricted')
136
+ else:
137
+ smiles = reformat_smiles(smiles, smiles_type=self.smiles_type)
138
+ mol_prompt = f'[START_SMILES]{smiles[:smi_max_len]}[END_SMILES]. '
139
+ if 'role' in mol_dict:
140
+ role = {
141
+ 'REACTANT': 'Reactant',
142
+ 'CATALYST': 'Catalyst',
143
+ 'SOLVENT': 'Solvent',
144
+ 'PRODUCT': 'Product',
145
+ }[mol_dict['role']]
146
+ if last_role != role:
147
+ mol_prompt = f'{role}: {mol_prompt}'
148
+ last_role = role
149
+ text_prompt = self.make_abstract(mol_dict)
150
+ mol_prompt_list.append(mol_prompt)
151
+ text_prompt_list.append(text_prompt)
152
+ return mol_prompt_list, text_prompt_list
153
+
154
+ def make_abstract(self, mol_dict):
155
+ prompt = ''
156
+ if self.enable_abstract and 'abstract' in mol_dict:
157
+ abstract_string = mol_dict['abstract'][:self.abstract_max_len]
158
+ prompt += f'[Abstract] {abstract_string} '
159
+
160
+ if self.enable_property:
161
+ property_string = ''
162
+ property_dict = mol_dict['property'] if 'property' in mol_dict else {}
163
+ for property_key in ['Experimental Properties', 'Computed Properties']:
164
+ if not property_key in property_dict:
165
+ continue
166
+ for key, value in property_dict[property_key].items():
167
+ if isinstance(value, float):
168
+ key_value_string = f'{key}: {value:.2f}; '
169
+ elif isinstance(value, str):
170
+ float_value = format_float_from_string(value)
171
+ key_value_string = f'{key}: {float_value}; '
172
+ else:
173
+ key_value_string = f'{key}: {value}; '
174
+ if len(property_string+key_value_string) > self.property_max_len:
175
+ break
176
+ property_string += key_value_string
177
+ if property_string:
178
+ property_string = property_string[:self.property_max_len]
179
+ prompt += f'[Properties] {property_string}. '
180
+ return prompt
181
+
182
+ def get_caption_data(self, index):
183
+ caption_index = self.caption_index_list[index]
184
+ graph_list, mol_prompt_list, text_prompt_list = [], [], []
185
+ for idx in caption_index:
186
+ graph_item, text, smiles_prompt = self.caption_dataset[idx]
187
+ graph_list.append(graph_item)
188
+ mol_prompt_list.append(smiles_prompt)
189
+ text_prompt_list.append(text)
190
+
191
+ return graph_list, mol_prompt_list, text_prompt_list
192
+
193
+ def get_synthesis_data(self, index):
194
+ synthesis_index = self.synthesis_index_list[index]
195
+ _, graph_list, output_text, input_text = self.synthesis_dataset[synthesis_index]
196
+ return graph_list, [input_text], [output_text]
197
+
198
+ def __getitem__(self, index):
199
+ if index < len(self.data_list):
200
+ mol_batch = self.data_list[index]
201
+ elif index < len(self.data_list)+len(self.caption_index_list):
202
+ assert self.use_caption_dataset
203
+ return self.get_caption_data(index-len(self.data_list))
204
+ else:
205
+ assert self.use_synthesis_dataset
206
+ return self.get_synthesis_data(index-(len(self.data_list)+len(self.caption_index_list)))
207
+
208
+ graph_list = []
209
+ for mol_dict in mol_batch:
210
+ smiles = mol_dict['canon_smiles']
211
+ if self.disable_graphs:
212
+ graph_item = None
213
+ else:
214
+ if self.disable_graph_cache:
215
+ graph_item = smiles2data(smiles)
216
+ else:
217
+ assert smiles in self.mol_graph_map
218
+ graph_item = self.mol_graph_map[smiles]
219
+ graph_list.append(graph_item)
220
+ mol_prompt_list, text_prompt_list = self.make_prompt(mol_batch, smi_max_len=self.smi_max_len)
221
+
222
+ return graph_list, mol_prompt_list, text_prompt_list
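Each pretraining context item is rendered as a role-prefixed SMILES prompt followed by an `[Abstract] ... [Properties] ...` text target. The sketch below assembles one such pair by hand for an invented molecule record, mirroring what `make_prompt`/`make_abstract` produce:

```python
# Invented molecule record in the shape consumed by make_prompt/make_abstract.
mol_dict = {
    "canon_smiles": "CCO",
    "role": "REACTANT",
    "abstract": "Ethanol is a widely used solvent and reagent.",
    "property": {"Computed Properties": {"Molecular Weight": 46.07}},
}

mol_prompt = f"Reactant: [START_SMILES]{mol_dict['canon_smiles']}[END_SMILES]. "
text_prompt = f"[Abstract] {mol_dict['abstract']} [Properties] Molecular Weight: 46.07; . "
print(mol_prompt + text_prompt)
```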
data_provider/pretrain_dm.py ADDED
@@ -0,0 +1,309 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT License.
3
+ import torch
4
+ from pytorch_lightning import LightningDataModule
5
+ import torch_geometric
6
+ # from torch_geometric.loader import DataLoader
7
+ from torch.utils.data import DataLoader
8
+ from torch_geometric.loader.dataloader import Collater
9
+ from data_provider.molecule_abstract_dataset import MoleculeAbstract
10
+ import re
11
+ from transformers import BatchEncoding
12
+
13
+ # we split individual characters inside special tokens like [START_DNA]
14
+ CUSTOM_SEQ_RE = re.compile(r"(\[START_(DNA|SMILES|I_SMILES|AMINO)])(.*?)(\[END_\2])")
15
+
16
+ # token added to implement a custom sequence tokenization. This token is added at
17
+ # corpus cleaning step and removed in pretokenization. The digits are added to increase the chance
18
+ # that they do not occur in the corpus. The digits are escaped so that the token does not appear
19
+ # literally in the source code in case we ever include it in the training data.
20
+ SPLIT_MARKER = f"SPL{1}T-TH{1}S-Pl3A5E"
21
+
22
+ def _insert_split_marker(m: re.Match):
23
+ """
24
+ Applies split marker based on a regex match of special tokens such as
25
+ [START_DNA].
26
+
27
+ Parameters
28
+ ----------
29
+ m : re.Match
+ Regex match of a special-token sequence such as [START_DNA] ... [END_DNA]
31
+
32
+ Returns
33
+ ----------
34
+ str - the text with the split token added
35
+ """
36
+ start_token, _, sequence, end_token = m.groups()
37
+ sequence = re.sub(r"(.)", fr"{SPLIT_MARKER}\1", sequence, flags=re.DOTALL)
38
+ return f"{start_token}{sequence}{SPLIT_MARKER}{end_token}"
39
+
40
+
41
+ def smiles_handler(text, mol_ph, is_gal=True):
42
+ smiles_list = []
43
+ for match in CUSTOM_SEQ_RE.finditer(text):
44
+ smiles = match.group(3)
45
+ smiles_list.append(smiles)
46
+ if is_gal:
47
+ text = CUSTOM_SEQ_RE.sub(r'\1\3\4%s' % (mol_ph), text)
48
+ text = escape_custom_split_sequence(text)
49
+ return text, smiles_list
50
+ else:
51
+ text = CUSTOM_SEQ_RE.sub(r'\3%s' % (mol_ph), text)
52
+ return text, smiles_list
53
+
54
+
55
+ def escape_custom_split_sequence(text):
56
+ """
57
+ Applies custom splitting to the text for GALILEO's tokenization
58
+
59
+ Parameters
60
+ ----------
61
+ text : str
62
+ Input text to split
63
+
64
+ Returns
65
+ ----------
66
+ str - the text with the split token added
67
+ """
68
+ return CUSTOM_SEQ_RE.sub(_insert_split_marker, text)
69
+
70
+
71
+ def tokenize_and_merge_batched_qa_pairs(tokenizer, qa_pairs_list, max_length):
72
+ tokenized_batches = {
73
+ 'input_ids': [],
74
+ 'attention_mask': []
75
+ }
76
+ for qa_pairs in qa_pairs_list:
77
+ max_length_per_qa = max_length // len(qa_pairs)
78
+ batch_input_ids = []
79
+ batch_attention_mask = []
80
+ for qa in qa_pairs:
81
+ # here qa should be string
82
+ tokens = tokenizer(qa,
83
+ truncation=True,
84
+ padding=False,
85
+ add_special_tokens=False,
86
+ max_length=max_length_per_qa,
87
+ return_tensors='pt',
88
+ return_attention_mask=True)
89
+ batch_input_ids.extend(tokens['input_ids'].squeeze().tolist())
90
+ batch_attention_mask.extend(tokens['attention_mask'].squeeze().tolist())
91
+
92
+ # Pad the batch to max_length
93
+ padding_length = max_length - len(batch_input_ids)
94
+ batch_input_ids.extend([tokenizer.pad_token_id] * padding_length)
95
+ batch_attention_mask.extend([0] * padding_length)
96
+
97
+ tokenized_batches['input_ids'].append(torch.tensor(batch_input_ids).unsqueeze(0))
98
+ tokenized_batches['attention_mask'].append(torch.tensor(batch_attention_mask).unsqueeze(0))
99
+
100
+ tokenized_batches['input_ids'] = torch.cat(tokenized_batches['input_ids'], dim=0)
101
+ tokenized_batches['attention_mask'] = torch.cat(tokenized_batches['attention_mask'], dim=0)
102
+
103
+ tokenized_batch = BatchEncoding(data=tokenized_batches, tensor_type='pt')
104
+ return tokenized_batch
105
+
106
+ class TrainCollater:
107
+ def __init__(self, tokenizer, text_max_len, mol_ph, mol_token_id, is_gal=True, disable_graphs=False):
108
+ self.text_max_len = text_max_len
109
+ self.tokenizer = tokenizer
110
+ self.collater = Collater([], [])
111
+ self.mol_ph = mol_ph
112
+ self.mol_token_id = mol_token_id
113
+ self.is_gal = is_gal
114
+ self.disable_graphs = disable_graphs
115
+
116
+ def __call__(self, batch):
117
+ graphs, mol_prompt, text_prompt = zip(*batch)
118
+ if not self.disable_graphs:
119
+ graphs = [graph for graph_batch in graphs for graph in graph_batch]
120
+ graphs = self.collater(graphs)
121
+
122
+ qa_pairs = []
123
+ for mol_batch, text_batch in zip(mol_prompt, text_prompt):
124
+ qa_list = []
125
+ for mol_prompt, text_prompt in zip(mol_batch, text_batch):
126
+ smiles_prompt = smiles_handler(mol_prompt, self.mol_ph, self.is_gal)[0]
127
+ qa_list.append(f'{smiles_prompt} {text_prompt}')
128
+ qa_pairs.append(qa_list)
129
+
130
+ self.tokenizer.padding_side = 'right'
131
+ qa_batch = tokenize_and_merge_batched_qa_pairs(self.tokenizer, qa_pairs, self.text_max_len)
132
+
133
+ is_mol_token = qa_batch.input_ids == self.mol_token_id
134
+ qa_batch['is_mol_token'] = is_mol_token
135
+
136
+ return graphs, qa_batch
137
+
138
+ class InferenceCollater:
139
+ def __init__(self, tokenizer, text_max_len, mol_ph, mol_token_id, is_gal=True, disable_graphs=False, last_only=False):
140
+ self.text_max_len = text_max_len
141
+ self.tokenizer = tokenizer
142
+ self.collater = Collater([], [])
143
+ self.mol_ph = mol_ph
144
+ self.mol_token_id = mol_token_id
145
+ self.is_gal = is_gal
146
+ self.disable_graphs = disable_graphs
147
+ self.last_only = last_only
148
+
149
+ def __call__(self, batch):
150
+ graphs, mol_prompt, text_prompt = zip(*batch)
151
+ rxn_ids = [0 for i in range(len(mol_prompt))]
152
+ if self.last_only:
153
+ mol_prompt = [[mol_batch[-1]] for mol_batch in mol_prompt]
154
+ text_prompt = [[text_batch[-1]] for text_batch in text_prompt]
155
+ graphs = [[graph_batch[-1]] for graph_batch in graphs]
156
+ if not self.disable_graphs:
157
+ graphs = [graph for graph_batch in graphs for graph in graph_batch]
158
+ graphs = self.collater(graphs)
159
+
160
+ input_text, output_text = [], []
161
+ for mol_batch, text_batch in zip(mol_prompt, text_prompt):
162
+ qa_list = []
163
+ for mol_prompt, text_prompt in list(zip(mol_batch, text_batch))[:-1]:
164
+ smiles_prompt = smiles_handler(mol_prompt, self.mol_ph, self.is_gal)[0]
165
+ qa_list.append(f'{smiles_prompt} {text_prompt}')
166
+ qa_list.append(f'{smiles_handler(mol_batch[-1], self.mol_ph, self.is_gal)[0]} ')
167
+ output_text.append(text_batch[-1])
168
+ input_text.append(qa_list)
169
+
170
+ self.tokenizer.padding_side = 'right'
171
+ input_batch = tokenize_and_merge_batched_qa_pairs(self.tokenizer, input_text, self.text_max_len)
172
+
173
+ is_mol_token = input_batch.input_ids == self.mol_token_id
174
+ input_batch['is_mol_token'] = is_mol_token
175
+
176
+ return rxn_ids, graphs, input_batch, output_text, input_text
177
+
178
+
179
+ class PretrainDM(LightningDataModule):
180
+ def __init__(
181
+ self,
182
+ num_workers: int = 0,
183
+ batch_size: int = 256,
184
+ root: str = 'data/',
185
+ text_max_len: int = 128,
186
+ rxn_max_len: int = 128,
187
+ smi_max_len: int = 128,
188
+ tokenizer=None,
189
+ args=None,
190
+ ):
191
+ super().__init__()
192
+ self.args = args
193
+ self.batch_size = batch_size
194
+ self.inference_batch_size = args.inference_batch_size
195
+ self.num_workers = num_workers
196
+ self.text_max_len = text_max_len
197
+ self.rxn_max_len = rxn_max_len
198
+ self.pretrain_dataset = MoleculeAbstract(
199
+ root,
200
+ rxn_num=args.pretrain_rxn_num,
201
+ rxn_batch_size=args.rxn_batch_size,
202
+ smi_max_len=smi_max_len,
203
+ disable_graph_cache=args.disable_graph_cache,
204
+ context_style=args.context_style,
205
+ disable_graphs=args.disable_graphs,
206
+ use_caption_dataset=args.pretrain_use_caption,
207
+ caption_batch_num=args.caption_batch_num,
208
+ synthesis_datasetpath=args.pretrain_synthesis_path,
209
+ synthesis_batch_num=args.synthesis_batch_num,
210
+ reverse_ratio=args.reverse_ratio,
211
+ enable_abstract=not args.disable_abstract,
212
+ enable_property=not args.disable_property,
213
+ smiles_type=args.smiles_type,
214
+ )
215
+ self.test_dataset = MoleculeAbstract(
216
+ root,
217
+ rxn_num=args.pretrain_rxn_num,
218
+ rxn_batch_size=args.rxn_batch_size,
219
+ smi_max_len=smi_max_len,
220
+ disable_graph_cache=args.disable_graph_cache,
221
+ context_style=args.context_style,
222
+ disable_graphs=args.disable_graphs,
223
+ use_caption_dataset=args.pretrain_use_caption,
224
+ caption_batch_num=args.caption_batch_num,
225
+ reverse_ratio=args.reverse_ratio,
226
+ enable_abstract=not args.disable_abstract,
227
+ enable_property=not args.disable_property,
228
+ smiles_type=args.smiles_type,
229
+ mode='test',
230
+ )
231
+ self.init_tokenizer(tokenizer)
232
+ self.mol_ph_token = '<mol>' * self.args.num_query_token
233
+ self.is_gal = args.opt_model.find('galactica') >= 0
234
+ self.disable_graphs = args.disable_graphs
235
+ self.last_only = args.pretrain_eval_last_only
236
+
237
+ def init_tokenizer(self, tokenizer):
238
+ self.tokenizer = tokenizer
239
+ self.pretrain_dataset.tokenizer = tokenizer
240
+ self.test_dataset.tokenizer = tokenizer
241
+ self.mol_token_id = self.tokenizer.mol_token_id
242
+ # self.tokenizer.mol_token_id = tokenizer("<mol>", add_special_tokens=False).input_ids[0]
243
+
244
+ def train_dataloader(self):
245
+ self.pretrain_dataset.reload_data_list()
246
+ loader = DataLoader(
247
+ self.pretrain_dataset,
248
+ batch_size=self.batch_size,
249
+ shuffle=True,
250
+ num_workers=self.num_workers,
251
+ pin_memory=False,
252
+ drop_last=True,
253
+ persistent_workers=True,
254
+ collate_fn=TrainCollater(
255
+ tokenizer=self.tokenizer,
256
+ text_max_len=self.text_max_len,
257
+ mol_ph=self.mol_ph_token,
258
+ mol_token_id=self.mol_token_id,
259
+ is_gal=self.is_gal,
260
+ disable_graphs=self.disable_graphs,
261
+ ),
262
+ )
263
+ return loader
264
+ def val_dataloader(self):
265
+ test_loader = DataLoader(
266
+ self.test_dataset,
267
+ batch_size=self.inference_batch_size,
268
+ shuffle=False,
269
+ num_workers=self.num_workers,
270
+ pin_memory=False,
271
+ drop_last=False,
272
+ persistent_workers=True,
273
+ collate_fn=InferenceCollater(
274
+ tokenizer=self.tokenizer,
275
+ text_max_len=self.text_max_len,
276
+ mol_ph=self.mol_ph_token,
277
+ mol_token_id=self.mol_token_id,
278
+ is_gal=self.is_gal,
279
+ disable_graphs=self.disable_graphs,
280
+ last_only=self.last_only,
281
+ ),
282
+ )
283
+ return [test_loader]
284
+
285
+ def add_model_specific_args(parent_parser):
286
+ parser = parent_parser.add_argument_group("Data module")
287
+ parser.add_argument('--num_workers', type=int, default=2)
288
+ parser.add_argument('--batch_size', type=int, default=4)
289
+ parser.add_argument('--inference_batch_size', type=int, default=4)
290
+ parser.add_argument('--use_smiles', action='store_true', default=False)
291
+ parser.add_argument('--root', type=str, default='data/action_data')
292
+ parser.add_argument('--context_style', type=str, default='weighted_rxn', choices=['weighted_rxn', 'uniform_rxn', 'uniform_mol', 'single_mol', 'hybrid'])
293
+ parser.add_argument('--rxn_max_len', type=int, default=512)
294
+ parser.add_argument('--text_max_len', type=int, default=512)
295
+ parser.add_argument('--smi_max_len', type=int, default=128)
296
+ parser.add_argument('--pretrain_rxn_num', type=int, default=50000)
297
+ parser.add_argument('--reverse_ratio', type=float, default=0.5, help='ratio of reversed reactions (retro reactions)')
298
+ parser.add_argument('--disable_abstract', action='store_true', default=False)
299
+ parser.add_argument('--disable_property', action='store_true', default=False)
300
+ parser.add_argument('--pretrain_use_caption', action='store_true', default=False)
301
+ parser.add_argument('--caption_batch_num', type=int, default=5000)
302
+ parser.add_argument('--pretrain_synthesis_path', type=str, default=None)
303
+ parser.add_argument('--synthesis_batch_num', type=int, default=5000)
304
+ parser.add_argument('--rxn_batch_size', type=int, default=4)
305
+ parser.add_argument('--roundrobin_train', action='store_true', default=False)
306
+ parser.add_argument('--test_subset', type=int, default=-1)
307
+ parser.add_argument('--pretrain_eval_last_only', default=False, action='store_true')
308
+ parser.add_argument('--prompt', type=str, default=None)
309
+ return parent_parser
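
The collaters above mark graph positions by comparing `input_ids` against `mol_token_id`. Below is a minimal sketch of that placeholder mechanism, assuming a Hugging Face tokenizer; the model name and `num_query_token` value are illustrative, since in this repo the tokenizer and its `mol_token_id` are prepared inside the model code.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/galactica-125m")  # illustrative choice
tokenizer.add_tokens(["<mol>"])                           # register the placeholder once
mol_token_id = tokenizer.convert_tokens_to_ids("<mol>")

num_query_token = 8                                       # example value of --num_query_token
mol_ph = "<mol>" * num_query_token
prompt = f"[START_SMILES]CCO[END_SMILES]{mol_ph} The molecule is an alcohol."

batch = tokenizer([prompt], return_tensors="pt")
batch["is_mol_token"] = batch.input_ids == mol_token_id   # same boolean mask the collaters build
```

The resulting mask marks the positions that are presumably overwritten with the projected graph-query embeddings inside the model.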
data_provider/r_smiles.py ADDED
@@ -0,0 +1,449 @@
1
+ import numpy as np
2
+ import argparse
3
+ import re
4
+ import random
5
+ import textdistance
6
+
7
+ from rdkit import Chem
8
+
9
+
10
+ from rdkit import RDLogger
11
+ RDLogger.DisableLog('rdApp.*')
12
+
13
+
14
+ def smi_tokenizer(smi):
15
+ pattern = "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
16
+ regex = re.compile(pattern)
17
+ tokens = [token for token in regex.findall(smi)]
18
+ assert smi == ''.join(tokens)
19
+ return ' '.join(tokens)
20
+
21
+
22
+ def clear_map_canonical_smiles(smi, canonical=True, root=-1):
23
+ mol = Chem.MolFromSmiles(smi)
24
+ if mol is not None:
25
+ for atom in mol.GetAtoms():
26
+ if atom.HasProp('molAtomMapNumber'):
27
+ atom.ClearProp('molAtomMapNumber')
28
+ return Chem.MolToSmiles(mol, isomericSmiles=True, rootedAtAtom=root, canonical=canonical)
29
+ else:
30
+ return smi
31
+
32
+
33
+ def get_cano_map_number(smi,root=-1):
34
+ atommap_mol = Chem.MolFromSmiles(smi)
35
+ canonical_mol = Chem.MolFromSmiles(clear_map_canonical_smiles(smi,root=root))
36
+ cano2atommapIdx = atommap_mol.GetSubstructMatch(canonical_mol)
37
+ correct_mapped = [canonical_mol.GetAtomWithIdx(i).GetSymbol() == atommap_mol.GetAtomWithIdx(index).GetSymbol() for i,index in enumerate(cano2atommapIdx)]
38
+ atom_number = len(canonical_mol.GetAtoms())
39
+ if np.sum(correct_mapped) < atom_number or len(cano2atommapIdx) < atom_number:
40
+ cano2atommapIdx = [0] * atom_number
41
+ atommap2canoIdx = canonical_mol.GetSubstructMatch(atommap_mol)
42
+ if len(atommap2canoIdx) != atom_number:
43
+ return None
44
+ for i, index in enumerate(atommap2canoIdx):
45
+ cano2atommapIdx[index] = i
46
+ id2atommap = [atom.GetAtomMapNum() for atom in atommap_mol.GetAtoms()]
47
+
48
+ return [id2atommap[cano2atommapIdx[i]] for i in range(atom_number)]
49
+
50
+
51
+ def get_root_id(mol,root_map_number):
52
+ root = -1
53
+ for i, atom in enumerate(mol.GetAtoms()):
54
+ if atom.GetAtomMapNum() == root_map_number:
55
+ root = i
56
+ break
57
+ return root
58
+ # root = -1
59
+ # for i, atom in enumerate(mol.GetAtoms()):
60
+ # if atom.GetAtomMapNum() == root_map_number:
61
+ # return i
62
+
63
+
64
+ def get_forward_rsmiles(data):
65
+ pt = re.compile(r':(\d+)]')
66
+ product = data['product']
67
+ reactant = data['reactant']
68
+ augmentation = data['augmentation']
69
+ separated = data['separated']
70
+ pro_mol = Chem.MolFromSmiles(product)
71
+ rea_mol = Chem.MolFromSmiles(reactant)
72
+ """checking data quality"""
73
+ rids = sorted(re.findall(pt, reactant))
74
+ pids = sorted(re.findall(pt, product))
75
+ return_status = {
76
+ "status":0,
77
+ "src_data":[],
78
+ "tgt_data":[],
79
+ "edit_distance":0,
80
+ }
81
+ reactant = reactant.split(".")
82
+ product = product.split(".")
83
+ rea_atom_map_numbers = [list(map(int, re.findall(r"(?<=:)\d+", rea))) for rea in reactant]
84
+ max_times = np.prod([len(map_numbers) for map_numbers in rea_atom_map_numbers])
85
+ times = min(augmentation, max_times)
86
+ reactant_roots = [[-1 for _ in reactant]]
87
+ j = 0
88
+ while j < times:
89
+ reactant_roots.append([random.sample(rea_atom_map_numbers[k], 1)[0] for k in range(len(reactant))])
90
+ if reactant_roots[-1] in reactant_roots[:-1]:
91
+ reactant_roots.pop()
92
+ else:
93
+ j += 1
94
+ if j < augmentation:
95
+ reactant_roots.extend(random.choices(reactant_roots, k=augmentation - times))
96
+ times = augmentation
97
+ reversable = False # no reverse
98
+ assert times == augmentation
99
+ if reversable:
100
+ times = int(times / 2)
101
+
102
+ pro_atom_map_numbers = [list(map(int, re.findall(r"(?<=:)\d+", pro))) for pro in product]
103
+ full_pro_atom_map_numbers = set(map(int, re.findall(r"(?<=:)\d+", ".".join(product))))
104
+ for k in range(times):
105
+ tmp = list(zip(reactant, reactant_roots[k],rea_atom_map_numbers))
106
+ random.shuffle(tmp)
107
+ reactant_k, reactant_roots_k,rea_atom_map_numbers_k = [i[0] for i in tmp], [i[1] for i in tmp], [i[2] for i in tmp]
108
+ aligned_reactants = []
109
+ aligned_products = []
110
+ aligned_products_order = []
111
+ all_atom_map = []
112
+ for i, rea in enumerate(reactant_k):
113
+ rea_root_atom_map = reactant_roots_k[i]
114
+ rea_root = get_root_id(Chem.MolFromSmiles(rea), root_map_number=rea_root_atom_map)
115
+ cano_atom_map = get_cano_map_number(rea, rea_root)
116
+ if cano_atom_map is None:
117
+ print(f"Reactant Failed to find Canonical Mol with Atom MapNumber")
118
+ continue
119
+ rea_smi = clear_map_canonical_smiles(rea, canonical=True, root=rea_root)
120
+ aligned_reactants.append(rea_smi)
121
+ all_atom_map.extend(cano_atom_map)
122
+
123
+ for i, pro_map_number in enumerate(pro_atom_map_numbers):
124
+ reactant_candidates = []
125
+ selected_reactant = []
126
+ for j, map_number in enumerate(all_atom_map):
127
+ if map_number in pro_map_number:
128
+ for rea_index, rea_atom_map_number in enumerate(rea_atom_map_numbers_k):
129
+ if map_number in rea_atom_map_number and rea_index not in selected_reactant:
130
+ selected_reactant.append(rea_index)
131
+ reactant_candidates.append((map_number, j, len(rea_atom_map_number)))
132
+
133
+ # select maximal reactant
134
+ reactant_candidates.sort(key=lambda x: x[2], reverse=True)
135
+ map_number = reactant_candidates[0][0]
136
+ j = reactant_candidates[0][1]
137
+ pro_root = get_root_id(Chem.MolFromSmiles(product[i]), root_map_number=map_number)
138
+ pro_smi = clear_map_canonical_smiles(product[i], canonical=True, root=pro_root)
139
+ aligned_products.append(pro_smi)
140
+ aligned_products_order.append(j)
141
+
142
+ sorted_products = sorted(list(zip(aligned_products, aligned_products_order)), key=lambda x: x[1])
143
+ aligned_products = [item[0] for item in sorted_products]
144
+ pro_smi = ".".join(aligned_products)
145
+ if separated:
146
+ reactants = []
147
+ reagents = []
148
+ for i,cano_atom_map in enumerate(rea_atom_map_numbers_k):
149
+ if len(set(cano_atom_map) & full_pro_atom_map_numbers) > 0:
150
+ reactants.append(aligned_reactants[i])
151
+ else:
152
+ reagents.append(aligned_reactants[i])
153
+ rea_smi = ".".join(reactants)
154
+ reactant_tokens = smi_tokenizer(rea_smi)
155
+ if len(reagents) > 0 :
156
+ reactant_tokens += " <separated> " + smi_tokenizer(".".join(reagents))
157
+ else:
158
+ rea_smi = ".".join(aligned_reactants)
159
+ reactant_tokens = smi_tokenizer(rea_smi)
160
+ product_tokens = smi_tokenizer(pro_smi)
161
+ return_status['src_data'].append(reactant_tokens)
162
+ return_status['tgt_data'].append(product_tokens)
163
+ if reversable:
164
+ aligned_reactants.reverse()
165
+ aligned_products.reverse()
166
+ pro_smi = ".".join(aligned_products)
167
+ rea_smi = ".".join(aligned_reactants)
168
+ product_tokens = smi_tokenizer(pro_smi)
169
+ reactant_tokens = smi_tokenizer(rea_smi)
170
+ return_status['src_data'].append(reactant_tokens)
171
+ return_status['tgt_data'].append(product_tokens)
172
+ edit_distances = []
173
+ for src,tgt in zip(return_status['src_data'],return_status['tgt_data']):
174
+ edit_distances.append(textdistance.levenshtein.distance(src.split(),tgt.split()))
175
+ return_status['edit_distance'] = np.mean(edit_distances)
176
+ return return_status
177
+
178
+
179
+ def get_retro_rsmiles(data):
180
+ pt = re.compile(r':(\d+)]')
181
+ product = data['product']
182
+ reactant = data['reactant']
183
+ augmentation = data['augmentation']
184
+ pro_mol = Chem.MolFromSmiles(product)
185
+ rea_mol = Chem.MolFromSmiles(reactant)
186
+ """checking data quality"""
187
+ rids = sorted(re.findall(pt, reactant))
188
+ pids = sorted(re.findall(pt, product))
189
+ return_status = {
190
+ "status":0,
191
+ "src_data":[],
192
+ "tgt_data":[],
193
+ "edit_distance":0,
194
+ }
195
+ pro_atom_map_numbers = list(map(int, re.findall(r"(?<=:)\d+", product)))
196
+ reactant = reactant.split(".")
197
+ reversable = False # no shuffle
198
+ # augmentation = 100
199
+ if augmentation == 999:
200
+ product_roots = pro_atom_map_numbers
201
+ times = len(product_roots)
202
+ else:
203
+ product_roots = [-1]
204
+ # reversable = len(reactant) > 1
205
+
206
+ max_times = len(pro_atom_map_numbers)
207
+ times = min(augmentation, max_times)
208
+ if times < augmentation: # times = max_times
209
+ product_roots.extend(pro_atom_map_numbers)
210
+ product_roots.extend(random.choices(product_roots, k=augmentation - len(product_roots)))
211
+ else: # times = augmentation
212
+ while len(product_roots) < times:
213
+ product_roots.append(random.sample(pro_atom_map_numbers, 1)[0])
214
+ # pro_atom_map_numbers.remove(product_roots[-1])
215
+ if product_roots[-1] in product_roots[:-1]:
216
+ product_roots.pop()
217
+ times = len(product_roots)
218
+ assert times == augmentation
219
+ if reversable:
220
+ times = int(times / 2)
221
+ # candidates = []
222
+ for k in range(times):
223
+ pro_root_atom_map = product_roots[k]
224
+ pro_root = get_root_id(pro_mol, root_map_number=pro_root_atom_map)
225
+ cano_atom_map = get_cano_map_number(product, root=pro_root)
226
+ if cano_atom_map is None:
227
+ return_status["status"] = "error_mapping"
228
+ return return_status
229
+ pro_smi = clear_map_canonical_smiles(product, canonical=True, root=pro_root)
230
+ aligned_reactants = []
231
+ aligned_reactants_order = []
232
+ rea_atom_map_numbers = [list(map(int, re.findall(r"(?<=:)\d+", rea))) for rea in reactant]
233
+ used_indices = []
234
+ for i, rea_map_number in enumerate(rea_atom_map_numbers):
235
+ for j, map_number in enumerate(cano_atom_map):
236
+ # select matching reactants
237
+ if map_number in rea_map_number:
238
+ rea_root = get_root_id(Chem.MolFromSmiles(reactant[i]), root_map_number=map_number)
239
+ rea_smi = clear_map_canonical_smiles(reactant[i], canonical=True, root=rea_root)
240
+ aligned_reactants.append(rea_smi)
241
+ aligned_reactants_order.append(j)
242
+ used_indices.append(i)
243
+ break
244
+ sorted_reactants = sorted(list(zip(aligned_reactants, aligned_reactants_order)), key=lambda x: x[1])
245
+ aligned_reactants = [item[0] for item in sorted_reactants]
246
+ reactant_smi = ".".join(aligned_reactants)
247
+ product_tokens = smi_tokenizer(pro_smi)
248
+ reactant_tokens = smi_tokenizer(reactant_smi)
249
+
250
+ return_status['src_data'].append(product_tokens)
251
+ return_status['tgt_data'].append(reactant_tokens)
252
+
253
+ if reversable:
254
+ aligned_reactants.reverse()
255
+ reactant_smi = ".".join(aligned_reactants)
256
+ product_tokens = smi_tokenizer(pro_smi)
257
+ reactant_tokens = smi_tokenizer(reactant_smi)
258
+ return_status['src_data'].append(product_tokens)
259
+ return_status['tgt_data'].append(reactant_tokens)
260
+ assert len(return_status['src_data']) == data['augmentation']
261
+ edit_distances = []
262
+ for src,tgt in zip(return_status['src_data'],return_status['tgt_data']):
263
+ edit_distances.append(textdistance.levenshtein.distance(src.split(),tgt.split()))
264
+ return_status['edit_distance'] = np.mean(edit_distances)
265
+ return return_status
266
+
267
+
268
+ def multi_process(data):
269
+ pt = re.compile(r':(\d+)]')
270
+ product = data['product']
271
+ reactant = data['reactant']
272
+ augmentation = data['augmentation']
273
+ pro_mol = Chem.MolFromSmiles(product)
274
+ rea_mol = Chem.MolFromSmiles(reactant)
275
+ """checking data quality"""
276
+ rids = sorted(re.findall(pt, reactant))
277
+ pids = sorted(re.findall(pt, product))
278
+ return_status = {
279
+ "status":0,
280
+ "src_data":[],
281
+ "tgt_data":[],
282
+ "edit_distance":0,
283
+ }
284
+ # if ",".join(rids) != ",".join(pids): # mapping is not 1:1
285
+ # return_status["status"] = "error_mapping"
286
+ # if len(set(rids)) != len(rids): # mapping is not 1:1
287
+ # return_status["status"] = "error_mapping"
288
+ # if len(set(pids)) != len(pids): # mapping is not 1:1
289
+ # return_status["status"] = "error_mapping"
290
+ if "" == product:
291
+ return_status["status"] = "empty_p"
292
+ if "" == reactant:
293
+ return_status["status"] = "empty_r"
294
+ if rea_mol is None:
295
+ return_status["status"] = "invalid_r"
296
+ elif len(rea_mol.GetAtoms()) < 5:  # elif avoids calling GetAtoms() on a failed parse
297
+ return_status["status"] = "small_r"
298
+ if pro_mol is None:
299
+ return_status["status"] = "invalid_p"
300
+ elif len(pro_mol.GetAtoms()) == 1:  # elif avoids calling GetAtoms() on a failed parse
301
+ return_status["status"] = "small_p"
302
+ elif not all([a.HasProp('molAtomMapNumber') for a in pro_mol.GetAtoms()]):
303
+ return_status["status"] = "error_mapping_p"
304
+ """finishing checking data quality"""
305
+
306
+ if return_status['status'] == 0:
307
+ pro_atom_map_numbers = list(map(int, re.findall(r"(?<=:)\d+", product)))
308
+ reactant = reactant.split(".")
309
+ if data['root_aligned']:
310
+ reversable = False # no shuffle
311
+ # augmentation = 100
312
+ if augmentation == 999:
313
+ product_roots = pro_atom_map_numbers
314
+ times = len(product_roots)
315
+ else:
316
+ product_roots = [-1]
317
+ # reversable = len(reactant) > 1
318
+
319
+ max_times = len(pro_atom_map_numbers)
320
+ times = min(augmentation, max_times)
321
+ if times < augmentation: # times = max_times
322
+ product_roots.extend(pro_atom_map_numbers)
323
+ product_roots.extend(random.choices(product_roots, k=augmentation - len(product_roots)))
324
+ else: # times = augmentation
325
+ while len(product_roots) < times:
326
+ product_roots.append(random.sample(pro_atom_map_numbers, 1)[0])
327
+ # pro_atom_map_numbers.remove(product_roots[-1])
328
+ if product_roots[-1] in product_roots[:-1]:
329
+ product_roots.pop()
330
+ times = len(product_roots)
331
+ assert times == augmentation
332
+ if reversable:
333
+ times = int(times / 2)
334
+ # candidates = []
335
+ for k in range(times):
336
+ pro_root_atom_map = product_roots[k]
337
+ pro_root = get_root_id(pro_mol, root_map_number=pro_root_atom_map)
338
+ cano_atom_map = get_cano_map_number(product, root=pro_root)
339
+ if cano_atom_map is None:
340
+ return_status["status"] = "error_mapping"
341
+ return return_status
342
+ pro_smi = clear_map_canonical_smiles(product, canonical=True, root=pro_root)
343
+ aligned_reactants = []
344
+ aligned_reactants_order = []
345
+ rea_atom_map_numbers = [list(map(int, re.findall(r"(?<=:)\d+", rea))) for rea in reactant]
346
+ used_indices = []
347
+ for i, rea_map_number in enumerate(rea_atom_map_numbers):
348
+ for j, map_number in enumerate(cano_atom_map):
349
+ # select matching reactants
350
+ if map_number in rea_map_number:
351
+ rea_root = get_root_id(Chem.MolFromSmiles(reactant[i]), root_map_number=map_number)
352
+ rea_smi = clear_map_canonical_smiles(reactant[i], canonical=True, root=rea_root)
353
+ aligned_reactants.append(rea_smi)
354
+ aligned_reactants_order.append(j)
355
+ used_indices.append(i)
356
+ break
357
+ sorted_reactants = sorted(list(zip(aligned_reactants, aligned_reactants_order)), key=lambda x: x[1])
358
+ aligned_reactants = [item[0] for item in sorted_reactants]
359
+ reactant_smi = ".".join(aligned_reactants)
360
+ product_tokens = smi_tokenizer(pro_smi)
361
+ reactant_tokens = smi_tokenizer(reactant_smi)
362
+
363
+ return_status['src_data'].append(product_tokens)
364
+ return_status['tgt_data'].append(reactant_tokens)
365
+
366
+ if reversable:
367
+ aligned_reactants.reverse()
368
+ reactant_smi = ".".join(aligned_reactants)
369
+ product_tokens = smi_tokenizer(pro_smi)
370
+ reactant_tokens = smi_tokenizer(reactant_smi)
371
+ return_status['src_data'].append(product_tokens)
372
+ return_status['tgt_data'].append(reactant_tokens)
373
+ assert len(return_status['src_data']) == data['augmentation']
374
+ else:
375
+ cano_product = clear_map_canonical_smiles(product)
376
+ cano_reactanct = ".".join([clear_map_canonical_smiles(rea) for rea in reactant if len(set(map(int, re.findall(r"(?<=:)\d+", rea))) & set(pro_atom_map_numbers)) > 0 ])
377
+ return_status['src_data'].append(smi_tokenizer(cano_product))
378
+ return_status['tgt_data'].append(smi_tokenizer(cano_reactanct))
379
+ pro_mol = Chem.MolFromSmiles(cano_product)
380
+ rea_mols = [Chem.MolFromSmiles(rea) for rea in cano_reactanct.split(".")]
381
+ for i in range(int(augmentation-1)):
382
+ pro_smi = Chem.MolToSmiles(pro_mol,doRandom=True)
383
+ rea_smi = [Chem.MolToSmiles(rea_mol,doRandom=True) for rea_mol in rea_mols]
384
+ rea_smi = ".".join(rea_smi)
385
+ return_status['src_data'].append(smi_tokenizer(pro_smi))
386
+ return_status['tgt_data'].append(smi_tokenizer(rea_smi))
387
+ edit_distances = []
388
+ for src,tgt in zip(return_status['src_data'],return_status['tgt_data']):
389
+ edit_distances.append(textdistance.levenshtein.distance(src.split(),tgt.split()))
390
+ return_status['edit_distance'] = np.mean(edit_distances)
391
+ return return_status
392
+
393
+ if __name__ == '__main__':
394
+ parser = argparse.ArgumentParser()
395
+ parser.add_argument('-rxn',type=str,required=True)
396
+ parser.add_argument('-mode',type=str,default="retro",)
397
+ parser.add_argument('-forward_mode',type=str,default="separated",)
398
+ parser.add_argument("-augmentation",type=int,default=1)
399
+ parser.add_argument("-seed",type=int,default=33)
400
+ args = parser.parse_args()
401
+ print(args)
402
+ reactant,reagent,product = args.rxn.split(">")
403
+ pt = re.compile(r':(\d+)]')
404
+ rids = sorted(re.findall(pt, reactant))
405
+ pids = sorted(re.findall(pt, product))
406
+ if len(rids) == 0 or len(pids) == 0:
407
+ print("No atom mapping found!")
408
+ exit(1)
409
+ if args.mode == "retro":
410
+ args.input = product
411
+ args.output = reactant
412
+ else:
413
+ args.input = reactant
414
+ args.output = product
415
+
416
+ print("Original input:", args.input)
417
+ print("Original output:",args.output)
418
+ src_smi = clear_map_canonical_smiles(args.input)
419
+ tgt_smi = clear_map_canonical_smiles(args.output)
420
+ if src_smi == "" or tgt_smi == "":
421
+ print("Invalid SMILES!")
422
+ exit(1)
423
+ print("Canonical input:", src_smi)
424
+ print("Canonical output:",tgt_smi)
425
+
426
+ mapping_check = True
427
+ if ",".join(rids) != ",".join(pids): # mapping is not 1:1
428
+ mapping_check = False
429
+ if len(set(rids)) != len(rids): # mapping is not 1:1
430
+ mapping_check = False
431
+ if len(set(pids)) != len(pids): # mapping is not 1:1
432
+ mapping_check = False
433
+ if not mapping_check:
434
+ print("The quality of the atom mapping may not be good enough, which can affect the effect of root alignment.")
435
+ data = {
436
+ 'product':product,
437
+ 'reactant':reactant,
438
+ 'augmentation':args.augmentation,
439
+ 'separated':args.forward_mode == "separated"
440
+ }
441
+ if args.mode == "retro":
442
+ res = get_retro_rsmiles(data)
443
+ else:
444
+ res = get_forward_rsmiles(data)
445
+ for index,(src,tgt) in enumerate(zip(res['src_data'], res['tgt_data'])):
446
+ print(f"ID:{index}")
447
+ print(f"R-SMILES input:{''.join(src.split())}")
448
+ print(f"R-SMILES output:{''.join(tgt.split())}")
449
+ print("Avg. edit distance:", res['edit_distance'])
data_provider/reaction_action_dataset.py ADDED
@@ -0,0 +1,100 @@
1
+ import torch
2
+ from torch_geometric.data import Dataset
3
+ import os
4
+ import random
5
+ import json
6
+ from .data_utils import smiles2data, reformat_smiles
7
+
8
+ class ActionDataset(Dataset):
9
+ def __init__(self, root, mode, smi_max_len, use_graph=True, disable_graph_cache=False, predict_rxn_condition=False, smiles_type='default'):
10
+ super(ActionDataset, self).__init__(root)
11
+ self.root = root
12
+ self.smi_max_len = smi_max_len
13
+ self.tokenizer = None
14
+ self.use_graph = use_graph
15
+ self.disable_graph_cache = disable_graph_cache
16
+ self.predict_rxn_condition = predict_rxn_condition
17
+ self.smiles_type = smiles_type
18
+
19
+ with open(os.path.join(self.root, f'{mode}.json'), encoding='utf-8') as f:
20
+ self.data_list = json.load(f)
21
+ if self.use_graph:
22
+ self.mol_graph_map = torch.load(os.path.join(self.root, 'mol_graph_map.pt'))
23
+ # self.data_list = self.data_list[:100]
24
+
25
+ def get(self, index):
26
+ return self.__getitem__(index)
27
+
28
+ def len(self):
29
+ return len(self)
30
+
31
+ def __len__(self):
32
+ return len(self.data_list)
33
+
34
+ def make_prompt(self, param_dict, smi_max_len=128, predict_rxn_condition=False):
35
+ action_sequence = param_dict['actions']
36
+ smiles_list = []
37
+ prompt = ''
38
+ prompt += 'Reactants: '
39
+ smiles_wrapper = lambda x: reformat_smiles(x, smiles_type=self.smiles_type)[:smi_max_len]
40
+ for smi in param_dict['REACTANT']:
41
+ prompt += f'{param_dict["extracted_molecules"][smi]}: [START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
42
+ smiles_list.append(smi)
43
+
44
+ prompt += 'Product: '
45
+ for smi in param_dict['PRODUCT']:
46
+ prompt += f'{param_dict["extracted_molecules"][smi]}: [START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
47
+ smiles_list.append(smi)
48
+
49
+ if param_dict['CATALYST']:
50
+ prompt += 'Catalysts: '
51
+ for smi in param_dict['CATALYST']:
52
+ if smi in param_dict["extracted_molecules"]:
53
+ prompt += f'{param_dict["extracted_molecules"][smi]}: [START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
54
+ else:
55
+ prompt += f'[START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
56
+ smiles_list.append(smi)
57
+
58
+ if param_dict['SOLVENT']:
59
+ prompt += 'Solvents: '
60
+ for smi in param_dict['SOLVENT']:
61
+ if smi in param_dict["extracted_molecules"]:
62
+ prompt += f'{param_dict["extracted_molecules"][smi]}: [START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
63
+ else:
64
+ prompt += f'[START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
65
+ smiles_list.append(smi)
66
+
67
+ if predict_rxn_condition:
68
+ for value, token in param_dict['extracted_duration'].items():
69
+ action_sequence = action_sequence.replace(token, value)
70
+ for value, token in param_dict['extracted_temperature'].items():
71
+ action_sequence = action_sequence.replace(token, value)
72
+ else:
73
+ prompt += 'Temperatures: '
74
+ for value, token in param_dict['extracted_temperature'].items():
75
+ prompt += f'{token}: {value} '
76
+
77
+ prompt += 'Durations: '
78
+ for value, token in param_dict['extracted_duration'].items():
79
+ prompt += f'{token}: {value} '
80
+
81
+ prompt += 'Action Squence: '
82
+ return prompt, smiles_list, action_sequence
83
+
84
+ def __getitem__(self, index):
85
+ rxn_dict = self.data_list[index]
86
+ rxn_id = rxn_dict['index']
87
+ input_text, smiles_list, output_text = self.make_prompt(rxn_dict, self.smi_max_len, self.predict_rxn_condition)
88
+ output_text = output_text.strip() + '\n'
89
+
90
+ graph_list = []
91
+ if self.use_graph:
92
+ for smiles in smiles_list:
93
+ if self.disable_graph_cache:
94
+ graph_item = smiles2data(smiles)
95
+ else:
96
+ assert smiles in self.mol_graph_map
97
+ graph_item = self.mol_graph_map[smiles]
98
+ graph_list.append(graph_item)
99
+ return rxn_id, graph_list, output_text, input_text
100
+
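
For orientation, this is the record shape that `make_prompt` consumes. Only the key names are taken from the code above; every value below (SMILES, placeholder formats, action string) is made up for illustration.

```python
example_rxn = {
    "index": 0,
    "REACTANT": ["CCO"],
    "PRODUCT": ["CC=O"],
    "CATALYST": [],
    "SOLVENT": ["ClCCl"],
    "extracted_molecules": {"CCO": "$1$", "CC=O": "$-1$", "ClCCl": "$2$"},  # SMILES -> placeholder
    "extracted_temperature": {"25 °C": "#1#"},                             # value -> placeholder
    "extracted_duration": {"2 hours": "@1@"},                              # value -> placeholder
    "actions": "ADD $1$; STIR at #1# for @1@; CONCENTRATE.",
}
```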
data_provider/synthesis_dataset.py ADDED
@@ -0,0 +1,160 @@
1
+ import torch
2
+ from torch_geometric.data import Dataset
3
+ import os
4
+ import random
5
+ import json
6
+ from .data_utils import smiles2data, escape_custom_split_sequence, reformat_smiles, generate_rsmiles
7
+
8
+ class SynthesisDataset(Dataset):
9
+ def __init__(self,
10
+ root,
11
+ mode,
12
+ smi_max_len=128,
13
+ use_graph=True,
14
+ disable_graph_cache=False,
15
+ smiles_type='default',
16
+ roundrobin_train=False,
17
+ test_subset=-1
18
+ ):
19
+ super(SynthesisDataset, self).__init__(root)
20
+ self.root = root
21
+ if 'PtoR' in root:
22
+ self.task = 'retro'
23
+ elif 'pretrain' in root:
24
+ self.task = 'pretrain'
25
+ elif 'RtoP' in root:
26
+ self.task = 'forward'
27
+ else:
28
+ raise NotImplementedError(f'Invalid task: {root}')
29
+ if mode=='valid':
30
+ mode='val'
31
+ self.mode = mode
32
+ self.smi_max_len = smi_max_len
33
+ self.tokenizer = None
34
+ self.use_graph = use_graph
35
+ self.disable_graph_cache = disable_graph_cache
36
+ self.smiles_type = smiles_type
37
+ self.roundrobin_train = roundrobin_train
38
+ with open(os.path.join(root, 'mol_graphid_map.json')) as f:
39
+ self.mol_idx_map = json.load(f)
40
+ if self.use_graph:
41
+ self.idx_graph_map = torch.load(os.path.join(root, 'idx_graph_map.pt'))
42
+
43
+ if self.roundrobin_train and mode=='train':
44
+ self.reload_counter=-2
45
+ self.reload_data()
46
+ else:
47
+ with open(os.path.join(root, mode, f'src-{mode}.txt')) as f:
48
+ self.input_list = f.readlines()
49
+ with open(os.path.join(root, mode, f'tgt-{mode}.txt')) as f:
50
+ self.output_list = f.readlines()
51
+ assert len(self.input_list) == len(self.output_list)
52
+ self.renew_r_smiles()
53
+ self.input_list = [smi.strip().replace(' ','') for smi in self.input_list]
54
+ self.output_list = [smi.strip().replace(' ','') for smi in self.output_list]
55
+ if test_subset>0 and mode=='test':
56
+ assert test_subset<=len(self.input_list)
57
+ self.input_list = self.input_list[:test_subset]
58
+ self.output_list = self.output_list[:test_subset]
59
+
60
+ def reload_data(self):
61
+ if not self.roundrobin_train:
62
+ return
63
+ self.reload_counter = (self.reload_counter+1)%10
64
+ if hasattr(self, 'input_list'):
65
+ del self.input_list
66
+ if hasattr(self, 'output_list'):
67
+ del self.output_list
68
+ with open(os.path.join(self.root, f'train/src-train_{self.reload_counter}.txt')) as f:
69
+ self.input_list = f.readlines()
70
+ with open(os.path.join(self.root, f'train/tgt-train_{self.reload_counter}.txt')) as f:
71
+ self.output_list = f.readlines()
72
+ assert len(self.input_list) == len(self.output_list)
73
+ self.renew_r_smiles()
74
+ self.input_list = [smi.strip().replace(' ','') for smi in self.input_list]
75
+ self.output_list = [smi.strip().replace(' ','') for smi in self.output_list]
76
+ input_list, output_list = [], []
77
+ for input_smiles, output_smiles in zip(self.input_list, self.output_list):
78
+ if input_smiles.count('.') != output_smiles.count('.'):
79
+ continue
80
+ input_list.append(input_smiles)
81
+ output_list.append(output_smiles)
82
+ print(f'Reloaded data from {self.root}/train/src-train_{self.reload_counter}.txt, filtered len={len(input_list)}', flush=True)
83
+ self.input_list = input_list
84
+ self.output_list = output_list
85
+
86
+ def renew_r_smiles(self):
87
+ if self.smiles_type == 'r_smiles' and self.mode == 'train':
88
+ # only renew r_smiles for training set
89
+ if not hasattr(self, 'input_list_mapped'):
90
+ # here we back up the original input_list and output_list
91
+ self.input_list_mapped = self.input_list
92
+ self.output_list_mapped = self.output_list
93
+ self.output_list, self.input_list = generate_rsmiles(self.output_list_mapped, self.input_list_mapped)
94
+ self.input_list = [smi.strip().replace(' ','') for smi in self.input_list]
95
+ self.output_list = [smi.strip().replace(' ','') for smi in self.output_list]
96
+
97
+ def get(self, index):
98
+ return self.__getitem__(index)
99
+
100
+ def len(self):
101
+ return len(self)
102
+
103
+ def __len__(self):
104
+ return len(self.input_list)
105
+
106
+ def make_prompt(self, input_smiles, output_smiles, smi_max_len=512):
107
+ FORWARD_PROMPT = 'Question: Given the following reactant molecules: {}, what are the expected products? Answer: The product molecules are '
108
+ FORWARD_CATALYST_PROMPT = '{}, and the following catalyst molecules: {}'
109
+ RETRO_PROMPT = 'Question: Given the following product molecules: {}, what are the reactants that produce them? Answer: The reactant molecules are '
110
+ # RETRO_PROMPT = 'Predict the reaction that produces the following product: {} '
111
+ PRETRAIN_PROMPT = 'Reconstruct the masked molecule: {}. Answer: '
112
+ smiles_wrapper = lambda x: reformat_smiles(x, smiles_type=self.smiles_type)[:smi_max_len]
113
+ if self.task=='retro':
114
+ assert '<separated>' not in input_smiles
115
+ smiles_list = input_smiles.split('.')
116
+ in_prompt = '; '.join([f'[START_SMILES]{smiles_wrapper(smi)}[END_SMILES]' for smi in smiles_list])
117
+ input_prompt = RETRO_PROMPT.format(in_prompt)
118
+ elif self.task=='forward':
119
+ if '<separated>' in input_smiles:
120
+ reactant_smiles, reagent_smiles = input_smiles.split('<separated>')
121
+ reactant_smiles = reactant_smiles.split('.')
122
+ reagent_smiles = reagent_smiles.split('.')
123
+ reactant_prompt = '; '.join([f'[START_SMILES]{smiles_wrapper(smi)}[END_SMILES]' for smi in reactant_smiles])
124
+ reagent_prompt = '; '.join([f'[START_SMILES]{smiles_wrapper(smi)}[END_SMILES]' for smi in reagent_smiles])
125
+ smiles_list = reactant_smiles+reagent_smiles
126
+ input_prompt = FORWARD_CATALYST_PROMPT.format(reactant_prompt, reagent_prompt)
127
+ else:
128
+ smiles_list = input_smiles.split('.')
129
+ reactant_prompt = '; '.join([f'[START_SMILES]{smiles_wrapper(smi)}[END_SMILES]' for smi in smiles_list])
130
+ input_prompt = reactant_prompt
131
+ input_prompt = FORWARD_PROMPT.format(input_prompt)
132
+ elif self.task=='pretrain':
133
+ in_prompt = '; '.join([f'[START_SMILES]{smiles_wrapper(smi)}[END_SMILES]' for smi in input_smiles.split('.')])
134
+ input_prompt = PRETRAIN_PROMPT.format(in_prompt)
135
+ smiles_list = output_smiles.split('.')
136
+ # output_smiles = ' '.join([f'[START_SMILES]{smi[:smi_max_len]}[END_SMILES]' for smi in output_smiles.split('.')])
137
+ output_smiles = f'[START_SMILES]{output_smiles}[END_SMILES]'
138
+ output_smiles = escape_custom_split_sequence(output_smiles)
139
+
140
+ return input_prompt, smiles_list, output_smiles
141
+
142
+ def __getitem__(self, index):
143
+ input_smiles = self.input_list[index]
144
+ output_smiles = self.output_list[index]
145
+ input_text, smiles_list, output_text = self.make_prompt(input_smiles, output_smiles, smi_max_len=self.smi_max_len)
146
+ output_text = output_text.strip()+'\n'
147
+
148
+ graph_list = []
149
+ if self.use_graph:
150
+ for smiles in smiles_list:
151
+ if self.disable_graph_cache:
152
+ graph_item = smiles2data(smiles)
153
+ else:
154
+ assert smiles in self.mol_idx_map
155
+ idx = self.mol_idx_map[smiles]
156
+ assert idx in self.idx_graph_map
157
+ graph_item = self.idx_graph_map[idx]
158
+ graph_list.append(graph_item)
159
+
160
+ return index, graph_list, output_text, input_text
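
The data layout implied by the file accesses above, sketched as a comment; the directory names themselves are illustrative (only the substrings 'PtoR', 'RtoP', and 'pretrain' matter for task selection).

```python
# SynthesisDataset infers the task from the data root name ('PtoR' -> retro,
# 'RtoP' -> forward, 'pretrain' -> masked reconstruction) and expects:
#
#   data/synthesis_data/USPTO_50k_PtoR/        # illustrative name
#   ├── mol_graphid_map.json                   # SMILES -> graph index
#   ├── idx_graph_map.pt                       # graph index -> cached molecule graph
#   ├── train/src-train.txt                    # one tokenized source SMILES per line
#   ├── train/tgt-train.txt                    # matching target SMILES per line
#   ├── val/src-val.txt, val/tgt-val.txt
#   └── test/src-test.txt, test/tgt-test.txt
#
# With --roundrobin_train, train/src-train_0.txt ... src-train_9.txt are cycled instead
# (reload_counter is taken modulo 10 in reload_data).
```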
data_provider/tune_dm.py ADDED
@@ -0,0 +1,312 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT License.
3
+ import torch
4
+ from pytorch_lightning import LightningDataModule
5
+ import torch_geometric
6
+ # from torch_geometric.loader import DataLoader
7
+ from torch.utils.data import DataLoader
8
+ from torch_geometric.loader.dataloader import Collater
9
+ from data_provider.reaction_action_dataset import ActionDataset
10
+ from data_provider.synthesis_dataset import SynthesisDataset
11
+ from data_provider.caption_dataset import CaptionDataset
12
+ from data_provider.chebi_dataset import ChEBI_dataset
13
+ import re
14
+
15
+ # we split individual characters inside special tokens like [START_DNA]
16
+ CUSTOM_SEQ_RE = re.compile(r"(\[START_(DNA|SMILES|I_SMILES|AMINO)])(.*?)(\[END_\2])")
17
+
18
+ # token added to implement a custom sequence tokenization. This token is added at
19
+ # corpus cleaning step and removed in pretokenization. The digits are added to increase the chance
20
+ # that they do not occur in the corpus. The digits are escaped so that the token does not appear
21
+ # literally in the source code in case we ever include it in the training data.
22
+ SPLIT_MARKER = f"SPL{1}T-TH{1}S-Pl3A5E"
23
+
24
+ def _insert_split_marker(m: re.Match):
25
+ """
26
+ Applies split marker based on a regex match of special tokens such as
27
+ [START_DNA].
28
+
29
+ Parameters
30
+ ----------
31
+ m : re.Match
32
+ Regex match of a special-token span such as [START_SMILES]...[END_SMILES]
33
+
34
+ Returns
35
+ ----------
36
+ str - the text with the split token added
37
+ """
38
+ start_token, _, sequence, end_token = m.groups()
39
+ sequence = re.sub(r"(.)", fr"{SPLIT_MARKER}\1", sequence, flags=re.DOTALL)
40
+ return f"{start_token}{sequence}{SPLIT_MARKER}{end_token}"
41
+
42
+ def smiles_handler(text, mol_ph, is_gal=True):
43
+ smiles_list = []
44
+ for match in CUSTOM_SEQ_RE.finditer(text):
45
+ smiles = match.group(3)
46
+ smiles_list.append(smiles)
47
+ if is_gal:
48
+ text = CUSTOM_SEQ_RE.sub(r'\1\3\4%s' % (mol_ph), text)
49
+ text = escape_custom_split_sequence(text)
50
+ return text, smiles_list
51
+ else:
52
+ text = CUSTOM_SEQ_RE.sub(r'\3%s' % (mol_ph), text)
53
+ return text, smiles_list
54
+
55
+ def escape_custom_split_sequence(text):
56
+ """
57
+ Applies custom splitting to the text for GALILEO's tokenization
58
+
59
+ Parameters
60
+ ----------
61
+ text : str
62
+ Input text to split
63
+
64
+ Returns
65
+ ----------
66
+ str - the text with the split token added
67
+ """
68
+ return CUSTOM_SEQ_RE.sub(_insert_split_marker, text)
69
+
70
+ class TrainCollater:
71
+ def __init__(self, tokenizer, text_max_len, rxn_max_len, mol_ph, mol_token_id, is_gal=True, use_graph=True, use_qa_pair=True):
72
+ self.rxn_max_len = rxn_max_len
73
+ self.text_max_len = text_max_len
74
+ self.tokenizer = tokenizer
75
+ self.collater = Collater([], [])
76
+ self.mol_ph = mol_ph
77
+ self.mol_token_id = mol_token_id
78
+ self.is_gal = is_gal
79
+ self.use_graph = use_graph
80
+ self.use_qa_pair = use_qa_pair
81
+
82
+ def __call__(self, batch):
83
+ return self.collate_qa(batch) if self.use_qa_pair else self.collate(batch)
84
+
85
+ def collate(self, batch):
86
+ rxn_ids, graphs, texts, smiles_prompt = zip(*batch)
87
+ if graphs:
88
+ graphs = self.collater(graphs)
89
+
90
+ ## deal with prompt
91
+ if self.use_graph:
92
+ smiles_prompt = [smiles_handler(p, self.mol_ph, self.is_gal)[0] for p in smiles_prompt]
93
+ else:
94
+ smiles_prompt = [escape_custom_split_sequence(p) for p in smiles_prompt]
95
+
96
+ self.tokenizer.padding_side = 'left'
97
+ smiles_prompt_tokens = self.tokenizer(text=smiles_prompt,
98
+ truncation=False,
99
+ padding='longest',
100
+ add_special_tokens=True,
101
+ return_tensors='pt',
102
+ return_attention_mask=True)
103
+
104
+ is_mol_token = smiles_prompt_tokens.input_ids == self.mol_token_id
105
+ smiles_prompt_tokens['is_mol_token'] = is_mol_token
106
+
107
+ self.tokenizer.padding_side = 'right'
108
+ text_tokens = self.tokenizer(text=texts,
109
+ truncation=True,
110
+ padding='longest',
111
+ add_special_tokens=True,
112
+ max_length=self.text_max_len,
113
+ return_tensors='pt',
114
+ return_attention_mask=True)
115
+ return rxn_ids, graphs, smiles_prompt_tokens, text_tokens
116
+
117
+ def collate_qa(self, batch):
118
+ rxn_ids, graphs, texts, input_prompt = zip(*batch)
119
+ graphs = [graph for graph_batch in graphs for graph in graph_batch]
120
+ if graphs:
121
+ graphs = self.collater(graphs)
122
+
123
+ ## deal with prompt
124
+ if self.use_graph:
125
+ input_prompt = [smiles_handler(p, self.mol_ph, self.is_gal)[0] for p in input_prompt]
126
+ else:
127
+ input_prompt = [escape_custom_split_sequence(p) for p in input_prompt]
128
+
129
+ self.tokenizer.padding_side = 'right'
130
+ qa_pair = [[q, a] for q, a in zip(input_prompt, texts)]
131
+ qa_batch = self.tokenizer(qa_pair,
132
+ truncation=True,
133
+ padding='longest',
134
+ add_special_tokens=True,
135
+ max_length=self.rxn_max_len + self.text_max_len,
136
+ return_tensors='pt',
137
+ return_attention_mask=True,
138
+ return_token_type_ids=True)
139
+ is_mol_token = qa_batch.input_ids == self.mol_token_id
140
+ qa_batch['is_mol_token'] = is_mol_token
141
+ return rxn_ids, graphs, qa_batch
142
+
143
+ class InferenceCollater:
144
+ def __init__(self, tokenizer, text_max_len, rxn_max_len, mol_ph, mol_token_id, is_gal=True):
145
+ self.text_max_len = text_max_len
146
+ self.rxn_max_len = rxn_max_len
147
+ self.tokenizer = tokenizer
148
+ self.collater = Collater([], [])
149
+ self.mol_ph = mol_ph
150
+ self.mol_token_id = mol_token_id
151
+ self.is_gal = is_gal
152
+
153
+ def __call__(self, batch):
154
+ rxn_ids, graphs, texts, input_prompt = zip(*batch)
155
+ inputs = input_prompt
156
+ graphs = [graph for graph_batch in graphs for graph in graph_batch]
157
+ if graphs:
158
+ graphs = self.collater(graphs)
159
+ input_prompt = [smiles_handler(p, self.mol_ph, self.is_gal)[0] for p in input_prompt]
160
+
161
+ ## deal with prompt
162
+ self.tokenizer.padding_side = 'left'
163
+ input_prompt_tokens = self.tokenizer(input_prompt,
164
+ truncation=True,
165
+ padding='longest',
166
+ add_special_tokens=True,
167
+ max_length=self.rxn_max_len,
168
+ return_tensors='pt',
169
+ return_attention_mask=True)
170
+
171
+ is_mol_token = input_prompt_tokens.input_ids == self.mol_token_id
172
+ input_prompt_tokens['is_mol_token'] = is_mol_token
173
+ return rxn_ids, graphs, input_prompt_tokens, texts, inputs
174
+
175
+ class TuneDM(LightningDataModule):
176
+ def __init__(
177
+ self,
178
+ num_workers: int = 0,
179
+ batch_size: int = 256,
180
+ root: str = 'data/',
181
+ text_max_len: int = 128,
182
+ smi_max_len: int = 128,
183
+ rxn_max_len: int = 128,
184
+ tokenizer=None,
185
+ downstream_task='action',
186
+ args=None,
187
+ ):
188
+ super().__init__()
189
+ self.args = args
190
+ self.batch_size = batch_size
191
+ self.inference_batch_size = args.inference_batch_size
192
+ self.num_workers = num_workers
193
+ self.rxn_max_len = rxn_max_len
194
+ self.text_max_len = text_max_len
195
+ self.prompt = args.prompt
196
+ DownstreamDataset = {
197
+ 'action': ActionDataset,
198
+ 'synthesis': SynthesisDataset,
199
+ 'caption': CaptionDataset,
200
+ 'chebi': ChEBI_dataset,
201
+ }[downstream_task]
202
+ ds_args = {
203
+ 'use_graph': not args.disable_graphs,
204
+ 'disable_graph_cache': args.disable_graph_cache,
205
+ 'smiles_type': args.smiles_type,
206
+ }
207
+ if downstream_task == 'action':
208
+ ds_args['predict_rxn_condition'] = args.predict_rxn_condition
209
+ if downstream_task == 'synthesis':
210
+ ds_args['roundrobin_train'] = args.roundrobin_train
211
+ ds_args['test_subset'] = args.test_subset
212
+ self.train_dataset = DownstreamDataset(root, 'train', smi_max_len, **ds_args)
213
+ self.val_dataset = DownstreamDataset(root, 'valid', smi_max_len, **ds_args)
214
+ self.test_dataset = DownstreamDataset(root, 'test', smi_max_len, **ds_args)
215
+ self.init_tokenizer(tokenizer)
216
+ self.mol_ph_token = '<mol>' * self.args.num_query_token
217
+ self.is_gal = args.opt_model.find('galactica') >= 0
218
+ self.use_graph = not args.disable_graphs
219
+ self.is_t5 = args.opt_model.find('t5') >= 0
220
+
221
+ def init_tokenizer(self, tokenizer):
222
+ self.tokenizer = tokenizer
223
+ self.train_dataset.tokenizer = tokenizer
224
+ self.val_dataset.tokenizer = tokenizer
225
+ self.test_dataset.tokenizer = tokenizer
226
+ self.mol_token_id = self.tokenizer.mol_token_id
227
+ # self.tokenizer.mol_token_id = tokenizer("<mol>", add_special_tokens=False).input_ids[0]
228
+
229
+ def train_dataloader(self):
230
+ if self.args.roundrobin_train:
231
+ self.train_dataset.reload_data()
232
+ if hasattr(self.train_dataset, 'renew_r_smiles'):
233
+ self.train_dataset.renew_r_smiles()
234
+ loader = DataLoader(
235
+ self.train_dataset,
236
+ batch_size=self.batch_size,
237
+ shuffle=True,
238
+ num_workers=self.num_workers,
239
+ pin_memory=False,
240
+ drop_last=True,
241
+ persistent_workers=True,
242
+ collate_fn=TrainCollater(
243
+ tokenizer=self.tokenizer,
244
+ text_max_len=self.text_max_len,
245
+ rxn_max_len=self.rxn_max_len,
246
+ mol_ph=self.mol_ph_token,
247
+ mol_token_id=self.mol_token_id,
248
+ is_gal=self.is_gal,
249
+ use_graph=self.use_graph,
250
+ use_qa_pair=not self.is_t5,
251
+ ),
252
+ )
253
+ return loader
254
+
255
+ def val_dataloader(self):
256
+ test_loader = DataLoader(
257
+ self.test_dataset,
258
+ batch_size=self.inference_batch_size,
259
+ shuffle=False,
260
+ num_workers=self.num_workers,
261
+ pin_memory=False,
262
+ drop_last=False,
263
+ persistent_workers=True,
264
+ collate_fn=InferenceCollater(
265
+ tokenizer=self.tokenizer,
266
+ text_max_len=self.text_max_len,
267
+ rxn_max_len=self.rxn_max_len,
268
+ mol_ph=self.mol_ph_token,
269
+ mol_token_id=self.mol_token_id,
270
+ is_gal=self.is_gal
271
+ ),
272
+ )
273
+ return [test_loader]  # NOTE: returns here; the val_loader below is currently unreachable (the test set is used for validation)
274
+ val_loader = DataLoader(
275
+ self.val_dataset,
276
+ batch_size=self.batch_size,
277
+ shuffle=False,
278
+ num_workers=self.num_workers,
279
+ pin_memory=False,
280
+ drop_last=False,
281
+ persistent_workers=True,
282
+ collate_fn=InferenceCollater(
283
+ tokenizer=self.tokenizer,
284
+ text_max_len=self.text_max_len,
285
+ rxn_max_len=self.rxn_max_len,
286
+ mol_ph=self.mol_ph_token,
287
+ mol_token_id=self.mol_token_id,
288
+ is_gal=self.is_gal
289
+ ),
290
+ )
291
+ return [val_loader, test_loader]
292
+
293
+ def test_dataloader(self):
294
+ loader = DataLoader(
295
+ self.test_dataset,
296
+ batch_size=self.inference_batch_size,
297
+ shuffle=False,
298
+ num_workers=self.num_workers,
299
+ pin_memory=False,
300
+ drop_last=False,
301
+ persistent_workers=True,
302
+ collate_fn=InferenceCollater(
303
+ tokenizer=self.tokenizer,
304
+ text_max_len=self.text_max_len,
305
+ rxn_max_len=self.rxn_max_len,
306
+ mol_ph=self.mol_ph_token,
307
+ mol_token_id=self.mol_token_id,
308
+ is_gal=self.is_gal
309
+ ),
310
+ )
311
+ return loader
312
+
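
A quick sketch of `smiles_handler` above, assuming the repo's `data_provider` package is importable: it extracts the SMILES spans and appends the `<mol>` placeholders after each `[START_SMILES]...[END_SMILES]` block.

```python
from data_provider.tune_dm import smiles_handler

mol_ph = '<mol>' * 8   # example value of --num_query_token
text = 'Reactants: [START_SMILES]CCO[END_SMILES] Product: [START_SMILES]CC=O[END_SMILES] '
handled, smiles_list = smiles_handler(text, mol_ph, is_gal=True)
print(smiles_list)   # ['CCO', 'CC=O']
# `handled` keeps each SMILES span (with Galactica split markers) followed by the <mol> placeholders.
```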
demo.json ADDED
@@ -0,0 +1,7 @@
1
+ [
2
+ "[BH4-].[Na+].COC(=O)c1nc2ccccn2c1C>C1CCOC1.CO>Cc1c(CO)nc2ccccn12",
3
+ "NCCN.O=C(O)c1cc2nc(-c3ccc(-c4ccccc4)cc3)c(Cl)cc2n1CO>CN(C)C=O>O=C(O)c1cc2nc(-c3ccc(-c4ccccc4)cc3)c(Cl)cc2[nH]1",
4
+ "CC[O-].[Na+].CC(C)[N+](=O)[O-].Cc1cc(C)nc(NC(=O)NS(=O)(=O)c2ccccc2CCl)n1>CCO>Cc1cc(C)nc(NC(=O)NS(=O)(=O)c2ccccc2C=O)n1",
5
+ "COC(=O)c1ccc2c(c1)nc(C(C)(C)C)n2CC1CCC(F)(F)CC1.O=S(=O)([O-])O.[K+]>[Li+].[OH-].C1COCCO1>CC(C)(C)c1nc2cc(C(=O)O)ccc2n1CC1CCC(F)(F)CC1",
6
+ "FC(F)(F)c1cccc2c(Br)c(Cc3ccccc3)cnc12.CC1(C)OB(c2cncc(C=O)c2)OC1(C)C>O=C([O-])[O-].[Na+].[Na+].Cc1ccccc1.CCO>O=Cc1cncc(-c2c(Cc3ccccc3)cnc3c(C(F)(F)F)cccc23)c1"
7
+ ]
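
Each entry is a reaction SMILES of the form `reactants>reagents>product`. A small parsing sketch using RDKit, which the repo already depends on:

```python
import json
from rdkit import Chem

with open('demo.json') as f:
    rxns = json.load(f)

reactants, reagents, product = rxns[0].split('>')  # reactants > reagents/solvents > product
assert Chem.MolFromSmiles(product) is not None     # sanity check: the product parses
print(reactants, '|', reagents, '|', product)
```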
demo.py ADDED
@@ -0,0 +1,224 @@
1
+ import os
2
+ import torch
3
+ import argparse
4
+ import warnings
5
+ import pytorch_lightning as pl
6
+ from pytorch_lightning import Trainer, strategies
7
+ import pytorch_lightning.callbacks as plc
8
+ from pytorch_lightning.loggers import CSVLogger
9
+ from pytorch_lightning.callbacks import TQDMProgressBar
10
+ from data_provider.pretrain_dm import PretrainDM
11
+ from data_provider.tune_dm import *
12
+ from model.opt_flash_attention import replace_opt_attn_with_flash_attn
13
+ from model.blip2_model import Blip2Model
14
+ from model.dist_funs import MyDeepSpeedStrategy
15
+ from data_provider.reaction_action_dataset import ActionDataset
16
+ from data_provider.data_utils import json_read, json_write
17
+ from data_provider.data_utils import smiles2data, reformat_smiles
18
+
19
+ ## for pyg bug
20
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
21
+ ## for A5000 gpus
22
+ torch.set_float32_matmul_precision('medium') # can be medium (bfloat16), high (tensorfloat32), highest (float32)
23
+
24
+
25
+ class InferenceRunner:
26
+ def __init__(self, model, tokenizer, rxn_max_len, smi_max_len,
27
+ smiles_type='default', device='cuda', predict_rxn_condition=True, args=None):
28
+ self.model = model
29
+ self.rxn_max_len = rxn_max_len
30
+ self.smi_max_len = smi_max_len
31
+ self.tokenizer = tokenizer
32
+ self.collater = Collater([], [])
33
+ self.mol_ph = '<mol>' * args.num_query_token
34
+ self.mol_token_id = tokenizer.mol_token_id
35
+ self.is_gal = args.opt_model.find('galactica') >= 0
36
+ self.collater = Collater([], [])
37
+ self.device = device
38
+ self.smiles_type = smiles_type
39
+ self.predict_rxn_condition = predict_rxn_condition
40
+ self.args = args
41
+
42
+ def make_prompt(self, param_dict, smi_max_len=128, predict_rxn_condition=False):
43
+ action_sequence = param_dict['actions']
44
+ smiles_list = []
45
+ prompt = ''
46
+ prompt += 'Reactants: '
47
+ smiles_wrapper = lambda x: reformat_smiles(x, smiles_type=self.smiles_type)[:smi_max_len]
48
+ for smi in param_dict['REACTANT']:
49
+ prompt += f'{param_dict["extracted_molecules"][smi]}: [START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
50
+ smiles_list.append(smi)
51
+
52
+ prompt += 'Product: '
53
+ for smi in param_dict['PRODUCT']:
54
+ prompt += f'{param_dict["extracted_molecules"][smi]}: [START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
55
+ smiles_list.append(smi)
56
+
57
+ if param_dict['CATALYST']:
58
+ prompt += 'Catalysts: '
59
+ for smi in param_dict['CATALYST']:
60
+ if smi in param_dict["extracted_molecules"]:
61
+ prompt += f'{param_dict["extracted_molecules"][smi]}: [START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
62
+ else:
63
+ prompt += f'[START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
64
+ smiles_list.append(smi)
65
+
66
+ if param_dict['SOLVENT']:
67
+ prompt += 'Solvents: '
68
+ for smi in param_dict['SOLVENT']:
69
+ if smi in param_dict["extracted_molecules"]:
70
+ prompt += f'{param_dict["extracted_molecules"][smi]}: [START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
71
+ else:
72
+ prompt += f'[START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
73
+ smiles_list.append(smi)
74
+
75
+ if predict_rxn_condition:
76
+ for value, token in param_dict['extracted_duration'].items():
77
+ action_sequence = action_sequence.replace(token, value)
78
+ for value, token in param_dict['extracted_temperature'].items():
79
+ action_sequence = action_sequence.replace(token, value)
80
+ else:
81
+ prompt += 'Temperatures: '
82
+ for value, token in param_dict['extracted_temperature'].items():
83
+ prompt += f'{token}: {value} '
84
+
85
+ prompt += 'Durations: '
86
+ for value, token in param_dict['extracted_duration'].items():
87
+ prompt += f'{token}: {value} '
88
+
89
+ prompt += 'Action Squence: '
90
+ return prompt, smiles_list, action_sequence
91
+
92
+ def get_action_elements(self, rxn_dict):
93
+ rxn_id = rxn_dict['index']
94
+ input_text, smiles_list, output_text = self.make_prompt(rxn_dict, self.smi_max_len, self.predict_rxn_condition)
95
+ output_text = output_text.strip() + '\n'
96
+
97
+ graph_list = []
98
+ for smiles in smiles_list:
99
+ graph_item = smiles2data(smiles)
100
+ graph_list.append(graph_item)
101
+ return rxn_id, graph_list, output_text, input_text
102
+
103
+ @torch.no_grad()
104
+ def predict(self, rxn_dict):
105
+ rxn_id, graphs, prompt_tokens, output_text, input_text = self.tokenize(rxn_dict)
106
+ result_dict = {
107
+ 'raw': rxn_dict,
108
+ 'index': rxn_id,
109
+ 'input': input_text,
110
+ 'target': output_text
111
+ }
112
+ samples = {'graphs': graphs, 'prompt_tokens': prompt_tokens}
113
+ with torch.no_grad():
114
+ result_dict['prediction'] = self.model.blip2opt.generate(
115
+ samples,
116
+ do_sample=self.args.do_sample,
117
+ num_beams=self.args.num_beams,
118
+ max_length=self.args.max_inference_len,
119
+ min_length=self.args.min_inference_len,
120
+ num_captions=self.args.num_generate_captions,
121
+ use_graph=True
122
+ )
123
+ return result_dict
124
+
125
+ def tokenize(self, rxn_dict):
126
+ rxn_id, graph_list, output_text, input_text = self.get_action_elements(rxn_dict)
127
+ if graph_list:
128
+ graphs = self.collater(graph_list).to(self.device)
129
+ input_prompt = smiles_handler(input_text, self.mol_ph, self.is_gal)[0]
130
+
131
+ ## deal with prompt
132
+ self.tokenizer.padding_side = 'left'
133
+ input_prompt_tokens = self.tokenizer(input_prompt,
134
+ truncation=True,
135
+ padding='max_length',
136
+ add_special_tokens=True,
137
+ max_length=self.rxn_max_len,
138
+ return_tensors='pt',
139
+ return_attention_mask=True).to(self.device)
140
+ is_mol_token = input_prompt_tokens.input_ids == self.mol_token_id
141
+ input_prompt_tokens['is_mol_token'] = is_mol_token
142
+ return rxn_id, graphs, input_prompt_tokens, output_text, input_text
143
+
144
+
145
+ def main(args):
146
+ device = torch.device('cuda')
147
+ data_list = json_read('demo.json')
148
+ pl.seed_everything(args.seed)
149
+ # model
150
+ if args.init_checkpoint:
151
+ model = Blip2Model(args).to(device)
152
+ ckpt = torch.load(args.init_checkpoint, map_location='cpu')
153
+ model.load_state_dict(ckpt['state_dict'], strict=False)
154
+ print(f"loaded model from {args.init_checkpoint}")
155
+ else:
156
+ model = Blip2Model(args).to(device)
157
+ model.eval()
158
+
159
+ print('total params:', sum(p.numel() for p in model.parameters()))
160
+
161
+ if args.opt_model.find('galactica') >= 0 or args.opt_model.find('t5') >= 0:
162
+ tokenizer = model.blip2opt.opt_tokenizer
163
+ elif args.opt_model.find('llama') >= 0 or args.opt_model.find('vicuna') >= 0:
164
+ tokenizer = model.blip2opt.llm_tokenizer
165
+ else:
166
+ raise NotImplementedError
167
+
168
+ infer_runner = InferenceRunner(
169
+ model=model,
170
+ tokenizer=tokenizer,
171
+ rxn_max_len=args.rxn_max_len,
172
+ smi_max_len=args.smi_max_len,
173
+ device=device,
174
+ predict_rxn_condition=args.predict_rxn_condition,
175
+ args=args
176
+ )
177
+
178
+ import time
179
+ for data_item in data_list:
180
+ t1 = time.time()
181
+ result = infer_runner.predict(data_item)
182
+ print(result)
183
+ print(f"Time: {time.time() - t1:.2f}s")
184
+
185
+
186
+ def get_args():
187
+ parser = argparse.ArgumentParser()
188
+ parser.add_argument('--filename', type=str, default="main")
189
+ parser.add_argument('--seed', type=int, default=42, help='random seed')
190
+ # MM settings
191
+ parser.add_argument('--mode', type=str, default='pretrain', choices=['pretrain', 'ft', 'eval', 'pretrain_eval'])
192
+ parser.add_argument('--strategy_name', type=str, default='mydeepspeed')
193
+ parser.add_argument('--iupac_prediction', action='store_true', default=False)
194
+ parser.add_argument('--ckpt_path', type=str, default=None)
195
+ # parser = Trainer.add_argparse_args(parser)
196
+ parser = Blip2Model.add_model_specific_args(parser) # add model args
197
+ parser = PretrainDM.add_model_specific_args(parser)
198
+ parser.add_argument('--accelerator', type=str, default='gpu')
199
+ parser.add_argument('--devices', type=str, default='0,1,2,3')
200
+ parser.add_argument('--precision', type=str, default='bf16-mixed')
201
+ parser.add_argument('--downstream_task', type=str, default='action', choices=['action', 'synthesis', 'caption', 'chebi'])
202
+ parser.add_argument('--max_epochs', type=int, default=10)
203
+ parser.add_argument('--enable_flash', action='store_true', default=False)
204
+ parser.add_argument('--disable_graph_cache', action='store_true', default=False)
205
+ parser.add_argument('--predict_rxn_condition', action='store_true', default=False)
206
+ parser.add_argument('--generate_restrict_tokens', action='store_true', default=False)
207
+ parser.add_argument('--train_restrict_tokens', action='store_true', default=False)
208
+ parser.add_argument('--smiles_type', type=str, default='default', choices=['default', 'canonical', 'restricted', 'unrestricted', 'r_smiles'])
209
+ parser.add_argument('--accumulate_grad_batches', type=int, default=1)
210
+ parser.add_argument('--tqdm_interval', type=int, default=50)
211
+ parser.add_argument('--check_val_every_n_epoch', type=int, default=1)
212
+ args = parser.parse_args()
213
+
214
+ if args.enable_flash:
215
+ replace_opt_attn_with_flash_attn()
216
+ print("=========================================")
217
+ for k, v in sorted(vars(args).items()):
218
+ print(k, '=', v)
219
+ print("=========================================")
220
+ return args
221
+
222
+ if __name__ == '__main__':
223
+ main(get_args())
224
+
environment.yml ADDED
@@ -0,0 +1,489 @@
1
+ name: reactxt
2
+ channels:
3
+ - pyg
4
+ - tmap
5
+ - pytorch
6
+ - nvidia
7
+ - nvidia/label/cuda-11.7.0
8
+ - conda-forge
9
+ - defaults
10
+ dependencies:
11
+ - _libgcc_mutex=0.1=main
12
+ - _openmp_mutex=5.1=1_gnu
13
+ - appdirs=1.4.4=pyhd3eb1b0_0
14
+ - asttokens=2.2.1=pyhd8ed1ab_0
15
+ - backcall=0.2.0=pyh9f0ad1d_0
16
+ - backports=1.0=pyhd8ed1ab_3
17
+ - backports.functools_lru_cache=1.6.5=pyhd8ed1ab_0
18
+ - blas=1.0=mkl
19
+ - brotlipy=0.7.0=py38h27cfd23_1003
20
+ - bzip2=1.0.8=h7b6447c_0
21
+ - ca-certificates=2023.08.22=h06a4308_0
22
+ - certifi=2023.11.17=py38h06a4308_0
23
+ - cffi=1.15.1=py38h5eee18b_3
24
+ - charset-normalizer=2.0.4=pyhd3eb1b0_0
25
+ - cryptography=39.0.1=py38h9ce1e76_2
26
+ - cuda-cccl=11.7.58=hc415cf5_0
27
+ - cuda-cudart=11.7.99=0
28
+ - cuda-cudart-dev=11.7.60=h6a7c232_0
29
+ - cuda-cupti=11.7.101=0
30
+ - cuda-driver-dev=11.7.60=0
31
+ - cuda-libraries=11.7.1=0
32
+ - cuda-libraries-dev=11.7.0=0
33
+ - cuda-nvcc=11.7.64=0
34
+ - cuda-nvrtc=11.7.99=0
35
+ - cuda-nvrtc-dev=11.7.50=heada363_0
36
+ - cuda-nvtx=11.7.91=0
37
+ - cuda-runtime=11.7.0=0
38
+ - cycler=0.11.0=pyhd3eb1b0_0
39
+ - debugpy=1.5.1=py38h295c915_0
40
+ - decorator=5.1.1=pyhd8ed1ab_0
41
+ - entrypoints=0.4=pyhd8ed1ab_0
42
+ - executing=1.2.0=pyhd8ed1ab_0
43
+ - ffmpeg=4.3=hf484d3e_0
44
+ - freetype=2.12.1=h4a9f257_0
45
+ - giflib=5.2.1=h5eee18b_3
46
+ - gmp=6.2.1=h295c915_3
47
+ - gmpy2=2.1.2=py38heeb90bb_0
48
+ - gnutls=3.6.15=he1e5248_0
49
+ - idna=3.4=py38h06a4308_0
50
+ - intel-openmp=2023.1.0=hdb19cb5_46305
51
+ - ipykernel=6.15.0=pyh210e3f2_0
52
+ - jedi=0.18.2=pyhd8ed1ab_0
53
+ - jinja2=3.1.2=py38h06a4308_0
54
+ - joblib=1.2.0=py38h06a4308_0
55
+ - jpeg=9e=h5eee18b_1
56
+ - jupyter_client=7.0.6=pyhd8ed1ab_0
57
+ - jupyter_core=4.12.0=py38h578d9bd_0
58
+ - lame=3.100=h7b6447c_0
59
+ - lcms2=2.12=h3be6417_0
60
+ - ld_impl_linux-64=2.38=h1181459_1
61
+ - lerc=3.0=h295c915_0
62
+ - libcublas=11.10.3.66=0
63
+ - libcublas-dev=11.10.1.25=h0c8ac2b_0
64
+ - libcufft=10.7.2.124=h4fbf590_0
65
+ - libcufft-dev=10.7.2.50=h59a5ac8_0
66
+ - libcufile=1.7.0.149=0
67
+ - libcufile-dev=1.3.0.44=0
68
+ - libcurand=10.3.3.53=0
69
+ - libcurand-dev=10.2.10.50=hd49a9cd_0
70
+ - libcusolver=11.4.0.1=0
71
+ - libcusolver-dev=11.3.5.50=hc6eba6f_0
72
+ - libcusparse=11.7.4.91=0
73
+ - libcusparse-dev=11.7.3.50=hc644b96_0
74
+ - libdeflate=1.17=h5eee18b_0
75
+ - libffi=3.4.4=h6a678d5_0
76
+ - libgcc-ng=11.2.0=h1234567_1
77
+ - libgfortran-ng=11.2.0=h00389a5_1
78
+ - libgfortran5=11.2.0=h1234567_1
79
+ - libgomp=11.2.0=h1234567_1
80
+ - libiconv=1.16=h7f8727e_2
81
+ - libidn2=2.3.4=h5eee18b_0
82
+ - libnpp=11.7.4.75=0
83
+ - libnpp-dev=11.7.3.21=hb6476a9_0
84
+ - libnvjpeg=11.8.0.2=0
85
+ - libnvjpeg-dev=11.7.2.34=h2e48410_0
86
+ - libpng=1.6.39=h5eee18b_0
87
+ - libsodium=1.0.18=h36c2ea0_1
88
+ - libstdcxx-ng=11.2.0=h1234567_1
89
+ - libtasn1=4.19.0=h5eee18b_0
90
+ - libtiff=4.5.0=h6a678d5_2
91
+ - libunistring=0.9.10=h27cfd23_0
92
+ - libwebp=1.2.4=h11a3e52_1
93
+ - libwebp-base=1.2.4=h5eee18b_1
94
+ - lz4-c=1.9.4=h6a678d5_0
95
+ - markupsafe=2.1.1=py38h7f8727e_0
96
+ - matplotlib-inline=0.1.6=pyhd8ed1ab_0
97
+ - mkl=2023.1.0=h6d00ec8_46342
98
+ - mkl-service=2.4.0=py38h5eee18b_1
99
+ - mkl_fft=1.3.6=py38h417a72b_1
100
+ - mkl_random=1.2.2=py38h417a72b_1
101
+ - mpc=1.1.0=h10f8cd9_1
102
+ - mpfr=4.0.2=hb69a4c5_1
103
+ - mpmath=1.2.1=py38h06a4308_0
104
+ - ncurses=6.4=h6a678d5_0
105
+ - nest-asyncio=1.5.6=pyhd8ed1ab_0
106
+ - nettle=3.7.3=hbbd107a_1
107
+ - networkx=2.8.4=py38h06a4308_1
108
+ - numpy-base=1.24.3=py38h060ed82_1
109
+ - ogdf=1.2.0=h2bc3f7f_0
110
+ - openh264=2.1.1=h4ff587b_0
111
+ - openssl=3.0.12=h7f8727e_0
112
+ - parso=0.8.3=pyhd8ed1ab_0
113
+ - pexpect=4.8.0=pyh1a96a4e_2
114
+ - pickleshare=0.7.5=py_1003
115
+ - pillow=9.4.0=py38h6a678d5_0
116
+ - pooch=1.4.0=pyhd3eb1b0_0
117
+ - prompt-toolkit=3.0.39=pyha770c72_0
118
+ - prompt_toolkit=3.0.39=hd8ed1ab_0
119
+ - ptyprocess=0.7.0=pyhd3deb0d_0
120
+ - pure_eval=0.2.2=pyhd8ed1ab_0
121
+ - pycparser=2.21=pyhd3eb1b0_0
122
+ - pyg=2.3.0=py38_torch_2.0.0_cu117
123
+ - pygments=2.15.1=pyhd8ed1ab_0
124
+ - pyopenssl=23.0.0=py38h06a4308_0
125
+ - pyparsing=3.0.9=py38h06a4308_0
126
+ - pysocks=1.7.1=py38h06a4308_0
127
+ - python=3.8.17=h955ad1f_0
128
+ - python-dateutil=2.8.2=pyhd8ed1ab_0
129
+ - python_abi=3.8=2_cp38
130
+ - pytorch=2.0.1=py3.8_cuda11.7_cudnn8.5.0_0
131
+ - pytorch-cuda=11.7=h778d358_5
132
+ - pytorch-mutex=1.0=cuda
133
+ - readline=8.2=h5eee18b_0
134
+ - setuptools=67.8.0=py38h06a4308_0
135
+ - six=1.16.0=pyh6c4a22f_0
136
+ - sqlite=3.41.2=h5eee18b_0
137
+ - stack_data=0.6.2=pyhd8ed1ab_0
138
+ - sympy=1.11.1=py38h06a4308_0
139
+ - tbb=2021.8.0=hdb19cb5_0
140
+ - threadpoolctl=2.2.0=pyh0d69192_0
141
+ - tk=8.6.12=h1ccaba5_0
142
+ - tmap=1.0.6=py38h2bc3f7f_0
143
+ - torchaudio=2.0.2=py38_cu117
144
+ - torchtriton=2.0.0=py38
145
+ - torchvision=0.15.2=py38_cu117
146
+ - tqdm=4.65.0=py38hb070fc8_0
147
+ - traitlets=5.9.0=pyhd8ed1ab_0
148
+ - typing_extensions=4.6.3=py38h06a4308_0
149
+ - urllib3=1.26.16=py38h06a4308_0
150
+ - wcwidth=0.2.6=pyhd8ed1ab_0
151
+ - wheel=0.38.4=py38h06a4308_0
152
+ - xz=5.4.2=h5eee18b_0
153
+ - zeromq=4.3.4=h9c3ff4c_1
154
+ - zlib=1.2.13=h5eee18b_0
155
+ - zstd=1.5.5=hc292b87_0
156
+ - pip:
157
+ - absl-py==1.4.0
158
+ - accelerate==0.20.3
159
+ - aiofiles==23.2.1
160
+ - aiohttp==3.8.4
161
+ - aiosignal==1.3.1
162
+ - aliyun-python-sdk-core==2.13.36
163
+ - aliyun-python-sdk-kms==2.16.1
164
+ - altair==4.2.2
165
+ - annotated-types==0.6.0
166
+ - antlr4-python3-runtime==4.9.3
167
+ - anyio==3.7.1
168
+ - argon2-cffi==23.1.0
169
+ - argon2-cffi-bindings==21.2.0
170
+ - arrow==1.2.3
171
+ - async-lru==2.0.4
172
+ - async-timeout==4.0.2
173
+ - attrs==23.1.0
174
+ - autocommand==2.2.2
175
+ - babel==2.13.0
176
+ - backoff==2.2.1
177
+ - backports-zoneinfo==0.2.1
178
+ - beautifulsoup4==4.12.2
179
+ - bigmodelvis==0.0.1
180
+ - binaryornot==0.4.4
181
+ - bleach==6.0.0
182
+ - blessed==1.20.0
183
+ - blinker==1.6.2
184
+ - blis==0.7.9
185
+ - braceexpand==0.1.7
186
+ - cachetools==5.3.1
187
+ - catalogue==2.0.8
188
+ - cfgv==3.3.1
189
+ - chardet==5.2.0
190
+ - cheroot==10.0.0
191
+ - cherrypy==18.8.0
192
+ - click==8.1.4
193
+ - cloudpathlib==0.16.0
194
+ - cmake==3.27.7
195
+ - colorama==0.4.6
196
+ - colour==0.1.5
197
+ - comm==0.1.4
198
+ - confection==0.1.0
199
+ - configargparse==1.7
200
+ - contexttimer==0.3.3
201
+ - contourpy==1.1.0
202
+ - cookiecutter==2.4.0
203
+ - crcmod==1.7
204
+ - croniter==1.4.1
205
+ - ctranslate2==3.20.0
206
+ - cymem==2.0.7
207
+ - datasets==2.13.1
208
+ - dateutils==0.6.12
209
+ - decord==0.6.0
210
+ - deepdiff==6.3.1
211
+ - deepspeed==0.10.1+ff7d5275
212
+ - defusedxml==0.7.1
213
+ - delta-center-client==0.0.4
214
+ - dill==0.3.6
215
+ - diskcache==5.6.3
216
+ - distlib==0.3.6
217
+ - distro==1.8.0
218
+ - dnspython==2.4.2
219
+ - docker-pycreds==0.4.0
220
+ - einops==0.6.1
221
+ - evaluate==0.4.1
222
+ - exceptiongroup==1.1.2
223
+ - faerun==0.3.20
224
+ - fairscale==0.4.4
225
+ - fastapi==0.100.0
226
+ - fastjsonschema==2.18.1
227
+ - fasttext-wheel==0.9.2
228
+ - ffmpy==0.3.1
229
+ - filelock==3.12.2
230
+ - flash-attn==2.3.3
231
+ - flask==3.0.0
232
+ - fonttools==4.40.0
233
+ - fqdn==1.5.1
234
+ - frozenlist==1.3.3
235
+ - fsspec==2023.6.0
236
+ - ftfy==6.1.1
237
+ - future==0.18.3
238
+ - gdown==4.7.1
239
+ - gitdb==4.0.10
240
+ - gitpython==3.1.37
241
+ - google-auth==2.23.2
242
+ - google-auth-oauthlib==1.0.0
243
+ - gpustat==1.1.1
244
+ - gradio-client==0.7.0
245
+ - grpcio==1.59.0
246
+ - h11==0.14.0
247
+ - hjson==3.1.0
248
+ - httpcore==1.0.2
249
+ - httpx==0.25.1
250
+ - huggingface-hub==0.16.4
251
+ - identify==2.5.24
252
+ - imageio==2.31.1
253
+ - importlib-metadata==6.8.0
254
+ - importlib-resources==6.0.0
255
+ - inflect==7.0.0
256
+ - inquirer==3.1.3
257
+ - iopath==0.1.10
258
+ - ipython==8.12.2
259
+ - ipython-genutils==0.2.0
260
+ - ipywidgets==8.1.1
261
+ - isoduration==20.11.0
262
+ - itsdangerous==2.1.2
263
+ - jaraco-collections==4.3.0
264
+ - jaraco-context==4.3.0
265
+ - jaraco-functools==3.8.0
266
+ - jaraco-text==3.12.0
267
+ - jmespath==0.10.0
268
+ - json5==0.9.14
269
+ - jsonpointer==2.4
270
+ - jsonschema==4.18.0
271
+ - jsonschema-specifications==2023.6.1
272
+ - jupyter==1.0.0
273
+ - jupyter-client==8.4.0
274
+ - jupyter-console==6.6.3
275
+ - jupyter-events==0.8.0
276
+ - jupyter-lsp==2.2.0
277
+ - jupyter-server==2.8.0
278
+ - jupyter-server-terminals==0.4.4
279
+ - jupyterlab==4.0.7
280
+ - jupyterlab-pygments==0.2.2
281
+ - jupyterlab-server==2.25.0
282
+ - jupyterlab-widgets==3.0.9
283
+ - kaggle==1.5.15
284
+ - kiwisolver==1.4.4
285
+ - langcodes==3.3.0
286
+ - lazy-loader==0.3
287
+ - levenshtein==0.23.0
288
+ - lightning==2.1.2
289
+ - lightning-cloud==0.5.37
290
+ - lightning-utilities==0.9.0
291
+ - lit==17.0.6
292
+ - littleutils==0.2.2
293
+ - lmppl==0.3.1
294
+ - lxml==4.9.3
295
+ - markdown==3.5
296
+ - markdown-it-py==3.0.0
297
+ - matplotlib==3.2.2
298
+ - mdurl==0.1.2
299
+ - mistune==3.0.2
300
+ - more-itertools==9.1.0
301
+ - multidict==6.0.4
302
+ - multiprocess==0.70.14
303
+ - murmurhash==1.0.9
304
+ - nbclient==0.8.0
305
+ - nbconvert==7.9.2
306
+ - nbformat==5.9.2
307
+ - ninja==1.11.1
308
+ - nltk==3.8.1
309
+ - nodeenv==1.8.0
310
+ - notebook==7.0.6
311
+ - notebook-shim==0.2.3
312
+ - numpy==1.24.4
313
+ - nvidia-cublas-cu11==11.10.3.66
314
+ - nvidia-cublas-cu12==12.1.3.1
315
+ - nvidia-cuda-cupti-cu11==11.7.101
316
+ - nvidia-cuda-cupti-cu12==12.1.105
317
+ - nvidia-cuda-nvrtc-cu11==11.7.99
318
+ - nvidia-cuda-nvrtc-cu12==12.1.105
319
+ - nvidia-cuda-runtime-cu11==11.7.99
320
+ - nvidia-cuda-runtime-cu12==12.1.105
321
+ - nvidia-cudnn-cu11==8.5.0.96
322
+ - nvidia-cudnn-cu12==8.9.2.26
323
+ - nvidia-cufft-cu11==10.9.0.58
324
+ - nvidia-cufft-cu12==11.0.2.54
325
+ - nvidia-curand-cu11==10.2.10.91
326
+ - nvidia-curand-cu12==10.3.2.106
327
+ - nvidia-cusolver-cu11==11.4.0.1
328
+ - nvidia-cusolver-cu12==11.4.5.107
329
+ - nvidia-cusparse-cu11==11.7.4.91
330
+ - nvidia-cusparse-cu12==12.1.0.106
331
+ - nvidia-ml-py==12.535.77
332
+ - nvidia-nccl-cu11==2.14.3
333
+ - nvidia-nccl-cu12==2.18.1
334
+ - nvidia-nvjitlink-cu12==12.3.101
335
+ - nvidia-nvtx-cu11==11.7.91
336
+ - nvidia-nvtx-cu12==12.1.105
337
+ - oauthlib==3.2.2
338
+ - ogb==1.3.6
339
+ - omegaconf==2.3.0
340
+ - openai==1.2.4
341
+ - opencv-python-headless==4.5.5.64
342
+ - opendatasets==0.1.22
343
+ - opendelta==0.3.2
344
+ - opennmt-py==3.4.1
345
+ - ordered-set==4.1.0
346
+ - orjson==3.9.10
347
+ - oss2==2.15.0
348
+ - outdated==0.2.2
349
+ - overrides==7.4.0
350
+ - packaging==23.1
351
+ - pandas==2.0.3
352
+ - pandocfilters==1.5.0
353
+ - paragraph2actions==1.5.0
354
+ - pathtools==0.1.2
355
+ - pathy==0.10.2
356
+ - peft==0.3.0
357
+ - pip==23.3.1
358
+ - pkgutil-resolve-name==1.3.10
359
+ - platformdirs==3.8.1
360
+ - plotly==5.15.0
361
+ - portalocker==2.7.0
362
+ - portend==3.2.0
363
+ - pre-commit==3.3.3
364
+ - preshed==3.0.8
365
+ - prometheus-client==0.17.1
366
+ - promise==2.3
367
+ - protobuf==3.19.6
368
+ - psutil==5.9.5
369
+ - pubchempy==1.0.4
370
+ - py-cpuinfo==9.0.0
371
+ - pyahocorasick==2.0.0
372
+ - pyarrow==12.0.1
373
+ - pyasn1==0.5.0
374
+ - pyasn1-modules==0.3.0
375
+ - pybind11==2.11.1
376
+ - pycocoevalcap==1.2
377
+ - pycocotools==2.0.6
378
+ - pycryptodome==3.18.0
379
+ - pydantic==1.10.11
380
+ - pydantic-core==2.14.3
381
+ - pydeck==0.8.1b0
382
+ - pydub==0.25.1
383
+ - pyjwt==2.7.0
384
+ - pymongo==4.6.0
385
+ - pympler==1.0.1
386
+ - pyonmttok==1.37.1
387
+ - python-editor==1.0.4
388
+ - python-json-logger==2.0.7
389
+ - python-levenshtein==0.23.0
390
+ - python-magic==0.4.27
391
+ - python-multipart==0.0.6
392
+ - python-slugify==8.0.1
393
+ - pytorch-lightning==2.0.0
394
+ - pytz==2023.3
395
+ - pytz-deprecation-shim==0.1.0.post0
396
+ - pywavelets==1.4.1
397
+ - pyyaml==6.0.1
398
+ - pyzmq==25.1.1
399
+ - qtconsole==5.4.4
400
+ - qtpy==2.4.0
401
+ - rapidfuzz==3.4.0
402
+ - rdkit==2023.3.3
403
+ - readchar==4.0.5
404
+ - referencing==0.29.1
405
+ - regex==2023.6.3
406
+ - requests==2.31.0
407
+ - requests-oauthlib==1.3.1
408
+ - responses==0.18.0
409
+ - rfc3339-validator==0.1.4
410
+ - rfc3986-validator==0.1.1
411
+ - rich==13.4.2
412
+ - rouge-score==0.1.2
413
+ - rpds-py==0.8.10
414
+ - rsa==4.9
415
+ - rxn-onmt-utils==1.1.0
416
+ - rxn-opennmt-py==1.1.5
417
+ - rxn-utils==1.6.0
418
+ - sacrebleu==2.3.1
419
+ - safetensors==0.3.1
420
+ - salesforce-lavis==1.0.0
421
+ - scikit-image==0.20.0
422
+ - scikit-learn==0.23.1
423
+ - scipy==1.4.1
424
+ - semantic-version==2.10.0
425
+ - send2trash==1.8.2
426
+ - sentencepiece==0.1.99
427
+ - sentry-sdk==1.31.0
428
+ - setproctitle==1.3.3
429
+ - shellingham==1.5.4
430
+ - smart-open==6.3.0
431
+ - smmap==5.0.1
432
+ - sniffio==1.3.0
433
+ - soupsieve==2.4.1
434
+ - spacy==3.7.2
435
+ - spacy-legacy==3.0.12
436
+ - spacy-loggers==1.0.4
437
+ - srsly==2.4.6
438
+ - starlette==0.27.0
439
+ - starsessions==1.3.0
440
+ - streamlit==1.22.0
441
+ - tabulate==0.9.0
442
+ - tempora==5.5.0
443
+ - tenacity==8.2.2
444
+ - tensorboard==2.14.0
445
+ - tensorboard-data-server==0.7.1
446
+ - terminado==0.17.1
447
+ - text-unidecode==1.3
448
+ - textdistance==4.6.0
449
+ - thinc==8.1.10
450
+ - tifffile==2023.7.10
451
+ - timm==0.4.12
452
+ - tinycss2==1.2.1
453
+ - tokenizers==0.13.3
454
+ - toml==0.10.2
455
+ - tomli==2.0.1
456
+ - tomlkit==0.12.0
457
+ - toolz==0.12.0
458
+ - torch==2.0.1
459
+ - torchmetrics==1.0.0
460
+ - torchtext==0.4.0
461
+ - tornado==6.3.3
462
+ - transformers==4.33.3
463
+ - triton==2.0.0
464
+ - typer==0.9.0
465
+ - tzdata==2023.3
466
+ - tzlocal==4.3.1
467
+ - ujson==5.8.0
468
+ - uri-template==1.3.0
469
+ - uvicorn==0.22.0
470
+ - validators==0.20.0
471
+ - virtualenv==20.23.1
472
+ - waitress==2.1.2
473
+ - wandb==0.15.5
474
+ - wasabi==1.1.2
475
+ - watchdog==3.0.0
476
+ - weasel==0.3.4
477
+ - web-py==0.62
478
+ - webcolors==1.13
479
+ - webdataset==0.2.48
480
+ - webencodings==0.5.1
481
+ - websocket-client==1.6.1
482
+ - websockets==11.0.3
483
+ - werkzeug==3.0.0
484
+ - widgetsnbextension==4.0.9
485
+ - xxhash==3.2.0
486
+ - yacs==0.1.8
487
+ - yarl==1.9.2
488
+ - zc-lockfile==3.0.post1
489
+ - zipp==3.16.0
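
Note: an optional way to confirm the environment resolved as pinned above is to check a few package versions at runtime. This is only a sketch; the version strings simply mirror the pins in environment.yml.

```python
# Optional sanity check (sketch): verify a few pins from environment.yml
# in the active `reactxt` environment.
import torch, transformers, rdkit, pytorch_lightning

print('torch', torch.__version__)                          # pinned to 2.0.1
print('transformers', transformers.__version__)            # pinned to 4.33.3
print('pytorch_lightning', pytorch_lightning.__version__)  # pinned to 2.0.0
print('rdkit', rdkit.__version__)                          # pinned to 2023.3.3
```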
figures/frameworks.jpg ADDED

Git LFS Details

  • SHA256: f278b9619d545ab56442b98e4c43272ed086c595aba314c30f02a76150b6aa1c
  • Pointer size: 131 Bytes
  • Size of remote file: 257 kB
gin_pretrained/graphcl_80.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc8f685ace701ad71e3d82330fa78add25c573287ebc2908d9f7fddf13bc745f
3
+ size 7454162
graph_gen.ipynb ADDED
@@ -0,0 +1,190 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import torch\n",
10
+ "from torch_geometric.data import Data\n",
11
+ "from ogb.utils import smiles2graph\n",
12
+ "import os\n",
13
+ "import json\n",
14
+ "from rdkit import RDLogger\n",
15
+ "from rdkit import Chem\n",
16
+ "RDLogger.DisableLog('rdApp.*')\n",
17
+ "from tqdm import tqdm\n",
18
+ "import multiprocessing\n",
19
+ "\n",
20
+ "def write_json(data, filename):\n",
21
+ " with open(filename, 'w') as f:\n",
22
+ " json.dump(data, f, indent=4, ensure_ascii=False)\n",
23
+ "\n",
24
+ "def read_json(filename):\n",
25
+ " with open(filename, 'r') as f:\n",
26
+ " data = json.load(f)\n",
27
+ " return data\n",
28
+ "\n",
29
+ "def smiles2data(smiles):\n",
30
+ " graph = smiles2graph(smiles)\n",
31
+ " x = torch.from_numpy(graph['node_feat'])\n",
32
+ " edge_index = torch.from_numpy(graph['edge_index'], )\n",
33
+ " edge_attr = torch.from_numpy(graph['edge_feat'])\n",
34
+ " data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)\n",
35
+ " return data\n"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": null,
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "# make pretrain graphs\n",
45
+ "root = 'data/pretrain_data/'\n",
46
+ "mol_property_list = read_json(f'{root}/Abstract_property.json')\n",
47
+ "target_file = f'{root}/mol_graph_map.pt'\n",
48
+ "\n",
49
+ "if not os.path.exists(target_file):\n",
50
+ " mol_graph_map = {}\n",
51
+ " for mol_dict in tqdm(mol_property_list):\n",
52
+ " smiles = mol_dict['canon_smiles']\n",
53
+ " graph = smiles2data(smiles)\n",
54
+ " mol_graph_map[smiles] = graph\n",
55
+ " torch.save(mol_graph_map, target_file)"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": null,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "# make downstrem (action prediction) graphs\n",
65
+ "root = 'data/action_data'\n",
66
+ "target_file = f'{root}/mol_graph_map.pt'\n",
67
+ "\n",
68
+ "if not os.path.exists(target_file):\n",
69
+ " all_mols = set()\n",
70
+ " reaction_list = read_json(f'{root}/processed.json')\n",
71
+ " rxn_keys = ['REACTANT', 'PRODUCT', 'CATALYST', 'SOLVENT']\n",
72
+ "\n",
73
+ " for rxn in reaction_list:\n",
74
+ " for key in rxn_keys:\n",
75
+ " for mol in rxn[key]:\n",
76
+ " if mol in all_mols:\n",
77
+ " continue\n",
78
+ " all_mols.add(mol)\n",
79
+ " mol_graph_map={}\n",
80
+ "\n",
81
+ " for smiles in all_mols:\n",
82
+ " graph = smiles2data(smiles)\n",
83
+ " mol_graph_map[smiles] = graph\n",
84
+ " torch.save(mol_graph_map, target_file)"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": null,
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "# make downstream (retrosynthesis) graphs\n",
94
+ "root = 'data/synthesis_data'\n",
95
+ "\n",
96
+ "for folder in [\n",
97
+ " 'USPTO_50K_PtoR',\n",
98
+ " 'USPTO_50K_PtoR_aug20',\n",
99
+ " 'USPTO-MIT_PtoR_aug5',\n",
100
+ " 'USPTO-MIT_RtoP_aug5_mixed',\n",
101
+ " 'USPTO-MIT_RtoP_aug5_separated',\n",
102
+ " 'USPTO_full_pretrain_aug5_masked_token',\n",
103
+ " ]:\n",
104
+ " mol_graphid_file = f'{root}/{folder}/mol_graphid_map.json'\n",
105
+ " target_file = f'{root}/{folder}/idx_graph_map.pt'\n",
106
+ " if not os.path.exists(mol_graphid_file):\n",
107
+ " canon_idx_map = {}\n",
108
+ " mol_idx_map = {}\n",
109
+ " mol_set = set()\n",
110
+ " for mode in ['train', 'val', 'test']:\n",
111
+ " for file in ['src', 'tgt']:\n",
112
+ " if 'pretrain' in folder:\n",
113
+ " if file=='src':\n",
114
+ " continue\n",
115
+ " else:\n",
116
+ " if file=='tgt':\n",
117
+ " continue\n",
118
+ " file_path = f'{root}/{folder}/{mode}/{file}-{mode}.txt'\n",
119
+ " with open(file_path) as f:\n",
120
+ " lines = f.readlines()\n",
121
+ " for line in lines:\n",
122
+ " line = line.strip().replace(' ', '')\n",
123
+ " line = line.replace('<separated>', '.')\n",
124
+ " for smi in line.split('.'):\n",
125
+ " mol_set.add(smi)\n",
126
+ " smi_list = list(mol_set)\n",
127
+ " pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())\n",
128
+ " canon_list = pool.map(func=Chem.CanonSmiles,iterable=smi_list)\n",
129
+ " for smi, canon in zip(smi_list, canon_list):\n",
130
+ " if canon not in canon_idx_map:\n",
131
+ " canon_idx_map[canon] = len(canon_idx_map)\n",
132
+ " mol_idx_map[smi] = canon_idx_map[canon]\n",
133
+ " write_json(mol_idx_map, mol_graphid_file)\n",
134
+ " else:\n",
135
+ " mol_idx_map = read_json(mol_graphid_file)\n",
136
+ "\n",
137
+ " cid_graph_map = {}\n",
138
+ " for smiles, graph_id in mol_idx_map.items():\n",
139
+ " if graph_id in cid_graph_map:\n",
140
+ " continue\n",
141
+ " graph = smiles2data(smiles)\n",
142
+ " cid_graph_map[graph_id] = graph\n",
143
+ " torch.save(cid_graph_map, target_file)"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": 3,
149
+ "metadata": {},
150
+ "outputs": [],
151
+ "source": [
152
+ "# make downstream (retrosynthesis) graphs\n",
153
+ "root = 'data/ChEBI-20_data'\n",
154
+ "target_file = f'{root}/cid_graph_map.pt'\n",
155
+ "\n",
156
+ "cid_graph_map = {}\n",
157
+ "if not os.path.exists(target_file):\n",
158
+ " for mode in ['train', 'validation', 'test']:\n",
159
+ " with open(f'{root}/{mode}.txt') as f:\n",
160
+ " lines = f.readlines()\n",
161
+ " for line in lines[1:]:\n",
162
+ " cid, smiles, _ = line.strip().split('\\t', maxsplit=2)\n",
163
+ " graph = smiles2data(smiles)\n",
164
+ " cid_graph_map[cid] = graph\n",
165
+ " torch.save(cid_graph_map, target_file)"
166
+ ]
167
+ }
168
+ ],
169
+ "metadata": {
170
+ "kernelspec": {
171
+ "display_name": "pth20v3",
172
+ "language": "python",
173
+ "name": "python3"
174
+ },
175
+ "language_info": {
176
+ "codemirror_mode": {
177
+ "name": "ipython",
178
+ "version": 3
179
+ },
180
+ "file_extension": ".py",
181
+ "mimetype": "text/x-python",
182
+ "name": "python",
183
+ "nbconvert_exporter": "python",
184
+ "pygments_lexer": "ipython3",
185
+ "version": "3.8.17"
186
+ }
187
+ },
188
+ "nbformat": 4,
189
+ "nbformat_minor": 2
190
+ }
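
The notebook above caches molecule graphs to `.pt` files keyed by SMILES (or CID). A minimal usage sketch, assuming the pretrain cell has already been run so the cache file exists:

```python
# Minimal usage sketch (assumes the pretrain cell of graph_gen.ipynb has been
# run): load the SMILES -> PyG graph cache and inspect one entry.
import torch

mol_graph_map = torch.load('data/pretrain_data/mol_graph_map.pt')
smiles, graph = next(iter(mol_graph_map.items()))
print(smiles)
print(graph)  # a torch_geometric.data.Data object with x, edge_index, edge_attr
```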
lora_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "base_model_name_or_path": null,
3
+ "bias": "none",
4
+ "fan_in_fan_out": false,
5
+ "inference_mode": false,
6
+ "init_lora_weights": true,
7
+ "lora_alpha": 16,
8
+ "lora_dropout": 0.1,
9
+ "target_modules": ["q_proj", "v_proj", "k_proj", "out_proj", "fc1", "fc2"],
10
+ "peft_type": "LORA",
11
+ "r": 8,
12
+ "modules_to_save": null,
13
+ "task_type": "CAUSAL_LM"
14
+ }
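
`lora_config.json` mirrors the fields of peft's `LoraConfig`; `model/blip2_model.py` below rebuilds the config from such a file when `--peft_config` is set. A minimal sketch of that round trip:

```python
# Minimal sketch: rebuild a peft LoraConfig from lora_config.json, the same
# way blip2_model.py does when --peft_config is provided.
from peft import LoraConfig

peft_config = LoraConfig(**LoraConfig.from_json_file('lora_config.json'))
print(peft_config.r, peft_config.lora_alpha, peft_config.target_modules)
```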
main.py ADDED
@@ -0,0 +1,157 @@
1
+ import os
2
+ import torch
3
+ import argparse
4
+ import warnings
5
+ import pytorch_lightning as pl
6
+ from pytorch_lightning import Trainer, strategies
7
+ import pytorch_lightning.callbacks as plc
8
+ from pytorch_lightning.loggers import CSVLogger
9
+ from pytorch_lightning.callbacks import TQDMProgressBar
10
+ from data_provider.pretrain_dm import PretrainDM
11
+ from data_provider.tune_dm import TuneDM
12
+ from model.opt_flash_attention import replace_opt_attn_with_flash_attn
13
+ from model.blip2_model import Blip2Model
14
+ from model.dist_funs import MyDeepSpeedStrategy
15
+
16
+ ## for pyg bug
17
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
18
+ ## for A5000 gpus
19
+ torch.set_float32_matmul_precision('medium') # can be medium (bfloat16), high (tensorfloat32), highest (float32)
20
+
21
+ try:
22
+ class MyDDPSpawnStrategy(strategies.DDPSpawnStrategy):
23
+ def load_model_state_dict(self, checkpoint):
24
+ assert self.lightning_module is not None
25
+ self.lightning_module.load_state_dict(checkpoint["state_dict"], strict=False)
26
+ except:
27
+ pass
28
+
29
+ def main(args):
30
+ pl.seed_everything(args.seed)
31
+ # model
32
+ if args.init_checkpoint:
33
+ model = Blip2Model(args)
34
+ ckpt = torch.load(args.init_checkpoint, map_location='cpu')
35
+ model.load_state_dict(ckpt['state_dict'], strict=False)
36
+ print(f"loaded model from {args.init_checkpoint}")
37
+ else:
38
+ model = Blip2Model(args)
39
+
40
+ print('total params:', sum(p.numel() for p in model.parameters()))
41
+
42
+ if args.opt_model.find('galactica') >= 0 or args.opt_model.find('t5') >= 0:
43
+ tokenizer = model.blip2opt.opt_tokenizer
44
+ elif args.opt_model.find('llama') >= 0 or args.opt_model.find('vicuna') >= 0:
45
+ tokenizer = model.blip2opt.llm_tokenizer
46
+ else:
47
+ raise NotImplementedError
48
+ # data
49
+ if args.mode in {'pretrain', 'pretrain_eval'}:
50
+ dm = PretrainDM(
51
+ num_workers=args.num_workers,
52
+ batch_size=args.batch_size,
53
+ root=args.root,
54
+ text_max_len=args.text_max_len,
55
+ rxn_max_len=args.rxn_max_len,
56
+ smi_max_len=args.smi_max_len,
57
+ tokenizer=tokenizer,
58
+ args=args
59
+ )
60
+ elif args.mode in {'ft', 'eval'}:
61
+ dm = TuneDM(
62
+ num_workers=args.num_workers,
63
+ batch_size=args.batch_size,
64
+ root=args.root,
65
+ text_max_len=args.text_max_len,
66
+ rxn_max_len=args.rxn_max_len,
67
+ smi_max_len=args.smi_max_len,
68
+ tokenizer=tokenizer,
69
+ downstream_task=args.downstream_task,
70
+ args=args
71
+ )
72
+
73
+ callbacks = [TQDMProgressBar(refresh_rate=args.tqdm_interval)]
74
+ ## fixme save only used parameters
75
+ # callbacks.append(plc.ModelCheckpoint(dirpath="all_checkpoints/"+args.filename+"/", every_n_epochs=10, save_top_k=-1))
76
+ callbacks.append(plc.ModelCheckpoint(dirpath="all_checkpoints/"+args.filename+"/",
77
+ filename='{epoch:02d}',
78
+ every_n_epochs=args.save_every_n_epochs,
79
+ save_last=True,
80
+ save_top_k=-1,
81
+ save_on_train_epoch_end=True))
82
+ if len(args.devices.split(',')) > 1:
83
+ if args.strategy_name == 'fsdp':
84
+ strategy = strategies.DDPFullyShardedNativeStrategy()
85
+ elif args.strategy_name == 'deepspeed':
86
+ strategy = strategies.DeepSpeedStrategy(stage=3)
87
+ elif args.strategy_name == 'mydeepspeed':
88
+ strategy = MyDeepSpeedStrategy(stage=2)
89
+ else:
90
+ strategy = MyDDPSpawnStrategy(find_unused_parameters=True)
91
+ else:
92
+ strategy = None
93
+ args.devices = eval(args.devices)
94
+ logger = CSVLogger(save_dir=f'./all_checkpoints/{args.filename}/')
95
+ reload_freq = 1 if args.mode == 'pretrain' else 0
96
+ trainer = Trainer(
97
+ accelerator=args.accelerator,
98
+ devices=args.devices,
99
+ precision=args.precision,
100
+ max_epochs=args.max_epochs,
101
+ accumulate_grad_batches=args.accumulate_grad_batches,
102
+ check_val_every_n_epoch=args.check_val_every_n_epoch,
103
+ callbacks=callbacks,
104
+ strategy=strategy,
105
+ logger=logger,
106
+ reload_dataloaders_every_n_epochs=reload_freq
107
+ # limit_train_batches=100,
108
+ )
109
+
110
+ if args.mode in {'pretrain', 'ft'}:
111
+ trainer.fit(model, datamodule=dm, ckpt_path=args.ckpt_path)
112
+ elif args.mode in {'eval', 'pretrain_eval'}:
113
+ trainer.fit_loop.epoch_progress.current.completed = args.caption_eval_epoch - 1
114
+ trainer.validate(model, datamodule=dm)
115
+ # trainer.test(model, datamodule=dm)
116
+ else:
117
+ raise NotImplementedError()
118
+
119
+ def get_args():
120
+ parser = argparse.ArgumentParser()
121
+ parser.add_argument('--filename', type=str, default="main")
122
+ parser.add_argument('--seed', type=int, default=42, help='random seed')
123
+ # MM settings
124
+ parser.add_argument('--mode', type=str, default='pretrain', choices=['pretrain', 'ft', 'eval', 'pretrain_eval'])
125
+ parser.add_argument('--strategy_name', type=str, default='mydeepspeed')
126
+ parser.add_argument('--iupac_prediction', action='store_true', default=False)
127
+ parser.add_argument('--ckpt_path', type=str, default=None)
128
+ # parser = Trainer.add_argparse_args(parser)
129
+ parser = Blip2Model.add_model_specific_args(parser) # add model args
130
+ parser = PretrainDM.add_model_specific_args(parser)
131
+ parser.add_argument('--accelerator', type=str, default='gpu')
132
+ parser.add_argument('--devices', type=str, default='0,1,2,3')
133
+ parser.add_argument('--precision', type=str, default='bf16-mixed')
134
+ parser.add_argument('--downstream_task', type=str, default='action', choices=['action', 'synthesis', 'caption', 'chebi'])
135
+ parser.add_argument('--max_epochs', type=int, default=10)
136
+ parser.add_argument('--enable_flash', action='store_true', default=False)
137
+ parser.add_argument('--disable_graph_cache', action='store_true', default=False)
138
+ parser.add_argument('--predict_rxn_condition', action='store_true', default=False)
139
+ parser.add_argument('--generate_restrict_tokens', action='store_true', default=False)
140
+ parser.add_argument('--train_restrict_tokens', action='store_true', default=False)
141
+ parser.add_argument('--smiles_type', type=str, default='default', choices=['default', 'canonical', 'restricted', 'unrestricted', 'r_smiles'])
142
+ parser.add_argument('--accumulate_grad_batches', type=int, default=1)
143
+ parser.add_argument('--tqdm_interval', type=int, default=50)
144
+ parser.add_argument('--check_val_every_n_epoch', type=int, default=1)
145
+ args = parser.parse_args()
146
+
147
+ if args.enable_flash:
148
+ replace_opt_attn_with_flash_attn()
149
+ print("=========================================")
150
+ for k, v in sorted(vars(args).items()):
151
+ print(k, '=', v)
152
+ print("=========================================")
153
+ return args
154
+
155
+ if __name__ == '__main__':
156
+ main(get_args())
157
+
model/allowed_words.json ADDED
@@ -0,0 +1,118 @@
1
+ {
2
+ "30": "(",
3
+ "31": ")",
4
+ "57": "C",
5
+ "39": "1",
6
+ "89": "c",
7
+ "69": "O",
8
+ "40": "2",
9
+ "51": "=",
10
+ "2275": "CC",
11
+ "1030": "cc",
12
+ "19": "[START_SMILES]",
13
+ "20": "[END_SMILES]",
14
+ "221": "\n",
15
+ "68": "N",
16
+ "24552": "ccc",
17
+ "36": ".",
18
+ "81": "[",
19
+ "83": "]",
20
+ "41": "3",
21
+ "4162": "OC",
22
+ "60": "F",
23
+ "19321": "cccc",
24
+ "100": "n",
25
+ "2356": "Cl",
26
+ "11863": "nc",
27
+ "62": "H",
28
+ "35": "-",
29
+ "8183": "Br",
30
+ "6597": "NC",
31
+ "43888": "CCC",
32
+ "54": "@",
33
+ "29332": "@@",
34
+ "4015": "CN",
35
+ "42": "4",
36
+ "38985": "Nc",
37
+ "3027": "CO",
38
+ "73": "S",
39
+ "25": "#",
40
+ "33": "+",
41
+ "27312": "Oc",
42
+ "16288": "cn",
43
+ "8095": "nn",
44
+ "56": "B",
45
+ "1228": "sc",
46
+ "37": "/",
47
+ "63": "I",
48
+ "105": "s",
49
+ "408": "oc",
50
+ "1912": "SC",
51
+ "5965": "Si",
52
+ "43": "5",
53
+ "46183": "Cn",
54
+ "98": "l",
55
+ "101": "o",
56
+ "7662": "NS",
57
+ "5869": "NN",
58
+ "8314": "cs",
59
+ "5396": "CI",
60
+ "82": "\\",
61
+ "9835": "Sc",
62
+ "3470": "CS",
63
+ "32712": "Fc",
64
+ "2304": "OS",
65
+ "3330": "NO",
66
+ "6882": "FC",
67
+ "70": "P",
68
+ "13136": "Sn",
69
+ "12702": "Mg",
70
+ "3529": "no",
71
+ "2812": "co",
72
+ "14530": "SCC",
73
+ "6342": "rc",
74
+ "35011": "BrN",
75
+ "9677": "NH",
76
+ "283": "on",
77
+ "20938": "onc",
78
+ "37190": "COS",
79
+ "44": "6",
80
+ "17952": "OB",
81
+ "11004": "Zn",
82
+ "28819": "OO",
83
+ "2085": "ns",
84
+ "3696": "CP",
85
+ "5097": "CF",
86
+ "978": "con",
87
+ "6017": "non",
88
+ "34244": "CNS",
89
+ "4232": "occ",
90
+ "10907": "CON",
91
+ "8072": "Cu",
92
+ "13346": "CB",
93
+ "45": "7",
94
+ "16378": "sn",
95
+ "1513": "ON",
96
+ "46": "8",
97
+ "4939": "OP",
98
+ "6321": "SN",
99
+ "26505": "conc",
100
+ "6913": "Se",
101
+ "2636": "SS",
102
+ "422": "se",
103
+ "47": "9",
104
+ "48321": "SSC",
105
+ "47306": "SCN",
106
+ "15780": "CNN",
107
+ "48968": "OCI",
108
+ "27": "%",
109
+ "38": "0",
110
+ "6389": "FS",
111
+ "4864": "On",
112
+ "27133": "SCO",
113
+ "2001": "IC",
114
+ "0": "<s>",
115
+ "1": "<pad>",
116
+ "2": "</s>",
117
+ "3": "<unk>"
118
+ }
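
This file maps Galactica tokenizer ids to SMILES-level tokens, presumably backing the `--generate_restrict_tokens` / `--train_restrict_tokens` options. The exact mechanism is not shown in this commit; a hedged sketch of one way such an id list could constrain decoding with Hugging Face `generate`:

```python
# Hedged sketch (not necessarily the repo's actual mechanism): allow only the
# token ids listed in model/allowed_words.json at every decoding step, via the
# prefix_allowed_tokens_fn argument of transformers' generate().
import json

with open('model/allowed_words.json') as f:
    allowed_ids = [int(token_id) for token_id in json.load(f)]

def restrict_to_smiles_tokens(batch_id, input_ids):
    return allowed_ids  # same allowed set for every step and every sample

# outputs = opt_model.generate(..., prefix_allowed_tokens_fn=restrict_to_smiles_tokens)
```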
model/blip2.py ADDED
@@ -0,0 +1,126 @@
1
+ """
2
+ Copyright (c) 2023, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+ import contextlib
8
+ import logging
9
+ import os
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+
14
+ from lavis.common.dist_utils import download_cached_file
15
+ from lavis.common.utils import is_url
16
+ from lavis.models.base_model import BaseModel
17
+ from lavis.models.blip2_models.Qformer import BertConfig, BertLMHeadModel
18
+ from transformers import BertTokenizer
19
+ from model.gin_model import GNN
20
+
21
+
22
+
23
+ class Blip2Base(BaseModel):
24
+ @classmethod
25
+ def init_tokenizer(cls):
26
+ if True:
27
+ bert_name = 'allenai/scibert_scivocab_uncased'
28
+ else:
29
+ bert_name = 'bert_pretrained/'
30
+ tokenizer = BertTokenizer.from_pretrained(bert_name)
31
+ tokenizer.add_special_tokens({"bos_token": "[DEC]"})
32
+ return tokenizer
33
+
34
+ def maybe_autocast(self, dtype=torch.float16):
35
+ # if on cpu, don't use autocast
36
+ # if on gpu, use autocast with dtype if provided, otherwise use torch.float16
37
+ enable_autocast = self.device != torch.device("cpu")
38
+
39
+ if enable_autocast:
40
+ return torch.cuda.amp.autocast(dtype=dtype)
41
+ else:
42
+ return contextlib.nullcontext()
43
+
44
+ @classmethod
45
+ def init_Qformer(cls, model_name, num_query_token, graph_width, cross_attention_freq=2):
46
+ assert model_name == 'scibert'
47
+ print("bert load scibert")
48
+ if True:
49
+ bert_name = 'allenai/scibert_scivocab_uncased'
50
+ else:
51
+ bert_name = 'bert_pretrained/'
52
+
53
+
54
+ encoder_config = BertConfig.from_pretrained(bert_name)
55
+ encoder_config.encoder_width = graph_width
56
+ # insert cross-attention layer every other block
57
+ encoder_config.add_cross_attention = True
58
+ encoder_config.cross_attention_freq = cross_attention_freq
59
+ encoder_config.query_length = num_query_token
60
+
61
+ Qformer = BertLMHeadModel.from_pretrained(
62
+ bert_name, config=encoder_config
63
+ )
64
+ query_tokens = nn.Parameter(
65
+ torch.zeros(1, num_query_token, encoder_config.hidden_size)
66
+ )
67
+ query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range)
68
+ return Qformer, query_tokens
69
+
70
+
71
+ @classmethod
72
+ def init_graph_encoder(
73
+ cls, gin_num_layers, gin_hidden_dim, gin_drop_ratio):
74
+ graph_encoder = GNN(
75
+ num_layer=gin_num_layers,
76
+ emb_dim=gin_hidden_dim,
77
+ gnn_type='gin',
78
+ drop_ratio=gin_drop_ratio,
79
+ JK='last',
80
+ )
81
+ ckpt = torch.load('gin_pretrained/graphcl_80.pth', map_location=torch.device('cpu'))
82
+ missing_keys, unexpected_keys = graph_encoder.load_state_dict(ckpt, strict=False)
83
+ if len(missing_keys) or len(unexpected_keys):
84
+ print(missing_keys)
85
+ print(unexpected_keys)
86
+
87
+ ln_graph = LayerNorm(graph_encoder.num_features)
88
+
89
+ return graph_encoder, ln_graph
90
+
91
+ def load_from_pretrained(self, url_or_filename):
92
+ if is_url(url_or_filename):
93
+ cached_file = download_cached_file(
94
+ url_or_filename, check_hash=False, progress=True
95
+ )
96
+ checkpoint = torch.load(cached_file, map_location="cpu")
97
+ elif os.path.isfile(url_or_filename):
98
+ checkpoint = torch.load(url_or_filename, map_location="cpu")
99
+ else:
100
+ raise RuntimeError("checkpoint url or path is invalid")
101
+
102
+ state_dict = checkpoint["model"]
103
+
104
+ msg = self.load_state_dict(state_dict, strict=False)
105
+
106
+ # logging.info("Missing keys {}".format(msg.missing_keys))
107
+ logging.info("load checkpoint from %s" % url_or_filename)
108
+
109
+ return msg
110
+
111
+
112
+ def disabled_train(self, mode=True):
113
+ """Overwrite model.train with this function to make sure train/eval mode
114
+ does not change anymore."""
115
+ return self
116
+
117
+
118
+ class LayerNorm(nn.LayerNorm):
119
+ """Subclass torch's LayerNorm to handle fp16."""
120
+
121
+ def forward(self, x: torch.Tensor, mask=None):
122
+ orig_type = x.dtype
123
+ # ret = super().forward(x.type(torch.float32))
124
+ ret = super().forward(x)
125
+ return ret.type(orig_type)
126
+
model/blip2_llama.py ADDED
@@ -0,0 +1,266 @@
1
+ """
2
+ Copyright (c) 2023, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+ import logging
8
+ import torch
9
+ import torch.nn as nn
10
+ from torch.cuda.amp import autocast as autocast
11
+ from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType, PeftModel
12
+
13
+ from lavis.models.blip2_models.blip2 import (
14
+ # Blip2Base,
15
+ disabled_train,
16
+ )
17
+ from model.blip2 import Blip2Base
18
+ from transformers import LlamaTokenizer
19
+ from model.modeling_llama import LlamaForCausalLM
20
+
21
+
22
+
23
+ llama_model_list = [
24
+ "decapoda-research/llama-13b-hf",
25
+ "decapoda-research/llama-7b-hf",
26
+ ]
27
+
28
+ def mask_by_len(input, lens, fill_value=0):
29
+ '''
30
+ input: shape = [N, D]
31
+ lens: shape = [N]
32
+ '''
33
+ mask = torch.arange(input.shape[1], device=input.device).reshape(1, -1)
34
+ mask = mask < lens.reshape(-1, 1)
35
+ input[mask] = fill_value
36
+ return input
37
+
38
+ # @registry.register_model("blip2")
39
+ # @registry.register_model("blip2_feature_extractor")
40
+ class Blip2Llama(Blip2Base):
41
+ """
42
+ BLIP2 first-stage model with Q-former and ViT.
43
+ Supported model types:
44
+ - pretrained: pretrained model with vit-g
45
+ - pretrain_vitL: pretrained model with vit-large
46
+ - coco: finetuned model on coco
47
+ Usage:
48
+ >>> from lavis.models import load_model
49
+ >>> model = load_model("blip2", "pretrain")
50
+ """
51
+ def __init__(
52
+ self,
53
+ bert_name,
54
+ gin_num_layers,
55
+ gin_hidden_dim,
56
+ gin_drop_ratio,
57
+ tune_gnn=False,
58
+ num_query_token=32,
59
+ cross_attention_freq=2,
60
+ lora_tuning=False,
61
+ peft_dir='',
62
+ llm_model="decapoda-research/llama-7b-hf",
63
+ prompt="",
64
+ args=None,
65
+ ):
66
+ super().__init__()
67
+ self.graph_encoder, self.ln_graph = self.init_graph_encoder(gin_num_layers, gin_hidden_dim, gin_drop_ratio)
68
+ self.tune_gnn = tune_gnn
69
+ if not tune_gnn:
70
+ for name, param in self.graph_encoder.named_parameters():
71
+ param.requires_grad = False
72
+ self.graph_encoder = self.graph_encoder.eval()
73
+ self.graph_encoder.train = disabled_train
74
+ logging.info("freeze graph encoder")
75
+
76
+ self.Qformer, self.query_tokens = self.init_Qformer(bert_name, num_query_token, self.graph_encoder.num_features, cross_attention_freq)
77
+ ### remove the unused parameters
78
+ self.Qformer.cls = None
79
+ self.Qformer.bert.embeddings.word_embeddings = None
80
+ self.Qformer.bert.embeddings.position_embeddings = None
81
+ for layer in self.Qformer.bert.encoder.layer:
82
+ layer.output = None
83
+ layer.intermediate = None
84
+
85
+ ## initialize the LLaMA model
86
+ self.llm_tokenizer = LlamaTokenizer.from_pretrained(llm_model, use_fast=False, padding_side='right')
87
+ self.llm_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
88
+ self.llm_tokenizer.add_special_tokens({'bos_token': '</s>'})
89
+ self.llm_tokenizer.add_special_tokens({'eos_token': '</s>'})
90
+ self.llm_tokenizer.add_special_tokens({'unk_token': '</s>'})
91
+ self.llm_model = LlamaForCausalLM.from_pretrained(llm_model, torch_dtype=torch.bfloat16)
92
+ # self.llm_model = LlamaForCausalLM.from_pretrained(llm_model)
93
+ self.llm_model.resize_token_embeddings(len(self.llm_tokenizer))
94
+
95
+ self.lora_tuning = lora_tuning
96
+ if lora_tuning:
97
+ if peft_dir:
98
+ self.llm_model = PeftModel.from_pretrained(self.llm_model, peft_dir, is_trainable=True)
99
+ else:
100
+ peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
101
+ self.llm_model = get_peft_model(self.llm_model, peft_config)
102
+ self.llm_model.print_trainable_parameters()
103
+ else:
104
+ for name, param in self.llm_model.named_parameters():
105
+ param.requires_grad = False
106
+
107
+ ## fixme: this is different from the original BLIP2
108
+ self.eos_token_id = self.llm_tokenizer(
109
+ "\n", add_special_tokens=False
110
+ ).input_ids[0]
111
+ self.pad_token_id = self.llm_tokenizer.pad_token_id
112
+
113
+ self.llm_proj = nn.Linear(
114
+ self.Qformer.config.hidden_size, self.llm_model.config.hidden_size
115
+ )
116
+
117
+ ## fixme: no prompt yet
118
+ self.prompt = prompt
119
+ # prompt_tokens = self.opt_tokenizer(self.prompt, return_tensors="pt")
120
+ # self.prompt_length = prompt_tokens.attention_mask.sum(1)
121
+
122
+ def forward(self, batch):
123
+ graphs, text_tokens, prompt_lens = batch
124
+ graph_embeds, graph_masks = self.graph_encoder(graphs)
125
+ if not self.tune_gnn:
126
+ graph_embeds = graph_embeds.detach()
127
+ graph_embeds = self.ln_graph(graph_embeds, graph_masks)
128
+ device = graph_embeds.device
129
+ query_tokens = self.query_tokens.expand(graph_embeds.shape[0], -1, -1)
130
+ query_output = self.Qformer.bert(
131
+ query_embeds=query_tokens,
132
+ encoder_hidden_states=graph_embeds,
133
+ encoder_attention_mask=graph_masks, # fixme: check whether this mask is correct
134
+ return_dict=True,
135
+ )
136
+ inputs_llm = self.llm_proj(query_output.last_hidden_state)
137
+ atts_llm = torch.ones(inputs_llm.size()[:-1], dtype=torch.long).to(device)
138
+ targets = text_tokens.input_ids.masked_fill(
139
+ text_tokens.input_ids == self.llm_tokenizer.pad_token_id, -100
140
+ )
141
+ if self.prompt:
142
+ targets = mask_by_len(targets, prompt_lens, -100) # do not apply loss to the prompt
143
+ # targets[:, : self.prompt_length] = -100 # do not apply loss to the prompt
144
+
145
+ empty_targets = (
146
+ torch.ones(atts_llm.size(), dtype=torch.long).to(device).fill_(-100)
147
+ )
148
+ targets = torch.cat([empty_targets, targets], dim=1)
149
+ # if self.lora_tuning:
150
+ # inputs_embeds = self.llm_model.model.get_decoder().embed_tokens(text_tokens.input_ids)
151
+ # else:
152
+ # inputs_embeds = self.llm_model.model.decoder.embed_tokens(text_tokens.input_ids)
153
+ inputs_embeds = self.llm_model.get_input_embeddings()(text_tokens.input_ids)
154
+ inputs_embeds = torch.cat([inputs_llm, inputs_embeds], dim=1)
155
+ attention_mask = torch.cat([atts_llm, text_tokens.attention_mask], dim=1)
156
+
157
+ outputs = self.llm_model(
158
+ inputs_embeds=inputs_embeds,
159
+ attention_mask=attention_mask,
160
+ return_dict=True,
161
+ labels=targets,
162
+ # use_cache=False,
163
+ )
164
+ loss = outputs.loss
165
+ return {"loss": loss}
166
+
167
+ @torch.no_grad()
168
+ def generate(
169
+ self,
170
+ samples,
171
+ do_sample=False,
172
+ num_beams=5,
173
+ max_length=128,
174
+ min_length=1,
175
+ top_p=0.9,
176
+ repetition_penalty=1.0,
177
+ length_penalty=1.0,
178
+ num_captions=1,
179
+ temperature=1,
180
+ ):
181
+ """
182
+ Args:
183
+ samples (dict): A dictionary containing the following keys:
184
+ - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W)
185
+ num_beams (int): Number of beams for beam search. 1 means no beam search.
186
+ max_length (int): The maximum length of the sequence to be generated.
187
+ min_length (int): The minimum length of the sequence to be generated.
188
+ top_p (float): The cumulative probability for nucleus sampling.
189
+ repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty.
190
+ num_captions (int): Number of captions to be generated for each image.
191
+ Returns:
192
+ captions (list): A list of strings of length batch_size * num_captions.
193
+ """
194
+ graphs = samples['graphs']
195
+ prompt_tokens = samples['prompt_tokens']
196
+ # prompt_lens = samples['prompt_lens']
197
+ with self.maybe_autocast():
198
+ graph_embeds, graph_masks = self.graph_encoder(graphs)
199
+ graph_embeds = self.ln_graph(graph_embeds)
200
+
201
+ query_tokens = self.query_tokens.expand(graph_embeds.shape[0], -1, -1)
202
+ query_output = self.Qformer.bert(
203
+ query_embeds=query_tokens,
204
+ encoder_hidden_states=graph_embeds,
205
+ encoder_attention_mask=graph_masks,
206
+ return_dict=True,
207
+ )
208
+
209
+ device = graph_embeds.device
210
+ inputs_llm = self.llm_proj(query_output.last_hidden_state)
211
+ atts_llm = torch.ones(inputs_llm.size()[:-1], dtype=torch.long, device=device)
212
+
213
+ attention_mask = torch.cat([atts_llm, prompt_tokens.attention_mask], dim=1)
214
+
215
+ if False:
216
+ if do_sample:
217
+ query_embeds = inputs_llm.repeat_interleave(num_captions, dim=0)
218
+ num_beams = 1
219
+ else:
220
+ query_embeds = inputs_llm.repeat_interleave(num_beams, dim=0)
221
+
222
+ outputs = self.llm_model.generate(
223
+ input_ids=prompt_tokens.input_ids,
224
+ query_embeds=query_embeds,
225
+ attention_mask=attention_mask,
226
+ do_sample=do_sample,
227
+ top_p=top_p,
228
+ temperature=temperature,
229
+ num_beams=num_beams,
230
+ max_new_tokens=max_length,
231
+ min_length=min_length,
232
+ eos_token_id=self.eos_token_id,
233
+ repetition_penalty=repetition_penalty,
234
+ length_penalty=length_penalty,
235
+ num_return_sequences=num_captions,
236
+ )
237
+
238
+ prompt_length = prompt_tokens.input_ids.shape[1]
239
+ output_text = self.opt_tokenizer.batch_decode(
240
+ outputs[:, prompt_length:], skip_special_tokens=True
241
+ )
242
+ else:
243
+ inputs_embeds = self.llm_model.get_input_embeddings()(prompt_tokens.input_ids)
244
+ inputs_embeds = torch.cat([inputs_llm, inputs_embeds], dim=1)
245
+ attention_mask = torch.cat([atts_llm, prompt_tokens.attention_mask], dim=1)
246
+
247
+ outputs = self.llm_model.generate(
248
+ inputs_embeds=inputs_embeds,
249
+ attention_mask=attention_mask,
250
+ do_sample=do_sample,
251
+ top_p=top_p,
252
+ temperature=temperature,
253
+ num_beams=num_beams,
254
+ max_length=max_length,
255
+ min_length=min_length,
256
+ pad_token_id=self.pad_token_id,
257
+ eos_token_id=self.eos_token_id,
258
+ repetition_penalty=repetition_penalty,
259
+ length_penalty=length_penalty,
260
+ num_return_sequences=num_captions,
261
+ # use_cache=False,
262
+ )
263
+ # outputs[outputs == 0] = 2 # convert output id 0 to 2 (eos_token_id)
264
+ output_text = self.llm_tokenizer.batch_decode(outputs, skip_special_tokens=True)
265
+ output_text = [text.strip() for text in output_text]
266
+ return output_text
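
A standalone check of `mask_by_len` from the file above (restated here): positions before each row's prompt length are overwritten, which is how `forward()` sets prompt targets to -100 so they are excluded from the language-modeling loss.

```python
# Standalone check of mask_by_len from model/blip2_llama.py.
import torch

def mask_by_len(input, lens, fill_value=0):
    mask = torch.arange(input.shape[1], device=input.device).reshape(1, -1)
    mask = mask < lens.reshape(-1, 1)
    input[mask] = fill_value
    return input

targets = torch.arange(12).reshape(2, 6)
prompt_lens = torch.tensor([2, 4])
print(mask_by_len(targets.clone(), prompt_lens, -100))
# row 0: first 2 entries become -100; row 1: first 4 entries become -100
```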
model/blip2_model.py ADDED
@@ -0,0 +1,381 @@
1
+ import os
2
+ from typing import Any, Dict
3
+ import torch
4
+ from model.blip2_opt import Blip2OPT
5
+ from model.blip2_llama import Blip2Llama
6
+ from model.blip2_t5 import Blip2T5
7
+ import pytorch_lightning as pl
8
+ from torch import optim
9
+ from lavis.common.optims import LinearWarmupCosineLRScheduler, LinearWarmupStepLRScheduler
10
+ import json
11
+ from model.opt_flash_attention import replace_opt_attn_with_flash_attn, replace_opt_attn_with_original_attn
12
+ import torch.distributed as dist
13
+ from peft import LoraConfig, TaskType
14
+ from model.help_funcs import caption_evaluate, AttrDict
15
+ from transformers import Adafactor
16
+ from torch_ema import ExponentialMovingAverage
17
+
18
+ def load_ignore_unexpected(model, state_dict):
19
+ keys = set(model.state_dict().keys())
20
+ state_dict = {k: v for k, v in state_dict.items() if k in keys}
21
+
22
+ ## try to print keys that are not included
23
+ model.load_state_dict(state_dict, strict=True)
24
+
25
+
26
+ # def load_ignore_mismatch(model, state_dict):
27
+ # keys = set(model.state_dict().keys())
28
+ # extra_keys = set()
29
+ # for key in state_dict:
30
+ # if key not in keys:
31
+ # extra_keys.add(key)
32
+ # missing_keys = set()
33
+ # for key in keys:
34
+ # if key not in state_dict:
35
+ # missing_keys.add(key)
36
+ # ## try to print keys that are not included
37
+ # model.load_state_dict(state_dict, strict=False)
38
+
39
+
40
+ def get_module_state_dict(state_dict, module_name):
41
+ module_state_dict = {}
42
+ for key, value in state_dict.items():
43
+ if key.startswith(module_name):
44
+ key = key[len(module_name) + 1:]
45
+ if key == '':
46
+ return value
47
+ module_state_dict[key] = value
48
+ return module_state_dict
49
+ # peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
50
+ class Blip2Model(pl.LightningModule):
51
+ def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
52
+ if self.llm_tune != 'full':
53
+ to_be_removed = []
54
+ for key in checkpoint['state_dict']:
55
+ if key.startswith('blip2opt.opt_model') or key.startswith('blip2opt.llm_model'):
56
+ to_be_removed.append(key)
57
+ for key in to_be_removed:
58
+ checkpoint['state_dict'].pop(key)
59
+ if isinstance(self.args.save_every_n_epochs, int) and self.args.save_every_n_epochs > 0:
60
+ if self.llm_tune == 'lora' and (self.current_epoch + 1) % self.args.save_every_n_epochs == 0:
61
+ if self.local_rank == 0: # manually fix a bug in peft module
62
+ if self.args.peft_config:
63
+ peft_config = LoraConfig(**LoraConfig.from_json_file(self.args.peft_config))
64
+ else:
65
+ peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=self.args.lora_r, lora_alpha=self.args.lora_alpha, lora_dropout=self.args.lora_dropout)
66
+ if hasattr(self.blip2opt, 'opt_model'):
67
+ self.blip2opt.opt_model.peft_config['default'] = peft_config
68
+ self.blip2opt.opt_model.save_pretrained(os.path.join(self.logger.save_dir, f'lora_epoch_{self.current_epoch}'))
69
+ elif hasattr(self.blip2opt, 'llm_model'):
70
+ self.blip2opt.llm_model.peft_config['default'] = peft_config
71
+ self.blip2opt.llm_model.save_pretrained(os.path.join(self.logger.save_dir, f'lora_epoch_{self.current_epoch}'))
72
+ return super().on_save_checkpoint(checkpoint)
73
+
74
+ def __init__(self, args):
75
+ super().__init__()
76
+ if isinstance(args, dict):
77
+ args = AttrDict(**args)
78
+
79
+ self.args = args
80
+ if not hasattr(args, 'do_sample'):
81
+ args.do_sample = False
82
+ self.caption_eval_epoch = args.caption_eval_epoch
83
+ self.do_sample = args.do_sample
84
+ self.num_beams = args.num_beams
85
+ self.max_inference_len = args.max_inference_len
86
+ self.min_inference_len = args.min_inference_len
87
+ self.num_generate_captions = args.num_generate_captions
88
+ self.reaction_weight = args.reaction_weight
89
+ self.llm_tune = args.llm_tune
90
+ self.enable_flash = args.enable_flash
91
+ if args.opt_model.find('galactica') >= 0:
92
+ self.blip2opt = Blip2OPT(args.bert_name, args.gin_num_layers, args.gin_hidden_dim, args.drop_ratio, args.tune_gnn, not args.not_tune_qformer, args.num_query_token, args.cross_attention_freq, args.llm_tune, args.peft_dir, args.opt_model, args.prompt, args)
93
+ elif args.opt_model.find('llama') >= 0 or args.opt_model.find('vicuna') >= 0:
94
+ self.blip2opt = Blip2Llama(args.bert_name, args.gin_num_layers, args.gin_hidden_dim, args.drop_ratio, args.tune_gnn, args.num_query_token, args.cross_attention_freq, args.llm_tune, args.peft_dir, args.opt_model, args.prompt, args)
95
+ elif args.opt_model.find('t5') >= 0:
96
+ self.blip2opt = Blip2T5(args.bert_name, args.gin_num_layers, args.gin_hidden_dim, args.drop_ratio, args.tune_gnn, args.num_query_token, args.cross_attention_freq, args.llm_tune, args.peft_dir, args.opt_model, args.prompt, args)
97
+ else:
98
+ raise NotImplementedError()
99
+ self.tokenizer = self.blip2opt.init_tokenizer()
100
+ self.mode = args.mode
101
+ self.downstream_task = args.downstream_task
102
+ self.save_hyperparameters(args)
103
+ self.save_ema_checkpoint = args.save_ema_checkpoint
104
+ if self.save_ema_checkpoint:
105
+ self.ema = ExponentialMovingAverage(self.parameters(), 0.99)
106
+ self.save_on_steps = args.save_on_steps
107
+
108
+ def load_from_stage1_checkpoint(self, path):
109
+ ckpt = torch.load(path, map_location='cpu')
110
+ state_dict = ckpt['state_dict']
111
+ graph_encoder_dict = get_module_state_dict(state_dict, 'blip2qformer.graph_encoder')
112
+ qformer_dict = get_module_state_dict(state_dict, 'blip2qformer.Qformer')
113
+ ln_graph_dict = get_module_state_dict(state_dict, 'blip2qformer.ln_graph')
114
+ qs_weight = get_module_state_dict(state_dict, 'blip2qformer.query_tokens')
115
+ load_ignore_unexpected(self.blip2opt.Qformer, qformer_dict)
116
+ self.blip2opt.graph_encoder.load_state_dict(graph_encoder_dict)
117
+ self.blip2opt.ln_graph.load_state_dict(ln_graph_dict)
118
+ self.blip2opt.query_tokens.data.copy_(qs_weight)
119
+ return self
120
+
121
+ # def load_from_stage1_checkpoint(self, path):
122
+ # ckpt = torch.load(path, map_location='cpu')
123
+ # state_dict = ckpt['state_dict']
124
+ # state_dict = {k[13:]: v for k,v in state_dict.items()}
125
+ # load_ignore_mismatch(self.blip2opt, state_dict)
126
+ # return self
127
+
128
+ def configure_optimizers(self):
129
+ if self.args.optimizer == 'adafactor':
130
+ print('Using adafactor optimizer')
131
+ optimizer = Adafactor(
132
+ self.parameters(),
133
+ lr=1e-3,
134
+ relative_step=False,
135
+ scale_parameter=False,
136
+ warmup_init=False
137
+ )
138
+ self.scheduler = None
139
+ else:
140
+ self.trainer.fit_loop.setup_data()
141
+ # self.trainer.reset_train_dataloader()
142
+ warmup_steps = min(len(self.trainer.train_dataloader), self.args.warmup_steps)
143
+ optimizer = optim.AdamW(self.parameters(), lr=self.args.init_lr, weight_decay=self.args.weight_decay)
144
+ if self.args.scheduler == 'linear_warmup_cosine_lr':
145
+ self.scheduler = LinearWarmupCosineLRScheduler(optimizer, self.args.max_epochs, self.args.min_lr, self.args.init_lr, warmup_steps, self.args.warmup_lr)
146
+ elif self.args.scheduler == 'linear_warmup_step_lr':
147
+ self.scheduler = LinearWarmupStepLRScheduler(optimizer, self.args.max_epochs, self.args.min_lr, self.args.init_lr, self.args.lr_decay_rate, self.args.warmup_lr, warmup_steps)
148
+ elif self.args.scheduler == 'None':
149
+ self.scheduler = None
150
+ else:
151
+ raise NotImplementedError()
152
+ return optimizer
153
+
154
+ def test_epoch_end(self, outputs):
155
+ print('test epoch end')
156
+ list_ids, list_predictions, list_targets = zip(*outputs)
157
+ predictions = [i for ii in list_predictions for i in ii]
158
+ targets = [i for ii in list_targets for i in ii]
159
+
160
+ all_ids = [None for _ in range(self.trainer.world_size)]
161
+ all_predictions = [None for _ in range(self.trainer.world_size)]
162
+ all_targets = [None for _ in range(self.trainer.world_size)]
163
+
164
+ dist.all_gather_object(all_ids, list_ids)
165
+ dist.all_gather_object(all_predictions, predictions)
166
+ dist.all_gather_object(all_targets, targets)
167
+ print(len(all_ids), len(all_predictions), len(all_targets))
168
+ if self.global_rank == 0:
169
+ print(f'saving predictions to {self.logger.log_dir}')
170
+
171
+ all_predictions = [i for ii in all_predictions for i in ii]
172
+ all_targets = [i for ii in all_targets for i in ii]
173
+ self.save_predictions(all_ids, all_predictions, all_targets)
174
+ ## fixme: I am not sure if the max length is the same as previous experiments
175
+ bleu2, bleu4, rouge_1, rouge_2, rouge_l, meteor_score = \
176
+ caption_evaluate(all_predictions, all_targets, self.tokenizer, self.max_inference_len * 2)
177
+ self.log("bleu2", bleu2, sync_dist=False)
178
+ self.log("bleu4", bleu4, sync_dist=False)
179
+ self.log("rouge_1", rouge_1, sync_dist=False)
180
+ self.log("rouge_2", rouge_2, sync_dist=False)
181
+ self.log("rouge_l", rouge_l, sync_dist=False)
182
+ self.log("meteor_score", meteor_score, sync_dist=False)
183
+
184
+ def save_predictions(self, rxn_ids, predictions, targets):
185
+ assert False
186
+ assert len(rxn_ids) == len(targets)
187
+ assert len(predictions) == len(targets)
188
+ with open(os.path.join(self.logger.log_dir, 'predictions.txt'), 'w', encoding='utf8') as f:
189
+ for i, p, t in zip(rxn_ids, predictions, targets):
190
+ line = {'index': i, 'prediction': p, 'target': t}
191
+ f.write(json.dumps(line, ensure_ascii=False) + '\n')
192
+
193
+ @torch.no_grad()
194
+ def test_step(self, batch, batch_idx):
195
+ assert False
196
+
197
+ def gather_dict_results(self, dict_list):
198
+ list_of_dict_list = [None for _ in range(self.trainer.world_size)]
199
+ dist.all_gather_object(list_of_dict_list, dict_list)
200
+ dict_list = [i for ii in list_of_dict_list for i in ii] ## dict list, each dict has values that are lists of predictions, etc.
201
+ keys = dict_list[0].keys()
202
+ gathered_dict = {} # each value is a list of predictions, etc.
203
+ for key in keys:
204
+ gathered_dict[key] = [i for d in dict_list for i in d[key]]
205
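+ # When multiple captions are sampled per input, regroup the flat prediction list into per-sample lists of length num_generate_captions.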
+ if self.num_generate_captions>1:
206
+ M = self.num_generate_captions
207
+ N = len(gathered_dict['index'])
208
+ assert len(gathered_dict['predictions'])==N*M
209
+ gathered_dict['predictions'] = [
210
+ gathered_dict['predictions'][i * M:(i + 1) * M]
211
+ for i in range(N)
212
+ ]
213
+ dict_list = []
214
+ for i in range(len(gathered_dict['predictions'])):
215
+ d = {k:gathered_dict[k][i] for k in keys}
216
+ dict_list.append(d)
217
+ return dict_list
218
+
219
+ def save_results(self, dict_list, log_prefix=""):
220
+ if log_prefix:
221
+ name = f'{log_prefix}_predictions.txt'
222
+ else:
223
+ name = 'predictions.txt'
224
+ with open(os.path.join(self.logger.log_dir, name), 'w', encoding='utf8') as f:
225
+ for i in range(len(dict_list)):
226
+ f.write(json.dumps(dict_list[i], ensure_ascii=True) + '\n')
227
+
228
+ def on_validation_epoch_start(self):
229
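+ # Swap back to the original OPT attention for generation; flash attention is restored in on_validation_epoch_end.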
+ if self.enable_flash:
230
+ replace_opt_attn_with_original_attn()
231
+ self.saved_dict_list = []
232
+
233
+ def on_validation_epoch_end(self):
234
+ if self.enable_flash:
235
+ replace_opt_attn_with_flash_attn()
236
+ if (self.current_epoch+1) % self.caption_eval_epoch != 0:
237
+ return
238
+ result_list = self.gather_dict_results(self.saved_dict_list)
239
+ ## empty cache
240
+ self.saved_dict_list = []
241
+ if self.global_rank == 0:
242
+ self.save_results(result_list, 'epoch_{}'.format(self.current_epoch))
243
+ if self.downstream_task == 'synthesis':
244
+ return
245
+ all_predictions = [i['predictions'] for i in result_list]
246
+ all_targets = [i['targets'] for i in result_list]
247
+ bleu2, bleu4, rouge_1, rouge_2, rouge_l, meteor_score = \
248
+ caption_evaluate(all_predictions, all_targets, self.tokenizer, self.max_inference_len * 2)
249
+ self.log("bleu2", bleu2, sync_dist=False)
250
+ self.log("bleu4", bleu4, sync_dist=False)
251
+ self.log("rouge_1", rouge_1, sync_dist=False)
252
+ self.log("rouge_2", rouge_2, sync_dist=False)
253
+ self.log("rouge_l", rouge_l, sync_dist=False)
254
+ self.log("meteor_score", meteor_score, sync_dist=False)
255
+
256
+ @torch.no_grad()
257
+ def validation_step(self, batch, batch_idx, dataloader_idx=1):
258
+ if dataloader_idx == 0:
259
+ return
260
+ elif dataloader_idx == 1:
261
+ if (self.current_epoch+1) % self.caption_eval_epoch != 0:
262
+ return
263
+ rxn_ids, graphs, prompt_tokens, texts, inputs = batch
264
+ ###============== Captioning Results ===================###
265
+ samples = {'graphs': graphs, 'prompt_tokens': prompt_tokens}
266
+ if self.mode in {'ft', 'eval', 'pretrain_eval'}:
267
+ predictions = self.blip2opt.generate(
268
+ samples,
269
+ do_sample=self.do_sample,
270
+ num_beams=self.num_beams,
271
+ max_length=self.max_inference_len,
272
+ min_length=self.min_inference_len,
273
+ num_captions=self.num_generate_captions,
274
+ use_graph=not self.args.disable_graphs
275
+ )
276
+ else:
277
+ raise NotImplementedError()
278
+ self.saved_dict_list.append({
279
+ 'index': rxn_ids,
280
+ 'input': inputs,
281
+ 'predictions': predictions,
282
+ 'targets': texts
283
+ })
284
+ else:
285
+ raise NotImplementedError
286
+
287
+ def on_train_start(self):
288
+ if hasattr(self, 'ema'):
289
+ self.ema.to(self.device)
290
+
291
+ def on_before_zero_grad(self, *args, **kwargs):
292
+ if self.save_ema_checkpoint:
293
+ if self.trainer.global_step % 100 == 0:
294
+ self.ema.update(self.parameters())
295
+ if self.trainer.global_step in self.save_on_steps:
296
+ checkpoint_path = os.path.join(f"all_checkpoints/{self.args.filename}/", f'step{self.trainer.global_step}.ckpt')
297
+ self.trainer.save_checkpoint(checkpoint_path)
298
+
299
+ def on_train_epoch_end(self):
300
+ save_every_n_epochs = self.args.save_every_n_epochs if self.args.save_every_n_epochs > 0 else self.args.max_epochs
301
+ if (self.current_epoch + 1) % save_every_n_epochs != 0:
302
+ return
303
+ if self.save_ema_checkpoint:
304
+ with self.ema.average_parameters():
305
+ checkpoint_path = os.path.join(f"all_checkpoints/{self.args.filename}/", f'ema_epoch{self.current_epoch}.ckpt')
306
+ self.trainer.save_checkpoint(checkpoint_path)
307
+
308
+ def training_step(self, batch, batch_idx):
309
+ if self.scheduler:
310
+ self.scheduler.step(self.trainer.current_epoch, self.trainer.global_step)
311
+
312
+ batch_size = batch[-1].input_ids.size(0)
313
+ ###============== Overall Loss ===================###
314
+ if self.mode == 'ft':
315
+ loss = self.blip2opt.forward_action(batch, use_gragh=not self.args.disable_graphs)
316
+ elif self.mode == 'pretrain':
317
+ loss = self.blip2opt.forward_abstract(batch, use_gragh=not self.args.disable_graphs)
318
+ else:
319
+ raise NotImplementedError()
320
+ self.log("molecule loss", float(loss['loss']), batch_size=batch_size, sync_dist=True, prog_bar=True)
321
+ self.log("lr", self.trainer.optimizers[0].param_groups[0]['lr'], batch_size=batch_size, sync_dist=True, prog_bar=True)
322
+ return loss['loss']
323
+
324
+ @staticmethod
325
+ def add_model_specific_args(parent_parser):
326
+ parser = parent_parser.add_argument_group("GINSimclr")
327
+ # train mode
328
+ # GIN
329
+ parser.add_argument('--gin_hidden_dim', type=int, default=300)
330
+ parser.add_argument('--gin_num_layers', type=int, default=5)
331
+ parser.add_argument('--drop_ratio', type=float, default=0.0)
332
+ parser.add_argument('--tune_gnn', action='store_true', default=False)
333
+ parser.add_argument('--not_tune_qformer', action='store_true', default=False)
334
+ parser.add_argument('--disable_graphs', action='store_true', default=False)
335
+ # Bert
336
+ parser.add_argument('--bert_hidden_dim', type=int, default=2048, help='')
337
+ parser.add_argument('--bert_name', type=str, default='scibert')
338
+ parser.add_argument('--cross_attention_freq', type=int, default=2)
339
+ parser.add_argument('--num_query_token', type=int, default=8)
340
+ # OPT
341
+ parser.add_argument('--opt_model', type=str, default="facebook/galactica-1.3b")
342
+ # parser.add_argument('--prompt', type=str, default='a molecule of ')
343
+ parser.add_argument('--num_beams', type=int, default=5)
344
+ parser.add_argument('--do_sample', action='store_true', default=False)
345
+ parser.add_argument('--max_inference_len', type=int, default=512)
346
+ parser.add_argument('--min_inference_len', type=int, default=8)
347
+ parser.add_argument('--llm_tune', type=str, default='freeze')
348
+ parser.add_argument('--peft_config', type=str, default=None)
349
+ parser.add_argument('--peft_dir', type=str, default='')
350
+
351
+ parser.add_argument('--save_every_n_epochs', type=int, default=0)
352
+ ## quantization
353
+ parser.add_argument('--load_in_8bit', action='store_true', default=False)
354
+
355
+ ## lora config
356
+ parser.add_argument('--lora_r', type=int, default=8)
357
+ parser.add_argument('--lora_alpha', type=int, default=32)
358
+ parser.add_argument('--lora_dropout', type=float, default=0.1)
359
+
360
+ # optimization
361
+ parser.add_argument('--reaction_weight', type=float, default=1.0)
362
+ parser.add_argument('--weight_decay', type=float, default=0.05, help='optimizer weight decay')
363
+ parser.add_argument('--init_lr', type=float, default=1e-4, help='optimizer init learning rate')
364
+ parser.add_argument('--min_lr', type=float, default=1e-5, help='optimizer min learning rate')
365
+ parser.add_argument('--warmup_lr', type=float, default=1e-6, help='optimizer warmup learning rate')
366
+ parser.add_argument('--warmup_steps', type=int, default=1000, help='optimizer warmup steps')
367
+ parser.add_argument('--lr_decay_rate', type=float, default=0.9, help='optimizer lr decay rate')
368
+ parser.add_argument('--scheduler', type=str, default='linear_warmup_cosine_lr', help='type of scheduler') # or linear_warmup_step_lr
369
+ parser.add_argument('--optimizer', type=str, default='adamw', help='type of optimizer')
370
+ parser.add_argument('--init_checkpoint', type=str, default='')
371
+ parser.add_argument('--caption_eval_epoch', type=int, default=10)
372
+ parser.add_argument('--num_generate_captions', type=int, default=1)
373
+
374
+ # OPT Config
375
+ parser.add_argument('--optconfig_attention_dropout', type=float, default=0.0)
376
+ parser.add_argument('--optconfig_dropout', type=float, default=0.0)
377
+
378
+ # others
379
+ parser.add_argument('--save_ema_checkpoint', action='store_true', default=False)
380
+ parser.add_argument('--save_on_steps', default=[], nargs='+', type=int)
381
+ return parent_parser
model/blip2_opt.py ADDED
@@ -0,0 +1,417 @@
1
+ """
2
+ Copyright (c) 2023, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+ import logging
8
+ import torch
9
+ import torch.nn as nn
10
+ from torch.cuda.amp import autocast as autocast
11
+ from torch.nn import functional as F
12
+ from torch.nn import CrossEntropyLoss
13
+ from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType, PeftModel
14
+ from ogb.utils import smiles2graph
15
+ from torch_geometric.loader.dataloader import Collater
16
+ from torch_geometric.data import Data
17
+ import numpy as np
18
+ from lavis.models.blip2_models.blip2 import (
19
+ # Blip2Base,
20
+ disabled_train,
21
+ )
22
+ from model.blip2 import Blip2Base
23
+ from model.help_funcs import get_not_allowed_tokens_ids
24
+ from transformers import AutoTokenizer
25
+ from transformers import OPTForCausalLM, OPTConfig
26
+ # from opendelta import LoraModel
27
+ # from opendelta.delta_models.lora import LoraConfig
28
+ # from opendelta.delta_configs
29
+
30
+ opt_model_list = [
31
+ "facebook/galactica-125m",
32
+ "facebook/galactica-1.3b",
33
+ "facebook/galactica-6.7b",
34
+ "facebook/galactica-30b",
35
+ ]
36
+
37
+ def mask_by_len(input, lens, fill_value=0):
38
+ '''
39
+ input: shape = [N, D]
40
+ lens: shape = [N]
41
+ '''
42
+ mask = torch.arange(input.shape[1], device=input.device).reshape(1, -1)
43
+ mask = mask < lens.reshape(-1, 1)
44
+ input[mask] = fill_value
45
+ return input
46
+
47
+
48
+ def smiles2data(smiles):
49
+ graph = smiles2graph(smiles)
50
+ x = torch.from_numpy(graph['node_feat'])
51
+ edge_index = torch.from_numpy(graph['edge_index'], )
52
+ edge_attr = torch.from_numpy(graph['edge_feat'])
53
+ data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
54
+ return data
55
+
56
+ import re
57
+ SPLIT_MARKER = f"SPL{1}T-TH{1}S-Pl3A5E"
58
+
59
+ CUSTOM_SEQ_RE = re.compile(r"(\[START_(DNA|SMILES|I_SMILES|AMINO)])(.*?)(\[END_\2])")
60
+
61
+
62
+ def _insert_split_marker(m: re.Match):
63
+ """
64
+ Applies split marker based on a regex match of special tokens such as
65
+ [START_DNA].
66
+
67
+ Parameters
68
+ ----------
69
+ n : str
70
+ Input text to split
71
+
72
+ Returns
73
+ ----------
74
+ str - the text with the split token added
75
+ """
76
+ start_token, _, sequence, end_token = m.groups()
77
+ sequence = re.sub(r"(.)", fr"{SPLIT_MARKER}\1", sequence, flags=re.DOTALL)
78
+ return f"{start_token}{sequence}{SPLIT_MARKER}{end_token}"
79
+
80
+ def escape_custom_split_sequence(text):
81
+ """
82
+ Applies custom splitting to the text for GALILEO's tokenization
83
+
84
+ Parameters
85
+ ----------
86
+ text : str
87
+ Input text to split
88
+
89
+ Returns
90
+ ----------
91
+ str - the text with the split token added
92
+ """
93
+ return CUSTOM_SEQ_RE.sub(_insert_split_marker, text)
94
+
95
+ def smiles_handler(text, mol_ph):
96
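+ # Collect the SMILES wrapped in [START_*]/[END_*] tags and append the molecule placeholder string right after each tagged span.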
+ smiles_list = []
97
+ for match in CUSTOM_SEQ_RE.finditer(text):
98
+ smiles = match.group(3)
99
+ smiles_list.append(smiles)
100
+
101
+ text = CUSTOM_SEQ_RE.sub(r'\1\3\4%s' % (mol_ph), text)
102
+ text = escape_custom_split_sequence(text)
103
+ return text, smiles_list
104
+
105
+
106
+ class Blip2OPT(Blip2Base):
107
+ """
108
+ BLIP2 first-stage model with Q-former and ViT.
109
+ Supported model types:
110
+ - pretrained: pretrained model with vit-g
111
+ - pretrain_vitL: pretrained model with vit-large
112
+ - coco: finetuned model on coco
113
+ Usage:
114
+ >>> from lavis.models import load_model
115
+ >>> model = load_model("blip2", "pretrain")
116
+ """
117
+ def __init__(
118
+ self,
119
+ bert_name,
120
+ gin_num_layers,
121
+ gin_hidden_dim,
122
+ gin_drop_ratio,
123
+ tune_gnn=False,
124
+ tune_qformer=False,
125
+ num_query_token=32,
126
+ cross_attention_freq=2,
127
+ llm_tune='freeze',
128
+ peft_dir='',
129
+ opt_model="facebook/galactica-1.3b",
130
+ prompt="",
131
+ args=None,
132
+ ):
133
+ super().__init__()
134
+ self.args = args
135
+
136
+ self.graph_encoder, self.ln_graph = self.init_graph_encoder(gin_num_layers, gin_hidden_dim, gin_drop_ratio)
137
+ self.tune_gnn = tune_gnn
138
+ self.tune_qformer = tune_qformer
139
+ if not tune_gnn:
140
+ for name, param in self.graph_encoder.named_parameters():
141
+ param.requires_grad = False
142
+ self.graph_encoder = self.graph_encoder.eval()
143
+ self.graph_encoder.train = disabled_train
144
+ logging.info("freeze graph encoder")
145
+ else:
146
+ logging.info("tune graph encoder")
147
+
148
+ self.num_query_token = num_query_token
149
+ self.Qformer, self.query_tokens = self.init_Qformer(bert_name, num_query_token, self.graph_encoder.num_features, cross_attention_freq)
150
+ if not tune_qformer:
151
+ for name, param in self.Qformer.named_parameters():
152
+ param.requires_grad = False
153
+ self.Qformer = self.Qformer.eval()
154
+ self.Qformer.train = disabled_train
155
+ self.query_tokens.requires_grad = False
156
+ logging.info("freeze qformer encoder")
157
+ else:
158
+ logging.info("tune qformer encoder")
159
+ ### remove the unused parameters
160
+ self.Qformer.cls = None
161
+ self.Qformer.bert.embeddings.word_embeddings = None
162
+ self.Qformer.bert.embeddings.position_embeddings = None
163
+ for layer in self.Qformer.bert.encoder.layer:
164
+ layer.output = None
165
+ layer.intermediate = None
166
+
167
+ opt_config_params = {k[len("optconfig_"):]: v for k, v in vars(args).items() if k.startswith("optconfig_")}
168
+ config = OPTConfig.from_pretrained(opt_model, **opt_config_params)
169
+ ## initialize opt model
170
+ self.opt_tokenizer = AutoTokenizer.from_pretrained(opt_model, use_fast=False, padding_side='right')
171
+ self.opt_tokenizer.add_special_tokens({'pad_token': '<pad>'})
172
+ self.opt_tokenizer.add_tokens('<mol>') # molecule placeholder
173
+ self.mol_token = '<mol>'
174
+ self.opt_tokenizer.mol_token_id = self.opt_tokenizer("<mol>", add_special_tokens=False).input_ids[0]
175
+
176
+ self.collater = Collater([], [])
177
+
178
+ if opt_model == 'facebook/galactica-125m':
179
+ self.opt_model = OPTForCausalLM.from_pretrained(opt_model, config=config)
180
+ else:
181
+ if torch.cuda.is_bf16_supported():
182
+ self.opt_model = OPTForCausalLM.from_pretrained(opt_model, torch_dtype=torch.bfloat16, config=config)
183
+ else:
184
+ self.opt_model = OPTForCausalLM.from_pretrained(opt_model, torch_dtype=torch.float16, config=config)
185
+ self.opt_model.resize_token_embeddings(len(self.opt_tokenizer)) ## this will cause a bug when fully fine-tuning the OPT model
186
+
187
+ self.llm_tune = llm_tune
188
+ if llm_tune == 'lora':
189
+ if peft_dir:
190
+ self.opt_model = PeftModel.from_pretrained(self.opt_model, peft_dir, is_trainable=True)
191
+ else:
192
+ if self.args.peft_config:
193
+ peft_config = LoraConfig(**LoraConfig.from_json_file(self.args.peft_config))
194
+ else:
195
+ peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=args.lora_r, lora_alpha=args.lora_alpha, lora_dropout=args.lora_dropout)
196
+ self.peft_config = peft_config
197
+ self.opt_model = get_peft_model(self.opt_model, peft_config)
198
+ self.opt_model.print_trainable_parameters()
199
+ elif llm_tune == 'freeze':
200
+ for name, param in self.opt_model.named_parameters():
201
+ param.requires_grad = False
202
+ elif llm_tune == 'full':
203
+ pass
204
+ else:
205
+ raise NotImplementedError()
206
+
207
+ ## fixme: this is different from the original BLIP2
208
+ if args.mode=='pretrain_eval':
209
+ self.eos_token_id = self.opt_tokenizer(
210
+ "[START_SMILES]\n", add_special_tokens=False
211
+ ).input_ids
212
+ else:
213
+ self.eos_token_id = self.opt_tokenizer(
214
+ "\n", add_special_tokens=False
215
+ ).input_ids[0]
216
+
217
+ self.opt_proj = nn.Linear(
218
+ self.Qformer.config.hidden_size, self.opt_model.config.hidden_size
219
+ )
220
+
221
+ ## fixme: no prompt yet
222
+ self.prompt = prompt
223
+ self.rxn_batch_size = args.rxn_batch_size
224
+ self.generate_restrict_tokens = args.generate_restrict_tokens
225
+ self.train_restrict_tokens = args.train_restrict_tokens
226
+ if self.generate_restrict_tokens or self.train_restrict_tokens:
227
+ self.bad_words_ids = get_not_allowed_tokens_ids(opt_model)
228
+ # prompt_tokens = self.opt_tokenizer(self.prompt, return_tensors="pt")
229
+ # self.prompt_length = prompt_tokens.attention_mask.sum(1)
230
+
231
+ def opt_forward_v2(
232
+ self,
233
+ inputs_embeds,
234
+ attention_mask,
235
+ labels,
236
+ bad_word_ids=None,
237
+ ):
238
+ output = self.opt_model(
239
+ inputs_embeds=inputs_embeds,
240
+ attention_mask=attention_mask,
241
+ return_dict=True,
242
+ labels=labels,
243
+ )
244
+ logits = output.logits
245
+ labels = labels.to(logits.device)
246
+ # Shift so that tokens < n predict n
247
+
248
+ if bad_word_ids:
249
+ bad_word_ids = torch.tensor(bad_word_ids, device=logits.device, dtype=torch.long)
250
+ bad_word_ids = bad_word_ids.squeeze()
251
+ logits[:, :, bad_word_ids] = -100
252
+
253
+ shift_logits = logits[..., :-1, :].contiguous()
254
+ shift_labels = labels[..., 1:].contiguous()
255
+ shift_logits = shift_logits.view(-1, self.opt_model.config.vocab_size)
256
+ loss_fct = CrossEntropyLoss()
257
+ loss = loss_fct(shift_logits, shift_labels.view(-1))
258
+ return loss
259
+
260
+ def forward_action(self, batch, use_gragh=True):
261
+ # batch unpack
262
+ rxn_ids, graphs, text_tokens = batch
263
+ if use_gragh:
264
+ graph_embeds, graph_masks = self.graph_encoder(graphs)
265
+ if not self.tune_gnn:
266
+ graph_embeds = graph_embeds.detach()
267
+
268
+ # graph embedding calculation
269
+ graph_embeds = self.ln_graph(graph_embeds, graph_masks)
270
+ query_tokens = self.query_tokens.expand(graph_embeds.shape[0], -1, -1)
271
+ query_output = self.Qformer.bert(
272
+ query_embeds=query_tokens,
273
+ encoder_hidden_states=graph_embeds,
274
+ encoder_attention_mask=graph_masks, # fixme: check whether this mask is correct
275
+ return_dict=True,
276
+ )
277
+ mol_tokens = self.opt_proj(query_output.last_hidden_state) # graph_num x num_query_token x D
278
+ else:
279
+ del graphs
280
+
281
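+ # Build LM targets: padding, <mol> placeholder positions, and the prompt segment (token_type_ids == 0) are masked with -100 so the loss covers only the response tokens.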
+ pad_mask = text_tokens.input_ids == self.opt_tokenizer.pad_token_id
282
+ targets = text_tokens.input_ids.masked_fill(pad_mask, -100)
283
+ targets = targets.masked_fill(text_tokens.is_mol_token, -100)
284
+ targets = targets.masked_fill(text_tokens.token_type_ids == 0, -100)
285
+
286
+ inputs_embeds = self.opt_model.get_input_embeddings()(text_tokens.input_ids)
287
+ if use_gragh:
288
+ inputs_embeds[text_tokens.is_mol_token] = mol_tokens.flatten(0, 1) # graph_num x emb_dim
289
+
290
+ if self.train_restrict_tokens:
291
+ loss = self.opt_forward_v2(
292
+ inputs_embeds=inputs_embeds,
293
+ attention_mask=text_tokens.attention_mask,
294
+ labels=targets,
295
+ bad_word_ids=self.bad_words_ids,
296
+ )
297
+ else:
298
+ outputs = self.opt_model(
299
+ inputs_embeds=inputs_embeds,
300
+ attention_mask=text_tokens.attention_mask,
301
+ return_dict=True,
302
+ labels=targets,
303
+ )
304
+ loss = outputs.loss
305
+ return {"loss": loss}
306
+
307
+ def forward_abstract(self, batch, use_gragh=True):
308
+ # batch unpack
309
+ graphs, text_tokens = batch
310
+ if use_gragh:
311
+ graph_embeds, graph_masks = self.graph_encoder(graphs)
312
+ if not self.tune_gnn:
313
+ graph_embeds = graph_embeds.detach()
314
+
315
+ # graph embedding calculation
316
+ graph_embeds = self.ln_graph(graph_embeds, graph_masks)
317
+ query_tokens = self.query_tokens.expand(graph_embeds.shape[0], -1, -1)
318
+ query_output = self.Qformer.bert(
319
+ query_embeds=query_tokens,
320
+ encoder_hidden_states=graph_embeds,
321
+ encoder_attention_mask=graph_masks, # fixme: check whether this mask is correct
322
+ return_dict=True,
323
+ )
324
+ mol_tokens = self.opt_proj(query_output.last_hidden_state) # graph_num x num_query_token x D
325
+ else:
326
+ del graphs
327
+
328
+ pad_mask = text_tokens.input_ids == self.opt_tokenizer.pad_token_id
329
+ targets = text_tokens.input_ids.masked_fill(pad_mask, -100)
330
+ targets = targets.masked_fill(text_tokens.is_mol_token, -100)
331
+
332
+ inputs_embeds = self.opt_model.get_input_embeddings()(text_tokens.input_ids)
333
+ if use_gragh:
334
+ inputs_embeds[text_tokens.is_mol_token] = mol_tokens.flatten(0, 1) # graph_num x emb_dim
335
+
336
+ outputs = self.opt_model(
337
+ inputs_embeds=inputs_embeds,
338
+ attention_mask=text_tokens.attention_mask,
339
+ return_dict=True,
340
+ labels=targets,
341
+ )
342
+ loss = outputs.loss
343
+ return {"loss": loss}
344
+
345
+ @torch.no_grad()
346
+ def generate(
347
+ self,
348
+ samples,
349
+ do_sample=False,
350
+ num_beams=5,
351
+ max_length=128,
352
+ min_length=1,
353
+ top_p=0.9,
354
+ repetition_penalty=1.0,
355
+ length_penalty=1.0,
356
+ num_captions=1,
357
+ temperature=1,
358
+ use_graph=True,
359
+ ):
360
+ """
361
+ Args:
362
+ samples (dict): A dictionary containing the following keys:
363
+ - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W)
364
+ num_beams (int): Number of beams for beam search. 1 means no beam search.
365
+ max_length (int): The maximum length of the sequence to be generated.
366
+ min_length (int): The minimum length of the sequence to be generated.
367
+ top_p (float): The cumulative probability for nucleus sampling.
368
+ repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty.
369
+ num_captions (int): Number of captions to be generated for each image.
370
+ Returns:
371
+ captions (list): A list of strings of length batch_size * num_captions.
372
+ """
373
+ graphs = samples['graphs']
374
+ prompt_tokens = samples['prompt_tokens']
375
+ # prompt_lens = samples['prompt_lens']
376
+ # with self.maybe_autocast():
377
+ if use_graph:
378
+ graph_embeds, graph_masks = self.graph_encoder(graphs)
379
+ graph_embeds = self.ln_graph(graph_embeds)
380
+
381
+ query_tokens = self.query_tokens.expand(graph_embeds.shape[0], -1, -1)
382
+ query_output = self.Qformer.bert(
383
+ query_embeds=query_tokens,
384
+ encoder_hidden_states=graph_embeds,
385
+ encoder_attention_mask=graph_masks,
386
+ return_dict=True,
387
+ )
388
+ mol_tokens = self.opt_proj(query_output.last_hidden_state)
389
+
390
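+ # Embed the prompt and, when use_graph is set, overwrite the <mol> placeholder positions with the projected Q-Former query outputs.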
+ prompt_embeds = self.opt_model.get_input_embeddings()(prompt_tokens.input_ids)
391
+ if use_graph:
392
+ prompt_embeds[prompt_tokens.is_mol_token] = mol_tokens.flatten(0, 1).to(dtype=prompt_embeds.dtype)
393
+ extra_params = {}
394
+ if self.generate_restrict_tokens:
395
+ extra_params['bad_words_ids'] = self.bad_words_ids
396
+
397
+ outputs = self.opt_model.generate(
398
+ inputs_embeds=prompt_embeds,
399
+ attention_mask=prompt_tokens.attention_mask,
400
+ do_sample=do_sample,
401
+ top_p=top_p,
402
+ temperature=temperature,
403
+ num_beams=num_beams,
404
+ max_length=max_length,
405
+ min_length=min_length,
406
+ # pad_token_id=self.pad_token_id,
407
+ eos_token_id=self.eos_token_id,
408
+ repetition_penalty=repetition_penalty,
409
+ length_penalty=length_penalty,
410
+ num_return_sequences=num_captions,
411
+ # use_cache=False,
412
+ **extra_params
413
+ )
414
+ output_text = self.opt_tokenizer.batch_decode(outputs, skip_special_tokens=True)
415
+
416
+ output_text = [text.strip() for text in output_text]
417
+ return output_text
model/blip2_t5.py ADDED
@@ -0,0 +1,305 @@
1
+ """
2
+ Copyright (c) 2023, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+ import logging
8
+ import torch
9
+ import torch.nn as nn
10
+ from torch.cuda.amp import autocast as autocast
11
+ from peft import get_peft_model, LoraConfig, TaskType, PeftModel
12
+ from lavis.models.blip2_models.blip2 import disabled_train
13
+ from model.blip2 import Blip2Base
14
+ # from model.smiles_t5_captioning
15
+ from lavis.models.blip2_models.modeling_t5 import T5ForConditionalGeneration
16
+ from transformers import AutoTokenizer, T5TokenizerFast
17
+ #, T5ForConditionalGeneration
18
+
19
+
20
+
21
+
22
+ class Blip2T5(Blip2Base):
23
+ """
24
+ BLIP2 first-stage model with Q-former and ViT.
25
+ Supported model types:
26
+ - pretrained: pretrained model with vit-g
27
+ - pretrain_vitL: pretrained model with vit-large
28
+ - coco: finetuned model on coco
29
+ Usage:
30
+ >>> from lavis.models import load_model
31
+ >>> model = load_model("blip2", "pretrain")
32
+ """
33
+ def __init__(
34
+ self,
35
+ bert_name,
36
+ gin_num_layers,
37
+ gin_hidden_dim,
38
+ gin_drop_ratio,
39
+ tune_gnn=False,
40
+ num_query_token=32,
41
+ cross_attention_freq=2,
42
+ llm_tune='freeze',
43
+ peft_dir='',
44
+ opt_model="facebook/galactica-1.3b",
45
+ prompt="",
46
+ args=None,
47
+ ):
48
+ super().__init__()
49
+ self.args = args
50
+
51
+ self.graph_encoder, self.ln_graph = self.init_graph_encoder(gin_num_layers, gin_hidden_dim, gin_drop_ratio)
52
+ self.tune_gnn = tune_gnn
53
+ if not tune_gnn:
54
+ for name, param in self.graph_encoder.named_parameters():
55
+ param.requires_grad = False
56
+ self.graph_encoder = self.graph_encoder.eval()
57
+ self.graph_encoder.train = disabled_train
58
+ logging.info("freeze graph encoder")
59
+
60
+ self.num_query_token = num_query_token
61
+ self.Qformer, self.query_tokens = self.init_Qformer(bert_name, num_query_token, self.graph_encoder.num_features, cross_attention_freq)
62
+ ### remove the unused parameters
63
+ self.Qformer.cls = None
64
+ self.Qformer.bert.embeddings.word_embeddings = None
65
+ self.Qformer.bert.embeddings.position_embeddings = None
66
+ for layer in self.Qformer.bert.encoder.layer:
67
+ layer.output = None
68
+ layer.intermediate = None
69
+
70
+ # assert opt_model == 'laituan245/molt5-large'
71
+ ## initialize opt model
72
+ # self.opt_tokenizer = AutoTokenizer.from_pretrained(opt_model)
73
+ self.opt_tokenizer = T5TokenizerFast.from_pretrained(opt_model)
74
+ self.opt_tokenizer.add_tokens('<mol>') # molecule placeholder
75
+ self.mol_token = '<mol>'
76
+ self.opt_tokenizer.mol_token_id = self.opt_tokenizer("<mol>", add_special_tokens=False).input_ids[0]
77
+
78
+ self.opt_model = T5ForConditionalGeneration.from_pretrained(opt_model, torch_dtype=torch.float32)
79
+ self.opt_model.resize_token_embeddings(len(self.opt_tokenizer)) ## this will cause a bug when fully fine-tuning the OPT model
80
+
81
+ self.llm_tune = llm_tune
82
+ if llm_tune == 'lora':
83
+ if peft_dir:
84
+ self.opt_model = PeftModel.from_pretrained(self.opt_model, peft_dir, is_trainable=True)
85
+ else:
86
+ if self.args.peft_config:
87
+ peft_config = LoraConfig(**LoraConfig.from_json_file(self.args.peft_config))
88
+ else:
89
+ peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=args.lora_r, lora_alpha=args.lora_alpha, lora_dropout=args.lora_dropout)
90
+ self.peft_config = peft_config
91
+ self.opt_model = get_peft_model(self.opt_model, peft_config)
92
+ self.opt_model.print_trainable_parameters()
93
+ elif llm_tune == 'freeze':
94
+ for name, param in self.opt_model.named_parameters():
95
+ param.requires_grad = False
96
+ elif llm_tune == 'full':
97
+ pass
98
+ else:
99
+ raise NotImplementedError()
100
+
101
+ ## fixme: this is different from the original BLIP2
102
+ # self.eos_token_id = self.opt_tokenizer(
103
+ # "\n", add_special_tokens=False
104
+ # ).input_ids[0]
105
+ self.eos_token_id = self.opt_tokenizer(
106
+ "</s>", add_special_tokens=False
107
+ ).input_ids[0]
108
+
109
+ self.opt_proj = nn.Linear(
110
+ self.Qformer.config.hidden_size, self.opt_model.config.hidden_size
111
+ )
112
+
113
+
114
+ def forward(self, batch):
115
+ graphs, prompt_tokens, text_tokens = batch
116
+
117
+ graph_embeds, graph_masks = self.graph_encoder(graphs)
118
+ if not self.tune_gnn:
119
+ graph_embeds = graph_embeds.detach()
120
+ graph_embeds = self.ln_graph(graph_embeds, graph_masks)
121
+ query_tokens = self.query_tokens.expand(graph_embeds.shape[0], -1, -1)
122
+ query_output = self.Qformer.bert(
123
+ query_embeds=query_tokens,
124
+ encoder_hidden_states=graph_embeds,
125
+ encoder_attention_mask=graph_masks, # fixme: check whether this mask is correct
126
+ return_dict=True,
127
+ )
128
+ mol_tokens = self.opt_proj(query_output.last_hidden_state)
129
+
130
+ targets = text_tokens.input_ids.masked_fill(
131
+ text_tokens.input_ids == self.opt_tokenizer.pad_token_id, -100
132
+ )
133
+ with self.maybe_autocast(torch.float32):
134
+ prompt_embeds = self.opt_model.encoder.embed_tokens(prompt_tokens.input_ids)
135
+ prompt_embeds[prompt_tokens.is_mol_token] = mol_tokens.flatten(0, 1).to(torch.float32)
136
+ outputs = self.opt_model(
137
+ inputs_embeds=prompt_embeds,
138
+ attention_mask=prompt_tokens.attention_mask,
139
+ decoder_attention_mask=text_tokens.attention_mask,
140
+ return_dict=True,
141
+ labels=targets,
142
+ )
143
+ loss = outputs.loss
144
+ return {"loss": loss}
145
+
146
+ def forward_action(self, batch, use_gragh=True):
147
+ rxn_ids, graphs, prompt_tokens, text_tokens = batch
148
+ if use_gragh:
149
+ graph_embeds, graph_masks = self.graph_encoder(graphs)
150
+ if not self.tune_gnn:
151
+ graph_embeds = graph_embeds.detach()
152
+ graph_embeds = self.ln_graph(graph_embeds, graph_masks)
153
+ query_tokens = self.query_tokens.expand(graph_embeds.shape[0], -1, -1)
154
+ query_output = self.Qformer.bert(
155
+ query_embeds=query_tokens,
156
+ encoder_hidden_states=graph_embeds,
157
+ encoder_attention_mask=graph_masks, # fixme: check whether this mask is correct
158
+ return_dict=True,
159
+ )
160
+ mol_tokens = self.opt_proj(query_output.last_hidden_state)
161
+ else:
162
+ del graphs
163
+
164
+ targets = text_tokens.input_ids.masked_fill(
165
+ text_tokens.input_ids == self.opt_tokenizer.pad_token_id, -100
166
+ )
167
+ with self.maybe_autocast(torch.float32):
168
+ prompt_embeds = self.opt_model.encoder.embed_tokens(prompt_tokens.input_ids)
169
+ if use_gragh:
170
+ prompt_embeds[prompt_tokens.is_mol_token] = mol_tokens.flatten(0, 1).to(torch.float32)
171
+ outputs = self.opt_model(
172
+ inputs_embeds=prompt_embeds,
173
+ attention_mask=prompt_tokens.attention_mask,
174
+ decoder_attention_mask=text_tokens.attention_mask,
175
+ return_dict=True,
176
+ labels=targets,
177
+ )
178
+ loss = outputs.loss
179
+ return {"loss": loss}
180
+
181
+
182
+ @torch.no_grad()
183
+ def generate(
184
+ self,
185
+ samples,
186
+ do_sample=False,
187
+ num_beams=5,
188
+ max_length=128,
189
+ min_length=1,
190
+ top_p=0.9,
191
+ repetition_penalty=1.0,
192
+ length_penalty=1.0,
193
+ num_captions=1,
194
+ temperature=1,
195
+ ):
196
+ """
197
+ Args:
198
+ samples (dict): A dictionary containing the following keys:
199
+ - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W)
200
+ num_beams (int): Number of beams for beam search. 1 means no beam search.
201
+ max_length (int): The maximum length of the sequence to be generated.
202
+ min_length (int): The minimum length of the sequence to be generated.
203
+ top_p (float): The cumulative probability for nucleus sampling.
204
+ repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty.
205
+ num_captions (int): Number of captions to be generated for each image.
206
+ Returns:
207
+ captions (list): A list of strings of length batch_size * num_captions.
208
+ """
209
+ graphs = samples['graphs']
210
+ prompt_tokens = samples['prompt_tokens']
211
+ graph_embeds, graph_masks = self.graph_encoder(graphs)
212
+ graph_embeds = self.ln_graph(graph_embeds)
213
+
214
+ query_tokens = self.query_tokens.expand(graph_embeds.shape[0], -1, -1)
215
+ query_output = self.Qformer.bert(
216
+ query_embeds=query_tokens,
217
+ encoder_hidden_states=graph_embeds,
218
+ encoder_attention_mask=graph_masks,
219
+ return_dict=True,
220
+ )
221
+ mol_tokens = self.opt_proj(query_output.last_hidden_state)
222
+ with self.maybe_autocast(torch.float32):
223
+ prompt_embeds = self.opt_model.encoder.embed_tokens(prompt_tokens.input_ids)
224
+ prompt_embeds[prompt_tokens.is_mol_token] = mol_tokens.flatten(0, 1).to(torch.float32)
225
+ # prompt_embeds = self.opt_model.encoder.embed_tokens(prompt_tokens.input_ids)
226
+ # prompt_embeds[prompt_tokens.is_mol_token] = mol_tokens.flatten(0, 1)
227
+
228
+ outputs = self.opt_model.generate(
229
+ inputs_embeds=prompt_embeds,
230
+ attention_mask=prompt_tokens.attention_mask,
231
+ do_sample=do_sample,
232
+ top_p=top_p,
233
+ temperature=temperature,
234
+ num_beams=num_beams,
235
+ max_length=max_length,
236
+ min_length=min_length,
237
+ # pad_token_id=self.pad_token_id,
238
+ eos_token_id=self.eos_token_id,
239
+ repetition_penalty=repetition_penalty,
240
+ length_penalty=length_penalty,
241
+ num_return_sequences=num_captions,
242
+ # use_cache=False,
243
+ )
244
+ output_text = self.opt_tokenizer.batch_decode(outputs, skip_special_tokens=True)
245
+
246
+ output_text = [text.strip() for text in output_text]
247
+ return output_text
248
+
249
+ @torch.no_grad()
250
+ def generate_action(
251
+ self,
252
+ samples,
253
+ do_sample=False,
254
+ num_beams=5,
255
+ max_length=128,
256
+ min_length=1,
257
+ top_p=0.9,
258
+ repetition_penalty=1.0,
259
+ length_penalty=1.0,
260
+ num_captions=1,
261
+ temperature=1,
262
+ use_graph=True
263
+ ):
264
+ graphs = samples['graphs']
265
+ prompt_tokens = samples['prompt_tokens']
266
+ if use_graph:
267
+ graph_embeds, graph_masks = self.graph_encoder(graphs)
268
+ graph_embeds = self.ln_graph(graph_embeds)
269
+
270
+ query_tokens = self.query_tokens.expand(graph_embeds.shape[0], -1, -1)
271
+ query_output = self.Qformer.bert(
272
+ query_embeds=query_tokens,
273
+ encoder_hidden_states=graph_embeds,
274
+ encoder_attention_mask=graph_masks,
275
+ return_dict=True,
276
+ )
277
+ mol_tokens = self.opt_proj(query_output.last_hidden_state)
278
+
279
+ with self.maybe_autocast(torch.float32):
280
+ prompt_embeds = self.opt_model.encoder.embed_tokens(prompt_tokens.input_ids)
281
+ if use_graph:
282
+ prompt_embeds[prompt_tokens.is_mol_token] = mol_tokens.flatten(0, 1).to(torch.float32)
283
+ # prompt_embeds = self.opt_model.encoder.embed_tokens(prompt_tokens.input_ids)
284
+ # prompt_embeds[prompt_tokens.is_mol_token] = mol_tokens.flatten(0, 1)
285
+
286
+ outputs = self.opt_model.generate(
287
+ inputs_embeds=prompt_embeds,
288
+ attention_mask=prompt_tokens.attention_mask,
289
+ do_sample=do_sample,
290
+ top_p=top_p,
291
+ temperature=temperature,
292
+ num_beams=num_beams,
293
+ max_length=max_length,
294
+ min_length=min_length,
295
+ # pad_token_id=self.pad_token_id,
296
+ eos_token_id=self.eos_token_id,
297
+ repetition_penalty=repetition_penalty,
298
+ length_penalty=length_penalty,
299
+ num_return_sequences=num_captions,
300
+ # use_cache=False,
301
+ )
302
+ output_text = self.opt_tokenizer.batch_decode(outputs, skip_special_tokens=True)
303
+
304
+ output_text = [text.strip() for text in output_text]
305
+ return output_text
model/blip2qformer.py ADDED
@@ -0,0 +1,603 @@
1
+ """
2
+ Copyright (c) 2023, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+ import logging
8
+ import os
9
+ import torch
10
+ import torch.distributed as dist
11
+ import torch.nn as nn
12
+ from torch.cuda.amp import autocast as autocast
13
+ from torch.nn import functional as F
14
+
15
+ # from lavis.common.registry import registry
16
+ # from lavis.models.base_model import all_gather_with_grad, concat_all_gather
17
+ from lavis.models.blip2_models.blip2 import (
18
+ disabled_train,
19
+ )
20
+ from lavis.models.blip_models.blip_outputs import BlipOutput
21
+ from lavis.common.dist_utils import is_dist_avail_and_initialized
22
+ from model.blip2 import Blip2Base
23
+ from pytorch_lightning.utilities import distributed
24
+
25
+ @torch.no_grad()
26
+ def concat_all_gather(tensor):
27
+ """
28
+ Performs all_gather operation on the provided tensors.
29
+ *** Warning ***: torch.distributed.all_gather has no gradient.
30
+ """
31
+ # if use distributed training
32
+ if not is_dist_avail_and_initialized():
33
+ return tensor
34
+
35
+ tensors_gather = [
36
+ torch.ones_like(tensor) for _ in range(torch.distributed.get_world_size())
37
+ ]
38
+ torch.distributed.all_gather(tensors_gather, tensor, async_op=False)
39
+
40
+ output = torch.cat(tensors_gather, dim=0)
41
+ print('running here')
42
+ return output
43
+
44
+ @torch.no_grad()
45
+ def pl_concat_all_gather(tensor):
46
+ """
47
+ Performs all_gather operation on the provided tensors.
48
+ *** Warning ***: torch.distributed.all_gather has no gradient.
49
+ """
50
+ # if use distributed training
51
+ if not is_dist_avail_and_initialized():
52
+ return tensor
53
+
54
+ tensors_gather = distributed.gather_all_tensors(tensor)
55
+ output = torch.cat(tensors_gather, dim=0)
56
+ return output
57
+
58
+
59
+ # @registry.register_model("blip2")
60
+ # @registry.register_model("blip2_feature_extractor")
61
+ class Blip2Qformer(Blip2Base):
62
+ """
63
+ BLIP2 first-stage model with Q-former and ViT.
64
+ Supported model types:
65
+ - pretrained: pretrained model with vit-g
66
+ - pretrain_vitL: pretrained model with vit-large
67
+ - coco: finetuned model on coco
68
+ Usage:
69
+ >>> from lavis.models import load_model
70
+ >>> model = load_model("blip2", "pretrain")
71
+ """
72
+ def __init__(
73
+ self,
74
+ gtm,
75
+ lm,
76
+ bert_name,
77
+ temperature,
78
+ gin_num_layers,
79
+ gin_hidden_dim,
80
+ gin_drop_ratio,
81
+ tune_gnn=False,
82
+ num_query_token=32,
83
+ cross_attention_freq=2,
84
+ embed_dim=256,
85
+ ):
86
+ super().__init__()
87
+ self.gtm = gtm
88
+ self.lm = lm
89
+
90
+ self.tokenizer = self.init_tokenizer()
91
+
92
+ self.graph_encoder, self.ln_graph = self.init_graph_encoder(gin_num_layers, gin_hidden_dim, gin_drop_ratio)
93
+ self.tune_gnn = tune_gnn
94
+ if not tune_gnn:
95
+ for name, param in self.graph_encoder.named_parameters():
96
+ param.requires_grad = False
97
+ self.graph_encoder = self.graph_encoder.eval()
98
+ self.graph_encoder.train = disabled_train
99
+ logging.info("freeze graph encoder")
100
+
101
+ self.Qformer, self.query_tokens = self.init_Qformer(bert_name, num_query_token, self.graph_encoder.num_features, cross_attention_freq)
102
+ self.Qformer.resize_token_embeddings(len(self.tokenizer))
103
+ state_dict = self.Qformer.state_dict()
104
+ for name, param in self.Qformer.named_parameters():
105
+ if "_query" in name:
106
+ key_orig = name.replace("_query", "")
107
+ param.data.copy_(state_dict[key_orig])
108
+
109
+ self.graph_proj = nn.Linear(self.Qformer.config.hidden_size, embed_dim)
110
+ self.text_proj = nn.Linear(self.Qformer.config.hidden_size, embed_dim)
111
+
112
+ self.gtm_head = nn.Linear(self.Qformer.config.hidden_size, 2)
113
+
114
+ self.temperature = temperature
115
+
116
+
117
+ def contrast(self, features_graph, features_text, return_sim=False):
118
+ '''
119
+ features_graph: shape = [B, num_qs, D]
120
+ features_text: shape = [B, D]
121
+ '''
122
+ batch_size = features_graph.size(0)
123
+
124
+ # normalized features
125
+ features_graph = F.normalize(features_graph, dim=-1)
126
+ features_text = F.normalize(features_text, dim=-1)
127
+
128
+ # cosine similarity as logits
129
+ sim_q2t = (features_graph.unsqueeze(1) @ features_text.unsqueeze(-1)).squeeze() # shape = [B, 1, num_qs, D]; shape = [B, D, 1]; output shape = [B, B, num_qs]
130
+ sim_g2t, _ = sim_q2t.max(-1) # shape = [B, B]
131
+
132
+ logits_per_graph = sim_g2t / self.temperature
133
+ logits_per_text = logits_per_graph.t()
134
+
135
+ labels = torch.arange(batch_size, dtype=torch.long, device=self.device) # shape = [B]
136
+ loss_graph = F.cross_entropy(logits_per_graph, labels)
137
+ loss_text = F.cross_entropy(logits_per_text, labels)
138
+ loss = (loss_graph + loss_text) / 2
139
+
140
+ if return_sim:
141
+ return logits_per_graph, logits_per_text, loss
142
+ else:
143
+ return loss
144
+
145
+ def contrast_global(self, features_graph, features_text, features_graph_all, features_text_all, return_sim=False):
146
+ '''
147
+ features_graph: shape = [B, num_qs, D]
148
+ features_text: shape = [B, D]
149
+ features_text_all: shape = [B * num_gpus, D]
150
+ features_graph_all: shape = [B * num_gpus, num_qs, D]
151
+ '''
152
+ bs = features_graph.size(0)
153
+
154
+ # cosine similarity as logits
155
+ sim_q2t = (features_graph.unsqueeze(1) @ features_text_all.unsqueeze(-1)).squeeze() # shape = [B, 1, num_qs, D]; shape = [B * num_gpus, D, 1]; output shape = [B, B * num_gpus, num_qs]
156
+ sim_g2t, _ = sim_q2t.max(-1) # shape = [B, B * num_gpus]
157
+
158
+ logits_per_graph = sim_g2t / self.temperature
159
+
160
+
161
+ sim_t2q = (features_text.unsqueeze(1).unsqueeze(1) @ features_graph_all.permute(0, 2, 1)).squeeze() # shape = [B, 1, 1, D]; [B*num_gpus, D, num_qs]; output shape = [B, B*num_gpus, 1, num_qs]
162
+ sim_t2g, _ = sim_t2q.max(-1)
163
+ logits_per_text = sim_t2g / self.temperature
164
+
165
+ # labels = torch.arange(bs, dtype=torch.long, device=self.device)
166
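+ # Similarities are computed against features gathered from all GPUs, so the positive index of local sample b on rank r is r * bs + b.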
+ rank = dist.get_rank()
167
+ labels = torch.linspace(rank * bs, rank * bs + bs - 1, bs, dtype=int).to(self.device)
168
+
169
+ loss_graph = F.cross_entropy(logits_per_graph, labels)
170
+ loss_text = F.cross_entropy(logits_per_text, labels)
171
+ loss = (loss_graph + loss_text) / 2
172
+
173
+ if return_sim:
174
+ return logits_per_graph[:, rank*bs:rank*bs+bs], logits_per_text[:, rank*bs:rank*bs+bs], loss
175
+ else:
176
+ return loss
177
+
178
+ def forward_old(self, batch):
179
+ ## v1: not gather results from all gpus
180
+ ###============== Image-text Contrastive ===================###
181
+ graph, text, mask = batch
182
+ batch_node, batch_mask = self.graph_encoder(graph)
183
+ batch_node = batch_node.detach()
184
+ batch_size = batch_node.shape[0]
185
+
186
+ batch_node = self.ln_graph(batch_node, batch_mask)
187
+ query_tokens = self.query_tokens.expand(batch_node.shape[0], -1, -1)
188
+ query_output = self.Qformer.bert(
189
+ query_embeds=query_tokens,
190
+ encoder_hidden_states=batch_node,
191
+ encoder_attention_mask=batch_mask, # fixme: check whether this mask is correct
192
+ use_cache=True,
193
+ return_dict=True,
194
+ )
195
+ graph_feats = self.graph_proj(query_output.last_hidden_state) # shape = [B, num_q, D]
196
+ text_output = self.Qformer.bert(text, attention_mask=mask, return_dict=True) # shape = [B, n_max, D]
197
+ text_feats = self.text_proj(text_output.last_hidden_state[:, 0, :])
198
+ sim_g2t, sim_t2g, loss_gtc = self.contrast(graph_feats, text_feats, return_sim=True)
199
+
200
+
201
+ ###============== Image-text Matching ===================###
202
+ loss_gtm = 0
203
+ if self.gtm:
204
+ g_emb = batch_node
205
+ g_mask = batch_mask
206
+ text_ids = text.clone()
207
+ with torch.no_grad():
208
+ weights_t2g = F.softmax(sim_t2g, dim=1) + 1e-4
209
+ weights_t2g.fill_diagonal_(0)
210
+ weights_g2t = F.softmax(sim_g2t, dim=1) + 1e-4
211
+ weights_g2t.fill_diagonal_(0)
212
+
213
+ # select a negative graph for each text
214
+ graph_embeds_neg = []
215
+ graph_mask_neg = []
216
+ for b in range(batch_size):
217
+ neg_idx = torch.multinomial(weights_t2g[b], 1).item()
218
+ graph_embeds_neg.append(g_emb[neg_idx])
219
+ graph_mask_neg.append(g_mask[neg_idx])
220
+
221
+ graph_embeds_neg = torch.stack(graph_embeds_neg, dim=0)
222
+ graph_mask_neg = torch.stack(graph_mask_neg, dim=0)
223
+
224
+ # select a negative text for each image
225
+ text_ids_neg = []
226
+ text_atts_neg = []
227
+ for b in range(batch_size):
228
+ neg_idx = torch.multinomial(weights_g2t[b], 1).item()
229
+ text_ids_neg.append(text_ids[neg_idx])
230
+ text_atts_neg.append(mask[neg_idx])
231
+
232
+ text_ids_neg = torch.stack(text_ids_neg, dim=0)
233
+ text_atts_neg = torch.stack(text_atts_neg, dim=0)
234
+
235
+ text_ids_all = torch.cat(
236
+ [text_ids, text_ids, text_ids_neg], dim=0
237
+ ) # pos, pos, neg
238
+ text_atts_all = torch.cat(
239
+ [mask, mask, text_atts_neg],
240
+ dim=0,
241
+ )
242
+
243
+ query_tokens_itm = self.query_tokens.expand(text_ids_all.shape[0], -1, -1)
244
+ query_atts_itm = torch.ones(query_tokens_itm.size()[:-1], dtype=torch.long, device=text.device)
245
+ attention_mask_all = torch.cat([query_atts_itm, text_atts_all], dim=1)
246
+
247
+ graph_embeds_all = torch.cat([g_emb, graph_embeds_neg, g_emb], dim=0) # pos, neg, pos
248
+ graph_atts_all = torch.cat([g_mask, graph_mask_neg, g_mask], dim=0)
249
+
250
+ output_itm = self.Qformer.bert(
251
+ text_ids_all,
252
+ query_embeds=query_tokens_itm,
253
+ attention_mask=attention_mask_all,
254
+ encoder_hidden_states=graph_embeds_all,
255
+ encoder_attention_mask=graph_atts_all,
256
+ return_dict=True,
257
+ )
258
+
259
+ vl_embeddings = output_itm.last_hidden_state[:, : query_tokens_itm.size(1), :] # keep query tokens only
260
+ vl_output = self.gtm_head(vl_embeddings)
261
+ logits = vl_output.mean(dim=1)
262
+
263
+ itm_labels = torch.cat(
264
+ [torch.ones(batch_size, dtype=torch.long), torch.zeros(2 * batch_size, dtype=torch.long)],
265
+ dim=0,
266
+ ).to(text.device)
267
+ loss_gtm = F.cross_entropy(logits, itm_labels)
268
+
269
+ ##================= Image Captioning ========================##
270
+ loss_lm = 0
271
+ if self.lm:
272
+ decoder_input_ids = text.clone()
273
+ decoder_input_ids[:, 0] = self.tokenizer.bos_token_id
274
+ labels = decoder_input_ids.masked_fill(
275
+ decoder_input_ids == self.tokenizer.pad_token_id, -100
276
+ )
277
+ query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=text.device)
278
+
279
+ attention_mask = torch.cat([query_atts, mask], dim=1)
280
+ lm_output = self.Qformer(
281
+ decoder_input_ids,
282
+ attention_mask=attention_mask,
283
+ past_key_values=query_output.past_key_values,
284
+ return_dict=True,
285
+ labels=labels,
286
+ )
287
+
288
+ loss_lm = lm_output.loss
289
+
290
+ return BlipOutput(
291
+ loss=loss_gtc + loss_gtm + loss_lm,
292
+ loss_itc=loss_gtc,
293
+ loss_itm=loss_gtm,
294
+ loss_lm=loss_lm,
295
+ )
296
+
297
+
298
+ def forward(self, batch):
299
+ ## v2: gather results from all gpus
300
+ ###============== Image-text Contrastive ===================###
301
+ graph, text, mask = batch
302
+ batch_node, batch_mask = self.graph_encoder(graph)
303
+ if not self.tune_gnn:
304
+ batch_node = batch_node.detach()
305
+ batch_size = batch_node.shape[0]
306
+
307
+ batch_node = self.ln_graph(batch_node, batch_mask)
308
+ query_tokens = self.query_tokens.expand(batch_node.shape[0], -1, -1)
309
+ query_output = self.Qformer.bert(
310
+ query_embeds=query_tokens,
311
+ encoder_hidden_states=batch_node,
312
+ encoder_attention_mask=batch_mask, # fixme: check whether this mask is correct
313
+ use_cache=True,
314
+ return_dict=True,
315
+ )
316
+ graph_feats = self.graph_proj(query_output.last_hidden_state) # shape = [B, num_q, D]
317
+ text_output = self.Qformer.bert(text, attention_mask=mask, return_dict=True) # shape = [B, n_max, D]
318
+ text_feats = self.text_proj(text_output.last_hidden_state[:, 0, :])
319
+
320
+ text_feats, graph_feats = F.normalize(text_feats, p=2, dim=-1), F.normalize(graph_feats, p=2, dim=-1)
321
+ text_feats_all, graph_feats_all = pl_concat_all_gather(text_feats), pl_concat_all_gather(graph_feats) # shape = [B * num_gpus, D]
322
+ sim_g2t, sim_t2g, loss_gtc = self.contrast_global(graph_feats, text_feats, graph_feats_all, text_feats_all, return_sim=True)
323
+
324
+
325
+ ###============== Image-text Matching ===================###
326
+ loss_gtm = 0
327
+ if self.gtm:
328
+ ## not aggregate global tensor because of their different shapes
329
+ g_emb_world = batch_node
330
+ g_mask_world = batch_mask
331
+ text_ids_world = text
332
+ text_mask_world = mask
333
+ with torch.no_grad():
334
+ weights_t2g = F.softmax(sim_t2g, dim=1) + 1e-4
335
+ weights_t2g.fill_diagonal_(0)
336
+ weights_g2t = F.softmax(sim_g2t, dim=1) + 1e-4
337
+ weights_g2t.fill_diagonal_(0)
338
+
339
+ # select a negative graph for each text
340
+ graph_embeds_neg = []
341
+ graph_mask_neg = []
342
+ for b in range(batch_size):
343
+ neg_idx = torch.multinomial(weights_t2g[b], 1).item()
344
+ graph_embeds_neg.append(g_emb_world[neg_idx])
345
+ graph_mask_neg.append(g_mask_world[neg_idx])
346
+
347
+ graph_embeds_neg = torch.stack(graph_embeds_neg, dim=0)
348
+ graph_mask_neg = torch.stack(graph_mask_neg, dim=0)
349
+
350
+ # select a negative text for each image
351
+ text_ids_neg = []
352
+ text_atts_neg = []
353
+ for b in range(batch_size):
354
+ neg_idx = torch.multinomial(weights_g2t[b], 1).item()
355
+ text_ids_neg.append(text_ids_world[neg_idx])
356
+ text_atts_neg.append(text_mask_world[neg_idx])
357
+
358
+ text_ids_neg = torch.stack(text_ids_neg, dim=0)
359
+ text_atts_neg = torch.stack(text_atts_neg, dim=0)
360
+
361
+ text_ids_all = torch.cat(
362
+ [text, text, text_ids_neg], dim=0
363
+ ) # pos, pos, neg
364
+ text_atts_all = torch.cat(
365
+ [mask, mask, text_atts_neg],
366
+ dim=0,
367
+ )
368
+
369
+ query_tokens_itm = self.query_tokens.expand(text_ids_all.shape[0], -1, -1)
370
+ query_atts_itm = torch.ones(query_tokens_itm.size()[:-1], dtype=torch.long, device=text.device)
371
+ attention_mask_all = torch.cat([query_atts_itm, text_atts_all], dim=1)
372
+
373
+ graph_embeds_all = torch.cat([batch_node, graph_embeds_neg, batch_node], dim=0) # pos, neg, pos
374
+ graph_atts_all = torch.cat([batch_mask, graph_mask_neg, batch_mask], dim=0)
375
+
376
+ output_itm = self.Qformer.bert(
377
+ text_ids_all,
378
+ query_embeds=query_tokens_itm,
379
+ attention_mask=attention_mask_all,
380
+ encoder_hidden_states=graph_embeds_all,
381
+ encoder_attention_mask=graph_atts_all,
382
+ return_dict=True,
383
+ )
384
+
385
+ vl_embeddings = output_itm.last_hidden_state[:, : query_tokens_itm.size(1), :] # keep query tokens only
386
+ vl_output = self.gtm_head(vl_embeddings)
387
+ logits = vl_output.mean(dim=1)
388
+
389
+ itm_labels = torch.cat(
390
+ [torch.ones(batch_size, dtype=torch.long), torch.zeros(2 * batch_size, dtype=torch.long)],
391
+ dim=0,
392
+ ).to(text.device)
393
+ loss_gtm = F.cross_entropy(logits, itm_labels)
394
+
395
+ ##================= Image Captioning ========================##
396
+ loss_lm = 0
397
+ if self.lm:
398
+ decoder_input_ids = text.clone()
399
+ decoder_input_ids[:, 0] = self.tokenizer.bos_token_id
400
+ labels = decoder_input_ids.masked_fill(
401
+ decoder_input_ids == self.tokenizer.pad_token_id, -100
402
+ )
403
+ query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=text.device)
404
+
405
+ attention_mask = torch.cat([query_atts, mask], dim=1)
406
+ lm_output = self.Qformer(
407
+ decoder_input_ids,
408
+ attention_mask=attention_mask,
409
+ past_key_values=query_output.past_key_values,
410
+ return_dict=True,
411
+ labels=labels,
412
+ )
413
+
414
+ loss_lm = lm_output.loss
415
+
416
+ return BlipOutput(
417
+ loss=loss_gtc + loss_gtm + loss_lm,
418
+ loss_itc=loss_gtc,
419
+ loss_itm=loss_gtm,
420
+ loss_lm=loss_lm,
421
+ )
422
+
423
+ def forward_v3(self, batch):
424
+ ## v3: use smiles instruction
425
+ ###============== Image-text Contrastive ===================###
426
+ graphs, text_tokens, prompt_tokens = batch
427
+ graph_embeds, graph_masks = self.graph_encoder(graphs)
428
+ if not self.tune_gnn:
429
+ graph_embeds = graph_embeds.detach()
430
+ graph_embeds = self.ln_graph(graph_embeds, graph_masks)
431
+
432
+ device = text_tokens.input_ids.device
433
+ batch_size = graph_embeds.shape[0]
434
+
435
+ ##
436
+ query_tokens = self.query_tokens.expand(graph_embeds.shape[0], -1, -1)
437
+ query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=device)
438
+ attention_mask_gtc = torch.cat([query_atts, prompt_tokens.attention_mask], dim=1)
439
+ query_output = self.Qformer.bert(
440
+ input_ids=prompt_tokens,
441
+ query_embeds=query_tokens,
442
+ attention_mask=attention_mask_gtc,
443
+ encoder_hidden_states=graph_embeds,
444
+ encoder_attention_mask=graph_masks, # fixme: check whether this mask is correct
445
+ use_cache=True,
446
+ return_dict=True,
447
+ )
448
+
449
+ query_output = query_output.last_hidden_state[:, : query_tokens.size(1), :] # keep query tokens only
450
+ graph_feats = self.graph_proj(query_output) # shape = [B, num_q, D]
451
+ text_output = self.Qformer.bert(text_tokens.input_ids, attention_mask=text_tokens.attention_mask, return_dict=True) # shape = [B, n_max, D]
452
+ text_feats = self.text_proj(text_output.last_hidden_state[:, 0, :])
453
+
454
+ text_feats, graph_feats = F.normalize(text_feats, p=2, dim=-1), F.normalize(graph_feats, p=2, dim=-1)
455
+ text_feats_all, graph_feats_all = pl_concat_all_gather(text_feats), pl_concat_all_gather(graph_feats) # shape = [B * num_gpus, D]
456
+ sim_g2t, sim_t2g, loss_gtc = self.contrast_global(graph_feats, text_feats, graph_feats_all, text_feats_all, return_sim=True)
457
+
458
+
459
+ ###============== Image-text Matching ===================###
460
+ loss_gtm = 0
461
+ if self.gtm:
462
+ ## not aggregate global tensor because of their different shapes
463
+ g_emb_world = graph_embeds
464
+ g_mask_world = graph_masks
465
+ text_ids_world = text_tokens.input_ids
466
+ text_mask_world = text_tokens.attention_mask
467
+ with torch.no_grad():
468
+ weights_t2g = F.softmax(sim_t2g, dim=1) + 1e-4
469
+ weights_t2g.fill_diagonal_(0)
470
+ weights_g2t = F.softmax(sim_g2t, dim=1) + 1e-4
471
+ weights_g2t.fill_diagonal_(0)
472
+
473
+ # select a negative graph for each text
474
+ graph_embeds_neg = []
475
+ graph_mask_neg = []
476
+ for b in range(batch_size):
477
+ neg_idx = torch.multinomial(weights_t2g[b], 1).item()
478
+ graph_embeds_neg.append(g_emb_world[neg_idx])
479
+ graph_mask_neg.append(g_mask_world[neg_idx])
480
+
481
+ graph_embeds_neg = torch.stack(graph_embeds_neg, dim=0)
482
+ graph_mask_neg = torch.stack(graph_mask_neg, dim=0)
483
+
484
+ # select a negative text for each image
485
+ text_ids_neg = []
486
+ text_atts_neg = []
487
+ for b in range(batch_size):
488
+ neg_idx = torch.multinomial(weights_g2t[b], 1).item()
489
+ text_ids_neg.append(text_ids_world[neg_idx])
490
+ text_atts_neg.append(text_mask_world[neg_idx])
491
+
492
+ text_ids_neg = torch.stack(text_ids_neg, dim=0)
493
+ text_atts_neg = torch.stack(text_atts_neg, dim=0)
494
+
495
+ text_ids_all = torch.cat(
496
+ [text_tokens.input_ids, text_tokens.input_ids, text_ids_neg], dim=0
497
+ ) # pos, pos, neg
498
+ text_atts_all = torch.cat(
499
+ [text_tokens.attention_mask, text_tokens.attention_mask, text_atts_neg],
500
+ dim=0,
501
+ )
502
+
503
+ query_tokens_itm = self.query_tokens.expand(text_ids_all.shape[0], -1, -1)
504
+ query_atts_itm = torch.ones(query_tokens_itm.size()[:-1], dtype=torch.long, device=text_tokens.input_ids.device)
505
+ attention_mask_all = torch.cat([query_atts_itm, text_atts_all], dim=1)
506
+
507
+ graph_embeds_all = torch.cat([graph_embeds, graph_embeds_neg, graph_embeds], dim=0) # pos, neg, pos
508
+ graph_atts_all = torch.cat([graph_masks, graph_mask_neg, graph_masks], dim=0)
509
+
510
+ output_itm = self.Qformer.bert(
511
+ text_ids_all,
512
+ query_embeds=query_tokens_itm,
513
+ attention_mask=attention_mask_all,
514
+ encoder_hidden_states=graph_embeds_all,
515
+ encoder_attention_mask=graph_atts_all,
516
+ return_dict=True,
517
+ )
518
+
519
+ vl_embeddings = output_itm.last_hidden_state[:, : query_tokens_itm.size(1), :] # keep query tokens only
520
+ vl_output = self.gtm_head(vl_embeddings)
521
+ logits = vl_output.mean(dim=1)
522
+
523
+ itm_labels = torch.cat(
524
+ [torch.ones(batch_size, dtype=torch.long), torch.zeros(2 * batch_size, dtype=torch.long)],
525
+ dim=0,
526
+ ).to(text_tokens.input_ids.device)
527
+ loss_gtm = F.cross_entropy(logits, itm_labels)
528
+
529
+ ##================= Image Captioning ========================##
530
+ loss_lm = 0
531
+ if self.lm:
532
+ decoder_input_ids = text_tokens.input_ids.clone()
533
+ decoder_input_ids[:, 0] = self.tokenizer.bos_token_id
534
+ labels = decoder_input_ids.masked_fill(
535
+ decoder_input_ids == self.tokenizer.pad_token_id, -100
536
+ )
537
+ query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=text_tokens.input_ids.device)
538
+
539
+ attention_mask = torch.cat([query_atts, prompt_tokens.attention_mask, text_tokens.attention_mask], dim=1)
540
+ lm_output = self.Qformer(
541
+ decoder_input_ids,
542
+ attention_mask=attention_mask,
543
+ past_key_values=query_output.past_key_values,
544
+ return_dict=True,
545
+ labels=labels,
546
+ )
547
+
548
+ loss_lm = lm_output.loss
549
+
550
+ return BlipOutput(
551
+ loss=loss_gtc + loss_gtm + loss_lm,
552
+ loss_itc=loss_gtc,
553
+ loss_itm=loss_gtm,
554
+ loss_lm=loss_lm,
555
+ )
556
+
557
+ def graph_forward(self, graph):
558
+ batch_node, batch_mask = self.graph_encoder(graph)
559
+ batch_node = self.ln_graph(batch_node, batch_mask)
560
+ query_tokens = self.query_tokens.expand(batch_node.shape[0], -1, -1)
561
+ query_output = self.Qformer.bert(
562
+ query_embeds=query_tokens,
563
+ encoder_hidden_states=batch_node,
564
+ encoder_attention_mask=batch_mask, # fixme: check whether this mask is correct
565
+ use_cache=False,
566
+ return_dict=True,
567
+ )
568
+ graph_feats = self.graph_proj(query_output.last_hidden_state) # shape = [B, num_q, D]
569
+ graph_feats = F.normalize(graph_feats, p=2, dim=-1)
570
+ return graph_feats, batch_node, batch_mask
571
+
572
+ def text_forward(self, text, mask):
573
+ text_output = self.Qformer.bert(text, attention_mask=mask, return_dict=True) # shape = [B, n_max, D]
574
+ text_feats = self.text_proj(text_output.last_hidden_state[:, 0, :] )
575
+ text_feats = F.normalize(text_feats, dim=-1, p=2)
576
+ return text_feats
577
+
578
+ def compute_gtm(self, batch_node, batch_mask, text_ids, text_atts):
579
+ '''
580
+ batch_node shape = [B, N, D]
581
+ batch_mask shape = [B, N]
582
+ text_ids shape = [B, N]
583
+ text_atts shape = [B, N]
584
+ '''
585
+ query_tokens = self.query_tokens.expand(batch_node.shape[0], -1, -1) # shape = [B, Nq, D]
586
+ query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(
587
+ batch_node.device
588
+ ) # shape = [B, Nq]
589
+ attention_mask = torch.cat([query_atts, text_atts], dim=1) # shape = [B, Nq + N]
590
+ output_gtm = self.Qformer.bert(
591
+ text_ids,
592
+ query_embeds=query_tokens,
593
+ attention_mask=attention_mask,
594
+ encoder_hidden_states=batch_node,
595
+ encoder_attention_mask=batch_mask,
596
+ return_dict=True,
597
+ )
598
+ gl_embeddings = output_gtm.last_hidden_state[:, : query_tokens.size(1), :] # shape = [B, Nq, D]
599
+ gtm_logit = self.gtm_head(gl_embeddings).mean(dim=1) # shape = [B, Nq, 2]
600
+ # gtm_logit = F.softmax(gtm_logit, dim=-1)[:, 1] # select the axis of the positive class
601
+ gtm_logit = gtm_logit[:, 1] # select the axis of the positive class
602
+ return gtm_logit
603
+
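
A minimal usage sketch (not part of the commit) of the inference helpers above: `graph_forward` encodes a graph batch and `compute_gtm` returns a graph-text matching logit per pair. It assumes `model` is an instance of the Q-Former class defined in this file with pretrained weights loaded, and that `graph_batch` is a `torch_geometric` batch accepted by `model.graph_encoder`; the tokenizer settings are illustrative.

```python
import torch

@torch.no_grad()
def gtm_scores(model, graph_batch, captions, device="cuda"):
    """Return the positive-class GTM logit for each paired (graph, caption)."""
    model.eval().to(device)
    # Encode the graphs; batch_node/batch_mask feed the Q-Former cross-attention.
    _, batch_node, batch_mask = model.graph_forward(graph_batch.to(device))
    text = model.tokenizer(captions, padding=True, truncation=True,
                           max_length=128, return_tensors="pt").to(device)
    # compute_gtm pairs graph i with caption i, so len(captions) must equal the graph batch size.
    return model.compute_gtm(batch_node, batch_mask, text.input_ids, text.attention_mask)
```
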
model/dist_funs.py ADDED
@@ -0,0 +1,83 @@
1
+ import torch
2
+ from typing import Any, Iterable, Iterator, List, Optional, Sized, Tuple, Union, Dict
3
+ from pytorch_lightning import strategies
4
+ from lightning_fabric.utilities.types import _PATH
5
+ from deepspeed.runtime.data_pipeline.data_routing.helper import remove_random_ltd_state_dict
6
+
7
+
8
+ '''
9
+ overwrite the function in deepspeed
10
+ '''
11
+
12
+ ### start overwrite ###
13
+ def module_state_dict(self, destination=None, prefix="", keep_vars=False, exclude_frozen_parameters=False):
14
+ sd = self.module.state_dict(destination, prefix, keep_vars)
15
+ # Remove frozen parameter weights from state_dict if specified
16
+ if exclude_frozen_parameters:
17
+ to_be_removed = []
18
+ for n in sd:
19
+ try:
20
+ if not self.module.get_parameter(n).requires_grad:
21
+ to_be_removed.append(n)
22
+ except AttributeError:
23
+ to_be_removed.append(n)
24
+ for key in to_be_removed:
25
+ sd.pop(key)
26
+ if self.random_ltd_enabled():
27
+ sd = remove_random_ltd_state_dict(sd)
28
+ return sd
29
+ from deepspeed import DeepSpeedEngine
30
+ DeepSpeedEngine.module_state_dict = module_state_dict
31
+ ### end overwrite ###
32
+
33
+ class MyDeepSpeedStrategy(strategies.DeepSpeedStrategy):
34
+ def save_checkpoint_v1(
35
+ self, checkpoint: Dict[str, Any], filepath: _PATH, storage_options: Optional[Any] = None
36
+ ):
37
+ """Save model/training states as a checkpoint file through state-dump and file-write.
38
+
39
+ Args:
40
+ checkpoint: dict containing model and trainer state
41
+ filepath: write-target file's path
42
+ storage_options: parameter for how to save to storage,
43
+ passed to ``CheckpointIO`` plugin
44
+ """
45
+ if self.is_global_zero:
46
+ self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options)
47
+
48
+ def load_model_state_dict(self, checkpoint):
49
+ assert self.lightning_module is not None
50
+ self.lightning_module.load_state_dict(checkpoint["state_dict"], strict=False)
51
+ def save_checkpoint(self, checkpoint: Dict, filepath: _PATH, storage_options: Optional[Any] = None) -> None:
52
+ """Save model/training states as a checkpoint file through state-dump and file-write.
53
+
54
+ Args:
55
+ checkpoint: The checkpoint state dictionary
56
+ filepath: write-target file's path
57
+ storage_options: not used for ``DeepSpeedStrategy`` as ``CheckpointIO`` is not used
58
+
59
+ Raises:
60
+ TypeError:
61
+ If ``storage_options`` arg is passed in
62
+ """
63
+ # broadcast the filepath from rank 0 to ensure all the states are saved in a common filepath
64
+ filepath = self.broadcast(filepath)
65
+ if storage_options is not None:
66
+ raise TypeError(
67
+ "`Trainer.save_checkpoint(..., storage_options=...)` with `storage_options` arg"
68
+ f" is not supported for `{self.__class__.__name__}` as `CheckpointIO` is not used."
69
+ )
70
+
71
+ if self.zero_stage_3 and self._multi_device and self.is_global_zero:
72
+ print(
73
+ "Warning: When saving the DeepSpeed Stage 3 checkpoint, "
74
+ "each worker will save a shard of the checkpoint within a directory. "
75
+ "If a single file is required after training, "
76
+ "see https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#"
77
+ "deepspeed-zero-stage-3-single-file for instructions."
78
+ )
79
+ # Use deepspeed's internal checkpointing function to handle partitioned weights across processes
80
+ # dump states as a checkpoint dictionary object
81
+ _exclude_keys = ["state_dict", "optimizer_states"]
82
+ checkpoint = {k: v for k, v in checkpoint.items() if k not in _exclude_keys}
83
+ self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint, tag="checkpoint", exclude_frozen_parameters=True)
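
A minimal sketch (not part of the commit) of how `MyDeepSpeedStrategy` might be passed to a `pytorch_lightning.Trainer`. The ZeRO stage, device count, and `MyLitModule` are illustrative assumptions; the patched `module_state_dict` above makes saved checkpoints skip frozen parameters, which keeps them small when most of the backbone is frozen.

```python
import pytorch_lightning as pl
from model.dist_funs import MyDeepSpeedStrategy

# MyLitModule / my_datamodule are placeholders for the project's LightningModule and DataModule.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=2,
    strategy=MyDeepSpeedStrategy(stage=2),  # inherits all DeepSpeedStrategy arguments
    max_epochs=10,
)
# trainer.fit(MyLitModule(), datamodule=my_datamodule)
```
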
model/gin_model.py ADDED
@@ -0,0 +1,397 @@
1
+ import torch
2
+ from torch_geometric.nn import MessagePassing
3
+ from torch_geometric.utils import add_self_loops, degree, softmax, to_dense_batch
4
+ from torch_geometric.nn import global_add_pool, global_mean_pool, global_max_pool, GlobalAttention, Set2Set
5
+ import torch.nn.functional as F
6
+ # from torch_scatter import scatter_add
7
+ from torch_geometric.nn.inits import glorot, zeros
8
+
9
+ num_atom_type = 120 #including the extra mask tokens
10
+ num_chirality_tag = 3
11
+
12
+ num_bond_type = 6 #including aromatic and self-loop edge, and extra masked tokens
13
+ num_bond_direction = 3
14
+
15
+ class GINConv(MessagePassing):
16
+ """
17
+ Extension of GIN aggregation to incorporate edge information by concatenation.
18
+
19
+ Args:
20
+ emb_dim (int): dimensionality of embeddings for nodes and edges.
21
+ embed_input (bool): whether to embed input or not.
22
+
23
+
24
+ See https://arxiv.org/abs/1810.00826
25
+ """
26
+ def __init__(self, emb_dim, aggr = "add"):
27
+ super(GINConv, self).__init__(aggr = "add")
28
+ #multi-layer perceptron
29
+ self.mlp = torch.nn.Sequential(torch.nn.Linear(emb_dim, 2*emb_dim), torch.nn.ReLU(), torch.nn.Linear(2*emb_dim, emb_dim))
30
+ self.edge_embedding1 = torch.nn.Embedding(num_bond_type, emb_dim)
31
+ self.edge_embedding2 = torch.nn.Embedding(num_bond_direction, emb_dim)
32
+
33
+ torch.nn.init.xavier_uniform_(self.edge_embedding1.weight.data)
34
+ torch.nn.init.xavier_uniform_(self.edge_embedding2.weight.data)
35
+ self.aggr = aggr
36
+
37
+ def forward(self, x, edge_index, edge_attr):
38
+ #add self loops in the edge space
39
+ # print('--------------------')
40
+ # print('x:', x.shape)
41
+ # print('edge_index:',edge_index.shape)
42
+ edge_index, edge_attr = add_self_loops(edge_index, edge_attr, fill_value=0, num_nodes = x.size(0))
43
+
44
+
45
+ #add features corresponding to self-loop edges.
46
+ # self_loop_attr = torch.zeros(x.size(0), 2)
47
+ # self_loop_attr[:,0] = 4 #bond type for self-loop edge
48
+ # self_loop_attr = self_loop_attr.to(edge_attr.device).to(edge_attr.dtype)
49
+ # print('edge_attr:',edge_attr.shape)
50
+ # print('self_loop_attr:',self_loop_attr.shape)
51
+ # print('--------------------')
52
+ # edge_attr = torch.cat((edge_attr, self_loop_attr), dim = 0)
53
+
54
+ edge_embeddings = self.edge_embedding1(edge_attr[:,0]) + self.edge_embedding2(edge_attr[:,1])
55
+
56
+ return self.propagate(edge_index, x=x, edge_attr=edge_embeddings)
57
+
58
+ def message(self, x_j, edge_attr):
59
+ return x_j + edge_attr
60
+
61
+ def update(self, aggr_out):
62
+ return self.mlp(aggr_out)
63
+
64
+
65
+ class GCNConv(MessagePassing):
66
+
67
+ def __init__(self, emb_dim, aggr = "add"):
68
+ super(GCNConv, self).__init__()
69
+
70
+ self.emb_dim = emb_dim
71
+ self.linear = torch.nn.Linear(emb_dim, emb_dim)
72
+ self.edge_embedding1 = torch.nn.Embedding(num_bond_type, emb_dim)
73
+ self.edge_embedding2 = torch.nn.Embedding(num_bond_direction, emb_dim)
74
+
75
+ torch.nn.init.xavier_uniform_(self.edge_embedding1.weight.data)
76
+ torch.nn.init.xavier_uniform_(self.edge_embedding2.weight.data)
77
+
78
+ self.aggr = aggr
79
+
80
+ def norm(self, edge_index, num_nodes, dtype):
81
+ ### assuming that self-loops have been already added in edge_index
82
+ edge_weight = torch.ones((edge_index.size(1), ), dtype=dtype,
83
+ device=edge_index.device)
84
+ row, col = edge_index
85
+ deg = scatter_add(edge_weight, row, dim=0, dim_size=num_nodes)
86
+ deg_inv_sqrt = deg.pow(-0.5)
87
+ deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
88
+
89
+ return deg_inv_sqrt[row] * edge_weight * deg_inv_sqrt[col]
90
+
91
+
92
+ def forward(self, x, edge_index, edge_attr):
93
+ #add self loops in the edge space
94
+ edge_index = add_self_loops(edge_index, num_nodes = x.size(0))
95
+
96
+ #add features corresponding to self-loop edges.
97
+ self_loop_attr = torch.zeros(x.size(0), 2)
98
+ self_loop_attr[:,0] = 4 #bond type for self-loop edge
99
+ self_loop_attr = self_loop_attr.to(edge_attr.device).to(edge_attr.dtype)
100
+ edge_attr = torch.cat((edge_attr, self_loop_attr), dim = 0)
101
+
102
+ edge_embeddings = self.edge_embedding1(edge_attr[:,0]) + self.edge_embedding2(edge_attr[:,1])
103
+
104
+ norm = self.norm(edge_index, x.size(0), x.dtype)
105
+
106
+ x = self.linear(x)
107
+
108
+ return self.propagate(self.aggr, edge_index, x=x, edge_attr=edge_embeddings, norm = norm)
109
+
110
+ def message(self, x_j, edge_attr, norm):
111
+ return norm.view(-1, 1) * (x_j + edge_attr)
112
+
113
+
114
+ class GATConv(MessagePassing):
115
+ def __init__(self, emb_dim, heads=2, negative_slope=0.2, aggr = "add"):
116
+ super(GATConv, self).__init__()
117
+
118
+ self.aggr = aggr
119
+
120
+ self.emb_dim = emb_dim
121
+ self.heads = heads
122
+ self.negative_slope = negative_slope
123
+
124
+ self.weight_linear = torch.nn.Linear(emb_dim, heads * emb_dim)
125
+ self.att = torch.nn.Parameter(torch.Tensor(1, heads, 2 * emb_dim))
126
+
127
+ self.bias = torch.nn.Parameter(torch.Tensor(emb_dim))
128
+
129
+ self.edge_embedding1 = torch.nn.Embedding(num_bond_type, heads * emb_dim)
130
+ self.edge_embedding2 = torch.nn.Embedding(num_bond_direction, heads * emb_dim)
131
+
132
+ torch.nn.init.xavier_uniform_(self.edge_embedding1.weight.data)
133
+ torch.nn.init.xavier_uniform_(self.edge_embedding2.weight.data)
134
+
135
+ self.reset_parameters()
136
+
137
+ def reset_parameters(self):
138
+ glorot(self.att)
139
+ zeros(self.bias)
140
+
141
+ def forward(self, x, edge_index, edge_attr):
142
+
143
+ #add self loops in the edge space
144
+ edge_index = add_self_loops(edge_index, num_nodes = x.size(0))
145
+
146
+ #add features corresponding to self-loop edges.
147
+ self_loop_attr = torch.zeros(x.size(0), 2)
148
+ self_loop_attr[:,0] = 4 #bond type for self-loop edge
149
+ self_loop_attr = self_loop_attr.to(edge_attr.device).to(edge_attr.dtype)
150
+ edge_attr = torch.cat((edge_attr, self_loop_attr), dim = 0)
151
+
152
+ edge_embeddings = self.edge_embedding1(edge_attr[:,0]) + self.edge_embedding2(edge_attr[:,1])
153
+
154
+ x = self.weight_linear(x).view(-1, self.heads, self.emb_dim)
155
+ return self.propagate(self.aggr, edge_index, x=x, edge_attr=edge_embeddings)
156
+
157
+ def message(self, edge_index, x_i, x_j, edge_attr):
158
+ edge_attr = edge_attr.view(-1, self.heads, self.emb_dim)
159
+ x_j += edge_attr
160
+
161
+ alpha = (torch.cat([x_i, x_j], dim=-1) * self.att).sum(dim=-1)
162
+
163
+ alpha = F.leaky_relu(alpha, self.negative_slope)
164
+ alpha = softmax(alpha, edge_index[0])
165
+
166
+ return x_j * alpha.view(-1, self.heads, 1)
167
+
168
+ def update(self, aggr_out):
169
+ aggr_out = aggr_out.mean(dim=1)
170
+ aggr_out = aggr_out + self.bias
171
+
172
+ return aggr_out
173
+
174
+
175
+ class GraphSAGEConv(MessagePassing):
176
+ def __init__(self, emb_dim, aggr = "mean"):
177
+ super(GraphSAGEConv, self).__init__()
178
+
179
+ self.emb_dim = emb_dim
180
+ self.linear = torch.nn.Linear(emb_dim, emb_dim)
181
+ self.edge_embedding1 = torch.nn.Embedding(num_bond_type, emb_dim)
182
+ self.edge_embedding2 = torch.nn.Embedding(num_bond_direction, emb_dim)
183
+
184
+ torch.nn.init.xavier_uniform_(self.edge_embedding1.weight.data)
185
+ torch.nn.init.xavier_uniform_(self.edge_embedding2.weight.data)
186
+
187
+ self.aggr = aggr
188
+
189
+ def forward(self, x, edge_index, edge_attr):
190
+ #add self loops in the edge space
191
+ edge_index = add_self_loops(edge_index, num_nodes = x.size(0))
192
+
193
+ #add features corresponding to self-loop edges.
194
+ self_loop_attr = torch.zeros(x.size(0), 2)
195
+ self_loop_attr[:,0] = 4 #bond type for self-loop edge
196
+ self_loop_attr = self_loop_attr.to(edge_attr.device).to(edge_attr.dtype)
197
+ edge_attr = torch.cat((edge_attr, self_loop_attr), dim = 0)
198
+
199
+ edge_embeddings = self.edge_embedding1(edge_attr[:,0]) + self.edge_embedding2(edge_attr[:,1])
200
+
201
+ x = self.linear(x)
202
+
203
+ return self.propagate(self.aggr, edge_index, x=x, edge_attr=edge_embeddings)
204
+
205
+ def message(self, x_j, edge_attr):
206
+ return x_j + edge_attr
207
+
208
+ def update(self, aggr_out):
209
+ return F.normalize(aggr_out, p = 2, dim = -1)
210
+
211
+
212
+
213
+ class GNN(torch.nn.Module):
214
+ """
215
+
216
+
217
+ Args:
218
+ num_layer (int): the number of GNN layers
219
+ emb_dim (int): dimensionality of embeddings
220
+ JK (str): last, concat, max or sum.
221
+ max_pool_layer (int): the layer from which we use max pool rather than add pool for neighbor aggregation
222
+ drop_ratio (float): dropout rate
223
+ gnn_type: gin, gcn, graphsage, gat
224
+
225
+ Output:
226
+ node representations
227
+
228
+ """
229
+ def __init__(self, num_layer, emb_dim, JK = "last", drop_ratio = 0, gnn_type = "gin"):
230
+ super(GNN, self).__init__()
231
+ self.num_layer = num_layer
232
+ self.drop_ratio = drop_ratio
233
+ self.JK = JK
234
+
235
+ if self.num_layer < 2:
236
+ raise ValueError("Number of GNN layers must be greater than 1.")
237
+
238
+ self.x_embedding1 = torch.nn.Embedding(num_atom_type, emb_dim)
239
+ self.x_embedding2 = torch.nn.Embedding(num_chirality_tag, emb_dim)
240
+
241
+ torch.nn.init.xavier_uniform_(self.x_embedding1.weight.data)
242
+ torch.nn.init.xavier_uniform_(self.x_embedding2.weight.data)
243
+
244
+ ###List of MLPs
245
+ self.gnns = torch.nn.ModuleList()
246
+ for layer in range(num_layer):
247
+ if gnn_type == "gin":
248
+ self.gnns.append(GINConv(emb_dim, aggr = "add"))
249
+ elif gnn_type == "gcn":
250
+ self.gnns.append(GCNConv(emb_dim))
251
+ elif gnn_type == "gat":
252
+ self.gnns.append(GATConv(emb_dim))
253
+ elif gnn_type == "graphsage":
254
+ self.gnns.append(GraphSAGEConv(emb_dim))
255
+
256
+ self.pool = global_mean_pool
257
+
258
+ ###List of batchnorms
259
+ self.batch_norms = torch.nn.ModuleList()
260
+ for layer in range(num_layer):
261
+ self.batch_norms.append(torch.nn.BatchNorm1d(emb_dim))
262
+ self.num_features = emb_dim
263
+ self.cat_grep = True
264
+
265
+ #def forward(self, x, edge_index, edge_attr):
266
+ def forward(self, *argv):
267
+ if len(argv) == 3:
268
+ x, edge_index, edge_attr = argv[0], argv[1], argv[2]
269
+ elif len(argv) == 1:
270
+ data = argv[0]
271
+ x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
272
+ else:
273
+ raise ValueError("unmatched number of arguments.")
274
+
275
+ x = self.x_embedding1(x[:,0]) + self.x_embedding2(x[:,1])
276
+
277
+ h_list = [x]
278
+ for layer in range(self.num_layer):
279
+ h = self.gnns[layer](h_list[layer], edge_index, edge_attr)
280
+ h = self.batch_norms[layer](h)
281
+ #h = F.dropout(F.relu(h), self.drop_ratio, training = self.training)
282
+ if layer == self.num_layer - 1:
283
+ #remove relu for the last layer
284
+ h = F.dropout(h, self.drop_ratio, training = self.training)
285
+ else:
286
+ h = F.dropout(F.relu(h), self.drop_ratio, training = self.training)
287
+ h_list.append(h)
288
+
289
+ ### Different implementations of Jk-concat
290
+ if self.JK == "concat":
291
+ node_representation = torch.cat(h_list, dim = 1)
292
+ elif self.JK == "last":
293
+ node_representation = h_list[-1]
294
+ elif self.JK == "max":
295
+ h_list = [h.unsqueeze_(0) for h in h_list]
296
+ node_representation = torch.max(torch.cat(h_list, dim = 0), dim = 0)[0]
297
+ elif self.JK == "sum":
298
+ h_list = [h.unsqueeze_(0) for h in h_list]
299
+ node_representation = torch.sum(torch.cat(h_list, dim=0), dim=0)  # torch.sum returns a tensor, so no [0] indexing (unlike the max branch)
300
+
301
+
302
+ h_graph = self.pool(node_representation, batch) # shape = [B, D]
303
+ batch_node, batch_mask = to_dense_batch(node_representation, batch) # shape = [B, n_max, D],
304
+ batch_mask = batch_mask.bool()
305
+
306
+ if self.cat_grep:
307
+ batch_node = torch.cat((h_graph.unsqueeze(1), batch_node), dim=1) # shape = [B, n_max+1, D]
308
+ batch_mask = torch.cat([torch.ones((batch_mask.shape[0], 1), dtype=torch.bool, device=batch.device), batch_mask], dim=1)
309
+ return batch_node, batch_mask
310
+ else:
311
+ return batch_node, batch_mask, h_graph
312
+
313
+
314
+ class GNN_graphpred(torch.nn.Module):
315
+ """
316
+ Extension of GIN to incorporate edge information by concatenation.
317
+
318
+ Args:
319
+ num_layer (int): the number of GNN layers
320
+ emb_dim (int): dimensionality of embeddings
321
+ num_tasks (int): number of tasks in multi-task learning scenario
322
+ drop_ratio (float): dropout rate
323
+ JK (str): last, concat, max or sum.
324
+ graph_pooling (str): sum, mean, max, attention, set2set
325
+ gnn_type: gin, gcn, graphsage, gat
326
+
327
+ See https://arxiv.org/abs/1810.00826
328
+ JK-net: https://arxiv.org/abs/1806.03536
329
+ """
330
+ def __init__(self, num_layer, emb_dim, num_tasks, JK = "last", drop_ratio = 0, graph_pooling = "mean", gnn_type = "gin"):
331
+ super(GNN_graphpred, self).__init__()
332
+ self.num_layer = num_layer
333
+ self.drop_ratio = drop_ratio
334
+ self.JK = JK
335
+ self.emb_dim = emb_dim
336
+ self.num_tasks = num_tasks
337
+
338
+ if self.num_layer < 2:
339
+ raise ValueError("Number of GNN layers must be greater than 1.")
340
+
341
+ self.gnn = GNN(num_layer, emb_dim, JK, drop_ratio, gnn_type = gnn_type)
342
+
343
+ #Different kind of graph pooling
344
+ if graph_pooling == "sum":
345
+ self.pool = global_add_pool
346
+ elif graph_pooling == "mean":
347
+ self.pool = global_mean_pool
348
+ elif graph_pooling == "max":
349
+ self.pool = global_max_pool
350
+ elif graph_pooling == "attention":
351
+ if self.JK == "concat":
352
+ self.pool = GlobalAttention(gate_nn = torch.nn.Linear((self.num_layer + 1) * emb_dim, 1))
353
+ else:
354
+ self.pool = GlobalAttention(gate_nn = torch.nn.Linear(emb_dim, 1))
355
+ elif graph_pooling[:-1] == "set2set":
356
+ set2set_iter = int(graph_pooling[-1])
357
+ if self.JK == "concat":
358
+ self.pool = Set2Set((self.num_layer + 1) * emb_dim, set2set_iter)
359
+ else:
360
+ self.pool = Set2Set(emb_dim, set2set_iter)
361
+ else:
362
+ raise ValueError("Invalid graph pooling type.")
363
+
364
+ #For graph-level binary classification
365
+ if graph_pooling[:-1] == "set2set":
366
+ self.mult = 2
367
+ else:
368
+ self.mult = 1
369
+
370
+ if self.JK == "concat":
371
+ self.graph_pred_linear = torch.nn.Linear(self.mult * (self.num_layer + 1) * self.emb_dim, self.num_tasks)
372
+ else:
373
+ self.graph_pred_linear = torch.nn.Linear(self.mult * self.emb_dim, self.num_tasks)
374
+
375
+ def from_pretrained(self, model_file):
376
+ #self.gnn = GNN(self.num_layer, self.emb_dim, JK = self.JK, drop_ratio = self.drop_ratio)
377
+ missing_keys, unexpected_keys = self.gnn.load_state_dict(torch.load(model_file))
378
+ print(missing_keys)
379
+ print(unexpected_keys)
380
+
381
+ def forward(self, *argv):
382
+ if len(argv) == 4:
383
+ x, edge_index, edge_attr, batch = argv[0], argv[1], argv[2], argv[3]
384
+ elif len(argv) == 1:
385
+ data = argv[0]
386
+ x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
387
+ else:
388
+ raise ValueError("unmatched number of arguments.")
389
+
390
+ node_representation = self.gnn(x, edge_index, edge_attr)
391
+
392
+ return self.graph_pred_linear(self.pool(node_representation, batch))
393
+
394
+
395
+ if __name__ == "__main__":
396
+ pass
397
+
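
A minimal sketch (not part of the commit) that exercises the `GNN` encoder above on a toy two-molecule batch, assuming `torch_geometric` >= 2.0. The layer count and embedding size are illustrative; the encoder expects integer node features `[atom_type, chirality_tag]` and edge features `[bond_type, bond_direction]`, and with `cat_grep=True` it prepends the mean-pooled graph embedding to the node embeddings.

```python
import torch
from torch_geometric.data import Data, Batch
from model.gin_model import GNN

def toy_graph(num_nodes):
    # A simple path graph with placeholder atom/bond categories (all zeros).
    x = torch.zeros(num_nodes, 2, dtype=torch.long)                    # [atom_type, chirality_tag]
    edge_index = torch.tensor([[i, i + 1] for i in range(num_nodes - 1)]).t().contiguous()
    edge_index = torch.cat([edge_index, edge_index.flip(0)], dim=1)    # add reverse edges
    edge_attr = torch.zeros(edge_index.size(1), 2, dtype=torch.long)   # [bond_type, bond_direction]
    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

model = GNN(num_layer=5, emb_dim=300, JK="last", drop_ratio=0.0, gnn_type="gin").eval()
batch = Batch.from_data_list([toy_graph(4), toy_graph(6)])
with torch.no_grad():
    batch_node, batch_mask = model(batch)
print(batch_node.shape, batch_mask.shape)  # [2, n_max + 1, 300], [2, n_max + 1] with cat_grep=True
```
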
model/help_funcs.py ADDED
@@ -0,0 +1,86 @@
1
+ from nltk.translate.bleu_score import corpus_bleu
2
+ from nltk.translate.meteor_score import meteor_score
3
+ from rouge_score import rouge_scorer
4
+ from tqdm import tqdm
5
+ import numpy as np
6
+ import json
7
+ from transformers import AutoTokenizer
8
+
9
+ def caption_evaluate(predictions, targets, tokenizer, text_trunc_length):
10
+ meteor_scores = []
11
+ references = []
12
+ hypotheses = []
13
+ for gt, out in tqdm(zip(targets, predictions)):
14
+ gt_tokens = tokenizer.tokenize(gt, truncation=True, max_length=text_trunc_length,
15
+ padding='max_length')
16
+ gt_tokens = list(filter(('[PAD]').__ne__, gt_tokens))
17
+ gt_tokens = list(filter(('[CLS]').__ne__, gt_tokens))
18
+ gt_tokens = list(filter(('[SEP]').__ne__, gt_tokens))
19
+
20
+ out_tokens = tokenizer.tokenize(out, truncation=True, max_length=text_trunc_length,
21
+ padding='max_length')
22
+ out_tokens = list(filter(('[PAD]').__ne__, out_tokens))
23
+ out_tokens = list(filter(('[CLS]').__ne__, out_tokens))
24
+ out_tokens = list(filter(('[SEP]').__ne__, out_tokens))
25
+
26
+ references.append([gt_tokens])
27
+ hypotheses.append(out_tokens)
28
+
29
+ mscore = meteor_score([gt_tokens], out_tokens)
30
+ meteor_scores.append(mscore)
31
+
32
+ bleu2 = corpus_bleu(references, hypotheses, weights=(.5,.5))
33
+ bleu4 = corpus_bleu(references, hypotheses, weights=(.25,.25,.25,.25))
34
+ bleu2 *= 100
35
+ bleu4 *= 100
36
+
37
+ print('BLEU-2 score:', bleu2)
38
+ print('BLEU-4 score:', bleu4)
39
+ _meteor_score = np.mean(meteor_scores)
40
+ _meteor_score *= 100
41
+ print('Average Meteor score:', _meteor_score)
42
+
43
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
44
+
45
+ rouge_scores = []
46
+
47
+ references = []
48
+ hypotheses = []
49
+
50
+ for gt, out in tqdm(zip(targets, predictions)):
51
+ rs = scorer.score(out, gt)
52
+ rouge_scores.append(rs)
53
+
54
+ print('ROUGE score:')
55
+ rouge_1 = np.mean([rs['rouge1'].fmeasure for rs in rouge_scores]) * 100
56
+ rouge_2 = np.mean([rs['rouge2'].fmeasure for rs in rouge_scores]) * 100
57
+ rouge_l = np.mean([rs['rougeL'].fmeasure for rs in rouge_scores]) * 100
58
+ print('rouge1:', rouge_1)
59
+ print('rouge2:', rouge_2)
60
+ print('rougeL:', rouge_l)
61
+ return bleu2, bleu4, rouge_1, rouge_2, rouge_l, _meteor_score
62
+
63
+
64
+ class AttrDict(dict):
65
+ def __init__(self, *args, **kwargs):
66
+ super(AttrDict, self).__init__(*args, **kwargs)
67
+ self.__dict__ = self
68
+
69
+ def get_tokens_as_list(tokenizer, word_list):
70
+ "Converts a sequence of words into a list of tokens"
71
+ "Source: https://huggingface.co/docs/transformers/internal/generation_utils"
72
+ tokens_list = []
73
+ for word in word_list:
74
+ tokenized_word = tokenizer([word], add_special_tokens=False).input_ids[0]
75
+ tokens_list.extend(tokenized_word)
76
+ return tokens_list
77
+
78
+ def get_not_allowed_tokens_ids(tokenizer_name, allowed_words_file='model/allowed_words.json'):
79
+ tokenizer_with_prefix_space = AutoTokenizer.from_pretrained(tokenizer_name, add_prefix_space=True)
80
+ with open(allowed_words_file, 'r') as f:
81
+ allowed_words = json.load(f)
82
+ allowed_words = list(allowed_words.values())
83
+ allowed_tokens_ids = get_tokens_as_list(tokenizer_with_prefix_space, allowed_words)
84
+ full_token_space = list(range(tokenizer_with_prefix_space.vocab_size))
85
+ not_allowed_tokens_ids = [[token_id] for token_id in full_token_space if token_id not in allowed_tokens_ids]
86
+ return not_allowed_tokens_ids
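
A minimal sketch (not part of the commit) of calling `caption_evaluate` on toy outputs. The tokenizer checkpoint is an illustrative assumption; any HuggingFace tokenizer whose special tokens are `[PAD]`/`[CLS]`/`[SEP]` matches the filtering above, and METEOR requires the NLTK WordNet data.

```python
import nltk
from transformers import AutoTokenizer
from model.help_funcs import caption_evaluate

nltk.download("wordnet", quiet=True)  # needed by nltk.translate.meteor_score

predictions = ["The molecule is a colorless liquid used as a solvent."]
targets = ["The molecule is a colourless liquid that is widely used as a solvent."]
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative choice

bleu2, bleu4, rouge1, rouge2, rougel, meteor = caption_evaluate(
    predictions, targets, tokenizer, text_trunc_length=128
)
```
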
model/modeling_llama.py ADDED
@@ -0,0 +1,888 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """ PyTorch LLaMA model."""
21
+ import math
22
+ from typing import List, Optional, Tuple, Union
23
+
24
+ import torch
25
+ import torch.utils.checkpoint
26
+ from torch import nn
27
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
28
+
29
+ from transformers.activations import ACT2FN
30
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
31
+ from transformers.modeling_utils import PreTrainedModel
32
+ from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
33
+ from transformers.models.llama.configuration_llama import LlamaConfig
34
+
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
+ _CONFIG_FOR_DOC = "LlamaConfig"
39
+
40
+
41
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
42
+ def _make_causal_mask(
43
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
44
+ ):
45
+ """
46
+ Make causal mask used for bi-directional self-attention.
47
+ """
48
+ bsz, tgt_len = input_ids_shape
49
+ mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
50
+ mask_cond = torch.arange(mask.size(-1), device=device)
51
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
52
+ mask = mask.to(dtype)
53
+
54
+ if past_key_values_length > 0:
55
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
56
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
57
+
58
+
59
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
60
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
61
+ """
62
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
63
+ """
64
+ bsz, src_len = mask.size()
65
+ tgt_len = tgt_len if tgt_len is not None else src_len
66
+
67
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
68
+
69
+ inverted_mask = 1.0 - expanded_mask
70
+
71
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
72
+
73
+
74
+ class LlamaRMSNorm(nn.Module):
75
+ def __init__(self, hidden_size, eps=1e-6):
76
+ """
77
+ LlamaRMSNorm is equivalent to T5LayerNorm
78
+ """
79
+ super().__init__()
80
+ self.weight = nn.Parameter(torch.ones(hidden_size))
81
+ self.variance_epsilon = eps
82
+
83
+ def forward(self, hidden_states):
84
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
85
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
86
+
87
+ # convert into half-precision if necessary
88
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
89
+ hidden_states = hidden_states.to(self.weight.dtype)
90
+
91
+ return self.weight * hidden_states
92
+
93
+
94
+ class LlamaRotaryEmbedding(torch.nn.Module):
95
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
96
+ super().__init__()
97
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
98
+ self.register_buffer("inv_freq", inv_freq)
99
+
100
+ # Build here to make `torch.jit.trace` work.
101
+ self.max_seq_len_cached = max_position_embeddings
102
+ t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
103
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
104
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
105
+ emb = torch.cat((freqs, freqs), dim=-1)
106
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
107
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
108
+
109
+ def forward(self, x, seq_len=None):
110
+ # x: [bs, num_attention_heads, seq_len, head_size]
111
+ # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
112
+ if seq_len > self.max_seq_len_cached:
113
+ self.max_seq_len_cached = seq_len
114
+ t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
115
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
116
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
117
+ emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
118
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
119
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
120
+ return (
121
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
122
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
123
+ )
124
+
125
+
126
+ def rotate_half(x):
127
+ """Rotates half the hidden dims of the input."""
128
+ x1 = x[..., : x.shape[-1] // 2]
129
+ x2 = x[..., x.shape[-1] // 2 :]
130
+ return torch.cat((-x2, x1), dim=-1)
131
+
132
+
133
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
134
+ gather_indices = position_ids[:, None, :, None] # [bs, 1, seq_len, 1]
135
+ gather_indices = gather_indices.repeat(1, cos.shape[1], 1, cos.shape[3])
136
+ cos = torch.gather(cos.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
137
+ sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
138
+ q_embed = (q * cos) + (rotate_half(q) * sin)
139
+ k_embed = (k * cos) + (rotate_half(k) * sin)
140
+ return q_embed, k_embed
141
+
142
+
143
+ class LlamaMLP(nn.Module):
144
+ def __init__(
145
+ self,
146
+ hidden_size: int,
147
+ intermediate_size: int,
148
+ hidden_act: str,
149
+ ):
150
+ super().__init__()
151
+ self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
152
+ self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
153
+ self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
154
+ self.act_fn = ACT2FN[hidden_act]
155
+
156
+ def forward(self, x):
157
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
158
+
159
+
160
+ class LlamaAttention(nn.Module):
161
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
162
+
163
+ def __init__(self, config: LlamaConfig):
164
+ super().__init__()
165
+ self.config = config
166
+ self.hidden_size = config.hidden_size
167
+ self.num_heads = config.num_attention_heads
168
+ self.head_dim = self.hidden_size // self.num_heads
169
+ self.max_position_embeddings = config.max_position_embeddings
170
+
171
+ if (self.head_dim * self.num_heads) != self.hidden_size:
172
+ raise ValueError(
173
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
174
+ f" and `num_heads`: {self.num_heads})."
175
+ )
176
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
177
+ self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
178
+ self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
179
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
180
+ self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
181
+
182
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
183
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
184
+
185
+ def forward(
186
+ self,
187
+ hidden_states: torch.Tensor,
188
+ attention_mask: Optional[torch.Tensor] = None,
189
+ position_ids: Optional[torch.LongTensor] = None,
190
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
191
+ output_attentions: bool = False,
192
+ use_cache: bool = False,
193
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
194
+ bsz, q_len, _ = hidden_states.size()
195
+
196
+ query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
197
+ key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
198
+ value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
199
+
200
+ kv_seq_len = key_states.shape[-2]
201
+ if past_key_value is not None:
202
+ kv_seq_len += past_key_value[0].shape[-2]
203
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
204
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
205
+ # [bsz, nh, t, hd]
206
+
207
+ if past_key_value is not None:
208
+ # reuse k, v, self_attention
209
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
210
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
211
+
212
+ past_key_value = (key_states, value_states) if use_cache else None
213
+
214
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
215
+
216
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
217
+ raise ValueError(
218
+ f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is"
219
+ f" {attn_weights.size()}"
220
+ )
221
+
222
+ if attention_mask is not None:
223
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
224
+ raise ValueError(
225
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
226
+ )
227
+ attn_weights = attn_weights + attention_mask
228
+ attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
229
+
230
+ # upcast attention to fp32
231
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
232
+ attn_output = torch.matmul(attn_weights, value_states)
233
+
234
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
235
+ raise ValueError(
236
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
237
+ f" {attn_output.size()}"
238
+ )
239
+
240
+ attn_output = attn_output.transpose(1, 2)
241
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
242
+
243
+ attn_output = self.o_proj(attn_output)
244
+
245
+ if not output_attentions:
246
+ attn_weights = None
247
+
248
+ return attn_output, attn_weights, past_key_value
249
+
250
+
251
+ class LlamaDecoderLayer(nn.Module):
252
+ def __init__(self, config: LlamaConfig):
253
+ super().__init__()
254
+ self.hidden_size = config.hidden_size
255
+ self.self_attn = LlamaAttention(config=config)
256
+ self.mlp = LlamaMLP(
257
+ hidden_size=self.hidden_size,
258
+ intermediate_size=config.intermediate_size,
259
+ hidden_act=config.hidden_act,
260
+ )
261
+ self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
262
+ self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
263
+
264
+ def forward(
265
+ self,
266
+ hidden_states: torch.Tensor,
267
+ attention_mask: Optional[torch.Tensor] = None,
268
+ position_ids: Optional[torch.LongTensor] = None,
269
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
270
+ output_attentions: Optional[bool] = False,
271
+ use_cache: Optional[bool] = False,
272
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
273
+ """
274
+ Args:
275
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
276
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
277
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
278
+ output_attentions (`bool`, *optional*):
279
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
280
+ returned tensors for more detail.
281
+ use_cache (`bool`, *optional*):
282
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
283
+ (see `past_key_values`).
284
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
285
+ """
286
+
287
+ residual = hidden_states
288
+
289
+ hidden_states = self.input_layernorm(hidden_states)
290
+
291
+ # Self Attention
292
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
293
+ hidden_states=hidden_states,
294
+ attention_mask=attention_mask,
295
+ position_ids=position_ids,
296
+ past_key_value=past_key_value,
297
+ output_attentions=output_attentions,
298
+ use_cache=use_cache,
299
+ )
300
+ hidden_states = residual + hidden_states
301
+
302
+ # Fully Connected
303
+ residual = hidden_states
304
+ hidden_states = self.post_attention_layernorm(hidden_states)
305
+ hidden_states = self.mlp(hidden_states)
306
+ hidden_states = residual + hidden_states
307
+
308
+ outputs = (hidden_states,)
309
+
310
+ if output_attentions:
311
+ outputs += (self_attn_weights,)
312
+
313
+ if use_cache:
314
+ outputs += (present_key_value,)
315
+
316
+ return outputs
317
+
318
+
319
+ LLAMA_START_DOCSTRING = r"""
320
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
321
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
322
+ etc.)
323
+
324
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
325
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
326
+ and behavior.
327
+
328
+ Parameters:
329
+ config ([`LlamaConfig`]):
330
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
331
+ load the weights associated with the model, only the configuration. Check out the
332
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
333
+ """
334
+
335
+
336
+ @add_start_docstrings(
337
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
338
+ LLAMA_START_DOCSTRING,
339
+ )
340
+ class LlamaPreTrainedModel(PreTrainedModel):
341
+ config_class = LlamaConfig
342
+ base_model_prefix = "model"
343
+ supports_gradient_checkpointing = True
344
+ _no_split_modules = ["LlamaDecoderLayer"]
345
+ _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
346
+
347
+ def _init_weights(self, module):
348
+ std = self.config.initializer_range
349
+ if isinstance(module, nn.Linear):
350
+ module.weight.data.normal_(mean=0.0, std=std)
351
+ if module.bias is not None:
352
+ module.bias.data.zero_()
353
+ elif isinstance(module, nn.Embedding):
354
+ module.weight.data.normal_(mean=0.0, std=std)
355
+ if module.padding_idx is not None:
356
+ module.weight.data[module.padding_idx].zero_()
357
+
358
+ def _set_gradient_checkpointing(self, module, value=False):
359
+ if isinstance(module, LlamaModel):
360
+ module.gradient_checkpointing = value
361
+
362
+
363
+ LLAMA_INPUTS_DOCSTRING = r"""
364
+ Args:
365
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
366
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
367
+ it.
368
+
369
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
370
+ [`PreTrainedTokenizer.__call__`] for details.
371
+
372
+ [What are input IDs?](../glossary#input-ids)
373
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
374
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
375
+
376
+ - 1 for tokens that are **not masked**,
377
+ - 0 for tokens that are **masked**.
378
+
379
+ [What are attention masks?](../glossary#attention-mask)
380
+
381
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
382
+ [`PreTrainedTokenizer.__call__`] for details.
383
+
384
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
385
+ `past_key_values`).
386
+
387
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
388
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
389
+ information on the default strategy.
390
+
391
+ - 1 indicates the head is **not masked**,
392
+ - 0 indicates the head is **masked**.
393
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
394
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
395
+ config.n_positions - 1]`.
396
+
397
+ [What are position IDs?](../glossary#position-ids)
398
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
399
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
400
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
401
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
402
+
403
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
404
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
405
+
406
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
407
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
408
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
409
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
410
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
411
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
412
+ model's internal embedding lookup matrix.
413
+ use_cache (`bool`, *optional*):
414
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
415
+ `past_key_values`).
416
+ output_attentions (`bool`, *optional*):
417
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
418
+ tensors for more detail.
419
+ output_hidden_states (`bool`, *optional*):
420
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
421
+ more detail.
422
+ return_dict (`bool`, *optional*):
423
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
424
+ """
425
+
426
+
427
+ @add_start_docstrings(
428
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
429
+ LLAMA_START_DOCSTRING,
430
+ )
431
+ class LlamaModel(LlamaPreTrainedModel):
432
+ """
433
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
434
+
435
+ Args:
436
+ config: LlamaConfig
437
+ """
438
+
439
+ def __init__(self, config: LlamaConfig):
440
+ super().__init__(config)
441
+ self.padding_idx = config.pad_token_id
442
+ self.vocab_size = config.vocab_size
443
+
444
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
445
+ self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
446
+ self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
447
+
448
+ self.gradient_checkpointing = False
449
+ # Initialize weights and apply final processing
450
+ self.post_init()
451
+
452
+ def get_input_embeddings(self):
453
+ return self.embed_tokens
454
+
455
+ def set_input_embeddings(self, value):
456
+ self.embed_tokens = value
457
+
458
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
459
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
460
+ # create causal mask
461
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
462
+ combined_attention_mask = None
463
+ if input_shape[-1] > 1:
464
+ combined_attention_mask = _make_causal_mask(
465
+ input_shape,
466
+ inputs_embeds.dtype,
467
+ device=inputs_embeds.device,
468
+ past_key_values_length=past_key_values_length,
469
+ )
470
+
471
+ if attention_mask is not None:
472
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
473
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
474
+ inputs_embeds.device
475
+ )
476
+ combined_attention_mask = (
477
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
478
+ )
479
+
480
+ return combined_attention_mask
481
+
482
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
483
+ def forward(
484
+ self,
485
+ input_ids: torch.LongTensor = None,
486
+ attention_mask: Optional[torch.Tensor] = None,
487
+ position_ids: Optional[torch.LongTensor] = None,
488
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
489
+ inputs_embeds: Optional[torch.FloatTensor] = None,
490
+ use_cache: Optional[bool] = None,
491
+ output_attentions: Optional[bool] = None,
492
+ output_hidden_states: Optional[bool] = None,
493
+ return_dict: Optional[bool] = None,
494
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
495
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
496
+ output_hidden_states = (
497
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
498
+ )
499
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
500
+
501
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
502
+
503
+ # retrieve input_ids and inputs_embeds
504
+ if input_ids is not None and inputs_embeds is not None:
505
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
506
+ elif input_ids is not None:
507
+ batch_size, seq_length = input_ids.shape
508
+ elif inputs_embeds is not None:
509
+ batch_size, seq_length, _ = inputs_embeds.shape
510
+ else:
511
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
512
+
513
+ seq_length_with_past = seq_length
514
+ past_key_values_length = 0
515
+
516
+ if past_key_values is not None:
517
+ past_key_values_length = past_key_values[0][0].shape[2]
518
+ seq_length_with_past = seq_length_with_past + past_key_values_length
519
+
520
+ if position_ids is None:
521
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
522
+ position_ids = torch.arange(
523
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
524
+ )
525
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
526
+ else:
527
+ position_ids = position_ids.view(-1, seq_length).long()
528
+
529
+ if inputs_embeds is None:
530
+ inputs_embeds = self.embed_tokens(input_ids)
531
+ # embed positions
532
+ if attention_mask is None:
533
+ attention_mask = torch.ones(
534
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
535
+ )
536
+ attention_mask = self._prepare_decoder_attention_mask(
537
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
538
+ )
539
+
540
+ hidden_states = inputs_embeds
541
+
542
+ if self.gradient_checkpointing and self.training:
543
+ if use_cache:
544
+ logger.warning_once(
545
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
546
+ )
547
+ use_cache = False
548
+
549
+ # decoder layers
550
+ all_hidden_states = () if output_hidden_states else None
551
+ all_self_attns = () if output_attentions else None
552
+ next_decoder_cache = () if use_cache else None
553
+
554
+ for idx, decoder_layer in enumerate(self.layers):
555
+ if output_hidden_states:
556
+ all_hidden_states += (hidden_states,)
557
+
558
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
559
+
560
+ if self.gradient_checkpointing and self.training:
561
+
562
+ def create_custom_forward(module):
563
+ def custom_forward(*inputs):
564
+ # None for past_key_value
565
+ return module(*inputs, output_attentions, None)
566
+
567
+ return custom_forward
568
+
569
+ layer_outputs = torch.utils.checkpoint.checkpoint(
570
+ create_custom_forward(decoder_layer),
571
+ hidden_states,
572
+ attention_mask,
573
+ position_ids,
574
+ None,
575
+ )
576
+ else:
577
+ layer_outputs = decoder_layer(
578
+ hidden_states,
579
+ attention_mask=attention_mask,
580
+ position_ids=position_ids,
581
+ past_key_value=past_key_value,
582
+ output_attentions=output_attentions,
583
+ use_cache=use_cache,
584
+ )
585
+
586
+ hidden_states = layer_outputs[0]
587
+
588
+ if use_cache:
589
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
590
+
591
+ if output_attentions:
592
+ all_self_attns += (layer_outputs[1],)
593
+
594
+ hidden_states = self.norm(hidden_states)
595
+
596
+ # add hidden states from the last decoder layer
597
+ if output_hidden_states:
598
+ all_hidden_states += (hidden_states,)
599
+
600
+ next_cache = next_decoder_cache if use_cache else None
601
+ if not return_dict:
602
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
603
+ return BaseModelOutputWithPast(
604
+ last_hidden_state=hidden_states,
605
+ past_key_values=next_cache,
606
+ hidden_states=all_hidden_states,
607
+ attentions=all_self_attns,
608
+ )
609
+
610
+
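The forward pass above derives `position_ids` from the cached prefix length and falls back to an all-ones attention mask when none is supplied. A minimal standalone sketch of that bookkeeping in plain PyTorch (toy sizes, for illustration only):

```python
import torch

batch_size, seq_length, past_key_values_length = 2, 1, 5  # e.g. one new token during cached decoding

# positions continue from the cached prefix, exactly as in the forward above
position_ids = torch.arange(
    past_key_values_length, seq_length + past_key_values_length, dtype=torch.long
)
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
print(position_ids)  # tensor([[5]])

# when no mask is supplied, every position (past + current) is attended to
seq_length_with_past = seq_length + past_key_values_length
attention_mask = torch.ones((batch_size, seq_length_with_past), dtype=torch.bool)
print(attention_mask.shape)  # torch.Size([2, 6])
```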
611
+ class LlamaForCausalLM(LlamaPreTrainedModel):
612
+ def __init__(self, config):
613
+ super().__init__(config)
614
+ self.model = LlamaModel(config)
615
+
616
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
617
+
618
+ # Initialize weights and apply final processing
619
+ self.post_init()
620
+
621
+ def get_input_embeddings(self):
622
+ return self.model.embed_tokens
623
+
624
+ def set_input_embeddings(self, value):
625
+ self.model.embed_tokens = value
626
+
627
+ def get_output_embeddings(self):
628
+ return self.lm_head
629
+
630
+ def set_output_embeddings(self, new_embeddings):
631
+ self.lm_head = new_embeddings
632
+
633
+ def set_decoder(self, decoder):
634
+ self.model = decoder
635
+
636
+ def get_decoder(self):
637
+ return self.model
638
+
639
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
640
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
641
+ def forward(
642
+ self,
643
+ input_ids: torch.LongTensor = None,
644
+ attention_mask: Optional[torch.Tensor] = None,
645
+ position_ids: Optional[torch.LongTensor] = None,
646
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
647
+ inputs_embeds: Optional[torch.FloatTensor] = None,
648
+ labels: Optional[torch.LongTensor] = None,
649
+ use_cache: Optional[bool] = None,
650
+ output_attentions: Optional[bool] = None,
651
+ output_hidden_states: Optional[bool] = None,
652
+ return_dict: Optional[bool] = None,
653
+ reduction: Optional[str] = "mean",
654
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
655
+ r"""
656
+ Args:
657
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
658
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
659
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
660
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
661
+
662
+ Returns:
663
+
664
+ Example:
665
+
666
+ ```python
667
+ >>> from transformers import AutoTokenizer, LlamaForCausalLM
668
+
669
+ >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
670
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
671
+
672
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
673
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
674
+
675
+ >>> # Generate
676
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
677
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
678
+ "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
679
+ ```"""
680
+
681
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
682
+ output_hidden_states = (
683
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
684
+ )
685
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
686
+
687
+ # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
688
+ outputs = self.model(
689
+ input_ids=input_ids,
690
+ attention_mask=attention_mask,
691
+ position_ids=position_ids,
692
+ past_key_values=past_key_values,
693
+ inputs_embeds=inputs_embeds,
694
+ use_cache=use_cache,
695
+ output_attentions=output_attentions,
696
+ output_hidden_states=output_hidden_states,
697
+ return_dict=return_dict,
698
+ )
699
+
700
+ hidden_states = outputs[0]
701
+ logits = self.lm_head(hidden_states)
702
+
703
+ loss = None
704
+ if labels is not None:
705
+ # Shift so that tokens < n predict n
706
+ shift_logits = logits[..., :-1, :].contiguous()
707
+ shift_labels = labels[..., 1:].contiguous()
708
+ # Flatten the tokens
709
+ loss_fct = CrossEntropyLoss(reduction=reduction)
710
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
711
+ shift_labels = shift_labels.view(-1)
712
+ # Enable model parallelism
713
+ shift_labels = shift_labels.to(shift_logits.device)
714
+ loss = loss_fct(shift_logits, shift_labels)
715
+ if reduction == "none":
716
+ # loss = loss.view(logits.size(0), -1).sum(1)
717
+ loss = loss.view(logits.size(0), -1).mean(1)
718
+
719
+ if not return_dict:
720
+ output = (logits,) + outputs[1:]
721
+ return (loss,) + output if loss is not None else output
722
+
723
+ return CausalLMOutputWithPast(
724
+ loss=loss,
725
+ logits=logits,
726
+ past_key_values=outputs.past_key_values,
727
+ hidden_states=outputs.hidden_states,
728
+ attentions=outputs.attentions,
729
+ )
730
+
731
+ def prepare_inputs_for_generation(
732
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
733
+ ):
734
+ if past_key_values:
735
+ input_ids = input_ids[:, -1:]
736
+
737
+ position_ids = kwargs.get("position_ids", None)
738
+ if attention_mask is not None and position_ids is None:
739
+ # create position_ids on the fly for batch generation
740
+ position_ids = attention_mask.long().cumsum(-1) - 1
741
+ position_ids.masked_fill_(attention_mask == 0, 1)
742
+ if past_key_values:
743
+ position_ids = position_ids[:, -1].unsqueeze(-1)
744
+
745
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
746
+ if inputs_embeds is not None and past_key_values is None:
747
+ model_inputs = {"inputs_embeds": inputs_embeds}
748
+ else:
749
+ model_inputs = {"input_ids": input_ids}
750
+
751
+ model_inputs.update(
752
+ {
753
+ "position_ids": position_ids,
754
+ "past_key_values": past_key_values,
755
+ "use_cache": kwargs.get("use_cache"),
756
+ "attention_mask": attention_mask,
757
+ }
758
+ )
759
+ return model_inputs
760
+
761
+ @staticmethod
762
+ def _reorder_cache(past_key_values, beam_idx):
763
+ reordered_past = ()
764
+ for layer_past in past_key_values:
765
+ reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
766
+ return reordered_past
767
+
768
+
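The `reduction` keyword added to `LlamaForCausalLM.forward` above is the main deviation from the stock implementation: with `reduction="none"`, the shifted cross-entropy is averaged within each sequence rather than over the whole batch, yielding one loss value per sample. A minimal sketch of that computation in plain PyTorch (random tensors, hypothetical shapes):

```python
import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_len, vocab_size = 3, 8, 50
logits = torch.randn(batch_size, seq_len, vocab_size)
labels = torch.randint(0, vocab_size, (batch_size, seq_len))

# shift so that tokens < n predict n, as in the forward above
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()

# reduction="none" keeps a loss per token, then averages within each sequence
loss_fct = CrossEntropyLoss(reduction="none")
loss = loss_fct(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
per_sample_loss = loss.view(logits.size(0), -1).mean(1)
print(per_sample_loss.shape)  # torch.Size([3])
```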
769
+ @add_start_docstrings(
770
+ """
771
+ The LLaMa Model transformer with a sequence classification head on top (linear layer).
772
+
773
+ [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
774
+ (e.g. GPT-2) do.
775
+
776
+     Since it does classification on the last token, it needs to know the position of the last token. If a
777
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
778
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
779
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
780
+ each row of the batch).
781
+ """,
782
+ LLAMA_START_DOCSTRING,
783
+ )
784
+ class LlamaForSequenceClassification(LlamaPreTrainedModel):
785
+ _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
786
+
787
+ def __init__(self, config):
788
+ super().__init__(config)
789
+ self.num_labels = config.num_labels
790
+ self.model = LlamaModel(config)
791
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
792
+
793
+ # Initialize weights and apply final processing
794
+ self.post_init()
795
+
796
+ def get_input_embeddings(self):
797
+ return self.model.embed_tokens
798
+
799
+ def set_input_embeddings(self, value):
800
+ self.model.embed_tokens = value
801
+
802
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
803
+ def forward(
804
+ self,
805
+ input_ids: torch.LongTensor = None,
806
+ attention_mask: Optional[torch.Tensor] = None,
807
+ position_ids: Optional[torch.LongTensor] = None,
808
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
809
+ inputs_embeds: Optional[torch.FloatTensor] = None,
810
+ labels: Optional[torch.LongTensor] = None,
811
+ use_cache: Optional[bool] = None,
812
+ output_attentions: Optional[bool] = None,
813
+ output_hidden_states: Optional[bool] = None,
814
+ return_dict: Optional[bool] = None,
815
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
816
+ r"""
817
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
818
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
819
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
820
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
821
+ """
822
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
823
+
824
+ transformer_outputs = self.model(
825
+ input_ids,
826
+ attention_mask=attention_mask,
827
+ position_ids=position_ids,
828
+ past_key_values=past_key_values,
829
+ inputs_embeds=inputs_embeds,
830
+ use_cache=use_cache,
831
+ output_attentions=output_attentions,
832
+ output_hidden_states=output_hidden_states,
833
+ return_dict=return_dict,
834
+ )
835
+ hidden_states = transformer_outputs[0]
836
+ logits = self.score(hidden_states)
837
+
838
+ if input_ids is not None:
839
+ batch_size = input_ids.shape[0]
840
+ else:
841
+ batch_size = inputs_embeds.shape[0]
842
+
843
+ if self.config.pad_token_id is None and batch_size != 1:
844
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
845
+ if self.config.pad_token_id is None:
846
+ sequence_lengths = -1
847
+ else:
848
+ if input_ids is not None:
849
+ sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
850
+ else:
851
+ sequence_lengths = -1
852
+
853
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
854
+
855
+ loss = None
856
+ if labels is not None:
857
+ labels = labels.to(logits.device)
858
+ if self.config.problem_type is None:
859
+ if self.num_labels == 1:
860
+ self.config.problem_type = "regression"
861
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
862
+ self.config.problem_type = "single_label_classification"
863
+ else:
864
+ self.config.problem_type = "multi_label_classification"
865
+
866
+ if self.config.problem_type == "regression":
867
+ loss_fct = MSELoss()
868
+ if self.num_labels == 1:
869
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
870
+ else:
871
+ loss = loss_fct(pooled_logits, labels)
872
+ elif self.config.problem_type == "single_label_classification":
873
+ loss_fct = CrossEntropyLoss()
874
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
875
+ elif self.config.problem_type == "multi_label_classification":
876
+ loss_fct = BCEWithLogitsLoss()
877
+ loss = loss_fct(pooled_logits, labels)
878
+ if not return_dict:
879
+ output = (pooled_logits,) + transformer_outputs[1:]
880
+ return ((loss,) + output) if loss is not None else output
881
+
882
+ return SequenceClassifierOutputWithPast(
883
+ loss=loss,
884
+ logits=pooled_logits,
885
+ past_key_values=transformer_outputs.past_key_values,
886
+ hidden_states=transformer_outputs.hidden_states,
887
+ attentions=transformer_outputs.attentions,
888
+ )
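The last-token pooling performed by `LlamaForSequenceClassification` above can be reproduced in isolation. A small sketch with made-up token ids and a hypothetical `pad_token_id`:

```python
import torch

pad_token_id = 0  # hypothetical padding id
input_ids = torch.tensor([[5, 7, 9, 0, 0],    # two padding tokens
                          [3, 4, 6, 8, 2]])   # no padding
logits = torch.randn(2, 5, 4)  # (batch, seq_len, num_labels)

# index of the last non-padding token in each row, as in the forward above
sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1
print(sequence_lengths)  # tensor([2, 4])

# pool the classification logits at those positions
pooled_logits = logits[torch.arange(input_ids.shape[0]), sequence_lengths]
print(pooled_logits.shape)  # torch.Size([2, 4])
```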
model/modeling_opt.py ADDED
@@ -0,0 +1,1223 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch OPT model."""
16
+ import random
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.utils.checkpoint
21
+ from torch import nn
22
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
23
+ from transformers.activations import ACT2FN
24
+ # from ...activations import ACT2FN
25
+ from transformers.modeling_outputs import (
26
+ BaseModelOutputWithPast,
27
+ CausalLMOutputWithPast,
28
+ QuestionAnsweringModelOutput,
29
+ SequenceClassifierOutputWithPast,
30
+ )
31
+ from transformers.modeling_utils import PreTrainedModel
32
+ from transformers.utils import (
33
+ add_code_sample_docstrings,
34
+ add_start_docstrings,
35
+ add_start_docstrings_to_model_forward,
36
+ logging,
37
+ replace_return_docstrings,
38
+ )
39
+ from transformers.models.opt.configuration_opt import OPTConfig
40
+ # from .configuration_opt
41
+
42
+
43
+ logger = logging.get_logger(__name__)
44
+
45
+ _CHECKPOINT_FOR_DOC = "facebook/opt-350m"
46
+ _CONFIG_FOR_DOC = "OPTConfig"
47
+
48
+ # Base model docstring
49
+ _EXPECTED_OUTPUT_SHAPE = [1, 8, 1024]
50
+
51
+ # SequenceClassification docstring
52
+ _CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "ArthurZ/opt-350m-dummy-sc"
53
+ _SEQ_CLASS_EXPECTED_LOSS = 1.71
54
+ _SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_0'"
55
+
56
+ OPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
57
+ "facebook/opt-125m",
58
+ "facebook/opt-350m",
59
+ "facebook/opt-1.3b",
60
+ "facebook/opt-2.7b",
61
+ "facebook/opt-6.7b",
62
+ "facebook/opt-13b",
63
+ "facebook/opt-30b",
64
+ # See all OPT models at https://huggingface.co/models?filter=opt
65
+ ]
66
+
67
+
68
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
69
+ def _make_causal_mask(
70
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
71
+ ):
72
+ """
73
+ Make causal mask used for bi-directional self-attention.
74
+ """
75
+ bsz, tgt_len = input_ids_shape
76
+ mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
77
+ mask_cond = torch.arange(mask.size(-1), device=device)
78
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
79
+ mask = mask.to(dtype)
80
+
81
+ if past_key_values_length > 0:
82
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
83
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
84
+
85
+
86
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
87
+ """
88
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
89
+ """
90
+ bsz, src_len = mask.size()
91
+ tgt_len = tgt_len if tgt_len is not None else src_len
92
+
93
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
94
+
95
+ inverted_mask = 1.0 - expanded_mask
96
+
97
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
98
+
99
+
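For orientation, here is a worked example of the additive masks that `_make_causal_mask` and `_expand_mask` produce, sketched with tiny shapes: visible positions get 0, blocked positions get a very large negative number, and the decoder sums the two masks.

```python
import torch

neg = torch.finfo(torch.float32).min
bsz, tgt_len = 1, 3

# causal part (as in _make_causal_mask): row i is 0 for columns <= i, `neg` elsewhere
causal = torch.full((tgt_len, tgt_len), neg)
cond = torch.arange(tgt_len)
causal.masked_fill_(cond < (cond + 1).view(tgt_len, 1), 0.0)
causal = causal[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len)

# padding part (as in _expand_mask): the last token of the only sequence is padding
pad_mask = torch.tensor([[1, 1, 0]], dtype=torch.float32)
expanded = pad_mask[:, None, None, :].expand(bsz, 1, tgt_len, tgt_len)
inverted = 1.0 - expanded
padding = inverted.masked_fill(inverted.to(torch.bool), neg)

# the decoder adds the two masks, so a position is visible only if both allow it
combined = causal + padding
print(combined[0, 0])  # 0 where attention is allowed, a huge negative value elsewhere
```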
100
+ class OPTLearnedPositionalEmbedding(nn.Embedding):
101
+ """
102
+ This module learns positional embeddings up to a fixed maximum size.
103
+ """
104
+
105
+ def __init__(self, num_embeddings: int, embedding_dim: int):
106
+ # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
107
+ # and adjust num_embeddings appropriately. Other models don't have this hack
108
+ self.offset = 2
109
+ super().__init__(num_embeddings + self.offset, embedding_dim)
110
+
111
+ def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
112
+ """`input_ids_shape` is expected to be [bsz x seqlen]."""
113
+ attention_mask = attention_mask.long()
114
+
115
+ # create positions depending on attention_mask
116
+ positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1
117
+
118
+ # cut positions if `past_key_values_length` is > 0
119
+ positions = positions[:, past_key_values_length:]
120
+
121
+ return super().forward(positions + self.offset)
122
+
123
+
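The cumulative-sum trick in `OPTLearnedPositionalEmbedding.forward` gives real tokens consecutive positions regardless of left padding, while padded slots all map to position -1 before the offset of 2 is added. A small sketch of just the index computation, with made-up masks:

```python
import torch

# one left-padded and one unpadded sequence
attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]]).long()

# as in the forward above: running count of real tokens, minus one, zeroed out on padding
positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1
print(positions)
# tensor([[-1, -1,  0,  1,  2],
#         [ 0,  1,  2,  3,  4]])

offset = 2  # OPT reserves the first two embedding rows
print(positions + offset)  # indices actually looked up in the embedding table
```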
124
+ class OPTAttention(nn.Module):
125
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
126
+
127
+ def __init__(
128
+ self,
129
+ embed_dim: int,
130
+ num_heads: int,
131
+ dropout: float = 0.0,
132
+ is_decoder: bool = False,
133
+ bias: bool = True,
134
+ ):
135
+ super().__init__()
136
+ self.embed_dim = embed_dim
137
+ self.num_heads = num_heads
138
+ self.dropout = dropout
139
+ self.head_dim = embed_dim // num_heads
140
+
141
+ if (self.head_dim * num_heads) != self.embed_dim:
142
+ raise ValueError(
143
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
144
+ f" and `num_heads`: {num_heads})."
145
+ )
146
+ self.scaling = self.head_dim**-0.5
147
+ self.is_decoder = is_decoder
148
+
149
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
150
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
151
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
152
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
153
+
154
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
155
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
156
+
157
+ def forward(
158
+ self,
159
+ hidden_states: torch.Tensor,
160
+ key_value_states: Optional[torch.Tensor] = None,
161
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
162
+ attention_mask: Optional[torch.Tensor] = None,
163
+ layer_head_mask: Optional[torch.Tensor] = None,
164
+ output_attentions: bool = False,
165
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
166
+ """Input shape: Batch x Time x Channel"""
167
+
168
+ # if key_value_states are provided this layer is used as a cross-attention layer
169
+ # for the decoder
170
+ is_cross_attention = key_value_states is not None
171
+
172
+ bsz, tgt_len, _ = hidden_states.size()
173
+
174
+ # get query proj
175
+ query_states = self.q_proj(hidden_states) * self.scaling
176
+ # get key, value proj
177
+ if is_cross_attention and past_key_value is not None:
178
+ # reuse k,v, cross_attentions
179
+ key_states = past_key_value[0]
180
+ value_states = past_key_value[1]
181
+ elif is_cross_attention:
182
+ # cross_attentions
183
+ key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
184
+ value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
185
+ elif past_key_value is not None:
186
+ # reuse k, v, self_attention
187
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
188
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
189
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
190
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
191
+ else:
192
+ # self_attention
193
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
194
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
195
+
196
+ if self.is_decoder:
197
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
198
+ # Further calls to cross_attention layer can then reuse all cross-attention
199
+ # key/value_states (first "if" case)
200
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
201
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
202
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
203
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
204
+ past_key_value = (key_states, value_states)
205
+
206
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
207
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
208
+ key_states = key_states.view(*proj_shape)
209
+ value_states = value_states.view(*proj_shape)
210
+
211
+ src_len = key_states.size(1)
212
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
213
+
214
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
215
+ raise ValueError(
216
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
217
+ f" {attn_weights.size()}"
218
+ )
219
+
220
+ if attention_mask is not None:
221
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
222
+ raise ValueError(
223
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
224
+ )
225
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
226
+ attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
227
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
228
+
229
+ # upcast to fp32 if the weights are in fp16. Please see https://github.com/huggingface/transformers/pull/17437
230
+ if attn_weights.dtype == torch.float16:
231
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(torch.float16)
232
+ else:
233
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
234
+
235
+ if layer_head_mask is not None:
236
+ if layer_head_mask.size() != (self.num_heads,):
237
+ raise ValueError(
238
+ f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
239
+ f" {layer_head_mask.size()}"
240
+ )
241
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
242
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
243
+
244
+ if output_attentions:
245
+ # this operation is a bit awkward, but it's required to
246
+ # make sure that attn_weights keeps its gradient.
247
+ # In order to do so, attn_weights have to be reshaped
248
+ # twice and have to be reused in the following
249
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
250
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
251
+ else:
252
+ attn_weights_reshaped = None
253
+
254
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
255
+
256
+ attn_output = torch.bmm(attn_probs, value_states)
257
+
258
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
259
+ raise ValueError(
260
+ f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
261
+ f" {attn_output.size()}"
262
+ )
263
+
264
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
265
+ attn_output = attn_output.transpose(1, 2)
266
+
267
+ # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
268
+         # partitioned across GPUs when using tensor-parallelism.
269
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
270
+
271
+ attn_output = self.out_proj(attn_output)
272
+
273
+ return attn_output, attn_weights_reshaped, past_key_value
274
+
275
+
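`OPTAttention` above is standard scaled dot-product attention; the sketch below only walks through the tensor shapes of its two batched matrix products, with small hypothetical dimensions:

```python
import torch

bsz, num_heads, tgt_len, src_len, head_dim = 2, 4, 5, 5, 8

# after _shape() and .view(), queries/keys/values are flattened to (bsz * num_heads, len, head_dim)
query_states = torch.randn(bsz * num_heads, tgt_len, head_dim)  # already scaled by head_dim ** -0.5
key_states = torch.randn(bsz * num_heads, src_len, head_dim)
value_states = torch.randn(bsz * num_heads, src_len, head_dim)

attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))  # (bsz*heads, tgt_len, src_len)
attn_probs = torch.nn.functional.softmax(attn_weights, dim=-1)
attn_output = torch.bmm(attn_probs, value_states)                   # (bsz*heads, tgt_len, head_dim)

# reshape back to (bsz, tgt_len, embed_dim) before the output projection
attn_output = attn_output.view(bsz, num_heads, tgt_len, head_dim).transpose(1, 2)
attn_output = attn_output.reshape(bsz, tgt_len, num_heads * head_dim)
print(attn_output.shape)  # torch.Size([2, 5, 32])
```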
276
+ class OPTDecoderLayer(nn.Module):
277
+ def __init__(self, config: OPTConfig):
278
+ super().__init__()
279
+ self.embed_dim = config.hidden_size
280
+ self.self_attn = OPTAttention(
281
+ embed_dim=self.embed_dim,
282
+ num_heads=config.num_attention_heads,
283
+ dropout=config.attention_dropout,
284
+ is_decoder=True,
285
+ bias=config.enable_bias,
286
+ )
287
+ self.do_layer_norm_before = config.do_layer_norm_before
288
+ self.dropout = config.dropout
289
+ self.activation_fn = ACT2FN[config.activation_function]
290
+
291
+ self.self_attn_layer_norm = nn.LayerNorm(
292
+ self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine
293
+ )
294
+ self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=config.enable_bias)
295
+ self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, bias=config.enable_bias)
296
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine)
297
+
298
+ def forward(
299
+ self,
300
+ hidden_states: torch.Tensor,
301
+ attention_mask: Optional[torch.Tensor] = None,
302
+ layer_head_mask: Optional[torch.Tensor] = None,
303
+ output_attentions: Optional[bool] = False,
304
+ use_cache: Optional[bool] = False,
305
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
306
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
307
+ """
308
+ Args:
309
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
310
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
311
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
312
+ layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size
313
+ `(encoder_attention_heads,)`.
314
+ output_attentions (`bool`, *optional*):
315
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
316
+ returned tensors for more detail.
317
+ use_cache (`bool`, *optional*):
318
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
319
+ (see `past_key_values`).
320
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
321
+ """
322
+
323
+ residual = hidden_states
324
+
325
+         # 125m, 1.3B, ..., 175B applies layer norm BEFORE attention
326
+ if self.do_layer_norm_before:
327
+ hidden_states = self.self_attn_layer_norm(hidden_states)
328
+
329
+ # Self Attention
330
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
331
+ hidden_states=hidden_states,
332
+ past_key_value=past_key_value,
333
+ attention_mask=attention_mask,
334
+ layer_head_mask=layer_head_mask,
335
+ output_attentions=output_attentions,
336
+ )
337
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
338
+ hidden_states = residual + hidden_states
339
+
340
+ # 350m applies layer norm AFTER attention
341
+ if not self.do_layer_norm_before:
342
+ hidden_states = self.self_attn_layer_norm(hidden_states)
343
+
344
+ # Fully Connected
345
+ hidden_states_shape = hidden_states.shape
346
+ hidden_states = hidden_states.reshape(-1, hidden_states.size(-1))
347
+ residual = hidden_states
348
+
349
+         # 125m, 1.3B, ..., 175B applies layer norm BEFORE the feed-forward block
350
+ if self.do_layer_norm_before:
351
+ hidden_states = self.final_layer_norm(hidden_states)
352
+
353
+ hidden_states = self.fc1(hidden_states)
354
+ hidden_states = self.activation_fn(hidden_states)
355
+
356
+ hidden_states = self.fc2(hidden_states)
357
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
358
+
359
+ hidden_states = (residual + hidden_states).view(hidden_states_shape)
360
+
361
+         # 350m applies layer norm AFTER the feed-forward block
362
+ if not self.do_layer_norm_before:
363
+ hidden_states = self.final_layer_norm(hidden_states)
364
+
365
+ outputs = (hidden_states,)
366
+
367
+ if output_attentions:
368
+ outputs += (self_attn_weights,)
369
+
370
+ if use_cache:
371
+ outputs += (present_key_value,)
372
+
373
+ return outputs
374
+
375
+
376
+ OPT_START_DOCSTRING = r"""
377
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
378
+     library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
379
+ etc.)
380
+
381
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
382
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
383
+ and behavior.
384
+
385
+ Parameters:
386
+ config ([`OPTConfig`]):
387
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
388
+ load the weights associated with the model, only the configuration. Check out the
389
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
390
+ """
391
+
392
+
393
+ @add_start_docstrings(
394
+ "The bare OPT Model outputting raw hidden-states without any specific head on top.",
395
+ OPT_START_DOCSTRING,
396
+ )
397
+ class OPTPreTrainedModel(PreTrainedModel):
398
+ config_class = OPTConfig
399
+ base_model_prefix = "model"
400
+ supports_gradient_checkpointing = True
401
+ _no_split_modules = ["OPTDecoderLayer"]
402
+ _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
403
+
404
+ def _init_weights(self, module):
405
+ std = self.config.init_std
406
+ if isinstance(module, nn.Linear):
407
+ module.weight.data.normal_(mean=0.0, std=std)
408
+ if module.bias is not None:
409
+ module.bias.data.zero_()
410
+ elif isinstance(module, nn.Embedding):
411
+ module.weight.data.normal_(mean=0.0, std=std)
412
+ if module.padding_idx is not None:
413
+ module.weight.data[module.padding_idx].zero_()
414
+
415
+ def _set_gradient_checkpointing(self, module, value=False):
416
+ if isinstance(module, (OPTDecoder)):
417
+ module.gradient_checkpointing = value
418
+
419
+
420
+ OPT_INPUTS_DOCSTRING = r"""
421
+ Args:
422
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
423
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
424
+ it.
425
+
426
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
427
+ [`PreTrainedTokenizer.__call__`] for details.
428
+
429
+ [What are input IDs?](../glossary#input-ids)
430
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
431
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
432
+
433
+ - 1 for tokens that are **not masked**,
434
+ - 0 for tokens that are **masked**.
435
+
436
+ [What are attention masks?](../glossary#attention-mask)
437
+
438
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
439
+ [`PreTrainedTokenizer.__call__`] for details.
440
+
441
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
442
+ `past_key_values`).
443
+
444
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
445
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
446
+ information on the default strategy.
447
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
448
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
449
+
450
+ - 1 indicates the head is **not masked**,
451
+ - 0 indicates the head is **masked**.
452
+
453
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
454
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
455
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
456
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
457
+
458
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
459
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
460
+
461
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
462
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
463
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
464
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
465
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
466
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
467
+ model's internal embedding lookup matrix.
468
+ use_cache (`bool`, *optional*):
469
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
470
+ `past_key_values`).
471
+ output_attentions (`bool`, *optional*):
472
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
473
+ tensors for more detail.
474
+ output_hidden_states (`bool`, *optional*):
475
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
476
+ more detail.
477
+ return_dict (`bool`, *optional*):
478
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
479
+ """
480
+
481
+
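The `past_key_values` contract described in the docstring above can be exercised directly. Because this vendored file tracks the upstream `transformers` OPT implementation, the sketch below uses the upstream `OPTConfig`/`OPTForCausalLM` with a tiny randomly initialized config (hypothetical sizes, no weight download); the same calls are expected to apply to the vendored classes.

```python
import torch
from transformers import OPTConfig, OPTForCausalLM

config = OPTConfig(
    vocab_size=64, hidden_size=32, ffn_dim=64, num_hidden_layers=2,
    num_attention_heads=4, max_position_embeddings=32, word_embed_proj_dim=32,
)
model = OPTForCausalLM(config).eval()

input_ids = torch.randint(0, config.vocab_size, (1, 6))
with torch.no_grad():
    out = model(input_ids=input_ids, use_cache=True)
# one cache entry per layer; per-layer keys of shape (batch, num_heads, seq_len, head_dim), e.g. (1, 4, 6, 8) here
print(len(out.past_key_values), out.past_key_values[0][0].shape)

# with a cache, only the newest token needs to be fed in
next_token = torch.randint(0, config.vocab_size, (1, 1))
with torch.no_grad():
    out2 = model(input_ids=next_token, past_key_values=out.past_key_values, use_cache=True)
print(out2.logits.shape)  # torch.Size([1, 1, 64])
```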
482
+ class OPTDecoder(OPTPreTrainedModel):
483
+ """
484
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]
485
+
486
+ Args:
487
+ config: OPTConfig
488
+ """
489
+
490
+ def __init__(self, config: OPTConfig):
491
+ super().__init__(config)
492
+ self.dropout = config.dropout
493
+ self.layerdrop = config.layerdrop
494
+ self.padding_idx = config.pad_token_id
495
+ self.max_target_positions = config.max_position_embeddings
496
+ self.vocab_size = config.vocab_size
497
+
498
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.word_embed_proj_dim, self.padding_idx)
499
+ self.embed_positions = OPTLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)
500
+
501
+ if config.word_embed_proj_dim != config.hidden_size:
502
+ self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False)
503
+ else:
504
+ self.project_out = None
505
+
506
+ if config.word_embed_proj_dim != config.hidden_size:
507
+ self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias=False)
508
+ else:
509
+ self.project_in = None
510
+
511
+ # Note that the only purpose of `config._remove_final_layer_norm` is to keep backward compatibility
512
+ # with checkpoints that have been fine-tuned before transformers v4.20.1
513
+ # see https://github.com/facebookresearch/metaseq/pull/164
514
+ if config.do_layer_norm_before and not config._remove_final_layer_norm:
515
+ self.final_layer_norm = nn.LayerNorm(
516
+ config.hidden_size, elementwise_affine=config.layer_norm_elementwise_affine
517
+ )
518
+ else:
519
+ self.final_layer_norm = None
520
+
521
+ self.layers = nn.ModuleList([OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)])
522
+
523
+ self.gradient_checkpointing = False
524
+ # Initialize weights and apply final processing
525
+ self.post_init()
526
+
527
+ def get_input_embeddings(self):
528
+ return self.embed_tokens
529
+
530
+ def set_input_embeddings(self, value):
531
+ self.embed_tokens = value
532
+
533
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
534
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
535
+ # create causal mask
536
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
537
+ combined_attention_mask = None
538
+ if input_shape[-1] > 1:
539
+ combined_attention_mask = _make_causal_mask(
540
+ input_shape,
541
+ inputs_embeds.dtype,
542
+ device=inputs_embeds.device,
543
+ past_key_values_length=past_key_values_length,
544
+ )
545
+
546
+ if attention_mask is not None:
547
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
548
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
549
+ inputs_embeds.device
550
+ )
551
+ combined_attention_mask = (
552
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
553
+ )
554
+
555
+ return combined_attention_mask
556
+
557
+ def forward(
558
+ self,
559
+ input_ids: torch.LongTensor = None,
560
+ attention_mask: Optional[torch.Tensor] = None,
561
+ head_mask: Optional[torch.Tensor] = None,
562
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
563
+ inputs_embeds: Optional[torch.FloatTensor] = None,
564
+ use_cache: Optional[bool] = None,
565
+ output_attentions: Optional[bool] = None,
566
+ output_hidden_states: Optional[bool] = None,
567
+ return_dict: Optional[bool] = None,
568
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
569
+ r"""
570
+ Args:
571
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
572
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
573
+ provide it.
574
+
575
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
576
+ [`PreTrainedTokenizer.__call__`] for details.
577
+
578
+ [What are input IDs?](../glossary#input-ids)
579
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
580
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
581
+
582
+ - 1 for tokens that are **not masked**,
583
+ - 0 for tokens that are **masked**.
584
+
585
+ [What are attention masks?](../glossary#attention-mask)
586
+ head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
587
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
588
+
589
+ - 1 indicates the head is **not masked**,
590
+ - 0 indicates the head is **masked**.
591
+
592
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
593
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
594
+                 shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
595
+                 shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
596
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
597
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
598
+
599
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
600
+ that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
601
+ all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
602
+
603
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
604
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
605
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
606
+ than the model's internal embedding lookup matrix.
607
+ output_attentions (`bool`, *optional*):
608
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
609
+ returned tensors for more detail.
610
+ output_hidden_states (`bool`, *optional*):
611
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
612
+ for more detail.
613
+ return_dict (`bool`, *optional*):
614
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
615
+ """
616
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
617
+ output_hidden_states = (
618
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
619
+ )
620
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
621
+
622
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
623
+
624
+ # retrieve input_ids and inputs_embeds
625
+ if input_ids is not None and inputs_embeds is not None:
626
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
627
+ elif input_ids is not None:
628
+ input_shape = input_ids.size()
629
+ input_ids = input_ids.view(-1, input_shape[-1])
630
+ elif inputs_embeds is not None:
631
+ input_shape = inputs_embeds.size()[:-1]
632
+ else:
633
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
634
+
635
+ if inputs_embeds is None:
636
+ inputs_embeds = self.embed_tokens(input_ids)
637
+
638
+ batch_size, seq_length = input_shape
639
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
640
+ # required mask seq length can be calculated via length of past
641
+ mask_seq_length = past_key_values_length + seq_length
642
+
643
+ # embed positions
644
+ if attention_mask is None:
645
+ attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
646
+ causal_attention_mask = self._prepare_decoder_attention_mask(
647
+ attention_mask, input_shape, inputs_embeds, past_key_values_length
648
+ )
649
+ pos_embeds = self.embed_positions(attention_mask, past_key_values_length)
650
+
651
+ if self.project_in is not None:
652
+ inputs_embeds = self.project_in(inputs_embeds)
653
+
654
+ hidden_states = inputs_embeds + pos_embeds
655
+
656
+ if self.gradient_checkpointing and self.training:
657
+ if use_cache:
658
+ logger.warning_once(
659
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
660
+ )
661
+ use_cache = False
662
+
663
+ # decoder layers
664
+ all_hidden_states = () if output_hidden_states else None
665
+ all_self_attns = () if output_attentions else None
666
+ next_decoder_cache = () if use_cache else None
667
+
668
+ # check if head_mask has a correct number of layers specified if desired
669
+ for attn_mask, mask_name in zip([head_mask], ["head_mask"]):
670
+ if attn_mask is not None:
671
+ if attn_mask.size()[0] != (len(self.layers)):
672
+ raise ValueError(
673
+ f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
674
+ f" {head_mask.size()[0]}."
675
+ )
676
+
677
+ for idx, decoder_layer in enumerate(self.layers):
678
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
679
+ if output_hidden_states:
680
+ all_hidden_states += (hidden_states,)
681
+
682
+ dropout_probability = random.uniform(0, 1)
683
+ if self.training and (dropout_probability < self.layerdrop):
684
+ continue
685
+
686
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
687
+
688
+ if self.gradient_checkpointing and self.training:
689
+
690
+ def create_custom_forward(module):
691
+ def custom_forward(*inputs):
692
+ # None for past_key_value
693
+ return module(*inputs, output_attentions, None)
694
+
695
+ return custom_forward
696
+
697
+ layer_outputs = torch.utils.checkpoint.checkpoint(
698
+ create_custom_forward(decoder_layer),
699
+ hidden_states,
700
+ causal_attention_mask,
701
+ head_mask[idx] if head_mask is not None else None,
702
+ None,
703
+ )
704
+ else:
705
+ layer_outputs = decoder_layer(
706
+ hidden_states,
707
+ attention_mask=causal_attention_mask,
708
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
709
+ past_key_value=past_key_value,
710
+ output_attentions=output_attentions,
711
+ use_cache=use_cache,
712
+ )
713
+
714
+ hidden_states = layer_outputs[0]
715
+
716
+ if use_cache:
717
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
718
+
719
+ if output_attentions:
720
+ all_self_attns += (layer_outputs[1],)
721
+
722
+ if self.final_layer_norm is not None:
723
+ hidden_states = self.final_layer_norm(hidden_states)
724
+
725
+ if self.project_out is not None:
726
+ hidden_states = self.project_out(hidden_states)
727
+
728
+ # add hidden states from the last decoder layer
729
+ if output_hidden_states:
730
+ all_hidden_states += (hidden_states,)
731
+
732
+ next_cache = next_decoder_cache if use_cache else None
733
+ if not return_dict:
734
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
735
+ return BaseModelOutputWithPast(
736
+ last_hidden_state=hidden_states,
737
+ past_key_values=next_cache,
738
+ hidden_states=all_hidden_states,
739
+ attentions=all_self_attns,
740
+ )
741
+
742
+
743
+ @add_start_docstrings(
744
+ "The bare OPT Model outputting raw hidden-states without any specific head on top.",
745
+ OPT_START_DOCSTRING,
746
+ )
747
+ class OPTModel(OPTPreTrainedModel):
748
+ def __init__(self, config: OPTConfig):
749
+ super().__init__(config)
750
+ self.decoder = OPTDecoder(config)
751
+ # Initialize weights and apply final processing
752
+ self.post_init()
753
+
754
+ def get_input_embeddings(self):
755
+ return self.decoder.embed_tokens
756
+
757
+ def set_input_embeddings(self, value):
758
+ self.decoder.embed_tokens = value
759
+
760
+ def get_decoder(self):
761
+ return self.decoder
762
+
763
+ @add_start_docstrings_to_model_forward(OPT_INPUTS_DOCSTRING)
764
+ @add_code_sample_docstrings(
765
+ checkpoint=_CHECKPOINT_FOR_DOC,
766
+ output_type=BaseModelOutputWithPast,
767
+ config_class=_CONFIG_FOR_DOC,
768
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
769
+ )
770
+ def forward(
771
+ self,
772
+ input_ids: torch.LongTensor = None,
773
+ attention_mask: Optional[torch.Tensor] = None,
774
+ head_mask: Optional[torch.Tensor] = None,
775
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
776
+ inputs_embeds: Optional[torch.FloatTensor] = None,
777
+ use_cache: Optional[bool] = None,
778
+ output_attentions: Optional[bool] = None,
779
+ output_hidden_states: Optional[bool] = None,
780
+ return_dict: Optional[bool] = None,
781
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
782
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
783
+ output_hidden_states = (
784
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
785
+ )
786
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
787
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
788
+
789
+         # decoder outputs consist of (dec_features, past_key_value, dec_hidden, dec_attn)
790
+ decoder_outputs = self.decoder(
791
+ input_ids=input_ids,
792
+ attention_mask=attention_mask,
793
+ head_mask=head_mask,
794
+ past_key_values=past_key_values,
795
+ inputs_embeds=inputs_embeds,
796
+ use_cache=use_cache,
797
+ output_attentions=output_attentions,
798
+ output_hidden_states=output_hidden_states,
799
+ return_dict=return_dict,
800
+ )
801
+
802
+ if not return_dict:
803
+ return decoder_outputs
804
+
805
+ return BaseModelOutputWithPast(
806
+ last_hidden_state=decoder_outputs.last_hidden_state,
807
+ past_key_values=decoder_outputs.past_key_values,
808
+ hidden_states=decoder_outputs.hidden_states,
809
+ attentions=decoder_outputs.attentions,
810
+ )
811
+
812
+
813
+ class OPTForCausalLM(OPTPreTrainedModel):
814
+ _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
815
+
816
+ def __init__(self, config):
817
+ super().__init__(config)
818
+ self.model = OPTModel(config)
819
+
820
+ # the lm_head weight is automatically tied to the embed tokens weight
821
+ self.lm_head = nn.Linear(config.word_embed_proj_dim, config.vocab_size, bias=False)
822
+
823
+ # Initialize weights and apply final processing
824
+ self.post_init()
825
+
826
+ def get_input_embeddings(self):
827
+ return self.model.decoder.embed_tokens
828
+
829
+ def set_input_embeddings(self, value):
830
+ self.model.decoder.embed_tokens = value
831
+
832
+ def get_output_embeddings(self):
833
+ return self.lm_head
834
+
835
+ def set_output_embeddings(self, new_embeddings):
836
+ self.lm_head = new_embeddings
837
+
838
+ def set_decoder(self, decoder):
839
+ self.model.decoder = decoder
840
+
841
+ def get_decoder(self):
842
+ return self.model.decoder
843
+
844
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
845
+ def forward(
846
+ self,
847
+ input_ids: torch.LongTensor = None,
848
+ attention_mask: Optional[torch.Tensor] = None,
849
+ head_mask: Optional[torch.Tensor] = None,
850
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
851
+ inputs_embeds: Optional[torch.FloatTensor] = None,
852
+ labels: Optional[torch.LongTensor] = None,
853
+ use_cache: Optional[bool] = None,
854
+ output_attentions: Optional[bool] = None,
855
+ output_hidden_states: Optional[bool] = None,
856
+ return_dict: Optional[bool] = None,
857
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
858
+ r"""
859
+ Args:
860
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
861
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
862
+ provide it.
863
+
864
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
865
+ [`PreTrainedTokenizer.__call__`] for details.
866
+
867
+ [What are input IDs?](../glossary#input-ids)
868
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
869
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
870
+
871
+ - 1 for tokens that are **not masked**,
872
+ - 0 for tokens that are **masked**.
873
+
874
+ [What are attention masks?](../glossary#attention-mask)
875
+ head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
876
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
877
+
878
+ - 1 indicates the head is **not masked**,
879
+ - 0 indicates the head is **masked**.
880
+
881
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
882
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
883
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
884
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
885
+ tensors are only required when the model is used as a decoder in a Sequence to Sequence model.
886
+
887
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
888
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
889
+
890
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
891
+ that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
892
+ all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
893
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
894
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
895
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
896
+ than the model's internal embedding lookup matrix.
897
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
898
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
899
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
900
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
901
+ use_cache (`bool`, *optional*):
902
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
903
+ (see `past_key_values`).
904
+ output_attentions (`bool`, *optional*):
905
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
906
+ returned tensors for more detail.
907
+ output_hidden_states (`bool`, *optional*):
908
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
909
+ for more detail.
910
+ return_dict (`bool`, *optional*):
911
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
912
+
913
+ Returns:
914
+
915
+ Example:
916
+
917
+ ```python
918
+ >>> from transformers import AutoTokenizer, OPTForCausalLM
919
+
920
+ >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
921
+ >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
922
+
923
+         >>> prompt = "Hey, are you conscious? Can you talk to me?"
924
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
925
+
926
+ >>> # Generate
927
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
928
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
929
+ "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
930
+ ```"""
931
+
932
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
933
+ output_hidden_states = (
934
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
935
+ )
936
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
937
+
938
+         # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
939
+ outputs = self.model.decoder(
940
+ input_ids=input_ids,
941
+ attention_mask=attention_mask,
942
+ head_mask=head_mask,
943
+ past_key_values=past_key_values,
944
+ inputs_embeds=inputs_embeds,
945
+ use_cache=use_cache,
946
+ output_attentions=output_attentions,
947
+ output_hidden_states=output_hidden_states,
948
+ return_dict=return_dict,
949
+ )
950
+
951
+ logits = self.lm_head(outputs[0]).contiguous()
952
+
953
+ loss = None
954
+ if labels is not None:
955
+ # Shift so that tokens < n predict n
956
+ shift_logits = logits[..., :-1, :].contiguous()
957
+ shift_labels = labels[..., 1:].contiguous()
958
+ # Flatten the tokens
959
+ loss_fct = CrossEntropyLoss()
960
+ loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
961
+
962
+ if not return_dict:
963
+ output = (logits,) + outputs[1:]
964
+ return (loss,) + output if loss is not None else output
965
+
966
+ return CausalLMOutputWithPast(
967
+ loss=loss,
968
+ logits=logits,
969
+ past_key_values=outputs.past_key_values,
970
+ hidden_states=outputs.hidden_states,
971
+ attentions=outputs.attentions,
972
+ )
973
+
974
+ def prepare_inputs_for_generation(
975
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
976
+ ):
977
+ if past_key_values:
978
+ input_ids = input_ids[:, -1:]
979
+
980
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
981
+ if inputs_embeds is not None and past_key_values is None:
982
+ model_inputs = {"inputs_embeds": inputs_embeds}
983
+ else:
984
+ model_inputs = {"input_ids": input_ids}
985
+
986
+ model_inputs.update(
987
+ {
988
+ "past_key_values": past_key_values,
989
+ "use_cache": kwargs.get("use_cache"),
990
+ "attention_mask": attention_mask,
991
+ }
992
+ )
993
+ return model_inputs
994
+
995
+ @staticmethod
996
+ def _reorder_cache(past_key_values, beam_idx):
997
+ reordered_past = ()
998
+ for layer_past in past_key_values:
999
+ reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
1000
+ return reordered_past
1001
+
1002
+
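
A minimal sketch of the `past_key_values` contract documented above: after the first forward pass, only the newest token id needs to be fed back, and the cache carries the earlier keys/values. This is an illustrative note, not part of the commit; the model name and generation length are arbitrary.

```python
import torch
from transformers import AutoTokenizer, OPTForCausalLM

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = OPTForCausalLM.from_pretrained("facebook/opt-350m").eval()

input_ids = tokenizer("Hello, my name is", return_tensors="pt").input_ids
generated = input_ids
past_key_values = None
with torch.no_grad():
    for _ in range(10):
        out = model(input_ids=input_ids, past_key_values=past_key_values, use_cache=True)
        next_id = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)  # greedy pick, shape [batch, 1]
        past_key_values = out.past_key_values      # reuse the cache
        input_ids = next_id                        # feed only the last token from now on
        generated = torch.cat([generated, next_id], dim=-1)
print(tokenizer.decode(generated[0], skip_special_tokens=True))
```
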
1003
+ @add_start_docstrings(
1004
+ """
1005
+ The OPT Model transformer with a sequence classification head on top (linear layer).
1006
+
1007
+ Unlike other causal models (e.g. GPT-2), which classify from the last token, this modified
+ [`OPTForSequenceClassification`] applies the classification head to every position and then averages the
+ resulting logits over the non-padding positions indicated by `attention_mask`. Padding tokens therefore do
+ not contribute to the prediction, but a correct `attention_mask` must be supplied; it cannot be inferred
+ when `inputs_embeds` are passed instead of `input_ids`.
1015
+ """,
1016
+ OPT_START_DOCSTRING,
1017
+ )
1018
+ class OPTForSequenceClassification(OPTPreTrainedModel):
1019
+ _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
1020
+
1021
+ def __init__(self, config: OPTConfig):
1022
+ super().__init__(config)
1023
+ self.num_labels = config.num_labels
1024
+ self.model = OPTModel(config)
1025
+ self.score = nn.Linear(config.word_embed_proj_dim, self.num_labels, bias=False)
1026
+
1027
+ # Initialize weights and apply final processing
1028
+ self.post_init()
1029
+
1030
+ @add_start_docstrings_to_model_forward(OPT_INPUTS_DOCSTRING)
1031
+ @add_code_sample_docstrings(
1032
+ checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
1033
+ output_type=SequenceClassifierOutputWithPast,
1034
+ config_class=_CONFIG_FOR_DOC,
1035
+ expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
1036
+ expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
1037
+ )
1038
+ def forward(
1039
+ self,
1040
+ input_ids: Optional[torch.LongTensor] = None,
1041
+ attention_mask: Optional[torch.FloatTensor] = None,
1042
+ head_mask: Optional[torch.FloatTensor] = None,
1043
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
1044
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1045
+ labels: Optional[torch.LongTensor] = None,
1046
+ use_cache: Optional[bool] = None,
1047
+ output_attentions: Optional[bool] = None,
1048
+ output_hidden_states: Optional[bool] = None,
1049
+ return_dict: Optional[bool] = None,
1050
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1051
+ r"""
1052
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1053
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1054
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1055
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1056
+ """
1057
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1058
+
1059
+ transformer_outputs = self.model(
1060
+ input_ids,
1061
+ past_key_values=past_key_values,
1062
+ attention_mask=attention_mask, # shape = [B, max_len]
1063
+ head_mask=head_mask,
1064
+ inputs_embeds=inputs_embeds,
1065
+ use_cache=use_cache,
1066
+ output_attentions=output_attentions,
1067
+ output_hidden_states=output_hidden_states,
1068
+ return_dict=return_dict,
1069
+ )
1070
+ hidden_states = transformer_outputs[0]
1071
+ logits = self.score(hidden_states) # shape = [B, max_len, D]
1072
+
1073
+ denom = torch.sum(attention_mask, -1, keepdim=True) # shape = [B, 1]
1074
+ pooled_logits = torch.sum(logits * attention_mask.unsqueeze(-1), dim=1) # shape = [B, D]
1075
+ pooled_logits = pooled_logits / denom
1076
+
1077
+ loss = None
1078
+ return SequenceClassifierOutputWithPast(
1079
+ loss=loss,
1080
+ logits=pooled_logits,
1081
+ past_key_values=transformer_outputs.past_key_values,
1082
+ hidden_states=transformer_outputs.hidden_states,
1083
+ attentions=transformer_outputs.attentions,
1084
+ )
1085
+
1086
+ def get_input_embeddings(self):
1087
+ return self.model.decoder.embed_tokens
1088
+
1089
+ def set_input_embeddings(self, value):
1090
+ self.model.decoder.embed_tokens = value
1091
+
1092
+
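
The classification head above pools by averaging per-token logits over the positions marked valid in `attention_mask`; for a linear head this is equivalent to mean-pooling the hidden states first. A self-contained sketch of that masked mean pooling (tensor names and shapes are illustrative, not from the commit):

```python
import torch

# Masked mean pooling as used in the forward above (illustrative shapes).
hidden = torch.randn(2, 5, 8)                       # [B, L, D] per-token features
mask = torch.tensor([[1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 1]])              # [B, L] attention mask (1 = real token)

summed = (hidden * mask.unsqueeze(-1)).sum(dim=1)   # zero out padding, sum over L -> [B, D]
pooled = summed / mask.sum(dim=-1, keepdim=True)    # divide by the number of real tokens
print(pooled.shape)                                 # torch.Size([2, 8])
```
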
1093
+ @add_start_docstrings(
1094
+ """
1095
+ The OPT Model transformer with a span classification head on top for extractive question-answering tasks like SQuAD
1096
+ (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
1097
+ """,
1098
+ OPT_START_DOCSTRING,
1099
+ )
1100
+ class OPTForQuestionAnswering(OPTPreTrainedModel):
1101
+ _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
1102
+
1103
+ def __init__(self, config: OPTConfig):
1104
+ super().__init__(config)
1105
+ self.model = OPTModel(config)
1106
+ self.qa_outputs = nn.Linear(config.word_embed_proj_dim, 2)
1107
+
1108
+ # Initialize weights and apply final processing
1109
+ self.post_init()
1110
+
1111
+ @add_start_docstrings_to_model_forward(OPT_INPUTS_DOCSTRING)
1112
+ @replace_return_docstrings(output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
1113
+ def forward(
1114
+ self,
1115
+ input_ids: Optional[torch.LongTensor] = None,
1116
+ attention_mask: Optional[torch.FloatTensor] = None,
1117
+ head_mask: Optional[torch.FloatTensor] = None,
1118
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
1119
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1120
+ start_positions: Optional[torch.LongTensor] = None,
1121
+ end_positions: Optional[torch.LongTensor] = None,
1122
+ use_cache: Optional[bool] = None,
1123
+ output_attentions: Optional[bool] = None,
1124
+ output_hidden_states: Optional[bool] = None,
1125
+ return_dict: Optional[bool] = None,
1126
+ ) -> Union[Tuple, QuestionAnsweringModelOutput]:
1127
+ r"""
1128
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1129
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1130
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
1131
+ are not taken into account for computing the loss.
1132
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1133
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1134
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
1135
+ are not taken into account for computing the loss.
1136
+
1137
+ Returns:
1138
+
1139
+ Example:
1140
+
1141
+ ```python
1142
+ >>> from transformers import AutoTokenizer, OPTForQuestionAnswering
1143
+ >>> import torch
1144
+
1145
+ >>> torch.manual_seed(4) # doctest: +IGNORE_RESULT
1146
+ >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
1147
+
1148
+ >>> # note: we are loading a OPTForQuestionAnswering from the hub here,
1149
+ >>> # so the head will be randomly initialized, hence the predictions will be random
1150
+ >>> model = OPTForQuestionAnswering.from_pretrained("facebook/opt-350m")
1151
+
1152
+ >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
1153
+
1154
+ >>> inputs = tokenizer(question, text, return_tensors="pt")
1155
+ >>> with torch.no_grad():
1156
+ ... outputs = model(**inputs)
1157
+
1158
+ >>> answer_start_index = outputs.start_logits.argmax()
1159
+ >>> answer_end_index = outputs.end_logits.argmax()
1160
+
1161
+ >>> answer_offset = len(tokenizer(question)[0])
1162
+
1163
+ >>> predict_answer_tokens = inputs.input_ids[
1164
+ ... 0, answer_offset + answer_start_index : answer_offset + answer_end_index + 1
1165
+ ... ]
1166
+ >>> predicted = tokenizer.decode(predict_answer_tokens)
1167
+ >>> predicted
1168
+ ' a nice puppet'
1169
+ ```"""
1170
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1171
+
1172
+ transformer_outputs = self.model(
1173
+ input_ids,
1174
+ past_key_values=past_key_values,
1175
+ attention_mask=attention_mask,
1176
+ head_mask=head_mask,
1177
+ inputs_embeds=inputs_embeds,
1178
+ use_cache=use_cache,
1179
+ output_attentions=output_attentions,
1180
+ output_hidden_states=output_hidden_states,
1181
+ return_dict=return_dict,
1182
+ )
1183
+ hidden_states = transformer_outputs[0]
1184
+
1185
+ logits = self.qa_outputs(hidden_states)
1186
+ start_logits, end_logits = logits.split(1, dim=-1)
1187
+ start_logits = start_logits.squeeze(-1).contiguous()
1188
+ end_logits = end_logits.squeeze(-1).contiguous()
1189
+
1190
+ total_loss = None
1191
+ if start_positions is not None and end_positions is not None:
1192
+ # If we are on multi-GPU, split add a dimension
1193
+ if len(start_positions.size()) > 1:
1194
+ start_positions = start_positions.squeeze(-1)
1195
+ if len(end_positions.size()) > 1:
1196
+ end_positions = end_positions.squeeze(-1)
1197
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
1198
+ ignored_index = start_logits.size(1)
1199
+ start_positions = start_positions.clamp(0, ignored_index)
1200
+ end_positions = end_positions.clamp(0, ignored_index)
1201
+
1202
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1203
+ start_loss = loss_fct(start_logits, start_positions)
1204
+ end_loss = loss_fct(end_logits, end_positions)
1205
+ total_loss = (start_loss + end_loss) / 2
1206
+
1207
+ if not return_dict:
1208
+ output = (start_logits, end_logits) + transformer_outputs[2:]
1209
+ return ((total_loss,) + output) if total_loss is not None else output
1210
+
1211
+ return QuestionAnsweringModelOutput(
1212
+ loss=total_loss,
1213
+ start_logits=start_logits,
1214
+ end_logits=end_logits,
1215
+ hidden_states=transformer_outputs.hidden_states,
1216
+ attentions=transformer_outputs.attentions,
1217
+ )
1218
+
1219
+ def get_input_embeddings(self):
1220
+ return self.model.decoder.embed_tokens
1221
+
1222
+ def set_input_embeddings(self, value):
1223
+ self.model.decoder.embed_tokens = value
model/opt_flash_attention.py ADDED
@@ -0,0 +1,331 @@
1
+ from typing import List, Optional, Tuple
2
+ import logging
3
+
4
+ import torch
5
+ from torch import nn
6
+
7
+ import transformers
8
+ from einops import rearrange
9
+
10
+ from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
11
+ from flash_attn.bert_padding import unpad_input, pad_input
12
+ from transformers.models.opt.modeling_opt import _make_causal_mask, _expand_mask
13
+
14
+
15
+ def _prepare_decoder_attention_mask_original(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
16
+ # create causal mask
17
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
18
+ combined_attention_mask = None
19
+ if input_shape[-1] > 1:
20
+ combined_attention_mask = _make_causal_mask(
21
+ input_shape,
22
+ inputs_embeds.dtype,
23
+ device=inputs_embeds.device,
24
+ past_key_values_length=past_key_values_length,
25
+ )
26
+
27
+ if attention_mask is not None:
28
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
29
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
30
+ inputs_embeds.device
31
+ )
32
+ combined_attention_mask = (
33
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
34
+ )
35
+
36
+ return combined_attention_mask
37
+
38
+ def forward_original(
39
+ self,
40
+ hidden_states: torch.Tensor,
41
+ key_value_states: Optional[torch.Tensor] = None,
42
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
43
+ attention_mask: Optional[torch.Tensor] = None,
44
+ layer_head_mask: Optional[torch.Tensor] = None,
45
+ output_attentions: bool = False,
46
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
47
+ """Input shape: Batch x Time x Channel"""
48
+ # if key_value_states are provided this layer is used as a cross-attention layer
49
+ # for the decoder
50
+ is_cross_attention = key_value_states is not None
51
+
52
+ bsz, tgt_len, _ = hidden_states.size()
53
+
54
+ # get query proj
55
+ query_states = self.q_proj(hidden_states) * self.scaling
56
+ # get key, value proj
57
+ if is_cross_attention and past_key_value is not None:
58
+ # reuse k,v, cross_attentions
59
+ key_states = past_key_value[0]
60
+ value_states = past_key_value[1]
61
+ elif is_cross_attention:
62
+ # cross_attentions
63
+ key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
64
+ value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
65
+ elif past_key_value is not None:
66
+ # reuse k, v, self_attention
67
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
68
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
69
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
70
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
71
+ else:
72
+ # self_attention
73
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
74
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
75
+
76
+ if self.is_decoder:
77
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
78
+ # Further calls to cross_attention layer can then reuse all cross-attention
79
+ # key/value_states (first "if" case)
80
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
81
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
82
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
83
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
84
+ past_key_value = (key_states, value_states)
85
+
86
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
87
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
88
+ key_states = key_states.view(*proj_shape)
89
+ value_states = value_states.view(*proj_shape)
90
+
91
+ src_len = key_states.size(1)
92
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
93
+
94
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
95
+ raise ValueError(
96
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
97
+ f" {attn_weights.size()}"
98
+ )
99
+
100
+ if attention_mask is not None:
101
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
102
+ raise ValueError(
103
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
104
+ )
105
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
106
+ attn_weights = torch.max(
107
+ attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
108
+ )
109
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
110
+
111
+ # upcast to fp32 if the weights are in fp16. Please see https://github.com/huggingface/transformers/pull/17437
112
+ if attn_weights.dtype == torch.float16:
113
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(torch.float16)
114
+ else:
115
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
116
+
117
+ if layer_head_mask is not None:
118
+ if layer_head_mask.size() != (self.num_heads,):
119
+ raise ValueError(
120
+ f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
121
+ f" {layer_head_mask.size()}"
122
+ )
123
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
124
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
125
+
126
+ if output_attentions:
127
+ # this operation is a bit awkward, but it's required to
128
+ # make sure that attn_weights keeps its gradient.
129
+ # In order to do so, attn_weights have to be reshaped
130
+ # twice and have to be reused in the following
131
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
132
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
133
+ else:
134
+ attn_weights_reshaped = None
135
+
136
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
137
+
138
+ attn_output = torch.bmm(attn_probs, value_states)
139
+
140
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
141
+ raise ValueError(
142
+ f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
143
+ f" {attn_output.size()}"
144
+ )
145
+
146
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
147
+ attn_output = attn_output.transpose(1, 2)
148
+
149
+ # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
150
+ # partitioned across GPUs when using tensor-parallelism.
151
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
152
+
153
+ attn_output = self.out_proj(attn_output)
154
+
155
+ return attn_output, attn_weights_reshaped, past_key_value
156
+
157
+
158
+ def forward(
159
+ self,
160
+ hidden_states: torch.Tensor,
161
+ key_value_states: Optional[torch.Tensor] = None,
162
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
163
+ attention_mask: Optional[torch.Tensor] = None,
164
+ layer_head_mask: Optional[torch.Tensor] = None,
165
+ output_attentions: bool = False,
166
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
167
+ """Input shape: Batch x Time x Channel"""
168
+
169
+ # if key_value_states are provided this layer is used as a cross-attention layer
170
+ # for the decoder
171
+ is_cross_attention = key_value_states is not None
172
+ assert not is_cross_attention, "Cross attention is not supported for flash attention"
173
+ assert past_key_value is None, "past_key_value is not None is not supported for flash attention"
174
+ assert not output_attentions, "output_attentions is not supported for flash attention"
175
+
176
+ bsz, tgt_len, _ = hidden_states.size()
177
+
178
+ # get query proj
179
+ query_states = self.q_proj(hidden_states) * self.scaling
180
+ # get key, value proj
181
+
182
+ if past_key_value is not None:
183
+ # reuse k, v, self_attention
184
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
185
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
186
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
187
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
188
+ else:
189
+ # self_attention
190
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
191
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
192
+
193
+ if self.is_decoder:
194
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
195
+ # Further calls to cross_attention layer can then reuse all cross-attention
196
+ # key/value_states (first "if" case)
197
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
198
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
199
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
200
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
201
+ past_key_value = (key_states, value_states)
202
+
203
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
204
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
205
+ key_states = key_states.view(*proj_shape)
206
+ value_states = value_states.view(*proj_shape)
207
+
208
+ ## for flash attention
209
+ flash_shape = (bsz, self.num_heads, tgt_len, self.head_dim)
210
+ query_states = query_states.view(*flash_shape)
211
+ key_states = key_states.view(*flash_shape)
212
+ value_states = value_states.view(*flash_shape)
213
+ qkv = torch.stack([query_states, key_states, value_states], dim=2) # shape = [bsz, num_heads, 3, tgt_len, head_dim]
214
+ qkv = qkv.transpose(1, 3) # [bsz, tgt_len, 3, num_heads, head_dim]
215
+
216
+ key_padding_mask = attention_mask
217
+
218
+
219
+ assert key_padding_mask is not None
220
+ x = rearrange(qkv, "b s three h d -> b s (three h d)")
221
+ x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
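+ # unpad_input drops the padded positions indicated by key_padding_mask and returns cu_seqlens (cumulative sequence lengths) and max_s (longest sequence) for the variable-length flash-attention kernel; pad_input below scatters the outputs back into the dense [bsz, tgt_len] layout.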
222
+ x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=self.num_heads)
223
+ output_unpad = flash_attn_varlen_qkvpacked_func(
224
+ x_unpad, cu_seqlens, max_s, self.dropout if self.training else 0.0,
225
+ softmax_scale=1, causal=True, return_attn_probs=False
226
+ )
227
+
228
+ output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
229
+ indices, bsz, tgt_len),
230
+ 'b s (h d) -> b s h d', h=self.num_heads)
231
+
232
+ attn_output = self.out_proj(rearrange(output, "b s h d -> b s (h d)"))
233
+ return attn_output, None, past_key_value
234
+
235
+
236
+ # Disable the transformation of the attention mask in OPTDecoder, as flash attention
237
+ # requires the attention mask to be the same as the key_padding_mask
238
+ def _prepare_decoder_attention_mask(
239
+ self, attention_mask, input_shape, inputs_embeds, past_key_values_length
240
+ ):
241
+ # [bsz, seq_len]
242
+ return attention_mask
243
+
244
+
245
+ def replace_opt_attn_with_flash_attn():
246
+ cuda_major, cuda_minor = torch.cuda.get_device_capability()
247
+ if cuda_major < 8:
248
+ logging.warning(
249
+ "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward."
250
+ "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
251
+ )
252
+ transformers.models.opt.modeling_opt.OPTDecoder._prepare_decoder_attention_mask = _prepare_decoder_attention_mask
253
+ transformers.models.opt.modeling_opt.OPTAttention.forward = forward
254
+
255
+ def replace_opt_attn_with_original_attn():
256
+ transformers.models.opt.modeling_opt.OPTDecoder._prepare_decoder_attention_mask = _prepare_decoder_attention_mask_original
257
+ transformers.models.opt.modeling_opt.OPTAttention.forward = forward_original
258
+
259
+ if __name__ == '__main__':
260
+ ## generate tests to verify the equivalence between forward_original and forward
261
+ import torch.nn as nn
262
+ import math
263
+ class FakeNN(nn.Module):
264
+ def __init__(self, ):
265
+ super().__init__()
266
+ self.scaling = 1 / math.sqrt(2048)
267
+ if False:
268
+ self.q_proj = nn.Linear(2048, 2048)
269
+ self.k_proj = nn.Linear(2048, 2048)
270
+ self.v_proj = nn.Linear(2048, 2048)
271
+ self.out_proj = nn.Linear(2048, 2048)
272
+ else:
273
+ self.q_proj = nn.Identity()
274
+ self.k_proj = nn.Identity()
275
+ self.v_proj = nn.Identity()
276
+ self.out_proj = nn.Identity()
277
+
278
+ self.is_decoder = True
279
+ self.num_heads = 2
280
+ self.head_dim = 128
281
+ self.embed_dim = 256
282
+ self.dropout = 0
283
+
284
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
285
+ # create causal mask
286
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
287
+ combined_attention_mask = None
288
+ if input_shape[-1] > 1:
289
+ combined_attention_mask = _make_causal_mask(
290
+ input_shape,
291
+ inputs_embeds.dtype,
292
+ device=inputs_embeds.device,
293
+ past_key_values_length=past_key_values_length,
294
+ )
295
+
296
+ if attention_mask is not None:
297
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
298
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
299
+ inputs_embeds.device
300
+ )
301
+ combined_attention_mask = (
302
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
303
+ )
304
+
305
+ return combined_attention_mask
306
+
307
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
308
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
309
+
310
+ fakenn = FakeNN().to(torch.bfloat16).to('cuda:0')
311
+
312
+ t_len = 3
313
+ fake_input = torch.randn(2, t_len, fakenn.embed_dim).to(torch.bfloat16).to('cuda:0')
314
+ if False:
315
+ fake_lens = torch.randint(0, t_len, (2,)).to('cuda:0')
316
+ fake_lens = torch.LongTensor([3, 2]).to('cuda:0')
317
+ # fake_lens = torch.ones((2,)).to('cuda:0') * 3
318
+ fake_mask = torch.arange(t_len).unsqueeze(0).to('cuda:0') < fake_lens.unsqueeze(1)
319
+ else:
320
+ fake_mask = torch.randint(0, t_len, (2, t_len)).bool().to('cuda:0')
321
+
322
+ fake_mask2 = fakenn._prepare_decoder_attention_mask(fake_mask, (2,t_len), fake_input, 0)
323
+ attn_output0, _, _ = forward_original(fakenn, fake_input, None, None, fake_mask2, None, False)
324
+ attn_output1, _, _ = forward(fakenn, fake_input, None, None, fake_mask, None, False) # shape = [2, 3, 256]
325
+ attn_output0 = attn_output0 * fake_mask.unsqueeze(-1)
326
+
327
+ print(torch.isclose(attn_output0, attn_output1).all())
328
+ print(attn_output0.shape, attn_output1.shape)
329
+ difference = (attn_output0 - attn_output1).abs()
330
+ print(difference)
331
+ print(difference.sum())
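
A hedged usage sketch for the monkey patch above (not part of the commit; model name and dtype are illustrative). Because the patch swaps `OPTAttention.forward` and the decoder's mask preparation at the class level, it has to be applied before the OPT model is built, and the patched path is training-oriented: it asserts that `past_key_value` is absent and never returns attention weights.

```python
import torch
from transformers import OPTForCausalLM
from model.opt_flash_attention import replace_opt_attn_with_flash_attn

replace_opt_attn_with_flash_attn()   # patch OPTAttention.forward + mask preparation first
model = OPTForCausalLM.from_pretrained(
    "facebook/opt-1.3b", torch_dtype=torch.bfloat16  # flash attention expects fp16/bf16 on CUDA
).to("cuda")
# Forward passes now route through flash_attn_varlen_qkvpacked_func; softmax_scale=1 is
# correct here because the query projection is already pre-multiplied by self.scaling.
```
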
read_results/baselines.py ADDED
@@ -0,0 +1,141 @@
1
+ from utils import *
2
+ import torch
3
+ from rxnfp.transformer_fingerprints import (
4
+ RXNBERTFingerprintGenerator, get_default_model_and_tokenizer, generate_fingerprints
5
+ )
6
+
7
+ class Reaction_model:
8
+ def __init__(self, train_list, test_list):
9
+ self.train_list = train_list
10
+ self.test_list = test_list
11
+
12
+ model, tokenizer = get_default_model_and_tokenizer()
13
+ self.rxnfp_generator = RXNBERTFingerprintGenerator(model, tokenizer)
14
+
15
+ @time_it
16
+ def generate_random(self):
17
+ pred = random.sample(self.train_list, k=len(self.test_list))
18
+ pred = [i['actions'] for i in pred]
19
+ return pred
20
+
21
+ @time_it
22
+ def generate_random_compatible_old(self):
23
+ pred_list = []
24
+ len_id_map = defaultdict(list)
25
+ for train_rxn in self.train_list:
26
+ len_id_map[len(train_rxn['extracted_molecules'])-1].append(train_rxn['index'])
27
+
28
+ keys = sorted(k for k in len_id_map.keys())
29
+ accumulated_counts = {}
30
+ count = 0
31
+ for key in keys:
32
+ count += len(len_id_map[key])
33
+ accumulated_counts[key] = count
34
+
35
+ for rxn in self.test_list:
36
+ test_token_num = len(rxn['extracted_molecules'])-1
37
+ idx = random.randint(0, accumulated_counts[test_token_num] - 1)
38
+ for key in keys:
39
+ if idx < len(len_id_map[key]):
40
+ pred_list.append(self.train_list[len_id_map[key][idx]]['actions'])
41
+ break
42
+ else:
43
+ idx -= len(len_id_map[key])
44
+ return pred_list
45
+
46
+ @time_it
47
+ def generate_random_compatible(self):
48
+ pred_list = []
49
+ len_id_map = defaultdict(list)
50
+ for train_rxn in self.train_list:
51
+ len_id_map[len(train_rxn['extracted_molecules'])-1].append(train_rxn['index'])
52
+
53
+ for rxn in self.test_list:
54
+ mole_num = len(rxn['extracted_molecules'])-1
55
+ pred_list.append(self.train_list[random.choice(len_id_map[mole_num])]['actions'])
56
+ return pred_list
57
+
58
+ @time_it
59
+ def generate_nn(self, batch_size=2048):
60
+ train_rxns = [f"{'.'.join(rxn['REACTANT'])}>>{rxn['PRODUCT'][0]}" for rxn in self.train_list]
61
+ test_rxns = [f"{'.'.join(rxn['REACTANT'])}>>{rxn['PRODUCT'][0]}" for rxn in self.test_list]
62
+
63
+ train_rxns_batches = [train_rxns[i:i+batch_size] for i in range(0, len(train_rxns), batch_size)]
64
+ test_rxns_batches = [test_rxns[i:i+batch_size] for i in range(0, len(test_rxns), batch_size)]
65
+
66
+ device = torch.device("cuda")
67
+ train_fps = []
68
+ for batch in tqdm(train_rxns_batches, desc='Generating fingerprints for training reactions'):
69
+ batch_fps = self.rxnfp_generator.convert_batch(batch)
70
+ train_fps.extend(batch_fps)
71
+ train_fps = torch.tensor(train_fps, device=device) # N x 256
72
+
73
+ most_similar_indices = []
74
+ for batch in tqdm(test_rxns_batches, desc='Generating fingerprints for test reactions'):
75
+ batch_fps = self.rxnfp_generator.convert_batch(batch)
76
+ batch_fps = torch.tensor(batch_fps, device=device) # BS x 256
77
+ batch_fps = batch_fps / torch.norm(batch_fps, dim=1, keepdim=True)
78
+
79
+ similarity_matrix = torch.matmul(train_fps, batch_fps.T) # N x BS
80
+ most_similar_indices.extend(torch.argmax(similarity_matrix, dim=0).tolist())
81
+
82
+ return [self.train_list[i]['actions'] for i in most_similar_indices]
83
+
84
+ def save_results(self, gt_list, pred_list, target_file):
85
+ text_dict_list = [{
86
+ "targets": gt,
87
+ "indices": i,
88
+ "predictions": pred,
89
+ } for i, (gt, pred) in enumerate(zip(gt_list, pred_list))]
90
+
91
+ with open(target_file, 'w') as f:
92
+ json.dump(text_dict_list, f, indent=4)
93
+
94
+ def parse_args():
95
+ parser = argparse.ArgumentParser(description="A simple argument parser")
96
+
97
+ parser.add_argument('--name', default='none', type=str)
98
+ parser.add_argument('--train_file', default=None, type=str)
99
+ parser.add_argument('--test_file', default=None, type=str)
100
+ parser.add_argument('--use_tok', default=False, action='store_true')
101
+ args = parser.parse_args()
102
+ return args
103
+
104
+ def read_dataset(args):
105
+ print(f'Reading {args.train_file}...')
106
+ with open(args.train_file, 'r', encoding='utf-8') as f:
107
+ train_ds = json.load(f)
108
+ print(f'{len(train_ds)} samples read.')
109
+ print(f'Reading {args.test_file}...')
110
+ with open(args.test_file, 'r', encoding='utf-8') as f:
111
+ test_ds = json.load(f)
112
+ print(f'{len(test_ds)} samples read.')
113
+ return train_ds, test_ds
114
+
115
+ def run_baselines(args):
116
+ set_random_seed(0)
117
+
118
+ train_ds, test_ds = read_dataset(args)
119
+ model = Reaction_model(train_ds, test_ds)
120
+ calculator = Metric_calculator()
121
+ gt_list = [i['actions'] for i in test_ds]
122
+
123
+ print('Random:')
124
+ pred_list = model.generate_random()
125
+ calculator(gt_list, pred_list, args.use_tok)
126
+ model.save_results(gt_list, pred_list, f'results/{args.name}/random.json')
127
+
128
+ print('Random (compatible pattern):')
129
+ pred_list = model.generate_random_compatible()
130
+ calculator(gt_list, pred_list, args.use_tok)
131
+ model.save_results(gt_list, pred_list, f'results/{args.name}/random_compatible.json')
132
+
133
+ print('Nearest neighbor:')
134
+ pred_list = model.generate_nn()
135
+ calculator(gt_list, pred_list, args.use_tok)
136
+ model.save_results(gt_list, pred_list, f'results/{args.name}/nn.json')
137
+ # assert 0
138
+
139
+ if __name__ == "__main__":
140
+ args=parse_args()
141
+ run_baselines(args)
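
For reference, the nearest-neighbour baseline above reduces to a dot-product retrieval over rxnfp fingerprints. A condensed, self-contained sketch of that step (random tensors stand in for the real fingerprints; shapes mirror the comments in `generate_nn`):

```python
import torch

train_fps = torch.randn(1000, 256)                                     # [N, 256] training reaction fingerprints
test_fps = torch.nn.functional.normalize(torch.randn(8, 256), dim=1)   # [M, 256], normalized as in generate_nn

similarity = train_fps @ test_fps.T          # [N, M] similarity matrix
nearest = similarity.argmax(dim=0)           # for each test reaction, index of the closest training reaction
print(nearest.tolist())
```
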
read_results/read_results.py ADDED
@@ -0,0 +1,28 @@
1
+ from utils import *
2
+
3
+ def parse_args():
4
+ parser = argparse.ArgumentParser(description="A simple argument parser")
5
+
6
+ parser.add_argument('--name', default='none', type=str)
7
+ parser.add_argument('--path', default=None, type=str)
8
+ parser.add_argument('--use_tok', default=False, action='store_true')
9
+ args = parser.parse_args()
10
+ return args
11
+
12
+ def read_dataset(args):
13
+ print(f'Reading {args.path}...')
14
+ with open(args.path, 'r', encoding='utf-8') as f:
15
+ test_tgt = [json.loads(line) for line in f.readlines()]
16
+ print(f'{len(test_tgt)} samples read.')
17
+ gt_list = [i['targets'] for i in test_tgt]
18
+ pred_list = [i['predictions'] for i in test_tgt]
19
+ return gt_list, pred_list
20
+
21
+ def read_result(args):
22
+ gt_list, pred_list = read_dataset(args)
23
+ calculator = Metric_calculator()
24
+ calculator(gt_list, pred_list, args.use_tok)
25
+
26
+ if __name__ == "__main__":
27
+ args=parse_args()
28
+ read_result(args)
read_results/score.py ADDED
@@ -0,0 +1,358 @@
1
+ from rdkit import Chem
2
+ import os
3
+ import argparse
4
+ from tqdm import tqdm
5
+ import multiprocessing
6
+ import pandas as pd
7
+ from rdkit import RDLogger
8
+ import re
9
+ from utils import *
10
+
11
+ lg = RDLogger.logger()
12
+ lg.setLevel(RDLogger.CRITICAL)
13
+
14
+
15
+ def extract_smiles(s):
16
+ start_token = "[START_SMILES]"
17
+ end_token = "[END_SMILES]"
18
+ start_index = s.find(start_token)
19
+ end_index = s.find(end_token)
20
+ if start_index > -1 and end_index > -1:
21
+ return s[start_index + len(start_token):end_index].strip()
22
+ return s
23
+
24
+ def canonicalize_smiles_clear_map(smiles,return_max_frag=True):
25
+ mol = Chem.MolFromSmiles(smiles,sanitize=not opt.synthon)
26
+ if mol is not None:
27
+ [atom.ClearProp('molAtomMapNumber') for atom in mol.GetAtoms() if atom.HasProp('molAtomMapNumber')]
28
+ try:
29
+ smi = Chem.MolToSmiles(mol, isomericSmiles=False)
30
+ except:
31
+ if return_max_frag:
32
+ return '',''
33
+ else:
34
+ return ''
35
+ if return_max_frag:
36
+ sub_smi = smi.split(".")
37
+ sub_mol = [Chem.MolFromSmiles(smiles,sanitize=not opt.synthon) for smiles in sub_smi]
38
+ sub_mol_size = [(sub_smi[i], len(m.GetAtoms())) for i, m in enumerate(sub_mol) if m is not None]
39
+ if len(sub_mol_size) > 0:
40
+ return smi, canonicalize_smiles_clear_map(sorted(sub_mol_size,key=lambda x:x[1],reverse=True)[0][0],return_max_frag=False)
41
+ else:
42
+ return smi, ''
43
+ else:
44
+ return smi
45
+ else:
46
+ if return_max_frag:
47
+ return '',''
48
+ else:
49
+ return ''
50
+
51
+
52
+ def compute_rank(input_smiles, prediction,raw=False,alpha=1.0):
53
+ valid_score = [[k for k in range(len(prediction[j]))] for j in range(len(prediction))]
54
+ invalid_rates = [0 for k in range(len(prediction[0]))]
55
+ rank = {}
56
+ max_frag_rank = {}
57
+ highest = {}
58
+ if raw:
59
+ # no test augmentation
60
+ assert len(prediction) == 1
61
+ for j in range(len(prediction)):
62
+ for k in range(len(prediction[j])):
63
+ if prediction[j][k][0] == "":
64
+ invalid_rates[k] += 1
65
+ # error detection
66
+ de_error = [i[0] for i in sorted(list(zip(prediction[j], valid_score[j])), key=lambda x: x[1]) if i[0][0] != ""]
67
+ prediction[j] = list(set(de_error))
68
+ prediction[j].sort(key=de_error.index)
69
+ for k, data in enumerate(prediction[j]):
70
+ rank[data] = 1 / (alpha * k + 1)
71
+ else:
72
+ for j in range(len(prediction)): # aug_num, beam_size, 2
73
+ for k in range(len(prediction[j])):
74
+ # predictions[i][j][k] = canonicalize_smiles_clear_map(predictions[i][j][k])
75
+ if prediction[j][k][0] == "":
76
+ valid_score[j][k] = opt.beam_size + 1
77
+ invalid_rates[k] += 1
78
+ # error detection and deduplication
79
+ de_error = [i[0] for i in sorted(list(zip(prediction[j], valid_score[j])), key=lambda x: x[1]) if i[0][0] != ""]
80
+ prediction[j] = list(set(de_error))
81
+ prediction[j].sort(key=de_error.index)
82
+ for k, data in enumerate(prediction[j]):
83
+ if data in rank:
84
+ rank[data] += 1 / (alpha * k + 1)
85
+ else:
86
+ rank[data] = 1 / (alpha * k + 1)
87
+ if data in highest:
88
+ highest[data] = min(k,highest[data])
89
+ else:
90
+ highest[data] = k
91
+ for key in rank.keys():
92
+ rank[key] += highest[key] * -1
93
+ rank[key] += abs(len(key[0])-len(input_smiles)) * -0.2
94
+ rank[key] += len(key[0]) * -0.2
95
+ return rank,invalid_rates
96
+
97
+ def read_dataset(opt):
98
+ print(f'Reading {opt.path}...')
99
+ with open(opt.path, 'r', encoding='utf-8') as f:
100
+ test_tgt = [json.loads(line) for line in f.readlines()]
101
+ if opt.raw:
102
+ test_tgt = test_tgt[::opt.augmentation]
103
+ filtered_tgt = {}
104
+ idx_key = 'ds_idx' if 'ds_idx' in test_tgt[0] else 'index'
105
+ for dic in test_tgt:
106
+ if dic[idx_key] not in filtered_tgt:
107
+ filtered_tgt[dic[idx_key]] = dic
108
+ test_tgt = list(filtered_tgt.values())
109
+ test_tgt.sort(key=lambda x: x[idx_key])
110
+ print(f'{len(test_tgt)} samples read.')
111
+ input_list = [extract_smiles(i['input']) for i in test_tgt]
112
+ gt_list = [i['targets'].replace('[START_SMILES]', '').replace('[END_SMILES]', '').replace('SPL1T-TH1S-Pl3A5E','').strip().replace(' ','.') for i in test_tgt]
113
+ pred_list = [[smi.strip().replace(' ','.') for smi in i['predictions']] for i in test_tgt]
114
+ return input_list, gt_list, pred_list
115
+
116
+ def main(opt):
117
+ input_list, gt_list, pred_list = read_dataset(opt)
118
+ if opt.raw:
119
+ opt.augmentation=1
120
+ print('Reading predictions from file ...')
121
+
122
+ # inputs
123
+ print("Input Length", len(gt_list))
124
+ ras_src_smiles = input_list[::opt.augmentation]
125
+ with multiprocessing.Pool(processes=opt.process_number) as pool:
126
+ ras_src_smiles = pool.map(func=canonicalize_smiles_clear_map,iterable=ras_src_smiles)
127
+ ras_src_smiles = [i[0] for i in ras_src_smiles]
128
+
129
+ # predictions
130
+ print("Prediction Length", len(pred_list))
131
+ pred_lines = [i.split('>')[0] for d in pred_list for i in d]
132
+ data_size = len(pred_lines) // (opt.augmentation * opt.beam_size) if opt.length == -1 else opt.length
133
+ pred_lines = pred_lines[:data_size * (opt.augmentation * opt.beam_size)]
134
+ print("Canonicalizing predictions using Process Number ",opt.process_number)
135
+ with multiprocessing.Pool(processes=opt.process_number) as pool:
136
+ raw_predictions = pool.map(func=canonicalize_smiles_clear_map,iterable=pred_lines)
137
+
138
+ predictions = [[[] for j in range(opt.augmentation)] for i in range(data_size)] # data_len x augmentation x beam_size
139
+ for i, line in enumerate(raw_predictions):
140
+ predictions[i // (opt.beam_size * opt.augmentation)][i % (opt.beam_size * opt.augmentation) // opt.beam_size].append(line)
141
+
142
+ # ground truth
143
+ print("Origin Length", len(gt_list))
144
+ targets = [''.join(gt_list[i].strip().split(' ')) for i in tqdm(range(0,data_size * opt.augmentation,opt.augmentation))]
145
+ with multiprocessing.Pool(processes=opt.process_number) as pool:
146
+ targets = pool.map(func=canonicalize_smiles_clear_map, iterable=targets)
147
+
148
+ print("predictions Length", len(predictions), len(predictions[0]), len(predictions[0][0]))
149
+ print("Target Length", len(targets))
150
+
151
+ ground_truth = targets
152
+ print("Origin Target Lentgh, ", len(ground_truth))
153
+ print("Cutted Length, ",data_size)
154
+ print('\n')
155
+ accuracy = [0 for j in range(opt.n_best)]
156
+ topn_accuracy_chirality = [0 for _ in range(opt.n_best)]
157
+ topn_accuracy_wochirality = [0 for _ in range(opt.n_best)]
158
+ topn_accuracy_ringopening = [0 for _ in range(opt.n_best)]
159
+ topn_accuracy_ringformation = [0 for _ in range(opt.n_best)]
160
+ topn_accuracy_woring = [0 for _ in range(opt.n_best)]
161
+ total_chirality = 0
162
+ total_ringopening = 0
163
+ total_ringformation = 0
164
+ atomsize_topk = []
165
+ accurate_indices = [[] for j in range(opt.n_best)]
166
+ max_frag_accuracy = [0 for j in range(opt.n_best)]
167
+ invalid_rates = [0 for j in range(opt.beam_size)]
168
+ sorted_invalid_rates = [0 for j in range(opt.beam_size)]
169
+ unique_rates = 0
170
+ ranked_results = []
171
+
172
+ for i in tqdm(range(len(predictions))):
173
+ accurate_flag = False
174
+ if opt.detailed:
175
+ chirality_flag = False
176
+ ringopening_flag = False
177
+ ringformation_flag = False
178
+ pro_mol = Chem.MolFromSmiles(ras_src_smiles[i])
179
+ rea_mol = Chem.MolFromSmiles(ground_truth[i][0])
180
+ try:
181
+ pro_ringinfo = pro_mol.GetRingInfo()
182
+ rea_ringinfo = rea_mol.GetRingInfo()
183
+ pro_ringnum = pro_ringinfo.NumRings()
184
+ rea_ringnum = rea_ringinfo.NumRings()
185
+ size = len(rea_mol.GetAtoms()) - len(pro_mol.GetAtoms())
186
+ # if (int(ras_src_smiles[i].count("@") > 0) + int(ground_truth[i][0].count("@") > 0)) == 1:
187
+ if ras_src_smiles[i].count("@") > 0 or ground_truth[i][0].count("@") > 0:
188
+ total_chirality += 1
189
+ chirality_flag = True
190
+ if pro_ringnum < rea_ringnum:
191
+ total_ringopening += 1
192
+ ringopening_flag = True
193
+ if pro_ringnum > rea_ringnum:
194
+ total_ringformation += 1
195
+ ringformation_flag = True
196
+ except:
197
+ pass
198
+ # continue
199
+
200
+ inputs = input_list[i*opt.augmentation:(i+1)*opt.augmentation]
201
+ gt = gt_list[i*opt.augmentation:(i+1)*opt.augmentation]
202
+ rank, invalid_rate = compute_rank(ras_src_smiles[i], predictions[i], raw=opt.raw,alpha=opt.score_alpha)
203
+
204
+ rank_ = {k[0]: v for k, v in sorted(rank.items(), key=lambda item: item[1], reverse=True)}
205
+ if opt.detailed:
206
+ print('Index', i)
207
+ print('inputs', json.dumps(inputs, indent=4))
208
+ print('targets', json.dumps(gt, indent=4))
209
+ print('input', ras_src_smiles[i])
210
+ print('target', targets[i][0])
211
+ print('rank', json.dumps(rank_,indent=4))
212
+ print('invalid_rate', json.dumps(invalid_rate,indent=4))
213
+ print('\n')
214
+ for j in range(opt.beam_size):
215
+ invalid_rates[j] += invalid_rate[j]
216
+ rank = list(zip(rank.keys(),rank.values()))
217
+ rank.sort(key=lambda x:x[1],reverse=True)
218
+ rank = rank[:opt.n_best]
219
+ ranked_results.append([item[0][0] for item in rank])
220
+
221
+ for j, item in enumerate(rank):
222
+ if item[0][0] == ground_truth[i][0]:
223
+ if not accurate_flag:
224
+ accurate_flag = True
225
+ accurate_indices[j].append(i)
226
+ for k in range(j, opt.n_best):
227
+ accuracy[k] += 1
228
+ if opt.detailed:
229
+ atomsize_topk.append((size,j))
230
+ if chirality_flag:
231
+ for k in range(j,opt.n_best):
232
+ topn_accuracy_chirality[k] += 1
233
+ else:
234
+ for k in range(j,opt.n_best):
235
+ topn_accuracy_wochirality[k] += 1
236
+ if ringopening_flag:
237
+ for k in range(j,opt.n_best):
238
+ topn_accuracy_ringopening[k] += 1
239
+ if ringformation_flag:
240
+ for k in range(j,opt.n_best):
241
+ topn_accuracy_ringformation[k] += 1
242
+ if not ringopening_flag and not ringformation_flag:
243
+ for k in range(j,opt.n_best):
244
+ topn_accuracy_woring[k] += 1
245
+
246
+ if opt.detailed and not accurate_flag:
247
+ atomsize_topk.append((size,opt.n_best))
248
+ for j, item in enumerate(rank):
249
+ if item[0][1] == ground_truth[i][1]:
250
+ for k in range(j,opt.n_best):
251
+ max_frag_accuracy[k] += 1
252
+ break
253
+ for j in range(len(rank),opt.beam_size):
254
+ sorted_invalid_rates[j] += 1
255
+ unique_rates += len(rank)
256
+
257
+ for i in range(opt.n_best):
258
+ if i in [0,1,2,3,4,5,6,7,8,9,19,49]:
259
+ # if i in range(10):
260
+ print("Top-{} Acc:{:.3f}%, MaxFrag {:.3f}%,".format(i+1,accuracy[i] / data_size * 100,max_frag_accuracy[i] / data_size * 100),
261
+ " Invalid SMILES:{:.3f}% Sorted Invalid SMILES:{:.3f}%".format(invalid_rates[i] / data_size / opt.augmentation * 100,sorted_invalid_rates[i] / data_size / opt.augmentation * 100))
262
+ print(' '.join([f'{accuracy[i] / data_size * 100:.3f}' for i in [0,2,4,9]]))
263
+ print("Unique Rates:{:.3f}%".format(unique_rates / data_size / opt.beam_size * 100))
264
+
265
+ if opt.detailed:
266
+ print_topk = [1,3,5,10]
267
+ save_dict = {}
268
+ atomsize_topk.sort(key=lambda x:x[0])
269
+ differ_now = atomsize_topk[0][0]
270
+ topn_accuracy_bydiffer = [0 for _ in range(opt.n_best)]
271
+ total_bydiffer = 0
272
+ for i,item in enumerate(atomsize_topk):
273
+ if differ_now < 11 and differ_now != item[0]:
274
+ for j in range(opt.n_best):
275
+ if (j+1) in print_topk:
276
+ save_dict[f'top-{j+1}_size_{differ_now}'] = topn_accuracy_bydiffer[j] / total_bydiffer * 100
277
+ print("Top-{} Atom differ size {} Acc:{:.3f}%, Number:{:.3f}%".format(j+1,
278
+ differ_now,
279
+ topn_accuracy_bydiffer[j] / total_bydiffer * 100,
280
+ total_bydiffer/data_size * 100))
281
+ total_bydiffer = 0
282
+ topn_accuracy_bydiffer = [0 for _ in range(opt.n_best)]
283
+ differ_now = item[0]
284
+ for k in range(item[1],opt.n_best):
285
+ topn_accuracy_bydiffer[k] += 1
286
+ total_bydiffer += 1
287
+ for j in range(opt.n_best):
288
+ if (j + 1) in print_topk:
289
+ print("Top-{} Atom differ size {} Acc:{:.3f}%, Number:{:.3f}%".format(j + 1,
290
+ differ_now,
291
+ topn_accuracy_bydiffer[j] / total_bydiffer * 100,
292
+ total_bydiffer / data_size * 100))
293
+ save_dict[f'top-{j+1}_size_{differ_now}'] = topn_accuracy_bydiffer[j] / total_bydiffer * 100
294
+
295
+ for i in range(opt.n_best):
296
+ if (i+1) in print_topk:
297
+ if total_chirality > 0:
298
+ print("Top-{} Accuracy with chirality:{:.3f}%".format(i + 1, topn_accuracy_chirality[i] / total_chirality * 100))
299
+ save_dict[f'top-{i+1}_chirality'] = topn_accuracy_chirality[i] / total_chirality * 100
300
+ print("Top-{} Accuracy without chirality:{:.3f}%".format(i + 1, topn_accuracy_wochirality[i] / (data_size - total_chirality) * 100))
301
+ save_dict[f'top-{i+1}_wochirality'] = topn_accuracy_wochirality[i] / (data_size - total_chirality) * 100
302
+ if total_ringopening > 0:
303
+ print("Top-{} Accuracy ring Opening:{:.3f}%".format(i + 1, topn_accuracy_ringopening[i] / total_ringopening * 100))
304
+ save_dict[f'top-{i+1}_ringopening'] = topn_accuracy_ringopening[i] / total_ringopening * 100
305
+ if total_ringformation > 0:
306
+ print("Top-{} Accuracy ring Formation:{:.3f}%".format(i + 1, topn_accuracy_ringformation[i] / total_ringformation * 100))
307
+ save_dict[f'top-{i+1}_ringformation'] = topn_accuracy_ringformation[i] / total_ringformation * 100
308
+ print("Top-{} Accuracy without ring:{:.3f}%".format(i + 1, topn_accuracy_woring[i] / (data_size - total_ringopening - total_ringformation) * 100))
309
+ save_dict[f'top-{i+1}_woring'] = topn_accuracy_woring[i] / (data_size - total_ringopening - total_ringformation) * 100
310
+ print(total_chirality)
311
+ print(total_ringformation)
312
+ print(total_ringopening)
313
+ # df = pd.DataFrame(list(save_dict.items()))
314
+ df = pd.DataFrame(save_dict,index=[0])
315
+ df.to_csv("detailed_results.csv")
316
+ if opt.save_accurate_indices != "":
317
+ with open(opt.save_accurate_indices, "w") as f:
318
+ total_accurate_indices = []
319
+ for indices in accurate_indices:
320
+ total_accurate_indices.extend(indices)
321
+ total_accurate_indices.sort()
322
+
323
+ # for index in total_accurate_indices:
324
+ for index in accurate_indices[0]:
325
+ f.write(str(index))
326
+ f.write("\n")
327
+
328
+ if opt.save_file != "":
329
+ with open(opt.save_file,"w") as f:
330
+ for res in ranked_results:
331
+ for smi in res:
332
+ f.write(smi)
333
+ f.write("\n")
334
+ for i in range(len(res),opt.n_best):
335
+ f.write("")
336
+ f.write("\n")
337
+
338
+
339
+ if __name__ == "__main__":
340
+ parser = argparse.ArgumentParser(
341
+ description='score.py',
342
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
343
+ parser.add_argument('--beam_size', type=int, default=10,help='Beam size')
344
+ parser.add_argument('--n_best', type=int, default=10,help='n best')
345
+ parser.add_argument('--path', type=str, required=True, help="Path to file containing the predictions and ground truth.")
346
+ parser.add_argument('--augmentation', type=int, default=20)
347
+ parser.add_argument('--score_alpha', type=float, default=1.0)
348
+ parser.add_argument('--length', type=int, default=-1)
349
+ parser.add_argument('--process_number', type=int, default=multiprocessing.cpu_count())
350
+ parser.add_argument('--synthon', action="store_true", default=False)
351
+ parser.add_argument('--detailed', action="store_true", default=False)
352
+ parser.add_argument('--raw', action="store_true", default=False)
353
+ parser.add_argument('--save_file', type=str,default="")
354
+ parser.add_argument('--save_accurate_indices', type=str,default="")
355
+
356
+ opt = parser.parse_args()
357
+ print(opt)
358
+ main(opt)
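
To make the ranking in `compute_rank` concrete: every canonicalized prediction collects a score of 1/(alpha*k+1) for each augmented input in which it appears at beam position k, and candidates are ranked by the accumulated score (the real code adds further length and highest-position penalties as tie-breakers). A toy illustration with made-up SMILES, not taken from the dataset:

```python
alpha = 1.0
# Beam outputs (already canonicalized) for two augmentations of the same product, beam size 3.
beams = [
    ["CCO", "CCN", "CCC"],
    ["CCN", "CCO", "CCC"],
]

rank = {}
for beam in beams:
    for k, smi in enumerate(beam):
        rank[smi] = rank.get(smi, 0.0) + 1 / (alpha * k + 1)

print(sorted(rank.items(), key=lambda kv: kv[1], reverse=True))
# [('CCO', 1.5), ('CCN', 1.5), ('CCC', 0.666...)]; ties are broken by the extra penalties in compute_rank
```
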
read_results/t_test.py ADDED
@@ -0,0 +1,48 @@
1
+ from utils import *
2
+ import scipy.stats as stats
3
+
4
+ def parse_args():
5
+ parser = argparse.ArgumentParser(description="A simple argument parser")
6
+
7
+ parser.add_argument('--name', default='none', type=str)
8
+ parser.add_argument('--path_exp', default=None, type=str)
9
+ parser.add_argument('--path_ref', default=None, type=str)
10
+ parser.add_argument('--use_tok', default=False, action='store_true')
11
+ args = parser.parse_args()
12
+ return args
13
+
14
+ def read_dataset(data_path):
15
+ print(f'Reading {data_path}...')
16
+ with open(data_path, 'r', encoding='utf-8') as f:
17
+ test_tgt = [json.loads(line) for line in f.readlines()]
18
+ print(f'{len(test_tgt)} samples read.')
19
+ gt_list = [i['targets'] for i in test_tgt]
20
+ pred_list = [i['predictions'] for i in test_tgt]
21
+ return gt_list, pred_list
22
+
23
+ def t_test(mean_exp, std_exp, mean_ref, std_ref, n):
24
+ numerator = mean_exp - mean_ref
25
+ denominator = np.sqrt((std_exp**2 / n) + (std_ref**2 / n))
26
+ t_statistic = numerator / denominator
27
+ df = (((std_exp**2 / n) + (std_ref**2 / n))**2) / (((std_exp**2 / n)**2 / (n-1)) + ((std_ref**2 / n)**2 / (n-1)))
28
+
29
+ p_value = 2 * stats.t.sf(np.abs(t_statistic), df)
30
+ return t_statistic, p_value
31
+
32
+ def read_result(args):
33
+ gt_list_exp, pred_list_exp = read_dataset(args.path_exp)
34
+ gt_list_ref, pred_list_ref = read_dataset(args.path_ref)
35
+ calculator = Metric_calculator()
36
+ result_exp = calculator.get_result_list(gt_list_exp, pred_list_exp, args.use_tok)
37
+ result_ref = calculator.get_result_list(gt_list_ref, pred_list_ref, args.use_tok)
38
+
39
+ for key in ['bleu2', 'bleu4', 'rouge_1', 'rouge_2', 'rouge_l', 'lev_score', 'meteor_score']:
40
+ if not isinstance(result_exp[key], list):
41
+ continue
42
+ levene_s, levene_p = stats.levene(result_exp[key], result_ref[key])
43
+ t_stat, p_val = stats.ttest_ind(result_exp[key], result_ref[key], equal_var=(levene_p > 0.05))
44
+ print(f'{key} (mean={float(np.mean(result_exp[key])):.4f}, levene p={levene_p:.3f}):\t{t_stat:.6f}\t{p_val}')
45
+
46
+ if __name__ == "__main__":
47
+ args=parse_args()
48
+ read_result(args)
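
For reference, the standalone `t_test` helper above implements Welch's unequal-variance t-test with equal sample sizes n; in the code's notation:

```latex
t = \frac{\bar{x}_{\mathrm{exp}} - \bar{x}_{\mathrm{ref}}}{\sqrt{s_{\mathrm{exp}}^2/n + s_{\mathrm{ref}}^2/n}},
\qquad
\nu = \frac{\bigl(s_{\mathrm{exp}}^2/n + s_{\mathrm{ref}}^2/n\bigr)^2}
           {\frac{(s_{\mathrm{exp}}^2/n)^2}{n-1} + \frac{(s_{\mathrm{ref}}^2/n)^2}{n-1}},
\qquad
p = 2\,\Pr\bigl(T_\nu \ge |t|\bigr)
```

The main `read_result` path instead calls `scipy.stats.ttest_ind`, switching to the Welch form only when Levene's test rejects equal variances.
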
read_results/utils.py ADDED
@@ -0,0 +1,256 @@
1
+ from Levenshtein import distance as lev_distance
2
+ import random
3
+ import json
4
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
5
+ from nltk.translate.meteor_score import meteor_score
6
+ from rouge_score import rouge_scorer
7
+ from tqdm import tqdm
8
+ import random
9
+ import numpy as np
10
+ import argparse
11
+ from paragraph2actions.readable_converter import ReadableConverter
12
+ import re
13
+ from transformers import AutoTokenizer
14
+ from collections import defaultdict
15
+ import time
16
+ from functools import wraps
17
+ import os
18
+ import torch
19
+ import textdistance
20
+ from typing import List
21
+
22
+ def levenshtein_similarity(truth: List[str], pred: List[str]) -> List[float]:
23
+ assert len(truth) == len(pred)
24
+ scores: List[float] = [
25
+ textdistance.levenshtein.normalized_similarity(t, p)
26
+ for t, p in zip(truth, pred)
27
+ ]
28
+ return scores
29
+
30
+ def modified_bleu(truth: List[str], pred: List[str], bleu_n=4) -> float:
31
+ """
32
+ Calculates the BLEU score of a translation, with a small modification in order not to penalize sentences
33
+ with less than 4 words.
34
+
35
+ Returns:
36
+ value between 0 and 1.
37
+ """
38
+ references = [sentence.split() for sentence in truth]
39
+ candidates = [sentence.split() for sentence in pred]
40
+
41
+ # BLEU penalizes sentences with only one word. Even correct translations get a score of zero.
42
+ references = [r + max(0, bleu_n - len(r)) * [""] for r in references]
43
+ candidates = [c + max(0, bleu_n - len(c)) * [""] for c in candidates]
44
+
45
+ # references must have a larger depth because it supports multiple choices
46
+ refs = [[r] for r in references]
47
+ weights = {
48
+ 2: (0.5, 0.5),
49
+ 4: (0.25, 0.25, 0.25, 0.25),
50
+ }
51
+ return 100*corpus_bleu(refs, candidates, weights=weights[bleu_n]) # type: ignore[no-any-return]
52
+
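
A quick usage note for `modified_bleu` above: both references and candidates are padded with empty tokens up to `bleu_n`, so very short action strings are not scored zero by the higher-order n-grams. A hypothetical call, assuming the helper is importable from `read_results/utils.py`:

```python
# Exact matches score 100 even though "STIR" is a single word, thanks to the padding.
score = modified_bleu(truth=["ADD water", "STIR"], pred=["ADD water", "STIR"], bleu_n=4)
print(score)  # 100.0
```
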
53
+ def set_random_seed(seed):
54
+ random.seed(seed)
55
+ os.environ['PYTHONHASHSEED'] = str(seed)
56
+ np.random.seed(seed)
57
+ torch.manual_seed(seed)
58
+ torch.cuda.manual_seed(seed)
59
+ torch.cuda.manual_seed_all(seed) # If using multi-GPU.
60
+ torch.backends.cudnn.deterministic = True
61
+ torch.backends.cudnn.benchmark = False
62
+
63
+ def time_it(func):
64
+ @wraps(func)
65
+ def wrapper(*args, **kwargs):
66
+ start_time = time.time()
67
+ result = func(*args, **kwargs)
68
+ end_time = time.time()
69
+ print(f"Function {func.__name__} finished in {end_time - start_time:.5f} seconds.\n")
70
+ return result
71
+ return wrapper
72
+
73
+ def accuracy_score(score_list, threshold):
74
+ matches = sum(score>=threshold for score in score_list)
75
+ acc = matches / len(score_list)
76
+ return acc
77
+
78
+ def extract_tokenized_entities(text):
79
+ pattern = r'\$[^\$]+\$|#[^#]+#|@[^\@]+@'
80
+ return re.findall(pattern, text)
81
+
82
+ def extract_reactant_cnt(text):
83
+ max_id = None
84
+ for token in text.split():
85
+ if token.startswith('$') and token.endswith('$'):
86
+ try:
87
+ current_id = int(token.strip('$'))
88
+ if max_id is None or current_id > max_id:
89
+ max_id = current_id
90
+ except ValueError:
91
+ pass # Ignore tokens that do not represent an integer
92
+ if not max_id:
93
+ return 0
94
+ return max_id
95
+
+ class Metric_calculator:
+     def __init__(self, text_trunc_length=1024):
+         self.converter = ReadableConverter(separator=' ; ')
+         self.tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased', use_fast=False, padding_side='right')
+         self.tokenizer.add_special_tokens({'pad_token': '<pad>'})
+         self.text_trunc_length = text_trunc_length
+         self.scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
+
+     def tokenize(self, gt_list, pred_list):
+         references = []
+         hypotheses = []
+
+         for gt, out in tqdm(zip(gt_list, pred_list)):
+             gt_tokens = self.tokenizer.tokenize(gt)
+             ## added for galactica
+             gt_tokens = list(filter(('<pad>').__ne__, gt_tokens))
+             gt_tokens = list(filter(('[PAD]').__ne__, gt_tokens))
+             gt_tokens = list(filter(('[CLS]').__ne__, gt_tokens))
+             gt_tokens = list(filter(('[SEP]').__ne__, gt_tokens))
+
+             out_tokens = self.tokenizer.tokenize(out)
+             out_tokens = list(filter(('<pad>').__ne__, out_tokens))
+             out_tokens = list(filter(('[PAD]').__ne__, out_tokens))
+             out_tokens = list(filter(('[CLS]').__ne__, out_tokens))
+             out_tokens = list(filter(('[SEP]').__ne__, out_tokens))
+
+             references.append([gt_tokens])
+             hypotheses.append(out_tokens)
+         return references, hypotheses
+
+     @time_it
+     def __call__(self, gt_list, pred_list, use_tokenizer=False):
+         gt_list = [gt.strip() for gt in gt_list]
+         pred_list = [pred.strip() for pred in pred_list]
+
+         if use_tokenizer:
+             references, hypotheses = self.tokenize(gt_list, pred_list)
+             bleu2, bleu4 = self.bleu(references, hypotheses)
+             _meteor_score = self.meteor(references, hypotheses)
+         else:
+             bleu2 = modified_bleu(gt_list, pred_list, bleu_n=2)
+             bleu4 = modified_bleu(gt_list, pred_list, bleu_n=4)
+             _meteor_score = 0
+         rouge_1, rouge_2, rouge_l = self.rouge(gt_list, pred_list)
+
+         validity = self.validity(gt_list, pred_list)
+         acc_100, acc_90, acc_75, acc_50 = self.accuracy(gt_list, pred_list)
+
+         print('BLEU-2 score:', bleu2)
+         print('BLEU-4 score:', bleu4)
+         print('Average Meteor score:', _meteor_score)
+         print('rouge1:', rouge_1)
+         print('rouge2:', rouge_2)
+         print('rougeL:', rouge_l)
+
+         print(f'Validity: {validity:.6f}')
+         print(f'Accuracy (100): {acc_100:.6f}')
+         print(f'Accuracy (90): {acc_90:.6f}')
+         print(f'Accuracy (75): {acc_75:.6f}')
+         print(f'Accuracy (50): {acc_50:.6f}')
+
+         line = ''
+         for score in [validity, bleu2, bleu4, acc_100, acc_90, acc_75, acc_50, rouge_1, rouge_2, rouge_l, _meteor_score]:
+             line += f'{score:.6f} '
+         print(line)
+
+         return {
+             'bleu2': bleu2,
+             'bleu4': bleu4,
+             'rouge_1': rouge_1,
+             'rouge_2': rouge_2,
+             'rouge_l': rouge_l,
+             'meteor_score': _meteor_score,
+             'validity': validity,
+             'acc_100': acc_100,
+             'acc_90': acc_90,
+             'acc_75': acc_75,
+             'acc_50': acc_50,
+         }
+
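A minimal usage sketch for `Metric_calculator`, assuming the definitions above are importable, that `allenai/scibert_scivocab_uncased` can be fetched from the Hugging Face Hub, and that `paragraph2actions` is installed; the action strings are invented, not OpenExp outputs:

```python
# Hedged usage sketch: evaluate a toy prediction against its ground truth.
gt_actions   = ["ADD $1$ ; STIR for 2 h ; FILTER keep precipitate"]
pred_actions = ["ADD $1$ ; STIR for 3 h ; FILTER keep precipitate"]

calculator = Metric_calculator()
metrics = calculator(gt_actions, pred_actions)  # corpus-level scores, also printed to stdout
print(metrics['bleu2'], metrics['acc_90'])

# use_tokenizer=True switches to SciBERT tokenization and additionally reports METEOR
# (METEOR here relies on NLTK's wordnet data being available).
metrics_tok = calculator(gt_actions, pred_actions, use_tokenizer=True)
```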
+     def get_result_list(self, gt_list, pred_list, use_tokenizer=False):
+         gt_list = [gt.strip() for gt in gt_list]
+         pred_list = [pred.strip() for pred in pred_list]
+
+         if use_tokenizer:
+             references, hypotheses = self.tokenize(gt_list, pred_list)
+             bleu2 = [corpus_bleu([gt], [pred], weights=(.5, .5)) for gt, pred in zip(references, hypotheses)]
+             bleu4 = [corpus_bleu([gt], [pred], weights=(.25, .25, .25, .25)) for gt, pred in zip(references, hypotheses)]
+             _meteor_score = [meteor_score(gt, out) for gt, out in zip(references, hypotheses)]
+         else:
+             bleu2 = [modified_bleu([gt], [pred], bleu_n=2) for gt, pred in zip(gt_list, pred_list)]
+             bleu4 = [modified_bleu([gt], [pred], bleu_n=4) for gt, pred in zip(gt_list, pred_list)]
+             _meteor_score = 0
+         rouge_1, rouge_2, rouge_l = self.rouge(gt_list, pred_list, return_list=True)
+
+         lev_score = levenshtein_similarity(gt_list, pred_list)
+
+         return {
+             'bleu2': bleu2,
+             'bleu4': bleu4,
+             'rouge_1': rouge_1,
+             'rouge_2': rouge_2,
+             'rouge_l': rouge_l,
+             'meteor_score': _meteor_score,
+             'lev_score': lev_score,
+         }
+
+     def bleu(self, references, hypotheses):
+         bleu2 = corpus_bleu(references, hypotheses, weights=(.5, .5))
+         bleu4 = corpus_bleu(references, hypotheses, weights=(.25, .25, .25, .25))
+         bleu2 *= 100
+         bleu4 *= 100
+         return bleu2, bleu4
+
+     def meteor(self, references, hypotheses):
+         meteor_scores = []
+         for gt, out in zip(references, hypotheses):
+             mscore = meteor_score(gt, out)
+             meteor_scores.append(mscore)
+         _meteor_score = np.mean(meteor_scores)
+         _meteor_score *= 100
+         return _meteor_score
+
+     def rouge(self, targets, predictions, return_list=False):
+         rouge_scores = []
+         for gt, out in zip(targets, predictions):
+             rs = self.scorer.score(out, gt)
+             rouge_scores.append(rs)
+
+         rouge_1 = [rs['rouge1'].fmeasure for rs in rouge_scores]
+         rouge_2 = [rs['rouge2'].fmeasure for rs in rouge_scores]
+         rouge_l = [rs['rougeL'].fmeasure for rs in rouge_scores]
+         if return_list:
+             return rouge_1, rouge_2, rouge_l
+
+         rouge_1 = np.mean(rouge_1) * 100
+         rouge_2 = np.mean(rouge_2) * 100
+         rouge_l = np.mean(rouge_l) * 100
+         return rouge_1, rouge_2, rouge_l
+
+     def validity(self, gt_list, pred_list):
+         num_valid, n = 0, len(pred_list)
+         for pred, gt in zip(pred_list, gt_list):
+             try:
+                 actions = self.converter.string_to_actions(pred)
+                 max_token_pred = extract_reactant_cnt(pred)
+                 max_token_gt = extract_reactant_cnt(gt)
+                 assert max_token_gt >= max_token_pred
+                 num_valid += 1
+             except Exception:
+                 pass  # unparsable or inconsistent predictions count as invalid
+         return 100 * (num_valid / n)
+
+     def accuracy(self, gt_list, pred_list):
+         score_list = levenshtein_similarity(gt_list, pred_list)
+         acc_100 = 100 * accuracy_score(score_list, 1.0)
+         acc_90 = 100 * accuracy_score(score_list, 0.90)
+         acc_75 = 100 * accuracy_score(score_list, 0.75)
+         acc_50 = 100 * accuracy_score(score_list, 0.50)
+         return acc_100, acc_90, acc_75, acc_50
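The accuracy family is simply the fraction of samples whose normalized Levenshtein similarity clears a threshold; for per-sample error analysis, `get_result_list` returns the unaggregated scores. Continuing the sketch above (the output path is hypothetical):

```python
# Per-sample scores for error analysis; 'per_sample_scores.json' is a hypothetical path.
import json

per_sample = calculator.get_result_list(gt_actions, pred_actions)
print(per_sample['lev_score'])  # one edit over 48 characters -> ≈0.98, counted for acc_90 but not acc_100

with open('per_sample_scores.json', 'w') as f:
    json.dump(per_sample, f, indent=2)
```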
visualize_context_gen.py ADDED
@@ -0,0 +1,164 @@
+ # argparse, numpy (np), matplotlib.pyplot (plt), os, set_random_seed and Reaction_Cluster
+ # are expected to be provided by this star import.
+ from data_provider.context_gen import *
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="A simple argument parser")
+
+     # Script arguments
+     parser.add_argument('--name', default='none', type=str)
+     parser.add_argument('--seed', default=0, type=int)
+     parser.add_argument('--epochs', default=100, type=int)
+     parser.add_argument('--chunk_size', default=100, type=int)
+     parser.add_argument('--rxn_num', default=50000, type=int)
+     parser.add_argument('--k', default=4, type=int)
+     parser.add_argument('--root', default='data/pretrain_data', type=str)
+
+     args = parser.parse_args()
+     return args
+
+ def pad_shorter_array(arr1, arr2):
+     len1 = arr1.shape[0]
+     len2 = arr2.shape[0]
+     if len1 > len2:
+         arr2 = np.pad(arr2, (0, len1 - len2), 'constant')
+     elif len2 > len1:
+         arr1 = np.pad(arr1, (0, len2 - len1), 'constant')
+     return arr1, arr2
+
+ def plot_distribution(values, target_path, x_lim=None, y_lim=None, chunk_size=100, color='blue'):
+     num_full_chunks = len(values) // chunk_size
+     values = np.mean(values[:num_full_chunks*chunk_size].reshape(-1, chunk_size), axis=1)
+     values = np.sort(values)[::-1]
+     plt.figure(figsize=(10, 4), dpi=100)
+     x = np.arange(len(values))
+     plt.bar(x, values, color=color)
+     current_values = np.array([0, 200000, 400000, 600000, 800000, 1000000], dtype=int)
+     plt.xticks((current_values/chunk_size).astype(int), current_values)
+     plt.ylabel('Molecule Frequency', fontsize=20)
+     if x_lim:
+         plt.xlim(*x_lim)
+     if y_lim:
+         plt.ylim(*y_lim)
+     plt.tick_params(axis='both', which='major', labelsize=12)
+     plt.tight_layout(pad=0.5)
+     plt.savefig(target_path)
+     print(f'Figure saved to {target_path}')
+     plt.clf()
+
+ def plot_compare_distribution(list1, list2, target_path, x_lim=None, y_lim=None, labels=['Random', 'Ours'], colors=['blue', 'orange'], chunk_size=100):
+     num_full_chunks = len(list1) // chunk_size
+     list1, list2 = pad_shorter_array(list1, list2)
+     values1, values2 = [
+         np.sort(np.mean(values[:num_full_chunks*chunk_size].reshape(-1, chunk_size), axis=1))[::-1]
+         for values in (list1, list2)]
+
+     plt.figure(figsize=(10, 6), dpi=100)
+     x = np.arange(len(values1))
+     plt.bar(x, values1, color=colors[0], label=labels[0], alpha=0.6)
+     plt.bar(x, values2, color=colors[1], label=labels[1], alpha=0.5)
+     current_values = np.array([0, 200000, 400000, 600000, 800000, 1000000], dtype=int)
+     plt.xticks((current_values/chunk_size).astype(int), current_values)
+     plt.ylabel('Molecule Frequency', fontsize=20)
+     if x_lim:
+         plt.xlim(*x_lim)
+     if y_lim:
+         plt.ylim(*y_lim)
+     plt.tick_params(axis='both', which='major', labelsize=18)
+     plt.tight_layout(pad=0.5)
+     plt.legend(fontsize=24, loc='upper right')
+     plt.savefig(target_path)
+     print(f'Figure saved to {target_path}')
+     plt.clf()
+
+ def statistics(args):
+     if args.seed:
+         set_random_seed(args.seed)
+     # 1141864 rxns from ord
+     # 1120773 rxns from uspto
+     cluster = Reaction_Cluster(args.root)
+
+     rxn_num = len(cluster.reaction_data)
+     abstract_num = 0
+     property_num = 0
+     calculated_property_num = 0
+     experimental_property_num = 0
+     avg_calculated_property_len = 0
+     avg_experimental_property_len = 0
+     mol_set = set()
+     for rxn_dict in cluster.reaction_data:
+         for key in ['REACTANT', 'CATALYST', 'SOLVENT', 'PRODUCT']:
+             for mol in rxn_dict[key]:
+                 mol_set.add(mol)
+     mol_num = len(mol_set)
+
+     for mol_dict in cluster.property_data:
+         if 'abstract' in mol_dict:
+             abstract_num += 1
+         if 'property' in mol_dict:
+             property_num += 1
+             if 'Experimental Properties' in mol_dict['property']:
+                 experimental_property_num += 1
+                 avg_experimental_property_len += len(mol_dict['property']['Experimental Properties'])
+             if 'Computed Properties' in mol_dict['property']:
+                 calculated_property_num += 1
+                 avg_calculated_property_len += len(mol_dict['property']['Computed Properties'])
+
+     print(f'Reaction Number: {rxn_num}')
+     print(f'Molecule Number: {mol_num}')
+     print(f'Abstract Number: {abstract_num}/{mol_num}({abstract_num/mol_num*100:.2f}%)')
+     print(f'Property Number: {property_num}/{mol_num}({property_num/mol_num*100:.2f}%)')
+     print(f'- Experimental Properties Number: {experimental_property_num}/{property_num}({experimental_property_num/property_num*100:.2f}%), {avg_experimental_property_len/mol_num:.2f} items per molecule')
+     print(f'- Computed Properties: {calculated_property_num}/{property_num}({calculated_property_num/property_num*100:.2f}%), {avg_calculated_property_len/mol_num:.2f} items per molecule')
+
+ def visualize(args):
+     if args.seed:
+         set_random_seed(args.seed)
+     cluster = Reaction_Cluster(args.root)
+     prob_values, rxn_weights = cluster.visualize_mol_distribution()
+     rand_prob_values, rand_rxn_weights = cluster._randomly(
+         cluster.visualize_mol_distribution
+     )
+     fig_root = f'results/{args.name}/'
+
+     plot_distribution(prob_values, fig_root+'mol_distribution.pdf')
+     plot_distribution(rxn_weights, fig_root+'rxns_distribution.pdf')
+     plot_distribution(rand_prob_values, fig_root+'mol_distribution_random.pdf')
+     plot_distribution(rand_rxn_weights, fig_root+'rxns_distribution_random.pdf')
+
+     plot_compare_distribution(prob_values, rand_prob_values, fig_root+'Compare_mol.pdf', y_lim=(-0.5, 15.5))
+     plot_compare_distribution(rxn_weights, rand_rxn_weights, fig_root+'Compare_rxns.pdf')
+
+ def visualize_frequency(args):
+     if args.seed:
+         set_random_seed(args.seed)
+     fig_root = f'results/{args.name}/'
+     name_suffix = f'E{args.epochs}_Rxn{args.rxn_num}_K{args.k}'
+     cache_path = f'{fig_root}/freq_{name_suffix}.npy'
+     if os.path.exists(cache_path):
+         mol_freq, rxn_freq, rand_mol_freq, rand_rxn_freq = np.load(cache_path, allow_pickle=True)
+     else:
+         cluster = Reaction_Cluster(args.root)
+         mol_freq, rxn_freq = cluster.visualize_mol_frequency(rxn_num=args.rxn_num, k=args.k, epochs=args.epochs)
+         rand_mol_freq, rand_rxn_freq = cluster._randomly(
+             cluster.visualize_mol_frequency,
+             rxn_num=args.rxn_num, k=args.k, epochs=args.epochs
+         )
+         np.save(cache_path, np.array([mol_freq, rxn_freq, rand_mol_freq, rand_rxn_freq], dtype=object), allow_pickle=True)
+
+     color1 = '#FA7F6F'
+     color2 = '#80AFBF'
+     color3 = '#FFBE7A'
+     plot_distribution(mol_freq, fig_root+f'mol_frequency_{name_suffix}.pdf', x_lim=(-50000//args.chunk_size, 1200000//args.chunk_size), y_lim=(-2, 62), chunk_size=args.chunk_size, color=color2)
+     # plot_distribution(rxn_freq, fig_root+f'rxns_frequency_{name_suffix}.pdf', chunk_size=args.chunk_size, color=color1)
+     plot_distribution(rand_mol_freq, fig_root+f'mol_frequency_random_{name_suffix}.pdf', x_lim=(-50000//args.chunk_size, 1200000//args.chunk_size), y_lim=(-2, 62), chunk_size=args.chunk_size, color=color2)
+     # plot_distribution(rand_rxn_freq, fig_root+f'rxns_frequency_random_{name_suffix}.pdf', chunk_size=args.chunk_size, color=color1)
+
+     plot_compare_distribution(rand_mol_freq, mol_freq, fig_root+f'Compare_mol_{name_suffix}.pdf', y_lim=(-2, 62), labels=['Before Adjustment', 'After Adjustment'], colors=[color1, color2], chunk_size=args.chunk_size)
+     # plot_compare_distribution(rxn_freq, rand_rxn_freq, fig_root+f'Compare_rxns_{name_suffix}.pdf', chunk_size=args.chunk_size)
+
+ if __name__ == '__main__':
+     args = parse_args()
+     print(args, flush=True)
+     # statistics(args)
+     # visualize(args)
+     visualize_frequency(args)
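A hedged sketch of driving the script without the CLI; the fields mirror `parse_args`, the run name is made up, and the calls stay commented out because they need the pretraining data under `data/pretrain_data` and an existing `results/<name>/` directory:

```python
# Programmatic equivalent of:  python visualize_context_gen.py --name reactxt_stats --seed 42
# ('reactxt_stats' is a hypothetical run name; nothing runs until a call is uncommented.)
from argparse import Namespace

args = Namespace(name='reactxt_stats', seed=42, epochs=100, chunk_size=100,
                 rxn_num=50000, k=4, root='data/pretrain_data')
# statistics(args)           # dataset-level counts of reactions, molecules, abstracts, properties
# visualize_frequency(args)  # molecule-frequency figures before/after the reweighting adjustment
```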