Simon Duerr committed
Commit 8c639ec
0 Parent(s)
CODEOWNERS ADDED
@@ -0,0 +1,2 @@
# Global owner
* @alexechu
LICENSE ADDED
@@ -0,0 +1,29 @@
BSD 3-Clause License

Copyright (c) 2022, Alex Chu
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ProteinMPNN ADDED
@@ -0,0 +1 @@
Subproject commit 8907e6671bfbfc92303b5f79c4b5e6ce47cdef57
app.py ADDED
@@ -0,0 +1,498 @@
import gradio as gr

import re
import urllib.request

import tempfile

from output_helpers import viewer_html, output_html, load_js, get_js

import json
import os
import shlex
import subprocess
from datetime import datetime

from einops import repeat
import torch

from core import data
from core import utils
import models
import sampling

# from draw_samples import draw_and_save_samples, parse_resample_idx_string


def draw_and_save_samples(
    model,
    samples_per_len=8,
    lengths=range(50, 512),
    save_dir="./",
    mode="backbone",
    **sampling_kwargs,
):
    device = model.device
    sample_files = []
    if mode == "backbone":
        total_sampling_time = 0
        for l in lengths:
            prot_lens = torch.ones(samples_per_len).long() * l
            seq_mask = model.make_seq_mask_for_sampling(prot_lens=prot_lens)
            aux = sampling.draw_backbone_samples(
                model,
                seq_mask=seq_mask,
                pdb_save_path=f"{save_dir}/len{format(l, '03d')}_samp",
                return_aux=True,
                return_sampling_runtime=True,
                **sampling_kwargs,
            )
            total_sampling_time += aux["runtime"]
            sample_files += [
                f"{save_dir}/len{format(l, '03d')}_samp{i}.pdb"
                for i in range(samples_per_len)
            ]
        return sample_files
    elif mode == "allatom":
        total_sampling_time = 0
        for l in lengths:
            prot_lens = torch.ones(samples_per_len).long() * l
            seq_mask = model.make_seq_mask_for_sampling(prot_lens=prot_lens)
            aux = sampling.draw_allatom_samples(
                model,
                seq_mask=seq_mask,
                pdb_save_path=f"{save_dir}/len{format(l, '03d')}",
                return_aux=True,
                **sampling_kwargs,
            )
            total_sampling_time += aux["runtime"]
            sample_files += [
                f"{save_dir}/len{format(l, '03d')}_samp{i}.pdb"
                for i in range(samples_per_len)
            ]
        return sample_files


def parse_idx_string(idx_str):
    """Parse a string like '0,2-5,7' into a list of indices; dashed spans are end-exclusive."""
    spans = idx_str.split(",")
    idxs = []
    for s in spans:
        if "-" in s:
            start, stop = s.split("-")
            idxs.extend(list(range(int(start), int(stop))))
        else:
            idxs.append(int(s))
    return idxs

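As an illustration (not part of the committed file), the parser behaves as follows; the dashed spans are end-exclusive because they map directly onto Python's range:

# Usage sketch, assuming parse_idx_string from above is in scope:
assert parse_idx_string("0,2-5,7") == [0, 2, 3, 4, 7]  # '2-5' expands to 2,3,4
assert parse_idx_string("10") == [10]
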
def changemode(m):
    if m == "unconditional":
        return (
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(visible=True),
        )
    else:
        return (
            gr.update(visible=False),
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(visible=True),
            gr.update(visible=False),
        )


def fileselection(val):
    if val == "upload":
        return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
    else:
        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)

def update_structuresel(pdb, radio_val):
    pdb_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdb")

    representations = [
        {
            "model": 0,
            "chain": "",
            "resname": "",
            "style": "cartoon",
            "color": "whiteCarbon",
            "residue_range": "",
            "around": 0,
            "byres": False,
            "visible": False,
        }
    ]

    def viewer_iframe(path):
        # Embed the structure viewer in a sandboxed iframe (shared by all branches below).
        return f"""<iframe style="width: 100%; height: 930px" name="result" allow="midi; geolocation; microphone; camera;
    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
    allow-scripts allow-same-origin allow-popups
    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
    allowpaymentrequest="" frameborder="0" srcdoc='{viewer_html(path, representations=representations)}'></iframe>"""

    if radio_val == "PDB":
        if len(pdb) != 4:
            return gr.update(open=True), gr.update(), gr.update(value="", visible=False)
        else:
            urllib.request.urlretrieve(
                f"http://files.rcsb.org/download/{pdb.lower()}.pdb1",
                pdb_file.name,
            )
            return (
                gr.update(open=False),
                gr.update(value=pdb_file.name),
                gr.update(value=viewer_iframe(pdb_file.name), visible=True),
            )
    elif radio_val == "AF2 EBI DB":  # must match the Radio choice defined in the UI below
        # UniProt accession pattern.
        if re.match("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}", pdb) is not None:
            urllib.request.urlretrieve(
                f"https://alphafold.ebi.ac.uk/files/AF-{pdb}-F1-model_v2.pdb",
                pdb_file.name,
            )
            return (
                gr.update(open=False),
                gr.update(value=pdb_file.name),
                gr.update(value=viewer_iframe(pdb_file.name), visible=True),
            )
        else:
            # Keep the accordion open, leave the file path unchanged, and show the error.
            return gr.update(open=True), gr.update(), gr.update(value="regex not matched", visible=True)
    else:
        return (
            gr.update(open=False),
            gr.update(value=f"{pdb.name}"),
            gr.update(value=viewer_iframe(pdb.name), visible=True),
        )

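The UniProt accession pattern used above can be exercised in isolation (an illustrative sketch; the pattern string is copied verbatim from the function):

import re

UNIPROT_RE = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"

assert re.match(UNIPROT_RE, "P12345") is not None  # classic 6-character accession
assert re.match(UNIPROT_RE, "A0A0B4") is not None  # newer-style accession
assert re.match(UNIPROT_RE, "1ABC") is None        # a PDB code does not match
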
from Bio.PDB import PDBParser, cealign
from Bio.PDB.PDBIO import PDBIO


class dotdict(dict):
    """dot.notation access to dictionary attributes"""

    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

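Because dotdict aliases attribute access to dict.get, missing keys read as None instead of raising AttributeError, which the sampling code below relies on. A quick sketch (assumes the dotdict class above):

args = dotdict({"minlen": 50})
args.maxlen = 60          # same as args["maxlen"] = 60
print(args.minlen)        # 50
print(args.input_pdb)     # None, not an AttributeError
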
def protpardelle(path_to_file, m, resample_idx, modeltype, minlen, maxlen, steplen, perlen):
    # Set up params, arguments, sampling config
    ####################

    args = {}
    args["model_checkpoint"] = "checkpoints"  # path to denoiser model weights and config
    args["mpnnpath"] = "checkpoints/minimpnn_state_dict.pth"  # path to minimpnn model weights
    args["modeldir"] = None  # model base directory, ex 'training_logs/other/lemon-shape-51'
    args["modelepoch"] = None  # model epoch, ex 1000

    args["type"] = modeltype  # type of model
    if m == "conditional":
        args["param"] = None  # which sampling param to vary
        args["paramval"] = None  # which param val to use
        args["parampath"] = None  # path to json file with params; use param/paramval or parampath, not both
        args["perlen"] = int(perlen)  # how many samples per sequence length
        args["minlen"] = None  # minimum sequence length
        args["maxlen"] = None  # maximum sequence length, not inclusive
        args["steplen"] = int(steplen)  # how frequently to select sequence length; for steplen 2: 50, 52, 54, ...
        args["num_lens"] = None  # if steplen not provided, how many random lengths to sample at
        args["targetdir"] = "."  # directory to save results
        args["input_pdb"] = path_to_file  # PDB file to condition on
        # Indices from the PDB file to resample: zero-indexed, comma-delimited,
        # dashes allowed, e.g. 0,2-5,7. The [1:-1] strips the surrounding
        # brackets from the textbox value set by the sequence viewer.
        args["resample_idxs"] = resample_idx[1:-1]
    else:
        args["param"] = "n_steps"  # which sampling param to vary
        args["paramval"] = "100"  # which param val to use
        args["parampath"] = None
        args["perlen"] = int(perlen)
        args["minlen"] = int(minlen)
        args["maxlen"] = int(maxlen) + 1  # +1 so the slider's maximum is inclusive
        args["steplen"] = int(steplen)
        args["num_lens"] = None
        args["targetdir"] = "."
        args["resample_idxs"] = None

    args = dotdict(args)
    is_test_run = False
    seed = 0
    samples_per_len = args.perlen
    min_len = args.minlen
    max_len = args.maxlen
    len_step_size = args.steplen
    device = "cuda:0"

    # Set the default sampling config
    if args.type == "backbone":
        sampling_config = sampling.default_backbone_sampling_config()
    elif args.type == "allatom":
        sampling_config = sampling.default_allatom_sampling_config()

    sampling_kwargs = vars(sampling_config)

    # Parse conditioning inputs. In unconditional mode args has no "input_pdb"
    # key, so dotdict returns None and this block is skipped.
    input_pdb_len = None
    if args.input_pdb:
        input_feats = utils.load_feats_from_pdb(args.input_pdb, protein_only=True)
        input_pdb_len = input_feats["aatype"].shape[0]
        if args.resample_idxs:
            print(
                f"Warning: when sampling conditionally, the input pdb length ({input_pdb_len} residues) is used automatically for the sampling lengths."
            )
            resample_idxs = parse_idx_string(args.resample_idxs)
        else:
            resample_idxs = list(range(input_pdb_len))
        cond_idxs = [i for i in range(input_pdb_len) if i not in resample_idxs]
        to_batch_size = lambda x: repeat(x, "... -> b ...", b=samples_per_len).to(device)

        # For unconditional model, center coords on whole structure
        centered_coords = data.apply_random_se3(
            input_feats["atom_positions"],
            atom_mask=input_feats["atom_mask"],
            translation_scale=0.0,
        )
        cond_kwargs = {}
        cond_kwargs["gt_coords"] = to_batch_size(centered_coords)
        cond_kwargs["gt_cond_atom_mask"] = to_batch_size(input_feats["atom_mask"])
        cond_kwargs["gt_cond_atom_mask"][:, resample_idxs] = 0
        cond_kwargs["gt_aatype"] = to_batch_size(input_feats["aatype"])
        cond_kwargs["gt_cond_seq_mask"] = torch.zeros_like(cond_kwargs["gt_aatype"])
        cond_kwargs["gt_cond_seq_mask"][:, cond_idxs] = 1
        sampling_kwargs.update(cond_kwargs)

        print("input_pdb_len", input_pdb_len)

    # Determine lengths to sample at
    if min_len is not None and max_len is not None:
        if len_step_size is not None:
            sampling_lengths = range(min_len, max_len, len_step_size)
        else:
            sampling_lengths = list(torch.randint(min_len, max_len, size=(args.num_lens,)))
    elif input_pdb_len is not None:
        sampling_lengths = [input_pdb_len]
    else:
        raise Exception("Need to provide a set of protein lengths or an input pdb.")

    total_num_samples = len(list(sampling_lengths)) * samples_per_len

    model_directory = args.modeldir
    epoch = args.modelepoch
    base_dir = args.targetdir

    date_string = datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    if is_test_run:
        date_string = f"test-{date_string}"

    # Update sampling config with arguments
    if args.param:
        var_param = args.param
        var_value = args.paramval
        sampling_kwargs[var_param] = (
            None
            if var_value == "None"
            else int(var_value)
            if var_param == "n_steps"
            else float(var_value)
        )
    elif args.parampath:
        with open(args.parampath) as f:
            var_params = json.loads(f.read())
        sampling_kwargs.update(var_params)

    # This is only used for the readme; keep s_min and s_max as params instead of struct_noise_schedule
    sampling_kwargs_readme = list(sampling_kwargs.items())

    print("Base directory:", base_dir)
    save_dir = f"{base_dir}/samples/{date_string}"
    save_init_dir = f"{base_dir}/samples_inits/{date_string}"

    # Make dirs if they do not exist
    if not os.path.exists(save_dir):
        subprocess.run(shlex.split(f"mkdir -p {save_dir}"))
    if not os.path.exists(save_init_dir):
        subprocess.run(shlex.split(f"mkdir -p {save_init_dir}"))

    print("Samples saved to:", save_dir)
    torch.manual_seed(seed)

    # Load model
    if args.type == "backbone":
        if args.model_checkpoint:
            checkpoint = f"{args.model_checkpoint}/backbone_state_dict.pth"
            cfg_path = f"{args.model_checkpoint}/backbone.yml"
        else:
            checkpoint = f"{model_directory}/checkpoints/epoch{epoch}_training_state.pth"
            cfg_path = f"{model_directory}/configs/backbone.yml"
        cfg = utils.load_config(cfg_path)
        weights = torch.load(checkpoint, map_location=device)["model_state_dict"]
        model = models.Protpardelle(cfg, device=device)
        model.load_state_dict(weights)
        model.to(device)
        model.eval()
        model.device = device
    elif args.type == "allatom":
        if args.model_checkpoint:
            checkpoint = f"{args.model_checkpoint}/allatom_state_dict.pth"
            cfg_path = f"{args.model_checkpoint}/allatom.yml"
        else:
            checkpoint = f"{model_directory}/checkpoints/epoch{epoch}_training_state.pth"
            cfg_path = f"{model_directory}/configs/allatom.yml"
        config = utils.load_config(cfg_path)
        weights = torch.load(checkpoint, map_location=device)["model_state_dict"]
        model = models.Protpardelle(config, device=device)
        model.load_state_dict(weights)
        model.load_minimpnn(args.mpnnpath)
        model.to(device)
        model.eval()
        model.device = device

    with open(save_dir + "/run_parameters.txt", "w") as f:
        f.write(f"Sampling run for {date_string}\n")
        f.write(f"Random seed {seed}\n")
        f.write(f"Model checkpoint: {checkpoint}\n")
        f.write(
            f"{samples_per_len} samples per length from {min_len}:{max_len}:{len_step_size}\n"
        )
        f.write("Sampling params:\n")
        for k, v in sampling_kwargs_readme:
            f.write(f"{k}\t{v}\n")

    # Draw samples
    output_files = draw_and_save_samples(
        model,
        samples_per_len=samples_per_len,
        lengths=sampling_lengths,
        save_dir=save_dir,
        mode=args.type,
        **sampling_kwargs,
    )

    return output_files

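The conditioning block above encodes what is held fixed purely through masks: gt_cond_atom_mask zeroes every residue slated for resampling, while gt_cond_seq_mask marks the complementary, conditioned positions. A self-contained sketch of that masking logic with toy shapes in place of real PDB features (illustrative, not part of the committed file):

import torch
from einops import repeat

n_res, n_atoms, batch = 6, 37, 2
atom_mask = torch.ones(n_res, n_atoms)
resample_idxs = [1, 2, 3]  # residues to regenerate
cond_idxs = [i for i in range(n_res) if i not in resample_idxs]

to_batch = lambda x: repeat(x, "... -> b ...", b=batch)
gt_cond_atom_mask = to_batch(atom_mask).clone()
gt_cond_atom_mask[:, resample_idxs] = 0  # no atom conditioning where we resample

gt_cond_seq_mask = torch.zeros(batch, n_res)
gt_cond_seq_mask[:, cond_idxs] = 1  # sequence is pinned only on kept residues
print(gt_cond_atom_mask[0, :, 0], gt_cond_seq_mask[0])
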
def api_predict(pdb_content, m, resample_idx, modeltype, minlen, maxlen, steplen, perlen):
    if m == "conditional":
        tempPDB = tempfile.NamedTemporaryFile(delete=False, suffix=".pdb")
        tempPDB.write(pdb_content.encode())
        tempPDB.close()
        path_to_file = tempPDB.name
    else:
        path_to_file = None

    try:
        designs = protpardelle(path_to_file, m, resample_idx, modeltype, minlen, maxlen, steplen, perlen)
    except Exception as e:
        print(e)
        raise gr.Error(str(e))

    # Load each design as a string
    design_str = []
    for d in designs:
        with open(d, "r") as f:
            design_str.append(f.read())

    results = list(zip(designs, design_str))
    return json.dumps(results)

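Because a hidden button further down is wired to api_name="protpardelle", this function is callable programmatically. A hedged sketch using the gradio_client package; the Space URL is a placeholder, and the argument order mirrors api_predict's inputs list:

from gradio_client import Client

client = Client("https://example-protpardelle.hf.space")  # placeholder URL
result = client.predict(
    "",               # pdb_content (empty for unconditional mode)
    "unconditional",  # m
    "",               # resample_idx
    "allatom",        # modeltype
    50, 60, 1, 2,     # minlen, maxlen, steplen, perlen
    api_name="/protpardelle",
)
print(result)  # JSON list of (filename, pdb_string) pairs
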
def predict(pdb_radio, path_to_file, m, resample_idx, modeltype, minlen, maxlen, steplen, perlen):
    print("running predict")
    try:
        designs = protpardelle(path_to_file, m, resample_idx, modeltype, minlen, maxlen, steplen, perlen)
    except Exception as e:
        print(e)
        # Errors are surfaced in the UI via gr.Error.
        raise gr.Error(str(e))

    parser = PDBParser()
    aligner = cealign.CEAligner()
    io = PDBIO()
    aligned_designs = []
    metrics = []
    if m == "conditional":
        # Align each design to the conditioning structure and record the RMSD.
        ref = parser.get_structure("ref", path_to_file)
        aligner.set_reference(ref)
        for d in designs:
            design = parser.get_structure("design", d)
            aligner.align(design)
            metrics.append({"rms": f"{aligner.rms:.1f}", "len": len(list(design[0].get_residues()))})
            io.set_structure(design)
            io.save(d.replace(".pdb", "_al.pdb"))
            aligned_designs.append(d.replace(".pdb", "_al.pdb"))
    else:
        for d in designs:
            design = parser.get_structure("design", d)
            metrics.append({"len": len(list(design[0].get_residues()))})
        aligned_designs = designs

    output_view = f"""<iframe style="width: 100%; height: 900px" name="result" allow="midi; geolocation; microphone; camera;
    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
    allow-scripts allow-same-origin allow-popups
    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
    allowpaymentrequest="" frameborder="0" srcdoc='{output_html(path_to_file, aligned_designs, metrics, resample_idx=resample_idx, mode=m)}'></iframe>"""

    return gr.update(open=False), gr.update(value=output_view, visible=True)

protpardelleDemo = gr.Blocks()

with protpardelleDemo:
    gr.Markdown("# Protpardelle")
    gr.Markdown("""An all-atom protein generative model.
    Alexander E. Chu, Lucy Cheng, Gina El Nesr, Minkai Xu, Po-Ssu Huang
    doi: https://doi.org/10.1101/2023.05.24.542194""")

    with gr.Accordion(label="Input options", open=True) as input_accordion:
        model = gr.Dropdown(["backbone", "allatom"], value="allatom", label="What to sample?")

        m = gr.Radio(["unconditional", "conditional"], value="unconditional", label="Choose a mode")

        # Unconditional
        with gr.Group(visible=True) as uncond:
            gr.Markdown("Unconditional sampling")
            # length = gr.Slider(minimum=0, maximum=200, step=1, value=50, label="length")
            # param = gr.Dropdown(["length", "param"], value="length", label="Which sampling param to vary?")
            # paramval = gr.Dropdown(["nsteps"], label="paramval", info="Which param val to use?")

        # Conditional
        with gr.Group(visible=False) as cond:
            with gr.Accordion(label="Structure to condition on", open=True) as input_accordion:
                pdb_radio = gr.Radio(["PDB", "AF2 EBI DB", "upload"], value="PDB", label="Source of the structure")
                pdbcode = gr.Textbox(label="PDB code, or UniProt accession to retrieve from the AlphaFold2 database", visible=True)
                pdbfile = gr.File(label="PDB File", visible=False)
                btn_load = gr.Button("Load PDB")
                pdb_radio.change(fileselection, inputs=pdb_radio, outputs=[pdbcode, pdbfile, btn_load])

            pdb_html = gr.HTML("", visible=False)

            path_to_file = gr.Textbox(label="Path to file", visible=False)
            resample_idxs = gr.Textbox(
                label="Cond Idxs",
                interactive=False,
                info="Zero-indexed list of indices to condition on; select in the sequence viewer above",
            )
            btn_load.click(update_structuresel, inputs=[pdbcode, pdb_radio], outputs=[input_accordion, path_to_file, pdb_html])
            pdbfile.change(update_structuresel, inputs=[pdbfile, pdb_radio], outputs=[input_accordion, path_to_file, pdb_html])

    with gr.Accordion(label="Sizes", open=True) as size_uncond:
        with gr.Row():
            minlen = gr.Slider(minimum=2, maximum=200, value=50, step=1, label="minlen", info="Minimum sequence length")
            maxlen = gr.Slider(minimum=3, maximum=200, value=60, step=1, label="maxlen", info="Maximum sequence length")
            steplen = gr.Slider(minimum=1, maximum=50, step=1, value=1, label="steplen", info="How frequently to select sequence length?")
            perlen = gr.Slider(minimum=1, maximum=200, step=1, value=2, label="perlen", info="How many samples per sequence length?")

    btn_conditional = gr.Button("Run conditional", visible=False)
    btn_unconditional = gr.Button("Run unconditional")
    m.change(changemode, inputs=m, outputs=[uncond, cond, btn_unconditional, btn_conditional, size_uncond])
    out = gr.HTML("", visible=True)

    btn_unconditional.click(predict, inputs=[pdb_radio, path_to_file, m, resample_idxs, model, minlen, maxlen, steplen, perlen], outputs=[input_accordion, out])

    # The conditional button only runs JS to collect the selected indices;
    # the resulting change on resample_idxs then triggers predict below.
    btn_conditional.click(fn=None, inputs=[resample_idxs], outputs=[resample_idxs], _js=get_js)

    out_text = gr.Textbox(label="Output", visible=False)
    # Hidden button for the named API route
    pdb_content = gr.Textbox(label="PDB Content", visible=False)
    btn_api = gr.Button("Run API", visible=False)
    btn_api.click(api_predict, inputs=[pdb_content, m, resample_idxs, model, minlen, maxlen, steplen, perlen], outputs=[out_text], api_name="protpardelle")

    resample_idxs.change(predict, inputs=[pdb_radio, path_to_file, m, resample_idxs, model, minlen, maxlen, steplen, perlen], outputs=[input_accordion, out])

    protpardelleDemo.load(None, None, None, _js=load_js)

protpardelleDemo.queue()
protpardelleDemo.launch(allowed_paths=["samples"], share=True)
checkpoints/allatom.yml ADDED
@@ -0,0 +1,69 @@
train:
  home_dir: '/home/duerr/phd/08_Code/protpardelle-final'
  seed: 0
  checkpoint: ['', 0]
  batch_size: 32
  max_epochs: 10000
  eval_freq: 7200 # seconds
  checkpoint_freq: 50
  checkpoints: []
  lr: 0.0001
  warmup_steps: 1000
  decay_steps: 2_000_000
  clip_grad_norm: True
  grad_clip_val: 1.0
  weight_decay: 0.0
  n_eval_samples: 8
  sample_length_range: [50, 512]
  sc_num_seqs: 4
  eval_loss_t: [0.1, 0.3, 0.5, 0.7, 0.9]
  self_cond_train_prob: 0.9
  subsample_eval_set: 0.05
  crop_conditional: False

data:
  pdb_path: 'datasets/ingraham_cath_dataset'
  fixed_size: 512
  n_aatype_tokens: 21
  se3_data_augment: True
  sigma_data: 10.0

diffusion:
  training:
    function: 'lognormal'
    psigma_mean: -1.0
    psigma_std: 1.5
  sampling:
    function: 'uniform'
    s_min: 0.001
    s_max: 80

model:
  task: 'allatom' # 'backbone', 'allatom', 'seqdes', 'codesign'
  pretrained_modules: [] # 'struct_model', 'mpnn_model'
  struct_model_checkpoint: ''
  mpnn_model_checkpoint: ''
  crop_conditional: False
  dummy_fill_masked_atoms: False
  struct_model:
    arch: 'uvit'
    n_atoms: 37
    n_channel: 256
    noise_cond_mult: 4
    uvit:
      patch_size: 1
      n_layers: 6
      n_heads: 8
      dim_head: 32
      n_filt_per_layer: []
      n_blocks_per_layer: 2
      cat_pwd_to_conv: False
      conv_skip_connection: False
      position_embedding_type: 'rotary'
  mpnn_model:
    use_self_conditioning: True
    label_smoothing: 0.1
    n_channel: 128
    n_layers: 3
    n_neighbors: 32
    noise_cond_mult: 4
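
app.py reads these YAML files through utils.load_config, whose source is not part of this commit. As a rough stand-in sketch of how such a nested config could be loaded into attribute-accessible namespaces (an assumption about its behavior, not the project's actual loader):

import yaml
from types import SimpleNamespace

def load_config_sketch(path):
    # Recursively wrap nested dicts so cfg.model.struct_model.n_atoms works.
    def wrap(node):
        if isinstance(node, dict):
            return SimpleNamespace(**{k: wrap(v) for k, v in node.items()})
        return node
    with open(path) as f:
        return wrap(yaml.safe_load(f))

cfg = load_config_sketch("checkpoints/allatom.yml")
print(cfg.model.task, cfg.data.fixed_size)  # allatom 512
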
checkpoints/allatom_state_dict.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c854ce05b3b1b28c45f58ebf6e5cfba5a45b389ea2aa58a6ce25649d90da238f
size 87550006
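
The .pth entries in this commit are Git LFS pointer files like the one above, not the tensors themselves; torch.load on an unfetched pointer fails confusingly. A small guard one might use before loading (illustrative, not part of the commit):

def looks_like_lfs_pointer(path):
    # LFS pointers are tiny text files starting with the spec line shown above.
    with open(path, "rb") as f:
        head = f.read(64)
    return head.startswith(b"version https://git-lfs.github.com/spec/v1")

if looks_like_lfs_pointer("checkpoints/allatom_state_dict.pth"):
    raise RuntimeError("Run `git lfs pull` to fetch the real checkpoint first.")
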
checkpoints/backbone.yml ADDED
@@ -0,0 +1,69 @@
train:
  home_dir: '/home/duerr/phd/08_Code/protpardelle-final'
  seed: 0
  checkpoint: ['', 0]
  batch_size: 32
  max_epochs: 10000
  eval_freq: 7200 # seconds
  checkpoint_freq: 50
  checkpoints: []
  lr: 0.0001
  warmup_steps: 1000
  decay_steps: 2_000_000
  clip_grad_norm: True
  grad_clip_val: 1.0
  weight_decay: 0.0
  n_eval_samples: 8
  sample_length_range: [50, 512]
  sc_num_seqs: 4
  eval_loss_t: [0.1, 0.3, 0.5, 0.7, 0.9]
  self_cond_train_prob: 0.9
  subsample_eval_set: 0.05
  crop_conditional: False

data:
  pdb_path: 'datasets/ingraham_cath_dataset'
  fixed_size: 384
  n_aatype_tokens: 21
  se3_data_augment: True
  sigma_data: 10.0

diffusion:
  training:
    function: 'lognormal'
    psigma_mean: -1.2
    psigma_std: 1.2
  sampling:
    function: 'uniform'
    s_min: 0.001
    s_max: 80

model:
  task: 'backbone' # 'backbone', 'allatom', 'seqdes', 'codesign'
  pretrained_modules: [] # 'struct_model', 'mpnn_model'
  struct_model_checkpoint: ''
  mpnn_model_checkpoint: ''
  crop_conditional: False
  dummy_fill_masked_atoms: False
  struct_model:
    arch: 'uvit'
    n_atoms: 37 # keep same shapes, just zero out sidechains
    n_channel: 256
    noise_cond_mult: 4
    uvit:
      patch_size: 1
      n_layers: 6
      n_heads: 8
      dim_head: 32
      n_filt_per_layer: []
      n_blocks_per_layer: 2
      cat_pwd_to_conv: False
      conv_skip_connection: False
      position_embedding_type: 'absolute_residx'
  mpnn_model:
    use_self_conditioning: True
    label_smoothing: 0.1
    n_channel: 128
    n_layers: 3
    n_neighbors: 32
    noise_cond_mult: 4
checkpoints/backbone_state_dict.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2bcbdcca2419beb8f07cc1d43ee4d8c53d7e4ce21b4a144b88218af00ed3b2b9
size 87548437
checkpoints/minimpnn_state_dict.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:86be202225b3769976ef9bcec75029f4352d670d0107db560eec3d35eeacca9f
size 100570633
configs/allatom.yml ADDED
@@ -0,0 +1,69 @@
train:
  home_dir: '/home/duerr/phd/08_Code/protpardelle-final'
  seed: 0
  checkpoint: ['', 0]
  batch_size: 32
  max_epochs: 10000
  eval_freq: 7200 # seconds
  checkpoint_freq: 50
  checkpoints: []
  lr: 0.0001
  warmup_steps: 1000
  decay_steps: 2_000_000
  clip_grad_norm: True
  grad_clip_val: 1.0
  weight_decay: 0.0
  n_eval_samples: 8
  sample_length_range: [50, 512]
  sc_num_seqs: 4
  eval_loss_t: [0.1, 0.3, 0.5, 0.7, 0.9]
  self_cond_train_prob: 0.9
  subsample_eval_set: 0.05
  crop_conditional: False

data:
  pdb_path: 'datasets/ingraham_cath_dataset'
  fixed_size: 512
  n_aatype_tokens: 21
  se3_data_augment: True
  sigma_data: 10.0

diffusion:
  training:
    function: 'lognormal'
    psigma_mean: -1.0
    psigma_std: 1.5
  sampling:
    function: 'uniform'
    s_min: 0.001
    s_max: 80

model:
  task: 'allatom' # 'backbone', 'allatom', 'seqdes', 'codesign'
  pretrained_modules: [] # 'struct_model', 'mpnn_model'
  struct_model_checkpoint: ''
  mpnn_model_checkpoint: ''
  crop_conditional: False
  dummy_fill_masked_atoms: False
  struct_model:
    arch: 'uvit'
    n_atoms: 37
    n_channel: 256
    noise_cond_mult: 4
    uvit:
      patch_size: 1
      n_layers: 6
      n_heads: 8
      dim_head: 32
      n_filt_per_layer: []
      n_blocks_per_layer: 2
      cat_pwd_to_conv: False
      conv_skip_connection: False
      position_embedding_type: 'rotary'
  mpnn_model:
    use_self_conditioning: True
    label_smoothing: 0.1
    n_channel: 128
    n_layers: 3
    n_neighbors: 32
    noise_cond_mult: 4
configs/backbone.yml ADDED
@@ -0,0 +1,69 @@
train:
  home_dir: '/scratch/users/alexechu'
  seed: 0
  checkpoint: ['', 0]
  batch_size: 32
  max_epochs: 10000
  eval_freq: 7200 # seconds
  checkpoint_freq: 50
  checkpoints: []
  lr: 0.0001
  warmup_steps: 1000
  decay_steps: 2_000_000
  clip_grad_norm: True
  grad_clip_val: 1.0
  weight_decay: 0.0
  n_eval_samples: 8
  sample_length_range: [50, 512]
  sc_num_seqs: 4
  eval_loss_t: [0.1, 0.3, 0.5, 0.7, 0.9]
  self_cond_train_prob: 0.9
  subsample_eval_set: 0.05
  crop_conditional: False

data:
  pdb_path: 'datasets/ingraham_cath_dataset'
  fixed_size: 384
  n_aatype_tokens: 21
  se3_data_augment: True
  sigma_data: 10.0

diffusion:
  training:
    function: 'lognormal'
    psigma_mean: -1.2
    psigma_std: 1.2
  sampling:
    function: 'uniform'
    s_min: 0.001
    s_max: 80

model:
  task: 'backbone' # 'backbone', 'allatom', 'seqdes', 'codesign'
  pretrained_modules: [] # 'struct_model', 'mpnn_model'
  struct_model_checkpoint: ''
  mpnn_model_checkpoint: ''
  crop_conditional: False
  dummy_fill_masked_atoms: False
  struct_model:
    arch: 'uvit'
    n_atoms: 37 # keep same shapes, just zero out sidechains
    n_channel: 256
    noise_cond_mult: 4
    uvit:
      patch_size: 1
      n_layers: 6
      n_heads: 8
      dim_head: 32
      n_filt_per_layer: []
      n_blocks_per_layer: 2
      cat_pwd_to_conv: False
      conv_skip_connection: False
      position_embedding_type: 'absolute_residx'
  mpnn_model:
    use_self_conditioning: True
    label_smoothing: 0.1
    n_channel: 128
    n_layers: 3
    n_neighbors: 32
    noise_cond_mult: 4
configs/seqdes.yml ADDED
@@ -0,0 +1,74 @@
train:
  home_dir: '/scratch/users/alexechu'
  seed: 0
  checkpoint: ['', 0]
  batch_size: 32
  max_epochs: 10000
  eval_freq: 3600 # seconds
  checkpoint_freq: 20
  checkpoints: []
  lr: 0.0001
  warmup_steps: 1000
  decay_steps: 400_000
  clip_grad_norm: True
  grad_clip_val: 1.0
  weight_decay: 0.0
  n_eval_samples: 8
  sample_length_range: [50, 512]
  sc_num_seqs: 4
  eval_loss_t: [0.1, 0.3, 0.5, 0.7, 0.9]
  self_cond_train_prob: 0.9
  dgram_loss_weight: False
  subsample_eval_set: 0.1
  crop_conditional: False

data:
  pdb_path: 'datasets/ingraham_cath_dataset'
  fixed_size: 512
  n_aatype_tokens: 21
  se3_data_augment: True
  sigma_data: 10.0

diffusion:
  training:
    function: 'mpnn'
    psigma_mean: -1.2
    psigma_std: 1.2
    time_power: 30.0
    constant_val: 0.02
  sampling:
    function: 'uniform'
    s_min: 0.001
    s_max: 60

model:
  task: 'seqdes' # 'backbone', 'allatom', 'seqdes', 'codesign'
  pretrained_modules: ['struct_model'] # 'struct_model', 'mpnn_model'
  struct_model_checkpoint: 'protpardelle/checkpoints/allatom_state_dict.pth'
  mpnn_model_checkpoint: ''
  crop_conditional: False
  dummy_fill_masked_atoms: False
  debug_mpnn: True
  struct_model:
    arch: 'uvit'
    n_channel: 256
    n_atoms: 37
    noise_cond_mult: 4
    uvit:
      patch_size: 1
      n_layers: 6
      n_heads: 8
      dim_head: 32
      n_filt_per_layer: [] # None or [] for vanilla trf
      n_blocks_per_layer: 2
      cat_pwd_to_conv: False
      conv_skip_connection: False # n layers must == 1
      position_embedding_type: 'rotary'
  mpnn_model:
    use_self_conditioning: False
    label_smoothing: 0.0
    n_channel: 128
    n_layers: 3
    n_neighbors: 32
    noise_cond_mult: 4
core/__init__.py ADDED
File without changes
core/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (169 Bytes).
core/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (169 Bytes).
core/__pycache__/data.cpython-38.pyc ADDED
Binary file (6.74 kB).
core/__pycache__/data.cpython-39.pyc ADDED
Binary file (6.66 kB).
core/__pycache__/protein.cpython-38.pyc ADDED
Binary file (7.97 kB).
core/__pycache__/protein.cpython-39.pyc ADDED
Binary file (7.94 kB).
core/__pycache__/protein_mpnn.cpython-38.pyc ADDED
Binary file (53.5 kB).
core/__pycache__/protein_mpnn.cpython-39.pyc ADDED
Binary file (53.3 kB).
core/__pycache__/residue_constants.cpython-38.pyc ADDED
Binary file (21.2 kB).
core/__pycache__/residue_constants.cpython-39.pyc ADDED
Binary file (24 kB).
core/__pycache__/utils.cpython-38.pyc ADDED
Binary file (30.3 kB).
core/__pycache__/utils.cpython-39.pyc ADDED
Binary file (30.1 kB).
core/data.py ADDED
@@ -0,0 +1,271 @@
"""
https://github.com/ProteinDesignLab/protpardelle
License: MIT
Author: Alex Chu

Dataloader from PDB files.
"""
import copy
import pickle
import json
import numpy as np
import torch
from einops import rearrange, repeat  # used by get_masked_coords_array and recentering below
from torch.utils import data

from core import utils
from core import protein
from core import residue_constants


FEATURES_1D = (
    "coords_in",
    "torsions_in",
    "b_factors",
    "atom_positions",
    "aatype",
    "atom_mask",
    "residue_index",
    "chain_index",
)
FEATURES_FLOAT = (
    "coords_in",
    "torsions_in",
    "b_factors",
    "atom_positions",
    "atom_mask",
    "seq_mask",
)
FEATURES_LONG = ("aatype", "residue_index", "chain_index", "orig_size")


def make_fixed_size_1d(data, fixed_size=128):
    """Crop (random window) or zero-pad along the first axis to fixed_size."""
    data_len = data.shape[0]
    if data_len >= fixed_size:
        extra_len = data_len - fixed_size
        start_idx = np.random.choice(np.arange(extra_len + 1))
        new_data = data[start_idx : (start_idx + fixed_size)]
        mask = torch.ones(fixed_size)
    if data_len < fixed_size:
        pad_size = fixed_size - data_len
        extra_shape = data.shape[1:]
        new_data = torch.cat([data, torch.zeros(pad_size, *extra_shape)], 0)
        mask = torch.cat([torch.ones(data_len), torch.zeros(pad_size)], 0)
    return new_data, mask

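Both branches in a quick sketch (illustrative; assumes make_fixed_size_1d above):

import torch

x = torch.arange(10).float().unsqueeze(-1)         # (10, 1) toy feature
padded, pad_mask = make_fixed_size_1d(x, fixed_size=16)
print(padded.shape, pad_mask.sum())                # torch.Size([16, 1]) tensor(10.)
cropped, crop_mask = make_fixed_size_1d(x, fixed_size=4)
print(cropped.shape, crop_mask.sum())              # torch.Size([4, 1]) tensor(4.)
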
def apply_random_se3(coords_in, atom_mask=None, translation_scale=1.0):
    # Unbatched. Center on the mean of CA coords, then apply a random rotation
    # and a random translation.
    coords_mean = coords_in[:, 1:2].mean(-3, keepdim=True)
    coords_in -= coords_mean
    random_rot, _ = torch.linalg.qr(torch.randn(3, 3))
    coords_in = coords_in @ random_rot
    random_trans = torch.randn_like(coords_mean) * translation_scale
    coords_in += random_trans
    if atom_mask is not None:
        coords_in = coords_in * atom_mask[..., None]
    return coords_in

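The rotation above comes from a QR factorization of a Gaussian matrix, so it is orthonormal (its determinant may be -1, i.e. a reflection); either way pairwise distances are preserved, which this sketch checks (illustrative; assumes apply_random_se3 above):

import torch

coords = torch.randn(20, 37, 3)             # toy (n_res, n_atoms, 3) coordinates
moved = apply_random_se3(coords.clone(), translation_scale=1.0)
d0 = torch.cdist(coords[:, 1], coords[:, 1])  # CA-CA distances before
d1 = torch.cdist(moved[:, 1], moved[:, 1])    # and after the transform
print(torch.allclose(d0, d1, atol=1e-4))      # True
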
def get_masked_coords_array(coords, atom_mask):
    ma_mask = repeat(1 - atom_mask[..., None].cpu().numpy(), "... 1 -> ... 3")
    return np.ma.array(coords.cpu().numpy(), mask=ma_mask)


def make_crop_cond_mask_and_recenter_coords(
    atom_mask,
    atom_coords,
    contiguous_prob=0.05,
    discontiguous_prob=0.9,
    sidechain_only_prob=0.8,
    max_span_len=10,
    max_discontiguous_res=8,
    dist_threshold=8.0,
    recenter_coords=True,
):
    b, n, a = atom_mask.shape
    device = atom_mask.device
    seq_mask = atom_mask[..., 1]
    n_res = seq_mask.sum(-1)
    masks = []

    for i, nr in enumerate(n_res):
        nr = nr.int().item()
        mask = torch.zeros((n, a), device=device)
        conditioning_type = torch.distributions.Categorical(
            torch.tensor(
                [
                    contiguous_prob,
                    discontiguous_prob,
                    1.0 - contiguous_prob - discontiguous_prob,
                ]
            )
        ).sample()
        conditioning_type = ["contiguous", "discontiguous", "none"][conditioning_type]

        if conditioning_type == "contiguous":
            span_len = torch.randint(1, min(max_span_len, nr), (1,), device=device).item()
            span_start = torch.randint(0, nr - span_len, (1,), device=device)
            mask[span_start : span_start + span_len, :] = 1
        elif conditioning_type == "discontiguous":
            # Extract CB atom coordinates for the i-th example
            cb_atoms = atom_coords[i, :, 3]
            # Pairwise distances between CB atoms
            cb_distances = torch.cdist(cb_atoms, cb_atoms)
            close_mask = cb_distances <= dist_threshold  # mask for selecting close CB atoms

            random_residue = torch.randint(0, nr, (1,), device=device).squeeze()
            cb_dist_i = cb_distances[random_residue] + 1e3 * (1 - seq_mask[i])
            close_mask = cb_dist_i <= dist_threshold
            n_neighbors = close_mask.sum().int()

            # Pick how many neighbors to keep (up to max_discontiguous_res)
            n_sele = torch.randint(
                2,
                n_neighbors.clamp(min=3, max=max_discontiguous_res + 1),
                (1,),
                device=device,
            )

            # Select the indices of CB atoms that are close together
            idxs = torch.arange(n, device=device)[close_mask.bool()]
            idxs = idxs[torch.randperm(len(idxs))[:n_sele]]

            if len(idxs) > 0:
                mask[idxs] = 1

        if np.random.uniform() < sidechain_only_prob:
            mask[:, :5] = 0

        masks.append(mask)

    crop_cond_mask = torch.stack(masks)
    crop_cond_mask = crop_cond_mask * atom_mask
    if recenter_coords:
        motif_masked_array = get_masked_coords_array(atom_coords, crop_cond_mask)
        cond_coords_center = motif_masked_array.mean((1, 2))
        motif_mask = torch.Tensor(1 - cond_coords_center.mask).to(crop_cond_mask)
        means = torch.Tensor(cond_coords_center.data).to(atom_coords) * motif_mask
        coords_out = atom_coords - rearrange(means, "b c -> b 1 1 c")
    else:
        coords_out = atom_coords
    return coords_out, crop_cond_mask


class Dataset(data.Dataset):
    """Loads and processes PDBs into tensors."""

    def __init__(
        self,
        pdb_path,
        fixed_size,
        mode="train",
        overfit=-1,
        short_epoch=False,
        se3_data_augment=True,
    ):
        self.pdb_path = pdb_path
        self.fixed_size = fixed_size
        self.mode = mode
        self.overfit = overfit
        self.short_epoch = short_epoch
        self.se3_data_augment = se3_data_augment

        with open(f"{self.pdb_path}/{mode}_pdb_keys.list") as f:
            self.pdb_keys = np.array(f.read().split("\n")[:-1])

        if overfit > 0:
            n_data = len(self.pdb_keys)
            self.pdb_keys = np.random.choice(
                self.pdb_keys, min(n_data, overfit), replace=False
            ).repeat(n_data // overfit)

    def __len__(self):
        if self.short_epoch:
            return min(len(self.pdb_keys), 256)
        else:
            return len(self.pdb_keys)

    def __getitem__(self, idx):
        pdb_key = self.pdb_keys[idx]
        data = self.get_item(pdb_key)
        # For now, replace dataloading errors with a random pdb. 10 tries
        for _ in range(10):
            if data is not None:
                return data
            pdb_key = self.pdb_keys[np.random.randint(len(self.pdb_keys))]
            data = self.get_item(pdb_key)
        raise Exception("Failed to load data example after 10 tries.")

    def get_item(self, pdb_key):
        example = {}

        if self.pdb_path.endswith("cath_s40_dataset"):  # CATH pdbs
            data_file = f"{self.pdb_path}/dompdb/{pdb_key}"
        elif self.pdb_path.endswith("ingraham_cath_dataset"):  # Ingraham splits
            data_file = f"{self.pdb_path}/pdb_store/{pdb_key}"
        else:
            raise Exception("Invalid pdb path.")

        try:
            example = utils.load_feats_from_pdb(data_file)
            coords_in = example["atom_positions"]
        except FileNotFoundError:
            raise Exception(f"File {pdb_key} not found. Check if dataset is corrupted?")
        except RuntimeError:
            return None

        # Apply data augmentation
        if self.se3_data_augment:
            coords_in = apply_random_se3(coords_in, atom_mask=example["atom_mask"])

        orig_size = coords_in.shape[0]
        example["coords_in"] = coords_in
        example["orig_size"] = torch.ones(1) * orig_size

        fixed_size_example = {}
        seq_mask = None
        for k, v in example.items():
            if k in FEATURES_1D:
                fixed_size_example[k], seq_mask = make_fixed_size_1d(
                    v, fixed_size=self.fixed_size
                )
            else:
                fixed_size_example[k] = v
        if seq_mask is not None:
            fixed_size_example["seq_mask"] = seq_mask

        example_out = {}
        for k, v in fixed_size_example.items():
            if k in FEATURES_FLOAT:
                example_out[k] = v.float()
            elif k in FEATURES_LONG:
                example_out[k] = v.long()

        return example_out

    def collate(self, example_list):
        out = {}
        for ex in example_list:
            for k, v in ex.items():
                out.setdefault(k, []).append(v)
        return {k: torch.stack(v) for k, v in out.items()}

    def sample(self, n=1, return_data=True, return_keys=False):
        keys = self.pdb_keys[torch.randperm(self.__len__())[:n].long()]

        if return_keys and not return_data:
            return keys

        if n == 1:
            data = self.collate([self.get_item(keys)])
        else:
            data = self.collate([self.get_item(key) for key in keys])

        if return_data and return_keys:
            return data, keys
        if return_data and not return_keys:
            return data
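
For context, a sketch of wiring this Dataset into a PyTorch DataLoader with its own collate; the dataset path is illustrative and must end in one of the suffixes checked in get_item:

from torch.utils.data import DataLoader

dataset = Dataset("datasets/ingraham_cath_dataset", fixed_size=384, mode="train")
loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=dataset.collate)
batch = next(iter(loader))
print(batch["coords_in"].shape)  # (32, 384, 37, 3)
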
core/protein.py ADDED
@@ -0,0 +1,341 @@
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Protein data type.
Adapted from original code by alexechu.
"""
import dataclasses
import io
from typing import Any, Mapping, Optional
from core import residue_constants
from Bio.PDB import PDBParser
import numpy as np

FeatureDict = Mapping[str, np.ndarray]
ModelOutput = Mapping[str, Any]  # Is a nested dict.

# Complete sequence of chain IDs supported by the PDB format.
PDB_CHAIN_IDS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
PDB_MAX_CHAINS = len(PDB_CHAIN_IDS)  # := 62.


@dataclasses.dataclass(frozen=True)
class Protein:
    """Protein structure representation."""

    # Cartesian coordinates of atoms in angstroms. The atom types correspond to
    # residue_constants.atom_types, i.e. the first three are N, CA, C.
    atom_positions: np.ndarray  # [num_res, num_atom_type, 3]

    # Amino-acid type for each residue represented as an integer between 0 and
    # 20, where 20 is 'X'.
    aatype: np.ndarray  # [num_res]

    # Binary float mask to indicate presence of a particular atom. 1.0 if an atom
    # is present and 0.0 if not. This should be used for loss masking.
    atom_mask: np.ndarray  # [num_res, num_atom_type]

    # Residue index as used in PDB. It is not necessarily continuous or 0-indexed.
    residue_index: np.ndarray  # [num_res]

    # 0-indexed number corresponding to the chain in the protein that this residue
    # belongs to.
    chain_index: np.ndarray  # [num_res]

    # B-factors, or temperature factors, of each residue (in sq. angstroms units),
    # representing the displacement of the residue from its ground truth mean
    # value.
    b_factors: np.ndarray  # [num_res, num_atom_type]

    def __post_init__(self):
        if len(np.unique(self.chain_index)) > PDB_MAX_CHAINS:
            raise ValueError(
                f"Cannot build an instance with more than {PDB_MAX_CHAINS} chains "
                "because these cannot be written to PDB format."
            )


def from_pdb_string(
    pdb_str: str, chain_id: Optional[str] = None, protein_only: bool = False
) -> Protein:
    """Takes a PDB string and constructs a Protein object.

    WARNING: All non-standard residue types will be converted into UNK. All
    non-standard atoms will be ignored.

    Args:
      pdb_str: The contents of the pdb file
      chain_id: If chain_id is specified (e.g. A), then only that chain
        is parsed. Otherwise all chains are parsed.

    Returns:
      A new `Protein` parsed from the pdb contents.
    """
    pdb_fh = io.StringIO(pdb_str)
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("none", pdb_fh)
    models = list(structure.get_models())
    if len(models) != 1:
        raise ValueError(
            f"Only single model PDBs are supported. Found {len(models)} models."
        )
    model = models[0]

    atom_positions = []
    aatype = []
    atom_mask = []
    residue_index = []
    chain_ids = []
    b_factors = []

    for chain in model:
        if chain_id is not None and chain.id != chain_id:
            continue
        for res in chain:
            if protein_only and res.id[0] != " ":
                continue
            if res.id[2] != " ":
                # Insertion codes are tolerated here rather than rejected.
                pass
                # raise ValueError(
                #     f"PDB contains an insertion code at chain {chain.id} and residue "
                #     f"index {res.id[1]}. These are not supported."
                # )
            res_shortname = residue_constants.restype_3to1.get(res.resname, "X")
            restype_idx = residue_constants.restype_order.get(
                res_shortname, residue_constants.restype_num
            )
            pos = np.zeros((residue_constants.atom_type_num, 3))
            mask = np.zeros((residue_constants.atom_type_num,))
            res_b_factors = np.zeros((residue_constants.atom_type_num,))
            for atom in res:
                if atom.name not in residue_constants.atom_types:
                    continue
                pos[residue_constants.atom_order[atom.name]] = atom.coord
                mask[residue_constants.atom_order[atom.name]] = 1.0
                res_b_factors[residue_constants.atom_order[atom.name]] = atom.bfactor
            if np.sum(mask) < 0.5:
                # If no known atom positions are reported for the residue then skip it.
                continue
            aatype.append(restype_idx)
            atom_positions.append(pos)
            atom_mask.append(mask)
            residue_index.append(res.id[1])
            chain_ids.append(chain.id)
            b_factors.append(res_b_factors)

    # Chain IDs are usually characters so map these to ints.
    unique_chain_ids = np.unique(chain_ids)
    chain_id_mapping = {cid: n for n, cid in enumerate(unique_chain_ids)}
    chain_index = np.array([chain_id_mapping[cid] for cid in chain_ids])

    return Protein(
        atom_positions=np.array(atom_positions),
        atom_mask=np.array(atom_mask),
        aatype=np.array(aatype),
        residue_index=np.array(residue_index),
        chain_index=chain_index,
        b_factors=np.array(b_factors),
    )

def _chain_end(atom_index, end_resname, chain_name, residue_index) -> str:
    chain_end = "TER"
    return (
        f"{chain_end:<6}{atom_index:>5}      {end_resname:>3} "
        f"{chain_name:>1}{residue_index:>4}"
    )


def are_atoms_bonded(res3name, atom1_name, atom2_name):
    lookup_table = residue_constants.standard_residue_bonds
    for bond in lookup_table[res3name]:
        if bond.atom1_name == atom1_name and bond.atom2_name == atom2_name:
            return True
        elif bond.atom1_name == atom2_name and bond.atom2_name == atom1_name:
            return True
    return False


def to_pdb(prot: Protein, conect=False) -> str:
    """Converts a `Protein` instance to a PDB string.

    Args:
      prot: The protein to convert to PDB.

    Returns:
      PDB string.
    """
    restypes = residue_constants.restypes + ["X"]
    res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], "UNK")
    atom_types = residue_constants.atom_types

    pdb_lines = []

    atom_mask = prot.atom_mask
    aatype = prot.aatype
    atom_positions = prot.atom_positions
    residue_index = prot.residue_index.astype(np.int32)
    chain_index = prot.chain_index.astype(np.int32)
    b_factors = prot.b_factors

    if np.any(aatype > residue_constants.restype_num):
        raise ValueError("Invalid aatypes.")

    # Construct a mapping from chain integer indices to chain ID strings.
    chain_ids = {}
    for i in np.unique(chain_index):  # np.unique gives sorted output.
        if i >= PDB_MAX_CHAINS:
            raise ValueError(
                f"The PDB format supports at most {PDB_MAX_CHAINS} chains."
            )
        chain_ids[i] = PDB_CHAIN_IDS[i]

    pdb_lines.append("MODEL     1")
    atom_index = 1
    last_chain_index = chain_index[0]
    conect_lines = []
    # Add all atom sites.
    for i in range(aatype.shape[0]):
        # Close the previous chain if in a multichain PDB.
        if last_chain_index != chain_index[i]:
            pdb_lines.append(
                _chain_end(
                    atom_index,
                    res_1to3(aatype[i - 1]),
                    chain_ids[chain_index[i - 1]],
                    residue_index[i - 1],
                )
            )
            last_chain_index = chain_index[i]
            atom_index += 1  # Atom index increases at the TER symbol.

        res_name_3 = res_1to3(aatype[i])
        atoms_appended_for_res = []
        for atom_name, pos, mask, b_factor in zip(
            atom_types, atom_positions[i], atom_mask[i], b_factors[i]
        ):
            if mask < 0.5:
                continue

            record_type = "ATOM"
            name = atom_name if len(atom_name) == 4 else f" {atom_name}"
            alt_loc = ""
            insertion_code = ""
            occupancy = 1.00
            element = atom_name[0]  # Protein supports only C, N, O, S, this works.
            charge = ""
            # PDB is a columnar format, every space matters here!
            atom_line = (
                f"{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}"
                f"{res_name_3:>3} {chain_ids[chain_index[i]]:>1}"
                f"{residue_index[i]:>4}{insertion_code:>1}   "
                f"{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}"
                f"{occupancy:>6.2f}{b_factor:>6.2f}          "
                f"{element:>2}{charge:>2}"
            )
            pdb_lines.append(atom_line)

            # Record intra-residue bonds for optional CONECT output.
            for prev_atom_idx, prev_atom in atoms_appended_for_res:
                if are_atoms_bonded(res_name_3, atom_name, prev_atom):
                    conect_line = f"CONECT{prev_atom_idx:5d}{atom_index:5d}\n"
                    conect_lines.append(conect_line)
            atoms_appended_for_res.append((atom_index, atom_name))
            # Track backbone N and C serials for the inter-residue peptide bond.
            if atom_name == "N":
                n_atom_idx = atom_index
            if atom_name == "C":
                c_atom_idx = atom_index

            atom_index += 1

        if i > 0:
            conect_line = f"CONECT{prev_c_atom_idx:5d}{n_atom_idx:5d}\n"
            conect_lines.append(conect_line)
        prev_c_atom_idx = c_atom_idx

    # Close the final chain.
    pdb_lines.append(
        _chain_end(
            atom_index,
            res_1to3(aatype[-1]),
            chain_ids[chain_index[-1]],
            residue_index[-1],
        )
    )
    pdb_lines.append("ENDMDL")
    pdb_lines.append("END")

    # Pad all lines to 80 characters.
    pdb_lines = [line.ljust(80) for line in pdb_lines]
    pdb_str = "\n".join(pdb_lines) + "\n"  # Add terminating newline.
    if conect:
        conect_str = "".join(conect_lines) + "\n"
        return pdb_str, conect_str
    return pdb_str

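A round-trip sketch for the two converters above (file names are illustrative; assumes the functions are importable from core.protein):

with open("design.pdb") as f:
    prot = from_pdb_string(f.read(), protein_only=True)
print(prot.atom_positions.shape)  # (num_res, 37, 3)

pdb_str, conect_str = to_pdb(prot, conect=True)
with open("design_rt.pdb", "w") as f:
    f.write(pdb_str)
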
def ideal_atom_mask(prot: Protein) -> np.ndarray:
    """Computes an ideal atom mask.

    `Protein.atom_mask` typically is defined according to the atoms that are
    reported in the PDB. This function computes a mask according to heavy atoms
    that should be present in the given sequence of amino acids.

    Args:
      prot: `Protein` whose fields are `numpy.ndarray` objects.

    Returns:
      An ideal atom mask.
    """
    return residue_constants.STANDARD_ATOM_MASK[prot.aatype]


def from_prediction(
    features: FeatureDict,
    result: ModelOutput,
    b_factors: Optional[np.ndarray] = None,
    remove_leading_feature_dimension: bool = True,
) -> Protein:
    """Assembles a protein from a prediction.

    Args:
      features: Dictionary holding model inputs.
      result: Dictionary holding model outputs.
      b_factors: (Optional) B-factors to use for the protein.
      remove_leading_feature_dimension: Whether to remove the leading dimension
        of the `features` values.

    Returns:
      A protein instance.
    """
    fold_output = result["structure_module"]

    def _maybe_remove_leading_dim(arr: np.ndarray) -> np.ndarray:
        return arr[0] if remove_leading_feature_dimension else arr

    if "asym_id" in features:
        chain_index = _maybe_remove_leading_dim(features["asym_id"])
    else:
        chain_index = np.zeros_like(_maybe_remove_leading_dim(features["aatype"]))

    if b_factors is None:
        b_factors = np.zeros_like(fold_output["final_atom_mask"])

    return Protein(
        aatype=_maybe_remove_leading_dim(features["aatype"]),
        atom_positions=fold_output["final_atom_positions"],
        atom_mask=fold_output["final_atom_mask"],
        residue_index=_maybe_remove_leading_dim(features["residue_index"]) + 1,
        chain_index=chain_index,
        b_factors=b_factors,
    )
core/protein_mpnn.py ADDED
@@ -0,0 +1,1886 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Justas Dauparas
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ '''
24
+ Adapted from original code by alexechu.
25
+ '''
26
+ import json, time, os, sys, glob
27
+ import shutil
28
+ import warnings
29
+ import copy
30
+ import random
31
+ import os.path
32
+ import subprocess
33
+ import itertools
34
+
35
+ from einops.layers.torch import Rearrange
36
+ import numpy as np
37
+ import torch
38
+ from torch import optim
39
+ from torch.utils.data import DataLoader
40
+ from torch.utils.data.dataset import random_split, Subset
41
+ import torch.nn as nn
42
+ import torch.nn.functional as F
43
+
44
+
45
+ def get_mpnn_model(model_name='v_48_020', path_to_model_weights='', ca_only=False, backbone_noise=0.0, verbose=False, device=None):
46
+ hidden_dim = 128
47
+ num_layers = 3
48
+ if device is None:
49
+ device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")
50
+
51
+ if path_to_model_weights:
52
+ model_folder_path = path_to_model_weights
53
+ if model_folder_path[-1] != '/':
54
+ model_folder_path = model_folder_path + '/'
55
+ else:
56
+ file_path = os.path.realpath(__file__)
57
+ k = file_path.rfind("/")
58
+ if ca_only:
59
+ model_folder_path = file_path[:k] + '/ca_model_weights/'
60
+ else:
61
+ model_folder_path = file_path[:k] + '/vanilla_model_weights/'
62
+
63
+ checkpoint_path = model_folder_path + f'{model_name}.pt'
64
+ checkpoint = torch.load(checkpoint_path, map_location=device)
65
+ noise_level_print = checkpoint['noise_level']
66
+ model = ProteinMPNN(ca_only=ca_only, num_letters=21, node_features=hidden_dim, edge_features=hidden_dim, hidden_dim=hidden_dim,
67
+ num_encoder_layers=num_layers, num_decoder_layers=num_layers, augment_eps=backbone_noise, k_neighbors=checkpoint['num_edges'])
68
+ model.to(device)
69
+ model.load_state_dict(checkpoint['model_state_dict'])
70
+ model.eval()
71
+
72
+ if verbose:
73
+ print(40*'-')
74
+ print('Model loaded...')
75
+ print('Number of edges:', checkpoint['num_edges'])
76
+ print(f'Training noise level: {noise_level_print}A')
77
+
78
+ return model
79
+
80
+
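A minimal usage sketch, assuming the weight folders the path logic above expects (vanilla_model_weights/ or ca_model_weights/) sit next to this file:

from core.protein_mpnn import get_mpnn_model  # assumed import path for this module

model = get_mpnn_model(model_name='v_48_020', backbone_noise=0.0, verbose=True)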
81
+ def run_proteinmpnn(model=None, pdb_path='', pdb_path_chains='', path_to_model_weights='', model_name='v_48_020', seed=0, ca_only=False, out_folder='', num_seq_per_target=1, batch_size=1, sampling_temps=[0.1], backbone_noise=0.0, max_length=200000, omit_AAs=[], print_all=False,
82
+ chain_id_jsonl='', fixed_positions_jsonl='', pssm_jsonl='', omit_AA_jsonl='', bias_AA_jsonl='', tied_positions_jsonl='', bias_by_res_jsonl='', jsonl_path='',
83
+ pssm_threshold=0.0, pssm_multi=0.0, pssm_log_odds_flag=False, pssm_bias_flag=False, write_output_files=False):
84
+
85
+ if model is None:
86
+ model = get_mpnn_model(model_name=model_name, path_to_model_weights=path_to_model_weights, ca_only=ca_only, backbone_noise=backbone_noise, verbose=print_all)
87
+
88
+ if seed:
89
+ pass # keep the caller-provided seed
90
+ else:
91
+ seed=int(np.random.randint(0, high=999, size=1, dtype=int)[0])
92
+
93
+ torch.manual_seed(seed)
94
+ random.seed(seed)
95
+ np.random.seed(seed)
96
+
97
+
98
+
99
+ NUM_BATCHES = num_seq_per_target//batch_size
100
+ BATCH_COPIES = batch_size
101
+ temperatures = sampling_temps
102
+ omit_AAs_list = omit_AAs
103
+ alphabet = 'ACDEFGHIKLMNPQRSTVWYX'
104
+ alphabet_dict = dict(zip(alphabet, range(21)))
105
+ omit_AAs_np = np.array([AA in omit_AAs_list for AA in alphabet]).astype(np.float32)
106
+ device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")
107
+ if os.path.isfile(chain_id_jsonl):
108
+ with open(chain_id_jsonl, 'r') as json_file:
109
+ json_list = list(json_file)
110
+ for json_str in json_list:
111
+ chain_id_dict = json.loads(json_str)
112
+ else:
113
+ chain_id_dict = None
114
+ if print_all:
115
+ print(40*'-')
116
+ print('chain_id_jsonl is NOT loaded')
117
+
118
+ if os.path.isfile(fixed_positions_jsonl):
119
+ with open(fixed_positions_jsonl, 'r') as json_file:
120
+ json_list = list(json_file)
121
+ for json_str in json_list:
122
+ fixed_positions_dict = json.loads(json_str)
123
+ else:
124
+ if print_all:
125
+ print(40*'-')
126
+ print('fixed_positions_jsonl is NOT loaded')
127
+ fixed_positions_dict = None
128
+
129
+
130
+ if os.path.isfile(pssm_jsonl):
131
+ with open(pssm_jsonl, 'r') as json_file:
132
+ json_list = list(json_file)
133
+ pssm_dict = {}
134
+ for json_str in json_list:
135
+ pssm_dict.update(json.loads(json_str))
136
+ else:
137
+ if print_all:
138
+ print(40*'-')
139
+ print('pssm_jsonl is NOT loaded')
140
+ pssm_dict = None
141
+
142
+
143
+ if os.path.isfile(omit_AA_jsonl):
144
+ with open(omit_AA_jsonl, 'r') as json_file:
145
+ json_list = list(json_file)
146
+ for json_str in json_list:
147
+ omit_AA_dict = json.loads(json_str)
148
+ else:
149
+ if print_all:
150
+ print(40*'-')
151
+ print('omit_AA_jsonl is NOT loaded')
152
+ omit_AA_dict = None
153
+
154
+
155
+ if os.path.isfile(bias_AA_jsonl):
156
+ with open(bias_AA_jsonl, 'r') as json_file:
157
+ json_list = list(json_file)
158
+ for json_str in json_list:
159
+ bias_AA_dict = json.loads(json_str)
160
+ else:
161
+ if print_all:
162
+ print(40*'-')
163
+ print('bias_AA_jsonl is NOT loaded')
164
+ bias_AA_dict = None
165
+
166
+
167
+ if os.path.isfile(tied_positions_jsonl):
168
+ with open(tied_positions_jsonl, 'r') as json_file:
169
+ json_list = list(json_file)
170
+ for json_str in json_list:
171
+ tied_positions_dict = json.loads(json_str)
172
+ else:
173
+ if print_all:
174
+ print(40*'-')
175
+ print('tied_positions_jsonl is NOT loaded')
176
+ tied_positions_dict = None
177
+
178
+
179
+ if os.path.isfile(bias_by_res_jsonl):
180
+ with open(bias_by_res_jsonl, 'r') as json_file:
181
+ json_list = list(json_file)
182
+
183
+ for json_str in json_list:
184
+ bias_by_res_dict = json.loads(json_str)
185
+ if print_all:
186
+ print('bias by residue dictionary is loaded')
187
+ else:
188
+ if print_all:
189
+ print(40*'-')
190
+ print('bias by residue dictionary is not loaded, or not provided')
191
+ bias_by_res_dict = None
192
+
193
+
194
+ if print_all:
195
+ print(40*'-')
196
+ bias_AAs_np = np.zeros(len(alphabet))
197
+ if bias_AA_dict:
198
+ for n, AA in enumerate(alphabet):
199
+ if AA in list(bias_AA_dict.keys()):
200
+ bias_AAs_np[n] = bias_AA_dict[AA]
201
+
202
+ if pdb_path:
203
+ pdb_dict_list = parse_PDB(pdb_path, ca_only=ca_only)
204
+ dataset_valid = StructureDatasetPDB(pdb_dict_list, truncate=None, max_length=max_length)
205
+ all_chain_list = [item[-1:] for item in list(pdb_dict_list[0]) if item[:9]=='seq_chain'] #['A','B', 'C',...]
206
+ if pdb_path_chains:
207
+ designed_chain_list = [str(item) for item in pdb_path_chains.split()]
208
+ else:
209
+ designed_chain_list = all_chain_list
210
+ fixed_chain_list = [letter for letter in all_chain_list if letter not in designed_chain_list]
211
+ chain_id_dict = {}
212
+ chain_id_dict[pdb_dict_list[0]['name']]= (designed_chain_list, fixed_chain_list)
213
+ else:
214
+ dataset_valid = StructureDataset(jsonl_path, truncate=None, max_length=max_length, verbose=print_all)
215
+
216
+ # Build paths for experiment
217
+ if write_output_files:
218
+ folder_for_outputs = out_folder
219
+ base_folder = folder_for_outputs
220
+ if base_folder[-1] != '/':
221
+ base_folder = base_folder + '/'
222
+ if not os.path.exists(base_folder):
223
+ os.makedirs(base_folder)
224
+ if not os.path.exists(base_folder + 'seqs'):
225
+ os.makedirs(base_folder + 'seqs')
226
+
227
+ # if args.save_score:
228
+ # if not os.path.exists(base_folder + 'scores'):
229
+ # os.makedirs(base_folder + 'scores')
230
+
231
+ # if args.score_only:
232
+ # if not os.path.exists(base_folder + 'score_only'):
233
+ # os.makedirs(base_folder + 'score_only')
234
+
235
+
236
+ # if args.conditional_probs_only:
237
+ # if not os.path.exists(base_folder + 'conditional_probs_only'):
238
+ # os.makedirs(base_folder + 'conditional_probs_only')
239
+
240
+ # if args.unconditional_probs_only:
241
+ # if not os.path.exists(base_folder + 'unconditional_probs_only'):
242
+ # os.makedirs(base_folder + 'unconditional_probs_only')
243
+
244
+ # if args.save_probs:
245
+ # if not os.path.exists(base_folder + 'probs'):
246
+ # os.makedirs(base_folder + 'probs')
247
+
248
+ # Timing
249
+ start_time = time.time()
250
+ total_residues = 0
251
+ protein_list = []
252
+ total_step = 0
253
+ # Validation epoch
254
+ new_mpnn_seqs = []
255
+ with torch.no_grad():
256
+ test_sum, test_weights = 0., 0.
257
+ for ix, protein in enumerate(dataset_valid):
258
+ score_list = []
259
+ global_score_list = []
260
+ all_probs_list = []
261
+ all_log_probs_list = []
262
+ S_sample_list = []
263
+ batch_clones = [copy.deepcopy(protein) for i in range(BATCH_COPIES)]
264
+ X, S, mask, lengths, chain_M, chain_encoding_all, chain_list_list, visible_list_list, masked_list_list, masked_chain_length_list_list, chain_M_pos, omit_AA_mask, residue_idx, dihedral_mask, tied_pos_list_of_lists_list, pssm_coef, pssm_bias, pssm_log_odds_all, bias_by_res_all, tied_beta = tied_featurize(batch_clones, device, chain_id_dict, fixed_positions_dict, omit_AA_dict, tied_positions_dict, pssm_dict, bias_by_res_dict, ca_only=ca_only)
265
+ pssm_log_odds_mask = (pssm_log_odds_all > pssm_threshold).float() #1.0 for true, 0.0 for false
266
+ name_ = batch_clones[0]['name']
267
+ if False:
268
+ pass
269
+ # if args.score_only:
270
+ # loop_c = 0
271
+ # if args.path_to_fasta:
272
+ # fasta_names, fasta_seqs = parse_fasta(args.path_to_fasta, omit=["/"])
273
+ # loop_c = len(fasta_seqs)
274
+ # for fc in range(1+loop_c):
275
+ # if fc == 0:
276
+ # structure_sequence_score_file = base_folder + '/score_only/' + batch_clones[0]['name'] + f'_pdb'
277
+ # else:
278
+ # structure_sequence_score_file = base_folder + '/score_only/' + batch_clones[0]['name'] + f'_fasta_{fc}'
279
+ # native_score_list = []
280
+ # global_native_score_list = []
281
+ # if fc > 0:
282
+ # input_seq_length = len(fasta_seqs[fc-1])
283
+ # S_input = torch.tensor([alphabet_dict[AA] for AA in fasta_seqs[fc-1]], device=device)[None,:].repeat(X.shape[0], 1)
284
+ # S[:,:input_seq_length] = S_input #assumes that S and S_input are alphabetically sorted for masked_chains
285
+ # for j in range(NUM_BATCHES):
286
+ # randn_1 = torch.randn(chain_M.shape, device=X.device)
287
+ # log_probs = model(X, S, mask, chain_M*chain_M_pos, residue_idx, chain_encoding_all, randn_1)
288
+ # mask_for_loss = mask*chain_M*chain_M_pos
289
+ # scores = _scores(S, log_probs, mask_for_loss)
290
+ # native_score = scores.cpu().data.numpy()
291
+ # native_score_list.append(native_score)
292
+ # global_scores = _scores(S, log_probs, mask)
293
+ # global_native_score = global_scores.cpu().data.numpy()
294
+ # global_native_score_list.append(global_native_score)
295
+ # native_score = np.concatenate(native_score_list, 0)
296
+ # global_native_score = np.concatenate(global_native_score_list, 0)
297
+ # ns_mean = native_score.mean()
298
+ # ns_mean_print = np.format_float_positional(np.float32(ns_mean), unique=False, precision=4)
299
+ # ns_std = native_score.std()
300
+ # ns_std_print = np.format_float_positional(np.float32(ns_std), unique=False, precision=4)
301
+
302
+ # global_ns_mean = global_native_score.mean()
303
+ # global_ns_mean_print = np.format_float_positional(np.float32(global_ns_mean), unique=False, precision=4)
304
+ # global_ns_std = global_native_score.std()
305
+ # global_ns_std_print = np.format_float_positional(np.float32(global_ns_std), unique=False, precision=4)
306
+
307
+ # ns_sample_size = native_score.shape[0]
308
+ # seq_str = _S_to_seq(S[0,], chain_M[0,])
309
+ # np.savez(structure_sequence_score_file, score=native_score, global_score=global_native_score, S=S[0,].cpu().numpy(), seq_str=seq_str)
310
+ # if print_all:
311
+ # if fc == 0:
312
+ # print(f'Score for {name_} from PDB, mean: {ns_mean_print}, std: {ns_std_print}, sample size: {ns_sample_size}, global score, mean: {global_ns_mean_print}, std: {global_ns_std_print}, sample size: {ns_sample_size}')
313
+ # else:
314
+ # print(f'Score for {name_}_{fc} from FASTA, mean: {ns_mean_print}, std: {ns_std_print}, sample size: {ns_sample_size}, global score, mean: {global_ns_mean_print}, std: {global_ns_std_print}, sample size: {ns_sample_size}')
315
+ # elif args.conditional_probs_only:
316
+ # if print_all:
317
+ # print(f'Calculating conditional probabilities for {name_}')
318
+ # conditional_probs_only_file = base_folder + '/conditional_probs_only/' + batch_clones[0]['name']
319
+ # log_conditional_probs_list = []
320
+ # for j in range(NUM_BATCHES):
321
+ # randn_1 = torch.randn(chain_M.shape, device=X.device)
322
+ # log_conditional_probs = model.conditional_probs(X, S, mask, chain_M*chain_M_pos, residue_idx, chain_encoding_all, randn_1, args.conditional_probs_only_backbone)
323
+ # log_conditional_probs_list.append(log_conditional_probs.cpu().numpy())
324
+ # concat_log_p = np.concatenate(log_conditional_probs_list, 0) #[B, L, 21]
325
+ # mask_out = (chain_M*chain_M_pos*mask)[0,].cpu().numpy()
326
+ # np.savez(conditional_probs_only_file, log_p=concat_log_p, S=S[0,].cpu().numpy(), mask=mask[0,].cpu().numpy(), design_mask=mask_out)
327
+ # elif args.unconditional_probs_only:
328
+ # if print_all:
329
+ # print(f'Calculating sequence unconditional probabilities for {name_}')
330
+ # unconditional_probs_only_file = base_folder + '/unconditional_probs_only/' + batch_clones[0]['name']
331
+ # log_unconditional_probs_list = []
332
+ # for j in range(NUM_BATCHES):
333
+ # log_unconditional_probs = model.unconditional_probs(X, mask, residue_idx, chain_encoding_all)
334
+ # log_unconditional_probs_list.append(log_unconditional_probs.cpu().numpy())
335
+ # concat_log_p = np.concatenate(log_unconditional_probs_list, 0) #[B, L, 21]
336
+ # mask_out = (chain_M*chain_M_pos*mask)[0,].cpu().numpy()
337
+ # np.savez(unconditional_probs_only_file, log_p=concat_log_p, S=S[0,].cpu().numpy(), mask=mask[0,].cpu().numpy(), design_mask=mask_out)
338
+ else:
339
+ randn_1 = torch.randn(chain_M.shape, device=X.device)
340
+ log_probs = model(X, S, mask, chain_M*chain_M_pos, residue_idx, chain_encoding_all, randn_1)
341
+ mask_for_loss = mask*chain_M*chain_M_pos
342
+ scores = _scores(S, log_probs, mask_for_loss) #score only the redesigned part
343
+ native_score = scores.cpu().data.numpy()
344
+ global_scores = _scores(S, log_probs, mask) #score the whole structure-sequence
345
+ global_native_score = global_scores.cpu().data.numpy()
346
+ # Generate some sequences
347
+ if write_output_files:
348
+ ali_file = base_folder + '/seqs/' + batch_clones[0]['name'] + '.fa'
349
+ score_file = base_folder + '/scores/' + batch_clones[0]['name'] + '.npz'
350
+ probs_file = base_folder + '/probs/' + batch_clones[0]['name'] + '.npz'
351
+ f = open(ali_file, 'w')
352
+ if print_all:
353
+ print(f'Generating sequences for: {name_}')
354
+ t0 = time.time()
355
+ for temp in temperatures:
356
+ for j in range(NUM_BATCHES):
357
+ randn_2 = torch.randn(chain_M.shape, device=X.device)
358
+ if tied_positions_dict == None:
359
+ sample_dict = model.sample(X, randn_2, S, chain_M, chain_encoding_all, residue_idx, mask=mask, temperature=temp, omit_AAs_np=omit_AAs_np, bias_AAs_np=bias_AAs_np, chain_M_pos=chain_M_pos, omit_AA_mask=omit_AA_mask, pssm_coef=pssm_coef, pssm_bias=pssm_bias, pssm_multi=pssm_multi, pssm_log_odds_flag=bool(pssm_log_odds_flag), pssm_log_odds_mask=pssm_log_odds_mask, pssm_bias_flag=bool(pssm_bias_flag), bias_by_res=bias_by_res_all)
360
+ S_sample = sample_dict["S"]
361
+ else:
362
+ sample_dict = model.tied_sample(X, randn_2, S, chain_M, chain_encoding_all, residue_idx, mask=mask, temperature=temp, omit_AAs_np=omit_AAs_np, bias_AAs_np=bias_AAs_np, chain_M_pos=chain_M_pos, omit_AA_mask=omit_AA_mask, pssm_coef=pssm_coef, pssm_bias=pssm_bias, pssm_multi=pssm_multi, pssm_log_odds_flag=bool(pssm_log_odds_flag), pssm_log_odds_mask=pssm_log_odds_mask, pssm_bias_flag=bool(pssm_bias_flag), tied_pos=tied_pos_list_of_lists_list[0], tied_beta=tied_beta, bias_by_res=bias_by_res_all)
363
+ # Compute scores
364
+ S_sample = sample_dict["S"]
365
+ log_probs = model(X, S_sample, mask, chain_M*chain_M_pos, residue_idx, chain_encoding_all, randn_2, use_input_decoding_order=True, decoding_order=sample_dict["decoding_order"])
366
+ mask_for_loss = mask*chain_M*chain_M_pos
367
+ scores = _scores(S_sample, log_probs, mask_for_loss)
368
+ scores = scores.cpu().data.numpy()
369
+
370
+ global_scores = _scores(S_sample, log_probs, mask) #score the whole structure-sequence
371
+ global_scores = global_scores.cpu().data.numpy()
372
+
373
+ all_probs_list.append(sample_dict["probs"].cpu().data.numpy())
374
+ all_log_probs_list.append(log_probs.cpu().data.numpy())
375
+ S_sample_list.append(S_sample.cpu().data.numpy())
376
+ for b_ix in range(BATCH_COPIES):
377
+ masked_chain_length_list = masked_chain_length_list_list[b_ix]
378
+ masked_list = masked_list_list[b_ix]
379
+ seq_recovery_rate = torch.sum(torch.sum(torch.nn.functional.one_hot(S[b_ix], 21)*torch.nn.functional.one_hot(S_sample[b_ix], 21),axis=-1)*mask_for_loss[b_ix])/torch.sum(mask_for_loss[b_ix])
380
+ seq = _S_to_seq(S_sample[b_ix], chain_M[b_ix])
381
+ new_mpnn_seqs.append(seq)
382
+ score = scores[b_ix]
383
+ score_list.append(score)
384
+ global_score = global_scores[b_ix]
385
+ global_score_list.append(global_score)
386
+ native_seq = _S_to_seq(S[b_ix], chain_M[b_ix])
387
+ if b_ix == 0 and j==0 and temp==temperatures[0]:
388
+ start = 0
389
+ end = 0
390
+ list_of_AAs = []
391
+ for mask_l in masked_chain_length_list:
392
+ end += mask_l
393
+ list_of_AAs.append(native_seq[start:end])
394
+ start = end
395
+ native_seq = "".join(list(np.array(list_of_AAs)[np.argsort(masked_list)]))
396
+ l0 = 0
397
+ for mc_length in list(np.array(masked_chain_length_list)[np.argsort(masked_list)])[:-1]:
398
+ l0 += mc_length
399
+ native_seq = native_seq[:l0] + '/' + native_seq[l0:]
400
+ l0 += 1
401
+ sorted_masked_chain_letters = np.argsort(masked_list_list[0])
402
+ print_masked_chains = [masked_list_list[0][i] for i in sorted_masked_chain_letters]
403
+ sorted_visible_chain_letters = np.argsort(visible_list_list[0])
404
+ print_visible_chains = [visible_list_list[0][i] for i in sorted_visible_chain_letters]
405
+ native_score_print = np.format_float_positional(np.float32(native_score.mean()), unique=False, precision=4)
406
+ global_native_score_print = np.format_float_positional(np.float32(global_native_score.mean()), unique=False, precision=4)
407
+ script_dir = os.path.dirname(os.path.realpath(__file__))
408
+ try:
409
+ commit_str = subprocess.check_output(f'git --git-dir {script_dir}/.git rev-parse HEAD', shell=True, stderr=subprocess.DEVNULL).decode().strip()
410
+ except subprocess.CalledProcessError:
411
+ commit_str = 'unknown'
412
+ if ca_only:
413
+ print_model_name = 'CA_model_name'
414
+ else:
415
+ print_model_name = 'model_name'
416
+ if write_output_files:
417
+ f.write('>{}, score={}, global_score={}, fixed_chains={}, designed_chains={}, {}={}, git_hash={}, seed={}\n{}\n'.format(name_, native_score_print, global_native_score_print, print_visible_chains, print_masked_chains, print_model_name, model_name, commit_str, seed, native_seq)) #write the native sequence
418
+ start = 0
419
+ end = 0
420
+ list_of_AAs = []
421
+ for mask_l in masked_chain_length_list:
422
+ end += mask_l
423
+ list_of_AAs.append(seq[start:end])
424
+ start = end
425
+
426
+ seq = "".join(list(np.array(list_of_AAs)[np.argsort(masked_list)]))
427
+ l0 = 0
428
+ for mc_length in list(np.array(masked_chain_length_list)[np.argsort(masked_list)])[:-1]:
429
+ l0 += mc_length
430
+ seq = seq[:l0] + '/' + seq[l0:]
431
+ l0 += 1
432
+ score_print = np.format_float_positional(np.float32(score), unique=False, precision=4)
433
+ global_score_print = np.format_float_positional(np.float32(global_score), unique=False, precision=4)
434
+ seq_rec_print = np.format_float_positional(np.float32(seq_recovery_rate.detach().cpu().numpy()), unique=False, precision=4)
435
+ sample_number = j*BATCH_COPIES+b_ix+1
436
+ if write_output_files:
437
+ f.write('>T={}, sample={}, score={}, global_score={}, seq_recovery={}\n{}\n'.format(temp,sample_number,score_print,global_score_print,seq_rec_print,seq)) #write generated sequence
438
+ # if args.save_score:
439
+ # np.savez(score_file, score=np.array(score_list, np.float32), global_score=np.array(global_score_list, np.float32))
440
+ # if args.save_probs:
441
+ # all_probs_concat = np.concatenate(all_probs_list)
442
+ # all_log_probs_concat = np.concatenate(all_log_probs_list)
443
+ # S_sample_concat = np.concatenate(S_sample_list)
444
+ # np.savez(probs_file, probs=np.array(all_probs_concat, np.float32), log_probs=np.array(all_log_probs_concat, np.float32), S=np.array(S_sample_concat, np.int32), mask=mask_for_loss.cpu().data.numpy(), chain_order=chain_list_list)
445
+ t1 = time.time()
446
+ dt = round(float(t1-t0), 4)
447
+ num_seqs = len(temperatures)*NUM_BATCHES*BATCH_COPIES
448
+ total_length = X.shape[1]
449
+ if print_all:
450
+ print(f'{num_seqs} sequences of length {total_length} generated in {dt} seconds')
451
+ if write_output_files:
452
+ f.close()
453
+
454
+ return new_mpnn_seqs
455
+
456
+
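A hedged end-to-end sketch: design four sequences for chain A of a local structure. 'input.pdb' is a placeholder path, and with write_output_files left at False nothing is written to disk; the designed sequences are returned directly:

from core.protein_mpnn import run_proteinmpnn  # assumed import path

seqs = run_proteinmpnn(
    pdb_path='input.pdb',
    pdb_path_chains='A',
    num_seq_per_target=4,
    batch_size=1,
    sampling_temps=[0.1],
)
print(seqs)  # list of designed amino-acid strings (designable positions only)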
457
+ def parse_fasta(filename, limit=-1, omit=()): # tuple default avoids a shared mutable argument
458
+ header = []
459
+ sequence = []
460
+ lines = open(filename, "r")
461
+ for line in lines:
462
+ line = line.rstrip()
463
+ if line[0] == ">":
464
+ if len(header) == limit:
465
+ break
466
+ header.append(line[1:])
467
+ sequence.append([])
468
+ else:
469
+ if omit:
470
+ line = [item for item in line if item not in omit]
471
+ line = ''.join(line)
472
+ line = ''.join(line)
473
+ sequence[-1].append(line)
474
+ lines.close()
475
+ sequence = [''.join(seq) for seq in sequence]
476
+ return np.array(header), np.array(sequence)
477
+
478
+ def _scores(S, log_probs, mask):
479
+ """ Negative log probabilities """
480
+ criterion = torch.nn.NLLLoss(reduction='none')
481
+ loss = criterion(
482
+ log_probs.contiguous().view(-1,log_probs.size(-1)),
483
+ S.contiguous().view(-1)
484
+ ).view(S.size())
485
+ scores = torch.sum(loss * mask, dim=-1) / torch.sum(mask, dim=-1)
486
+ return scores
487
+
488
+ def _S_to_seq(S, mask):
489
+ alphabet = 'ACDEFGHIKLMNPQRSTVWYX'
490
+ seq = ''.join([alphabet[c] for c, m in zip(S.tolist(), mask.tolist()) if m > 0])
491
+ return seq
492
+
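A small self-contained check of the two helpers above: score a toy prediction and decode integer labels back into a sequence string:

import torch

S = torch.tensor([[0, 2, 4]])                            # 'A', 'D', 'F' in the alphabet above
log_probs = torch.log(torch.full((1, 3, 21), 1.0 / 21))  # a uniform predictor
mask = torch.ones(1, 3)
print(_scores(S, log_probs, mask))  # tensor([3.0445]) == log(21)
print(_S_to_seq(S[0], mask[0]))     # 'ADF'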
493
+ def parse_PDB_biounits(x, atoms=['N','CA','C'], chain=None):
494
+ '''
495
+ input: x = PDB filename
496
+ atoms = atoms to extract (optional)
497
+ output: (length, atoms, coords=(x,y,z)), sequence
498
+ '''
499
+
500
+ alpha_1 = list("ARNDCQEGHILKMFPSTWYV-")
501
+ states = len(alpha_1)
502
+ alpha_3 = ['ALA','ARG','ASN','ASP','CYS','GLN','GLU','GLY','HIS','ILE',
503
+ 'LEU','LYS','MET','PHE','PRO','SER','THR','TRP','TYR','VAL','GAP']
504
+
505
+ aa_1_N = {a:n for n,a in enumerate(alpha_1)}
506
+ aa_3_N = {a:n for n,a in enumerate(alpha_3)}
507
+ aa_N_1 = {n:a for n,a in enumerate(alpha_1)}
508
+ aa_1_3 = {a:b for a,b in zip(alpha_1,alpha_3)}
509
+ aa_3_1 = {b:a for a,b in zip(alpha_1,alpha_3)}
510
+
511
+ def AA_to_N(x):
512
+ # ["ARND"] -> [[0,1,2,3]]
513
+ x = np.array(x);
514
+ if x.ndim == 0: x = x[None]
515
+ return [[aa_1_N.get(a, states-1) for a in y] for y in x]
516
+
517
+ def N_to_AA(x):
518
+ # [[0,1,2,3]] -> ["ARND"]
519
+ x = np.array(x);
520
+ if x.ndim == 1: x = x[None]
521
+ return ["".join([aa_N_1.get(a,"-") for a in y]) for y in x]
522
+
523
+ xyz,seq,min_resn,max_resn = {},{},1e6,-1e6
524
+ for line in open(x,"rb"):
525
+ line = line.decode("utf-8","ignore").rstrip()
526
+
527
+ if line[:6] == "HETATM" and line[17:17+3] == "MSE":
528
+ line = line.replace("HETATM","ATOM ")
529
+ line = line.replace("MSE","MET")
530
+
531
+ if line[:4] == "ATOM":
532
+ ch = line[21:22]
533
+ if ch == chain or chain is None:
534
+ atom = line[12:12+4].strip()
535
+ resi = line[17:17+3]
536
+ resn = line[22:22+5].strip()
537
+ x,y,z = [float(line[i:(i+8)]) for i in [30,38,46]]
538
+
539
+ if resn[-1].isalpha():
540
+ resa,resn = resn[-1],int(resn[:-1])-1
541
+ else:
542
+ resa,resn = "",int(resn)-1
543
+ # resn = int(resn)
544
+ if resn < min_resn:
545
+ min_resn = resn
546
+ if resn > max_resn:
547
+ max_resn = resn
548
+ if resn not in xyz:
549
+ xyz[resn] = {}
550
+ if resa not in xyz[resn]:
551
+ xyz[resn][resa] = {}
552
+ if resn not in seq:
553
+ seq[resn] = {}
554
+ if resa not in seq[resn]:
555
+ seq[resn][resa] = resi
556
+
557
+ if atom not in xyz[resn][resa]:
558
+ xyz[resn][resa][atom] = np.array([x,y,z])
559
+
560
+ # convert to numpy arrays, fill in missing values
561
+ seq_,xyz_ = [],[]
562
+ try:
563
+ for resn in range(min_resn,max_resn+1):
564
+ if resn in seq:
565
+ for k in sorted(seq[resn]): seq_.append(aa_3_N.get(seq[resn][k],20))
566
+ else: seq_.append(20)
567
+ if resn in xyz:
568
+ for k in sorted(xyz[resn]):
569
+ for atom in atoms:
570
+ if atom in xyz[resn][k]: xyz_.append(xyz[resn][k][atom])
571
+ else: xyz_.append(np.full(3,np.nan))
572
+ else:
573
+ for atom in atoms: xyz_.append(np.full(3,np.nan))
574
+ return np.array(xyz_).reshape(-1,len(atoms),3), N_to_AA(np.array(seq_))
575
+ except TypeError:
576
+ return 'no_chain', 'no_chain'
577
+
578
+ def parse_PDB(path_to_pdb, input_chain_list=None, ca_only=False):
579
+ c=0
580
+ pdb_dict_list = []
581
+ init_alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G','H', 'I', 'J','K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T','U', 'V','W','X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g','h', 'i', 'j','k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't','u', 'v','w','x', 'y', 'z']
582
+ extra_alphabet = [str(item) for item in list(np.arange(300))]
583
+ chain_alphabet = init_alphabet + extra_alphabet
584
+
585
+ if input_chain_list:
586
+ chain_alphabet = input_chain_list
587
+
588
+
589
+ biounit_names = [path_to_pdb]
590
+ for biounit in biounit_names:
591
+ my_dict = {}
592
+ s = 0
593
+ concat_seq = ''
594
+ concat_N = []
595
+ concat_CA = []
596
+ concat_C = []
597
+ concat_O = []
598
+ concat_mask = []
599
+ coords_dict = {}
600
+ for letter in chain_alphabet:
601
+ if ca_only:
602
+ sidechain_atoms = ['CA']
603
+ else:
604
+ sidechain_atoms = ['N', 'CA', 'C', 'O']
605
+ xyz, seq = parse_PDB_biounits(biounit, atoms=sidechain_atoms, chain=letter)
606
+ if type(xyz) != str:
607
+ concat_seq += seq[0]
608
+ my_dict['seq_chain_'+letter]=seq[0]
609
+ coords_dict_chain = {}
610
+ if ca_only:
611
+ coords_dict_chain['CA_chain_'+letter]=xyz.tolist()
612
+ else:
613
+ coords_dict_chain['N_chain_' + letter] = xyz[:, 0, :].tolist()
614
+ coords_dict_chain['CA_chain_' + letter] = xyz[:, 1, :].tolist()
615
+ coords_dict_chain['C_chain_' + letter] = xyz[:, 2, :].tolist()
616
+ coords_dict_chain['O_chain_' + letter] = xyz[:, 3, :].tolist()
617
+ my_dict['coords_chain_'+letter]=coords_dict_chain
618
+ s += 1
619
+ fi = biounit.rfind("/")
620
+ my_dict['name']=biounit[(fi+1):-4]
621
+ my_dict['num_of_chains'] = s
622
+ my_dict['seq'] = concat_seq
623
+ if s <= len(chain_alphabet):
624
+ pdb_dict_list.append(my_dict)
625
+ c+=1
626
+ return pdb_dict_list
627
+
628
+
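Hedged usage sketch: parse every chain of a local file into the dictionary format consumed by StructureDatasetPDB below ('input.pdb' is a placeholder path):

pdb_dict_list = parse_PDB('input.pdb', ca_only=False)
entry = pdb_dict_list[0]
print(entry['name'], entry['num_of_chains'], len(entry['seq']))
print([k for k in entry if k.startswith('seq_chain_')])  # one sequence key per chain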
629
+
630
+ def tied_featurize(batch, device, chain_dict, fixed_position_dict=None, omit_AA_dict=None, tied_positions_dict=None, pssm_dict=None, bias_by_res_dict=None, ca_only=False):
631
+ """ Pack and pad batch into torch tensors """
632
+ alphabet = 'ACDEFGHIKLMNPQRSTVWYX'
633
+ B = len(batch)
634
+ lengths = np.array([len(b['seq']) for b in batch], dtype=np.int32) #sum of chain seq lengths
635
+ L_max = max([len(b['seq']) for b in batch])
636
+ if ca_only:
637
+ X = np.zeros([B, L_max, 1, 3])
638
+ else:
639
+ X = np.zeros([B, L_max, 4, 3])
640
+ residue_idx = -100*np.ones([B, L_max], dtype=np.int32)
641
+ chain_M = np.zeros([B, L_max], dtype=np.int32) #1.0 for the bits that need to be predicted
642
+ pssm_coef_all = np.zeros([B, L_max], dtype=np.float32) #per-residue PSSM coefficients
644
+ pssm_bias_all = np.zeros([B, L_max, 21], dtype=np.float32) #per-residue PSSM bias over the 21-letter alphabet
645
+ pssm_log_odds_all = 10000.0*np.ones([B, L_max, 21], dtype=np.float32) #large default so log-odds thresholding keeps every position
646
+ chain_M_pos = np.zeros([B, L_max], dtype=np.int32) #1.0 for the bits that need to be predicted
647
+ bias_by_res_all = np.zeros([B, L_max, 21], dtype=np.float32)
648
+ chain_encoding_all = np.zeros([B, L_max], dtype=np.int32) #integer chain id per residue
648
+ S = np.zeros([B, L_max], dtype=np.int32)
649
+ omit_AA_mask = np.zeros([B, L_max, len(alphabet)], dtype=np.int32)
650
+ # Build the batch
651
+ letter_list_list = []
652
+ visible_list_list = []
653
+ masked_list_list = []
654
+ masked_chain_length_list_list = []
655
+ tied_pos_list_of_lists_list = []
656
+ for i, b in enumerate(batch):
657
+ if chain_dict != None:
658
+ masked_chains, visible_chains = chain_dict[b['name']] #masked_chains a list of chain letters to predict [A, D, F]
659
+ else:
660
+ masked_chains = [item[-1:] for item in list(b) if item[:10]=='seq_chain_']
661
+ visible_chains = []
662
+ masked_chains.sort() #sort masked_chains
663
+ visible_chains.sort() #sort visible_chains
664
+ all_chains = masked_chains + visible_chains
665
+ for i, b in enumerate(batch):
666
+ mask_dict = {}
667
+ a = 0
668
+ x_chain_list = []
669
+ chain_mask_list = []
670
+ chain_seq_list = []
671
+ chain_encoding_list = []
672
+ c = 1
673
+ letter_list = []
674
+ global_idx_start_list = [0]
675
+ visible_list = []
676
+ masked_list = []
677
+ masked_chain_length_list = []
678
+ fixed_position_mask_list = []
679
+ omit_AA_mask_list = []
680
+ pssm_coef_list = []
681
+ pssm_bias_list = []
682
+ pssm_log_odds_list = []
683
+ bias_by_res_list = []
684
+ l0 = 0
685
+ l1 = 0
686
+ for step, letter in enumerate(all_chains):
687
+ if letter in visible_chains:
688
+ letter_list.append(letter)
689
+ visible_list.append(letter)
690
+ chain_seq = b[f'seq_chain_{letter}']
691
+ chain_seq = ''.join([a if a!='-' else 'X' for a in chain_seq])
692
+ chain_length = len(chain_seq)
693
+ global_idx_start_list.append(global_idx_start_list[-1]+chain_length)
694
+ chain_coords = b[f'coords_chain_{letter}'] #this is a dictionary
695
+ chain_mask = np.zeros(chain_length) #0.0 for visible chains
696
+ if ca_only:
697
+ x_chain = np.array(chain_coords[f'CA_chain_{letter}']) #[chain_length,1,3] #CA_diff
698
+ if len(x_chain.shape) == 2:
699
+ x_chain = x_chain[:,None,:]
700
+ else:
701
+ x_chain = np.stack([chain_coords[c] for c in [f'N_chain_{letter}', f'CA_chain_{letter}', f'C_chain_{letter}', f'O_chain_{letter}']], 1) #[chain_length,4,3]
702
+ x_chain_list.append(x_chain)
703
+ chain_mask_list.append(chain_mask)
704
+ chain_seq_list.append(chain_seq)
705
+ chain_encoding_list.append(c*np.ones(np.array(chain_mask).shape[0]))
706
+ l1 += chain_length
707
+ residue_idx[i, l0:l1] = 100*(c-1)+np.arange(l0, l1)
708
+ l0 += chain_length
709
+ c+=1
710
+ fixed_position_mask = np.ones(chain_length)
711
+ fixed_position_mask_list.append(fixed_position_mask)
712
+ omit_AA_mask_temp = np.zeros([chain_length, len(alphabet)], np.int32)
713
+ omit_AA_mask_list.append(omit_AA_mask_temp)
714
+ pssm_coef = np.zeros(chain_length)
715
+ pssm_bias = np.zeros([chain_length, 21])
716
+ pssm_log_odds = 10000.0*np.ones([chain_length, 21])
717
+ pssm_coef_list.append(pssm_coef)
718
+ pssm_bias_list.append(pssm_bias)
719
+ pssm_log_odds_list.append(pssm_log_odds)
720
+ bias_by_res_list.append(np.zeros([chain_length, 21]))
721
+ if letter in masked_chains:
722
+ masked_list.append(letter)
723
+ letter_list.append(letter)
724
+ chain_seq = b[f'seq_chain_{letter}']
725
+ chain_seq = ''.join([a if a!='-' else 'X' for a in chain_seq])
726
+ chain_length = len(chain_seq)
727
+ global_idx_start_list.append(global_idx_start_list[-1]+chain_length)
728
+ masked_chain_length_list.append(chain_length)
729
+ chain_coords = b[f'coords_chain_{letter}'] #this is a dictionary
730
+ chain_mask = np.ones(chain_length) #1.0 for masked
731
+ if ca_only:
732
+ x_chain = np.array(chain_coords[f'CA_chain_{letter}']) #[chain_length,1,3] #CA_diff
733
+ if len(x_chain.shape) == 2:
734
+ x_chain = x_chain[:,None,:]
735
+ else:
736
+ x_chain = np.stack([chain_coords[c] for c in [f'N_chain_{letter}', f'CA_chain_{letter}', f'C_chain_{letter}', f'O_chain_{letter}']], 1) #[chain_length,4,3]
737
+ x_chain_list.append(x_chain)
738
+ chain_mask_list.append(chain_mask)
739
+ chain_seq_list.append(chain_seq)
740
+ chain_encoding_list.append(c*np.ones(np.array(chain_mask).shape[0]))
741
+ l1 += chain_length
742
+ residue_idx[i, l0:l1] = 100*(c-1)+np.arange(l0, l1)
743
+ l0 += chain_length
744
+ c+=1
745
+ fixed_position_mask = np.ones(chain_length)
746
+ if fixed_position_dict!=None:
747
+ fixed_pos_list = fixed_position_dict[b['name']][letter]
748
+ if fixed_pos_list:
749
+ fixed_position_mask[np.array(fixed_pos_list)-1] = 0.0
750
+ fixed_position_mask_list.append(fixed_position_mask)
751
+ omit_AA_mask_temp = np.zeros([chain_length, len(alphabet)], np.int32)
752
+ if omit_AA_dict!=None:
753
+ for item in omit_AA_dict[b['name']][letter]:
754
+ idx_AA = np.array(item[0])-1
755
+ AA_idx = np.array([np.argwhere(np.array(list(alphabet))== AA)[0][0] for AA in item[1]]).repeat(idx_AA.shape[0])
756
+ idx_ = np.array([[a, b] for a in idx_AA for b in AA_idx])
757
+ omit_AA_mask_temp[idx_[:,0], idx_[:,1]] = 1
758
+ omit_AA_mask_list.append(omit_AA_mask_temp)
759
+ pssm_coef = np.zeros(chain_length)
760
+ pssm_bias = np.zeros([chain_length, 21])
761
+ pssm_log_odds = 10000.0*np.ones([chain_length, 21])
762
+ if pssm_dict:
763
+ if pssm_dict[b['name']][letter]:
764
+ pssm_coef = pssm_dict[b['name']][letter]['pssm_coef']
765
+ pssm_bias = pssm_dict[b['name']][letter]['pssm_bias']
766
+ pssm_log_odds = pssm_dict[b['name']][letter]['pssm_log_odds']
767
+ pssm_coef_list.append(pssm_coef)
768
+ pssm_bias_list.append(pssm_bias)
769
+ pssm_log_odds_list.append(pssm_log_odds)
770
+ if bias_by_res_dict:
771
+ bias_by_res_list.append(bias_by_res_dict[b['name']][letter])
772
+ else:
773
+ bias_by_res_list.append(np.zeros([chain_length, 21]))
774
+
775
+
776
+ letter_list_np = np.array(letter_list)
777
+ tied_pos_list_of_lists = []
778
+ tied_beta = np.ones(L_max)
779
+ if tied_positions_dict!=None:
780
+ tied_pos_list = tied_positions_dict[b['name']]
781
+ if tied_pos_list:
782
+ set_chains_tied = set(list(itertools.chain(*[list(item) for item in tied_pos_list])))
783
+ for tied_item in tied_pos_list:
784
+ one_list = []
785
+ for k, v in tied_item.items():
786
+ start_idx = global_idx_start_list[np.argwhere(letter_list_np == k)[0][0]]
787
+ if isinstance(v[0], list):
788
+ for v_count in range(len(v[0])):
789
+ one_list.append(start_idx+v[0][v_count]-1)#make 0 to be the first
790
+ tied_beta[start_idx+v[0][v_count]-1] = v[1][v_count]
791
+ else:
792
+ for v_ in v:
793
+ one_list.append(start_idx+v_-1)#make 0 to be the first
794
+ tied_pos_list_of_lists.append(one_list)
795
+ tied_pos_list_of_lists_list.append(tied_pos_list_of_lists)
796
+
797
+
798
+
799
+ x = np.concatenate(x_chain_list,0) #[L, 4, 3]
800
+ all_sequence = "".join(chain_seq_list)
801
+ m = np.concatenate(chain_mask_list,0) #[L,], 1.0 for places that need to be predicted
802
+ chain_encoding = np.concatenate(chain_encoding_list,0)
803
+ m_pos = np.concatenate(fixed_position_mask_list,0) #[L,], 1.0 for places that need to be predicted
804
+
805
+ pssm_coef_ = np.concatenate(pssm_coef_list,0) #[L,] per-residue PSSM coefficients
807
+ pssm_bias_ = np.concatenate(pssm_bias_list,0) #[L,21] per-residue PSSM bias
808
+ pssm_log_odds_ = np.concatenate(pssm_log_odds_list,0) #[L,21] per-residue PSSM log-odds
808
+
809
+ bias_by_res_ = np.concatenate(bias_by_res_list, 0) #[L,21], 0.0 for places where AA frequencies don't need to be tweaked
810
+
811
+ l = len(all_sequence)
812
+ x_pad = np.pad(x, [[0,L_max-l], [0,0], [0,0]], 'constant', constant_values=(np.nan, ))
813
+ X[i,:,:,:] = x_pad
814
+
815
+ m_pad = np.pad(m, [[0,L_max-l]], 'constant', constant_values=(0.0, ))
816
+ m_pos_pad = np.pad(m_pos, [[0,L_max-l]], 'constant', constant_values=(0.0, ))
817
+ omit_AA_mask_pad = np.pad(np.concatenate(omit_AA_mask_list,0), [[0,L_max-l]], 'constant', constant_values=(0.0, ))
818
+ chain_M[i,:] = m_pad
819
+ chain_M_pos[i,:] = m_pos_pad
820
+ omit_AA_mask[i,] = omit_AA_mask_pad
821
+
822
+ chain_encoding_pad = np.pad(chain_encoding, [[0,L_max-l]], 'constant', constant_values=(0.0, ))
823
+ chain_encoding_all[i,:] = chain_encoding_pad
824
+
825
+ pssm_coef_pad = np.pad(pssm_coef_, [[0,L_max-l]], 'constant', constant_values=(0.0, ))
826
+ pssm_bias_pad = np.pad(pssm_bias_, [[0,L_max-l], [0,0]], 'constant', constant_values=(0.0, ))
827
+ pssm_log_odds_pad = np.pad(pssm_log_odds_, [[0,L_max-l], [0,0]], 'constant', constant_values=(0.0, ))
828
+
829
+ pssm_coef_all[i,:] = pssm_coef_pad
830
+ pssm_bias_all[i,:] = pssm_bias_pad
831
+ pssm_log_odds_all[i,:] = pssm_log_odds_pad
832
+
833
+ bias_by_res_pad = np.pad(bias_by_res_, [[0,L_max-l], [0,0]], 'constant', constant_values=(0.0, ))
834
+ bias_by_res_all[i,:] = bias_by_res_pad
835
+
836
+ # Convert to labels
837
+ indices = np.asarray([alphabet.index(a) for a in all_sequence], dtype=np.int32)
838
+ S[i, :l] = indices
839
+ letter_list_list.append(letter_list)
840
+ visible_list_list.append(visible_list)
841
+ masked_list_list.append(masked_list)
842
+ masked_chain_length_list_list.append(masked_chain_length_list)
843
+
844
+
845
+ isnan = np.isnan(X)
846
+ mask = np.isfinite(np.sum(X,(2,3))).astype(np.float32)
847
+ X[isnan] = 0.
848
+
849
+ # Conversion
850
+ pssm_coef_all = torch.from_numpy(pssm_coef_all).to(dtype=torch.float32, device=device)
851
+ pssm_bias_all = torch.from_numpy(pssm_bias_all).to(dtype=torch.float32, device=device)
852
+ pssm_log_odds_all = torch.from_numpy(pssm_log_odds_all).to(dtype=torch.float32, device=device)
853
+
854
+ tied_beta = torch.from_numpy(tied_beta).to(dtype=torch.float32, device=device)
855
+
856
+ jumps = ((residue_idx[:,1:]-residue_idx[:,:-1])==1).astype(np.float32)
857
+ bias_by_res_all = torch.from_numpy(bias_by_res_all).to(dtype=torch.float32, device=device)
858
+ phi_mask = np.pad(jumps, [[0,0],[1,0]])
859
+ psi_mask = np.pad(jumps, [[0,0],[0,1]])
860
+ omega_mask = np.pad(jumps, [[0,0],[0,1]])
861
+ dihedral_mask = np.concatenate([phi_mask[:,:,None], psi_mask[:,:,None], omega_mask[:,:,None]], -1) #[B,L,3]
862
+ dihedral_mask = torch.from_numpy(dihedral_mask).to(dtype=torch.float32, device=device)
863
+ residue_idx = torch.from_numpy(residue_idx).to(dtype=torch.long,device=device)
864
+ S = torch.from_numpy(S).to(dtype=torch.long,device=device)
865
+ X = torch.from_numpy(X).to(dtype=torch.float32, device=device)
866
+ mask = torch.from_numpy(mask).to(dtype=torch.float32, device=device)
867
+ chain_M = torch.from_numpy(chain_M).to(dtype=torch.float32, device=device)
868
+ chain_M_pos = torch.from_numpy(chain_M_pos).to(dtype=torch.float32, device=device)
869
+ omit_AA_mask = torch.from_numpy(omit_AA_mask).to(dtype=torch.float32, device=device)
870
+ chain_encoding_all = torch.from_numpy(chain_encoding_all).to(dtype=torch.long, device=device)
871
+ if ca_only:
872
+ X_out = X[:,:,0]
873
+ else:
874
+ X_out = X
875
+ return X_out, S, mask, lengths, chain_M, chain_encoding_all, letter_list_list, visible_list_list, masked_list_list, masked_chain_length_list_list, chain_M_pos, omit_AA_mask, residue_idx, dihedral_mask, tied_pos_list_of_lists_list, pssm_coef_all, pssm_bias_all, pssm_log_odds_all, bias_by_res_all, tied_beta
876
+
877
+
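A hedged sketch of featurizing one parsed structure for the model; with chain_dict=None every chain is treated as designable, and shapes follow the code above (X is [B, L, 4, 3], or [B, L, 1, 3] with ca_only=True):

import torch

batch = [pdb_dict_list[0]]  # an entry from parse_PDB above
out = tied_featurize(batch, torch.device('cpu'), None)
X, S, mask = out[0], out[1], out[2]
print(X.shape, S.shape, mask.shape)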
878
+
879
+ def loss_nll(S, log_probs, mask):
880
+ """ Negative log probabilities """
881
+ criterion = torch.nn.NLLLoss(reduction='none')
882
+ loss = criterion(
883
+ log_probs.contiguous().view(-1, log_probs.size(-1)), S.contiguous().view(-1)
884
+ ).view(S.size())
885
+ loss_av = torch.sum(loss * mask) / torch.sum(mask)
886
+ return loss, loss_av
887
+
888
+
889
+ def loss_smoothed(S, log_probs, mask, weight=0.1):
890
+ """ Negative log probabilities """
891
+ S_onehot = torch.nn.functional.one_hot(S, 21).float()
892
+
893
+ # Label smoothing
894
+ S_onehot = S_onehot + weight / float(S_onehot.size(-1))
895
+ S_onehot = S_onehot / S_onehot.sum(-1, keepdim=True)
896
+
897
+ loss = -(S_onehot * log_probs).sum(-1)
898
+ loss_av = torch.sum(loss * mask) / torch.sum(mask)
899
+ return loss, loss_av
900
+
901
+ class StructureDataset():
902
+ def __init__(self, jsonl_file, verbose=True, truncate=None, max_length=100,
903
+ alphabet='ACDEFGHIKLMNPQRSTVWYX-'):
904
+ alphabet_set = set([a for a in alphabet])
905
+ discard_count = {
906
+ 'bad_chars': 0,
907
+ 'too_long': 0,
908
+ 'bad_seq_length': 0
909
+ }
910
+
911
+ with open(jsonl_file) as f:
912
+ self.data = []
913
+
914
+ lines = f.readlines()
915
+ start = time.time()
916
+ for i, line in enumerate(lines):
917
+ entry = json.loads(line)
918
+ seq = entry['seq']
919
+ name = entry['name']
920
+
921
+ # Convert raw coords to np arrays
922
+ #for key, val in entry['coords'].items():
923
+ # entry['coords'][key] = np.asarray(val)
924
+
925
+ # Check if in alphabet
926
+ bad_chars = set([s for s in seq]).difference(alphabet_set)
927
+ if len(bad_chars) == 0:
928
+ if len(entry['seq']) <= max_length:
929
+ if True:
930
+ self.data.append(entry)
931
+ else:
932
+ discard_count['bad_seq_length'] += 1
933
+ else:
934
+ discard_count['too_long'] += 1
935
+ else:
936
+ if verbose:
937
+ print(name, bad_chars, entry['seq'])
938
+ discard_count['bad_chars'] += 1
939
+
940
+ # Truncate early
941
+ if truncate is not None and len(self.data) == truncate:
942
+ return
943
+
944
+ if verbose and (i + 1) % 1000 == 0:
945
+ elapsed = time.time() - start
946
+ print('{} entries ({} loaded) in {:.1f} s'.format(len(self.data), i+1, elapsed))
947
+ if verbose:
948
+ print('discarded', discard_count)
949
+ def __len__(self):
950
+ return len(self.data)
951
+
952
+ def __getitem__(self, idx):
953
+ return self.data[idx]
954
+
955
+
956
+ class StructureDatasetPDB():
957
+ def __init__(self, pdb_dict_list, verbose=True, truncate=None, max_length=100,
958
+ alphabet='ACDEFGHIKLMNPQRSTVWYX-'):
959
+ alphabet_set = set([a for a in alphabet])
960
+ discard_count = {
961
+ 'bad_chars': 0,
962
+ 'too_long': 0,
963
+ 'bad_seq_length': 0
964
+ }
965
+
966
+ self.data = []
967
+
968
+ start = time.time()
969
+ for i, entry in enumerate(pdb_dict_list):
970
+ seq = entry['seq']
971
+ name = entry['name']
972
+
973
+ bad_chars = set([s for s in seq]).difference(alphabet_set)
974
+ if len(bad_chars) == 0:
975
+ if len(entry['seq']) <= max_length:
976
+ self.data.append(entry)
977
+ else:
978
+ discard_count['too_long'] += 1
979
+ else:
980
+ discard_count['bad_chars'] += 1
981
+
982
+ # Truncate early
983
+ if truncate is not None and len(self.data) == truncate:
984
+ return
985
+
986
+ if verbose and (i + 1) % 1000 == 0:
987
+ elapsed = time.time() - start
988
+
989
+ #print('Discarded', discard_count)
990
+ def __len__(self):
991
+ return len(self.data)
992
+
993
+ def __getitem__(self, idx):
994
+ return self.data[idx]
995
+
996
+
997
+
998
+ class StructureLoader():
999
+ def __init__(self, dataset, batch_size=100, shuffle=True,
1000
+ collate_fn=lambda x:x, drop_last=False):
1001
+ self.dataset = dataset
1002
+ self.size = len(dataset)
1003
+ self.lengths = [len(dataset[i]['seq']) for i in range(self.size)]
1004
+ self.batch_size = batch_size
1005
+ sorted_ix = np.argsort(self.lengths)
1006
+
1007
+ # Cluster into batches of similar sizes
1008
+ clusters, batch = [], []
1009
+ batch_max = 0
1010
+ for ix in sorted_ix:
1011
+ size = self.lengths[ix]
1012
+ if size * (len(batch) + 1) <= self.batch_size:
1013
+ batch.append(ix)
1014
+ batch_max = size
1015
+ else:
1016
+ clusters.append(batch)
1017
+ batch, batch_max = [], 0
1018
+ if len(batch) > 0:
1019
+ clusters.append(batch)
1020
+ self.clusters = clusters
1021
+
1022
+ def __len__(self):
1023
+ return len(self.clusters)
1024
+
1025
+ def __iter__(self):
1026
+ np.random.shuffle(self.clusters)
1027
+ for b_idx in self.clusters:
1028
+ batch = [self.dataset[i] for i in b_idx]
1029
+ yield batch
1030
+
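Usage sketch: batch_size here is a residue budget rather than a structure count, so each batch holds a variable number of similarly sized structures; dataset is assumed to be a StructureDatasetPDB instance built as above:

loader = StructureLoader(dataset, batch_size=10000)
for batch in loader:
    print(len(batch), 'structures,', sum(len(b['seq']) for b in batch), 'residues')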
1031
+
1032
+
1033
+ # The following gather functions collect per-neighbor edge/node features by index
1034
+ def gather_edges(edges, neighbor_idx):
1035
+ # Features [B,N,N,C] at Neighbor indices [B,N,K] => Neighbor features [B,N,K,C]
1036
+ neighbors = neighbor_idx.unsqueeze(-1).expand(-1, -1, -1, edges.size(-1))
1037
+ edge_features = torch.gather(edges, 2, neighbors)
1038
+ return edge_features
1039
+
1040
+ def gather_nodes(nodes, neighbor_idx):
1041
+ # Features [B,N,C] at Neighbor indices [B,N,K] => [B,N,K,C]
1042
+ # Flatten and expand indices per batch [B,N,K] => [B,NK] => [B,NK,C]
1043
+ neighbors_flat = neighbor_idx.view((neighbor_idx.shape[0], -1))
1044
+ neighbors_flat = neighbors_flat.unsqueeze(-1).expand(-1, -1, nodes.size(2))
1045
+ # Gather and re-pack
1046
+ neighbor_features = torch.gather(nodes, 1, neighbors_flat)
1047
+ neighbor_features = neighbor_features.view(list(neighbor_idx.shape)[:3] + [-1])
1048
+ return neighbor_features
1049
+
1050
+ def gather_nodes_t(nodes, neighbor_idx):
1051
+ # Features [B,N,C] at Neighbor index [B,K] => Neighbor features[B,K,C]
1052
+ idx_flat = neighbor_idx.unsqueeze(-1).expand(-1, -1, nodes.size(2))
1053
+ neighbor_features = torch.gather(nodes, 1, idx_flat)
1054
+ return neighbor_features
1055
+
1056
+ def cat_neighbors_nodes(h_nodes, h_neighbors, E_idx):
1057
+ h_nodes = gather_nodes(h_nodes, E_idx)
1058
+ h_nn = torch.cat([h_neighbors, h_nodes], -1)
1059
+ return h_nn
1060
+
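Shape sanity check for the gather helpers above, on random tensors:

import torch

B, N, K, C = 2, 5, 3, 8
edges = torch.randn(B, N, N, C)
nodes = torch.randn(B, N, C)
E_idx = torch.randint(0, N, (B, N, K))
print(gather_edges(edges, E_idx).shape)  # torch.Size([2, 5, 3, 8])
print(gather_nodes(nodes, E_idx).shape)  # torch.Size([2, 5, 3, 8])
print(cat_neighbors_nodes(nodes, gather_edges(edges, E_idx), E_idx).shape)  # torch.Size([2, 5, 3, 16])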
1061
+
1062
+ class EncLayer(nn.Module):
1063
+ def __init__(self, num_hidden, num_in, dropout=0.1, num_heads=None, scale=30, time_cond_dim=None):
1064
+ super(EncLayer, self).__init__()
1065
+ self.num_hidden = num_hidden
1066
+ self.num_in = num_in
1067
+ self.scale = scale
1068
+ self.dropout1 = nn.Dropout(dropout)
1069
+ self.dropout2 = nn.Dropout(dropout)
1070
+ self.dropout3 = nn.Dropout(dropout)
1071
+ self.norm1 = nn.LayerNorm(num_hidden)
1072
+ self.norm2 = nn.LayerNorm(num_hidden)
1073
+ self.norm3 = nn.LayerNorm(num_hidden)
1074
+
1075
+ if time_cond_dim is not None:
1076
+ self.time_block1 = nn.Sequential(
1077
+ Rearrange('b 1 d -> b 1 1 d'),
1078
+ nn.SiLU(),
1079
+ nn.Linear(time_cond_dim, num_hidden * 2))
1080
+ self.time_block2 = nn.Sequential(
1081
+ Rearrange('b 1 d -> b 1 1 d'),
1082
+ nn.SiLU(),
1083
+ nn.Linear(time_cond_dim, num_hidden * 2))
1084
+
1085
+ self.W1 = nn.Linear(num_hidden + num_in, num_hidden, bias=True)
1086
+ self.W2 = nn.Linear(num_hidden, num_hidden, bias=True)
1087
+ self.W3 = nn.Linear(num_hidden, num_hidden, bias=True)
1088
+ self.W11 = nn.Linear(num_hidden + num_in, num_hidden, bias=True)
1089
+ self.W12 = nn.Linear(num_hidden, num_hidden, bias=True)
1090
+ self.W13 = nn.Linear(num_hidden, num_hidden, bias=True)
1091
+ self.act = torch.nn.GELU()
1092
+ self.dense = PositionWiseFeedForward(num_hidden, num_hidden * 4)
1093
+
1094
+ def forward(self, h_V, h_E, E_idx, mask_V=None, mask_attend=None, time_cond=None):
1095
+ """ Parallel computation of full transformer layer """
1096
+
1097
+ h_EV = cat_neighbors_nodes(h_V, h_E, E_idx)
1098
+ h_V_expand = h_V.unsqueeze(-2).expand(-1,-1,h_EV.size(-2),-1)
1099
+ h_EV = torch.cat([h_V_expand, h_EV], -1)
1100
+
1101
+ h_message = self.act(self.W2(self.act(self.W1(h_EV))))
1102
+ if time_cond is not None:
1103
+ scale, shift = self.time_block1(time_cond).chunk(2, dim=-1)
1104
+ h_message = h_message * (scale + 1) + shift
1105
+ h_message = self.W3(h_message)
1106
+
1107
+ if mask_attend is not None:
1108
+ h_message = mask_attend.unsqueeze(-1) * h_message
1109
+ dh = torch.sum(h_message, -2) / self.scale
1110
+ h_V = self.norm1(h_V + self.dropout1(dh))
1111
+
1112
+ dh = self.dense(h_V)
1113
+ h_V = self.norm2(h_V + self.dropout2(dh))
1114
+ if mask_V is not None:
1115
+ mask_V = mask_V.unsqueeze(-1)
1116
+ h_V = mask_V * h_V
1117
+
1118
+ h_EV = cat_neighbors_nodes(h_V, h_E, E_idx)
1119
+ h_V_expand = h_V.unsqueeze(-2).expand(-1,-1,h_EV.size(-2),-1)
1120
+ h_EV = torch.cat([h_V_expand, h_EV], -1)
1121
+
1122
+ h_message = self.act(self.W12(self.act(self.W11(h_EV))))
1123
+ if time_cond is not None:
1124
+ scale, shift = self.time_block2(time_cond).chunk(2, dim=-1)
1125
+ h_message = h_message * (scale + 1) + shift
1126
+ h_message = self.W13(h_message)
1127
+
1128
+ h_E = self.norm3(h_E + self.dropout3(h_message))
1129
+ return h_V, h_E
1130
+
1131
+
1132
+ class DecLayer(nn.Module):
1133
+ def __init__(self, num_hidden, num_in, dropout=0.1, num_heads=None, scale=30, time_cond_dim=None):
1134
+ super(DecLayer, self).__init__()
1135
+ self.num_hidden = num_hidden
1136
+ self.num_in = num_in
1137
+ self.scale = scale
1138
+ self.dropout1 = nn.Dropout(dropout)
1139
+ self.dropout2 = nn.Dropout(dropout)
1140
+ self.norm1 = nn.LayerNorm(num_hidden)
1141
+ self.norm2 = nn.LayerNorm(num_hidden)
1142
+
1143
+ if time_cond_dim is not None:
1144
+ self.time_block = nn.Sequential(
1145
+ Rearrange('b 1 d -> b 1 1 d'),
1146
+ nn.SiLU(),
1147
+ nn.Linear(time_cond_dim, num_hidden * 2))
1148
+
1149
+ self.W1 = nn.Linear(num_hidden + num_in, num_hidden, bias=True)
1150
+ self.W2 = nn.Linear(num_hidden, num_hidden, bias=True)
1151
+ self.W3 = nn.Linear(num_hidden, num_hidden, bias=True)
1152
+ self.act = torch.nn.GELU()
1153
+ self.dense = PositionWiseFeedForward(num_hidden, num_hidden * 4)
1154
+
1155
+ def forward(self, h_V, h_E, mask_V=None, mask_attend=None, time_cond=None):
1156
+ """ Parallel computation of full transformer layer """
1157
+
1158
+ # Concatenate h_V_i to h_E_ij
1159
+ h_V_expand = h_V.unsqueeze(-2).expand(-1,-1,h_E.size(-2),-1)
1160
+ h_EV = torch.cat([h_V_expand, h_E], -1)
1161
+
1162
+ h_message = self.act(self.W2(self.act(self.W1(h_EV))))
1163
+ if time_cond is not None:
1164
+ scale, shift = self.time_block(time_cond).chunk(2, dim=-1)
1165
+ h_message = h_message * (scale + 1) + shift
1166
+ h_message = self.W3(h_message)
1167
+
1168
+ if mask_attend is not None:
1169
+ h_message = mask_attend.unsqueeze(-1) * h_message
1170
+ dh = torch.sum(h_message, -2) / self.scale
1171
+
1172
+ h_V = self.norm1(h_V + self.dropout1(dh))
1173
+
1174
+ # Position-wise feedforward
1175
+ dh = self.dense(h_V)
1176
+ h_V = self.norm2(h_V + self.dropout2(dh))
1177
+
1178
+ if mask_V is not None:
1179
+ mask_V = mask_V.unsqueeze(-1)
1180
+ h_V = mask_V * h_V
1181
+ return h_V
1182
+
1183
+
1184
+
1185
+ class PositionWiseFeedForward(nn.Module):
1186
+ def __init__(self, num_hidden, num_ff):
1187
+ super(PositionWiseFeedForward, self).__init__()
1188
+ self.W_in = nn.Linear(num_hidden, num_ff, bias=True)
1189
+ self.W_out = nn.Linear(num_ff, num_hidden, bias=True)
1190
+ self.act = torch.nn.GELU()
1191
+ def forward(self, h_V):
1192
+ h = self.act(self.W_in(h_V))
1193
+ h = self.W_out(h)
1194
+ return h
1195
+
1196
+ class PositionalEncodings(nn.Module):
1197
+ def __init__(self, num_embeddings, max_relative_feature=32):
1198
+ super(PositionalEncodings, self).__init__()
1199
+ self.num_embeddings = num_embeddings
1200
+ self.max_relative_feature = max_relative_feature
1201
+ self.linear = nn.Linear(2*max_relative_feature+1+1, num_embeddings)
1202
+
1203
+ def forward(self, offset, mask):
1204
+ d = torch.clip(offset + self.max_relative_feature, 0, 2*self.max_relative_feature)*mask + (1-mask)*(2*self.max_relative_feature+1)
1205
+ d_onehot = torch.nn.functional.one_hot(d, 2*self.max_relative_feature+1+1)
1206
+ E = self.linear(d_onehot.float())
1207
+ return E
1208
+
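A tiny demonstration of the relative-position bucketing above: offsets are clipped into [-32, 32] (66 buckets in total), and masked pairs, e.g. residues on different chains, fall into the extra final bucket:

import torch

enc = PositionalEncodings(num_embeddings=16)
offset = torch.tensor([[[-100, 0, 100]]])  # [B, L, K] relative sequence offsets
mask = torch.tensor([[[1, 1, 0]]])         # 0 marks a pair to bucket separately
print(enc(offset, mask).shape)             # torch.Size([1, 1, 3, 16])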
1209
+
1210
+
1211
+ class CA_ProteinFeatures(nn.Module):
1212
+ def __init__(self, edge_features, node_features, num_positional_embeddings=16,
1213
+ num_rbf=16, top_k=30, augment_eps=0., num_chain_embeddings=16):
1214
+ """ Extract protein features """
1215
+ super(CA_ProteinFeatures, self).__init__()
1216
+ self.edge_features = edge_features
1217
+ self.node_features = node_features
1218
+ self.top_k = top_k
1219
+ self.augment_eps = augment_eps
1220
+ self.num_rbf = num_rbf
1221
+ self.num_positional_embeddings = num_positional_embeddings
1222
+
1223
+ # Positional encoding
1224
+ self.embeddings = PositionalEncodings(num_positional_embeddings)
1225
+ # Normalization and embedding
1226
+ node_in, edge_in = 3, num_positional_embeddings + num_rbf*9 + 7
1227
+ self.node_embedding = nn.Linear(node_in, node_features, bias=False) #NOT USED
1228
+ self.edge_embedding = nn.Linear(edge_in, edge_features, bias=False)
1229
+ self.norm_nodes = nn.LayerNorm(node_features)
1230
+ self.norm_edges = nn.LayerNorm(edge_features)
1231
+
1232
+
1233
+ def _quaternions(self, R):
1234
+ """ Convert a batch of 3D rotations [R] to quaternions [Q]
1235
+ R [...,3,3]
1236
+ Q [...,4]
1237
+ """
1238
+ # Simple Wikipedia version
1239
+ # en.wikipedia.org/wiki/Rotation_matrix#Quaternion
1240
+ # For other options see math.stackexchange.com/questions/2074316/calculating-rotation-axis-from-rotation-matrix
1241
+ diag = torch.diagonal(R, dim1=-2, dim2=-1)
1242
+ Rxx, Ryy, Rzz = diag.unbind(-1)
1243
+ magnitudes = 0.5 * torch.sqrt(torch.abs(1 + torch.stack([
1244
+ Rxx - Ryy - Rzz,
1245
+ - Rxx + Ryy - Rzz,
1246
+ - Rxx - Ryy + Rzz
1247
+ ], -1)))
1248
+ _R = lambda i,j: R[:,:,:,i,j]
1249
+ signs = torch.sign(torch.stack([
1250
+ _R(2,1) - _R(1,2),
1251
+ _R(0,2) - _R(2,0),
1252
+ _R(1,0) - _R(0,1)
1253
+ ], -1))
1254
+ xyz = signs * magnitudes
1255
+ # The relu enforces a non-negative trace
1256
+ w = torch.sqrt(F.relu(1 + diag.sum(-1, keepdim=True))) / 2.
1257
+ Q = torch.cat((xyz, w), -1)
1258
+ Q = F.normalize(Q, dim=-1)
1259
+ return Q
1260
+
1261
+ def _orientations_coarse(self, X, E_idx, eps=1e-6):
1262
+ dX = X[:,1:,:] - X[:,:-1,:]
1263
+ dX_norm = torch.norm(dX,dim=-1)
1264
+ dX_mask = (3.6<dX_norm) & (dX_norm<4.0) #exclude CA-CA jumps
1265
+ dX = dX*dX_mask[:,:,None]
1266
+ U = F.normalize(dX, dim=-1)
1267
+ u_2 = U[:,:-2,:]
1268
+ u_1 = U[:,1:-1,:]
1269
+ u_0 = U[:,2:,:]
1270
+ # Backbone normals
1271
+ n_2 = F.normalize(torch.cross(u_2, u_1, dim=-1), dim=-1)
1272
+ n_1 = F.normalize(torch.cross(u_1, u_0, dim=-1), dim=-1)
1273
+
1274
+ # Bond angle calculation
1275
+ cosA = -(u_1 * u_0).sum(-1)
1276
+ cosA = torch.clamp(cosA, -1+eps, 1-eps)
1277
+ A = torch.acos(cosA)
1278
+ # Angle between normals
1279
+ cosD = (n_2 * n_1).sum(-1)
1280
+ cosD = torch.clamp(cosD, -1+eps, 1-eps)
1281
+ D = torch.sign((u_2 * n_1).sum(-1)) * torch.acos(cosD)
1282
+ # Backbone features
1283
+ AD_features = torch.stack((torch.cos(A), torch.sin(A) * torch.cos(D), torch.sin(A) * torch.sin(D)), 2)
1284
+ AD_features = F.pad(AD_features, (0,0,1,2), 'constant', 0)
1285
+
1286
+ # Build relative orientations
1287
+ o_1 = F.normalize(u_2 - u_1, dim=-1)
1288
+ O = torch.stack((o_1, n_2, torch.cross(o_1, n_2, dim=-1)), 2)
1289
+ O = O.view(list(O.shape[:2]) + [9])
1290
+ O = F.pad(O, (0,0,1,2), 'constant', 0)
1291
+ O_neighbors = gather_nodes(O, E_idx)
1292
+ X_neighbors = gather_nodes(X, E_idx)
1293
+
1294
+ # Re-view as rotation matrices
1295
+ O = O.view(list(O.shape[:2]) + [3,3])
1296
+ O_neighbors = O_neighbors.view(list(O_neighbors.shape[:3]) + [3,3])
1297
+
1298
+ # Rotate into local reference frames
1299
+ dX = X_neighbors - X.unsqueeze(-2)
1300
+ dU = torch.matmul(O.unsqueeze(2), dX.unsqueeze(-1)).squeeze(-1)
1301
+ dU = F.normalize(dU, dim=-1)
1302
+ R = torch.matmul(O.unsqueeze(2).transpose(-1,-2), O_neighbors)
1303
+ Q = self._quaternions(R)
1304
+
1305
+ # Orientation features
1306
+ O_features = torch.cat((dU,Q), dim=-1)
1307
+ return AD_features, O_features
1308
+
1309
+
1310
+
1311
+ def _dist(self, X, mask, eps=1E-6):
1312
+ """ Pairwise euclidean distances """
1313
+ # Convolutional network on NCHW
1314
+ mask_2D = torch.unsqueeze(mask,1) * torch.unsqueeze(mask,2)
1315
+ dX = torch.unsqueeze(X,1) - torch.unsqueeze(X,2)
1316
+ D = mask_2D * torch.sqrt(torch.sum(dX**2, 3) + eps)
1317
+
1318
+ # Identify k nearest neighbors (including self)
1319
+ D_max, _ = torch.max(D, -1, keepdim=True)
1320
+ D_adjust = D + (1. - mask_2D) * D_max
1321
+ D_neighbors, E_idx = torch.topk(D_adjust, np.minimum(self.top_k, X.shape[1]), dim=-1, largest=False)
1322
+ mask_neighbors = gather_edges(mask_2D.unsqueeze(-1), E_idx)
1323
+ return D_neighbors, E_idx, mask_neighbors
1324
+
1325
+ def _rbf(self, D):
1326
+ # Distance radial basis function
1327
+ device = D.device
1328
+ D_min, D_max, D_count = 2., 22., self.num_rbf
1329
+ D_mu = torch.linspace(D_min, D_max, D_count).to(device)
1330
+ D_mu = D_mu.view([1,1,1,-1])
1331
+ D_sigma = (D_max - D_min) / D_count
1332
+ D_expand = torch.unsqueeze(D, -1)
1333
+ RBF = torch.exp(-((D_expand - D_mu) / D_sigma)**2)
1334
+ return RBF
1335
+
1336
+ def _get_rbf(self, A, B, E_idx):
1337
+ D_A_B = torch.sqrt(torch.sum((A[:,:,None,:] - B[:,None,:,:])**2,-1) + 1e-6) #[B, L, L]
1338
+ D_A_B_neighbors = gather_edges(D_A_B[:,:,:,None], E_idx)[:,:,:,0] #[B,L,K]
1339
+ RBF_A_B = self._rbf(D_A_B_neighbors)
1340
+ return RBF_A_B
1341
+
1342
+ def forward(self, Ca, mask, residue_idx, chain_labels):
1343
+ """ Featurize coordinates as an attributed graph """
1344
+ if self.augment_eps > 0:
1345
+ Ca = Ca + self.augment_eps * torch.randn_like(Ca)
1346
+
1347
+ D_neighbors, E_idx, mask_neighbors = self._dist(Ca, mask)
1348
+
1349
+ Ca_0 = torch.zeros(Ca.shape, device=Ca.device)
1350
+ Ca_2 = torch.zeros(Ca.shape, device=Ca.device)
1351
+ Ca_0[:,1:,:] = Ca[:,:-1,:]
1352
+ Ca_1 = Ca
1353
+ Ca_2[:,:-1,:] = Ca[:,1:,:]
1354
+
1355
+ V, O_features = self._orientations_coarse(Ca, E_idx)
1356
+
1357
+ RBF_all = []
1358
+ RBF_all.append(self._rbf(D_neighbors)) #Ca_1-Ca_1
1359
+ RBF_all.append(self._get_rbf(Ca_0, Ca_0, E_idx))
1360
+ RBF_all.append(self._get_rbf(Ca_2, Ca_2, E_idx))
1361
+
1362
+ RBF_all.append(self._get_rbf(Ca_0, Ca_1, E_idx))
1363
+ RBF_all.append(self._get_rbf(Ca_0, Ca_2, E_idx))
1364
+
1365
+ RBF_all.append(self._get_rbf(Ca_1, Ca_0, E_idx))
1366
+ RBF_all.append(self._get_rbf(Ca_1, Ca_2, E_idx))
1367
+
1368
+ RBF_all.append(self._get_rbf(Ca_2, Ca_0, E_idx))
1369
+ RBF_all.append(self._get_rbf(Ca_2, Ca_1, E_idx))
1370
+
1371
+
1372
+ RBF_all = torch.cat(tuple(RBF_all), dim=-1)
1373
+
1374
+
1375
+ offset = residue_idx[:,:,None]-residue_idx[:,None,:]
1376
+ offset = gather_edges(offset[:,:,:,None], E_idx)[:,:,:,0] #[B, L, K]
1377
+
1378
+ d_chains = ((chain_labels[:, :, None] - chain_labels[:,None,:])==0).long()
1379
+ E_chains = gather_edges(d_chains[:,:,:,None], E_idx)[:,:,:,0]
1380
+ E_positional = self.embeddings(offset.long(), E_chains)
1381
+ E = torch.cat((E_positional, RBF_all, O_features), -1)
1382
+
1383
+
1384
+ E = self.edge_embedding(E)
1385
+ E = self.norm_edges(E)
1386
+
1387
+ return E, E_idx
1388
+
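The quaternion conversion above can be sanity-checked with a standalone sketch (identity rotation only; this mirrors _quaternions but is not part of the class): the trace of the identity is 3, so w = sqrt(1 + 3) / 2 = 1 and the xyz components vanish.

import torch
import torch.nn.functional as F

R = torch.eye(3).reshape(1, 1, 1, 3, 3)  # identity rotation, shaped [B, L, K, 3, 3]
diag = torch.diagonal(R, dim1=-2, dim2=-1)
Rxx, Ryy, Rzz = diag.unbind(-1)
magnitudes = 0.5 * torch.sqrt(torch.abs(1 + torch.stack(
    [Rxx - Ryy - Rzz, -Rxx + Ryy - Rzz, -Rxx - Ryy + Rzz], -1)))
signs = torch.sign(torch.stack(
    [R[..., 2, 1] - R[..., 1, 2],
     R[..., 0, 2] - R[..., 2, 0],
     R[..., 1, 0] - R[..., 0, 1]], -1))
w = torch.sqrt(F.relu(1 + diag.sum(-1, keepdim=True))) / 2.0
Q = F.normalize(torch.cat((signs * magnitudes, w), -1), dim=-1)
print(Q)  # tensor([[[[0., 0., 0., 1.]]]]) -- the identity quaternion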
1389
+
1390
+ def get_closest_neighbors(X, mask, top_k, eps=1e-6):
1391
+ # X is ca coords (b, n, 3), mask is seq mask
1392
+ mask_2D = torch.unsqueeze(mask,1) * torch.unsqueeze(mask,2)
1393
+ dX = torch.unsqueeze(X,1) - torch.unsqueeze(X,2)
1394
+ D = mask_2D * torch.sqrt(torch.sum(dX**2, 3) + eps)
1395
+ D_max, _ = torch.max(D, -1, keepdim=True)
1396
+ D_adjust = D + (1. - mask_2D) * D_max
1397
+ sampled_top_k = top_k
1398
+ D_neighbors, E_idx = torch.topk(D_adjust, np.minimum(top_k, X.shape[1]), dim=-1, largest=False)
1399
+ return D_neighbors, E_idx
1400
+
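A brief usage sketch for get_closest_neighbors (toy coordinates; assumes numpy is imported at module scope as elsewhere in this file): every residue's first neighbour is itself, because the masked self-distance is near zero.

import torch

X = torch.tensor([[[0.0, 0.0, 0.0],      # four CA atoms spaced 3.8 A apart
                   [3.8, 0.0, 0.0],
                   [7.6, 0.0, 0.0],
                   [11.4, 0.0, 0.0]]])
mask = torch.ones(1, 4)
D_neighbors, E_idx = get_closest_neighbors(X, mask, top_k=2)
print(E_idx.shape)              # torch.Size([1, 4, 2])
print(E_idx[0, :, 0].tolist())  # [0, 1, 2, 3] -- self is always the nearest neighbour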
1401
+
1402
+ class ProteinFeatures(nn.Module):
1403
+ def __init__(self, edge_features, node_features, num_positional_embeddings=16,
1404
+ num_rbf=16, top_k=30, augment_eps=0., num_chain_embeddings=16):
1405
+ """ Extract protein features """
1406
+ super(ProteinFeatures, self).__init__()
1407
+ self.edge_features = edge_features
1408
+ self.node_features = node_features
1409
+ self.top_k = top_k
1410
+ self.augment_eps = augment_eps
1411
+ self.num_rbf = num_rbf
1412
+ self.num_positional_embeddings = num_positional_embeddings
1413
+
1414
+ self.embeddings = PositionalEncodings(num_positional_embeddings)
1415
+ node_in, edge_in = 6, num_positional_embeddings + num_rbf*25
1416
+ self.edge_embedding = nn.Linear(edge_in, edge_features, bias=False)
1417
+ self.norm_edges = nn.LayerNorm(edge_features)
1418
+
1419
+ def _dist(self, X, mask, eps=1E-6):
1420
+ # mask_2D = torch.unsqueeze(mask,1) * torch.unsqueeze(mask,2)
1421
+ # dX = torch.unsqueeze(X,1) - torch.unsqueeze(X,2)
1422
+ # D = mask_2D * torch.sqrt(torch.sum(dX**2, 3) + eps)
1423
+ # D_max, _ = torch.max(D, -1, keepdim=True)
1424
+ # D_adjust = D + (1. - mask_2D) * D_max
1425
+ # sampled_top_k = self.top_k
1426
+ # D_neighbors, E_idx = torch.topk(D_adjust, np.minimum(self.top_k, X.shape[1]), dim=-1, largest=False)
1427
+ # return D_neighbors, E_idx
1428
+ return get_closest_neighbors(X, mask, self.top_k, eps=eps)
1429
+
1430
+ def _rbf(self, D):
1431
+ device = D.device
1432
+ D_min, D_max, D_count = 2., 22., self.num_rbf
1433
+ D_mu = torch.linspace(D_min, D_max, D_count, device=device)
1434
+ D_mu = D_mu.view([1,1,1,-1])
1435
+ D_sigma = (D_max - D_min) / D_count
1436
+ D_expand = torch.unsqueeze(D, -1)
1437
+ RBF = torch.exp(-((D_expand - D_mu) / D_sigma)**2)
1438
+ return RBF
1439
+
1440
+ def _get_rbf(self, A, B, E_idx):
1441
+ D_A_B = torch.sqrt(torch.sum((A[:,:,None,:] - B[:,None,:,:])**2,-1) + 1e-6) #[B, L, L]
1442
+ D_A_B_neighbors = gather_edges(D_A_B[:,:,:,None], E_idx)[:,:,:,0] #[B,L,K]
1443
+ RBF_A_B = self._rbf(D_A_B_neighbors)
1444
+ return RBF_A_B
1445
+
1446
+ def forward(self, X, mask, residue_idx, chain_labels):
1447
+ if self.augment_eps > 0:
1448
+ X = X + self.augment_eps * torch.randn_like(X)
1449
+
1450
+ b = X[:,:,1,:] - X[:,:,0,:]
1451
+ c = X[:,:,2,:] - X[:,:,1,:]
1452
+ a = torch.cross(b, c, dim=-1)
1453
+ Cb = -0.58273431*a + 0.56802827*b - 0.54067466*c + X[:,:,1,:]
1454
+ Ca = X[:,:,1,:]
1455
+ N = X[:,:,0,:]
1456
+ C = X[:,:,2,:]
1457
+ O = X[:,:,3,:]
1458
+
1459
+ D_neighbors, E_idx = self._dist(Ca, mask)
1460
+
1461
+ RBF_all = []
1462
+ RBF_all.append(self._rbf(D_neighbors)) #Ca-Ca
1463
+ RBF_all.append(self._get_rbf(N, N, E_idx)) #N-N
1464
+ RBF_all.append(self._get_rbf(C, C, E_idx)) #C-C
1465
+ RBF_all.append(self._get_rbf(O, O, E_idx)) #O-O
1466
+ RBF_all.append(self._get_rbf(Cb, Cb, E_idx)) #Cb-Cb
1467
+ RBF_all.append(self._get_rbf(Ca, N, E_idx)) #Ca-N
1468
+ RBF_all.append(self._get_rbf(Ca, C, E_idx)) #Ca-C
1469
+ RBF_all.append(self._get_rbf(Ca, O, E_idx)) #Ca-O
1470
+ RBF_all.append(self._get_rbf(Ca, Cb, E_idx)) #Ca-Cb
1471
+ RBF_all.append(self._get_rbf(N, C, E_idx)) #N-C
1472
+ RBF_all.append(self._get_rbf(N, O, E_idx)) #N-O
1473
+ RBF_all.append(self._get_rbf(N, Cb, E_idx)) #N-Cb
1474
+ RBF_all.append(self._get_rbf(Cb, C, E_idx)) #Cb-C
1475
+ RBF_all.append(self._get_rbf(Cb, O, E_idx)) #Cb-O
1476
+ RBF_all.append(self._get_rbf(O, C, E_idx)) #O-C
1477
+ RBF_all.append(self._get_rbf(N, Ca, E_idx)) #N-Ca
1478
+ RBF_all.append(self._get_rbf(C, Ca, E_idx)) #C-Ca
1479
+ RBF_all.append(self._get_rbf(O, Ca, E_idx)) #O-Ca
1480
+ RBF_all.append(self._get_rbf(Cb, Ca, E_idx)) #Cb-Ca
1481
+ RBF_all.append(self._get_rbf(C, N, E_idx)) #C-N
1482
+ RBF_all.append(self._get_rbf(O, N, E_idx)) #O-N
1483
+ RBF_all.append(self._get_rbf(Cb, N, E_idx)) #Cb-N
1484
+ RBF_all.append(self._get_rbf(C, Cb, E_idx)) #C-Cb
1485
+ RBF_all.append(self._get_rbf(O, Cb, E_idx)) #O-Cb
1486
+ RBF_all.append(self._get_rbf(C, O, E_idx)) #C-O
1487
+ RBF_all = torch.cat(tuple(RBF_all), dim=-1)
1488
+
1489
+ offset = residue_idx[:,:,None]-residue_idx[:,None,:]
1490
+ offset = gather_edges(offset[:,:,:,None], E_idx)[:,:,:,0] #[B, L, K]
1491
+
1492
+ d_chains = ((chain_labels[:, :, None] - chain_labels[:,None,:])==0).long() #find self vs non-self interaction
1493
+ E_chains = gather_edges(d_chains[:,:,:,None], E_idx)[:,:,:,0]
1494
+ E_positional = self.embeddings(offset.long(), E_chains)
1495
+ E = torch.cat((E_positional, RBF_all), -1)
1496
+ E = self.edge_embedding(E)
1497
+ E = self.norm_edges(E)
1498
+ return E, E_idx
1499
+
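To make the radial-basis featurization above concrete, a standalone sketch with the same constants as _rbf (16 Gaussian bins with centres spread from 2 A to 22 A): each scalar distance becomes a soft one-hot over distance bins, peaking at the nearest centre.

import torch

num_rbf, D_min, D_max = 16, 2.0, 22.0
D_mu = torch.linspace(D_min, D_max, num_rbf).view(1, 1, 1, -1)  # bin centres
D_sigma = (D_max - D_min) / num_rbf                             # 1.25 A width
D = torch.tensor([[[3.8, 10.0]]])                               # toy CA-CA distances
RBF = torch.exp(-((D.unsqueeze(-1) - D_mu) / D_sigma) ** 2)     # (1, 1, 2, 16)
print(RBF.argmax(-1).tolist())  # [[[1, 6]]] -- 3.8 A peaks at bin 1, 10.0 A at bin 6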
1500
+
1501
+
1502
+ class ProteinMPNN(nn.Module):
1503
+ def __init__(self, num_letters, node_features, edge_features,
1504
+ hidden_dim, num_encoder_layers=3, num_decoder_layers=3,
1505
+ vocab=21, k_neighbors=64, augment_eps=0.05, dropout=0.1, ca_only=False, time_cond_dim=None, input_S_is_embeddings=False):
1506
+ super(ProteinMPNN, self).__init__()
1507
+
1508
+ # Hyperparameters
1509
+ self.node_features = node_features
1510
+ self.edge_features = edge_features
1511
+ self.hidden_dim = hidden_dim
1512
+
1513
+ # Featurization layers
1514
+ if ca_only:
1515
+ self.features = CA_ProteinFeatures(node_features, edge_features, top_k=k_neighbors, augment_eps=augment_eps)
1516
+ self.W_v = nn.Linear(node_features, hidden_dim, bias=True)
1517
+ else:
1518
+ self.features = ProteinFeatures(node_features, edge_features, top_k=k_neighbors, augment_eps=augment_eps)
1519
+
1520
+ self.W_e = nn.Linear(edge_features, hidden_dim, bias=True)
1521
+ self.input_S_is_embeddings = input_S_is_embeddings
1522
+ if not self.input_S_is_embeddings:
1523
+ self.W_s = nn.Embedding(vocab, hidden_dim)
1524
+
1525
+ if time_cond_dim is not None:
1526
+ self.time_block = nn.Sequential(
1527
+ nn.SiLU(),
1528
+ nn.Linear(time_cond_dim, hidden_dim)
1529
+ )
1530
+
1531
+ # Encoder layers
1532
+ self.encoder_layers = nn.ModuleList([
1533
+ EncLayer(hidden_dim, hidden_dim*2, dropout=dropout, time_cond_dim=time_cond_dim)
1534
+ for _ in range(num_encoder_layers)
1535
+ ])
1536
+
1537
+ # Decoder layers
1538
+ self.decoder_layers = nn.ModuleList([
1539
+ DecLayer(hidden_dim, hidden_dim*3, dropout=dropout, time_cond_dim=time_cond_dim)
1540
+ for _ in range(num_decoder_layers)
1541
+ ])
1542
+ self.W_out = nn.Linear(hidden_dim, num_letters, bias=True)
1543
+
1544
+ for p in self.parameters():
1545
+ if p.dim() > 1:
1546
+ nn.init.xavier_uniform_(p)
1547
+
1548
+ def forward(self, X, S, mask, chain_M, residue_idx, chain_encoding_all, randn, use_input_decoding_order=False, decoding_order=None, causal_mask=True, time_cond=None, return_node_embs=False):
1549
+ """ Graph-conditioned sequence model """
1550
+ device=X.device
1551
+ # Prepare node and edge embeddings
1552
+ E, E_idx = self.features(X, mask, residue_idx, chain_encoding_all)
1553
+ h_V = torch.zeros((E.shape[0], E.shape[1], E.shape[-1]), device=E.device)
1554
+ if time_cond is not None:
1555
+ time_cond_nodes = self.time_block(time_cond)
1556
+ h_V += time_cond_nodes # time_cond is b, 1, c
1557
+ h_E = self.W_e(E)
1558
+
1559
+ # Encoder is unmasked self-attention
1560
+ mask_attend = gather_nodes(mask.unsqueeze(-1), E_idx).squeeze(-1)
1561
+ mask_attend = mask.unsqueeze(-1) * mask_attend
1562
+ for layer in self.encoder_layers:
1563
+ h_V, h_E = layer(h_V, h_E, E_idx, mask, mask_attend, time_cond=time_cond)
1564
+
1565
+ encoder_embs = h_V
1566
+
1567
+ # Concatenate sequence embeddings for autoregressive decoder
1568
+ if self.input_S_is_embeddings:
1569
+ h_S = S
1570
+ else:
1571
+ h_S = self.W_s(S)
1572
+ h_ES = cat_neighbors_nodes(h_S, h_E, E_idx)
1573
+
1574
+ # Build encoder embeddings
1575
+ h_EX_encoder = cat_neighbors_nodes(torch.zeros_like(h_S), h_E, E_idx)
1576
+ h_EXV_encoder = cat_neighbors_nodes(h_V, h_EX_encoder, E_idx)
1577
+
1578
+
1579
+ chain_M = chain_M*mask #update chain_M to include missing regions
1580
+ mask_size = E_idx.shape[1]
1581
+ if causal_mask:
1582
+ if not use_input_decoding_order:
1583
+ decoding_order = torch.argsort((chain_M+0.0001)*(torch.abs(randn))) #[numbers will be smaller for places where chain_M = 0.0 and higher for places where chain_M = 1.0]
1584
+ permutation_matrix_reverse = torch.nn.functional.one_hot(decoding_order, num_classes=mask_size).float()
1585
+ order_mask_backward = torch.einsum('ij, biq, bjp->bqp',(1-torch.triu(torch.ones(mask_size,mask_size, device=device))), permutation_matrix_reverse, permutation_matrix_reverse)
1586
+ else:
1587
+ order_mask_backward = torch.ones(X.shape[0], mask_size, mask_size, device=device)
1588
+ mask_attend = torch.gather(order_mask_backward, 2, E_idx).unsqueeze(-1)
1589
+ mask_1D = mask.view([mask.size(0), mask.size(1), 1, 1])
1590
+ mask_bw = mask_1D * mask_attend
1591
+ mask_fw = mask_1D * (1. - mask_attend)
1592
+
1593
+ h_EXV_encoder_fw = mask_fw * h_EXV_encoder
1594
+ for layer in self.decoder_layers:
1595
+ # Masked positions attend to encoder information, unmasked positions see previously decoded positions.
1596
+ h_ESV = cat_neighbors_nodes(h_V, h_ES, E_idx)
1597
+ h_ESV = mask_bw * h_ESV + h_EXV_encoder_fw
1598
+ h_V = layer(h_V, h_ESV, mask, time_cond=time_cond)
1599
+
1600
+ if return_node_embs:
1601
+ return h_V, encoder_embs
1602
+ else:
1603
+ logits = self.W_out(h_V)
1604
+ log_probs = F.log_softmax(logits, dim=-1)
1605
+ return log_probs
1606
+
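The argsort construction of the decoding order is easy to verify in isolation (toy values, not from the model): scaling |noise| by chain_M + 0.0001 makes fixed positions (chain_M = 0) sort to the front, so they are consumed first and every designable position is conditioned on them.

import torch

chain_M = torch.tensor([[1.0, 0.0, 1.0, 0.0]])  # 0 -> fixed, 1 -> designable
randn = torch.tensor([[0.5, 2.0, 1.5, 0.3]])
decoding_order = torch.argsort((chain_M + 0.0001) * torch.abs(randn))
print(decoding_order.tolist())  # [[3, 1, 0, 2]] -- both fixed positions come first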
1607
+
1608
+ def sample(self, X, randn, S_true, chain_mask, chain_encoding_all, residue_idx, mask=None, temperature=1.0, omit_AAs_np=None, bias_AAs_np=None, chain_M_pos=None, omit_AA_mask=None, pssm_coef=None, pssm_bias=None, pssm_multi=None, pssm_log_odds_flag=None, pssm_log_odds_mask=None, pssm_bias_flag=None, bias_by_res=None):
1609
+ device = X.device
1610
+ # Prepare node and edge embeddings
1611
+ E, E_idx = self.features(X, mask, residue_idx, chain_encoding_all)
1612
+ h_V = torch.zeros((E.shape[0], E.shape[1], E.shape[-1]), device=device)
1613
+ h_E = self.W_e(E)
1614
+
1615
+ # Encoder is unmasked self-attention
1616
+ mask_attend = gather_nodes(mask.unsqueeze(-1), E_idx).squeeze(-1)
1617
+ mask_attend = mask.unsqueeze(-1) * mask_attend
1618
+ for layer in self.encoder_layers:
1619
+ h_V, h_E = layer(h_V, h_E, E_idx, mask, mask_attend)
1620
+
1621
+ # Decoder uses masked self-attention
1622
+ chain_mask = chain_mask*chain_M_pos*mask #update chain_M to include missing regions
1623
+ decoding_order = torch.argsort((chain_mask+0.0001)*(torch.abs(randn))) #[numbers will be smaller for places where chain_M = 0.0 and higher for places where chain_M = 1.0]
1624
+ mask_size = E_idx.shape[1]
1625
+ permutation_matrix_reverse = torch.nn.functional.one_hot(decoding_order, num_classes=mask_size).float()
1626
+ order_mask_backward = torch.einsum('ij, biq, bjp->bqp',(1-torch.triu(torch.ones(mask_size,mask_size, device=device))), permutation_matrix_reverse, permutation_matrix_reverse)
1627
+ mask_attend = torch.gather(order_mask_backward, 2, E_idx).unsqueeze(-1)
1628
+ mask_1D = mask.view([mask.size(0), mask.size(1), 1, 1])
1629
+ mask_bw = mask_1D * mask_attend
1630
+ mask_fw = mask_1D * (1. - mask_attend)
1631
+
1632
+ N_batch, N_nodes = X.size(0), X.size(1)
1633
+ log_probs = torch.zeros((N_batch, N_nodes, 21), device=device)
1634
+ all_probs = torch.zeros((N_batch, N_nodes, 21), device=device, dtype=torch.float32)
1635
+ h_S = torch.zeros_like(h_V, device=device)
1636
+ S = torch.zeros((N_batch, N_nodes), dtype=torch.int64, device=device)
1637
+ h_V_stack = [h_V] + [torch.zeros_like(h_V, device=device) for _ in range(len(self.decoder_layers))]
1638
+ constant = torch.tensor(omit_AAs_np, device=device)
1639
+ constant_bias = torch.tensor(bias_AAs_np, device=device)
1640
+ #chain_mask_combined = chain_mask*chain_M_pos
1641
+ omit_AA_mask_flag = omit_AA_mask is not None
1642
+
1643
+
1644
+ h_EX_encoder = cat_neighbors_nodes(torch.zeros_like(h_S), h_E, E_idx)
1645
+ h_EXV_encoder = cat_neighbors_nodes(h_V, h_EX_encoder, E_idx)
1646
+ h_EXV_encoder_fw = mask_fw * h_EXV_encoder
1647
+ for t_ in range(N_nodes):
1648
+ t = decoding_order[:,t_] #[B]
1649
+ chain_mask_gathered = torch.gather(chain_mask, 1, t[:,None]) #[B]
1650
+ mask_gathered = torch.gather(mask, 1, t[:,None]) #[B]
1651
+ bias_by_res_gathered = torch.gather(bias_by_res, 1, t[:,None,None].repeat(1,1,21))[:,0,:] #[B, 21]
1652
+ if (mask_gathered==0).all(): #for padded or missing regions only
1653
+ S_t = torch.gather(S_true, 1, t[:,None])
1654
+ else:
1655
+ # Hidden layers
1656
+ E_idx_t = torch.gather(E_idx, 1, t[:,None,None].repeat(1,1,E_idx.shape[-1]))
1657
+ h_E_t = torch.gather(h_E, 1, t[:,None,None,None].repeat(1,1,h_E.shape[-2], h_E.shape[-1]))
1658
+ h_ES_t = cat_neighbors_nodes(h_S, h_E_t, E_idx_t)
1659
+ h_EXV_encoder_t = torch.gather(h_EXV_encoder_fw, 1, t[:,None,None,None].repeat(1,1,h_EXV_encoder_fw.shape[-2], h_EXV_encoder_fw.shape[-1]))
1660
+ mask_t = torch.gather(mask, 1, t[:,None])
1661
+ for l, layer in enumerate(self.decoder_layers):
1662
+ # Updated relational features for future states
1663
+ h_ESV_decoder_t = cat_neighbors_nodes(h_V_stack[l], h_ES_t, E_idx_t)
1664
+ h_V_t = torch.gather(h_V_stack[l], 1, t[:,None,None].repeat(1,1,h_V_stack[l].shape[-1]))
1665
+ h_ESV_t = torch.gather(mask_bw, 1, t[:,None,None,None].repeat(1,1,mask_bw.shape[-2], mask_bw.shape[-1])) * h_ESV_decoder_t + h_EXV_encoder_t
1666
+ h_V_stack[l+1].scatter_(1, t[:,None,None].repeat(1,1,h_V.shape[-1]), layer(h_V_t, h_ESV_t, mask_V=mask_t))
1667
+ # Sampling step
1668
+ h_V_t = torch.gather(h_V_stack[-1], 1, t[:,None,None].repeat(1,1,h_V_stack[-1].shape[-1]))[:,0]
1669
+ logits = self.W_out(h_V_t) / temperature
1670
+ probs = F.softmax(logits-constant[None,:]*1e8+constant_bias[None,:]/temperature+bias_by_res_gathered/temperature, dim=-1)
1671
+ if pssm_bias_flag:
1672
+ pssm_coef_gathered = torch.gather(pssm_coef, 1, t[:,None])[:,0]
1673
+ pssm_bias_gathered = torch.gather(pssm_bias, 1, t[:,None,None].repeat(1,1,pssm_bias.shape[-1]))[:,0]
1674
+ probs = (1-pssm_multi*pssm_coef_gathered[:,None])*probs + pssm_multi*pssm_coef_gathered[:,None]*pssm_bias_gathered
1675
+ if pssm_log_odds_flag:
1676
+ pssm_log_odds_mask_gathered = torch.gather(pssm_log_odds_mask, 1, t[:,None, None].repeat(1,1,pssm_log_odds_mask.shape[-1]))[:,0] #[B, 21]
1677
+ probs_masked = probs*pssm_log_odds_mask_gathered
1678
+ probs_masked += probs * 0.001
1679
+ probs = probs_masked/torch.sum(probs_masked, dim=-1, keepdim=True) #[B, 21]
1680
+ if omit_AA_mask_flag:
1681
+ omit_AA_mask_gathered = torch.gather(omit_AA_mask, 1, t[:,None, None].repeat(1,1,omit_AA_mask.shape[-1]))[:,0] #[B, 21]
1682
+ probs_masked = probs*(1.0-omit_AA_mask_gathered)
1683
+ probs = probs_masked/torch.sum(probs_masked, dim=-1, keepdim=True) #[B, 21]
1684
+ S_t = torch.multinomial(probs, 1)
1685
+ all_probs.scatter_(1, t[:,None,None].repeat(1,1,21), (chain_mask_gathered[:,:,None,]*probs[:,None,:]).float())
1686
+ S_true_gathered = torch.gather(S_true, 1, t[:,None])
1687
+ S_t = (S_t*chain_mask_gathered+S_true_gathered*(1.0-chain_mask_gathered)).long()
1688
+ temp1 = self.W_s(S_t)
1689
+ h_S.scatter_(1, t[:,None,None].repeat(1,1,temp1.shape[-1]), temp1)
1690
+ S.scatter_(1, t[:,None], S_t)
1691
+ output_dict = {"S": S, "probs": all_probs, "decoding_order": decoding_order}
1692
+ return output_dict
1693
+
1694
+
1695
+ def tied_sample(self, X, randn, S_true, chain_mask, chain_encoding_all, residue_idx, mask=None, temperature=1.0, omit_AAs_np=None, bias_AAs_np=None, chain_M_pos=None, omit_AA_mask=None, pssm_coef=None, pssm_bias=None, pssm_multi=None, pssm_log_odds_flag=None, pssm_log_odds_mask=None, pssm_bias_flag=None, tied_pos=None, tied_beta=None, bias_by_res=None):
1696
+ device = X.device
1697
+ # Prepare node and edge embeddings
1698
+ E, E_idx = self.features(X, mask, residue_idx, chain_encoding_all)
1699
+ h_V = torch.zeros((E.shape[0], E.shape[1], E.shape[-1]), device=device)
1700
+ h_E = self.W_e(E)
1701
+ # Encoder is unmasked self-attention
1702
+ mask_attend = gather_nodes(mask.unsqueeze(-1), E_idx).squeeze(-1)
1703
+ mask_attend = mask.unsqueeze(-1) * mask_attend
1704
+ for layer in self.encoder_layers:
1705
+ h_V, h_E = layer(h_V, h_E, E_idx, mask, mask_attend)
1706
+
1707
+ # Decoder uses masked self-attention
1708
+ chain_mask = chain_mask*chain_M_pos*mask #update chain_M to include missing regions
1709
+ decoding_order = torch.argsort((chain_mask+0.0001)*(torch.abs(randn))) #[numbers will be smaller for places where chain_M = 0.0 and higher for places where chain_M = 1.0]
1710
+
1711
+ new_decoding_order = []
1712
+ for t_dec in list(decoding_order[0,].cpu().data.numpy()):
1713
+ if t_dec not in list(itertools.chain(*new_decoding_order)):
1714
+ list_a = [item for item in tied_pos if t_dec in item]
1715
+ if list_a:
1716
+ new_decoding_order.append(list_a[0])
1717
+ else:
1718
+ new_decoding_order.append([t_dec])
1719
+ decoding_order = torch.tensor(list(itertools.chain(*new_decoding_order)), device=device)[None,].repeat(X.shape[0],1)
1720
+
1721
+ mask_size = E_idx.shape[1]
1722
+ permutation_matrix_reverse = torch.nn.functional.one_hot(decoding_order, num_classes=mask_size).float()
1723
+ order_mask_backward = torch.einsum('ij, biq, bjp->bqp',(1-torch.triu(torch.ones(mask_size,mask_size, device=device))), permutation_matrix_reverse, permutation_matrix_reverse)
1724
+ mask_attend = torch.gather(order_mask_backward, 2, E_idx).unsqueeze(-1)
1725
+ mask_1D = mask.view([mask.size(0), mask.size(1), 1, 1])
1726
+ mask_bw = mask_1D * mask_attend
1727
+ mask_fw = mask_1D * (1. - mask_attend)
1728
+
1729
+ N_batch, N_nodes = X.size(0), X.size(1)
1730
+ log_probs = torch.zeros((N_batch, N_nodes, 21), device=device)
1731
+ all_probs = torch.zeros((N_batch, N_nodes, 21), device=device, dtype=torch.float32)
1732
+ h_S = torch.zeros_like(h_V, device=device)
1733
+ S = torch.zeros((N_batch, N_nodes), dtype=torch.int64, device=device)
1734
+ h_V_stack = [h_V] + [torch.zeros_like(h_V, device=device) for _ in range(len(self.decoder_layers))]
1735
+ constant = torch.tensor(omit_AAs_np, device=device)
1736
+ constant_bias = torch.tensor(bias_AAs_np, device=device)
1737
+ omit_AA_mask_flag = omit_AA_mask is not None
1738
+
1739
+ h_EX_encoder = cat_neighbors_nodes(torch.zeros_like(h_S), h_E, E_idx)
1740
+ h_EXV_encoder = cat_neighbors_nodes(h_V, h_EX_encoder, E_idx)
1741
+ h_EXV_encoder_fw = mask_fw * h_EXV_encoder
1742
+ for t_list in new_decoding_order:
1743
+ logits = 0.0
1744
+ logit_list = []
1745
+ done_flag = False
1746
+ for t in t_list:
1747
+ if (mask[:,t]==0).all():
1748
+ S_t = S_true[:,t]
1749
+ for t in t_list:
1750
+ h_S[:,t,:] = self.W_s(S_t)
1751
+ S[:,t] = S_t
1752
+ done_flag = True
1753
+ break
1754
+ else:
1755
+ E_idx_t = E_idx[:,t:t+1,:]
1756
+ h_E_t = h_E[:,t:t+1,:,:]
1757
+ h_ES_t = cat_neighbors_nodes(h_S, h_E_t, E_idx_t)
1758
+ h_EXV_encoder_t = h_EXV_encoder_fw[:,t:t+1,:,:]
1759
+ mask_t = mask[:,t:t+1]
1760
+ for l, layer in enumerate(self.decoder_layers):
1761
+ h_ESV_decoder_t = cat_neighbors_nodes(h_V_stack[l], h_ES_t, E_idx_t)
1762
+ h_V_t = h_V_stack[l][:,t:t+1,:]
1763
+ h_ESV_t = mask_bw[:,t:t+1,:,:] * h_ESV_decoder_t + h_EXV_encoder_t
1764
+ h_V_stack[l+1][:,t,:] = layer(h_V_t, h_ESV_t, mask_V=mask_t).squeeze(1)
1765
+ h_V_t = h_V_stack[-1][:,t,:]
1766
+ logit_list.append((self.W_out(h_V_t) / temperature)/len(t_list))
1767
+ logits += tied_beta[t]*(self.W_out(h_V_t) / temperature)/len(t_list)
1768
+ if done_flag:
1769
+ pass
1770
+ else:
1771
+ bias_by_res_gathered = bias_by_res[:,t,:] #[B, 21]
1772
+ probs = F.softmax(logits-constant[None,:]*1e8+constant_bias[None,:]/temperature+bias_by_res_gathered/temperature, dim=-1)
1773
+ if pssm_bias_flag:
1774
+ pssm_coef_gathered = pssm_coef[:,t]
1775
+ pssm_bias_gathered = pssm_bias[:,t]
1776
+ probs = (1-pssm_multi*pssm_coef_gathered[:,None])*probs + pssm_multi*pssm_coef_gathered[:,None]*pssm_bias_gathered
1777
+ if pssm_log_odds_flag:
1778
+ pssm_log_odds_mask_gathered = pssm_log_odds_mask[:,t]
1779
+ probs_masked = probs*pssm_log_odds_mask_gathered
1780
+ probs_masked += probs * 0.001
1781
+ probs = probs_masked/torch.sum(probs_masked, dim=-1, keepdim=True) #[B, 21]
1782
+ if omit_AA_mask_flag:
1783
+ omit_AA_mask_gathered = omit_AA_mask[:,t]
1784
+ probs_masked = probs*(1.0-omit_AA_mask_gathered)
1785
+ probs = probs_masked/torch.sum(probs_masked, dim=-1, keepdim=True) #[B, 21]
1786
+ S_t_repeat = torch.multinomial(probs, 1).squeeze(-1)
1787
+ S_t_repeat = (chain_mask[:,t]*S_t_repeat + (1-chain_mask[:,t])*S_true[:,t]).long() #hard pick fixed positions
1788
+ for t in t_list:
1789
+ h_S[:,t,:] = self.W_s(S_t_repeat)
1790
+ S[:,t] = S_t_repeat
1791
+ all_probs[:,t,:] = probs.float()
1792
+ output_dict = {"S": S, "probs": all_probs, "decoding_order": decoding_order}
1793
+ return output_dict
1794
+
1795
+
1796
+ def conditional_probs(self, X, S, mask, chain_M, residue_idx, chain_encoding_all, randn, backbone_only=False):
1797
+ """ Graph-conditioned sequence model """
1798
+ device=X.device
1799
+ # Prepare node and edge embeddings
1800
+ E, E_idx = self.features(X, mask, residue_idx, chain_encoding_all)
1801
+ h_V_enc = torch.zeros((E.shape[0], E.shape[1], E.shape[-1]), device=E.device)
1802
+ h_E = self.W_e(E)
1803
+
1804
+ # Encoder is unmasked self-attention
1805
+ mask_attend = gather_nodes(mask.unsqueeze(-1), E_idx).squeeze(-1)
1806
+ mask_attend = mask.unsqueeze(-1) * mask_attend
1807
+ for layer in self.encoder_layers:
1808
+ h_V_enc, h_E = layer(h_V_enc, h_E, E_idx, mask, mask_attend)
1809
+
1810
+ # Concatenate sequence embeddings for autoregressive decoder
1811
+ h_S = self.W_s(S)
1812
+ h_ES = cat_neighbors_nodes(h_S, h_E, E_idx)
1813
+
1814
+ # Build encoder embeddings
1815
+ h_EX_encoder = cat_neighbors_nodes(torch.zeros_like(h_S), h_E, E_idx)
1816
+ h_EXV_encoder = cat_neighbors_nodes(h_V_enc, h_EX_encoder, E_idx)
1817
+
1818
+
1819
+ chain_M = chain_M*mask #update chain_M to include missing regions
1820
+
1821
+ chain_M_np = chain_M.cpu().numpy()
1822
+ idx_to_loop = np.argwhere(chain_M_np[0,:]==1)[:,0]
1823
+ log_conditional_probs = torch.zeros([X.shape[0], chain_M.shape[1], 21], device=device).float()
1824
+
1825
+ for idx in idx_to_loop:
1826
+ h_V = torch.clone(h_V_enc)
1827
+ order_mask = torch.zeros(chain_M.shape[1], device=device).float()
1828
+ if backbone_only:
1829
+ order_mask = torch.ones(chain_M.shape[1], device=device).float()
1830
+ order_mask[idx] = 0.
1831
+ else:
1832
+ order_mask = torch.zeros(chain_M.shape[1], device=device).float()
1833
+ order_mask[idx] = 1.
1834
+ decoding_order = torch.argsort((order_mask[None,]+0.0001)*(torch.abs(randn))) #[numbers will be smaller for places where chain_M = 0.0 and higher for places where chain_M = 1.0]
1835
+ mask_size = E_idx.shape[1]
1836
+ permutation_matrix_reverse = torch.nn.functional.one_hot(decoding_order, num_classes=mask_size).float()
1837
+ order_mask_backward = torch.einsum('ij, biq, bjp->bqp',(1-torch.triu(torch.ones(mask_size,mask_size, device=device))), permutation_matrix_reverse, permutation_matrix_reverse)
1838
+ mask_attend = torch.gather(order_mask_backward, 2, E_idx).unsqueeze(-1)
1839
+ mask_1D = mask.view([mask.size(0), mask.size(1), 1, 1])
1840
+ mask_bw = mask_1D * mask_attend
1841
+ mask_fw = mask_1D * (1. - mask_attend)
1842
+
1843
+ h_EXV_encoder_fw = mask_fw * h_EXV_encoder
1844
+ for layer in self.decoder_layers:
1845
+ # Masked positions attend to encoder information, unmasked positions see previously decoded positions.
1846
+ h_ESV = cat_neighbors_nodes(h_V, h_ES, E_idx)
1847
+ h_ESV = mask_bw * h_ESV + h_EXV_encoder_fw
1848
+ h_V = layer(h_V, h_ESV, mask)
1849
+
1850
+ logits = self.W_out(h_V)
1851
+ log_probs = F.log_softmax(logits, dim=-1)
1852
+ log_conditional_probs[:,idx,:] = log_probs[:,idx,:]
1853
+ return log_conditional_probs
1854
+
1855
+
1856
+ def unconditional_probs(self, X, mask, residue_idx, chain_encoding_all):
1857
+ """ Graph-conditioned sequence model """
1858
+ device=X.device
1859
+ # Prepare node and edge embeddings
1860
+ E, E_idx = self.features(X, mask, residue_idx, chain_encoding_all)
1861
+ h_V = torch.zeros((E.shape[0], E.shape[1], E.shape[-1]), device=E.device)
1862
+ h_E = self.W_e(E)
1863
+
1864
+ # Encoder is unmasked self-attention
1865
+ mask_attend = gather_nodes(mask.unsqueeze(-1), E_idx).squeeze(-1)
1866
+ mask_attend = mask.unsqueeze(-1) * mask_attend
1867
+ for layer in self.encoder_layers:
1868
+ h_V, h_E = layer(h_V, h_E, E_idx, mask, mask_attend)
1869
+
1870
+ # Build encoder embeddings
1871
+ h_EX_encoder = cat_neighbors_nodes(torch.zeros_like(h_V), h_E, E_idx)
1872
+ h_EXV_encoder = cat_neighbors_nodes(h_V, h_EX_encoder, E_idx)
1873
+
1874
+ order_mask_backward = torch.zeros([X.shape[0], X.shape[1], X.shape[1]], device=device)
1875
+ mask_attend = torch.gather(order_mask_backward, 2, E_idx).unsqueeze(-1)
1876
+ mask_1D = mask.view([mask.size(0), mask.size(1), 1, 1])
1877
+ mask_bw = mask_1D * mask_attend
1878
+ mask_fw = mask_1D * (1. - mask_attend)
1879
+
1880
+ h_EXV_encoder_fw = mask_fw * h_EXV_encoder
1881
+ for layer in self.decoder_layers:
1882
+ h_V = layer(h_V, h_EXV_encoder_fw, mask)
1883
+
1884
+ logits = self.W_out(h_V)
1885
+ log_probs = F.log_softmax(logits, dim=-1)
1886
+ return log_probs
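A shape-level smoke test for the full module above (hyperparameters are illustrative; helpers such as gather_nodes and cat_neighbors_nodes are assumed to be in scope as in this file):

import torch

B, L = 1, 10
model = ProteinMPNN(num_letters=21, node_features=128, edge_features=128,
                    hidden_dim=128, num_encoder_layers=3, num_decoder_layers=3,
                    k_neighbors=5, augment_eps=0.0)
X = torch.randn(B, L, 4, 3)                       # N, CA, C, O backbone coordinates
S = torch.randint(0, 21, (B, L))                  # sequence tokens
mask = torch.ones(B, L)
chain_M = torch.ones(B, L)                        # every position designable
residue_idx = torch.arange(L)[None].repeat(B, 1)
chain_encoding = torch.ones(B, L)
randn = torch.randn(B, L)                         # noise that sets the decoding order
log_probs = model(X, S, mask, chain_M, residue_idx, chain_encoding, randn)
print(log_probs.shape)  # torch.Size([1, 10, 21]) -- per-residue log-probabilities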
core/residue_constants.py ADDED
@@ -0,0 +1,1104 @@
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Constants used in AlphaFold.
16
+ Adapted from the original DeepMind code by alexechu.
17
+ """
18
+
19
+ import collections
20
+ import functools
21
+ import os
22
+ from typing import List, Mapping, Tuple
23
+
24
+ import numpy as np
25
+ import tree
26
+
27
+ # Internal import (35fd).
28
+
29
+
30
+ # Distance from one CA to next CA [trans configuration: omega = 180].
31
+ ca_ca = 3.80209737096
32
+
33
+ # Format: The list for each AA type contains chi1, chi2, chi3, chi4 in
34
+ # this order (or a relevant subset from chi1 onwards). ALA and GLY don't have
35
+ # chi angles so their chi angle lists are empty.
36
+ chi_angles_atoms = {
37
+ "ALA": [],
38
+ # Chi5 in arginine is always 0 +- 5 degrees, so ignore it.
39
+ "ARG": [
40
+ ["N", "CA", "CB", "CG"],
41
+ ["CA", "CB", "CG", "CD"],
42
+ ["CB", "CG", "CD", "NE"],
43
+ ["CG", "CD", "NE", "CZ"],
44
+ ],
45
+ "ASN": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "OD1"]],
46
+ "ASP": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "OD1"]],
47
+ "CYS": [["N", "CA", "CB", "SG"]],
48
+ "GLN": [
49
+ ["N", "CA", "CB", "CG"],
50
+ ["CA", "CB", "CG", "CD"],
51
+ ["CB", "CG", "CD", "OE1"],
52
+ ],
53
+ "GLU": [
54
+ ["N", "CA", "CB", "CG"],
55
+ ["CA", "CB", "CG", "CD"],
56
+ ["CB", "CG", "CD", "OE1"],
57
+ ],
58
+ "GLY": [],
59
+ "HIS": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "ND1"]],
60
+ "ILE": [["N", "CA", "CB", "CG1"], ["CA", "CB", "CG1", "CD1"]],
61
+ "LEU": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
62
+ "LYS": [
63
+ ["N", "CA", "CB", "CG"],
64
+ ["CA", "CB", "CG", "CD"],
65
+ ["CB", "CG", "CD", "CE"],
66
+ ["CG", "CD", "CE", "NZ"],
67
+ ],
68
+ "MET": [
69
+ ["N", "CA", "CB", "CG"],
70
+ ["CA", "CB", "CG", "SD"],
71
+ ["CB", "CG", "SD", "CE"],
72
+ ],
73
+ "PHE": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
74
+ "PRO": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"]],
75
+ "SER": [["N", "CA", "CB", "OG"]],
76
+ "THR": [["N", "CA", "CB", "OG1"]],
77
+ "TRP": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
78
+ "TYR": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
79
+ "VAL": [["N", "CA", "CB", "CG1"]],
80
+ }
81
+
82
+ # If chi angles given in fixed-length array, this matrix determines how to mask
83
+ # them for each AA type. The order is as per restype_order (see below).
84
+ chi_angles_mask = [
85
+ [0.0, 0.0, 0.0, 0.0], # ALA
86
+ [1.0, 1.0, 1.0, 1.0], # ARG
87
+ [1.0, 1.0, 0.0, 0.0], # ASN
88
+ [1.0, 1.0, 0.0, 0.0], # ASP
89
+ [1.0, 0.0, 0.0, 0.0], # CYS
90
+ [1.0, 1.0, 1.0, 0.0], # GLN
91
+ [1.0, 1.0, 1.0, 0.0], # GLU
92
+ [0.0, 0.0, 0.0, 0.0], # GLY
93
+ [1.0, 1.0, 0.0, 0.0], # HIS
94
+ [1.0, 1.0, 0.0, 0.0], # ILE
95
+ [1.0, 1.0, 0.0, 0.0], # LEU
96
+ [1.0, 1.0, 1.0, 1.0], # LYS
97
+ [1.0, 1.0, 1.0, 0.0], # MET
98
+ [1.0, 1.0, 0.0, 0.0], # PHE
99
+ [1.0, 1.0, 0.0, 0.0], # PRO
100
+ [1.0, 0.0, 0.0, 0.0], # SER
101
+ [1.0, 0.0, 0.0, 0.0], # THR
102
+ [1.0, 1.0, 0.0, 0.0], # TRP
103
+ [1.0, 1.0, 0.0, 0.0], # TYR
104
+ [1.0, 0.0, 0.0, 0.0], # VAL
105
+ ]
106
+
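A quick standalone consistency check on the two tables above (the restype order is spelled out by hand here, matching the alphabetical 3-letter ordering described below): the number of torsions listed per residue in chi_angles_atoms equals the number of 1.0 entries in its chi_angles_mask row.

order = ["ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
         "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL"]
for name, mask_row in zip(order, chi_angles_mask):
    assert len(chi_angles_atoms[name]) == int(sum(mask_row)), name
print("chi tables agree")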
107
+ # The following chi angles are pi periodic: they can be rotated by a multiple
108
+ # of pi without affecting the structure.
109
+ chi_pi_periodic = [
110
+ [0.0, 0.0, 0.0, 0.0], # ALA
111
+ [0.0, 0.0, 0.0, 0.0], # ARG
112
+ [0.0, 0.0, 0.0, 0.0], # ASN
113
+ [0.0, 1.0, 0.0, 0.0], # ASP
114
+ [0.0, 0.0, 0.0, 0.0], # CYS
115
+ [0.0, 0.0, 0.0, 0.0], # GLN
116
+ [0.0, 0.0, 1.0, 0.0], # GLU
117
+ [0.0, 0.0, 0.0, 0.0], # GLY
118
+ [0.0, 0.0, 0.0, 0.0], # HIS
119
+ [0.0, 0.0, 0.0, 0.0], # ILE
120
+ [0.0, 0.0, 0.0, 0.0], # LEU
121
+ [0.0, 0.0, 0.0, 0.0], # LYS
122
+ [0.0, 0.0, 0.0, 0.0], # MET
123
+ [0.0, 1.0, 0.0, 0.0], # PHE
124
+ [0.0, 0.0, 0.0, 0.0], # PRO
125
+ [0.0, 0.0, 0.0, 0.0], # SER
126
+ [0.0, 0.0, 0.0, 0.0], # THR
127
+ [0.0, 0.0, 0.0, 0.0], # TRP
128
+ [0.0, 1.0, 0.0, 0.0], # TYR
129
+ [0.0, 0.0, 0.0, 0.0], # VAL
130
+ [0.0, 0.0, 0.0, 0.0], # UNK
131
+ ]
132
+
133
+ # Atoms positions relative to the 8 rigid groups, defined by the pre-omega, phi,
134
+ # psi and chi angles:
135
+ # 0: 'backbone group',
136
+ # 1: 'pre-omega-group', (empty)
137
+ # 2: 'phi-group', (currently empty, because it defines only hydrogens)
138
+ # 3: 'psi-group',
139
+ # 4,5,6,7: 'chi1,2,3,4-group'
140
+ # The atom positions are relative to the axis-end-atom of the corresponding
141
+ # rotation axis. The x-axis is in direction of the rotation axis, and the y-axis
142
+ # is defined such that the dihedral-angle-defining atom (the last entry in
143
+ # chi_angles_atoms above) is in the xy-plane (with a positive y-coordinate).
144
+ # format: [atomname, group_idx, rel_position]
145
+ rigid_group_atom_positions = {
146
+ "ALA": [
147
+ ["N", 0, (-0.525, 1.363, 0.000)],
148
+ ["CA", 0, (0.000, 0.000, 0.000)],
149
+ ["C", 0, (1.526, -0.000, -0.000)],
150
+ ["CB", 0, (-0.529, -0.774, -1.205)],
151
+ ["O", 3, (0.627, 1.062, 0.000)],
152
+ ],
153
+ "ARG": [
154
+ ["N", 0, (-0.524, 1.362, -0.000)],
155
+ ["CA", 0, (0.000, 0.000, 0.000)],
156
+ ["C", 0, (1.525, -0.000, -0.000)],
157
+ ["CB", 0, (-0.524, -0.778, -1.209)],
158
+ ["O", 3, (0.626, 1.062, 0.000)],
159
+ ["CG", 4, (0.616, 1.390, -0.000)],
160
+ ["CD", 5, (0.564, 1.414, 0.000)],
161
+ ["NE", 6, (0.539, 1.357, -0.000)],
162
+ ["NH1", 7, (0.206, 2.301, 0.000)],
163
+ ["NH2", 7, (2.078, 0.978, -0.000)],
164
+ ["CZ", 7, (0.758, 1.093, -0.000)],
165
+ ],
166
+ "ASN": [
167
+ ["N", 0, (-0.536, 1.357, 0.000)],
168
+ ["CA", 0, (0.000, 0.000, 0.000)],
169
+ ["C", 0, (1.526, -0.000, -0.000)],
170
+ ["CB", 0, (-0.531, -0.787, -1.200)],
171
+ ["O", 3, (0.625, 1.062, 0.000)],
172
+ ["CG", 4, (0.584, 1.399, 0.000)],
173
+ ["ND2", 5, (0.593, -1.188, 0.001)],
174
+ ["OD1", 5, (0.633, 1.059, 0.000)],
175
+ ],
176
+ "ASP": [
177
+ ["N", 0, (-0.525, 1.362, -0.000)],
178
+ ["CA", 0, (0.000, 0.000, 0.000)],
179
+ ["C", 0, (1.527, 0.000, -0.000)],
180
+ ["CB", 0, (-0.526, -0.778, -1.208)],
181
+ ["O", 3, (0.626, 1.062, -0.000)],
182
+ ["CG", 4, (0.593, 1.398, -0.000)],
183
+ ["OD1", 5, (0.610, 1.091, 0.000)],
184
+ ["OD2", 5, (0.592, -1.101, -0.003)],
185
+ ],
186
+ "CYS": [
187
+ ["N", 0, (-0.522, 1.362, -0.000)],
188
+ ["CA", 0, (0.000, 0.000, 0.000)],
189
+ ["C", 0, (1.524, 0.000, 0.000)],
190
+ ["CB", 0, (-0.519, -0.773, -1.212)],
191
+ ["O", 3, (0.625, 1.062, -0.000)],
192
+ ["SG", 4, (0.728, 1.653, 0.000)],
193
+ ],
194
+ "GLN": [
195
+ ["N", 0, (-0.526, 1.361, -0.000)],
196
+ ["CA", 0, (0.000, 0.000, 0.000)],
197
+ ["C", 0, (1.526, 0.000, 0.000)],
198
+ ["CB", 0, (-0.525, -0.779, -1.207)],
199
+ ["O", 3, (0.626, 1.062, -0.000)],
200
+ ["CG", 4, (0.615, 1.393, 0.000)],
201
+ ["CD", 5, (0.587, 1.399, -0.000)],
202
+ ["NE2", 6, (0.593, -1.189, -0.001)],
203
+ ["OE1", 6, (0.634, 1.060, 0.000)],
204
+ ],
205
+ "GLU": [
206
+ ["N", 0, (-0.528, 1.361, 0.000)],
207
+ ["CA", 0, (0.000, 0.000, 0.000)],
208
+ ["C", 0, (1.526, -0.000, -0.000)],
209
+ ["CB", 0, (-0.526, -0.781, -1.207)],
210
+ ["O", 3, (0.626, 1.062, 0.000)],
211
+ ["CG", 4, (0.615, 1.392, 0.000)],
212
+ ["CD", 5, (0.600, 1.397, 0.000)],
213
+ ["OE1", 6, (0.607, 1.095, -0.000)],
214
+ ["OE2", 6, (0.589, -1.104, -0.001)],
215
+ ],
216
+ "GLY": [
217
+ ["N", 0, (-0.572, 1.337, 0.000)],
218
+ ["CA", 0, (0.000, 0.000, 0.000)],
219
+ ["C", 0, (1.517, -0.000, -0.000)],
220
+ ["O", 3, (0.626, 1.062, -0.000)],
221
+ ],
222
+ "HIS": [
223
+ ["N", 0, (-0.527, 1.360, 0.000)],
224
+ ["CA", 0, (0.000, 0.000, 0.000)],
225
+ ["C", 0, (1.525, 0.000, 0.000)],
226
+ ["CB", 0, (-0.525, -0.778, -1.208)],
227
+ ["O", 3, (0.625, 1.063, 0.000)],
228
+ ["CG", 4, (0.600, 1.370, -0.000)],
229
+ ["CD2", 5, (0.889, -1.021, 0.003)],
230
+ ["ND1", 5, (0.744, 1.160, -0.000)],
231
+ ["CE1", 5, (2.030, 0.851, 0.002)],
232
+ ["NE2", 5, (2.145, -0.466, 0.004)],
233
+ ],
234
+ "ILE": [
235
+ ["N", 0, (-0.493, 1.373, -0.000)],
236
+ ["CA", 0, (0.000, 0.000, 0.000)],
237
+ ["C", 0, (1.527, -0.000, -0.000)],
238
+ ["CB", 0, (-0.536, -0.793, -1.213)],
239
+ ["O", 3, (0.627, 1.062, -0.000)],
240
+ ["CG1", 4, (0.534, 1.437, -0.000)],
241
+ ["CG2", 4, (0.540, -0.785, -1.199)],
242
+ ["CD1", 5, (0.619, 1.391, 0.000)],
243
+ ],
244
+ "LEU": [
245
+ ["N", 0, (-0.520, 1.363, 0.000)],
246
+ ["CA", 0, (0.000, 0.000, 0.000)],
247
+ ["C", 0, (1.525, -0.000, -0.000)],
248
+ ["CB", 0, (-0.522, -0.773, -1.214)],
249
+ ["O", 3, (0.625, 1.063, -0.000)],
250
+ ["CG", 4, (0.678, 1.371, 0.000)],
251
+ ["CD1", 5, (0.530, 1.430, -0.000)],
252
+ ["CD2", 5, (0.535, -0.774, 1.200)],
253
+ ],
254
+ "LYS": [
255
+ ["N", 0, (-0.526, 1.362, -0.000)],
256
+ ["CA", 0, (0.000, 0.000, 0.000)],
257
+ ["C", 0, (1.526, 0.000, 0.000)],
258
+ ["CB", 0, (-0.524, -0.778, -1.208)],
259
+ ["O", 3, (0.626, 1.062, -0.000)],
260
+ ["CG", 4, (0.619, 1.390, 0.000)],
261
+ ["CD", 5, (0.559, 1.417, 0.000)],
262
+ ["CE", 6, (0.560, 1.416, 0.000)],
263
+ ["NZ", 7, (0.554, 1.387, 0.000)],
264
+ ],
265
+ "MET": [
266
+ ["N", 0, (-0.521, 1.364, -0.000)],
267
+ ["CA", 0, (0.000, 0.000, 0.000)],
268
+ ["C", 0, (1.525, 0.000, 0.000)],
269
+ ["CB", 0, (-0.523, -0.776, -1.210)],
270
+ ["O", 3, (0.625, 1.062, -0.000)],
271
+ ["CG", 4, (0.613, 1.391, -0.000)],
272
+ ["SD", 5, (0.703, 1.695, 0.000)],
273
+ ["CE", 6, (0.320, 1.786, -0.000)],
274
+ ],
275
+ "PHE": [
276
+ ["N", 0, (-0.518, 1.363, 0.000)],
277
+ ["CA", 0, (0.000, 0.000, 0.000)],
278
+ ["C", 0, (1.524, 0.000, -0.000)],
279
+ ["CB", 0, (-0.525, -0.776, -1.212)],
280
+ ["O", 3, (0.626, 1.062, -0.000)],
281
+ ["CG", 4, (0.607, 1.377, 0.000)],
282
+ ["CD1", 5, (0.709, 1.195, -0.000)],
283
+ ["CD2", 5, (0.706, -1.196, 0.000)],
284
+ ["CE1", 5, (2.102, 1.198, -0.000)],
285
+ ["CE2", 5, (2.098, -1.201, -0.000)],
286
+ ["CZ", 5, (2.794, -0.003, -0.001)],
287
+ ],
288
+ "PRO": [
289
+ ["N", 0, (-0.566, 1.351, -0.000)],
290
+ ["CA", 0, (0.000, 0.000, 0.000)],
291
+ ["C", 0, (1.527, -0.000, 0.000)],
292
+ ["CB", 0, (-0.546, -0.611, -1.293)],
293
+ ["O", 3, (0.621, 1.066, 0.000)],
294
+ ["CG", 4, (0.382, 1.445, 0.0)],
295
+ # ['CD', 5, (0.427, 1.440, 0.0)],
296
+ ["CD", 5, (0.477, 1.424, 0.0)], # manually made angle 2 degrees larger
297
+ ],
298
+ "SER": [
299
+ ["N", 0, (-0.529, 1.360, -0.000)],
300
+ ["CA", 0, (0.000, 0.000, 0.000)],
301
+ ["C", 0, (1.525, -0.000, -0.000)],
302
+ ["CB", 0, (-0.518, -0.777, -1.211)],
303
+ ["O", 3, (0.626, 1.062, -0.000)],
304
+ ["OG", 4, (0.503, 1.325, 0.000)],
305
+ ],
306
+ "THR": [
307
+ ["N", 0, (-0.517, 1.364, 0.000)],
308
+ ["CA", 0, (0.000, 0.000, 0.000)],
309
+ ["C", 0, (1.526, 0.000, -0.000)],
310
+ ["CB", 0, (-0.516, -0.793, -1.215)],
311
+ ["O", 3, (0.626, 1.062, 0.000)],
312
+ ["CG2", 4, (0.550, -0.718, -1.228)],
313
+ ["OG1", 4, (0.472, 1.353, 0.000)],
314
+ ],
315
+ "TRP": [
316
+ ["N", 0, (-0.521, 1.363, 0.000)],
317
+ ["CA", 0, (0.000, 0.000, 0.000)],
318
+ ["C", 0, (1.525, -0.000, 0.000)],
319
+ ["CB", 0, (-0.523, -0.776, -1.212)],
320
+ ["O", 3, (0.627, 1.062, 0.000)],
321
+ ["CG", 4, (0.609, 1.370, -0.000)],
322
+ ["CD1", 5, (0.824, 1.091, 0.000)],
323
+ ["CD2", 5, (0.854, -1.148, -0.005)],
324
+ ["CE2", 5, (2.186, -0.678, -0.007)],
325
+ ["CE3", 5, (0.622, -2.530, -0.007)],
326
+ ["NE1", 5, (2.140, 0.690, -0.004)],
327
+ ["CH2", 5, (3.028, -2.890, -0.013)],
328
+ ["CZ2", 5, (3.283, -1.543, -0.011)],
329
+ ["CZ3", 5, (1.715, -3.389, -0.011)],
330
+ ],
331
+ "TYR": [
332
+ ["N", 0, (-0.522, 1.362, 0.000)],
333
+ ["CA", 0, (0.000, 0.000, 0.000)],
334
+ ["C", 0, (1.524, -0.000, -0.000)],
335
+ ["CB", 0, (-0.522, -0.776, -1.213)],
336
+ ["O", 3, (0.627, 1.062, -0.000)],
337
+ ["CG", 4, (0.607, 1.382, -0.000)],
338
+ ["CD1", 5, (0.716, 1.195, -0.000)],
339
+ ["CD2", 5, (0.713, -1.194, -0.001)],
340
+ ["CE1", 5, (2.107, 1.200, -0.002)],
341
+ ["CE2", 5, (2.104, -1.201, -0.003)],
342
+ ["OH", 5, (4.168, -0.002, -0.005)],
343
+ ["CZ", 5, (2.791, -0.001, -0.003)],
344
+ ],
345
+ "VAL": [
346
+ ["N", 0, (-0.494, 1.373, -0.000)],
347
+ ["CA", 0, (0.000, 0.000, 0.000)],
348
+ ["C", 0, (1.527, -0.000, -0.000)],
349
+ ["CB", 0, (-0.533, -0.795, -1.213)],
350
+ ["O", 3, (0.627, 1.062, -0.000)],
351
+ ["CG1", 4, (0.540, 1.429, -0.000)],
352
+ ["CG2", 4, (0.533, -0.776, 1.203)],
353
+ ],
354
+ }
355
+
356
+ # A list of atoms (excluding hydrogen) for each AA type. PDB naming convention.
357
+ residue_atoms = {
358
+ "ALA": ["C", "CA", "CB", "N", "O"],
359
+ "ARG": ["C", "CA", "CB", "CG", "CD", "CZ", "N", "NE", "O", "NH1", "NH2"],
360
+ "ASP": ["C", "CA", "CB", "CG", "N", "O", "OD1", "OD2"],
361
+ "ASN": ["C", "CA", "CB", "CG", "N", "ND2", "O", "OD1"],
362
+ "CYS": ["C", "CA", "CB", "N", "O", "SG"],
363
+ "GLU": ["C", "CA", "CB", "CG", "CD", "N", "O", "OE1", "OE2"],
364
+ "GLN": ["C", "CA", "CB", "CG", "CD", "N", "NE2", "O", "OE1"],
365
+ "GLY": ["C", "CA", "N", "O"],
366
+ "HIS": ["C", "CA", "CB", "CG", "CD2", "CE1", "N", "ND1", "NE2", "O"],
367
+ "ILE": ["C", "CA", "CB", "CG1", "CG2", "CD1", "N", "O"],
368
+ "LEU": ["C", "CA", "CB", "CG", "CD1", "CD2", "N", "O"],
369
+ "LYS": ["C", "CA", "CB", "CG", "CD", "CE", "N", "NZ", "O"],
370
+ "MET": ["C", "CA", "CB", "CG", "CE", "N", "O", "SD"],
371
+ "PHE": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "N", "O"],
372
+ "PRO": ["C", "CA", "CB", "CG", "CD", "N", "O"],
373
+ "SER": ["C", "CA", "CB", "N", "O", "OG"],
374
+ "THR": ["C", "CA", "CB", "CG2", "N", "O", "OG1"],
375
+ "TRP": [
376
+ "C",
377
+ "CA",
378
+ "CB",
379
+ "CG",
380
+ "CD1",
381
+ "CD2",
382
+ "CE2",
383
+ "CE3",
384
+ "CZ2",
385
+ "CZ3",
386
+ "CH2",
387
+ "N",
388
+ "NE1",
389
+ "O",
390
+ ],
391
+ "TYR": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "N", "O", "OH"],
392
+ "VAL": ["C", "CA", "CB", "CG1", "CG2", "N", "O"],
393
+ }
394
+
395
+ # Naming swaps for ambiguous atom names.
396
+ # Due to symmetries in the amino acids the naming of atoms is ambiguous in
397
+ # 4 of the 20 amino acids.
398
+ # (The LDDT paper lists 7 amino acids as ambiguous, but the naming ambiguities
399
+ # in LEU, VAL and ARG can be resolved by using the 3d constellations of
400
+ # the 'ambiguous' atoms and their neighbours)
401
+ residue_atom_renaming_swaps = {
402
+ "ASP": {"OD1": "OD2"},
403
+ "GLU": {"OE1": "OE2"},
404
+ "PHE": {"CD1": "CD2", "CE1": "CE2"},
405
+ "TYR": {"CD1": "CD2", "CE1": "CE2"},
406
+ }
407
+
408
+ # Van der Waals radii [Angstroem] of the atoms (from Wikipedia)
409
+ van_der_waals_radius = {
410
+ "C": 1.7,
411
+ "N": 1.55,
412
+ "O": 1.52,
413
+ "S": 1.8,
414
+ }
415
+
416
+ Bond = collections.namedtuple("Bond", ["atom1_name", "atom2_name", "length", "stddev"])
417
+ BondAngle = collections.namedtuple(
418
+ "BondAngle", ["atom1_name", "atom2_name", "atom3name", "angle_rad", "stddev"]
419
+ )
420
+
421
+
422
+ @functools.lru_cache(maxsize=None)
423
+ def load_stereo_chemical_props() -> (
424
+ Tuple[
425
+ Mapping[str, List[Bond]],
426
+ Mapping[str, List[Bond]],
427
+ Mapping[str, List[BondAngle]],
428
+ ]
429
+ ):
430
+ """Load stereo_chemical_props.txt into a nice structure.
431
+
432
+ Load literature values for bond lengths and bond angles and translate
433
+ bond angles into the length of the opposite edge of the triangle
434
+ ("residue_virtual_bonds").
435
+
436
+ Returns:
437
+ residue_bonds: Dict that maps resname -> list of Bond tuples.
438
+ residue_virtual_bonds: Dict that maps resname -> list of Bond tuples.
439
+ residue_bond_angles: Dict that maps resname -> list of BondAngle tuples.
440
+ """
441
+ stereo_chemical_props_path = os.path.join(
442
+ os.path.dirname(os.path.abspath(__file__)), "stereo_chemical_props.txt"
443
+ )
444
+ with open(stereo_chemical_props_path, "rt") as f:
445
+ stereo_chemical_props = f.read()
446
+ lines_iter = iter(stereo_chemical_props.splitlines())
447
+ # Load bond lengths.
448
+ residue_bonds = {}
449
+ next(lines_iter) # Skip header line.
450
+ for line in lines_iter:
451
+ if line.strip() == "-":
452
+ break
453
+ bond, resname, length, stddev = line.split()
454
+ atom1, atom2 = bond.split("-")
455
+ if resname not in residue_bonds:
456
+ residue_bonds[resname] = []
457
+ residue_bonds[resname].append(Bond(atom1, atom2, float(length), float(stddev)))
458
+ residue_bonds["UNK"] = []
459
+
460
+ # Load bond angles.
461
+ residue_bond_angles = {}
462
+ next(lines_iter) # Skip empty line.
463
+ next(lines_iter) # Skip header line.
464
+ for line in lines_iter:
465
+ if line.strip() == "-":
466
+ break
467
+ bond, resname, angle_degree, stddev_degree = line.split()
468
+ atom1, atom2, atom3 = bond.split("-")
469
+ if resname not in residue_bond_angles:
470
+ residue_bond_angles[resname] = []
471
+ residue_bond_angles[resname].append(
472
+ BondAngle(
473
+ atom1,
474
+ atom2,
475
+ atom3,
476
+ float(angle_degree) / 180.0 * np.pi,
477
+ float(stddev_degree) / 180.0 * np.pi,
478
+ )
479
+ )
480
+ residue_bond_angles["UNK"] = []
481
+
482
+ def make_bond_key(atom1_name, atom2_name):
483
+ """Unique key to lookup bonds."""
484
+ return "-".join(sorted([atom1_name, atom2_name]))
485
+
486
+ # Translate bond angles into distances ("virtual bonds").
487
+ residue_virtual_bonds = {}
488
+ for resname, bond_angles in residue_bond_angles.items():
489
+ # Create a fast lookup dict for bond lengths.
490
+ bond_cache = {}
491
+ for b in residue_bonds[resname]:
492
+ bond_cache[make_bond_key(b.atom1_name, b.atom2_name)] = b
493
+ residue_virtual_bonds[resname] = []
494
+ for ba in bond_angles:
495
+ bond1 = bond_cache[make_bond_key(ba.atom1_name, ba.atom2_name)]
496
+ bond2 = bond_cache[make_bond_key(ba.atom2_name, ba.atom3name)]
497
+
498
+ # Compute distance between atom1 and atom3 using the law of cosines
499
+ # c^2 = a^2 + b^2 - 2ab*cos(gamma).
500
+ gamma = ba.angle_rad
501
+ length = np.sqrt(
502
+ bond1.length**2
503
+ + bond2.length**2
504
+ - 2 * bond1.length * bond2.length * np.cos(gamma)
505
+ )
506
+
507
+ # Propagation of uncertainty assuming uncorrelated errors.
508
+ dl_outer = 0.5 / length
509
+ dl_dgamma = (2 * bond1.length * bond2.length * np.sin(gamma)) * dl_outer
510
+ dl_db1 = (2 * bond1.length - 2 * bond2.length * np.cos(gamma)) * dl_outer
511
+ dl_db2 = (2 * bond2.length - 2 * bond1.length * np.cos(gamma)) * dl_outer
512
+ stddev = np.sqrt(
513
+ (dl_dgamma * ba.stddev) ** 2
514
+ + (dl_db1 * bond1.stddev) ** 2
515
+ + (dl_db2 * bond2.stddev) ** 2
516
+ )
517
+ residue_virtual_bonds[resname].append(
518
+ Bond(ba.atom1_name, ba.atom3name, length, stddev)
519
+ )
520
+
521
+ return (residue_bonds, residue_virtual_bonds, residue_bond_angles)
522
+
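The law-of-cosines conversion above is easy to check by hand (standalone sketch; the 1.5 A bonds and 109.5 degree angle are illustrative, not values from stereo_chemical_props.txt):

import numpy as np

b1, b2 = 1.5, 1.5                 # hypothetical adjacent bond lengths (Angstrom)
gamma = np.deg2rad(109.5)         # hypothetical atom1-atom2-atom3 angle
length = np.sqrt(b1**2 + b2**2 - 2 * b1 * b2 * np.cos(gamma))
print(round(float(length), 2))    # 2.45 -- the atom1-atom3 "virtual bond"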
523
+
524
+ # Between-residue bond lengths for general bonds (first element) and for Proline
525
+ # (second element).
526
+ between_res_bond_length_c_n = [1.329, 1.341]
527
+ between_res_bond_length_stddev_c_n = [0.014, 0.016]
528
+
529
+ # Between-residue cos_angles.
530
+ between_res_cos_angles_c_n_ca = [-0.5203, 0.0353] # degrees: 121.352 +- 2.315
531
+ between_res_cos_angles_ca_c_n = [-0.4473, 0.0311] # degrees: 116.568 +- 1.995
532
+
533
+ # This mapping is used when we need to store atom data in a format that requires
534
+ # fixed atom data size for every residue (e.g. a numpy array).
535
+ atom_types = [
536
+ "N",
537
+ "CA",
538
+ "C",
539
+ "CB",
540
+ "O",
541
+ "CG",
542
+ "CG1",
543
+ "CG2",
544
+ "OG",
545
+ "OG1",
546
+ "SG",
547
+ "CD",
548
+ "CD1",
549
+ "CD2",
550
+ "ND1",
551
+ "ND2",
552
+ "OD1",
553
+ "OD2",
554
+ "SD",
555
+ "CE",
556
+ "CE1",
557
+ "CE2",
558
+ "CE3",
559
+ "NE",
560
+ "NE1",
561
+ "NE2",
562
+ "OE1",
563
+ "OE2",
564
+ "CH2",
565
+ "NH1",
566
+ "NH2",
567
+ "OH",
568
+ "CZ",
569
+ "CZ2",
570
+ "CZ3",
571
+ "NZ",
572
+ "OXT",
573
+ ]
574
+ atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)}
575
+ atom_type_num = len(atom_types) # := 37.
576
+
577
+ # A compact atom encoding with 14 columns
578
+ # pylint: disable=line-too-long
579
+ # pylint: disable=bad-whitespace
580
+ restype_name_to_atom14_names = {
581
+ "ALA": ["N", "CA", "C", "O", "CB", "", "", "", "", "", "", "", "", ""],
582
+ "ARG": [
583
+ "N",
584
+ "CA",
585
+ "C",
586
+ "O",
587
+ "CB",
588
+ "CG",
589
+ "CD",
590
+ "NE",
591
+ "CZ",
592
+ "NH1",
593
+ "NH2",
594
+ "",
595
+ "",
596
+ "",
597
+ ],
598
+ "ASN": ["N", "CA", "C", "O", "CB", "CG", "OD1", "ND2", "", "", "", "", "", ""],
599
+ "ASP": ["N", "CA", "C", "O", "CB", "CG", "OD1", "OD2", "", "", "", "", "", ""],
600
+ "CYS": ["N", "CA", "C", "O", "CB", "SG", "", "", "", "", "", "", "", ""],
601
+ "GLN": ["N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "NE2", "", "", "", "", ""],
602
+ "GLU": ["N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "OE2", "", "", "", "", ""],
603
+ "GLY": ["N", "CA", "C", "O", "", "", "", "", "", "", "", "", "", ""],
604
+ "HIS": [
605
+ "N",
606
+ "CA",
607
+ "C",
608
+ "O",
609
+ "CB",
610
+ "CG",
611
+ "ND1",
612
+ "CD2",
613
+ "CE1",
614
+ "NE2",
615
+ "",
616
+ "",
617
+ "",
618
+ "",
619
+ ],
620
+ "ILE": ["N", "CA", "C", "O", "CB", "CG1", "CG2", "CD1", "", "", "", "", "", ""],
621
+ "LEU": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "", "", "", "", "", ""],
622
+ "LYS": ["N", "CA", "C", "O", "CB", "CG", "CD", "CE", "NZ", "", "", "", "", ""],
623
+ "MET": ["N", "CA", "C", "O", "CB", "CG", "SD", "CE", "", "", "", "", "", ""],
624
+ "PHE": [
625
+ "N",
626
+ "CA",
627
+ "C",
628
+ "O",
629
+ "CB",
630
+ "CG",
631
+ "CD1",
632
+ "CD2",
633
+ "CE1",
634
+ "CE2",
635
+ "CZ",
636
+ "",
637
+ "",
638
+ "",
639
+ ],
640
+ "PRO": ["N", "CA", "C", "O", "CB", "CG", "CD", "", "", "", "", "", "", ""],
641
+ "SER": ["N", "CA", "C", "O", "CB", "OG", "", "", "", "", "", "", "", ""],
642
+ "THR": ["N", "CA", "C", "O", "CB", "OG1", "CG2", "", "", "", "", "", "", ""],
643
+ "TRP": [
644
+ "N",
645
+ "CA",
646
+ "C",
647
+ "O",
648
+ "CB",
649
+ "CG",
650
+ "CD1",
651
+ "CD2",
652
+ "NE1",
653
+ "CE2",
654
+ "CE3",
655
+ "CZ2",
656
+ "CZ3",
657
+ "CH2",
658
+ ],
659
+ "TYR": [
660
+ "N",
661
+ "CA",
662
+ "C",
663
+ "O",
664
+ "CB",
665
+ "CG",
666
+ "CD1",
667
+ "CD2",
668
+ "CE1",
669
+ "CE2",
670
+ "CZ",
671
+ "OH",
672
+ "",
673
+ "",
674
+ ],
675
+ "VAL": ["N", "CA", "C", "O", "CB", "CG1", "CG2", "", "", "", "", "", "", ""],
676
+ "UNK": ["", "", "", "", "", "", "", "", "", "", "", "", "", ""],
677
+ }
678
+ # pylint: enable=line-too-long
679
+ # pylint: enable=bad-whitespace
680
+
681
+
682
+ # This is the standard residue order when coding AA type as a number.
683
+ # Reproduce it by taking 3-letter AA codes and sorting them alphabetically.
684
+ restypes = [
685
+ "A",
686
+ "R",
687
+ "N",
688
+ "D",
689
+ "C",
690
+ "Q",
691
+ "E",
692
+ "G",
693
+ "H",
694
+ "I",
695
+ "L",
696
+ "K",
697
+ "M",
698
+ "F",
699
+ "P",
700
+ "S",
701
+ "T",
702
+ "W",
703
+ "Y",
704
+ "V",
705
+ ]
706
+ restype_order = {restype: i for i, restype in enumerate(restypes)}
707
+ restype_num = len(restypes) # := 20.
708
+ unk_restype_index = restype_num # Catch-all index for unknown restypes.
709
+
710
+ restypes_with_x = restypes + ["X"]
711
+ restype_order_with_x = {restype: i for i, restype in enumerate(restypes_with_x)}
712
+
713
+
714
+ def sequence_to_onehot(
715
+ sequence: str, mapping: Mapping[str, int], map_unknown_to_x: bool = False
716
+ ) -> np.ndarray:
717
+ """Maps the given sequence into a one-hot encoded matrix.
718
+
719
+ Args:
720
+ sequence: An amino acid sequence.
721
+ mapping: A dictionary mapping amino acids to integers.
722
+ map_unknown_to_x: If True, any amino acid that is not in the mapping will be
723
+ mapped to the unknown amino acid 'X'. If the mapping doesn't contain
724
+ amino acid 'X', an error will be thrown. If False, any amino acid not in
725
+ the mapping will throw an error.
726
+
727
+ Returns:
728
+ A numpy array of shape (seq_len, num_unique_aas) with one-hot encoding of
729
+ the sequence.
730
+
731
+ Raises:
732
+ ValueError: If the mapping doesn't contain values from 0 to
733
+ num_unique_aas - 1 without any gaps.
734
+ """
735
+ num_entries = max(mapping.values()) + 1
736
+
737
+ if sorted(set(mapping.values())) != list(range(num_entries)):
738
+ raise ValueError(
739
+ "The mapping must have values from 0 to num_unique_aas-1 "
740
+ "without any gaps. Got: %s" % sorted(mapping.values())
741
+ )
742
+
743
+ one_hot_arr = np.zeros((len(sequence), num_entries), dtype=np.int32)
744
+
745
+ for aa_index, aa_type in enumerate(sequence):
746
+ if map_unknown_to_x:
747
+ if aa_type.isalpha() and aa_type.isupper():
748
+ aa_id = mapping.get(aa_type, mapping["X"])
749
+ else:
750
+ raise ValueError(f"Invalid character in the sequence: {aa_type}")
751
+ else:
752
+ aa_id = mapping[aa_type]
753
+ one_hot_arr[aa_index, aa_id] = 1
754
+
755
+ return one_hot_arr
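A brief usage sketch (assuming this module's names are in scope): with restype_order_with_x as the mapping and map_unknown_to_x enabled, a nonstandard letter such as 'B' falls back to the X column.

    onehot = sequence_to_onehot("ACDB", restype_order_with_x, map_unknown_to_x=True)
    assert onehot.shape == (4, 21)                    # 20 standard residues + X
    assert onehot[3, restype_order_with_x["X"]] == 1  # 'B' mapped to X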
756
+
757
+
758
+ restype_1to3 = {
759
+ "A": "ALA",
760
+ "R": "ARG",
761
+ "N": "ASN",
762
+ "D": "ASP",
763
+ "C": "CYS",
764
+ "Q": "GLN",
765
+ "E": "GLU",
766
+ "G": "GLY",
767
+ "H": "HIS",
768
+ "I": "ILE",
769
+ "L": "LEU",
770
+ "K": "LYS",
771
+ "M": "MET",
772
+ "F": "PHE",
773
+ "P": "PRO",
774
+ "S": "SER",
775
+ "T": "THR",
776
+ "W": "TRP",
777
+ "Y": "TYR",
778
+ "V": "VAL",
779
+ }
780
+
781
+
782
+ # NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a simple
783
+ # 1-to-1 mapping of 3 letter names to one letter names. The latter contains
784
+ # many more, and less common, three letter names as keys and maps many of these
785
+ # to the same one letter name (including 'X' and 'U' which we don't use here).
786
+ restype_3to1 = {v: k for k, v in restype_1to3.items()}
787
+
788
+ # Define a restype name for all unknown residues.
789
+ unk_restype = "UNK"
790
+
791
+ resnames = [restype_1to3[r] for r in restypes] + [unk_restype]
792
+ resname_to_idx = {resname: i for i, resname in enumerate(resnames)}
793
+
794
+
795
+ # Define exploded all-atom representation (atom73)
796
+ atom73_names = ['N', 'CA', 'C', 'CB', 'O']
797
+ for aa1 in restypes:
798
+ aa3 = restype_1to3[aa1]
799
+ atom_list = residue_atoms[aa3]
800
+ for atom in atom_types:
801
+ if atom in atom_list and atom not in atom73_names:
802
+ atom73_names.append(f'{aa1}{atom}')
803
+
804
+ atom73_names_to_idx = {a: i for i, a in enumerate(atom73_names)}
805
+
806
+ restype_atom73_mask = np.zeros((22, 73))
807
+ for i, restype in enumerate(restypes):
808
+ for atom_name in atom_types:
809
+ atom73_name = atom_name
810
+ if atom_name not in ['N', 'CA', 'C', 'CB', 'O']:
811
+ atom73_name = restype + atom_name
812
+ if atom73_name in atom73_names_to_idx:
813
+ atom73_idx = atom73_names_to_idx[atom73_name]
814
+ restype_atom73_mask[i, atom73_idx] = 1
815
+ # Remove CB for glycine
816
+ restype_atom73_mask[restype_order["G"], 3] = 0
817
+ # Backbone atoms for unk and mask
818
+ restype_atom73_mask[-2:, [0, 1, 2, 4]] = 1
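In atom73, the five shared backbone slots keep their plain names while sidechain slots are disambiguated by prefixing the one-letter residue code, e.g. leucine's CG occupies the slot named "LCG". A small lookup sketch (assuming the constants above are in scope):

    cg_idx = atom73_names_to_idx["LCG"]   # leucine-specific sidechain slot
    ca_idx = atom73_names_to_idx["CA"]    # shared backbone slot
    assert restype_atom73_mask[restype_order["L"], cg_idx] == 1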
819
+
820
+
821
+ # The mapping here uses hhblits convention, so that B is mapped to D, J and O
822
+ # are mapped to X, U is mapped to C, and Z is mapped to E. Other than that the
823
+ # remaining 20 amino acids are kept in alphabetical order.
824
+ # There are 2 non-amino acid codes, X (representing any amino acid) and
825
+ # "-" representing a missing amino acid in an alignment. The id for these
826
+ # codes is put at the end (20 and 21) so that they can easily be ignored if
827
+ # desired.
828
+ HHBLITS_AA_TO_ID = {
829
+ "A": 0,
830
+ "B": 2,
831
+ "C": 1,
832
+ "D": 2,
833
+ "E": 3,
834
+ "F": 4,
835
+ "G": 5,
836
+ "H": 6,
837
+ "I": 7,
838
+ "J": 20,
839
+ "K": 8,
840
+ "L": 9,
841
+ "M": 10,
842
+ "N": 11,
843
+ "O": 20,
844
+ "P": 12,
845
+ "Q": 13,
846
+ "R": 14,
847
+ "S": 15,
848
+ "T": 16,
849
+ "U": 1,
850
+ "V": 17,
851
+ "W": 18,
852
+ "X": 20,
853
+ "Y": 19,
854
+ "Z": 3,
855
+ "-": 21,
856
+ }
857
+
858
+ # Partial inversion of HHBLITS_AA_TO_ID.
859
+ ID_TO_HHBLITS_AA = {
860
+ 0: "A",
861
+ 1: "C", # Also U.
862
+ 2: "D", # Also B.
863
+ 3: "E", # Also Z.
864
+ 4: "F",
865
+ 5: "G",
866
+ 6: "H",
867
+ 7: "I",
868
+ 8: "K",
869
+ 9: "L",
870
+ 10: "M",
871
+ 11: "N",
872
+ 12: "P",
873
+ 13: "Q",
874
+ 14: "R",
875
+ 15: "S",
876
+ 16: "T",
877
+ 17: "V",
878
+ 18: "W",
879
+ 19: "Y",
880
+ 20: "X", # Includes J and O.
881
+ 21: "-",
882
+ }
883
+
884
+ restypes_with_x_and_gap = restypes + ["X", "-"]
885
+ MAP_HHBLITS_AATYPE_TO_OUR_AATYPE = tuple(
886
+ restypes_with_x_and_gap.index(ID_TO_HHBLITS_AA[i])
887
+ for i in range(len(restypes_with_x_and_gap))
888
+ )
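MAP_HHBLITS_AATYPE_TO_OUR_AATYPE is the permutation that re-sorts hhblits ids into this file's ordering; a quick remapping sketch (hypothetical usage, assuming this module's names are in scope):

    import numpy as np

    hh_ids = np.array([HHBLITS_AA_TO_ID[aa] for aa in "ACDE"])   # hhblits convention
    our_ids = np.take(MAP_HHBLITS_AATYPE_TO_OUR_AATYPE, hh_ids)  # this file's convention
    assert [restypes_with_x_and_gap[i] for i in our_ids] == list("ACDE")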
889
+
890
+
891
+ def _make_standard_atom_mask() -> np.ndarray:
892
+ """Returns [num_res_types, num_atom_types] mask array."""
893
+ # +1 to account for unknown (all 0s).
894
+ mask = np.zeros([restype_num + 1, atom_type_num], dtype=np.int32)
895
+ for restype, restype_letter in enumerate(restypes):
896
+ restype_name = restype_1to3[restype_letter]
897
+ atom_names = residue_atoms[restype_name]
898
+ for atom_name in atom_names:
899
+ atom_type = atom_order[atom_name]
900
+ mask[restype, atom_type] = 1
901
+ return mask
902
+
903
+
904
+ STANDARD_ATOM_MASK = _make_standard_atom_mask()
905
+
906
+
907
+ # A one hot representation for the first and second atoms defining the axis
908
+ # of rotation for each chi-angle in each residue.
909
+ def chi_angle_atom(atom_index: int) -> np.ndarray:
910
+ """Define chi-angle rigid groups via one-hot representations."""
911
+ chi_angles_index = {}
912
+ one_hots = []
913
+
914
+ for k, v in chi_angles_atoms.items():
915
+ indices = [atom_types.index(s[atom_index]) for s in v]
916
+ indices.extend([-1] * (4 - len(indices)))
917
+ chi_angles_index[k] = indices
918
+
919
+ for r in restypes:
920
+ res3 = restype_1to3[r]
921
+ one_hot = np.eye(atom_type_num)[chi_angles_index[res3]]
922
+ one_hots.append(one_hot)
923
+
924
+ one_hots.append(np.zeros([4, atom_type_num])) # Add zeros for residue `X`.
925
+ one_hot = np.stack(one_hots, axis=0)
926
+ one_hot = np.transpose(one_hot, [0, 2, 1])
927
+
928
+ return one_hot
929
+
930
+
931
+ chi_atom_1_one_hot = chi_angle_atom(1)
932
+ chi_atom_2_one_hot = chi_angle_atom(2)
933
+
934
+ # An array like chi_angles_atoms but using indices rather than names.
935
+ chi_angles_atom_indices = [chi_angles_atoms[restype_1to3[r]] for r in restypes]
936
+ chi_angles_atom_indices = tree.map_structure(
937
+ lambda atom_name: atom_order[atom_name], chi_angles_atom_indices
938
+ )
939
+ chi_angles_atom_indices = np.array(
940
+ [
941
+ chi_atoms + ([[0, 0, 0, 0]] * (4 - len(chi_atoms)))
942
+ for chi_atoms in chi_angles_atom_indices
943
+ ]
944
+ )
945
+
946
+ # Mapping from (res_name, atom_name) pairs to the atom's chi group index
947
+ # and atom index within that group.
948
+ chi_groups_for_atom = collections.defaultdict(list)
949
+ for res_name, chi_angle_atoms_for_res in chi_angles_atoms.items():
950
+ for chi_group_i, chi_group in enumerate(chi_angle_atoms_for_res):
951
+ for atom_i, atom in enumerate(chi_group):
952
+ chi_groups_for_atom[(res_name, atom)].append((chi_group_i, atom_i))
953
+ chi_groups_for_atom = dict(chi_groups_for_atom)
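chi_groups_for_atom answers "which chi angles move this atom": each (chi_group_i, atom_i) pair records that the atom is the atom_i-th member of chi group chi_group_i. A lookup sketch (assuming the standard chi_angles_atoms definitions used above):

    # Arginine's CD closes chi2 and helps define chi3 and chi4.
    assert chi_groups_for_atom[("ARG", "CD")] == [(1, 3), (2, 2), (3, 1)]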
954
+
955
+
956
+ def _make_rigid_transformation_4x4(ex, ey, translation):
957
+ """Create a rigid 4x4 transformation matrix from two axes and a translation."""
958
+ # Normalize ex.
959
+ ex_normalized = ex / np.linalg.norm(ex)
960
+
961
+ # make ey perpendicular to ex
962
+ ey_normalized = ey - np.dot(ey, ex_normalized) * ex_normalized
963
+ ey_normalized /= np.linalg.norm(ey_normalized)
964
+
965
+ # compute ez as cross product
966
+ eznorm = np.cross(ex_normalized, ey_normalized)
967
+ m = np.stack([ex_normalized, ey_normalized, eznorm, translation]).transpose()
968
+ m = np.concatenate([m, [[0.0, 0.0, 0.0, 1.0]]], axis=0)
969
+ return m
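The result is a homogeneous transform: the first three columns hold the orthonormal frame built from ex and ey, the fourth holds the translation. A sanity-check sketch (hypothetical inputs):

    import numpy as np

    m = _make_rigid_transformation_4x4(
        ex=np.array([1.0, 0.0, 0.0]),
        ey=np.array([0.0, 1.0, 0.0]),
        translation=np.array([5.0, 0.0, 0.0]),
    )
    origin = m @ np.array([0.0, 0.0, 0.0, 1.0])      # a point in homogeneous coordinates
    assert np.allclose(origin[:3], [5.0, 0.0, 0.0])  # frame origin maps to the translation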
970
+
971
+
972
+ # create an array with (restype, atomtype) --> rigid_group_idx
973
+ # and an array with (restype, atomtype, coord) for the atom positions
974
+ # and compute affine transformation matrices (4,4) from one rigid group to the
975
+ # previous group
976
+ restype_atom37_to_rigid_group = np.zeros([21, 37], dtype=int)
977
+ restype_atom37_mask = np.zeros([21, 37], dtype=np.float32)
978
+ restype_atom37_rigid_group_positions = np.zeros([21, 37, 3], dtype=np.float32)
979
+ restype_atom14_to_rigid_group = np.zeros([21, 14], dtype=int)
980
+ restype_atom14_mask = np.zeros([21, 14], dtype=np.float32)
981
+ restype_atom14_rigid_group_positions = np.zeros([21, 14, 3], dtype=np.float32)
982
+ restype_rigid_group_default_frame = np.zeros([21, 8, 4, 4], dtype=np.float32)
983
+
984
+
985
+ def _make_rigid_group_constants():
986
+ """Fill the arrays above."""
987
+ for restype, restype_letter in enumerate(restypes):
988
+ resname = restype_1to3[restype_letter]
989
+ for atomname, group_idx, atom_position in rigid_group_atom_positions[resname]:
990
+ atomtype = atom_order[atomname]
991
+ restype_atom37_to_rigid_group[restype, atomtype] = group_idx
992
+ restype_atom37_mask[restype, atomtype] = 1
993
+ restype_atom37_rigid_group_positions[restype, atomtype, :] = atom_position
994
+
995
+ atom14idx = restype_name_to_atom14_names[resname].index(atomname)
996
+ restype_atom14_to_rigid_group[restype, atom14idx] = group_idx
997
+ restype_atom14_mask[restype, atom14idx] = 1
998
+ restype_atom14_rigid_group_positions[restype, atom14idx, :] = atom_position
999
+
1000
+ for restype, restype_letter in enumerate(restypes):
1001
+ resname = restype_1to3[restype_letter]
1002
+ atom_positions = {
1003
+ name: np.array(pos) for name, _, pos in rigid_group_atom_positions[resname]
1004
+ }
1005
+
1006
+ # backbone to backbone is the identity transform
1007
+ restype_rigid_group_default_frame[restype, 0, :, :] = np.eye(4)
1008
+
1009
+ # pre-omega-frame to backbone (currently dummy identity matrix)
1010
+ restype_rigid_group_default_frame[restype, 1, :, :] = np.eye(4)
1011
+
1012
+ # phi-frame to backbone
1013
+ mat = _make_rigid_transformation_4x4(
1014
+ ex=atom_positions["N"] - atom_positions["CA"],
1015
+ ey=np.array([1.0, 0.0, 0.0]),
1016
+ translation=atom_positions["N"],
1017
+ )
1018
+ restype_rigid_group_default_frame[restype, 2, :, :] = mat
1019
+
1020
+ # psi-frame to backbone
1021
+ mat = _make_rigid_transformation_4x4(
1022
+ ex=atom_positions["C"] - atom_positions["CA"],
1023
+ ey=atom_positions["CA"] - atom_positions["N"],
1024
+ translation=atom_positions["C"],
1025
+ )
1026
+ restype_rigid_group_default_frame[restype, 3, :, :] = mat
1027
+
1028
+ # chi1-frame to backbone
1029
+ if chi_angles_mask[restype][0]:
1030
+ base_atom_names = chi_angles_atoms[resname][0]
1031
+ base_atom_positions = [atom_positions[name] for name in base_atom_names]
1032
+ mat = _make_rigid_transformation_4x4(
1033
+ ex=base_atom_positions[2] - base_atom_positions[1],
1034
+ ey=base_atom_positions[0] - base_atom_positions[1],
1035
+ translation=base_atom_positions[2],
1036
+ )
1037
+ restype_rigid_group_default_frame[restype, 4, :, :] = mat
1038
+
1039
+ # chi2-frame to chi1-frame
1040
+ # chi3-frame to chi2-frame
1041
+ # chi4-frame to chi3-frame
1042
+ # luckily all rotation axes for the next frame start at (0,0,0) of the
1043
+ # previous frame
1044
+ for chi_idx in range(1, 4):
1045
+ if chi_angles_mask[restype][chi_idx]:
1046
+ axis_end_atom_name = chi_angles_atoms[resname][chi_idx][2]
1047
+ axis_end_atom_position = atom_positions[axis_end_atom_name]
1048
+ mat = _make_rigid_transformation_4x4(
1049
+ ex=axis_end_atom_position,
1050
+ ey=np.array([-1.0, 0.0, 0.0]),
1051
+ translation=axis_end_atom_position,
1052
+ )
1053
+ restype_rigid_group_default_frame[restype, 4 + chi_idx, :, :] = mat
1054
+
1055
+
1056
+ _make_rigid_group_constants()
1057
+
1058
+
1059
+ def make_atom14_dists_bounds(overlap_tolerance=1.5, bond_length_tolerance_factor=15):
1060
+ """Compute upper and lower bounds for bonds to assess violations."""
1061
+ restype_atom14_bond_lower_bound = np.zeros([21, 14, 14], np.float32)
1062
+ restype_atom14_bond_upper_bound = np.zeros([21, 14, 14], np.float32)
1063
+ restype_atom14_bond_stddev = np.zeros([21, 14, 14], np.float32)
1064
+ residue_bonds, residue_virtual_bonds, _ = load_stereo_chemical_props()
1065
+ for restype, restype_letter in enumerate(restypes):
1066
+ resname = restype_1to3[restype_letter]
1067
+ atom_list = restype_name_to_atom14_names[resname]
1068
+
1069
+ # create lower and upper bounds for clashes
1070
+ for atom1_idx, atom1_name in enumerate(atom_list):
1071
+ if not atom1_name:
1072
+ continue
1073
+ atom1_radius = van_der_waals_radius[atom1_name[0]]
1074
+ for atom2_idx, atom2_name in enumerate(atom_list):
1075
+ if (not atom2_name) or atom1_idx == atom2_idx:
1076
+ continue
1077
+ atom2_radius = van_der_waals_radius[atom2_name[0]]
1078
+ lower = atom1_radius + atom2_radius - overlap_tolerance
1079
+ upper = 1e10
1080
+ restype_atom14_bond_lower_bound[restype, atom1_idx, atom2_idx] = lower
1081
+ restype_atom14_bond_lower_bound[restype, atom2_idx, atom1_idx] = lower
1082
+ restype_atom14_bond_upper_bound[restype, atom1_idx, atom2_idx] = upper
1083
+ restype_atom14_bond_upper_bound[restype, atom2_idx, atom1_idx] = upper
1084
+
1085
+ # overwrite lower and upper bounds for bonds and angles
1086
+ for b in residue_bonds[resname] + residue_virtual_bonds[resname]:
1087
+ atom1_idx = atom_list.index(b.atom1_name)
1088
+ atom2_idx = atom_list.index(b.atom2_name)
1089
+ lower = b.length - bond_length_tolerance_factor * b.stddev
1090
+ upper = b.length + bond_length_tolerance_factor * b.stddev
1091
+ restype_atom14_bond_lower_bound[restype, atom1_idx, atom2_idx] = lower
1092
+ restype_atom14_bond_lower_bound[restype, atom2_idx, atom1_idx] = lower
1093
+ restype_atom14_bond_upper_bound[restype, atom1_idx, atom2_idx] = upper
1094
+ restype_atom14_bond_upper_bound[restype, atom2_idx, atom1_idx] = upper
1095
+ restype_atom14_bond_stddev[restype, atom1_idx, atom2_idx] = b.stddev
1096
+ restype_atom14_bond_stddev[restype, atom2_idx, atom1_idx] = b.stddev
1097
+ return {
1098
+ "lower_bound": restype_atom14_bond_lower_bound, # shape (21,14,14)
1099
+ "upper_bound": restype_atom14_bond_upper_bound, # shape (21,14,14)
1100
+ "stddev": restype_atom14_bond_stddev, # shape (21,14,14)
1101
+ }
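A usage sketch (hedged; values come from the stereo_chemical_props.txt tables loaded via load_stereo_chemical_props): the returned tensors give, per residue type and atom14 pair, the allowed distance window used to flag violations.

    bounds = make_atom14_dists_bounds(overlap_tolerance=1.5, bond_length_tolerance_factor=15)
    ala = resname_to_idx["ALA"]
    n_i = restype_name_to_atom14_names["ALA"].index("N")
    ca_i = restype_name_to_atom14_names["ALA"].index("CA")
    # N-CA in ALA is 1.459 +/- 0.020, so the window is 1.459 +/- 15 * 0.020
    assert bounds["lower_bound"][ala, n_i, ca_i] < 1.459 < bounds["upper_bound"][ala, n_i, ca_i]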
1102
+
1103
+
1104
+ standard_residue_bonds, _, standard_residue_bond_angles = load_stereo_chemical_props()
core/stereo_chemical_props.txt ADDED
@@ -0,0 +1,345 @@
1
+ Bond Residue Mean StdDev
2
+ CA-CB ALA 1.520 0.021
3
+ N-CA ALA 1.459 0.020
4
+ CA-C ALA 1.525 0.026
5
+ C-O ALA 1.229 0.019
6
+ CA-CB ARG 1.535 0.022
7
+ CB-CG ARG 1.521 0.027
8
+ CG-CD ARG 1.515 0.025
9
+ CD-NE ARG 1.460 0.017
10
+ NE-CZ ARG 1.326 0.013
11
+ CZ-NH1 ARG 1.326 0.013
12
+ CZ-NH2 ARG 1.326 0.013
13
+ N-CA ARG 1.459 0.020
14
+ CA-C ARG 1.525 0.026
15
+ C-O ARG 1.229 0.019
16
+ CA-CB ASN 1.527 0.026
17
+ CB-CG ASN 1.506 0.023
18
+ CG-OD1 ASN 1.235 0.022
19
+ CG-ND2 ASN 1.324 0.025
20
+ N-CA ASN 1.459 0.020
21
+ CA-C ASN 1.525 0.026
22
+ C-O ASN 1.229 0.019
23
+ CA-CB ASP 1.535 0.022
24
+ CB-CG ASP 1.513 0.021
25
+ CG-OD1 ASP 1.249 0.023
26
+ CG-OD2 ASP 1.249 0.023
27
+ N-CA ASP 1.459 0.020
28
+ CA-C ASP 1.525 0.026
29
+ C-O ASP 1.229 0.019
30
+ CA-CB CYS 1.526 0.013
31
+ CB-SG CYS 1.812 0.016
32
+ N-CA CYS 1.459 0.020
33
+ CA-C CYS 1.525 0.026
34
+ C-O CYS 1.229 0.019
35
+ CA-CB GLU 1.535 0.022
36
+ CB-CG GLU 1.517 0.019
37
+ CG-CD GLU 1.515 0.015
38
+ CD-OE1 GLU 1.252 0.011
39
+ CD-OE2 GLU 1.252 0.011
40
+ N-CA GLU 1.459 0.020
41
+ CA-C GLU 1.525 0.026
42
+ C-O GLU 1.229 0.019
43
+ CA-CB GLN 1.535 0.022
44
+ CB-CG GLN 1.521 0.027
45
+ CG-CD GLN 1.506 0.023
46
+ CD-OE1 GLN 1.235 0.022
47
+ CD-NE2 GLN 1.324 0.025
48
+ N-CA GLN 1.459 0.020
49
+ CA-C GLN 1.525 0.026
50
+ C-O GLN 1.229 0.019
51
+ N-CA GLY 1.456 0.015
52
+ CA-C GLY 1.514 0.016
53
+ C-O GLY 1.232 0.016
54
+ CA-CB HIS 1.535 0.022
55
+ CB-CG HIS 1.492 0.016
56
+ CG-ND1 HIS 1.369 0.015
57
+ CG-CD2 HIS 1.353 0.017
58
+ ND1-CE1 HIS 1.343 0.025
59
+ CD2-NE2 HIS 1.415 0.021
60
+ CE1-NE2 HIS 1.322 0.023
61
+ N-CA HIS 1.459 0.020
62
+ CA-C HIS 1.525 0.026
63
+ C-O HIS 1.229 0.019
64
+ CA-CB ILE 1.544 0.023
65
+ CB-CG1 ILE 1.536 0.028
66
+ CB-CG2 ILE 1.524 0.031
67
+ CG1-CD1 ILE 1.500 0.069
68
+ N-CA ILE 1.459 0.020
69
+ CA-C ILE 1.525 0.026
70
+ C-O ILE 1.229 0.019
71
+ CA-CB LEU 1.533 0.023
72
+ CB-CG LEU 1.521 0.029
73
+ CG-CD1 LEU 1.514 0.037
74
+ CG-CD2 LEU 1.514 0.037
75
+ N-CA LEU 1.459 0.020
76
+ CA-C LEU 1.525 0.026
77
+ C-O LEU 1.229 0.019
78
+ CA-CB LYS 1.535 0.022
79
+ CB-CG LYS 1.521 0.027
80
+ CG-CD LYS 1.520 0.034
81
+ CD-CE LYS 1.508 0.025
82
+ CE-NZ LYS 1.486 0.025
83
+ N-CA LYS 1.459 0.020
84
+ CA-C LYS 1.525 0.026
85
+ C-O LYS 1.229 0.019
86
+ CA-CB MET 1.535 0.022
87
+ CB-CG MET 1.509 0.032
88
+ CG-SD MET 1.807 0.026
89
+ SD-CE MET 1.774 0.056
90
+ N-CA MET 1.459 0.020
91
+ CA-C MET 1.525 0.026
92
+ C-O MET 1.229 0.019
93
+ CA-CB PHE 1.535 0.022
94
+ CB-CG PHE 1.509 0.017
95
+ CG-CD1 PHE 1.383 0.015
96
+ CG-CD2 PHE 1.383 0.015
97
+ CD1-CE1 PHE 1.388 0.020
98
+ CD2-CE2 PHE 1.388 0.020
99
+ CE1-CZ PHE 1.369 0.019
100
+ CE2-CZ PHE 1.369 0.019
101
+ N-CA PHE 1.459 0.020
102
+ CA-C PHE 1.525 0.026
103
+ C-O PHE 1.229 0.019
104
+ CA-CB PRO 1.531 0.020
105
+ CB-CG PRO 1.495 0.050
106
+ CG-CD PRO 1.502 0.033
107
+ CD-N PRO 1.474 0.014
108
+ N-CA PRO 1.468 0.017
109
+ CA-C PRO 1.524 0.020
110
+ C-O PRO 1.228 0.020
111
+ CA-CB SER 1.525 0.015
112
+ CB-OG SER 1.418 0.013
113
+ N-CA SER 1.459 0.020
114
+ CA-C SER 1.525 0.026
115
+ C-O SER 1.229 0.019
116
+ CA-CB THR 1.529 0.026
117
+ CB-OG1 THR 1.428 0.020
118
+ CB-CG2 THR 1.519 0.033
119
+ N-CA THR 1.459 0.020
120
+ CA-C THR 1.525 0.026
121
+ C-O THR 1.229 0.019
122
+ CA-CB TRP 1.535 0.022
123
+ CB-CG TRP 1.498 0.018
124
+ CG-CD1 TRP 1.363 0.014
125
+ CG-CD2 TRP 1.432 0.017
126
+ CD1-NE1 TRP 1.375 0.017
127
+ NE1-CE2 TRP 1.371 0.013
128
+ CD2-CE2 TRP 1.409 0.012
129
+ CD2-CE3 TRP 1.399 0.015
130
+ CE2-CZ2 TRP 1.393 0.017
131
+ CE3-CZ3 TRP 1.380 0.017
132
+ CZ2-CH2 TRP 1.369 0.019
133
+ CZ3-CH2 TRP 1.396 0.016
134
+ N-CA TRP 1.459 0.020
135
+ CA-C TRP 1.525 0.026
136
+ C-O TRP 1.229 0.019
137
+ CA-CB TYR 1.535 0.022
138
+ CB-CG TYR 1.512 0.015
139
+ CG-CD1 TYR 1.387 0.013
140
+ CG-CD2 TYR 1.387 0.013
141
+ CD1-CE1 TYR 1.389 0.015
142
+ CD2-CE2 TYR 1.389 0.015
143
+ CE1-CZ TYR 1.381 0.013
144
+ CE2-CZ TYR 1.381 0.013
145
+ CZ-OH TYR 1.374 0.017
146
+ N-CA TYR 1.459 0.020
147
+ CA-C TYR 1.525 0.026
148
+ C-O TYR 1.229 0.019
149
+ CA-CB VAL 1.543 0.021
150
+ CB-CG1 VAL 1.524 0.021
151
+ CB-CG2 VAL 1.524 0.021
152
+ N-CA VAL 1.459 0.020
153
+ CA-C VAL 1.525 0.026
154
+ C-O VAL 1.229 0.019
155
+ -
156
+
157
+ Angle Residue Mean StdDev
158
+ N-CA-CB ALA 110.1 1.4
159
+ CB-CA-C ALA 110.1 1.5
160
+ N-CA-C ALA 111.0 2.7
161
+ CA-C-O ALA 120.1 2.1
162
+ N-CA-CB ARG 110.6 1.8
163
+ CB-CA-C ARG 110.4 2.0
164
+ CA-CB-CG ARG 113.4 2.2
165
+ CB-CG-CD ARG 111.6 2.6
166
+ CG-CD-NE ARG 111.8 2.1
167
+ CD-NE-CZ ARG 123.6 1.4
168
+ NE-CZ-NH1 ARG 120.3 0.5
169
+ NE-CZ-NH2 ARG 120.3 0.5
170
+ NH1-CZ-NH2 ARG 119.4 1.1
171
+ N-CA-C ARG 111.0 2.7
172
+ CA-C-O ARG 120.1 2.1
173
+ N-CA-CB ASN 110.6 1.8
174
+ CB-CA-C ASN 110.4 2.0
175
+ CA-CB-CG ASN 113.4 2.2
176
+ CB-CG-ND2 ASN 116.7 2.4
177
+ CB-CG-OD1 ASN 121.6 2.0
178
+ ND2-CG-OD1 ASN 121.9 2.3
179
+ N-CA-C ASN 111.0 2.7
180
+ CA-C-O ASN 120.1 2.1
181
+ N-CA-CB ASP 110.6 1.8
182
+ CB-CA-C ASP 110.4 2.0
183
+ CA-CB-CG ASP 113.4 2.2
184
+ CB-CG-OD1 ASP 118.3 0.9
185
+ CB-CG-OD2 ASP 118.3 0.9
186
+ OD1-CG-OD2 ASP 123.3 1.9
187
+ N-CA-C ASP 111.0 2.7
188
+ CA-C-O ASP 120.1 2.1
189
+ N-CA-CB CYS 110.8 1.5
190
+ CB-CA-C CYS 111.5 1.2
191
+ CA-CB-SG CYS 114.2 1.1
192
+ N-CA-C CYS 111.0 2.7
193
+ CA-C-O CYS 120.1 2.1
194
+ N-CA-CB GLU 110.6 1.8
195
+ CB-CA-C GLU 110.4 2.0
196
+ CA-CB-CG GLU 113.4 2.2
197
+ CB-CG-CD GLU 114.2 2.7
198
+ CG-CD-OE1 GLU 118.3 2.0
199
+ CG-CD-OE2 GLU 118.3 2.0
200
+ OE1-CD-OE2 GLU 123.3 1.2
201
+ N-CA-C GLU 111.0 2.7
202
+ CA-C-O GLU 120.1 2.1
203
+ N-CA-CB GLN 110.6 1.8
204
+ CB-CA-C GLN 110.4 2.0
205
+ CA-CB-CG GLN 113.4 2.2
206
+ CB-CG-CD GLN 111.6 2.6
207
+ CG-CD-OE1 GLN 121.6 2.0
208
+ CG-CD-NE2 GLN 116.7 2.4
209
+ OE1-CD-NE2 GLN 121.9 2.3
210
+ N-CA-C GLN 111.0 2.7
211
+ CA-C-O GLN 120.1 2.1
212
+ N-CA-C GLY 113.1 2.5
213
+ CA-C-O GLY 120.6 1.8
214
+ N-CA-CB HIS 110.6 1.8
215
+ CB-CA-C HIS 110.4 2.0
216
+ CA-CB-CG HIS 113.6 1.7
217
+ CB-CG-ND1 HIS 123.2 2.5
218
+ CB-CG-CD2 HIS 130.8 3.1
219
+ CG-ND1-CE1 HIS 108.2 1.4
220
+ ND1-CE1-NE2 HIS 109.9 2.2
221
+ CE1-NE2-CD2 HIS 106.6 2.5
222
+ NE2-CD2-CG HIS 109.2 1.9
223
+ CD2-CG-ND1 HIS 106.0 1.4
224
+ N-CA-C HIS 111.0 2.7
225
+ CA-C-O HIS 120.1 2.1
226
+ N-CA-CB ILE 110.8 2.3
227
+ CB-CA-C ILE 111.6 2.0
228
+ CA-CB-CG1 ILE 111.0 1.9
229
+ CB-CG1-CD1 ILE 113.9 2.8
230
+ CA-CB-CG2 ILE 110.9 2.0
231
+ CG1-CB-CG2 ILE 111.4 2.2
232
+ N-CA-C ILE 111.0 2.7
233
+ CA-C-O ILE 120.1 2.1
234
+ N-CA-CB LEU 110.4 2.0
235
+ CB-CA-C LEU 110.2 1.9
236
+ CA-CB-CG LEU 115.3 2.3
237
+ CB-CG-CD1 LEU 111.0 1.7
238
+ CB-CG-CD2 LEU 111.0 1.7
239
+ CD1-CG-CD2 LEU 110.5 3.0
240
+ N-CA-C LEU 111.0 2.7
241
+ CA-C-O LEU 120.1 2.1
242
+ N-CA-CB LYS 110.6 1.8
243
+ CB-CA-C LYS 110.4 2.0
244
+ CA-CB-CG LYS 113.4 2.2
245
+ CB-CG-CD LYS 111.6 2.6
246
+ CG-CD-CE LYS 111.9 3.0
247
+ CD-CE-NZ LYS 111.7 2.3
248
+ N-CA-C LYS 111.0 2.7
249
+ CA-C-O LYS 120.1 2.1
250
+ N-CA-CB MET 110.6 1.8
251
+ CB-CA-C MET 110.4 2.0
252
+ CA-CB-CG MET 113.3 1.7
253
+ CB-CG-SD MET 112.4 3.0
254
+ CG-SD-CE MET 100.2 1.6
255
+ N-CA-C MET 111.0 2.7
256
+ CA-C-O MET 120.1 2.1
257
+ N-CA-CB PHE 110.6 1.8
258
+ CB-CA-C PHE 110.4 2.0
259
+ CA-CB-CG PHE 113.9 2.4
260
+ CB-CG-CD1 PHE 120.8 0.7
261
+ CB-CG-CD2 PHE 120.8 0.7
262
+ CD1-CG-CD2 PHE 118.3 1.3
263
+ CG-CD1-CE1 PHE 120.8 1.1
264
+ CG-CD2-CE2 PHE 120.8 1.1
265
+ CD1-CE1-CZ PHE 120.1 1.2
266
+ CD2-CE2-CZ PHE 120.1 1.2
267
+ CE1-CZ-CE2 PHE 120.0 1.8
268
+ N-CA-C PHE 111.0 2.7
269
+ CA-C-O PHE 120.1 2.1
270
+ N-CA-CB PRO 103.3 1.2
271
+ CB-CA-C PRO 111.7 2.1
272
+ CA-CB-CG PRO 104.8 1.9
273
+ CB-CG-CD PRO 106.5 3.9
274
+ CG-CD-N PRO 103.2 1.5
275
+ CA-N-CD PRO 111.7 1.4
276
+ N-CA-C PRO 112.1 2.6
277
+ CA-C-O PRO 120.2 2.4
278
+ N-CA-CB SER 110.5 1.5
279
+ CB-CA-C SER 110.1 1.9
280
+ CA-CB-OG SER 111.2 2.7
281
+ N-CA-C SER 111.0 2.7
282
+ CA-C-O SER 120.1 2.1
283
+ N-CA-CB THR 110.3 1.9
284
+ CB-CA-C THR 111.6 2.7
285
+ CA-CB-OG1 THR 109.0 2.1
286
+ CA-CB-CG2 THR 112.4 1.4
287
+ OG1-CB-CG2 THR 110.0 2.3
288
+ N-CA-C THR 111.0 2.7
289
+ CA-C-O THR 120.1 2.1
290
+ N-CA-CB TRP 110.6 1.8
291
+ CB-CA-C TRP 110.4 2.0
292
+ CA-CB-CG TRP 113.7 1.9
293
+ CB-CG-CD1 TRP 127.0 1.3
294
+ CB-CG-CD2 TRP 126.6 1.3
295
+ CD1-CG-CD2 TRP 106.3 0.8
296
+ CG-CD1-NE1 TRP 110.1 1.0
297
+ CD1-NE1-CE2 TRP 109.0 0.9
298
+ NE1-CE2-CD2 TRP 107.3 1.0
299
+ CE2-CD2-CG TRP 107.3 0.8
300
+ CG-CD2-CE3 TRP 133.9 0.9
301
+ NE1-CE2-CZ2 TRP 130.4 1.1
302
+ CE3-CD2-CE2 TRP 118.7 1.2
303
+ CD2-CE2-CZ2 TRP 122.3 1.2
304
+ CE2-CZ2-CH2 TRP 117.4 1.0
305
+ CZ2-CH2-CZ3 TRP 121.6 1.2
306
+ CH2-CZ3-CE3 TRP 121.2 1.1
307
+ CZ3-CE3-CD2 TRP 118.8 1.3
308
+ N-CA-C TRP 111.0 2.7
309
+ CA-C-O TRP 120.1 2.1
310
+ N-CA-CB TYR 110.6 1.8
311
+ CB-CA-C TYR 110.4 2.0
312
+ CA-CB-CG TYR 113.4 1.9
313
+ CB-CG-CD1 TYR 121.0 0.6
314
+ CB-CG-CD2 TYR 121.0 0.6
315
+ CD1-CG-CD2 TYR 117.9 1.1
316
+ CG-CD1-CE1 TYR 121.3 0.8
317
+ CG-CD2-CE2 TYR 121.3 0.8
318
+ CD1-CE1-CZ TYR 119.8 0.9
319
+ CD2-CE2-CZ TYR 119.8 0.9
320
+ CE1-CZ-CE2 TYR 119.8 1.6
321
+ CE1-CZ-OH TYR 120.1 2.7
322
+ CE2-CZ-OH TYR 120.1 2.7
323
+ N-CA-C TYR 111.0 2.7
324
+ CA-C-O TYR 120.1 2.1
325
+ N-CA-CB VAL 111.5 2.2
326
+ CB-CA-C VAL 111.4 1.9
327
+ CA-CB-CG1 VAL 110.9 1.5
328
+ CA-CB-CG2 VAL 110.9 1.5
329
+ CG1-CB-CG2 VAL 110.9 1.6
330
+ N-CA-C VAL 111.0 2.7
331
+ CA-C-O VAL 120.1 2.1
332
+ -
333
+
334
+ Non-bonded distance Minimum Dist Tolerance
335
+ C-C 3.4 1.5
336
+ C-N 3.25 1.5
337
+ C-S 3.5 1.5
338
+ C-O 3.22 1.5
339
+ N-N 3.1 1.5
340
+ N-S 3.35 1.5
341
+ N-O 3.07 1.5
342
+ O-S 3.32 1.5
343
+ O-O 3.04 1.5
344
+ S-S 2.03 1.0
345
+ -
core/utils.py ADDED
@@ -0,0 +1,1062 @@
1
+ """
2
+ https://github.com/ProteinDesignLab/protpardelle
3
+ License: MIT
4
+ Author: Alex Chu
5
+
6
+ Various utils for handling protein data.
7
+ """
8
+
9
+ import os
10
+ import shlex
11
+ import subprocess
12
+ import sys
13
+ import torch
14
+ import yaml
15
+ import argparse
16
+
17
+ from einops import rearrange, repeat
18
+ import numpy as np
19
+ import torch
20
+ import torch.nn.functional as F
21
+ import Bio
22
+ from Bio.PDB.DSSP import DSSP
23
+
24
+ from core import protein
25
+ from core import protein_mpnn
26
+ from core import residue_constants
27
+
28
+
29
+ PATH_TO_TMALIGN = "/home/alexechu/essentials_kit/ml_utils/align/TMalign/TMalign"
30
+
31
+
32
+ ################ STRUCTURE/FORMAT UTILS #############################
33
+
34
+
35
+ def aatype_to_seq(aatype, seq_mask=None):
36
+ if seq_mask is None:
37
+ seq_mask = torch.ones_like(aatype)
38
+
39
+ mapping = residue_constants.restypes_with_x
40
+ mapping = mapping + ["<mask>"]
41
+
42
+ unbatched = False
43
+ if len(aatype.shape) == 1:
44
+ unbatched = True
45
+ aatype = [aatype]
46
+ seq_mask = [seq_mask]
47
+
48
+ seqs = []
49
+ for i, ai in enumerate(aatype):
50
+ seq = []
51
+ for j, aa in enumerate(ai):
52
+ if seq_mask[i][j] == 1:
53
+ try:
54
+ seq.append(mapping[aa])
55
+ except IndexError:
56
+ print(aatype[i])
57
+ raise Exception(f"Error in mapping {aa} at {i},{j}")
58
+ seqs.append("".join(seq))
59
+
60
+ if unbatched:
61
+ seqs = seqs[0]
62
+ return seqs
63
+
64
+
65
+ def seq_to_aatype(seq, num_tokens=21):
66
+ if num_tokens == 20:
67
+ mapping = residue_constants.restype_order
68
+ if num_tokens == 21:
69
+ mapping = residue_constants.restype_order_with_x
70
+ if num_tokens == 22:
71
+ mapping = dict(residue_constants.restype_order_with_x)  # copy; avoid mutating the shared dict
72
+ mapping["<mask>"] = 21
73
+ return torch.Tensor([mapping[aa] for aa in seq]).long()
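These two helpers invert each other over the 21-token vocabulary; a round-trip sketch (assuming this module's namespace):

    aatype = seq_to_aatype("ACDEFG")          # LongTensor of shape (6,)
    assert aatype_to_seq(aatype) == "ACDEFG"  # unbatched input returns a single string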
74
+
75
+
76
+ def batched_seq_to_aatype_and_mask(seqs, max_len=None):
77
+ if max_len is None:
78
+ max_len = max([len(s) for s in seqs])
79
+ aatypes = []
80
+ seq_mask = []
81
+ for s in seqs:
82
+ pad_size = max_len - len(s)
83
+ aatype = seq_to_aatype(s)
84
+ aatypes.append(F.pad(aatype, (0, pad_size)))
85
+ mask = torch.ones_like(aatype).float()
86
+ seq_mask.append(F.pad(mask, (0, pad_size)))
87
+ return torch.stack(aatypes), torch.stack(seq_mask)
88
+
89
+
90
+ def atom37_mask_from_aatype(aatype, seq_mask=None):
91
+ # source_mask is (21,37) originally
92
+ source_mask = torch.Tensor(residue_constants.restype_atom37_mask).to(aatype.device)
93
+ bb_atoms = source_mask[residue_constants.restype_order["G"]][None]
94
+ # Use only the first 20 plus bb atoms for X, mask
95
+ source_mask = torch.cat([source_mask[:-1], bb_atoms, bb_atoms], 0)
96
+ atom_mask = source_mask[aatype]
97
+ if seq_mask is not None:
98
+ atom_mask *= seq_mask[..., None]
99
+ return atom_mask
100
+
101
+
102
+ def atom37_coords_from_atom14(atom14_coords, aatype, return_mask=False):
103
+ # Unbatched
104
+ device = atom14_coords.device
105
+ atom37_coords = torch.zeros((atom14_coords.shape[0], 37, 3)).to(device)
106
+ for i in range(atom14_coords.shape[0]): # per residue
107
+ aa = aatype[i]
108
+ aa_3name = residue_constants.restype_1to3[residue_constants.restypes[aa]]
109
+ atom14_atoms = residue_constants.restype_name_to_atom14_names[aa_3name]
110
+ for j in range(14):
111
+ atom_name = atom14_atoms[j]
112
+ if atom_name != "":
113
+ atom37_idx = residue_constants.atom_order[atom_name]
114
+ atom37_coords[i, atom37_idx, :] = atom14_coords[i, j, :]
115
+
116
+ if return_mask:
117
+ atom37_mask = atom37_mask_from_aatype(aatype)
118
+ return atom37_coords, atom37_mask
119
+ return atom37_coords
120
+
121
+
122
+ def atom73_mask_from_aatype(aatype, seq_mask=None):
123
+ source_mask = torch.Tensor(residue_constants.restype_atom73_mask).to(aatype.device)
124
+ atom_mask = source_mask[aatype]
125
+ if seq_mask is not None:
126
+ atom_mask *= seq_mask[..., None]
127
+ return atom_mask
128
+
129
+
130
+ def atom37_to_atom73(atom37, aatype, return_mask=False):
131
+ # Unbatched
132
+ atom73 = torch.zeros((atom37.shape[0], 73, 3)).to(atom37)
133
+ for i in range(atom37.shape[0]):
134
+ aa = aatype[i]
135
+ aa1 = residue_constants.restypes[aa]
136
+ for j, atom37_name in enumerate(residue_constants.atom_types):
137
+ atom73_name = atom37_name
138
+ if atom37_name not in ["N", "CA", "C", "O", "CB"]:
139
+ atom73_name = aa1 + atom73_name
140
+ if atom73_name in residue_constants.atom73_names_to_idx:
141
+ atom73_idx = residue_constants.atom73_names_to_idx[atom73_name]
142
+ atom73[i, atom73_idx, :] = atom37[i, j, :]
143
+
144
+ if return_mask:
145
+ atom73_mask = atom73_mask_from_aatype(aatype)
146
+ return atom73, atom73_mask
147
+ return atom73
148
+
149
+
150
+ def atom73_to_atom37(atom73, aatype, return_mask=False):
151
+ # Unbatched
152
+ atom37_coords = torch.zeros((atom73.shape[0], 37, 3)).to(atom73)
153
+ for i in range(atom73.shape[0]): # per residue
154
+ aa = aatype[i]
155
+ aa1 = residue_constants.restypes[aa]
156
+ for j, atom_type in enumerate(residue_constants.atom_types):
157
+ atom73_name = atom_type
158
+ if atom73_name not in ["N", "CA", "C", "O", "CB"]:
159
+ atom73_name = aa1 + atom73_name
160
+ if atom73_name in residue_constants.atom73_names_to_idx:
161
+ atom73_idx = residue_constants.atom73_names_to_idx[atom73_name]
162
+ atom37_coords[i, j, :] = atom73[i, atom73_idx, :]
163
+
164
+ if return_mask:
165
+ atom37_mask = atom37_mask_from_aatype(aatype)
166
+ return atom37_coords, atom37_mask
167
+ return atom37_coords
168
+
169
+
170
+ def get_dmap(pdb, atoms=["N", "CA", "C", "O"], batched=True, out="torch", device=None):
171
+ def _dmap_from_coords(coords):
172
+ coords = coords.contiguous()
173
+ dmaps = torch.cdist(coords, coords).unsqueeze(1)
174
+ if out == "numpy":
175
+ return dmaps.detach().cpu().numpy()
176
+ elif out == "torch":
177
+ if device is not None:
178
+ return dmaps.to(device)
179
+ else:
180
+ return dmaps
181
+
182
+ if isinstance(pdb, str): # input is pdb file
183
+ coords = load_coords_from_pdb(pdb, atoms=atoms).view(1, -1, 3)
184
+ return _dmap_from_coords(coords)
185
+ elif len(pdb.shape) == 2: # single set of coords
186
+ if isinstance(pdb, np.ndarray):
187
+ pdb = torch.Tensor(pdb)
188
+ return _dmap_from_coords(pdb.unsqueeze(0))
189
+ elif len(pdb.shape) == 3 and batched:
190
+ return _dmap_from_coords(pdb)
191
+ elif len(pdb.shape) == 3 and not batched:
192
+ return _dmap_from_coords(pdb.view(1, -1, 3))
193
+ elif len(pdb.shape) == 4:
194
+ return _dmap_from_coords(pdb.view(pdb.size(0), -1, 3))
195
+
196
+
197
+ def get_channeled_dmap(coords):
198
+ # coords is b, nres, natom, 3
199
+ coords = coords.permute(0, 2, 1, 3)
200
+ dvecs = coords[..., None, :] - coords[..., None, :, :] # b, natom, nres, nres, 3
201
+ dists = torch.sqrt(dvecs.pow(2).sum(-1) + 1e-8)
202
+ return dists
203
+
204
+
205
+ def fill_in_cbeta_for_atom37(coords):
206
+ b = coords[..., 1, :] - coords[..., 0, :]
207
+ c = coords[..., 2, :] - coords[..., 1, :]
208
+ a = torch.cross(b, c, dim=-1)
209
+ cbeta = -0.58273431 * a + 0.56802827 * b - 0.54067466 * c + coords[..., 1, :]
210
+ new_coords = torch.clone(coords)
211
+ new_coords[..., 3, :] = cbeta
212
+ return new_coords
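The CB slot is filled with the standard ideal-geometry virtual-CB construction from N, CA, and C, so even glycine receives a placeholder CB. A usage sketch with random backbones (hypothetical shapes):

    import torch

    coords = torch.randn(2, 50, 37, 3)  # batch of 2, 50 residues, atom37 layout
    coords_cb = fill_in_cbeta_for_atom37(coords)
    assert coords_cb.shape == coords.shape  # only atom slot 3 (CB) is overwritten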
213
+
214
+
215
+ def get_distogram(coords, n_bins=20, start=2, return_onehot=True, seq_mask=None):
216
+ # coords is b, nres, natom, 3
217
+ # distogram over the CB atom (atom37 index 3)
218
+ coords_with_cb = fill_in_cbeta_for_atom37(coords)
219
+ dists = get_channeled_dmap(coords_with_cb[:, :, 3:4]).squeeze(1)
220
+ bins = torch.arange(start, start + n_bins - 1).to(dists.device)
221
+ dgram = torch.bucketize(dists, bins)
222
+ dgram_oh = F.one_hot(dgram, n_bins)
223
+ if seq_mask is not None:
224
+ mask_2d = seq_mask[:, :, None] * seq_mask[:, None, :]
225
+ dgram = dgram * mask_2d
226
+ dgram_oh = dgram_oh * mask_2d[..., None]
227
+
228
+ if return_onehot:
229
+ return dgram_oh
230
+ return dgram
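A usage sketch (hypothetical shapes): with the default 20 bins starting at 2 Angstroms, each CB-CB distance is bucketized into a one-hot class, and the sequence mask zeroes padded pairs.

    import torch

    coords = torch.randn(1, 32, 37, 3)
    seq_mask = torch.ones(1, 32)
    dgram = get_distogram(coords, n_bins=20, seq_mask=seq_mask)
    assert dgram.shape == (1, 32, 32, 20)  # one-hot over distance bins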
231
+
232
+
233
+ def get_contacts(coords=None, distogram=None, seq_mask=None):
234
+ if distogram is None:
235
+ distogram = get_distogram(coords)
236
+ contacts = (distogram.argmax(-1) < 6).float()
237
+ if seq_mask is not None:
238
+ contacts *= seq_mask[..., None] * seq_mask[..., None, :]
239
+ return contacts
240
+
241
+
242
+ def dihedral(a, b, c, d):
243
+ # inputs can be (1,3), (n,3), or (bs,n,3)
244
+ b1 = a - b
245
+ b2 = b - c
246
+ b3 = c - d
247
+ n1 = F.normalize(torch.cross(b1, b2, dim=-1), dim=-1)
248
+ n2 = F.normalize(torch.cross(b2, b3, dim=-1), dim=-1)
249
+ m1 = torch.cross(n1, b2 / b2.norm(dim=-1).unsqueeze(-1), dim=-1)
250
+ y = (m1 * n2).sum(dim=-1)
251
+ x = (n1 * n2).sum(dim=-1)
252
+ return torch.atan2(y, x)
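A sign-convention sanity check (hand-picked points forming a +90 degree torsion about the b-c axis):

    import math
    import torch

    a = torch.tensor([[0.0, 1.0, 0.0]])
    b = torch.tensor([[0.0, 0.0, 0.0]])
    c = torch.tensor([[1.0, 0.0, 0.0]])
    d = torch.tensor([[1.0, 0.0, 1.0]])
    angle = dihedral(a, b, c, d)
    assert torch.allclose(angle, torch.tensor([math.pi / 2]))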
253
+
254
+
255
+ def get_torsions_from_coords(
256
+ coords, atoms=["N", "CA", "C", "O"], batched=True, out="torch", device=None
257
+ ):
258
+ """
259
+ Returns an n-dim array of shape (bs, nres, ntors), where ntors is the
260
+ number of torsion angles (e.g. 2 if using phi and psi), with units of radians.
261
+ """
262
+ if isinstance(coords, np.ndarray):
263
+ coords = torch.Tensor(coords)
264
+ if len(coords.shape) == 2:
265
+ coords = coords.unsqueeze(0)
266
+ if len(coords.shape) == 4:
267
+ coords = coords.view(coords.size(0), -1, 3)
268
+ if len(coords.shape) == 3 and not batched:
269
+ coords = coords.view(1, -1, 3)
270
+ if len(coords.shape) == 3:
271
+ bs = coords.size(0)
272
+ if "O" in atoms:
273
+ idxs = [
274
+ i for i in range(coords.size(1)) if i % 4 != 3
275
+ ] # deselect O atoms for N-Ca-C-O coords
276
+ coords = coords[:, idxs, :]
277
+ a, b, c, d = (
278
+ coords[:, :-3, :],
279
+ coords[:, 1:-2, :],
280
+ coords[:, 2:-1, :],
281
+ coords[:, 3:, :],
282
+ )
283
+ torsions = dihedral(
284
+ a, b, c, d
285
+ ) # output order is psi-omega-phi, reorganize to (bs, nres, 3)
286
+ torsions = torsions.view(bs, torsions.size(1) // 3, 3)
287
+ omegaphi = torch.cat(
288
+ (torch.zeros(bs, 1, 2).to(coords.device), torsions[:, :, 1:]), 1
289
+ )
290
+ psi = torch.cat((torsions[:, :, 0], torch.zeros(bs, 1).to(coords.device)), 1)
291
+ torsions = torch.cat(
292
+ (
293
+ omegaphi[:, :, 1].unsqueeze(-1),
294
+ psi.unsqueeze(-1),
295
+ omegaphi[:, :, 0].unsqueeze(-1),
296
+ ),
297
+ -1,
298
+ )
299
+ else:
300
+ raise Exception("input coords not of correct dims")
301
+
302
+ if out == "numpy":
303
+ return torsions.detach().cpu().numpy()
304
+ elif out == "torch":
305
+ if device is not None:
306
+ return torsions.to(device)
307
+ else:
308
+ return torsions
309
+
310
+
311
+ def get_trig_from_torsions(torsions, out="torch", device=None):
312
+ """
313
+ Calculate unit-circle projections from torsion-angle input.
314
+
315
+ Returns an n-dim array of shape (bs, nres, ntors, 2), where ntors is the
316
+ number of torsion angles (e.g. 2 if using phi and psi), and the last
317
+ dimension is the xy unit-circle coordinates of the corresponding angle.
318
+ """
319
+ if isinstance(torsions, np.ndarray):
320
+ torsions = torch.Tensor(torsions)
321
+ x = torsions.cos()
322
+ y = torsions.sin()
323
+ trig = torch.cat((x.unsqueeze(-1), y.unsqueeze(-1)), -1)
324
+ if out == "numpy":
325
+ return trig.detach().cpu().numpy()
326
+ elif out == "torch":
327
+ if device is not None:
328
+ return trig.to(device)
329
+ else:
330
+ return trig
331
+
332
+
333
+ def get_abego_string_from_torsions(torsions):
334
+ A_bin = (-75, 50)
335
+ G_bin = (-100, 100)
336
+ torsions = torsions * 180.0 / np.pi
337
+ phi, psi = torsions[:, :, 0], torsions[:, :, 1]
338
+ abego_vec = np.zeros((torsions.size(0), torsions.size(1))).astype(str)
339
+ A = (phi <= 0) & (psi <= A_bin[1]) & (psi > A_bin[0])
340
+ B = (phi <= 0) & ((psi > A_bin[1]) | (psi <= A_bin[0]))
341
+ G = (phi > 0) & (psi <= G_bin[1]) & (psi > G_bin[0])
342
+ E = (phi > 0) & ((psi > G_bin[1]) | (psi <= G_bin[0]))
343
+ abego_vec[A] = "A"
344
+ abego_vec[B] = "B"
345
+ abego_vec[G] = "G"
346
+ abego_vec[E] = "E"
347
+ abego_strs = ["".join(v) for v in abego_vec]
348
+ return abego_strs
349
+
350
+
351
+ def get_bond_lengths_from_coords(coords, batched=True, out="torch", device=None):
352
+ """
353
+ Returns array of shape (bs, n_res, 4), where final dim is bond lengths
354
+ in order of N-Ca, Ca-C, C-O, C-N (none for last residue)
355
+ """
356
+ if isinstance(coords, np.ndarray):
357
+ coords = torch.Tensor(coords)
358
+ if len(coords.shape) == 2:
359
+ coords = coords.unsqueeze(0)
360
+ if len(coords.shape) == 3 and not batched:
361
+ coords = coords.view(1, -1, 3)
362
+ if len(coords.shape) == 4:
363
+ coords = coords.view(coords.size(0), -1, 3)
364
+ N = coords[:, ::4, :]
365
+ Ca = coords[:, 1::4, :]
366
+ C = coords[:, 2::4, :]
367
+ O = coords[:, 3::4, :]
368
+ NCa = (Ca - N).norm(dim=-1).unsqueeze(-1)
369
+ CaC = (C - Ca).norm(dim=-1).unsqueeze(-1)
370
+ CO = (O - C).norm(dim=-1).unsqueeze(-1)
371
+ CN = (N[:, 1:] - C[:, :-1]).norm(dim=-1)
372
+ CN = torch.cat([CN, torch.zeros(CN.size(0), 1).to(CN.device)], 1).unsqueeze(-1)
373
+ blengths = torch.cat((NCa, CaC, CO, CN), -1)
374
+ if out == "numpy":
375
+ return blengths.detach().cpu().numpy()
376
+ elif out == "torch":
377
+ if device is not None:
378
+ return blengths.to(device)
379
+ else:
380
+ return blengths
381
+
382
+
383
+ def get_bond_angles_from_coords(coords, batched=True, out="torch", device=None):
384
+ """
385
+ Returns array of shape (bs, n_res, 5), where final dim is bond angles
386
+ in order of N-Ca-C, Ca-C-O, Ca-C-N, O-C-N, C-N-Ca (none for last residue)
387
+ """
388
+
389
+ def _angle(v1, v2):
390
+ cos = (v1 * v2).sum(-1) / (v1.norm(dim=-1) * v2.norm(dim=-1))
391
+ return cos.acos()
392
+
393
+ if isinstance(coords, np.ndarray):
394
+ coords = torch.Tensor(coords)
395
+ if len(coords.shape) == 2:
396
+ coords = coords.unsqueeze(0)
397
+ if len(coords.shape) == 3 and not batched:
398
+ coords = coords.view(1, -1, 3)
399
+ if len(coords.shape) == 4:
400
+ coords = coords.view(coords.size(0), -1, 3)
401
+ N = coords[:, ::4, :]
402
+ Nnext = coords[:, 4::4, :]
403
+ Ca = coords[:, 1::4, :]
404
+ Canext = coords[:, 5::4, :]
405
+ C = coords[:, 2::4, :]
406
+ O = coords[:, 3::4, :]
407
+ CaN = N - Ca
408
+ CaC = C - Ca
409
+ CCa = Ca - C
410
+ CO = O - C
411
+ CNnext = Nnext - C[:, :-1, :]
412
+ NnextC = -1 * CNnext
413
+ NnextCanext = Canext - Nnext
414
+ NCaC = _angle(CaN, CaC).unsqueeze(-1)
415
+ CaCO = _angle(CCa, CO).unsqueeze(-1)
416
+ CaCN = _angle(CCa[:, :-1], CNnext).unsqueeze(-1)
417
+ CaCN = _extend(CaCN)
418
+ OCN = _angle(CO[:, :-1], CNnext).unsqueeze(-1)
419
+ OCN = _extend(OCN)
420
+ CNCa = _angle(NnextC, NnextCanext).unsqueeze(-1)
421
+ # CNCa = torch.cat([CNCa, torch.zeros(CNCa.size(0), 1).to(CNCa.device)], 1).unsqueeze(-1)
422
+ CNCa = _extend(CNCa)
423
+ bangles = torch.cat((NCaC, CaCO, CaCN, OCN, CNCa), -1)
424
+ if out == "numpy":
425
+ return bangles.detach().cpu().numpy()
426
+ elif out == "torch":
427
+ if device is not None:
428
+ return bangles.to(device)
429
+ else:
430
+ return bangles
431
+
432
+
433
+ def get_buried_positions_mask(coords, seq_mask=None, threshold=6.0):
434
+ ca_idx = residue_constants.atom_order["CA"] # typically 1
435
+ cb_idx = residue_constants.atom_order["CB"] # typically 3
436
+ if seq_mask is None:
437
+ seq_mask = torch.ones_like(coords)[..., 0, 0]
438
+ coords = fill_in_cbeta_for_atom37(coords)
439
+
440
+ # get 8 closest neighbors by CB
441
+ neighbor_coords = coords[:, :, cb_idx]
442
+
443
+ ca_neighbor_dists, edge_index = protein_mpnn.get_closest_neighbors(
444
+ neighbor_coords, seq_mask, 9
445
+ )
446
+ edge_index = edge_index[..., 1:].contiguous()
447
+
448
+ # compute avg CB distance
449
+ cb_coords = coords[:, :, cb_idx]
450
+ neighbor_cb = protein_mpnn.gather_nodes(cb_coords, edge_index)
451
+ avg_cb_dist = (neighbor_cb - cb_coords[..., None, :]).pow(2).sum(-1).sqrt().mean(-1)
452
+
453
+ buried_positions_mask = (avg_cb_dist < threshold).float() * seq_mask
454
+ return buried_positions_mask
455
+
456
+
457
+ def get_fullatom_bond_lengths_from_coords(
458
+ coords, aatype, atom_mask=None, return_format="per_aa"
459
+ ):
460
+ # Computes backbone and sidechain bond lengths. All unbatched; returns a list or dict of dicts.
461
+ def dist(xyz1, xyz2):
462
+ return (xyz1 - xyz2).pow(2).sum().sqrt().detach().cpu().item()
463
+
464
+ assert aatype.max() <= 19
465
+ seq = aatype_to_seq(aatype)
466
+ # residue-wise list of dicts [{'N-CA': a, 'CA-C': b}, {'N-CA': a, 'CA-C': b}]
467
+ all_bond_lens_by_pos = []
468
+ # aa-wise dict of dicts of lists {'A': {'N-CA': [a, b, c], 'CA-C': [a, b, c]}}
469
+ all_bond_lens_by_aa = {aa: {} for aa in residue_constants.restypes}
470
+ for i, res in enumerate(coords):
471
+ aa3 = residue_constants.restype_1to3[seq[i]]
472
+ res_bond_lens = {}
473
+ for j, atom1 in enumerate(residue_constants.atom_types):
474
+ for k, atom2 in enumerate(residue_constants.atom_types):
475
+ if j < k and protein.are_atoms_bonded(aa3, atom1, atom2):
476
+ if atom_mask is None or (
477
+ atom_mask[i, j] > 0.5 and atom_mask[i, k] > 0.5
478
+ ):
479
+ bond_name = f"{atom1}-{atom2}"
480
+ bond_len = dist(res[j], res[k])
481
+ res_bond_lens[bond_name] = bond_len
482
+ all_bond_lens_by_pos.append(res_bond_lens)
483
+ for key, val in res_bond_lens.items():
484
+ all_bond_lens_by_aa[seq[i]].setdefault(key, []).append(val)
485
+
486
+ if return_format == "per_aa":
487
+ return all_bond_lens_by_aa
488
+ elif return_format == "per_position":
489
+ return all_bond_lens_by_pos
490
+
491
+
492
+ def batched_fullatom_bond_lengths_from_coords(
493
+ coords, aatype, atom_mask=None, return_format="per_aa"
494
+ ):
495
+ # Expects trimmed coords (no mask)
496
+ if return_format == "per_position":
497
+ batched_bond_lens = []
498
+ elif return_format == "per_aa":
499
+ batched_bond_lens = {aa: {} for aa in residue_constants.restypes}
500
+ for i, c in enumerate(coords):
501
+ atom_mask_i = None if atom_mask is None else atom_mask[i]
502
+ bond_lens = get_fullatom_bond_lengths_from_coords(
503
+ c, aatype[i], atom_mask=atom_mask_i, return_format=return_format
504
+ )
505
+ if return_format == "per_position":
506
+ batched_bond_lens.extend(bond_lens)
507
+ elif return_format == "per_aa":
508
+ for aa, d in bond_lens.items():
509
+ for bond, lengths in d.items():
510
+ batched_bond_lens[aa].setdefault(bond, []).extend(lengths)
511
+ return batched_bond_lens
512
+
513
+
514
+ def batched_fullatom_bond_angles_from_coords(coords, aatype, return_format="per_aa"):
515
+ # Expects trimmed coords (no mask)
516
+ if return_format == "per_position":
517
+ batched_bond_angles = []
518
+ elif return_format == "per_aa":
519
+ batched_bond_angles = {aa: {} for aa in residue_constants.restypes}
520
+ for i, c in enumerate(coords):
521
+ bond_angles = get_fullatom_bond_angles_from_coords(
522
+ c, aatype[i], return_format=return_format
523
+ )
524
+ if return_format == "per_position":
525
+ batched_bond_angles.extend(bond_angles)
526
+ elif return_format == "per_aa":
527
+ for aa, d in bond_angles.items():
528
+ for bond, lengths in d.items():
529
+ batched_bond_angles[aa].setdefault(bond, []).extend(lengths)
530
+ return batched_bond_angles
531
+
532
+
533
+ def get_chi_angles(coords, aatype, atom_mask=None, seq_mask=None):
534
+ # unbatched
535
+ # return (n, 4) chis in degrees and mask
536
+ chis = []
537
+ chi_mask = []
538
+ atom_order = residue_constants.atom_order
539
+
540
+ seq = aatype_to_seq(aatype, seq_mask=seq_mask)
541
+
542
+ for i, aa1 in enumerate(seq): # per residue
543
+ if seq_mask is not None and seq_mask[i] == 0:
544
+ chis.append([0, 0, 0, 0])
545
+ chi_mask.append([0, 0, 0, 0])
546
+ else:
547
+ chi = []
548
+ mask = []
549
+ chi_atoms = residue_constants.chi_angles_atoms[
550
+ residue_constants.restype_1to3[aa1]
551
+ ]
552
+ for j in range(4): # per chi angle
553
+ if j > len(chi_atoms) - 1:
554
+ chi.append(0)
555
+ mask.append(0)
556
+ elif atom_mask is not None and any(
557
+ [atom_mask[i, atom_order[a]] < 0.5 for a in chi_atoms[j]]
558
+ ):
559
+ chi.append(0)
560
+ mask.append(0)
561
+ else:
562
+ # Four atoms per dihedral
563
+ xyz4 = [coords[i, atom_order[a]] for a in chi_atoms[j]]
564
+ angle = dihedral(*xyz4) * 180 / np.pi
565
+ chi.append(angle)
566
+ mask.append(1)
567
+ chis.append(chi)
568
+ chi_mask.append(mask)
569
+
570
+ chis = torch.Tensor(chis)
571
+ chi_mask = torch.Tensor(chi_mask)
572
+
573
+ return chis, chi_mask
574
+
575
+
576
+ def fill_Os_from_NCaC_coords(
577
+ coords: torch.Tensor, out: str = "torch", device: str = None
578
+ ):
579
+ """Given NCaC coords, add O atom coordinates in.
580
+ (bs, 3n, 3) -> (bs, 4n, 3)
581
+ """
582
+ CO_LEN = 1.231
583
+ if len(coords.shape) == 2:
584
+ coords = coords.unsqueeze(0)
585
+ Cs = coords[:, 2:-1:3, :] # all but last C
586
+ CCa_norm = F.normalize(coords[:, 1:-2:3, :] - Cs, dim=-1) # all but last Ca
587
+ CN_norm = F.normalize(coords[:, 3::3, :] - Cs, dim=-1) # all but first N
588
+ Os = F.normalize(CCa_norm + CN_norm, dim=-1) * -CO_LEN
589
+ Os += Cs
590
+ # TODO place C-term O atom properly
591
+ Os = torch.cat([Os, coords[:, -1, :].view(-1, 1, 3) + 1], 1)
592
+ coords_out = []
593
+ for i in range(Os.size(1)):
594
+ coords_out.append(coords[:, i * 3 : (i + 1) * 3, :])
595
+ coords_out.append(Os[:, i, :].view(-1, 1, 3))
596
+ coords_out = torch.cat(coords_out, 1)
597
+ if out == "numpy":
598
+ return coords_out.detach().cpu().numpy()
599
+ elif out == "torch":
600
+ if device is not None:
601
+ return coords_out.to(device)
602
+ else:
603
+ return coords_out
604
+
605
+
606
+ def _extend(x, axis=1, n=1, prepend=False):
607
+ # Add an extra zeros 'residue' to the end (or beginning, prepend=True) of a Tensor
608
+ # Used to extend torsions when there is no 'psi' for last residue
609
+ shape = list(x.shape)
610
+ shape[axis] = n
611
+ if prepend:
612
+ return torch.cat([torch.zeros(shape).to(x.device), x], axis)
613
+ else:
614
+ return torch.cat([x, torch.zeros(shape).to(x.device)], axis)
615
+
616
+
617
+ def trim_coords(coords, n_res, batched=True):
618
+ if batched: # Return list of tensors
619
+ front = (coords.shape[1] - n_res) // 2
620
+ return [
621
+ coords[i, front[i] : front[i] + n_res[i]] for i in range(coords.shape[0])
622
+ ]
623
+ else:
624
+ if isinstance(n_res, torch.Tensor):
625
+ n_res = n_res.int()
626
+ front_pad = (coords.shape[0] - n_res) // 2
627
+ return coords[front_pad : front_pad + n_res]
628
+
629
+
630
+ def batch_align_on_calpha(x, y):
631
+ aligned_x = []
632
+ for i, xi in enumerate(x):
633
+ xi_calpha = xi[:, 1, :]
634
+ _, (R, t) = kabsch_align(xi_calpha, y[i, :, 1, :])
635
+ xi_ctr = xi - xi_calpha.mean(0, keepdim=True)
636
+ xi_aligned = xi_ctr @ R.t() + t
637
+ aligned_x.append(xi_aligned)
638
+ return torch.stack(aligned_x)
639
+
640
+
641
+ def kabsch_align(p, q):
642
+ if len(p.shape) > 2:
643
+ p = p.reshape(-1, 3)
644
+ if len(q.shape) > 2:
645
+ q = q.reshape(-1, 3)
646
+ p_ctr = p - p.mean(0, keepdim=True)
647
+ t = q.mean(0, keepdim=True)
648
+ q_ctr = q - t
649
+ H = p_ctr.t() @ q_ctr
650
+ U, S, V = torch.svd(H)
651
+ R = V @ U.t()
652
+ I_ = torch.eye(3).to(p)
653
+ I_[-1, -1] = R.det().sign()
654
+ R = V @ I_ @ U.t()
655
+ p_aligned = p_ctr @ R.t() + t
656
+ return p_aligned, (R, t)
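A verification sketch: aligning a rigidly rotated and shifted copy back onto the reference recovers it to numerical precision (hypothetical data):

    import math
    import torch

    theta = 0.3
    rot = torch.tensor([
        [math.cos(theta), -math.sin(theta), 0.0],
        [math.sin(theta), math.cos(theta), 0.0],
        [0.0, 0.0, 1.0],
    ])
    q = torch.randn(10, 3)    # reference point cloud
    p = q @ rot.t() + 2.0     # rotated and translated copy
    p_aligned, (R, t) = kabsch_align(p, q)
    assert torch.allclose(p_aligned, q, atol=1e-4)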
657
+
658
+
659
+ def get_dssp_string(pdb):
660
+ try:
661
+ structure = Bio.PDB.PDBParser(QUIET=True).get_structure(pdb[:-3], pdb)
662
+ dssp = DSSP(structure[0], pdb, dssp="mkdssp")
663
+ dssp_string = "".join([dssp[k][2] for k in dssp.keys()])
664
+ return dssp_string
665
+ except Exception as e:
666
+ print(e)
667
+ return None
668
+
669
+
670
+ def pool_dssp_symbols(dssp_string, newchar=None, chars=["-", "T", "S", "C", " "]):
671
+ """Replaces all instances of chars with newchar. DSSP chars are helix=GHI, strand=EB, loop=- TSC"""
672
+ if newchar is None:
673
+ newchar = chars[0]
674
+ string_out = dssp_string
675
+ for c in chars:
676
+ string_out = string_out.replace(c, newchar)
677
+ return string_out
678
+
679
+
680
+ def get_3state_dssp(pdb=None, coords=None):
681
+ if coords is not None:
682
+ pdb = "temp_dssp.pdb"
683
+ write_coords_to_pdb(coords, pdb, batched=False)
684
+ dssp_string = get_dssp_string(pdb)
685
+ if dssp_string is not None:
686
+ dssp_string = pool_dssp_symbols(dssp_string, newchar="L")
687
+ dssp_string = pool_dssp_symbols(dssp_string, chars=["H", "G", "I"])
688
+ dssp_string = pool_dssp_symbols(dssp_string, chars=["E", "B"])
689
+ if coords is not None:
690
+ subprocess.run(shlex.split(f"rm {pdb}"))
691
+ return dssp_string
692
+
693
+
694
+ ############## SAVE/LOAD UTILS #################################
695
+
696
+
697
+ def load_feats_from_pdb(
698
+ pdb, bb_atoms=["N", "CA", "C", "O"], load_atom73=False, **kwargs
699
+ ):
700
+ feats = {}
701
+ with open(pdb, "r") as f:
702
+ pdb_str = f.read()
703
+ protein_obj = protein.from_pdb_string(pdb_str, **kwargs)
704
+ bb_idxs = [residue_constants.atom_order[a] for a in bb_atoms]
705
+ bb_coords = torch.from_numpy(protein_obj.atom_positions[:, bb_idxs])
706
+ feats["bb_coords"] = bb_coords.float()
707
+ for k, v in vars(protein_obj).items():
708
+ feats[k] = torch.Tensor(v)
709
+ feats["aatype"] = feats["aatype"].long()
710
+ if load_atom73:
711
+ feats["atom73_coords"], feats["atom73_mask"] = atom37_to_atom73(
712
+ feats["atom_positions"], feats["aatype"], return_mask=True
713
+ )
714
+ return feats
715
+
716
+
717
+ def load_coords_from_pdb(
718
+ pdb,
719
+ atoms=["N", "CA", "C", "O"],
720
+ method="raw",
721
+ also_bfactors=False,
722
+ normalize_bfactors=True,
723
+ ):
724
+ """Returns array of shape (1, n_res, len(atoms), 3)"""
725
+ coords = []
726
+ bfactors = []
727
+ if method == "raw": # Raw numpy implementation, faster than biopdb
728
+ # Indexing into PDB format, allowing XXXX.XXX
729
+ coords_in_pdb = [slice(30, 38), slice(38, 46), slice(46, 54)]
730
+ # Indexing into PDB format, allowing XXX.XX
731
+ bfactor_in_pdb = slice(60, 66)
732
+
733
+ with open(pdb, "r") as f:
734
+ resi_prev = 1
735
+ counter = 0
736
+ for l in f:
737
+ l_split = l.rstrip("\n").split()
738
+ if len(l_split) > 0 and l_split[0] == "ATOM" and l_split[2] in atoms:
739
+ resi = l_split[5]
740
+ if resi == resi_prev:
741
+ counter += 1
742
+ else:
743
+ counter = 0
744
+ if counter < len(atoms):
745
+ xyz = [
746
+ np.array(l[s].strip()).astype(float) for s in coords_in_pdb
747
+ ]
748
+ coords.append(xyz)
749
+ if also_bfactors:
750
+ bfactor = np.array(l[bfactor_in_pdb].strip()).astype(float)
751
+ bfactors.append(bfactor)
752
+ resi_prev = resi
753
+ coords = torch.Tensor(np.array(coords)).view(1, -1, len(atoms), 3)
754
+ if also_bfactors:
755
+ bfactors = torch.Tensor(np.array(bfactors)).view(1, -1, len(atoms))
756
+ elif method == "biopdb":
757
+ structure = Bio.PDB.PDBParser(QUIET=True).get_structure(pdb[:-3], pdb)
758
+ for model in structure:
759
+ for chain in model:
760
+ for res in chain:
761
+ for atom in atoms:
762
+ try:
763
+ coords.append(np.asarray(res[atom].get_coord()))
764
+ if also_bfactors:
765
+ bfactors.append(np.asarray(res[atom].get_bfactor()))
766
+ except KeyError:  # residue lacks this atom
767
+ continue
768
+ else:
769
+ raise NotImplementedError(f"Invalid method for reading coords: {method}")
770
+ if also_bfactors:
771
+ if normalize_bfactors: # Normalize over Calphas
772
+ mean_b = bfactors[..., 1].mean()
773
+ std_b = bfactors[..., 1].var().sqrt()
774
+ bfactors = (bfactors - mean_b) / (std_b + 1e-6)
775
+ return coords, bfactors
776
+ return coords
777
+
778
+
779
+ def feats_to_pdb_str(
780
+ atom_positions,
781
+ aatype=None,
782
+ atom_mask=None,
783
+ residue_index=None,
784
+ chain_index=None,
785
+ b_factors=None,
786
+ atom_lines_only=True,
787
+ conect=False,
788
+ **kwargs,
789
+ ):
790
+ # Expects unbatched, cropped inputs. needs at least one of atom_mask, aatype
791
+ # Uses all-GLY aatype if aatype not given: does not infer from atom_mask
792
+ assert aatype is not None or atom_mask is not None
793
+ if atom_mask is None:
794
+ aatype = aatype.cpu()
795
+ atom_mask = atom37_mask_from_aatype(aatype, torch.ones_like(aatype))
796
+ if aatype is None:
797
+ seq_mask = atom_mask[:, residue_constants.atom_order["CA"]].cpu()
798
+ aatype = seq_mask * residue_constants.restype_order["G"]
799
+ if residue_index is None:
800
+ residue_index = torch.arange(aatype.shape[-1])
801
+ if chain_index is None:
802
+ chain_index = torch.ones_like(aatype)
803
+ if b_factors is None:
804
+ b_factors = torch.ones_like(atom_mask)
805
+
806
+ cast = lambda x: np.array(x.detach().cpu()) if isinstance(x, torch.Tensor) else x
807
+ prot = protein.Protein(
808
+ atom_positions=cast(atom_positions),
809
+ atom_mask=cast(atom_mask),
810
+ aatype=cast(aatype),
811
+ residue_index=cast(residue_index),
812
+ chain_index=cast(chain_index),
813
+ b_factors=cast(b_factors),
814
+ )
815
+ pdb_str = protein.to_pdb(prot, conect=conect)
816
+ if conect:
817
+ pdb_str, conect_str = pdb_str
818
+ if atom_lines_only:
819
+ pdb_lines = pdb_str.split("\n")
820
+ atom_lines = [
821
+ l for l in pdb_lines if len(l.split()) > 1 and l.split()[0] == "ATOM"
822
+ ]
823
+ pdb_str = "\n".join(atom_lines) + "\n"
824
+ if conect:
825
+ pdb_str = pdb_str + conect_str
826
+ return pdb_str
827
+
828
+
829
+ def bb_coords_to_pdb_str(coords, atoms=["N", "CA", "C", "O"]):
830
+ def _bb_pdb_line(atom, atomnum, resnum, coords, elem, res="GLY"):
831
+ atm = "ATOM".ljust(6)
832
+ atomnum = str(atomnum).rjust(5)
833
+ atomname = atom.center(4)
834
+ resname = res.ljust(3)
835
+ chain = "A".rjust(1)
836
+ resnum = str(resnum).rjust(4)
837
+ x = str("%8.3f" % (float(coords[0]))).rjust(8)
838
+ y = str("%8.3f" % (float(coords[1]))).rjust(8)
839
+ z = str("%8.3f" % (float(coords[2]))).rjust(8)
840
+ occ = str("%6.2f" % (float(1))).rjust(6)
841
+ temp = str("%6.2f" % (float(20))).ljust(6)
842
+ elname = elem.rjust(12)
843
+ return "%s%s %s %s %s%s %s%s%s%s%s%s\n" % (
844
+ atm,
845
+ atomnum,
846
+ atomname,
847
+ resname,
848
+ chain,
849
+ resnum,
850
+ x,
851
+ y,
852
+ z,
853
+ occ,
854
+ temp,
855
+ elname,
856
+ )
857
+
858
+ n = coords.shape[0]
859
+ na = len(atoms)
860
+ pdb_str = ""
861
+ for j in range(0, n, na):
862
+ for idx, atom in enumerate(atoms):
863
+ pdb_str += _bb_pdb_line(
864
+ atom,
865
+ j + idx + 1,
866
+ (j + na) // na,
867
+ coords[j + idx],
868
+ atom[0],
869
+ )
870
+ return pdb_str
871
+
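# Editor's sketch (not part of the original source): rendering two glycine
# residues with the fixed-width writer above; the coordinates are dummies.
def _example_bb_pdb_str():
    coords = torch.zeros(8, 3)  # 2 residues x 4 backbone atoms, pre-flattened
    return bb_coords_to_pdb_str(coords)  # eight "ATOM" lines, residues 1 and 2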
872
+
873
+ def write_coords_to_pdb(
874
+ coords_in,
875
+ filename,
876
+ batched=True,
877
+ write_to_frames=False,
878
+ conect=False,
879
+ **all_atom_feats,
880
+ ):
881
+ def _write_pdb_string(pdb_str, filename, append=False):
882
+ write_mode = "a" if append else "w"
883
+ with open(filename, write_mode) as f:
884
+ if write_to_frames:
885
+ f.write("MODEL\n")
886
+ f.write(pdb_str)
887
+ if write_to_frames:
888
+ f.write("ENDMDL\n")
889
+
890
+ if not (batched or write_to_frames):
891
+ coords_in = [coords_in]
892
+ filename = [filename]
893
+ all_atom_feats = {k: [v] for k, v in all_atom_feats.items()}
894
+
895
+ n_atoms_in = coords_in[0].shape[-2]
896
+ is_bb_or_ca_pdb = n_atoms_in <= 4
897
+ for i, c in enumerate(coords_in):
898
+ n_res = c.shape[0]
899
+ if isinstance(filename, list):
900
+ fname = filename[i]
901
+ elif write_to_frames or len(coords_in) == 1:
902
+ fname = filename
903
+ else:
904
+ fname = f"{filename[:-4]}_{i}.pdb"
905
+
906
+ if is_bb_or_ca_pdb:
907
+ c_flat = rearrange(c, "n a c -> (n a) c")
908
+ if n_atoms_in == 1:
909
+ atoms = ["CA"]
910
+ if n_atoms_in == 3:
911
+ atoms = ["N", "CA", "C"]
912
+ if n_atoms_in == 4:
913
+ atoms = ["N", "CA", "C", "O"]
914
+ pdb_str = bb_coords_to_pdb_str(c_flat, atoms)
915
+ else:
916
+ feats_i = {k: v[i][:n_res] for k, v in all_atom_feats.items()}
917
+ pdb_str = feats_to_pdb_str(c, conect=conect, **feats_i)
918
+ _write_pdb_string(pdb_str, fname, append=write_to_frames and i > 0)
919
+
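# Editor's sketch (not part of the original source): writing a single
# 10-residue backbone; N/CA/C/O atom names are inferred from the atom dimension.
def _example_write_backbone_pdb():
    coords = torch.randn(10, 4, 3)
    write_coords_to_pdb(coords, "example_bb.pdb", batched=False)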
920
+
921
+ ###################### LOSSES ###################################
922
+
923
+
924
+ def masked_cross_entropy(logprobs, target, loss_mask):
925
+ # target is onehot
926
+ cel = -(target * logprobs)
927
+ cel = cel * loss_mask[..., None]
928
+ cel = cel.sum((-1, -2)) / loss_mask.sum(-1).clamp(min=1e-6)
929
+ return cel
930
+
931
+
932
+ def masked_mse(x, y, mask, weight=None):
933
+ data_dims = tuple(range(1, len(x.shape)))
934
+ mse = (x - y).pow(2) * mask
935
+ if weight is not None:
936
+ mse = mse * expand(weight, mse)
937
+ mse = mse.sum(data_dims) / mask.sum(data_dims).clamp(min=1e-6)
938
+ return mse
939
+
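# Editor's sketch (not part of the original source): the masked losses above
# divide by the mask sum, so padded positions cannot dilute the loss.
def _example_masked_mse():
    x, y = torch.randn(2, 5, 3), torch.randn(2, 5, 3)
    mask = torch.tensor([[1.0, 1.0, 1.0, 0.0, 0.0]] * 2)[..., None]
    return masked_mse(x, y, mask)  # shape (2,): channel sums averaged over the 3 valid rows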
940
+
941
+ ###################### ALIGN ###################################
942
+
943
+
944
+ def quick_tmalign(
945
+ p, p_sele, q_sele, tmscore_type="avg", differentiable_rmsd=False, rmsd_type="ca"
946
+ ):
947
+ # sota 210712 (author's shorthand, likely "state of the art, 2021-07-12")
948
+ write_coords_to_pdb(p_sele[:, 1:2], "temp_p.pdb", atoms=["CA"], batched=False)
949
+ write_coords_to_pdb(q_sele[:, 1:2], "temp_q.pdb", atoms=["CA"], batched=False)
950
+ cmd = f"{PATH_TO_TMALIGN} temp_p.pdb temp_q.pdb -m temp_matrix.txt"
951
+ outputs = subprocess.run(shlex.split(cmd), capture_output=True, text=True)
952
+
953
+ # Get RMSD and TM scores
954
+ tmout = outputs.stdout.split("\n")
955
+ rmsd = float(tmout[16].split()[4][:-1])
956
+ tmscore1 = float(tmout[17].split()[1])
957
+ tmscore2 = float(tmout[18].split()[1])
958
+ if tmscore_type == "avg":
959
+ tmscore = (tmscore1 + tmscore2) / 2
960
+ elif tmscore_type == "1" or tmscore_type == "query":
961
+ tmscore = tmscore1
962
+ elif tmscore_type == "2":
963
+ tmscore = tmscore2
964
+ elif tmscore_type == "both":
965
+ tmscore = (tmscore1, tmscore2)
966
+
967
+ # Get R, t and transform p coords
968
+ m = open("temp_matrix.txt", "r").readlines()[2:5]
969
+ m = [l.strip()[1:].strip() for l in m]
970
+ m = torch.Tensor([[float(i) for i in l.split()] for l in m]).to(p_sele.device)
971
+ R = m[:, 1:].t()
972
+ t = m[:, 0]
973
+ aligned_psele = p_sele @ R + t
974
+ aligned = p @ R + t
975
+
976
+ # Option 2 for rms - MSE of aligned against target coords using TMalign seq alignment. Differentiable
977
+ if differentiable_rmsd:
978
+ pi, qi = 0, 0
979
+ p_idxs, q_idxs = [], []
980
+ for i, c in enumerate(tmout[23]):
981
+ if c in [":", "."]:
982
+ p_idxs.append(pi)
983
+ q_idxs.append(qi)
984
+ if tmout[22][i] != "-":
985
+ pi += 1
986
+ if tmout[24][i] != "-":
987
+ qi += 1
988
+ tmalign_seq_p = p_sele[p_idxs]
989
+ tmalign_seq_q = q_sele[q_idxs]
990
+ if rmsd_type == "ca":
991
+ tmalign_seq_p = tmalign_seq_p[:, 1]
992
+ tmalign_seq_q = tmalign_seq_q[:, 1]
993
+ elif rmsd_type == "bb":
994
+ pass
995
+ rmsd = (tmalign_seq_p - tmalign_seq_q).pow(2).sum(-1).sqrt().mean()
996
+
997
+ # Delete temp files: temp_p.pdb, temp_q.pdb, temp_matrix.txt
998
+ subprocess.run(shlex.split("rm temp_p.pdb"))
999
+ subprocess.run(shlex.split("rm temp_q.pdb"))
1000
+ subprocess.run(shlex.split("rm temp_matrix.txt"))
1001
+
1002
+ return {"aligned": aligned, "rmsd": rmsd, "tm_score": tmscore, "R": R, "t": t}
1003
+
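# Editor's sketch (not part of the original source): aligning two (n, a, 3)
# structures; assumes the module-level PATH_TO_TMALIGN points to a TM-align
# binary and that atom index 1 is CA, as quick_tmalign expects.
def _example_quick_tmalign(p, q):
    out = quick_tmalign(p, p_sele=p, q_sele=q, tmscore_type="avg")
    return out["tm_score"], out["aligned"]  # score in [0, 1]; p mapped into q's frame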
1004
+
1005
+ ###################### OTHER ###################################
1006
+
1007
+
1008
+ def expand(x, tgt=None, dim=1):
1009
+ if tgt is None:
1010
+ for _ in range(dim):
1011
+ x = x[..., None]
1012
+ else:
1013
+ while len(x.shape) < len(tgt.shape):
1014
+ x = x[..., None]
1015
+ return x
1016
+
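# Editor's sketch (not part of the original source): expand() right-pads
# singleton dims so a per-example scalar broadcasts over higher-rank tensors.
def _example_expand():
    noise = torch.rand(8)  # one noise level per batch element
    coords = torch.randn(8, 64, 37, 3)
    return coords * expand(noise, coords)  # noise viewed as (8, 1, 1, 1)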
1017
+
1018
+ def hookfn(name, verbose=False):
1019
+ def f(grad):
1020
+ if check_nan_inf(grad) > 0:
1021
+ print(name, "grad nan/infs", grad.shape, check_nan_inf(grad), grad)
1022
+ if verbose:
1023
+ print(name, "grad shape", grad.shape, "norm", grad.norm())
1024
+
1025
+ return f
1026
+
1027
+
1028
+ def trigger_nan_check(name, x):
1029
+ if check_nan_inf(x) > 0:
1030
+ print(name, check_nan_inf(x))
1031
+ raise Exception
1032
+
1033
+
1034
+ def check_nan_inf(x):
1035
+ return torch.isinf(x).sum() + torch.isnan(x).sum()
1036
+
1037
+
1038
+ def directory_find(atom, root="."):
1039
+ for path, dirs, files in os.walk(root):
1040
+ if atom in dirs:
1041
+ return os.path.join(path, atom)
1042
+
1043
+
1044
+ def dict2namespace(config):
1045
+ namespace = argparse.Namespace()
1046
+ for key, value in config.items():
1047
+ if isinstance(value, dict):
1048
+ new_value = dict2namespace(value)
1049
+ else:
1050
+ new_value = value
1051
+ setattr(namespace, key, new_value)
1052
+ return namespace
1053
+
1054
+
1055
+ def load_config(path, return_dict=False):
1056
+ with open(path, "r") as f:
1057
+ config_dict = yaml.safe_load(f)
1058
+ config = dict2namespace(config_dict)
1059
+ if return_dict:
1060
+ return config, config_dict
1061
+ else:
1062
+ return config
diffusion.py ADDED
@@ -0,0 +1,66 @@
1
+ """
2
+ https://github.com/ProteinDesignLab/protpardelle
3
+ License: MIT
4
+ Author: Alex Chu
5
+
6
+ Noise and diffusion utils.
7
+ """
8
+ from scipy.stats import norm
9
+ import torch
10
+ from torchtyping import TensorType
11
+
12
+ from core import utils
13
+
14
+
15
+ def noise_schedule(
16
+ time: TensorType[float],
17
+ function: str = "uniform",
18
+ sigma_data: float = 10.0,
19
+ psigma_mean: float = -1.2,
20
+ psigma_std: float = 1.2,
21
+ s_min: float = 0.001,
22
+ s_max: float = 60,
23
+ rho: float = 7.0,
24
+ time_power: float = 4.0,
25
+ constant_val: float = 0.0,
26
+ ):
27
+ def sampling_noise(time):
28
+ # high noise = 1; low noise = 0. opposite of Karras et al. schedule
29
+ term1 = s_max ** (1 / rho)
30
+ term2 = (1 - time) * (s_min ** (1 / rho) - s_max ** (1 / rho))
31
+ noise_level = sigma_data * ((term1 + term2) ** rho)
32
+ return noise_level
33
+
34
+ if function == "lognormal":
35
+ normal_sample = torch.Tensor(norm.ppf(time.cpu())).to(time)
36
+ noise_level = sigma_data * torch.exp(psigma_mean + psigma_std * normal_sample)
37
+ elif function == "uniform":
38
+ noise_level = sampling_noise(time)
39
+ elif function == "mpnn":
40
+ time = time**time_power
41
+ noise_level = sampling_noise(time)
42
+ elif function == "constant":
43
+ noise_level = torch.ones_like(time) * constant_val
44
+ return noise_level
45
+
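# Editor's sketch (not part of the original source): with the defaults above,
# the "uniform" schedule maps time=1 to sigma_data * s_max (maximum noise) and
# time=0 to sigma_data * s_min (minimum noise).
def _example_schedule_endpoints():
    hi = noise_schedule(torch.tensor([1.0]))  # ~ 10.0 * 60 = 600
    lo = noise_schedule(torch.tensor([0.0]))  # ~ 10.0 * 0.001 = 0.01
    return hi, lo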
46
+
47
+ def noise_coords(
48
+ coords: TensorType["b n a x", float],
49
+ noise_level: TensorType["b", float],
50
+ dummy_fill_masked_atoms: bool = False,
51
+ atom_mask: TensorType["b n a"] = None,
52
+ ):
53
+ # Does not apply atom mask after adding noise
54
+ if dummy_fill_masked_atoms:
55
+ assert atom_mask is not None
56
+ dummy_fill_mask = 1 - atom_mask
57
+ dummy_fill_value = coords[..., 1:2, :] # CA
58
+ # dummy_fill_value = utils.fill_in_cbeta_for_atom37(coords)[..., 3:4, :] # CB
59
+ coords = (
60
+ coords * atom_mask[..., None]
61
+ + dummy_fill_value * dummy_fill_mask[..., None]
62
+ )
63
+
64
+ noise = torch.randn_like(coords) * utils.expand(noise_level, coords)
65
+ noisy_coords = coords + noise
66
+ return noisy_coords
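# Editor's sketch (not part of the original source): perturbing a batch of
# coordinates at per-example noise levels; the second example is 10x noisier.
def _example_noise_coords():
    coords = torch.randn(2, 16, 37, 3)
    return noise_coords(coords, torch.tensor([1.0, 10.0]))  # same shape as input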
draw_samples.py ADDED
@@ -0,0 +1,353 @@
1
+ """
2
+ https://github.com/ProteinDesignLab/protpardelle
3
+ License: MIT
4
+ Author: Alex Chu
5
+
6
+ Entry point for unconditional or simple conditional sampling.
7
+ """
8
+ import argparse
9
+ from datetime import datetime
10
+ import json
11
+ import os
12
+ import shlex
13
+ import subprocess
14
+ import sys
15
+ import time
16
+
17
+ from einops import repeat
18
+ import torch
19
+
20
+ from core import data
21
+ from core import residue_constants
22
+ from core import utils
23
+ import diffusion
24
+ import models
25
+ import sampling
26
+
27
+
28
+ def draw_and_save_samples(
29
+ model,
30
+ samples_per_len=8,
31
+ lengths=range(50, 512),
32
+ save_dir="./",
33
+ mode="backbone",
34
+ **sampling_kwargs,
35
+ ):
36
+ device = model.device
37
+ if mode == "backbone":
38
+ total_sampling_time = 0
39
+ for l in lengths:
40
+ prot_lens = torch.ones(samples_per_len).long() * l
41
+ seq_mask = model.make_seq_mask_for_sampling(prot_lens=prot_lens)
42
+ aux = sampling.draw_backbone_samples(
43
+ model,
44
+ seq_mask=seq_mask,
45
+ pdb_save_path=f"{save_dir}/len{format(l, '03d')}_samp",
46
+ return_aux=True,
47
+ return_sampling_runtime=True,
48
+ **sampling_kwargs,
49
+ )
50
+ total_sampling_time += aux["runtime"]
51
+ print("Samples drawn for length", l)
52
+ return total_sampling_time
53
+ elif mode == "allatom":
54
+ total_sampling_time = 0
55
+ for l in lengths:
56
+ prot_lens = torch.ones(samples_per_len).long() * l
57
+ seq_mask = model.make_seq_mask_for_sampling(prot_lens=prot_lens)
58
+ aux = sampling.draw_allatom_samples(
59
+ model,
60
+ seq_mask=seq_mask,
61
+ pdb_save_path=f"{save_dir}/len{format(l, '03d')}",
62
+ return_aux=True,
63
+ **sampling_kwargs,
64
+ )
65
+ total_sampling_time += aux["runtime"]
66
+ print("Samples drawn for length", l)
67
+ return total_sampling_time
68
+
69
+
70
+ def parse_idx_string(idx_str):
71
+ spans = idx_str.split(",")
72
+ idxs = []
73
+ for s in spans:
74
+ if "-" in s:
75
+ start, stop = s.split("-")
76
+ idxs.extend(list(range(int(start), int(stop))))
77
+ else:
78
+ idxs.append(int(s))
79
+ return idxs
80
+
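# Editor's sketch (not part of the original source). Note the dash form is
# half-open: the stop index is excluded, so "2-5" expands to 2, 3, 4.
def _example_parse_idx_string():
    return parse_idx_string("0,2-5,7")  # -> [0, 2, 3, 4, 7]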
81
+
82
+ class Manager(object):
83
+ def __init__(self):
84
+ self.parser = argparse.ArgumentParser(
85
+ formatter_class=argparse.RawTextHelpFormatter
86
+ )
87
+
88
+ self.parser.add_argument(
89
+ "--model_checkpoint",
90
+ type=str,
91
+ default="checkpoints",
92
+ help="Path to denoiser model weights and config",
93
+ )
94
+ self.parser.add_argument(
95
+ "--mpnnpath",
96
+ type=str,
97
+ default="checkpoints/minimpnn_state_dict.pth",
98
+ help="Path to minimpnn model weights",
99
+ )
100
+ self.parser.add_argument(
101
+ "--modeldir",
102
+ type=str,
103
+ help="Model base directory, ex 'training_logs/other/lemon-shape-51'",
104
+ )
105
+ self.parser.add_argument("--modelepoch", type=int, help="Model epoch, ex 1000")
106
+ self.parser.add_argument(
107
+ "--type", type=str, default="allatom", help="Type of model"
108
+ )
109
+ self.parser.add_argument(
110
+ "--param", type=str, default=None, help="Which sampling param to vary"
111
+ )
112
+ self.parser.add_argument(
113
+ "--paramval", type=str, default=None, help="Which param val to use"
114
+ )
115
+ self.parser.add_argument(
116
+ "--parampath",
117
+ type=str,
118
+ default=None,
119
+ help="Path to json file with params, either use param/paramval or parampath, not both",
120
+ )
121
+ self.parser.add_argument(
122
+ "--perlen", type=int, default=2, help="How many samples per sequence length"
123
+ )
124
+ self.parser.add_argument(
125
+ "--minlen", type=int, required=False, help="Minimum sequence length"
126
+ )
127
+ self.parser.add_argument(
128
+ "--maxlen",
129
+ type=int,
130
+ required=False,
131
+ help="Maximum sequence length, not inclusive",
132
+ )
133
+ self.parser.add_argument(
134
+ "--steplen",
135
+ type=int,
136
+ required=False,
137
+ help="How frequently to select sequence length, for steplen 2, would be 50, 52, 54, etc",
138
+ )
139
+ self.parser.add_argument(
140
+ "--num_lens",
141
+ type=int,
142
+ required=False,
143
+ help="If steplen not provided, how many random lengths to sample at",
144
+ )
145
+ self.parser.add_argument(
146
+ "--targetdir", type=str, default=".", help="Directory to save results"
147
+ )
148
+ self.parser.add_argument(
149
+ "--input_pdb", type=str, required=False, help="PDB file to condition on"
150
+ )
151
+ self.parser.add_argument(
152
+ "--resample_idxs",
153
+ type=str,
154
+ required=False,
155
+ help="Indices from PDB file to resample. Zero-indexed, comma-delimited, can use dashes, eg 0,2-5,7",
156
+ )
157
+
158
+ def add_argument(self, *args, **kwargs):
159
+ self.parser.add_argument(*args, **kwargs)
160
+
161
+ def parse_args(self):
162
+ self.args = self.parser.parse_args()
163
+
164
+ return self.args
165
+
166
+
167
+ def main():
168
+ # Set up params, arguments, sampling config
169
+ ####################
170
+ manager = Manager()
171
+ manager.parse_args()
172
+ args = manager.args
173
+ print(args)
174
+ is_test_run = False
175
+ seed = 0
176
+ samples_per_len = args.perlen
177
+ min_len = args.minlen
178
+ max_len = args.maxlen
179
+ len_step_size = args.steplen
180
+ device = "cuda:0"
181
+
182
+ # setting default sampling config
183
+ if args.type == "backbone":
184
+ sampling_config = sampling.default_backbone_sampling_config()
185
+ elif args.type == "allatom":
186
+ sampling_config = sampling.default_allatom_sampling_config()
187
+
188
+ sampling_kwargs = vars(sampling_config)
189
+
190
+ # Parse conditioning inputs
191
+ input_pdb_len = None
192
+ if args.input_pdb:
193
+ input_feats = utils.load_feats_from_pdb(args.input_pdb, protein_only=True)
194
+ input_pdb_len = input_feats["aatype"].shape[0]
195
+ if args.resample_idxs:
196
+ print(
197
+ f"Warning: when sampling conditionally, the input pdb length ({input_pdb_len} residues) is used automatically for the sampling lengths."
198
+ )
199
+ resample_idxs = parse_idx_string(args.resample_idxs)
200
+ else:
201
+ resample_idxs = list(range(input_pdb_len))
202
+ cond_idxs = [i for i in range(input_pdb_len) if i not in resample_idxs]
203
+ to_batch_size = lambda x: repeat(x, "... -> b ...", b=samples_per_len).to(
204
+ device
205
+ )
206
+
207
+ # For unconditional model, center coords on whole structure
208
+ centered_coords = data.apply_random_se3(
209
+ input_feats["atom_positions"],
210
+ atom_mask=input_feats["atom_mask"],
211
+ translation_scale=0.0,
212
+ )
213
+ cond_kwargs = {}
214
+ cond_kwargs["gt_coords"] = to_batch_size(centered_coords)
215
+ cond_kwargs["gt_cond_atom_mask"] = to_batch_size(input_feats["atom_mask"])
216
+ cond_kwargs["gt_cond_atom_mask"][:, resample_idxs] = 0
217
+ cond_kwargs["gt_aatype"] = to_batch_size(input_feats["aatype"])
218
+ cond_kwargs["gt_cond_seq_mask"] = torch.zeros_like(cond_kwargs["gt_aatype"])
219
+ cond_kwargs["gt_cond_seq_mask"][:, cond_idxs] = 1
220
+ sampling_kwargs.update(cond_kwargs)
221
+
222
+ # Determine lengths to sample at
223
+ if min_len is not None and max_len is not None:
224
+ if len_step_size is not None:
225
+ sampling_lengths = range(min_len, max_len, len_step_size)
226
+ else:
227
+ sampling_lengths = list(
228
+ torch.randint(min_len, max_len, size=(args.num_lens,))
229
+ )
230
+ elif input_pdb_len is not None:
231
+ sampling_lengths = [input_pdb_len]
232
+ else:
233
+ raise Exception("Need to provide a set of protein lengths or an input pdb.")
234
+
235
+ total_num_samples = len(list(sampling_lengths)) * samples_per_len
236
+
237
+ model_directory = args.modeldir
238
+ epoch = args.modelepoch
239
+ base_dir = args.targetdir
240
+
241
+ date_string = datetime.now().strftime("%y-%m-%d-%H-%M-%S")
242
+ if is_test_run:
243
+ date_string = f"test-{date_string}"
244
+
245
+ # Update sampling config with arguments
246
+ if args.param:
247
+ var_param = args.param
248
+ var_value = args.paramval
249
+ sampling_kwargs[var_param] = (
250
+ None
251
+ if var_value == "None"
252
+ else int(var_value)
253
+ if var_param == "n_steps"
254
+ else float(var_value)
255
+ )
256
+ elif args.parampath:
257
+ with open(args.parampath) as f:
258
+ var_params = json.loads(f.read())
259
+ sampling_kwargs.update(var_params)
260
+
261
+ # This is only used for the readme; keep s_min and s_max as params instead of struct_noise_schedule.
262
+ sampling_kwargs_readme = list(sampling_kwargs.items())
263
+
264
+ print("Base directory:", base_dir)
265
+ save_dir = f"{base_dir}/samples"
266
+ save_init_dir = f"{base_dir}/samples_inits"
267
+
268
+ print("Samples saved to:", save_dir)
269
+ ####################
270
+
271
+ torch.manual_seed(seed)
272
+ if not os.path.exists(save_dir):
273
+ subprocess.run(shlex.split(f"mkdir -p {save_dir}"))
274
+
275
+ if not os.path.exists(save_init_dir):
276
+ subprocess.run(shlex.split(f"mkdir -p {save_init_dir}"))
277
+
278
+ # Load model
279
+ if args.type == "backbone":
280
+ if args.model_checkpoint:
281
+ checkpoint = f"{args.model_checkpoint}/backbone_state_dict.pth"
282
+ cfg_path = f"{args.model_checkpoint}/backbone.yml"
283
+ else:
284
+ checkpoint = (
285
+ f"{model_directory}/checkpoints/epoch{epoch}_training_state.pth"
286
+ )
287
+ cfg_path = f"{model_directory}/configs/backbone.yml"
288
+ cfg = utils.load_config(cfg_path)
289
+ weights = torch.load(checkpoint, map_location=device)["model_state_dict"]
290
+ model = models.Protpardelle(cfg, device=device)
291
+ model.load_state_dict(weights)
292
+ model.to(device)
293
+ model.eval()
294
+ model.device = device
295
+ elif args.type == "allatom":
296
+ if args.model_checkpoint:
297
+ checkpoint = f"{args.model_checkpoint}/allatom_state_dict.pth"
298
+ cfg_path = f"{args.model_checkpoint}/allatom.yml"
299
+ else:
300
+ checkpoint = (
301
+ f"{model_directory}/checkpoints/epoch{epoch}_training_state.pth"
302
+ )
303
+ cfg_path = f"{model_directory}/configs/allatom.yml"
304
+ config = utils.load_config(cfg_path)
305
+ weights = torch.load(checkpoint, map_location=device)["model_state_dict"]
306
+ model = models.Protpardelle(config, device=device)
307
+ model.load_state_dict(weights)
308
+ model.load_minimpnn(args.mpnnpath)
309
+ model.to(device)
310
+ model.eval()
311
+ model.device = device
312
+
313
+ # Sampling
314
+ with open(base_dir + "/readme.txt", "w") as f:
315
+ f.write(f"Sampling run for {date_string}\n")
316
+ f.write(f"Random seed {seed}\n")
317
+ f.write(f"Model checkpoint: {checkpoint}\n")
318
+ f.write(
319
+ f"{samples_per_len} samples per length from {min_len}:{max_len}:{len_step_size}\n"
320
+ )
321
+ f.write("Sampling params:\n")
322
+ for k, v in sampling_kwargs_readme:
323
+ f.write(f"{k}\t{v}\n")
324
+
325
+ print(f"Model loaded from {checkpoint}")
326
+ print(f"Beginning sampling for {date_string}...")
327
+
328
+ # Draw samples
329
+ start_time = time.time()
330
+ sampling_time = draw_and_save_samples(
331
+ model,
332
+ samples_per_len=samples_per_len,
333
+ lengths=sampling_lengths,
334
+ save_dir=save_dir,
335
+ mode=args.type,
336
+ **sampling_kwargs,
337
+ )
338
+ time_elapsed = time.time() - start_time
339
+
340
+ print(f"Sampling concluded after {time_elapsed} seconds.")
341
+ print(f"Of this, {sampling_time} seconds were for actual sampling.")
342
+ print(f"{total_num_samples} total samples were drawn.")
343
+
344
+ with open(base_dir + "/readme.txt", "a") as f:
345
+ f.write(f"Total job time: {time_elapsed} seconds\n")
346
+ f.write(f"Model run time: {sampling_time} seconds\n")
347
+ f.write(f"Total samples drawn: {total_num_samples}\n")
348
+
349
+ return
350
+
351
+
352
+ if __name__ == "__main__":
353
+ main()
evaluation.py ADDED
@@ -0,0 +1,406 @@
1
+ """
2
+ https://github.com/ProteinDesignLab/protpardelle
3
+ License: MIT
4
+ Author: Alex Chu
5
+
6
+ Utils for computing evaluation metrics.
7
+ """
8
+ import argparse
9
+ import os
10
+ import warnings
11
+ from typing import Tuple
12
+
13
+ from Bio.Align import substitution_matrices
14
+ import numpy as np
15
+ import torch
16
+ from transformers import AutoTokenizer, EsmForProteinFolding
17
+ from torchtyping import TensorType
18
+
19
+ from core import residue_constants
20
+ from core import utils
21
+ from core import protein_mpnn as mpnn
22
+ import modules
23
+ import sampling
24
+
25
+
26
+ def mean(x):
27
+ if len(x) == 0:
28
+ return 0
29
+ return sum(x) / len(x)
30
+
31
+
32
+ def calculate_seq_identity(seq1, seq2, seq_mask=None):
33
+ identity = (seq1 == seq2.to(seq1)).float()
34
+ if seq_mask is not None:
35
+ identity *= seq_mask.to(seq1)
36
+ return identity.sum(-1) / seq_mask.to(seq1).sum(-1).clamp(min=1)
37
+ else:
38
+ return identity.mean(-1)
39
+
40
+
41
+ def design_sequence(coords, model=None, num_seqs=1, disallow_aas=["C"]):
42
+ # Returns list of strs; seqs like 'MKRLLDS', not aatypes
43
+ if model is None:
44
+ model = mpnn.get_mpnn_model()
45
+ if isinstance(coords, str):
46
+ temp_pdb = False
47
+ pdb_fn = coords
48
+ else:
49
+ temp_pdb = True
50
+ pdb_fn = f"tmp{np.random.randint(0, 1e8)}.pdb"
51
+ gly_idx = residue_constants.restype_order["G"]
52
+ gly_aatype = (torch.ones(coords.shape[0]) * gly_idx).long()
53
+ utils.write_coords_to_pdb(coords, pdb_fn, batched=False, aatype=gly_aatype)
54
+
55
+ with torch.no_grad():
56
+ designed_seqs = mpnn.run_proteinmpnn(
57
+ model=model,
58
+ pdb_path=pdb_fn,
59
+ num_seq_per_target=num_seqs,
60
+ omit_AAs=disallow_aas,
61
+ )
62
+
63
+ if temp_pdb:
64
+ os.system("rm " + pdb_fn)
65
+ return designed_seqs
66
+
67
+
68
+ def get_esmfold_model(device=None):
69
+ if device is None:
70
+ device = "cuda" if torch.cuda.is_available() else "cpu"
71
+ model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1").to(device)
72
+ model.esm = model.esm.half()
73
+ return model
74
+
75
+
76
+ def inference_esmfold(sequence_list, model, tokenizer):
77
+ inputs = tokenizer(
78
+ sequence_list,
79
+ return_tensors="pt",
80
+ padding=True,
81
+ add_special_tokens=False,
82
+ ).to(model.device)
83
+ outputs = model(**inputs)
84
+ # positions is shape (l, b, n, a, c)
85
+ pred_coords = outputs.positions[-1].contiguous()
86
+ plddts = (outputs.plddt[:, :, 1] * inputs.attention_mask).sum(
87
+ -1
88
+ ) / inputs.attention_mask.sum(-1).clamp(min=1e-3)
89
+ return pred_coords, plddts
90
+
91
+
92
+ def predict_structures(sequences, model="esmfold", tokenizer=None, force_unk_to_X=True):
93
+ # Expects seqs like 'MKRLLDS', not aatypes
94
+ # model can be a model, or a string describing which pred model to load
95
+ if isinstance(sequences, str):
96
+ sequences = [sequences]
97
+ if model == "esmfold":
98
+ model = get_esmfold_model()
99
+ device = model.device
100
+ if tokenizer is None:
101
+ tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
102
+
103
+ aatype = [utils.seq_to_aatype(seq).to(device) for seq in sequences]
104
+
105
+ with torch.no_grad():
106
+ if isinstance(model, EsmForProteinFolding):
107
+ pred_coords, plddts = inference_esmfold(sequences, model, tokenizer)
108
+
109
+ seq_lens = [len(s) for s in sequences]
110
+ trimmed_coords = [c[: seq_lens[i]] for i, c in enumerate(pred_coords)]
111
+ trimmed_coords_atom37 = [
112
+ utils.atom37_coords_from_atom14(c, aatype[i])
113
+ for i, c in enumerate(trimmed_coords)
114
+ ]
115
+ return trimmed_coords_atom37, plddts
116
+
117
+
118
+ def compute_structure_metric(coords1, coords2, metric="ca_rmsd", atom_mask=None):
119
+ # coords1 tensor[l][a][3]
120
+ def _tmscore(a, b, mask=None):
121
+ length = len(b)
122
+ dists = (a - b).pow(2).sum(-1)
123
+ d0 = 1.24 * ((length - 15) ** (1 / 3)) - 1.8
124
+ term = 1 / (1 + ((dists) / (d0**2)))
125
+ if mask is None:
126
+ return term.mean()
127
+ else:
128
+ term = term * mask
129
+ return term.sum() / mask.sum().clamp(min=1)
130
+
131
+ aligned_coords1_ca, (R, t) = utils.kabsch_align(coords1[:, 1], coords2[:, 1])
132
+ aligned_coords1 = coords1 - coords1[:, 1:2].mean(0, keepdim=True)
133
+ aligned_coords1 = aligned_coords1 @ R.t() + t
134
+ if metric == "ca_rmsd":
135
+ return (aligned_coords1_ca - coords2[:, 1]).pow(2).sum(-1).sqrt().mean()
136
+ elif metric == "tm_score":
137
+ tm = _tmscore(aligned_coords1_ca, coords2[:, 1])
138
+ # TODO: return 1 - tm score for now so sorts work properly
139
+ return 1 - tm
140
+ elif metric == "allatom_tm":
141
+ # Align on Ca, compute allatom TM
142
+ assert atom_mask is not None
143
+ return _tmscore(aligned_coords1, coords2, mask=atom_mask)
144
+ elif metric == "allatom_lddt":
145
+ assert atom_mask is not None
146
+ lddt = modules.lddt(
147
+ coords1.reshape(-1, 3),
148
+ coords2.reshape(-1, 3),
149
+ atom_mask.reshape(-1, 1),
150
+ per_residue=False,
151
+ )
152
+ return lddt
153
+ else:
154
+ raise NotImplementedError
155
+
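# Editor's note (not part of the original source): the d0 used in _tmscore
# above is the standard Zhang-Skolnick length-dependent normalization, which
# keeps TM-scores comparable across protein sizes.
def _example_tm_d0(length=100):
    return 1.24 * ((length - 15) ** (1 / 3)) - 1.8  # ~3.65 Angstroms at length 100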
156
+
157
+ def compute_self_consistency(
158
+ comparison_structures, # can be sampled or ground truth
159
+ sampled_sequences=None,
160
+ mpnn_model=None,
161
+ struct_pred_model=None,
162
+ tokenizer=None,
163
+ num_seqs=1,
164
+ return_aux=False,
165
+ metric="ca_rmsd",
166
+ output_file=None,
167
+ ):
168
+ # Typically used for eval of backbone sampling or sequence design or joint sampling
169
+ # (Maybe MPNN) + Fold + TM/RMSD
170
+ # Expects seqs like 'MKRLLDS', not aatypes
171
+ per_sample_primary_metrics = []
172
+ per_sample_secondary_metrics = []
173
+ per_sample_plddts = []
174
+ per_sample_coords = []
175
+ per_sample_seqs = []
176
+ aux = {}
177
+ for i, coords in enumerate(comparison_structures):
178
+ if sampled_sequences is None:
179
+ seqs_to_predict = design_sequence(
180
+ coords, model=mpnn_model, num_seqs=num_seqs
181
+ )
182
+ else:
183
+ seqs_to_predict = sampled_sequences[i]
184
+ pred_coords, plddts = predict_structures(
185
+ seqs_to_predict, model=struct_pred_model, tokenizer=tokenizer
186
+ )
187
+ primary_metric_name = "tm_score" if metric == "tm_score" else "ca_rmsd"
188
+ secondary_metric_name = "tm_score" if metric == "both" else None
189
+ primary_metrics = [
190
+ compute_structure_metric(coords.to(pred), pred, metric=primary_metric_name)
191
+ for pred in pred_coords
192
+ ]
193
+ if secondary_metric_name:
194
+ secondary_metrics = [
195
+ compute_structure_metric(
196
+ coords.to(pred), pred, metric=secondary_metric_name
197
+ )
198
+ for pred in pred_coords
199
+ ]
200
+ aux.setdefault(secondary_metric_name, []).extend(secondary_metrics)
201
+ else:
202
+ secondary_metrics = primary_metrics
203
+
204
+ aux.setdefault("pred", []).extend(pred_coords)
205
+ seqs_to_predict_arr = seqs_to_predict
206
+ if isinstance(seqs_to_predict_arr, str):
207
+ seqs_to_predict_arr = [seqs_to_predict_arr]
208
+
209
+ aux.setdefault("seqs", []).extend(seqs_to_predict_arr)
210
+ aux.setdefault("plddt", []).extend(plddts)
211
+ aux.setdefault("rmsd", []).extend(primary_metrics)
212
+
213
+ # Report best rmsd design only among MPNN reps
214
+ all_designs = [
215
+ (m, p, t, c, s)
216
+ for m, p, t, c, s in zip(
217
+ primary_metrics,
218
+ plddts,
219
+ secondary_metrics,
220
+ pred_coords,
221
+ seqs_to_predict_arr,
222
+ )
223
+ ]
224
+ best_rmsd_design = min(all_designs, key=lambda x: x[0])
225
+ per_sample_primary_metrics.append(best_rmsd_design[0].detach().cpu())
226
+ per_sample_plddts.append(best_rmsd_design[1].detach().cpu())
227
+ per_sample_secondary_metrics.append(best_rmsd_design[2].detach().cpu())
228
+ per_sample_coords.append(best_rmsd_design[3])
229
+ per_sample_seqs.append(best_rmsd_design[4])
230
+ best_idx = np.argmin(per_sample_primary_metrics)
231
+ metrics = {
232
+ "sc_rmsd_best": per_sample_primary_metrics[best_idx],
233
+ "sc_plddt_best": per_sample_plddts[best_idx],
234
+ "sc_rmsd_mean": mean(per_sample_primary_metrics),
235
+ "sc_plddt_mean": mean(per_sample_plddts),
236
+ }
237
+ if metric == "both":
238
+ metrics["sc_tmscore_best"] = per_sample_secondary_metrics[best_idx]
239
+ metrics["sc_tmscore_mean"] = mean(per_sample_secondary_metrics)
240
+
241
+ if output_file:
242
+ pred_coords = per_sample_coords
243
+ designed_seqs = per_sample_seqs
244
+
245
+ if torch.isnan(pred_coords[best_idx]).sum() == 0:
246
+ designed_seq = utils.seq_to_aatype(designed_seqs[best_idx])
247
+ utils.write_coords_to_pdb(
248
+ pred_coords[best_idx],
249
+ output_file,
250
+ batched=False,
251
+ aatype=designed_seq,
252
+ )
253
+
254
+ if return_aux:
255
+ return metrics, best_idx, aux
256
+ else:
257
+ return metrics, best_idx
258
+
259
+
260
+ def compute_secondary_structure_content(coords_batch):
261
+ dssp_sample = []
262
+ for i, c in enumerate(coords_batch):
263
+ with warnings.catch_warnings():
264
+ warnings.simplefilter("ignore")
265
+ dssp_str = utils.get_3state_dssp(coords=c)
266
+ if dssp_str is None or len(dssp_str) == 0:
267
+ pass
268
+ else:
269
+ dssp_sample.append(dssp_str)
270
+ dssp_sample = "".join(dssp_sample)
271
+ metrics = {}
272
+ metrics["sample_pct_beta"] = mean([c == "E" for c in dssp_sample])
273
+ metrics["sample_pct_alpha"] = mean([c == "H" for c in dssp_sample])
274
+ return metrics
275
+
276
+
277
+ def compute_bond_length_metric(
278
+ cropped_coords_list, cropped_aatypes_list, atom_mask=None
279
+ ):
280
+ bond_length_dict = utils.batched_fullatom_bond_lengths_from_coords(
281
+ cropped_coords_list, cropped_aatypes_list, atom_mask=atom_mask
282
+ )
283
+ all_errors = {}
284
+ for aa1, d in bond_length_dict.items():
285
+ aa3 = residue_constants.restype_1to3[aa1]
286
+ per_bond_errors = []
287
+ for bond, lengths in d.items():
288
+ a1, a2 = bond.split("-")
289
+ ideal_val = None
290
+ for ref_bond in residue_constants.standard_residue_bonds[aa3]:  # reference ideal geometry
291
+ if (
292
+ ref_bond.atom1_name == a1
293
+ and ref_bond.atom2_name == a2
294
+ or ref_bond.atom1_name == a2
295
+ and ref_bond.atom2_name == a1
296
+ ):
297
+ ideal_val = ref_bond.length
298
+ break
299
+ error = (np.array(lengths) - ideal_val) ** 2
300
+ per_bond_errors.append(error.mean() ** 0.5)
301
+ if len(per_bond_errors) > 0: # often no Cys
302
+ per_res_errors = np.mean(per_bond_errors)
303
+ all_errors[aa1] = per_res_errors
304
+ return np.mean(list(all_errors.values()))
305
+
306
+
307
+ def evaluate_backbone_generation(
308
+ model,
309
+ n_samples=1,
310
+ mpnn_model=None,
311
+ struct_pred_model=None,
312
+ tokenizer=None,
313
+ sample_length_range=(50, 512),
314
+ ):
315
+ sampling_config = sampling.default_backbone_sampling_config()
316
+ trimmed_coords, seq_mask = sampling.draw_backbone_samples(
317
+ model,
318
+ n_samples=n_samples,
319
+ sample_length_range=sample_length_range,
320
+ **vars(sampling_config),
321
+ )
322
+ sc_metrics, best_idx, aux = compute_self_consistency(
323
+ trimmed_coords,
324
+ mpnn_model=mpnn_model,
325
+ struct_pred_model=struct_pred_model,
326
+ tokenizer=tokenizer,
327
+ return_aux=True,
328
+ )
329
+ dssp_metrics = compute_secondary_structure_content(trimmed_coords)
330
+ all_metrics = {**sc_metrics, **dssp_metrics}
331
+ all_metrics = {f"bb_{k}": v for k, v in all_metrics.items()}
332
+ return all_metrics, (trimmed_coords, seq_mask, best_idx, aux["pred"], aux["seqs"])
333
+
334
+
335
+ def evaluate_allatom_generation(
336
+ model,
337
+ n_samples,
338
+ two_stage_sampling=True,
339
+ struct_pred_model=None,
340
+ tokenizer=None,
341
+ sample_length_range=(50, 512),
342
+ ):
343
+ # Convert allatom model to codesign model by loading miniMPNN
344
+ model.task = "codesign"
345
+ model.load_minimpnn()
346
+ model.eval()
347
+
348
+ sampling_config = sampling.default_allatom_sampling_config()
349
+ ret = sampling.draw_allatom_samples(
350
+ model,
351
+ n_samples=n_samples,
352
+ two_stage_sampling=two_stage_sampling,
353
+ **vars(sampling_config),
354
+ )
355
+ (
356
+ cropped_samp_coords,
357
+ cropped_samp_aatypes,
358
+ samp_atom_mask,
359
+ stage1_coords,
360
+ seq_mask,
361
+ ) = ret
362
+
363
+ # Compute self consistency
364
+ if struct_pred_model is None:
365
+ struct_pred_model = EsmForProteinFolding.from_pretrained(
366
+ "facebook/esmfold_v1"
367
+ ).to(model.device)
368
+ struct_pred_model.esm = struct_pred_model.esm.half()
369
+ if tokenizer is None:
370
+ tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
371
+ designed_seqs = [utils.aatype_to_seq(a) for a in cropped_samp_aatypes]
372
+ sc_metrics, best_idx, sc_aux = compute_self_consistency(
373
+ comparison_structures=cropped_samp_coords,
374
+ sampled_sequences=designed_seqs,
375
+ struct_pred_model=struct_pred_model,
376
+ tokenizer=tokenizer,
377
+ return_aux=True,
378
+ )
379
+ aa_metrics_out = {f"aa_{k}": v for k, v in sc_metrics.items()}
380
+
381
+ # Compute secondary structure content
382
+ cropped_bb_coords = [c[..., [0, 1, 2, 4], :] for c in cropped_samp_coords]
383
+ dssp_metrics = compute_secondary_structure_content(cropped_bb_coords)
384
+ aa_metrics_out = {**aa_metrics_out, **dssp_metrics}
385
+
386
+ # Compute bond length RMSE
387
+ if two_stage_sampling: # compute on original sample
388
+ bond_rmse_coords = stage1_coords
389
+ else:
390
+ bond_rmse_coords = cropped_samp_coords
391
+ bond_rmse = compute_bond_length_metric(
392
+ bond_rmse_coords, cropped_samp_aatypes, samp_atom_mask
393
+ )
394
+ aa_metrics_out["aa_bond_rmse"] = bond_rmse
395
+
396
+ # Convert codesign model back to allatom model and return metrics
397
+ model.task = "allatom"
398
+ model.remove_minimpnn()
399
+ aa_aux_out = (
400
+ cropped_samp_coords,
401
+ cropped_samp_aatypes,
402
+ samp_atom_mask,
403
+ sc_aux["pred"],
404
+ best_idx,
405
+ )
406
+ return aa_metrics_out, aa_aux_out
models.py ADDED
@@ -0,0 +1,778 @@
1
+ """
2
+ https://github.com/ProteinDesignLab/protpardelle
3
+ License: MIT
4
+ Author: Alex Chu
5
+
6
+ Top-level model definitions.
7
+ Typically these are initialized with config rather than arguments.
8
+ """
9
+ import argparse
10
+ from functools import partial
11
+ import os
12
+ from typing import Callable, List, Optional
13
+
14
+ from einops import rearrange, repeat
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn as nn
18
+ import torch.nn.functional as F
19
+ from torchtyping import TensorType
20
+
21
+ from core import protein_mpnn
22
+ from core import residue_constants
23
+ from core import utils
24
+ import diffusion
25
+ import evaluation
26
+ import modules
27
+
28
+
29
+ class MiniMPNN(nn.Module):
30
+ """Wrapper for ProteinMPNN network to predict sequence from structure."""
31
+
32
+ def __init__(self, config: argparse.Namespace):
33
+ super().__init__()
34
+ self.config = config
35
+ self.model_config = cfg = config.model.mpnn_model
36
+ self.n_tokens = config.data.n_aatype_tokens
37
+ self.seq_emb_dim = cfg.n_channel
38
+ time_cond_dim = cfg.n_channel * cfg.noise_cond_mult
39
+
40
+ self.noise_block = modules.NoiseConditioningBlock(cfg.n_channel, time_cond_dim)
41
+ self.token_embedding = nn.Linear(self.n_tokens, self.seq_emb_dim)
42
+ self.mpnn_net = modules.NoiseConditionalProteinMPNN(
43
+ n_channel=cfg.n_channel,
44
+ n_layers=cfg.n_layers,
45
+ n_neighbors=cfg.n_neighbors,
46
+ time_cond_dim=time_cond_dim,
47
+ vocab_size=config.data.n_aatype_tokens,
48
+ input_S_is_embeddings=True,
49
+ )
50
+ self.proj_out = nn.Linear(cfg.n_channel, self.n_tokens)
51
+
52
+ def forward(
53
+ self,
54
+ denoised_coords: TensorType["b n a x", float],
55
+ coords_noise_level: TensorType["b", float],
56
+ seq_mask: TensorType["b n", float],
57
+ residue_index: TensorType["b n", int],
58
+ seq_self_cond: Optional[TensorType["b n t", float]] = None, # logprobs
59
+ return_embeddings: bool = False,
60
+ ):
61
+ coords_noise_level_scaled = 0.25 * torch.log(coords_noise_level)
62
+ noise_cond = self.noise_block(coords_noise_level_scaled)
63
+
64
+ b, n, _, _ = denoised_coords.shape
65
+ if seq_self_cond is None or not self.model_config.use_self_conditioning:
66
+ seq_emb_in = torch.zeros(b, n, self.seq_emb_dim).to(denoised_coords)
67
+ else:
68
+ seq_emb_in = self.token_embedding(seq_self_cond.exp())
69
+
70
+ node_embs, encoder_embs = self.mpnn_net(
71
+ denoised_coords, seq_emb_in, seq_mask, residue_index, noise_cond
72
+ )
73
+
74
+ logits = self.proj_out(node_embs)
75
+ pred_logprobs = F.log_softmax(logits, -1)
76
+
77
+ if return_embeddings:
78
+ return pred_logprobs, node_embs, encoder_embs
79
+ return pred_logprobs
80
+
81
+
82
+ class CoordinateDenoiser(nn.Module):
83
+ """Wrapper for U-ViT module to denoise structure coordinates."""
84
+
85
+ def __init__(self, config: argparse.Namespace):
86
+ super().__init__()
87
+ self.config = config
88
+
89
+ # Configuration
90
+ self.sigma_data = config.data.sigma_data
91
+ m_cfg = config.model.struct_model
92
+ nc = m_cfg.n_channel
93
+ bb_atoms = ["N", "CA", "C", "O"]
94
+ n_atoms = config.model.struct_model.n_atoms
95
+ self.use_conv = len(m_cfg.uvit.n_filt_per_layer) > 0
96
+ if self.use_conv and n_atoms == 37:
97
+ n_atoms += 1 # make it an even number
98
+ self.n_atoms = n_atoms
99
+ self.bb_idxs = [residue_constants.atom_order[a] for a in bb_atoms]
100
+ n_xyz = 9 if config.model.crop_conditional else 6
101
+ nc_in = n_xyz * n_atoms # xyz + selfcond xyz + maybe cropcond xyz
102
+
103
+ # Neural networks
104
+ n_noise_channel = nc * m_cfg.noise_cond_mult
105
+ self.net = modules.TimeCondUViT(
106
+ seq_len=config.data.fixed_size,
107
+ patch_size=m_cfg.uvit.patch_size,
108
+ dim=nc,
109
+ depth=m_cfg.uvit.n_layers,
110
+ n_filt_per_layer=m_cfg.uvit.n_filt_per_layer,
111
+ heads=m_cfg.uvit.n_heads,
112
+ dim_head=m_cfg.uvit.dim_head,
113
+ conv_skip_connection=m_cfg.uvit.conv_skip_connection,
114
+ n_atoms=n_atoms,
115
+ channels_per_atom=n_xyz,
116
+ time_cond_dim=n_noise_channel,
117
+ position_embedding_type=m_cfg.uvit.position_embedding_type,
118
+ )
119
+ self.noise_block = modules.NoiseConditioningBlock(nc, n_noise_channel)
120
+
121
+ def forward(
122
+ self,
123
+ noisy_coords: TensorType["b n a x", float],
124
+ noise_level: TensorType["b", float],
125
+ seq_mask: TensorType["b n", float],
126
+ residue_index: Optional[TensorType["b n", int]] = None,
127
+ struct_self_cond: Optional[TensorType["b n a x", float]] = None,
128
+ struct_crop_cond: Optional[TensorType["b n a x", float]] = None,
129
+ ):
130
+ # Prep inputs and time conditioning
131
+ actual_var_data = self.sigma_data**2
132
+ var_noisy_coords = noise_level**2 + actual_var_data
133
+ emb = noisy_coords / utils.expand(var_noisy_coords.sqrt(), noisy_coords)
134
+ struct_noise_scaled = 0.25 * torch.log(noise_level)
135
+ noise_cond = self.noise_block(struct_noise_scaled)
136
+
137
+ # Prepare self- and crop-conditioning and concatenate along channels
138
+ if struct_self_cond is None:
139
+ struct_self_cond = torch.zeros_like(noisy_coords)
140
+ if self.config.model.crop_conditional:
141
+ if struct_crop_cond is None:
142
+ struct_crop_cond = torch.zeros_like(noisy_coords)
143
+ else:
144
+ struct_crop_cond = struct_crop_cond / self.sigma_data
145
+ emb = torch.cat([emb, struct_self_cond, struct_crop_cond], -1)
146
+ else:
147
+ emb = torch.cat([emb, struct_self_cond], -1)
148
+
149
+ # Run neural network
150
+ emb = self.net(emb, noise_cond, seq_mask=seq_mask, residue_index=residue_index)
151
+
152
+ # Preconditioning from Karras et al.
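# (Editor's note, not in the original source) In EDM notation these are
# c_out = sigma * sigma_data / sqrt(sigma^2 + sigma_data^2) and
# c_skip = sigma_data^2 / (sigma^2 + sigma_data^2), so the network only
# predicts the part of x0 not already carried by the (scaled) noisy input.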
153
+ out_scale = noise_level * actual_var_data**0.5 / torch.sqrt(var_noisy_coords)
154
+ skip_scale = actual_var_data / var_noisy_coords
155
+ emb = emb * utils.expand(out_scale, emb)
156
+ skip_info = noisy_coords * utils.expand(skip_scale, noisy_coords)
157
+ denoised_coords_x0 = emb + skip_info
158
+
159
+ # Don't use atom mask; denoise all atoms
160
+ denoised_coords_x0 *= utils.expand(seq_mask, denoised_coords_x0)
161
+ return denoised_coords_x0
162
+
163
+
164
+ class Protpardelle(nn.Module):
165
+ """All-atom protein diffusion-based generative model.
166
+
167
+ This class wraps a structure denoising network and a sequence prediction network
168
+ to do structure/sequence co-design (for all-atom generation), or backbone generation.
169
+
170
+ It can be trained for one of four main tasks. To produce the all-atom (co-design)
171
+ Protpardelle model, we will typically pretrain an 'allatom' model, then use this
172
+ to train a 'seqdes' model. A 'seqdes' model can be trained with either a backbone
173
+ or allatom denoiser. The two can be combined to yield all-atom (co-design) Protpardelle
174
+ without further training.
175
+ 'backbone': train only a backbone coords denoiser.
176
+ 'seqdes': train only a mini-MPNN, using a pretrained coords denoiser.
177
+ 'allatom': train only an allatom coords denoiser (cannot do all-atom generation
178
+ by itself).
179
+ 'codesign': train both an allatom denoiser and mini-MPNN at once.
180
+
181
+ """
182
+
183
+ def __init__(self, config: argparse.Namespace, device: str = "cpu"):
184
+ super().__init__()
185
+ self.config = config
186
+ self.device = device
187
+ self.task = config.model.task
188
+ self.n_tokens = config.data.n_aatype_tokens
189
+
190
+ self.use_mpnn_model = self.task in ["seqdes", "codesign"]
191
+
192
+ # Modules
193
+ self.all_modules = {}
194
+ self.bb_idxs = [0, 1, 2, 4]
195
+ self.n_atoms = 37
196
+ self.struct_model = CoordinateDenoiser(config)
197
+ self.all_modules["struct_model"] = self.struct_model
198
+ self.bb_idxs = self.struct_model.bb_idxs
199
+ self.n_atoms = self.struct_model.n_atoms
200
+
201
+ if self.use_mpnn_model:
202
+ self.mpnn_model = MiniMPNN(config)
203
+ self.all_modules["mpnn_model"] = self.mpnn_model
204
+
205
+ # Load any pretrained modules
206
+ for module_name in self.config.model.pretrained_modules:
207
+ self.load_pretrained_module(module_name)
208
+
209
+ # Diffusion-related
210
+ self.sigma_data = self.struct_model.sigma_data
211
+ self.training_noise_schedule = partial(
212
+ diffusion.noise_schedule,
213
+ sigma_data=self.sigma_data,
214
+ **vars(config.diffusion.training),
215
+ )
216
+ self.sampling_noise_schedule_default = self.make_sampling_noise_schedule()
217
+
218
+ def load_pretrained_module(self, module_name: str, ckpt_path: Optional[str] = None):
219
+ """Load pretrained weights for a given module name."""
220
+ assert module_name in ["struct_model", "mpnn_model"], module_name
221
+
222
+ # Load pretrained checkpoint
223
+ if ckpt_path is None:
224
+ ckpt_path = getattr(self.config.model, f"{module_name}_checkpoint")
225
+ ckpt_path = os.path.join(self.config.train.home_dir, ckpt_path)
226
+ ckpt_dict = torch.load(ckpt_path, map_location=self.device)
227
+ model_state_dict = ckpt_dict["model_state_dict"]
228
+
229
+ # Get only submodule state_dict
230
+ submodule_state_dict = {
231
+ sk[len(module_name) + 1 :]: sv
232
+ for sk, sv in model_state_dict.items()
233
+ if sk.startswith(module_name)
234
+ }
235
+
236
+ # Load into module
237
+ module = dict(self.named_modules())[module_name]
238
+ module.load_state_dict(submodule_state_dict)
239
+
240
+ # Freeze unneeded modules
241
+ if module_name == "struct_model":
242
+ self.struct_model = module
243
+ if self.task == "seqdes":
244
+ for p in module.parameters():
245
+ p.requires_grad = False
246
+ if module_name == "mpnn_model":
247
+ self.mpnn_model = module
248
+ if self.task not in ["codesign", "seqdes"]:
249
+ for p in module.parameters():
250
+ p.requires_grad = False
251
+
252
+ return module
253
+
254
+ def load_minimpnn(self, mpnn_ckpt_path: Optional[str] = None):
255
+ """Convert an allatom model to a codesign model."""
256
+ if mpnn_ckpt_path is None:
257
+ mpnn_ckpt_path = "checkpoints/minimpnn_state_dict.pth"
258
+ self.mpnn_model = MiniMPNN(self.config).to(self.device)
259
+ self.load_pretrained_module("mpnn_model", ckpt_path=mpnn_ckpt_path)
260
+ self.use_mpnn_model = True
261
+ return
262
+
263
+ def remove_minimpnn(self):
264
+ """Revert a codesign model to an allatom model to a codesign model."""
265
+ self.use_mpnn_model = False
266
+ self.mpnn_model = None
267
+ self.all_modules["mpnn_model"] = None
268
+
269
+ def make_sampling_noise_schedule(self, **noise_kwargs):
270
+ """Make the default sampling noise schedule function."""
271
+ noise_schedule_kwargs = vars(self.config.diffusion.sampling)
272
+ if len(noise_kwargs) > 0:
273
+ noise_schedule_kwargs.update(noise_kwargs)
274
+ return partial(diffusion.noise_schedule, **noise_schedule_kwargs)
275
+
276
+ def forward(
277
+ self,
278
+ *,
279
+ noisy_coords: TensorType["b n a x", float],
280
+ noise_level: TensorType["b", float],
281
+ seq_mask: TensorType["b n", float],
282
+ residue_index: TensorType["b n", int],
283
+ struct_self_cond: Optional[TensorType["b n a x", float]] = None,
284
+ struct_crop_cond: Optional[TensorType["b n a x", float]] = None,
285
+ seq_self_cond: Optional[TensorType["b n t", float]] = None, # logprobs
286
+ run_struct_model: bool = True,
287
+ run_mpnn_model: bool = True,
288
+ ):
289
+ """Main forward function for denoising/co-design.
290
+
291
+ Arguments:
292
+ noisy_coords: noisy array of xyz coordinates.
293
+ noise_level: std of noise for each example in the batch.
294
+ seq_mask: mask indicating which indexes contain data.
295
+ residue_index: residue ordering. This is used by proteinMPNN, but currently
296
+ only used by the diffusion model when the 'absolute_residx' or
297
+ 'relative' position_embedding_type is specified.
298
+ struct_self_cond: denoised coordinates from the previous step, scaled
299
+ down by sigma data.
300
+ struct_crop_cond: unnoised coordinates. unscaled (scaled down by sigma
301
+ data inside the denoiser)
302
+ seq_self_cond: mpnn-predicted sequence logprobs from the previous step.
303
+ run_struct_model: flag to optionally not run structure denoiser.
304
+ run_mpnn_model: flag to optionally not run mini-mpnn.
305
+ """
306
+
307
+ # Coordinate denoiser
308
+ denoised_x0 = noisy_coords
309
+ if run_struct_model:
310
+ denoised_x0 = self.struct_model(
311
+ noisy_coords,
312
+ noise_level,
313
+ seq_mask,
314
+ residue_index=residue_index,
315
+ struct_self_cond=struct_self_cond,
316
+ struct_crop_cond=struct_crop_cond,
317
+ )
318
+
319
+ # Mini-MPNN
320
+ aatype_logprobs = None
321
+ if self.use_mpnn_model and run_mpnn_model:
322
+ aatype_logprobs = self.mpnn_model(
323
+ denoised_x0.detach(),
324
+ noise_level,
325
+ seq_mask,
326
+ residue_index,
327
+ seq_self_cond=seq_self_cond,
328
+ return_embeddings=False,
329
+ )
330
+ aatype_logprobs = aatype_logprobs * seq_mask[..., None]
331
+
332
+ # Process outputs
333
+ if aatype_logprobs is None:
334
+ aatype_logprobs = repeat(seq_mask, "b n -> b n t", t=self.n_tokens)
335
+ aatype_logprobs = torch.ones_like(aatype_logprobs)
336
+ aatype_logprobs = F.log_softmax(aatype_logprobs, -1)
337
+ struct_self_cond_out = denoised_x0.detach() / self.sigma_data
338
+ seq_self_cond_out = aatype_logprobs.detach()
339
+
340
+ return denoised_x0, aatype_logprobs, struct_self_cond_out, seq_self_cond_out
341
+
342
+ def make_seq_mask_for_sampling(
343
+ self,
344
+ prot_lens: Optional[TensorType["b", int]] = None,
345
+ n_samples: int = 1,
346
+ min_len: int = 50,
347
+ max_len: Optional[int] = None,
348
+ ):
349
+ """Makes a sequence mask of varying protein lengths (only input required
350
+ to begin sampling).
351
+ """
352
+ if max_len is None:
353
+ max_len = self.config.data.fixed_size
354
+ if prot_lens is None:
355
+ possible_lens = np.arange(min_len, max_len)
356
+ prot_lens = torch.Tensor(np.random.choice(possible_lens, n_samples))
357
+ else:
358
+ n_samples = len(prot_lens)
359
+ max_len = max(prot_lens)
360
+ mask = repeat(torch.arange(max_len), "n -> b n", b=n_samples)
361
+ mask = (mask < prot_lens[:, None]).float().to(self.device)
362
+ return mask
363
+
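# Editor's sketch (not part of the original source): for two proteins of
# lengths 60 and 80, make_seq_mask_for_sampling(prot_lens=torch.tensor([60, 80]))
# returns a (2, 80) mask whose row i is 1.0 for the first prot_lens[i] positions.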
364
+ def sample(
365
+ self,
366
+ *,
367
+ seq_mask: TensorType["b n", float] = None,
368
+ n_samples: int = 1,
369
+ min_len: int = 50,
370
+ max_len: int = 512,
371
+ residue_index: TensorType["b n", int] = None,
372
+ gt_coords: TensorType["b n a x", float] = None,
373
+ gt_coords_traj: List[TensorType["b n a x", float]] = None,
374
+ gt_cond_atom_mask: TensorType["b n a", float] = None,
375
+ gt_aatype: TensorType["b n", int] = None,
376
+ gt_cond_seq_mask: TensorType["b n", float] = None,
377
+ apply_cond_proportion: float = 1.0,
378
+ n_steps: int = 200,
379
+ step_scale: float = 1.2,
380
+ s_churn: float = 50.0,
381
+ noise_scale: float = 1.0,
382
+ s_t_min: float = 0.01,
383
+ s_t_max: float = 50.0,
384
+ temperature: float = 1.0,
385
+ top_p: float = 1.0,
386
+ disallow_aas: List[int] = [4, 20], # cys, unk
387
+ sidechain_mode: bool = False,
388
+ skip_mpnn_proportion: float = 0.7,
389
+ anneal_seq_resampling_rate: Optional[str] = None, # linear, cosine
390
+ use_fullmpnn: bool = False,
391
+ use_fullmpnn_for_final: bool = True,
392
+ use_reconstruction_guidance: bool = False,
393
+ use_classifier_free_guidance: bool = False, # defaults to replacement guidance if these are all false
394
+ guidance_scale: float = 1.0,
395
+ noise_schedule: Optional[Callable] = None,
396
+ tqdm_pbar: Optional[Callable] = None,
397
+ return_last: bool = True,
398
+ return_aux: bool = False,
399
+ ):
400
+ """Sampling function for backbone or all-atom diffusion. All arguments are optional.
401
+
402
+ Arguments:
403
+ seq_mask: mask defining the number and lengths of proteins to be sampled.
404
+ n_samples: number of samples to draw (if seq_mask not provided).
405
+ min_len: minimum length of proteins to be sampled (if seq_mask not provided).
406
+ max_len: maximum length of proteins to be sampled (if seq_mask not provided).
407
+ residue_index: residue index of proteins to be sampled.
408
+ gt_coords: conditioning information for coords.
409
+ gt_coords_traj: conditioning information for coords specified for each timestep
410
+ (if gt_coords is not provided).
411
+ gt_cond_atom_mask: mask identifying atoms to apply gt_coords.
412
+ gt_aatype: conditioning information for sequence.
413
+ gt_cond_seq_mask: sequence positions to apply gt_aatype.
414
+ apply_cond_proportion: the proportion of timesteps to apply the conditioning.
415
+ e.g. if 0.5, then the first 50% of steps use conditioning, and the last 50%
416
+ are unconditional.
417
+ n_steps: number of denoising steps (ODE discretizations).
418
+ step_scale: scale to apply to the score.
419
+ s_churn: gamma = s_churn / n_steps describes the additional noise to add
420
+ relatively at each denoising step. Use 0.0 for deterministic sampling or
421
+ 0.2 * n_steps as a rough default for stochastic sampling.
422
+ noise_scale: scale to apply to gamma.
423
+ s_t_min: don't apply s_churn below this noise level.
424
+ s_t_max: don't apply s_churn above this noise level.
425
+ temperature: scale to apply to aatype logits.
426
+ top_p: don't sample tokens which fall outside this proportion of the total probability.
427
+ disallow_aas: don't sample these token indices.
428
+ sidechain_mode: whether to do all-atom sampling (False for backbone-only).
429
+ skip_mpnn_proportion: proportion of timesteps from the start to skip running
430
+ mini-MPNN.
431
+ anneal_seq_resampling_rate: whether and how to decay the probability of
432
+ running mini-MPNN. None, 'linear', or 'cosine'
433
+ use_fullmpnn: use "full" ProteinMPNN at each step.
434
+ use_fullmpnn_for_final: use "full" ProteinMPNN at the final step.
435
+ use_reconstruction_guidance: use reconstruction guidance on the conditioning.
436
+ use_classifier_free_guidance: use classifier-free guidance on the conditioning.
437
+ guidance_scale: weight for reconstruction/classifier-free guidance.
438
+ noise_schedule: specify the noise level timesteps for sampling.
439
+ tqdm_pbar: progress bar in interactive contexts.
440
+ return_last: return only the sampled structure and sequence.
441
+ return_aux: return a dict of everything associated with the sampling run.
442
+ """
443
+
444
        def ode_step(sigma_in, sigma_next, xt_in, x0_pred, gamma, guidance_in=None):
            if gamma > 0:
                t_hat = sigma_in + gamma * sigma_in
                sigma_delta = torch.sqrt(t_hat**2 - sigma_in**2)
                noisier_x = xt_in + utils.expand(
                    sigma_delta, xt_in
                ) * noise_scale * torch.randn_like(xt_in).to(xt_in)
                xt_in = noisier_x * utils.expand(seq_mask, noisier_x)
                sigma_in = t_hat

            mask = (sigma_in > 0).float()
            score = (xt_in - x0_pred) / utils.expand(sigma_in.clamp(min=1e-6), xt_in)
            score = score * utils.expand(mask, score)
            if use_reconstruction_guidance:
                guidance, guidance_mask = guidance_in
                guidance = guidance * guidance_mask[..., None]
                guidance_std = guidance[guidance_mask.bool()].var().sqrt()
                score_std = score[guidance_mask.bool()].var().sqrt()
                score = score + guidance * guidance_scale
            if use_classifier_free_guidance:
                # guidance_in is the unconditional x0 (x0_pred is the conditional x0)
                # guidance_scale = 1 + w from Ho paper
                # ==0: use only unconditional score; <1: interpolate the scores;
                # ==1: use only conditional score; >1: skew towards conditional score
                uncond_x0 = guidance_in
                uncond_score = (xt_in - uncond_x0) / utils.expand(
                    sigma_in.clamp(min=1e-6), xt_in
                )
                uncond_score = uncond_score * utils.expand(mask, uncond_score)
                score = guidance_scale * score + (1 - guidance_scale) * uncond_score
            step = score * step_scale * utils.expand(sigma_next - sigma_in, score)
            new_xt = xt_in + step
            return new_xt

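        # Editor's note (illustrative, not part of the original code): in
        # sample_aatype below, top_p=0.9 applied to sorted probabilities
        # [0.5, 0.3, 0.15, 0.05] gives cumulative sums [0.5, 0.8, 0.95, 1.0],
        # so only the first two tokens survive truncation; the surviving
        # probabilities are then log-transformed, temperature-scaled, and
        # renormalized by the softmax before the categorical draw.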
        def sample_aatype(logprobs):
            # Top-p truncation
            probs = F.softmax(logprobs.clone(), dim=-1)
            sorted_prob, sorted_idxs = torch.sort(probs, descending=True)
            cumsum_prob = torch.cumsum(sorted_prob, dim=-1)
            sorted_indices_to_remove = cumsum_prob > top_p
            sorted_indices_to_remove[..., 0] = 0
            sorted_prob[sorted_indices_to_remove] = 0
            orig_probs = torch.scatter(
                torch.zeros_like(sorted_prob),
                dim=-1,
                index=sorted_idxs,
                src=sorted_prob,
            )

            # Apply temperature and disallowed AAs and sample
            assert temperature >= 0.0
            scaled_logits = orig_probs.clamp(min=1e-9).log() / (temperature + 1e-4)
            if disallow_aas:
                unwanted_mask = torch.zeros(scaled_logits.shape[-1]).to(scaled_logits)
                unwanted_mask[disallow_aas] = 1
                scaled_logits -= unwanted_mask * 1e10
            orig_probs = F.softmax(scaled_logits, dim=-1)
            categorical = torch.distributions.Categorical(probs=orig_probs)
            samp_aatype = categorical.sample()
            return samp_aatype

        def design_with_fullmpnn(batched_coords, seq_mask):
            seq_lens = seq_mask.sum(-1).long()
            designed_seqs = [
                evaluation.design_sequence(c[: seq_lens[i]], model=fullmpnn_model)[0]
                for i, c in enumerate(batched_coords)
            ]
            designed_aatypes, _ = utils.batched_seq_to_aatype_and_mask(
                designed_seqs, max_len=seq_mask.shape[-1]
            )
            return designed_aatypes

        # Initialize masks/features
        if seq_mask is None:  # Sample random lengths
            assert gt_aatype is None  # Don't condition on aatype without seq_mask
            seq_mask = self.make_seq_mask_for_sampling(
                n_samples=n_samples,
                min_len=min_len,
                max_len=max_len,
            )
        if residue_index is None:
            residue_index = torch.arange(seq_mask.shape[-1])
            residue_index = repeat(residue_index, "n -> b n", b=seq_mask.shape[0])
            residue_index = residue_index.to(seq_mask) * seq_mask
        if use_fullmpnn or use_fullmpnn_for_final:
            fullmpnn_model = protein_mpnn.get_mpnn_model(
                path_to_model_weights=self.config.train.home_dir
                + "/ProteinMPNN/vanilla_model_weights",
                device=self.device,
            )

        # Initialize noise schedule/parameters
        to_batch_size = lambda x: x * torch.ones(seq_mask.shape[0]).to(self.device)
        s_t_min = s_t_min * self.sigma_data
        s_t_max = s_t_max * self.sigma_data
        if noise_schedule is None:
            noise_schedule = self.sampling_noise_schedule_default
        sigma = noise_schedule(1)
        timesteps = torch.linspace(1, 0, n_steps + 1)

        # Set up conditioning/guidance information
        crop_cond_coords = None
        if gt_coords is None:
            coords_shape = seq_mask.shape + (self.n_atoms, 3)
            xt = torch.randn(*coords_shape).to(self.device) * sigma
            xt *= utils.expand(seq_mask, xt)
        else:
            assert gt_coords_traj is None
            noise_levels = [to_batch_size(noise_schedule(t)) for t in timesteps]
            gt_coords_traj = [
                diffusion.noise_coords(gt_coords, nl) for nl in noise_levels
            ]
            xt = gt_coords_traj[0]
            if gt_cond_atom_mask is not None:
                crop_cond_coords = gt_coords * gt_cond_atom_mask[..., None]
        gt_atom_mask = None
        if gt_aatype is not None:
            gt_atom_mask = utils.atom37_mask_from_aatype(gt_aatype, seq_mask)
        fake_logits = repeat(seq_mask, "b n -> b n t", t=self.n_tokens)
        s_hat = (sample_aatype(fake_logits) * seq_mask).long()

        # Initialize superposition for all-atom sampling
        if sidechain_mode:
            b, n = seq_mask.shape[:2]

            # Latest predicted x0 for sidechain superpositions
            atom73_state_0 = torch.zeros(b, n, 73, 3).to(xt)

            # Current state xt for sidechain superpositions (denoised to different levels)
            atom73_state_t = torch.randn(b, n, 73, 3).to(xt) * sigma

            # Noise level of xt
            sigma73_last = torch.ones(b, n, 73).to(xt) * sigma

            # Seqhat and mask used to choose sidechains for euler step (b, n)
            s_hat = (seq_mask * 7).long()
            mask37 = utils.atom37_mask_from_aatype(s_hat, seq_mask).bool()
            mask73 = utils.atom73_mask_from_aatype(s_hat, seq_mask).bool()
            begin_mpnn_step = int(n_steps * skip_mpnn_proportion)

        # Prepare to run sampling trajectory
        sigma = to_batch_size(sigma)
        x0 = None
        x0_prev = None
        x_self_cond = None
        s_logprobs = None
        s_self_cond = None
        if tqdm_pbar is None:
            tqdm_pbar = lambda x: x
        torch.set_grad_enabled(False)

        # *t_traj is the denoising trajectory; *0_traj is the evolution of predicted clean data
        # s0 are aatype probs of shape (b n t); s_hat are discrete aatype of shape (b n)
        xt_traj, x0_traj, st_traj, s0_traj = [], [], [], []

        # Sampling trajectory
        for i, t in tqdm_pbar(enumerate(iter(timesteps[1:]))):
            # Set up noise levels
            sigma_next = noise_schedule(t)
            if i == n_steps - 1:
                sigma_next *= 0
            gamma = (
                s_churn / n_steps
                if (sigma_next >= s_t_min and sigma_next <= s_t_max)
                else 0.0
            )
            sigma_next = to_batch_size(sigma_next)

            if sidechain_mode:
                # Fill in noise for masked positions since xt is initialized to zeros at each step
                dummy_fill_noise = torch.randn_like(xt) * utils.expand(sigma, xt)
                zero_atom_mask = utils.atom37_mask_from_aatype(s_hat, seq_mask)
                dummy_fill_mask = 1 - zero_atom_mask[..., None]
                xt = xt * zero_atom_mask[..., None] + dummy_fill_noise * dummy_fill_mask
            else:  # backbone only
                bb_seq = (seq_mask * residue_constants.restype_order["G"]).long()
                bb_atom_mask = utils.atom37_mask_from_aatype(bb_seq, seq_mask)
                xt *= bb_atom_mask[..., None]

            # Enable grad for reconstruction guidance
            if use_reconstruction_guidance:
                torch.set_grad_enabled(True)
                xt.requires_grad = True

            # Run denoising network
            run_mpnn = not sidechain_mode or i > begin_mpnn_step
            x0, s_logprobs, x_self_cond, s_self_cond = self.forward(
                noisy_coords=xt,
                noise_level=sigma,
                seq_mask=seq_mask,
                residue_index=residue_index,
                struct_self_cond=x_self_cond,
                struct_crop_cond=crop_cond_coords,
                seq_self_cond=s_self_cond,
                run_mpnn_model=run_mpnn,
            )

            # Compute additional stuff for guidance
            if use_reconstruction_guidance:
                loss = (x0 - gt_coords).pow(2).sum(-1)
                loss = loss * gt_cond_atom_mask
                loss = loss.sum() / gt_cond_atom_mask.sum().clamp(min=1)
                xt.retain_grad()
                loss.backward()
                guidance = xt.grad.clone()
                xt.grad *= 0
                torch.set_grad_enabled(False)
            if use_classifier_free_guidance:
                assert not use_reconstruction_guidance
                uncond_x0, _, _, _ = self.forward(
                    noisy_coords=xt,
                    noise_level=sigma,
                    seq_mask=seq_mask,
                    residue_index=residue_index,
                    struct_self_cond=x_self_cond,
                    seq_self_cond=s_self_cond,
                    run_mpnn_model=run_mpnn,
                )

            # Structure denoising step
            if not sidechain_mode:  # backbone
                if sigma[0] > 0:
                    xt = ode_step(sigma, sigma_next, xt, x0, gamma)
                else:
                    xt = x0
            else:  # allatom
                # Write x0 into atom73_state_0 for atoms corresponding to old seqhat
                atom73_state_0[mask73] = x0[mask37]

                # Determine sequence resampling probability
                if anneal_seq_resampling_rate is not None:
                    step_time = 1 - (i - begin_mpnn_step) / max(
                        1, n_steps - begin_mpnn_step
                    )
                    if anneal_seq_resampling_rate == "linear":
                        resampling_rate = step_time
                    elif anneal_seq_resampling_rate == "cosine":
                        k = 2
                        resampling_rate = (
                            1 + np.cos(2 * np.pi * (step_time - 0.5))
                        ) / k
                    resample_this_step = np.random.uniform() < resampling_rate

                # Resample sequence or design with full ProteinMPNN
                if i == n_steps - 1 and use_fullmpnn_for_final:
                    s_hat = design_with_fullmpnn(x0, seq_mask).to(x0.device)
                elif anneal_seq_resampling_rate is None or resample_this_step:
                    if run_mpnn and use_fullmpnn:
                        s_hat = design_with_fullmpnn(x0, seq_mask).to(x0.device)
                    else:
                        s_hat = sample_aatype(s_logprobs)

                # Overwrite s_hat with any conditioning information
                if (i + 1) / n_steps <= apply_cond_proportion:
                    if gt_cond_seq_mask is not None and gt_aatype is not None:
                        s_hat = (
                            1 - gt_cond_seq_mask
                        ) * s_hat + gt_cond_seq_mask * gt_aatype
                        s_hat = s_hat.long()

                # Set masks for collapsing superposition using new sequence
                mask37 = utils.atom37_mask_from_aatype(s_hat, seq_mask).bool()
                mask73 = utils.atom73_mask_from_aatype(s_hat, seq_mask).bool()

                # Determine prev noise levels for atoms corresponding to new sequence
                step_sigma_prev = (
                    torch.ones(*xt.shape[:-1]).to(xt) * sigma[..., None, None]
                )
                step_sigma_prev[mask37] = sigma73_last[mask73]  # b, n, 37
                step_sigma_next = sigma_next[..., None, None]  # b, 1, 1

                # Denoising step on atoms corresponding to new sequence
                b, n = mask37.shape[:2]
                step_xt = torch.zeros(b, n, 37, 3).to(xt)
                step_x0 = torch.zeros(b, n, 37, 3).to(xt)
                step_xt[mask37] = atom73_state_t[mask73]
                step_x0[mask37] = atom73_state_0[mask73]

                guidance_in = None
                if (i + 1) / n_steps <= apply_cond_proportion:
                    if use_reconstruction_guidance:
                        guidance_in = (guidance, mask37.float())
                    elif use_classifier_free_guidance:
                        guidance_in = uncond_x0

                step_xt = ode_step(
                    step_sigma_prev,
                    step_sigma_next,
                    step_xt,
                    step_x0,
                    gamma,
                    guidance_in=guidance_in,
                )
                xt = step_xt

                # Write new xt into atom73_state_t for atoms corresponding to new seqhat and update sigma_last
                atom73_state_t[mask73] = step_xt[mask37]
                sigma73_last[mask73] = step_sigma_next[0].item()

            # Replacement guidance if conditioning information provided
            if (i + 1) / n_steps <= apply_cond_proportion:
                if gt_coords_traj is not None:
                    if gt_cond_atom_mask is None:
                        xt = gt_coords_traj[i + 1]
                    else:
                        xt = (1 - gt_cond_atom_mask)[
                            ..., None
                        ] * xt + gt_cond_atom_mask[..., None] * gt_coords_traj[i + 1]

            sigma = sigma_next

            # Logging
            xt_scale = self.sigma_data / utils.expand(
                torch.sqrt(sigma_next**2 + self.sigma_data**2), xt
            )
            scaled_xt = xt * xt_scale
            xt_traj.append(scaled_xt.cpu())
            x0_traj.append(x0.cpu())
            st_traj.append(s_hat.cpu())
            s0_traj.append(s_logprobs.cpu())

        if return_last:
            return xt, s_hat, seq_mask
        elif return_aux:
            return {
                "x": xt,
                "s": s_hat,
                "seq_mask": seq_mask,
                "xt_traj": xt_traj,
                "x0_traj": x0_traj,
                "st_traj": st_traj,
                "s0_traj": s0_traj,
            }
        else:
            return xt_traj, x0_traj, st_traj, s0_traj, seq_mask
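Editor's usage sketch (not part of this commit): a minimal call into the sample() method above. `model` is assumed to be a loaded Protpardelle-style module exposing this method; the argument values mirror the backbone defaults in sampling.py further down.

# Hedged example; `model` is an assumption, not defined in this diff.
coords, aatype, seq_mask = model.sample(
    n_samples=2,           # two proteins of random length in [60, 100]
    min_len=60,
    max_len=100,
    n_steps=500,
    s_churn=200,           # stochastic sampling; 0.0 would be deterministic
    step_scale=1.2,
    sidechain_mode=False,  # backbone-only
    return_last=True,      # -> (xt, s_hat, seq_mask)
)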
modules.py ADDED
@@ -0,0 +1,696 @@
"""
https://github.com/ProteinDesignLab/protpardelle
License: MIT
Author: Alex Chu

Neural network modules. Many of these are adapted from open source modules.
"""
from typing import List, Sequence, Optional

from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange
import numpy as np
from rotary_embedding_torch import RotaryEmbedding
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, EsmModel

from core import protein_mpnn
from core import residue_constants
from core import utils


########################################
# Adapted from https://github.com/ermongroup/ddim


def downsample(x):
    return nn.functional.avg_pool2d(x, 2, 2, ceil_mode=True)


def upsample_coords(x, shape):
    new_l, new_w = shape
    return nn.functional.interpolate(x, size=(new_l, new_w), mode="nearest")


########################################
# Adapted from https://github.com/aqlaboratory/openfold


def permute_final_dims(tensor: torch.Tensor, inds: List[int]):
    zero_index = -1 * len(inds)
    first_inds = list(range(len(tensor.shape[:zero_index])))
    return tensor.contiguous().permute(first_inds + [zero_index + i for i in inds])


def lddt(
    all_atom_pred_pos: torch.Tensor,
    all_atom_positions: torch.Tensor,
    all_atom_mask: torch.Tensor,
    cutoff: float = 15.0,
    eps: float = 1e-10,
    per_residue: bool = True,
) -> torch.Tensor:
    n = all_atom_mask.shape[-2]
    dmat_true = torch.sqrt(
        eps
        + torch.sum(
            (all_atom_positions[..., None, :] - all_atom_positions[..., None, :, :])
            ** 2,
            dim=-1,
        )
    )

    dmat_pred = torch.sqrt(
        eps
        + torch.sum(
            (all_atom_pred_pos[..., None, :] - all_atom_pred_pos[..., None, :, :]) ** 2,
            dim=-1,
        )
    )
    dists_to_score = (
        (dmat_true < cutoff)
        * all_atom_mask
        * permute_final_dims(all_atom_mask, (1, 0))
        * (1.0 - torch.eye(n, device=all_atom_mask.device))
    )

    dist_l1 = torch.abs(dmat_true - dmat_pred)

    score = (
        (dist_l1 < 0.5).type(dist_l1.dtype)
        + (dist_l1 < 1.0).type(dist_l1.dtype)
        + (dist_l1 < 2.0).type(dist_l1.dtype)
        + (dist_l1 < 4.0).type(dist_l1.dtype)
    )
    score = score * 0.25

    dims = (-1,) if per_residue else (-2, -1)
    norm = 1.0 / (eps + torch.sum(dists_to_score, dim=dims))
    score = norm * (eps + torch.sum(dists_to_score * score, dim=dims))

    return score

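# Editor's note (illustrative, not part of the original code): lddt() above
# scores each pairwise distance whose true value is under 15 A against four
# absolute-error thresholds (0.5, 1, 2, 4 A) and averages the four hits, so a
# prediction whose error on some pair is 1.5 A earns 2 of 4 hits (0.5 credit)
# for that pair; with per_residue=True the average is taken over each
# residue's row of pairs rather than over the whole matrix.
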
class RelativePositionalEncoding(nn.Module):
    def __init__(self, attn_dim=8, max_rel_idx=32):
        super().__init__()
        self.max_rel_idx = max_rel_idx
        self.n_rel_pos = 2 * self.max_rel_idx + 1
        self.linear = nn.Linear(self.n_rel_pos, attn_dim)

    def forward(self, residue_index):
        d_ij = residue_index[..., None] - residue_index[..., None, :]
        v_bins = torch.arange(self.n_rel_pos).to(d_ij.device) - self.max_rel_idx
        idxs = (d_ij[..., None] - v_bins[None, None]).abs().argmin(-1)
        p_ij = nn.functional.one_hot(idxs, num_classes=self.n_rel_pos)
        embeddings = self.linear(p_ij.float())
        return embeddings


########################################
# Adapted from https://github.com/NVlabs/edm


class Noise_Embedding(nn.Module):
    def __init__(self, num_channels, max_positions=10000, endpoint=False):
        super().__init__()
        self.num_channels = num_channels
        self.max_positions = max_positions
        self.endpoint = endpoint

    def forward(self, x):
        freqs = torch.arange(
            start=0, end=self.num_channels // 2, dtype=torch.float32, device=x.device
        )
        freqs = freqs / (self.num_channels // 2 - (1 if self.endpoint else 0))
        freqs = (1 / self.max_positions) ** freqs
        x = x.outer(freqs.to(x.dtype))
        x = torch.cat([x.cos(), x.sin()], dim=1)
        return x


########################################
# Adapted from github.com/lucidrains
# https://github.com/lucidrains/denoising-diffusion-pytorch
# https://github.com/lucidrains/recurrent-interface-network-pytorch


def exists(x):
    return x is not None


def default(val, d):
    if exists(val):
        return val
    return d() if callable(d) else d


def posemb_sincos_1d(patches, temperature=10000, residue_index=None):
    _, n, dim, device, dtype = *patches.shape, patches.device, patches.dtype

    n = torch.arange(n, device=device) if residue_index is None else residue_index
    assert (dim % 2) == 0, "feature dimension must be multiple of 2 for sincos emb"
    omega = torch.arange(dim // 2, device=device) / (dim // 2 - 1)
    omega = 1.0 / (temperature**omega)

    n = n[..., None] * omega
    pe = torch.cat((n.sin(), n.cos()), dim=-1)
    return pe.type(dtype)


class LayerNorm(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(dim))
        self.register_buffer("beta", torch.zeros(dim))

    def forward(self, x):
        return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta)


class NoiseConditioningBlock(nn.Module):
    def __init__(self, n_in_channel, n_out_channel):
        super().__init__()
        self.block = nn.Sequential(
            Noise_Embedding(n_in_channel),
            nn.Linear(n_in_channel, n_out_channel),
            nn.SiLU(),
            nn.Linear(n_out_channel, n_out_channel),
            Rearrange("b d -> b 1 d"),
        )

    def forward(self, noise_level):
        return self.block(noise_level)


class TimeCondResnetBlock(nn.Module):
    def __init__(
        self, nic, noc, cond_nc, conv_layer=nn.Conv2d, dropout=0.1, n_norm_in_groups=4
    ):
        super().__init__()
        self.block1 = nn.Sequential(
            nn.GroupNorm(num_groups=nic // n_norm_in_groups, num_channels=nic),
            nn.SiLU(),
            conv_layer(nic, noc, 3, 1, 1),
        )
        self.cond_proj = nn.Linear(cond_nc, noc * 2)
        self.mid_norm = nn.GroupNorm(num_groups=noc // 4, num_channels=noc)
        self.dropout = dropout if dropout is None else nn.Dropout(dropout)
        self.block2 = nn.Sequential(
            nn.GroupNorm(num_groups=noc // 4, num_channels=noc),
            nn.SiLU(),
            conv_layer(noc, noc, 3, 1, 1),
        )
        self.mismatch = False
        if nic != noc:
            self.mismatch = True
            self.conv_match = conv_layer(nic, noc, 1, 1, 0)

    def forward(self, x, time=None):
        h = self.block1(x)

        if time is not None:
            h = self.mid_norm(h)
            scale, shift = self.cond_proj(time).chunk(2, dim=-1)
            h = (h * (utils.expand(scale, h) + 1)) + utils.expand(shift, h)

        if self.dropout is not None:
            h = self.dropout(h)

        h = self.block2(h)

        if self.mismatch:
            x = self.conv_match(x)

        return x + h


class TimeCondAttention(nn.Module):
    def __init__(
        self,
        dim,
        dim_context=None,
        heads=4,
        dim_head=32,
        norm=False,
        norm_context=False,
        time_cond_dim=None,
        attn_bias_dim=None,
        rotary_embedding_module=None,
    ):
        super().__init__()
        hidden_dim = dim_head * heads
        dim_context = default(dim_context, dim)

        self.time_cond = None

        if exists(time_cond_dim):
            self.time_cond = nn.Sequential(nn.SiLU(), nn.Linear(time_cond_dim, dim * 2))

            nn.init.zeros_(self.time_cond[-1].weight)
            nn.init.zeros_(self.time_cond[-1].bias)

        self.scale = dim_head**-0.5
        self.heads = heads

        self.norm = LayerNorm(dim) if norm else nn.Identity()
        self.norm_context = LayerNorm(dim_context) if norm_context else nn.Identity()

        self.attn_bias_proj = None
        if attn_bias_dim is not None:
            self.attn_bias_proj = nn.Sequential(
                Rearrange("b a i j -> b i j a"),
                nn.Linear(attn_bias_dim, heads),
                Rearrange("b i j a -> b a i j"),
            )

        self.to_q = nn.Linear(dim, hidden_dim, bias=False)
        self.to_kv = nn.Linear(dim_context, hidden_dim * 2, bias=False)
        self.to_out = nn.Linear(hidden_dim, dim, bias=False)
        nn.init.zeros_(self.to_out.weight)

        self.use_rope = False
        if rotary_embedding_module is not None:
            self.use_rope = True
            self.rope = rotary_embedding_module

    def forward(self, x, context=None, time=None, attn_bias=None, seq_mask=None):
        # attn_bias is b, c, i, j
        h = self.heads
        has_context = exists(context)

        context = default(context, x)

        if x.shape[-1] != self.norm.gamma.shape[-1]:
            print(context.shape, x.shape, self.norm.gamma.shape)

        x = self.norm(x)

        if exists(time):
            scale, shift = self.time_cond(time).chunk(2, dim=-1)
            x = (x * (scale + 1)) + shift

        if has_context:
            context = self.norm_context(context)

        if seq_mask is not None:
            x = x * seq_mask[..., None]

        qkv = (self.to_q(x), *self.to_kv(context).chunk(2, dim=-1))
        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), qkv)

        q = q * self.scale

        if self.use_rope:
            q = self.rope.rotate_queries_or_keys(q)
            k = self.rope.rotate_queries_or_keys(k)

        sim = torch.einsum("b h i d, b h j d -> b h i j", q, k)
        if attn_bias is not None:
            if self.attn_bias_proj is not None:
                attn_bias = self.attn_bias_proj(attn_bias)
            sim += attn_bias
        if seq_mask is not None:
            attn_mask = torch.einsum("b i, b j -> b i j", seq_mask, seq_mask)[:, None]
            sim -= (1 - attn_mask) * 1e6
        attn = sim.softmax(dim=-1)

        out = torch.einsum("b h i j, b h j d -> b h i d", attn, v)
        out = rearrange(out, "b h n d -> b n (h d)")
        out = self.to_out(out)
        if seq_mask is not None:
            out = out * seq_mask[..., None]
        return out


class TimeCondFeedForward(nn.Module):
    def __init__(self, dim, mult=4, dim_out=None, time_cond_dim=None, dropout=0.1):
        super().__init__()
        if dim_out is None:
            dim_out = dim
        self.norm = LayerNorm(dim)

        self.time_cond = None
        self.dropout = None
        inner_dim = int(dim * mult)

        if exists(time_cond_dim):
            self.time_cond = nn.Sequential(
                nn.SiLU(),
                nn.Linear(time_cond_dim, inner_dim * 2),
            )

            nn.init.zeros_(self.time_cond[-1].weight)
            nn.init.zeros_(self.time_cond[-1].bias)

        self.linear_in = nn.Linear(dim, inner_dim)
        self.nonlinearity = nn.SiLU()
        if dropout is not None:
            self.dropout = nn.Dropout(dropout)
        self.linear_out = nn.Linear(inner_dim, dim_out)
        nn.init.zeros_(self.linear_out.weight)
        nn.init.zeros_(self.linear_out.bias)

    def forward(self, x, time=None):
        x = self.norm(x)
        x = self.linear_in(x)
        x = self.nonlinearity(x)

        if exists(time):
            scale, shift = self.time_cond(time).chunk(2, dim=-1)
            x = (x * (scale + 1)) + shift

        if exists(self.dropout):
            x = self.dropout(x)

        return self.linear_out(x)


class TimeCondTransformer(nn.Module):
    def __init__(
        self,
        dim,
        depth,
        heads,
        dim_head,
        time_cond_dim,
        attn_bias_dim=None,
        mlp_inner_dim_mult=4,
        position_embedding_type: str = "rotary",
    ):
        super().__init__()

        self.rope = None
        self.pos_emb_type = position_embedding_type
        if position_embedding_type == "rotary":
            self.rope = RotaryEmbedding(dim=32)
        elif position_embedding_type == "relative":
            self.relpos = nn.Sequential(
                RelativePositionalEncoding(attn_dim=heads),
                Rearrange("b i j d -> b d i j"),
            )

        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(
                nn.ModuleList(
                    [
                        TimeCondAttention(
                            dim,
                            heads=heads,
                            dim_head=dim_head,
                            norm=True,
                            time_cond_dim=time_cond_dim,
                            attn_bias_dim=attn_bias_dim,
                            rotary_embedding_module=self.rope,
                        ),
                        TimeCondFeedForward(
                            dim, mlp_inner_dim_mult, time_cond_dim=time_cond_dim
                        ),
                    ]
                )
            )

    def forward(
        self,
        x,
        time=None,
        attn_bias=None,
        context=None,
        seq_mask=None,
        residue_index=None,
    ):
        if self.pos_emb_type == "absolute":
            pos_emb = posemb_sincos_1d(x)
            x = x + pos_emb
        elif self.pos_emb_type == "absolute_residx":
            assert residue_index is not None
            pos_emb = posemb_sincos_1d(x, residue_index=residue_index)
            x = x + pos_emb
        elif self.pos_emb_type == "relative":
            assert residue_index is not None
            pos_emb = self.relpos(residue_index)
            attn_bias = pos_emb if attn_bias is None else attn_bias + pos_emb
        if seq_mask is not None:
            x = x * seq_mask[..., None]

        for i, (attn, ff) in enumerate(self.layers):
            x = x + attn(
                x, context=context, time=time, attn_bias=attn_bias, seq_mask=seq_mask
            )
            x = x + ff(x, time=time)
            if seq_mask is not None:
                x = x * seq_mask[..., None]

        return x


class TimeCondUViT(nn.Module):
    def __init__(
        self,
        *,
        seq_len: int,
        dim: int,
        patch_size: int = 1,
        depth: int = 6,
        heads: int = 8,
        dim_head: int = 32,
        n_filt_per_layer: List[int] = [],
        n_blocks_per_layer: int = 2,
        n_atoms: int = 37,
        channels_per_atom: int = 6,
        attn_bias_dim: int = None,
        time_cond_dim: int = None,
        conv_skip_connection: bool = False,
        position_embedding_type: str = "rotary",
    ):
        super().__init__()

        # Initialize configuration params
        if time_cond_dim is None:
            time_cond_dim = dim * 4
        self.position_embedding_type = position_embedding_type
        channels = channels_per_atom
        self.n_conv_layers = n_conv_layers = len(n_filt_per_layer)
        if n_conv_layers > 0:
            post_conv_filt = n_filt_per_layer[-1]
        self.conv_skip_connection = conv_skip_connection and n_conv_layers == 1
        transformer_seq_len = seq_len // (2**n_conv_layers)
        assert transformer_seq_len % patch_size == 0
        num_patches = transformer_seq_len // patch_size
        dim_a = post_conv_atom_dim = max(1, n_atoms // (2 ** (n_conv_layers - 1)))
        if n_conv_layers == 0:
            patch_dim = patch_size * n_atoms * channels_per_atom
            patch_dim_out = patch_size * n_atoms * 3
            dim_a = n_atoms
        elif conv_skip_connection and n_conv_layers == 1:
            patch_dim = patch_size * (channels + post_conv_filt) * post_conv_atom_dim
            patch_dim_out = patch_size * post_conv_filt * post_conv_atom_dim
        elif n_conv_layers > 0:
            patch_dim = patch_dim_out = patch_size * post_conv_filt * post_conv_atom_dim

        # Make downsampling conv
        # Downsamples n-1 times where n is n_conv_layers
        down_conv = []
        block_in = channels
        for i, nf in enumerate(n_filt_per_layer):
            block_out = nf
            layer = []
            for j in range(n_blocks_per_layer):
                n_groups = 2 if i == 0 and j == 0 else 4
                layer.append(
                    TimeCondResnetBlock(
                        block_in, block_out, time_cond_dim, n_norm_in_groups=n_groups
                    )
                )
                block_in = block_out
            down_conv.append(nn.ModuleList(layer))
        self.down_conv = nn.ModuleList(down_conv)

        # Make transformer
        self.to_patch_embedding = nn.Sequential(
            Rearrange("b c (n p) a -> b n (p c a)", p=patch_size),
            nn.Linear(patch_dim, dim),
            LayerNorm(dim),
        )
        self.transformer = TimeCondTransformer(
            dim,
            depth,
            heads,
            dim_head,
            time_cond_dim,
            attn_bias_dim=attn_bias_dim,
            position_embedding_type=position_embedding_type,
        )
        self.from_patch = nn.Sequential(
            LayerNorm(dim),
            nn.Linear(dim, patch_dim_out),
            Rearrange("b n (p c a) -> b c (n p) a", p=patch_size, a=dim_a),
        )
        nn.init.zeros_(self.from_patch[-2].weight)
        nn.init.zeros_(self.from_patch[-2].bias)

        # Make upsampling conv
        up_conv = []
        for i, nf in enumerate(reversed(n_filt_per_layer)):
            skip_in = nf
            block_out = nf
            layer = []
            for j in range(n_blocks_per_layer):
                layer.append(
                    TimeCondResnetBlock(block_in + skip_in, block_out, time_cond_dim)
                )
                block_in = block_out
            up_conv.append(nn.ModuleList(layer))
        self.up_conv = nn.ModuleList(up_conv)

        # Conv out
        if n_conv_layers > 0:
            self.conv_out = nn.Sequential(
                nn.GroupNorm(num_groups=block_out // 4, num_channels=block_out),
                nn.SiLU(),
                nn.Conv2d(block_out, channels // 2, 3, 1, 1),
            )

    def forward(
        self, coords, time_cond, pair_bias=None, seq_mask=None, residue_index=None
    ):
        if self.n_conv_layers > 0:  # pad up to even dims
            coords = F.pad(coords, (0, 0, 0, 0, 0, 1, 0, 0))

        x = rearr_coords = rearrange(coords, "b n a c -> b c n a")
        hiddens = []
        for i, layer in enumerate(self.down_conv):
            for block in layer:
                x = block(x, time=time_cond)
                hiddens.append(x)
            if i != self.n_conv_layers - 1:
                x = downsample(x)

        if self.conv_skip_connection:
            x = torch.cat([x, rearr_coords], 1)

        x = self.to_patch_embedding(x)
        # if self.position_embedding_type == 'absolute':
        #     pos_emb = posemb_sincos_1d(x)
        #     x = x + pos_emb
        if seq_mask is not None and x.shape[1] == seq_mask.shape[1]:
            x *= seq_mask[..., None]
        x = self.transformer(
            x,
            time=time_cond,
            attn_bias=pair_bias,
            seq_mask=seq_mask,
            residue_index=residue_index,
        )
        x = self.from_patch(x)

        for i, layer in enumerate(self.up_conv):
            for block in layer:
                x = torch.cat([x, hiddens.pop()], 1)
                x = block(x, time=time_cond)
            if i != self.n_conv_layers - 1:
                x = upsample_coords(x, hiddens[-1].shape[2:])

        if self.n_conv_layers > 0:
            x = self.conv_out(x)
            x = x[..., :-1, :]  # drop even-dims padding

        x = rearrange(x, "b c n a -> b n a c")
        return x


########################################


class LinearWarmupCosineDecay(torch.optim.lr_scheduler._LRScheduler):
    def __init__(
        self,
        optimizer,
        max_lr,
        warmup_steps=1000,
        decay_steps=int(1e6),
        min_lr=1e-6,
        **kwargs,
    ):
        self.max_lr = max_lr
        self.min_lr = min_lr
        self.warmup_steps = warmup_steps
        self.decay_steps = decay_steps
        self.total_steps = warmup_steps + decay_steps
        super(LinearWarmupCosineDecay, self).__init__(optimizer, **kwargs)

    def get_lr(self):
        # TODO double check for off-by-one errors
        if self.last_epoch < self.warmup_steps:
            curr_lr = self.last_epoch / self.warmup_steps * self.max_lr
            return [curr_lr for group in self.optimizer.param_groups]
        elif self.last_epoch < self.total_steps:
            time = (self.last_epoch - self.warmup_steps) / self.decay_steps * np.pi
            curr_lr = self.min_lr + (self.max_lr - self.min_lr) * 0.5 * (
                1 + np.cos(time)
            )
            return [curr_lr for group in self.optimizer.param_groups]
        else:
            return [self.min_lr for group in self.optimizer.param_groups]

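# Editor's note (illustrative, not part of the original code): for
# LinearWarmupCosineDecay above with max_lr=1e-3, warmup_steps=1000, and
# decay_steps=1e6, step 500 yields lr = 500/1000 * 1e-3 = 5e-4; halfway
# through the decay the cosine term is 0, giving lr = min_lr +
# (max_lr - min_lr) / 2; after warmup + decay steps the schedule holds at
# min_lr.
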
class NoiseConditionalProteinMPNN(nn.Module):
    def __init__(
        self,
        n_channel=128,
        n_layers=3,
        n_neighbors=32,
        time_cond_dim=None,
        vocab_size=21,
        input_S_is_embeddings=False,
    ):
        super().__init__()
        self.n_channel = n_channel
        self.n_layers = n_layers
        self.n_neighbors = n_neighbors
        self.time_cond_dim = time_cond_dim
        self.vocab_size = vocab_size
        self.bb_idxs_if_atom37 = [
            residue_constants.atom_order[a] for a in ["N", "CA", "C", "O"]
        ]

        self.mpnn = protein_mpnn.ProteinMPNN(
            num_letters=vocab_size,
            node_features=n_channel,
            edge_features=n_channel,
            hidden_dim=n_channel,
            num_encoder_layers=n_layers,
            num_decoder_layers=n_layers,
            vocab=vocab_size,
            k_neighbors=n_neighbors,
            augment_eps=0.0,
            dropout=0.1,
            ca_only=False,
            time_cond_dim=time_cond_dim,
            input_S_is_embeddings=input_S_is_embeddings,
        )

    def forward(
        self, denoised_coords, noisy_aatype, seq_mask, residue_index, time_cond
    ):
        if denoised_coords.shape[-2] == 37:
            denoised_coords = denoised_coords[:, :, self.bb_idxs_if_atom37]

        node_embs, encoder_embs = self.mpnn(
            X=denoised_coords,
            S=noisy_aatype,
            mask=seq_mask,
            chain_M=seq_mask,
            residue_idx=residue_index,
            chain_encoding_all=seq_mask,
            randn=None,
            use_input_decoding_order=False,
            decoding_order=None,
            causal_mask=False,
            time_cond=time_cond,
            return_node_embs=True,
        )
        return node_embs, encoder_embs
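Editor's sketch (not part of this commit): a quick shape check for TimeCondUViT with no conv layers, assuming the repo's dependencies and its core package are importable from the repo root; all names and values below are illustrative.

import torch
from modules import NoiseConditioningBlock, TimeCondUViT

net = TimeCondUViT(seq_len=64, dim=128, depth=2, heads=4, dim_head=32, n_atoms=37)
noise_cond = NoiseConditioningBlock(128, 128 * 4)  # time_cond_dim defaults to dim * 4

coords = torch.randn(1, 64, 37, 6)            # (batch, length, atoms, channels_per_atom)
time_cond = noise_cond(torch.tensor([10.0]))  # (1, 1, 512) noise-level embedding
denoised = net(coords, time_cond)
print(denoised.shape)                         # torch.Size([1, 64, 37, 3])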
output_helpers.py ADDED
The diff for this file is too large to render. See raw diff
 
package.txt ADDED
@@ -0,0 +1 @@
dssp
protpardelle_pymol.py ADDED
@@ -0,0 +1,159 @@
from pymol import cmd

import os
import json
import time
import threading

try:
    from gradio_client import Client
except ImportError:
    print("gradio_client not installed, trying install:")
    import pip
    pip.main(['install', 'gradio_client'])
    from gradio_client import Client


if os.environ.get("GRADIO_LOCAL") is not None:
    public_link = "http://127.0.0.1:7862"
else:
    public_link = "spacesplaceholder"


def thread_protpardelle(input_pdb,
                        resample_idxs,
                        modeltype,
                        mode,
                        minlen=50,
                        maxlen=60,
                        steplen=2,
                        per_len=2):
    client = Client(public_link)

    job = client.submit(
        input_pdb,  # str in 'PDB Content' Textbox component
        modeltype,  # str in 'Choose a Mode' Radio component
        f'"{resample_idxs}"',  # str in 'Resampled Idxs' Textbox component
        mode,  # str (Option from: ['backbone', 'allatom'])
        minlen,  # int | float (numeric value between 2 and 200) in 'minlen' Slider component
        maxlen,  # int | float (numeric value between 3 and 200) in 'maxlen' Slider component
        steplen,  # int | float (numeric value between 1 and 50) in 'steplen' Slider component
        per_len,  # int | float (numeric value between 1 and 200) in 'perlen' Slider component
        api_name="/protpardelle"
    )
    # start time
    start = time.time()

    while not job.done():
        status = job.status()
        elapsed = time.time() - start
        # format as hh:mm:ss
        elapsed = time.strftime("%H:%M:%S", time.gmtime(elapsed))

        print(f"\r protpardelle running since {elapsed}", end="")
        time.sleep(1)
    results = job.result()

    # load each result into pymol
    results = json.loads(results)

    for (name, pdb_content) in results:
        print(name)
        cmd.read_pdbstr(pdb_content, os.path.basename(name))


def query_protpardelle(
    name_of_input: str,
    selection_resample_idxs: str = "",
    per_len: int = 2,
    mode: str = "allatom",
):
    """
    AUTHOR
    Simon Duerr
    https://twitter.com/simonduerr
    DESCRIPTION
    Run Protpardelle
    USAGE
    protpardelle name_of_input, selection_resampled_idx, modeltype, mode, per_len
    PARAMETERS
    name_of_input = string: name of input object
    selection_resampled_idx = string: selection of resampled protein residues
    per_len = int: per_len (default: 2)
    mode = string: mode (default: 'allatom')
    """
    if name_of_input != "":
        input_pdb = cmd.get_pdbstr(name_of_input)

        all_aa = cmd.index(name_of_input + " and name CA")
        idx = cmd.index(selection_resample_idxs + " and name CA")

        # map to zero-indexed values
        aa_mapping = {aa[1]: i for i, aa in enumerate(all_aa)}

        idx = ",".join([str(aa_mapping[aa[1]]) for aa in idx])

        print("resampling", idx, "(zero indexed) from", name_of_input)

        t = threading.Thread(target=thread_protpardelle,
                             args=(input_pdb, idx, "conditional", mode),
                             kwargs={'per_len': per_len},
                             daemon=True)
        t.start()


def query_protpardelle_uncond(
    minlen: int = 50,
    maxlen: int = 60,
    steplen: int = 2,
    per_len: int = 2,
    mode: str = "allatom",
):
    """
    AUTHOR
    Simon Duerr
    https://twitter.com/simonduerr
    DESCRIPTION
    Run Protpardelle
    USAGE
    protpardelle_uncond minlen, maxlen, steplen, per_len, mode
    PARAMETERS
    minlen = int: minlen
    maxlen = int: maxlen
    steplen = int: steplen
    per_len = int: per_len
    mode = string: mode (default: 'allatom')
    """
    modeltype = "unconditional"
    idx = None
    input_pdb = None

    t = threading.Thread(target=thread_protpardelle,
                         args=(input_pdb, idx, modeltype, mode),
                         kwargs={'minlen': minlen, 'maxlen': maxlen,
                                 'steplen': steplen, 'per_len': per_len},
                         daemon=True)
    t.start()


def setprotpardellelink(link: str):
    global public_link
    try:
        client = Client(link)
    except Exception:
        print("could not connect to:", link)

    public_link = link


cmd.extend("protpardelle_setlink", setprotpardellelink)

cmd.extend("protpardelle", query_protpardelle)

cmd.extend("protpardelle_uncond", query_protpardelle_uncond)
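Editor's usage sketch (not part of this commit): once this script is loaded in PyMOL, the registered commands can be invoked as below; the link, object, and selection are illustrative.

# In the PyMOL command line:
run protpardelle_pymol.py
protpardelle_setlink http://127.0.0.1:7862
fetch 1ubq
protpardelle 1ubq, 1ubq and resi 10-20, per_len=2, mode=allatom
protpardelle_uncond minlen=50, maxlen=60, steplen=2, per_len=2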
requirements.txt ADDED
@@ -0,0 +1,14 @@
torch==1.12.1+cu116
transformers==4.29.1
einops
tqdm
wandb
rotary-embedding-torch
biopython
scipy
dm-tree
matplotlib
seaborn
black
ipython
--extra-index-url https://download.pytorch.org/whl/cu116
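These pins are installed in the usual way (`pip install -r requirements.txt`); the trailing `--extra-index-url` line lets pip resolve the CUDA 11.6 build of the pinned torch wheel.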
sampling.py ADDED
@@ -0,0 +1,213 @@
"""
https://github.com/ProteinDesignLab/protpardelle
License: MIT
Author: Alex Chu

Configs and convenience functions for wrapping the model sample() function.
"""
import argparse
import time
from typing import Optional, Tuple

import torch
from torchtyping import TensorType

from core import residue_constants
from core import utils
import diffusion


def default_backbone_sampling_config():
    config = argparse.Namespace(
        n_steps=500,
        s_churn=200,
        step_scale=1.2,
        sidechain_mode=False,
        noise_schedule=lambda t: diffusion.noise_schedule(t, s_max=80, s_min=0.001),
    )
    return config


def default_allatom_sampling_config():
    noise_schedule = lambda t: diffusion.noise_schedule(t, s_max=80, s_min=0.001)
    stage2 = argparse.Namespace(
        apply_cond_proportion=1.0,
        n_steps=200,
        s_churn=100,
        step_scale=1.2,
        sidechain_mode=True,
        skip_mpnn_proportion=1.0,
        noise_schedule=noise_schedule,
    )
    config = argparse.Namespace(
        n_steps=500,
        s_churn=200,
        step_scale=1.2,
        sidechain_mode=True,
        skip_mpnn_proportion=0.6,
        use_fullmpnn=False,
        use_fullmpnn_for_final=True,
        anneal_seq_resampling_rate="linear",
        noise_schedule=noise_schedule,
        stage_2=stage2,
    )
    return config

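# Editor's note (illustrative, not part of the original code): the all-atom
# config above encodes the two-stage routine implemented by
# draw_allatom_samples below. Stage 1 runs 500 steps, skipping the mini-MPNN
# sequence head for the first 60% of steps (skip_mpnn_proportion=0.6) and
# designing the final sequence with full ProteinMPNN; stage 2 reruns 200
# steps with skip_mpnn_proportion=1.0 and apply_cond_proportion=1.0,
# conditioning on the stage-1 backbone and sequence to refine sidechain
# placement.
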
def draw_backbone_samples(
    model: torch.nn.Module,
    seq_mask: TensorType["b n", float] = None,
    n_samples: int = None,
    sample_length_range: Tuple[int] = (50, 512),
    pdb_save_path: Optional[str] = None,
    return_aux: bool = False,
    return_sampling_runtime: bool = False,
    **sampling_kwargs,
):
    device = model.device
    if seq_mask is None:
        assert n_samples is not None
        seq_mask = model.make_seq_mask_for_sampling(
            n_samples=n_samples,
            min_len=sample_length_range[0],
            max_len=sample_length_range[1],
        )

    start = time.time()
    aux = model.sample(
        seq_mask=seq_mask, return_last=False, return_aux=True, **sampling_kwargs
    )
    aux["runtime"] = time.time() - start
    seq_lens = seq_mask.sum(-1).long()
    cropped_samp_coords = [
        s[: seq_lens[i], model.bb_idxs] for i, s in enumerate(aux["xt_traj"][-1])
    ]

    if pdb_save_path is not None:
        gly_aatype = (seq_mask * residue_constants.restype_order["G"]).long()
        trimmed_aatype = [a[: seq_lens[i]] for i, a in enumerate(gly_aatype)]
        atom_mask = utils.atom37_mask_from_aatype(gly_aatype, seq_mask).cpu()
        for i in range(len(cropped_samp_coords)):
            utils.write_coords_to_pdb(
                cropped_samp_coords[i],
                f"{pdb_save_path}{i}.pdb",
                batched=False,
                aatype=trimmed_aatype[i],
                atom_mask=atom_mask[i],
            )

    if return_aux:
        return aux
    else:
        if return_sampling_runtime:
            return cropped_samp_coords, seq_mask, aux["runtime"]
        else:
            return cropped_samp_coords, seq_mask


def draw_allatom_samples(
    model: torch.nn.Module,
    seq_mask: TensorType["b n", float] = None,
    n_samples: int = None,
    sample_length_range: Tuple[int] = (50, 512),
    two_stage_sampling: bool = True,
    pdb_save_path: Optional[str] = None,
    return_aux: bool = False,
    return_sampling_runtime: bool = False,
    **sampling_kwargs,
):
    """Implement the default 2-stage all-atom sampling routine."""

    def save_allatom_samples(aux, path):
        seq_lens = aux["seq_mask"].sum(-1).long()
        cropped_samp_coords = [
            c[: seq_lens[i]] for i, c in enumerate(aux["xt_traj"][-1])
        ]
        cropped_samp_aatypes = [
            s[: seq_lens[i]] for i, s in enumerate(aux["st_traj"][-1])
        ]
        samp_atom_mask = utils.atom37_mask_from_aatype(
            aux["st_traj"][-1].to(device), seq_mask
        )
        samp_atom_mask = [m[: seq_lens[i]] for i, m in enumerate(samp_atom_mask)]
        for i, c in enumerate(cropped_samp_coords):
            utils.write_coords_to_pdb(
                c,
                f"{path}{i}.pdb",
                batched=False,
                aatype=cropped_samp_aatypes[i],
                atom_mask=samp_atom_mask[i],
                conect=True,
            )

    device = model.device
    if seq_mask is None:
        assert n_samples is not None
        seq_mask = model.make_seq_mask_for_sampling(
            n_samples=n_samples,
            min_len=sample_length_range[0],
            max_len=sample_length_range[1],
        )
    sampling_runtime = 0.0

    # Stage 1 sampling
    start = time.time()
    if "stage_2" in sampling_kwargs:
        stage_2_kwargs = vars(sampling_kwargs.pop("stage_2"))
    aux = model.sample(
        seq_mask=seq_mask,
        return_last=False,
        return_aux=True,
        **sampling_kwargs,
    )
    sampling_runtime = time.time() - start
    if pdb_save_path is not None and two_stage_sampling:
        save_allatom_samples(aux, pdb_save_path + "_init")

    # Stage 2 sampling (sidechain refinement only)
    if two_stage_sampling:
        samp_seq = aux["st_traj"][-1]
        samp_coords = aux["xt_traj"][-1]
        cond_atom_mask = utils.atom37_mask_from_aatype((seq_mask * 7).long(), seq_mask)
        aux = {f"stage1_{k}": v for k, v in aux.items()}
        start = time.time()
        stage2_aux = model.sample(
            gt_cond_atom_mask=cond_atom_mask.to(device),  # condition on backbone
            gt_cond_seq_mask=seq_mask.to(device),
            gt_coords=samp_coords.to(device),
            gt_aatype=samp_seq.to(device),
            seq_mask=seq_mask,
            return_last=False,
            return_aux=True,
            **stage_2_kwargs,
        )
        sampling_runtime += time.time() - start
        aux = {**aux, **stage2_aux}
    if pdb_save_path is not None:
        save_allatom_samples(aux, pdb_save_path + "_samp")
    aux["runtime"] = sampling_runtime

    # Process outputs, crop to correct length
    if return_aux:
        return aux
    else:
        xt_traj = aux["xt_traj"]
        st_traj = aux["st_traj"]
        seq_mask = aux["seq_mask"]
        seq_lens = seq_mask.sum(-1).long()
        cropped_samp_coords = [c[: seq_lens[i]] for i, c in enumerate(xt_traj[-1])]
        cropped_samp_aatypes = [s[: seq_lens[i]] for i, s in enumerate(st_traj[-1])]
        samp_atom_mask = utils.atom37_mask_from_aatype(st_traj[-1].to(device), seq_mask)
        samp_atom_mask = [m[: seq_lens[i]] for i, m in enumerate(samp_atom_mask)]
        orig_xt_traj = aux["stage1_xt_traj"]
        stage1_coords = [c[: seq_lens[i]] for i, c in enumerate(orig_xt_traj[-1])]
        ret = (
            cropped_samp_coords,
            cropped_samp_aatypes,
            samp_atom_mask,
            stage1_coords,
            seq_mask,
        )
        if return_sampling_runtime:
            ret = ret + (sampling_runtime,)
        return ret
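Editor's usage sketch (not part of this commit): wiring a model through the all-atom helper above. The checkpoint loader is a hypothetical placeholder; everything else mirrors the functions in this file.

import sampling

model = load_protpardelle_checkpoint("path/to/weights.pt")  # hypothetical loader
cfg = sampling.default_allatom_sampling_config()
coords, aatypes, atom_masks, stage1_coords, seq_mask = sampling.draw_allatom_samples(
    model,
    n_samples=2,
    sample_length_range=(50, 100),
    pdb_save_path="samples/allatom",  # writes samples/allatom_samp{i}.pdb
    **vars(cfg),                      # includes the stage_2 Namespace handled above
)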