Upload code

Browse files

Files changed (8) hide show

configuration_prot2text.py +74 -0
conversion.py +470 -0
graphs.py +1137 -0
modeling_prot2text.py +392 -0
pdb2graph.py +171 -0
utils.py +742 -0
utils_convert.py +82 -0
utils_dataset.py +60 -0

configuration_prot2text.py ADDED Viewed

	@@ -0,0 +1,74 @@

+""" Prot2Text configuration"""
+from transformers.configuration_utils import PretrainedConfig
+from transformers import AutoConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class Prot2TextConfig(PretrainedConfig):
+    model_type = "prot2text"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    _keys_to_ignore_on_load_missing = [r"transformer"]
+    def __init__(
+        self,
+        cross_esm_graph=True,
+        decoder_start_token_id=50257,
+        early_stopping=True,
+        eos_token_id=50258,
+        bos_token_id=50257,
+        esm=True,
+        esm_model_name="facebook/esm2_t6_8M_UR50D",
+        gpt_model_name="gpt2",
+        length_penalty=2.0,
+        max_new_tokens=256,
+        no_repeat_ngram_size=3,
+        pad_token_id=50256,
+        prot2text_version="1.1",
+        rgcn=True,
+        rgc_input_dim=67,
+        rgcn_n_layers=6,
+        gpt_config=None,
+        esm_config=None,
+        **kwargs,
+    ):
+        self.cross_esm_graph = cross_esm_graph
+        self.decoder_start_token_id = decoder_start_token_id
+        self.early_stopping = early_stopping
+        self.eos_token_id = eos_token_id
+        self.esm = esm
+        self.esm_model_name = esm_model_name
+        self.gpt_model_name = gpt_model_name
+        self.length_penalty = length_penalty
+        self.max_new_tokens = max_new_tokens
+        self.no_repeat_ngram_size = no_repeat_ngram_size
+        self.pad_token_id = pad_token_id
+        self.prot2text_version = prot2text_version
+        self.rgcn = rgcn
+        self.rgc_input_dim = rgc_input_dim
+        self.rgcn_n_layers = rgcn_n_layers
+        if gpt_config is None:
+            self.gpt_config = AutoConfig.from_pretrained(gpt_model_name,
+                                        _name_or_path= gpt_model_name,
+                                        is_encoder_decoder=True,
+                                        use_cache=False,
+                                        add_cross_attention=True,
+                                        bos_token_id=bos_token_id,
+                                        decoder_start_token_id=decoder_start_token_id,
+                                        eos_token_id=eos_token_id,
+                                        max_new_tokens=max_new_tokens,
+                                        pad_token_id=50256,
+                                        vocab_size=50259,
+                                        num_beams=1,
+                                        max_length=256,
+                                        min_length=1).to_dict()
+        else:
+            self.gpt_config = gpt_config
+        if esm_config is None:
+            self.esm_config = AutoConfig.from_pretrained(esm_model_name).to_dict()
+        self.esm_config = esm_config
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

conversion.py ADDED Viewed

	@@ -0,0 +1,470 @@

+"""Utilities for converting Graphein Networks to Geometric Deep Learning formats.
+"""
+# %%
+# Graphein
+# Author: Kexin Huang, Arian Jamasb <arian@jamasb.io>
+# License: MIT
+# Project Website: https://github.com/a-r-j/graphein
+# Code Repository: https://github.com/a-r-j/graphein
+from __future__ import annotations
+from typing import List, Optional
+import networkx as nx
+import numpy as np
+import torch
+from graphein.utils.dependencies import import_message
+try:
+    import torch_geometric
+    from torch_geometric.data import Data
+except ImportError:
+    import_message(
+        submodule="graphein.ml.conversion",
+        package="torch_geometric",
+        pip_install=True,
+        conda_channel="rusty1s",
+    )
+try:
+    import dgl
+except ImportError:
+    import_message(
+        submodule="graphein.ml.conversion",
+        package="dgl",
+        pip_install=True,
+        conda_channel="dglteam",
+    )
+try:
+    import jax.numpy as jnp
+except ImportError:
+    import_message(
+        submodule="graphein.ml.conversion",
+        package="jax",
+        pip_install=True,
+        conda_channel="conda-forge",
+    )
+try:
+    import jraph
+except ImportError:
+    import_message(
+        submodule="graphein.ml.conversion",
+        package="jraph",
+        pip_install=True,
+        conda_channel="conda-forge",
+    )
+# Identifiers accepted for ``src_format`` / ``dst_format`` of GraphFormatConvertor.
+# They are spliced into ``convert_<src>_to_nx`` / ``convert_nx_to_<dst>`` method
+# names, so the lowercase spelling (e.g. "jraph") is the one that matters.
+SUPPORTED_FORMATS = ["nx", "pyg", "dgl", "jraph"]
+"""Supported conversion formats.
+``"nx"``: NetworkX graph
+``"pyg"``: PyTorch Geometric Data object
+``"dgl"``: DGL graph
+``"Jraph"``: Jraph GraphsTuple
+"""
+# Named presets that GraphFormatConvertor expands into a list of feature columns
+# when no explicit ``columns`` list is given.
+SUPPORTED_VERBOSITY = ["gnn", "default", "all_info"]
+"""Supported verbosity levels for preserving graph features in conversion."""
+class GraphFormatConvertor:
+    """
+    Provides conversion utilities between NetworkX Graphs and geometric deep learning library destination formats.
+    Currently, we provide support for converstion from ``nx.Graph`` to ``dgl.DGLGraph`` and ``pytorch_geometric.Data``. Supported conversion
+    formats can be retrieved from :const:`~graphein.ml.conversion.SUPPORTED_FORMATS`.
+    :param src_format: The type of graph you'd like to convert from. Supported formats are available in :const:`~graphein.ml.conversion.SUPPORTED_FORMATS`
+    :type src_format: Literal["nx", "pyg", "dgl", "jraph"]
+    :param dst_format: The type of graph format you'd like to convert to. Supported formats are available in:
+        ``graphein.ml.conversion.SUPPORTED_FORMATS``
+    :type dst_format:  Literal["nx", "pyg", "dgl", "jraph"]
+    :param verbose: Select from ``"gnn"``, ``"default"``, ``"all_info"`` to determine how much information is preserved (features)
+        as some are unsupported by various downstream frameworks
+    :type verbose: graphein.ml.conversion.SUPPORTED_VERBOSITY
+    :param columns: List of columns in the node features to retain
+    :type columns: List[str], optional
+    """
+    def __init__(
+        self,
+        src_format: str,
+        dst_format: str,
+        verbose: SUPPORTED_VERBOSITY = "gnn",
+        columns: Optional[List[str]] = None,
+    ):
+        if (src_format not in SUPPORTED_FORMATS) or (
+            dst_format not in SUPPORTED_FORMATS
+        ):
+            raise ValueError(
+                "Please specify from supported format, "
+                + "/".join(SUPPORTED_FORMATS)
+            )
+        self.src_format = src_format
+        self.dst_format = dst_format
+        # supported_verbose_format = ["gnn", "default", "all_info"]
+        if (columns is None) and (verbose not in SUPPORTED_VERBOSITY):
+            raise ValueError(
+                "Please specify the supported verbose mode ("
+                + "/".join(SUPPORTED_VERBOSITY)
+                + ") or specify column names!"
+            )
+        if columns is None:
+            if verbose == "gnn":
+                columns = [
+                    "edge_index",
+                    "coords",
+                    "dist_mat",
+                    "name",
+                    "node_id",
+                ]
+            elif verbose == "default":
+                columns = [
+                    "b_factor",
+                    "chain_id",
+                    "coords",
+                    "dist_mat",
+                    "edge_index",
+                    "kind",
+                    "name",
+                    "node_id",
+                    "residue_name",
+                ]
+            elif verbose == "all_info":
+                columns = [
+                    "atom_type",
+                    "b_factor",
+                    "chain_id",
+                    "chain_ids",
+                    "config",
+                    "coords",
+                    "dist_mat",
+                    "edge_index",
+                    "element_symbol",
+                    "kind",
+                    "name",
+                    "node_id",
+                    "node_type",
+                    "pdb_df",
+                    "raw_pdb_df",
+                    "residue_name",
+                    "residue_number",
+                    "rgroup_df",
+                    "sequence_A",
+                    "sequence_B",
+                ]
+        self.columns = columns
+        self.type2form = {
+            "atom_type": "str",
+            "b_factor": "float",
+            "chain_id": "str",
+            "coords": "np.array",
+            "dist_mat": "np.array",
+            "element_symbol": "str",
+            "node_id": "str",
+            "residue_name": "str",
+            "residue_number": "int",
+            "edge_index": "torch.tensor",
+            "kind": "str",
+        }
+    def convert_nx_to_dgl(self, G: nx.Graph) -> dgl.DGLGraph:
+        """
+        Converts ``NetworkX`` graph to ``DGL``
+        :param G: ``nx.Graph`` to convert to ``DGLGraph``
+        :type G: nx.Graph
+        :return: ``DGLGraph`` object version of input ``NetworkX`` graph
+        :rtype: dgl.DGLGraph
+        """
+        g = dgl.DGLGraph()
+        node_id = list(G.nodes())
+        G = nx.convert_node_labels_to_integers(G)
+        ## add node level feat
+        node_dict = {}
+        for i, (_, feat_dict) in enumerate(G.nodes(data=True)):
+            for key, value in feat_dict.items():
+                if str(key) in self.columns:
+                    node_dict[str(key)] = (
+                        [value] if i == 0 else node_dict[str(key)] + [value]
+                    )
+        string_dict = {}
+        node_dict_transformed = {}
+        for i, j in node_dict.items():
+            if i == "coords":
+                node_dict_transformed[i] = torch.Tensor(np.asarray(j)).type(
+                    "torch.FloatTensor"
+                )
+            elif i == "dist_mat":
+                node_dict_transformed[i] = torch.Tensor(
+                    np.asarray(j[0].values)
+                ).type("torch.FloatTensor")
+            elif self.type2form[i] == "str":
+                string_dict[i] = j
+            elif self.type2form[i] in ["float", "int"]:
+                node_dict_transformed[i] = torch.Tensor(np.array(j))
+        g.add_nodes(
+            len(node_id),
+            node_dict_transformed,
+        )
+        edge_dict = {}
+        edge_index = torch.LongTensor(list(G.edges)).t().contiguous()
+        # add edge level features
+        for i, (_, _, feat_dict) in enumerate(G.edges(data=True)):
+            for key, value in feat_dict.items():
+                if str(key) in self.columns:
+                    edge_dict[str(key)] = (
+                        list(value)
+                        if i == 0
+                        else edge_dict[str(key)] + list(value)
+                    )
+        edge_transform_dict = {}
+        for i, j in node_dict.items():
+            if self.type2form[i] == "str":
+                string_dict[i] = j
+            elif self.type2form[i] in ["float", "int"]:
+                edge_transform_dict[i] = torch.Tensor(np.array(j))
+        g.add_edges(edge_index[0], edge_index[1], edge_transform_dict)
+        # add graph level features
+        graph_dict = {
+            str(feat_name): [G.graph[feat_name]]
+            for feat_name in G.graph
+            if str(feat_name) in self.columns
+        }
+        return g
+    def convert_nx_to_pyg(self, G: nx.Graph) -> Data:
+        """
+        Converts ``NetworkX`` graph to ``pytorch_geometric.data.Data`` object. Requires ``PyTorch Geometric`` (https://pytorch-geometric.readthedocs.io/en/latest/) to be installed.
+        :param G: ``nx.Graph`` to convert to PyTorch Geometric ``Data`` object
+        :type G: nx.Graph
+        :return: ``Data`` object containing networkx graph data
+        :rtype: pytorch_geometric.data.Data
+        """
+        # Initialise dict used to construct Data object & Assign node ids as a feature
+        data = {"node_id": list(G.nodes())}
+        G = nx.convert_node_labels_to_integers(G)
+        # Construct Edge Index
+        edge_index = torch.LongTensor(list(G.edges)).t().contiguous()
+        # Add node features
+        for i, (_, feat_dict) in enumerate(G.nodes(data=True)):
+            for key, value in feat_dict.items():
+                if str(key) in self.columns:
+                    data[str(key)] = (
+                        [value] if i == 0 else data[str(key)] + [value]
+                    )
+        # Add edge features
+        for i, (_, _, feat_dict) in enumerate(G.edges(data=True)):
+            for key, value in feat_dict.items():
+                if str(key) in self.columns:
+                    data[str(key)] = (
+                        list(value) if i == 0 else data[str(key)] + list(value)
+                    )
+        # Add graph-level features
+        for feat_name in G.graph:
+            if str(feat_name) in self.columns:
+                data[str(feat_name)] = [G.graph[feat_name]]
+        if "edge_index" in self.columns:
+            data["edge_index"] = edge_index.view(2, -1)
+        data = Data.from_dict(data)
+        data.num_nodes = G.number_of_nodes()
+        return data
+    @staticmethod
+    def convert_nx_to_nx(G: nx.Graph) -> nx.Graph:
+        """
+        Converts NetworkX graph (``nx.Graph``) to NetworkX graph (``nx.Graph``) object. Redundant - returns itself.
+        :param G: NetworkX Graph
+        :type G: nx.Graph
+        :return: NetworkX Graph
+        :rtype: nx.Graph
+        """
+        return G
+    @staticmethod
+    def convert_dgl_to_nx(G: dgl.DGLGraph) -> nx.Graph:
+        """
+        Converts a DGL Graph (``dgl.DGLGraph``) to a NetworkX (``nx.Graph``) object. Preserves node and edge attributes.
+        :param G: ``dgl.DGLGraph`` to convert to ``NetworkX`` graph.
+        :type G: dgl.DGLGraph
+        :return: NetworkX graph object.
+        :rtype: nx.Graph
+        """
+        node_attrs = G.node_attr_schemes().keys()
+        edge_attrs = G.edge_attr_schemes().keys()
+        return dgl.to_networkx(G, node_attrs, edge_attrs)
+    @staticmethod
+    def convert_pyg_to_nx(G: Data) -> nx.Graph:
+        """Converts PyTorch Geometric ``Data`` object to NetworkX graph (``nx.Graph``).
+        :param G: Pytorch Geometric Data.
+        :type G: torch_geometric.data.Data
+        :returns: NetworkX graph.
+        :rtype: nx.Graph
+        """
+        return torch_geometric.utils.to_networkx(G)
+    def convert_nx_to_jraph(self, G: nx.Graph) -> jraph.GraphsTuple:
+        """Converts NetworkX graph (``nx.Graph``) to Jraph GraphsTuple graph. Requires ``jax`` and ``Jraph``.
+        :param G: Networkx graph to convert.
+        :type G: nx.Graph
+        :return: Jraph GraphsTuple graph.
+        :rtype: jraph.GraphsTuple
+        """
+        G = nx.convert_node_labels_to_integers(G)
+        n_node = len(G)
+        n_edge = G.number_of_edges()
+        edge_list = list(G.edges())
+        senders, receivers = zip(*edge_list)
+        senders, receivers = jnp.array(senders), jnp.array(receivers)
+        # Add node features
+        node_features = {}
+        for i, (_, feat_dict) in enumerate(G.nodes(data=True)):
+            for key, value in feat_dict.items():
+                if str(key) in self.columns:
+                    # node_features[str(key)] = (
+                    #    [value]
+                    #    if i == 0
+                    #    else node_features[str(key)] + [value]
+                    # )
+                    feat = (
+                        [value]
+                        if i == 0
+                        else node_features[str(key)] + [value]
+                    )
+                    try:
+                        feat = torch.tensor(feat)
+                        node_features[str(key)] = feat
+                    except TypeError:
+                        node_features[str(key)] = feat
+        # Add edge features
+        edge_features = {}
+        for i, (_, _, feat_dict) in enumerate(G.edges(data=True)):
+            for key, value in feat_dict.items():
+                if str(key) in self.columns:
+                    edge_features[str(key)] = (
+                        list(value)
+                        if i == 0
+                        else edge_features[str(key)] + list(value)
+                    )
+        # Add graph features
+        global_context = {
+            str(feat_name): [G.graph[feat_name]]
+            for feat_name in G.graph
+            if str(feat_name) in self.columns
+        }
+        return jraph.GraphsTuple(
+            nodes=node_features,
+            senders=senders,
+            receivers=receivers,
+            edges=edge_features,
+            n_node=n_node,
+            n_edge=n_edge,
+            globals=global_context,
+        )
+    def __call__(self, G: nx.Graph):
+        nx_g = eval("self.convert_" + self.src_format + "_to_nx(G)")
+        dst_g = eval("self.convert_nx_to_" + self.dst_format + "(nx_g)")
+        return dst_g
+# def convert_nx_to_pyg_data(G: nx.Graph) -> Data:
+#     # Initialise dict used to construct Data object
+#     data = {"node_id": list(G.nodes())}
+#     G = nx.convert_node_labels_to_integers(G)
+#     # Construct Edge Index
+#     edge_index = torch.LongTensor(list(G.edges)).t().contiguous()
+#     # Add node features
+#     for i, (_, feat_dict) in enumerate(G.nodes(data=True)):
+#         for key, value in feat_dict.items():
+#             data[str(key)] = [value] if i == 0 else data[str(key)] + [value]
+#     # Add edge features
+#     for i, (_, _, feat_dict) in enumerate(G.edges(data=True)):
+#         for key, value in feat_dict.items():
+#             data[str(key)] = (
+#                 list(value) if i == 0 else data[str(key)] + list(value)
+#             )
+#     # Add graph-level features
+#     for feat_name in G.graph:
+#         data[str(feat_name)] = [G.graph[feat_name]]
+#     data["edge_index"] = edge_index.view(2, -1)
+#     data = Data.from_dict(data)
+#     data.num_nodes = G.number_of_nodes()
+#     return data
+def convert_nx_to_pyg_data(G: nx.Graph) -> Data:
+    # Initialise dict used to construct Data object
+    data = {"node_id": list(G.nodes())}
+    G = nx.convert_node_labels_to_integers(G)
+    # Construct Edge Index
+    edge_index = torch.LongTensor(list(G.edges)).t().contiguous()
+    # Add node features
+    for i, (_, feat_dict) in enumerate(G.nodes(data=True)):
+        for key, value in feat_dict.items():
+            data[str(key)] = [value] if i == 0 else data[str(key)] + [value]
+    # Add edge features
+    for i, (_, _, feat_dict) in enumerate(G.edges(data=True)):
+        for key, value in feat_dict.items():
+            if key == 'distance':
+                data[str(key)] = (
+                    [value] if i == 0 else data[str(key)] + [value]
+                )
+            else:
+                data[str(key)] = (
+                    [list(value)] if i == 0 else data[str(key)] + [list(value)]
+                )
+    # Add graph-level features
+    for feat_name in G.graph:
+        data[str(feat_name)] = [G.graph[feat_name]]
+    data["edge_index"] = edge_index.view(2, -1)
+    data = Data.from_dict(data)
+    data.num_nodes = G.number_of_nodes()
+    return data

graphs.py ADDED Viewed

	@@ -0,0 +1,1137 @@

+"""Functions for working with Protein Structure Graphs."""
+# %%
+# Graphein
+# Author: Arian Jamasb <arian@jamasb.io>, Eric Ma, Charlie Harris
+# License: MIT
+# Project Website: https://github.com/a-r-j/graphein
+# Code Repository: https://github.com/a-r-j/graphein
+from __future__ import annotations
+import logging
+import traceback
+from functools import partial
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import networkx as nx
+import numpy as np
+import pandas as pd
+# from Bio.PDB.Polypeptide import three_to_one
+from biopandas.pdb import PandasPdb
+from biopandas.mmcif import PandasMmcif
+from rich.progress import Progress
+from tqdm.contrib.concurrent import process_map
+from graphein.protein.config import (
+    DSSPConfig,
+    GetContactsConfig,
+    ProteinGraphConfig,
+)
+from graphein.protein.edges.distance import (
+    add_distance_to_edges,
+    compute_distmat,
+)
+from graphein.protein.resi_atoms import BACKBONE_ATOMS, RESI_THREE_TO_1
+from graphein.protein.subgraphs import extract_subgraph_from_chains
+from graphein.protein.utils import (
+    ProteinGraphConfigurationError,
+    compute_rgroup_dataframe,
+    filter_dataframe,
+    get_protein_name_from_filename,
+    three_to_one_with_mods,
+)
+from graphein.rna.constants import RNA_ATOMS
+from graphein.utils.utils import (
+    annotate_edge_metadata,
+    annotate_graph_metadata,
+    annotate_node_metadata,
+    compute_edges,
+)
+from .utils_convert import biopandas_mmcif2pdb
+# logging.basicConfig(level="DEBUG")
+log = logging.getLogger(__name__)
+def subset_structure_to_rna(
+    df: pd.DataFrame,
+) -> pd.DataFrame:
+    """
+    Return a subset of atomic dataframe that contains only certain atom names relevant for RNA structures.
+    :param df: Protein Structure dataframe to subset
+    :type df: pd.DataFrame
+    :returns: Subsetted protein structure dataframe
+    :rtype: pd.DataFrame
+    """
+    return filter_dataframe(
+        df, by_column="atom_name", list_of_values=RNA_ATOMS, boolean=True
+    )
+def read_pdb_to_dataframe(
+    pdb_path: Optional[str] = None,
+    pdb_code: Optional[str] = None,
+    uniprot_id: Optional[str] = None,
+    model_index: int = 1,
+) -> pd.DataFrame:
+    """
+    Reads PDB file to ``PandasPDB`` object.
+    Returns ``atomic_df``, which is a dataframe enumerating all atoms and their cartesian coordinates in 3D space. Also
+    contains associated metadata from the PDB file.
+    :param pdb_path: path to PDB file. Defaults to ``None``.
+    :type pdb_path: str, optional
+    :param pdb_code: 4-character PDB accession. Defaults to ``None``.
+    :type pdb_code: str, optional
+    :param uniprot_id: UniProt ID to build graph from AlphaFoldDB. Defaults to ``None``.
+    :type uniprot_id: str, optional
+    :param model_index: Index of model to read. Only relevant for structures containing ensembles. Defaults to ``1``.
+    :type model_index: int, optional
+    :param verbose: print dataframe?
+    :type verbose: bool
+    :param granularity: Specifies granularity of dataframe. See :class:`~graphein.protein.config.ProteinGraphConfig` for further
+        details.
+    :type granularity: str
+    :returns: ``pd.DataFrame`` containing protein structure
+    :rtype: pd.DataFrame
+    """
+    if pdb_code is None and pdb_path is None and uniprot_id is None:
+        raise NameError(
+            "One of pdb_code, pdb_path or uniprot_id must be specified!"
+        )
+    if pdb_path is not None:
+        if pdb_path.endswith('cif'):
+            atomic_df = PandasMmcif().read_mmcif(pdb_path)
+            atomic_df = biopandas_mmcif2pdb(atomic_df, model_index)
+        else:
+            atomic_df = PandasPdb().read_pdb(pdb_path)
+    else:
+        if uniprot_id is not None:
+            atomic_df = PandasPdb().fetch_pdb(
+                uniprot_id=uniprot_id, source="alphafold2-v2"
+            )
+        else:
+            atomic_df = PandasPdb().fetch_pdb(pdb_code)
+        atomic_df = atomic_df.get_model(model_index)
+        if len(atomic_df.df["ATOM"]) == 0:
+            raise ValueError(f"No model found for index: {model_index}")
+    return pd.concat([atomic_df.df["ATOM"], atomic_df.df["HETATM"]])
+def label_node_id(df: pd.DataFrame, granularity: str) -> pd.DataFrame:
+    df["node_id"] = (
+        df["chain_id"].apply(str)
+        + ":"
+        + df["residue_name"]
+        + ":"
+        + df["residue_number"].apply(str)
+    )
+    df["residue_id"] = df["node_id"]
+    if granularity == "atom":
+        df["node_id"] = df["node_id"] + ":" + df["atom_name"]
+    elif granularity in {"rna_atom", "rna_centroid"}:
+        df["node_id"] = (
+            df["node_id"]
+            + ":"
+            + df["atom_number"].apply(str)
+            + ":"
+            + df["atom_name"]
+        )
+    return df
+def deprotonate_structure(df: pd.DataFrame) -> pd.DataFrame:
+    """Remove protons from PDB dataframe.
+    :param df: Atomic dataframe.
+    :type df: pd.DataFrame
+    :returns: Atomic dataframe with all ``atom_name == "H"`` removed.
+    :rtype: pd.DataFrame
+    """
+    log.debug(
+        "Deprotonating protein. This removes H atoms from the pdb_df dataframe"
+    )
+    return filter_dataframe(
+        df, by_column="element_symbol", list_of_values=["H"], boolean=False
+    )
+def convert_structure_to_centroids(df: pd.DataFrame) -> pd.DataFrame:
+    """Overwrite existing ``(x, y, z)`` coordinates with centroids of the amino acids.
+    :param df: Pandas Dataframe protein structure to convert into a dataframe of centroid positions.
+    :type df: pd.DataFrame
+    :return: pd.DataFrame with atoms/residues positions converted into centroid positions.
+    :rtype: pd.DataFrame
+    """
+    log.debug(
+        "Converting dataframe to centroids. This averages XYZ coords of the atoms in a residue"
+    )
+    centroids = calculate_centroid_positions(df)
+    df = df.loc[df["atom_name"] == "CA"].reset_index(drop=True)
+    df["x_coord"] = centroids["x_coord"]
+    df["y_coord"] = centroids["y_coord"]
+    df["z_coord"] = centroids["z_coord"]
+    return df
+def subset_structure_to_atom_type(
+    df: pd.DataFrame, granularity: str
+) -> pd.DataFrame:
+    """
+    Return a subset of atomic dataframe that contains only certain atom names.
+    :param df: Protein Structure dataframe to subset.
+    :type df: pd.DataFrame
+    :returns: Subsetted protein structure dataframe.
+    :rtype: pd.DataFrame
+    """
+    return filter_dataframe(
+        df, by_column="atom_name", list_of_values=[granularity], boolean=True
+    )
+def remove_insertions(df: pd.DataFrame, keep: str = "first") -> pd.DataFrame:
+    """
+    This function removes insertions from PDB dataframes.
+    :param df: Protein Structure dataframe to remove insertions from.
+    :type df: pd.DataFrame
+    :param keep: Specifies which insertion to keep. Options are ``"first"`` or ``"last"``.
+        Default is ``"first"``
+    :type keep: str
+    :return: Protein structure dataframe with insertions removed
+    :rtype: pd.DataFrame
+    """
+    # Catches unnamed insertions
+    duplicates = df.duplicated(
+        subset=["chain_id", "residue_number", "atom_name"], keep=keep
+    )
+    df = df[~duplicates]
+    # Catches explicit insertions
+    df = filter_dataframe(
+        df, by_column="insertion", list_of_values=[""], boolean=True
+    )
+    # Remove alt_locs
+    df = filter_dataframe(
+        df, by_column="alt_loc", list_of_values=["", "A"], boolean=True
+    )
+    return df
+def filter_hetatms(
+    df: pd.DataFrame, keep_hets: List[str]
+) -> List[pd.DataFrame]:
+    """Return hetatms of interest.
+    :param df: Protein Structure dataframe to filter hetatoms from.
+    :type df: pd.DataFrame
+    :param keep_hets: List of hetero atom names to keep.
+    :returns: Protein structure dataframe with heteroatoms removed
+    :rtype: pd.DataFrame
+    """
+    return [df.loc[df["residue_name"] == hetatm] for hetatm in keep_hets]
+def process_dataframe(
+    protein_df: pd.DataFrame,
+    atom_df_processing_funcs: Optional[List[Callable]] = None,
+    hetatom_df_processing_funcs: Optional[List[Callable]] = None,
+    granularity: str = "centroids",
+    chain_selection: str = "all",
+    insertions: bool = False,
+    deprotonate: bool = True,
+    keep_hets: List[str] = [],
+    verbose: bool = False,
+) -> pd.DataFrame:
+    """
+    Process ATOM and HETATM dataframes to produce singular dataframe used for graph construction.
+    :param protein_df: Dataframe to process.
+        Should be the object returned from :func:`~graphein.protein.graphs.read_pdb_to_dataframe`.
+    :type protein_df: pd.DataFrame
+    :param atom_df_processing_funcs: List of functions to process dataframe. These must take in a dataframe and return a
+        dataframe. Defaults to None.
+    :type atom_df_processing_funcs: List[Callable], optional
+    :param hetatom_df_processing_funcs: List of functions to process the hetatom dataframe. These must take in a dataframe and return a dataframe
+    :type hetatom_df_processing_funcs: List[Callable], optional
+    :param granularity: The level of granularity for the graph. This determines the node definition.
+        Acceptable values include: ``"centroids"``, ``"atoms"``,
+        any of the atom_names in the PDB file (e.g. ``"CA"``, ``"CB"``, ``"OG"``, etc.).
+        See: :const:`~graphein.protein.config.GRAPH_ATOMS` and :const:`~graphein.protein.config.GRANULARITY_OPTS`.
+    :type granularity: str
+    :param insertions: Whether or not to keep insertions.
+    :param insertions: bool
+    :param deprotonate: Whether or not to remove hydrogen atoms (i.e. deprotonation).
+    :type deprotonate: bool
+    :param keep_hets: Hetatoms to keep. Defaults to an empty list.
+        To keep a hetatom, pass it inside a list of hetatom names to keep.
+    :type keep_hets: List[str]
+    :param verbose: Verbosity level.
+    :type verbose: bool
+    :param chain_selection: Which protein chain to select. Defaults to ``"all"``. Eg can use ``"ACF"``
+        to select 3 chains (``A``, ``C`` & ``F``)
+    :type chain_selection: str
+    :return: A protein dataframe that can be consumed by
+        other graph construction functions.
+    :rtype: pd.DataFrame
+    """
+    protein_df = label_node_id(protein_df, granularity=granularity)
+    # TODO: Need to properly define what "granularity" is supposed to do.
+    atoms = filter_dataframe(
+        protein_df,
+        by_column="record_name",
+        list_of_values=["ATOM"],
+        boolean=True,
+    )
+    hetatms = filter_dataframe(
+        protein_df,
+        by_column="record_name",
+        list_of_values=["HETATM"],
+        boolean=True,
+    )
+    # This block enables processing via a list of supplied functions operating on the atom and hetatom dataframes
+    # If these are provided, the dataframe returned will be computed only from these and the default workflow
+    # below this block will not execute.
+    if atom_df_processing_funcs is not None:
+        for func in atom_df_processing_funcs:
+            atoms = func(atoms)
+        if hetatom_df_processing_funcs is None:
+            return atoms
+    if hetatom_df_processing_funcs is not None:
+        for func in hetatom_df_processing_funcs:
+            hetatms = func(hetatms)
+        return pd.concat([atoms, hetatms])
+    if keep_hets:
+        hetatms_to_keep = filter_hetatms(hetatms, keep_hets)
+        atoms = pd.concat([atoms] + hetatms_to_keep)
+    # Deprotonate structure by removing H atoms
+    if deprotonate:
+        atoms = deprotonate_structure(atoms)
+    # Restrict DF to desired granularity
+    if granularity == "atom":
+        pass
+    elif granularity in {"centroids", "rna_centroid"}:
+        atoms = convert_structure_to_centroids(atoms)
+    elif granularity == "rna_atom":
+        atoms = subset_structure_to_rna(atoms)
+    else:
+        atoms = subset_structure_to_atom_type(atoms, granularity)
+    protein_df = atoms
+    # Remove alt_loc residues
+    if not insertions:
+        protein_df = remove_insertions(protein_df)
+    # perform chain selection
+    protein_df = select_chains(
+        protein_df, chain_selection=chain_selection, verbose=verbose
+    )
+    log.debug(f"Detected {len(protein_df)} total nodes")
+    # Sort dataframe to place HETATMs
+    protein_df = sort_dataframe(protein_df)
+    return protein_df
+def sort_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+    """Sorts a protein dataframe by chain->residue number->atom number
+    This is useful for distributing hetatms/modified residues through the DF.
+    :param df: Protein dataframe to sort.
+    :type df: pd.DataFrame
+    :return: Sorted protein dataframe.
+    :rtype: pd.DataFrame
+    """
+    return df.sort_values(by=["chain_id", "residue_number", "atom_number"])
+def assign_node_id_to_dataframe(
+    protein_df: pd.DataFrame, granularity: str
+) -> pd.DataFrame:
+    """
+    Assigns the node ID back to the ``pdb_df`` dataframe
+    :param protein_df: Structure Dataframe
+    :type protein_df: pd.DataFrame
+    :param granularity: Granularity of graph. Atom-level,
+        residue (e.g. ``CA``) or ``centroids``.
+        See: :const:`~graphein.protein.config.GRAPH_ATOMS`
+        and :const:`~graphein.protein.config.GRANULARITY_OPTS`.
+    :type granularity: str
+    :return: Returns dataframe with added ``node_ids``
+    :rtype: pd.DataFrame
+    """
+    protein_df["node_id"] = (
+        protein_df["chain_id"].apply(str)
+        + ":"
+        + protein_df["residue_name"]
+        + ":"
+        + protein_df["residue_number"].apply(str)
+    )
+    if granularity in {"atom", "rna_atom"}:
+        protein_df[
+            "node_id"
+        ] = f'{protein_df["node_id"]}:{protein_df["atom_name"]}'
+def select_chains(
+    protein_df: pd.DataFrame, chain_selection: str, verbose: bool = False
+) -> pd.DataFrame:
+    """
+    Extracts relevant chains from ``protein_df``.
+    :param protein_df: pandas dataframe of PDB subsetted to relevant atoms
+        (``CA``, ``CB``).
+    :type protein_df: pd.DataFrame
+    :param chain_selection: Specifies chains that should be extracted from
+        the larger complexed structure.
+    :type chain_selection: str
+    :param verbose: Print dataframe?
+    :type verbose: bool
+    :return: Protein structure dataframe containing only entries in the
+        chain selection.
+    :rtype: pd.DataFrame
+    """
+    if chain_selection != "all":
+        protein_df = filter_dataframe(
+            protein_df,
+            by_column="chain_id",
+            list_of_values=list(chain_selection),
+            boolean=True,
+        )
+    return protein_df
+def initialise_graph_with_metadata(
+    protein_df: pd.DataFrame,
+    raw_pdb_df: pd.DataFrame,
+    granularity: str,
+    name: Optional[str] = None,
+    pdb_code: Optional[str] = None,
+    pdb_path: Optional[str] = None,
+) -> nx.Graph:
+    """
+    Initializes the nx Graph object with initial metadata.
+    :param protein_df: Processed Dataframe of protein structure.
+    :type protein_df: pd.DataFrame
+    :param raw_pdb_df: Unprocessed dataframe of protein structure for comparison and traceability downstream.
+    :type raw_pdb_df: pd.DataFrame
+    :param granularity: Granularity of the graph (eg ``"atom"``, ``"CA"``, ``"CB"`` etc or ``"centroid"``).
+        See: :const:`~graphein.protein.config.GRAPH_ATOMS` and :const:`~graphein.protein.config.GRANULARITY_OPTS`.
+    :type granularity: str
+    :param name: specified given name for the graph. If None, the PDB code or the file name will be used to name the graph.
+    :type name: Optional[str], defaults to ``None``
+    :param pdb_code: PDB ID / Accession code, if the PDB is available on the PDB database.
+    :type pdb_code: Optional[str], defaults to ``None``
+    :param pdb_path: path to local PDB file, if constructing a graph from a local file.
+    :type pdb_path: Optional[str], defaults to ``None``
+    :return: Returns initial protein structure graph with metadata.
+    :rtype: nx.Graph
+    """
+    # Get name for graph if no name was provided
+    if name is None:
+        if pdb_path is not None:
+            name = get_protein_name_from_filename(pdb_path)
+        else:
+            name = pdb_code
+    G = nx.Graph(
+        name=name,
+        pdb_code=pdb_code,
+        pdb_path=pdb_path,
+        chain_ids=list(protein_df["chain_id"].unique()),
+        pdb_df=protein_df,
+        raw_pdb_df=raw_pdb_df,
+        rgroup_df=compute_rgroup_dataframe(remove_insertions(raw_pdb_df)),
+        coords=np.asarray(protein_df[["x_coord", "y_coord", "z_coord"]]),
+    )
+    # Create graph and assign intrinsic graph-level metadata
+    G.graph["node_type"] = granularity
+    # Add Sequences to graph metadata
+    for c in G.graph["chain_ids"]:
+        if granularity == "rna_atom":
+            sequence = protein_df.loc[protein_df["chain_id"] == c][
+                "residue_name"
+            ].str.cat()
+        else:
+            sequence = (
+                protein_df.loc[protein_df["chain_id"] == c]["residue_name"]
+                .apply(three_to_one_with_mods)
+                .str.cat()
+            )
+        G.graph[f"sequence_{c}"] = sequence
+    return G
+def add_nodes_to_graph(
+    G: nx.Graph,
+    protein_df: Optional[pd.DataFrame] = None,
+    verbose: bool = False,
+) -> nx.Graph:
+    """Add nodes into protein graph.
+    :param G: ``nx.Graph`` with metadata to populate with nodes.
+    :type G: nx.Graph
+    :protein_df: DataFrame of protein structure containing nodes & initial node metadata to add to the graph.
+    :type protein_df: pd.DataFrame, optional
+    :param verbose: Controls verbosity of this step.
+    :type verbose: bool
+    :returns: nx.Graph with nodes added.
+    :rtype: nx.Graph
+    """
+    # If no protein dataframe is supplied, use the one stored in the Graph object
+    if protein_df is None:
+        protein_df = G.graph["pdb_df"]
+    # Assign intrinsic node attributes
+    chain_id = protein_df["chain_id"].apply(str)
+    residue_name = protein_df["residue_name"]
+    residue_number = protein_df["residue_number"]  # .apply(str)
+    coords = np.asarray(protein_df[["x_coord", "y_coord", "z_coord"]])
+    b_factor = protein_df["b_factor"]
+    atom_type = protein_df["atom_name"]
+    nodes = protein_df["node_id"]
+    element_symbol = protein_df["element_symbol"]
+    G.add_nodes_from(nodes)
+    # Set intrinsic node attributes
+    nx.set_node_attributes(G, dict(zip(nodes, chain_id)), "chain_id")
+    nx.set_node_attributes(G, dict(zip(nodes, residue_name)), "residue_name")
+    nx.set_node_attributes(
+        G, dict(zip(nodes, residue_number)), "residue_number"
+    )
+    nx.set_node_attributes(G, dict(zip(nodes, atom_type)), "atom_type")
+    nx.set_node_attributes(
+        G, dict(zip(nodes, element_symbol)), "element_symbol"
+    )
+    nx.set_node_attributes(G, dict(zip(nodes, coords)), "coords")
+    nx.set_node_attributes(G, dict(zip(nodes, b_factor)), "b_factor")
+    # TODO: include charge, line_idx for traceability?
+    if verbose:
+        print(nx.info(G))
+        print(G.nodes())
+    return G
+def calculate_centroid_positions(
+    atoms: pd.DataFrame, verbose: bool = False
+) -> pd.DataFrame:
+    """
+    Calculates position of sidechain centroids.
+    :param atoms: ATOM df of protein structure.
+    :type atoms: pd.DataFrame
+    :param verbose: bool controlling verbosity.
+    :type verbose: bool
+    :return: centroids (df).
+    :rtype: pd.DataFrame
+    """
+    centroids = (
+        atoms.groupby("residue_number")
+        .mean()[["x_coord", "y_coord", "z_coord"]]
+        .reset_index()
+    )
+    if verbose:
+        print(f"Calculated {len(centroids)} centroid nodes")
+    log.debug(f"Calculated {len(centroids)} centroid nodes")
+    return centroids
+def compute_edges(
+    G: nx.Graph,
+    funcs: List[Callable],
+    get_contacts_config: Optional[GetContactsConfig] = None,
+) -> nx.Graph:
+    """
+    Computes edges for the protein structure graph. Will compute a pairwise
+    distance matrix between nodes which is
+    added to the graph metadata to facilitate some edge computations.
+    :param G: nx.Graph with nodes to add edges to.
+    :type G: nx.Graph
+    :param funcs: List of edge construction functions.
+    :type funcs: List[Callable]
+    :param get_contacts_config: Config object for ``GetContacts`` if
+        intramolecular edges are being used.
+    :type get_contacts_config: graphein.protein.config.GetContactsConfig
+    :return: Graph with added edges.
+    :rtype: nx.Graph
+    """
+    # This control flow prevents unnecessary computation of the distance matrices
+    if "config" in G.graph:
+        if G.graph["config"].granularity == "atom":
+            G.graph["atomic_dist_mat"] = compute_distmat(G.graph["pdb_df"])
+        else:
+            G.graph["dist_mat"] = compute_distmat(G.graph["pdb_df"])
+    for func in funcs:
+        func(G)
+    return add_distance_to_edges(G)
+def construct_graph(
+    config: Optional[ProteinGraphConfig] = None,
+    name: Optional[str] = None,
+    pdb_path: Optional[str] = None,
+    uniprot_id: Optional[str] = None,
+    pdb_code: Optional[str] = None,
+    chain_selection: str = "all",
+    model_index: int = 1,
+    df_processing_funcs: Optional[List[Callable]] = None,
+    edge_construction_funcs: Optional[List[Callable]] = None,
+    edge_annotation_funcs: Optional[List[Callable]] = None,
+    node_annotation_funcs: Optional[List[Callable]] = None,
+    graph_annotation_funcs: Optional[List[Callable]] = None,
+) -> nx.Graph:
+    """
+    Constructs protein structure graph from a ``pdb_code``, ``pdb_path`` or ``uniprot_id``.
+    Users can provide a :class:`~graphein.protein.config.ProteinGraphConfig`
+    object to specify construction parameters.
+    The ``*_funcs`` arguments act as fall-backs only: each one is used when the
+    corresponding field on ``config`` is ``None``; a value already set on
+    ``config`` takes precedence over the argument.
+    Note that the ``config`` object is updated in place with the resolved
+    function lists and is stored on the returned graph as ``g.graph["config"]``.
+    :param config: :class:`~graphein.protein.config.ProteinGraphConfig` object. If None, a default ``ProteinGraphConfig()`` is created.
+    :type config: graphein.protein.config.ProteinGraphConfig, optional
+    :param name: an optional given name for the graph. the PDB ID or PDB file name will be used if not specified.
+    :type name: str, optional
+    :param pdb_path: Path to ``pdb_file`` when constructing a graph from a local pdb file. Default is ``None``.
+    :type pdb_path: Optional[str], defaults to ``None``
+    :param pdb_code: A 4-character PDB ID / accession to be used to construct the graph, if available. Default is ``None``.
+    :type pdb_code: Optional[str], defaults to ``None``
+    :param uniprot_id: UniProt accession ID to build graph from AlphaFold2DB. Default is ``None``.
+    :type uniprot_id: str, optional
+    :param chain_selection: String of polypeptide chains to include in graph. E.g ``"ABDF"`` or ``"all"``. Default is ``"all"``.
+    :type chain_selection: str
+    :param model_index: Index of model to use in the case of structural ensembles. Default is ``1``.
+    :type model_index: int
+    :param df_processing_funcs: List of dataframe processing functions. Default is ``None``.
+    :type df_processing_funcs: List[Callable], optional
+    :param edge_construction_funcs: List of edge construction functions. Default is ``None``.
+    :type edge_construction_funcs: List[Callable], optional
+    :param edge_annotation_funcs: List of edge annotation functions. Default is ``None``.
+    :type edge_annotation_funcs: List[Callable], optional
+    :param node_annotation_funcs: List of node annotation functions. Default is ``None``.
+    :type node_annotation_funcs: List[Callable], optional
+    :param graph_annotation_funcs: List of graph annotation function. Default is ``None``.
+    :type graph_annotation_funcs: List[Callable]
+    :raises ValueError: If none of ``pdb_code``, ``pdb_path`` and ``uniprot_id`` is given.
+    :return: Protein Structure Graph
+    :rtype: nx.Graph
+    """
+    # At least one structure source is required
+    if pdb_code is None and pdb_path is None and uniprot_id is None:
+        raise ValueError(
+            "Either a PDB ID, UniProt ID or a path to a local PDB file"
+            " must be specified to construct a graph"
+        )
+    # If no config is provided, use default
+    if config is None:
+        config = ProteinGraphConfig()
+    # The progress display covers reading, processing, graph initialisation
+    # and edge construction; it is transient, i.e. cleared once finished
+    with Progress(transient=True) as progress:
+        task1 = progress.add_task("Reading PDB file...", total=1)
+        # Disabled: get name from pdb_file if no pdb_code is provided
+        # if pdb_path and (pdb_code is None and uniprot_id is None):
+        #    pdb_code = get_protein_name_from_filename(pdb_path)
+        #    pdb_code = pdb_code if len(pdb_code) == 4 else None
+        progress.advance(task1)
+        # Fill config fields that are still None from the function arguments.
+        # A value already present on the config wins over the argument.
+        config.protein_df_processing_functions = (
+            df_processing_funcs
+            if config.protein_df_processing_functions is None
+            else config.protein_df_processing_functions
+        )
+        config.edge_construction_functions = (
+            edge_construction_funcs
+            if config.edge_construction_functions is None
+            else config.edge_construction_functions
+        )
+        config.node_metadata_functions = (
+            node_annotation_funcs
+            if config.node_metadata_functions is None
+            else config.node_metadata_functions
+        )
+        config.graph_metadata_functions = (
+            graph_annotation_funcs
+            if config.graph_metadata_functions is None
+            else config.graph_metadata_functions
+        )
+        config.edge_metadata_functions = (
+            edge_annotation_funcs
+            if config.edge_metadata_functions is None
+            else config.edge_metadata_functions
+        )
+        # Load the raw structure dataframe from whichever source was given
+        raw_df = read_pdb_to_dataframe(
+            pdb_path,
+            pdb_code,
+            uniprot_id,
+            model_index=model_index,
+        )
+        task2 = progress.add_task("Processing PDB dataframe...", total=1)
+        # Disabled: labelling node ids on the raw dataframe
+        # raw_df = label_node_id(raw_df, granularity=config.granularity)
+        # raw_df.df["ATOM"] = label_node_id(
+        #    raw_df.df["ATOM"], granularity=config.granularity
+        # )
+        # raw_df.df["HETATM"] = label_node_id(
+        #    raw_df.df["HETATM"], granularity=config.granularity
+        # )
+        raw_df = sort_dataframe(raw_df)
+        # NOTE(review): config.protein_df_processing_functions is resolved
+        # above but is not passed to process_dataframe here - confirm intent
+        protein_df = process_dataframe(
+            raw_df,
+            chain_selection=chain_selection,
+            granularity=config.granularity,
+            insertions=config.insertions,
+            keep_hets=config.keep_hets,
+        )
+        progress.advance(task2)
+        task3 = progress.add_task("Initializing graph...", total=1)
+        # Initialise graph with metadata
+        g = initialise_graph_with_metadata(
+            protein_df=protein_df,
+            raw_pdb_df=raw_df,
+            name=name,
+            pdb_code=pdb_code,
+            pdb_path=pdb_path,
+            granularity=config.granularity,
+        )
+        # Add nodes to graph (uses the dataframe stored in g.graph["pdb_df"])
+        g = add_nodes_to_graph(g)
+        # Add config to graph; "path" duplicates the "pdb_path" entry
+        g.graph["config"] = config
+        g.graph["path"] = g.graph["pdb_path"]
+        # Annotate additional node metadata
+        if config.node_metadata_functions is not None:
+            g = annotate_node_metadata(g, config.node_metadata_functions)
+        progress.advance(task3)
+        task4 = progress.add_task("Constructing edges...", total=1)
+        # Compute graph edges
+        g = compute_edges(
+            g,
+            funcs=config.edge_construction_functions,
+            get_contacts_config=None,
+        )
+        progress.advance(task4)
+    # Graph- and edge-level annotation run after the progress display closes
+    # Annotate additional graph metadata
+    # print(g.graph['dssp_df'])
+    if config.graph_metadata_functions is not None:
+        g = annotate_graph_metadata(g, config.graph_metadata_functions)
+    # Annotate additional edge metadata
+    if config.edge_metadata_functions is not None:
+        g = annotate_edge_metadata(g, config.edge_metadata_functions)
+    return g
+def _mp_graph_constructor(
+    args: Tuple[str, str, int], source: str, config: ProteinGraphConfig
+) -> Union[nx.Graph, None]:
+    """
+    Protein graph constructor for use in multiprocessing several protein structure graphs.
+    :param args: Tuple of pdb code/path and the chain selection for that PDB.
+    :type args: Tuple[str, str]
+    :param use_pdb_code: Whether we are using ``"pdb_code"``s, ``pdb_path``s or ``"uniprot_id"``s.
+    :type use_pdb_code: bool
+    :param config: Protein structure graph construction config (see: :class:`graphein.protein.config.ProteinGraphConfig`).
+    :type config: ProteinGraphConfig
+    :return: Protein structure graph or ``None`` if an error is encountered.
+    :rtype: Union[nx.Graph, None]
+    """
+    log.info(
+        f"Constructing graph for: {args[0]}. Chain selection: {args[1]}. Model index: {args[2]}"
+    )
+    func = partial(construct_graph, config=config)
+    try:
+        if source == "pdb_code":
+            return func(
+                pdb_code=args[0], chain_selection=args[1], model_index=args[2]
+            )
+        elif source == "pdb_path":
+            return func(
+                pdb_path=args[0], chain_selection=args[1], model_index=args[2]
+            )
+        elif source == "uniprot_id":
+            return func(
+                uniprot_id=args[0],
+                chain_selection=args[1],
+                model_index=args[2],
+            )
+    except Exception as ex:
+        log.info(
+            f"Graph construction error (PDB={args[0]})! {traceback.format_exc()}"
+        )
+        log.info(ex)
+        return None
+def construct_graphs_mp(
+    pdb_code_it: Optional[List[str]] = None,
+    pdb_path_it: Optional[List[str]] = None,
+    uniprot_id_it: Optional[List[str]] = None,
+    chain_selections: Optional[List[str]] = None,
+    model_indices: Optional[List[str]] = None,
+    config: ProteinGraphConfig = ProteinGraphConfig(),
+    num_cores: int = 16,
+    return_dict: bool = True,
+    out_path: Optional[str] = None,
+) -> Union[List[nx.Graph], Dict[str, nx.Graph]]:
+    """
+    Constructs protein graphs for a list of pdb codes or pdb paths using multiprocessing.
+    :param pdb_code_it: List of pdb codes to use for protein graph construction
+    :type pdb_code_it: Optional[List[str]], defaults to ``None``
+    :param pdb_path_it: List of paths to PDB files to use for protein graph construction
+    :type pdb_path_it: Optional[List[str]], defaults to ``None``
+    :param chain_selections: List of chains to select from the protein structures (e.g. ``["ABC", "A", "L", "CD"...]``)
+    :type chain_selections: Optional[List[str]], defaults to ``None``
+    :param model_indices: List of model indices to use for protein graph construction. Only relevant for structures containing ensembles of models.
+    :type model_indices: Optional[List[str]], defaults to ``None``
+    :param config: ProteinGraphConfig to use.
+    :type config: graphein.protein.config.ProteinGraphConfig, defaults to default config params
+    :param num_cores: Number of cores to use for multiprocessing. The more the merrier
+    :type num_cores: int, defaults to ``16``
+    :param return_dict: Whether or not to return a dictionary (indexed by pdb codes/paths) or a list of graphs.
+    :type return_dict: bool, default to ``True``
+    :param out_path: Path to save the graphs to. If None, graphs are not saved.
+    :type out_path: Optional[str], defaults to ``None``
+    :return: Iterable of protein graphs. None values indicate there was a problem in constructing the graph for this particular pdb
+    :rtype: Union[List[nx.Graph], Dict[str, nx.Graph]]
+    """
+    assert (
+        pdb_code_it is not None or pdb_path_it is not None
+    ), "Iterable of pdb codes, pdb paths or uniprot IDs required."
+    if pdb_code_it is not None:
+        pdbs = pdb_code_it
+        source = "pdb_code"
+    if pdb_path_it is not None:
+        pdbs = pdb_path_it
+        source = "pdb_path"
+    if uniprot_id_it is not None:
+        pdbs = uniprot_id_it
+        source = "uniprot_id"
+    if chain_selections is None:
+        chain_selections = ["all"] * len(pdbs)
+    if model_indices is None:
+        model_indices = [1] * len(pdbs)
+    constructor = partial(_mp_graph_constructor, source=source, config=config)
+    graphs = list(
+        process_map(
+            constructor,
+            [
+                (pdb, chain_selections[i], model_indices[i])
+                for i, pdb in enumerate(pdbs)
+            ],
+            max_workers=num_cores,
+        )
+    )
+    if out_path is not None:
+        [
+            nx.write_gpickle(
+                g, str(f"{out_path}/" + f"{g.graph['name']}.pickle")
+            )
+            for g in graphs
+        ]
+    if return_dict:
+        graphs = {pdb: graphs[i] for i, pdb in enumerate(pdbs)}
+    return graphs
+def compute_chain_graph(
+    g: nx.Graph,
+    chain_list: Optional[List[str]] = None,
+    remove_self_loops: bool = False,
+    return_weighted_graph: bool = False,
+) -> Union[nx.Graph, nx.MultiGraph]:
+    """Computes a chain-level graph from a protein structure graph.
+    This graph features nodes as individual chains in a complex and edges as
+    the interactions between constituent nodes in each chain. You have the
+    option of returning an unweighted graph (multigraph,
+    ``return_weighted_graph=False``) or a weighted graph
+    (``return_weighted_graph=True``). The difference between these is the
+    unweighted graph features and edge for each interaction between chains
+    (ie the number of edges will be equal to the number of edges in the input
+    protein structure graph), while the weighted graph sums these interactions
+    to a single edge between chains with the counts stored as features.
+    :param g: A protein structure graph to compute the chain graph of.
+    :type g: nx.Graph
+    :param chain_list: A list of chains to extract from the input graph.
+        If ``None``, all chains will be used. This is provided as input to
+        ``extract_subgraph_from_chains``. Default is ``None``.
+    :type chain_list: Optional[List[str]]
+    :param remove_self_loops: Whether to remove self-loops from the graph.
+        Default is False.
+    :type remove_self_loops: bool
+    :return: A chain-level graph.
+    :rtype: Union[nx.Graph, nx.MultiGraph]
+    """
+    # If we are extracting specific chains, do it here.
+    if chain_list is not None:
+        g = extract_subgraph_from_chains(g, chain_list)
+    # Initialise new graph with Metadata
+    h = nx.MultiGraph()
+    h.graph = g.graph
+    h.graph["node_type"] = "chain"
+    # Set nodes
+    nodes_per_chain = {chain: 0 for chain in g.graph["chain_ids"]}
+    sequences = {chain: "" for chain in g.graph["chain_ids"]}
+    for n, d in g.nodes(data=True):
+        nodes_per_chain[d["chain_id"]] += 1
+        sequences[d["chain_id"]] += RESI_THREE_TO_1[d["residue_name"]]
+    h.add_nodes_from(g.graph["chain_ids"])
+    for n, d in h.nodes(data=True):
+        d["num_residues"] = nodes_per_chain[n]
+        d["sequence"] = sequences[n]
+    # Add edges
+    for u, v, d in g.edges(data=True):
+        h.add_edge(
+            g.nodes[u]["chain_id"], g.nodes[v]["chain_id"], kind=d["kind"]
+        )
+    # Remove self-loops if necessary. Checks for equality between nodes in a given edge.
+    if remove_self_loops:
+        edges_to_remove: List[Tuple[str]] = [
+            (u, v) for u, v in h.edges() if u == v
+        ]
+        h.remove_edges_from(edges_to_remove)
+    # Compute a weighted graph if required.
+    if return_weighted_graph:
+        return compute_weighted_graph_from_multigraph(h)
+    return h
+def compute_weighted_graph_from_multigraph(g: nx.MultiGraph) -> nx.Graph:
+    """Computes a weighted graph from a multigraph.
+    This function is used to convert a multigraph to a weighted graph. The
+    weights of the edges are the number of interactions between the nodes.
+    :param g: A multigraph.
+    :type g: nx.MultiGraph
+    :return: A weighted graph.
+    :rtype: nx.Graph
+    """
+    H = nx.Graph()
+    H.graph = g.graph
+    H.add_nodes_from(g.nodes(data=True))
+    for u, v, d in g.edges(data=True):
+        if H.has_edge(u, v):
+            H[u][v]["weight"] += len(d["kind"])
+            H[u][v]["kind"].update(d["kind"])
+            for kind in list(d["kind"]):
+                try:
+                    H[u][v][kind] += 1
+                except KeyError:
+                    H[u][v][kind] = 1
+        else:
+            H.add_edge(u, v, weight=len(d["kind"]), kind=d["kind"])
+            for kind in list(d["kind"]):
+                H[u][v][kind] = 1
+    return H
+def number_groups_of_runs(list_of_values: List[Any]) -> List[str]:
+    """Numbers groups of runs in a list of values.
+    E.g. ``["A", "A", "B", "A", "A", "A", "B", "B"] ->
+    ["A1", "A1", "B1", "A2", "A2", "A2", "B2", "B2"]``
+    :param list_of_values: List of values to number.
+    :type list_of_values: List[Any]
+    :return: List of numbered values.
+    :rtype: List[str]
+    """
+    df = pd.DataFrame({"val": list_of_values})
+    df["idx"] = df["val"].shift() != df["val"]
+    df["sum"] = df.groupby("val")["idx"].cumsum()
+    return list(df["val"].astype(str) + df["sum"].astype(str))
+def compute_secondary_structure_graph(
+    g: nx.Graph,
+    allowable_ss_elements: Optional[List[str]] = None,
+    remove_non_ss: bool = True,
+    remove_self_loops: bool = False,
+    return_weighted_graph: bool = False,
+) -> Union[nx.Graph, nx.MultiGraph]:
+    """Computes a secondary structure graph from a protein structure graph.
+    Consecutive nodes with the same ``ss`` label are merged into one numbered
+    secondary structure element (e.g. ``"H1"``, ``"E2"``), which becomes a node
+    of the new graph. Edges of the input graph are mapped onto the elements of
+    their endpoints. The graph-level metadata dictionary is shared with ``g``.
+    :param g: A protein structure graph to compute the secondary structure
+        graph of.
+    :type g: nx.Graph
+    :param allowable_ss_elements: Optional list of patterns. When given, only
+        elements whose numbered label matches at least one of them are kept
+        (the patterns are joined with ``|`` and passed to ``str.contains``).
+        Default is ``None`` (keep everything).
+    :type allowable_ss_elements: Optional[List[str]]
+    :param remove_non_ss: Whether to remove non-secondary structure nodes from
+        the graph. These are denoted as ``"-"`` by DSSP. Default is True.
+    :type remove_non_ss: bool
+    :param remove_self_loops: Whether to remove self-loops from the graph.
+        Default is ``False``.
+    :type remove_self_loops: bool
+    :param return_weighted_graph: Whether to return a weighted graph.
+        Default is False.
+    :type return_weighted_graph: bool
+    :raises ProteinGraphConfigurationError: If the protein structure graph is
+        not configured correctly with secondary structure assignments on all
+        nodes.
+    :return: A secondary structure graph.
+    :rtype: Union[nx.Graph, nx.MultiGraph]
+    """
+    # Initialise list of secondary structure elements we use to build the graph
+    ss_list: List[str] = []
+    # Check nodes have secondary structure assignment & store them in list
+    for _, d in g.nodes(data=True):
+        if "ss" not in d.keys():
+            raise ProteinGraphConfigurationError(
+                "Secondary structure not defined for all nodes."
+            )
+        ss_list.append(d["ss"])
+    # Number SS elements; from here on ss_list is a Series indexed by node id
+    # whose values are the numbered element labels
+    ss_list = pd.Series(number_groups_of_runs(ss_list))
+    ss_list.index = list(g.nodes())
+    # Remove unstructured elements (labels containing "-") if necessary
+    if remove_non_ss:
+        ss_list = ss_list[~ss_list.str.contains("-")]
+    # Subset to only allowable SS elements if necessary
+    if allowable_ss_elements:
+        ss_list = ss_list[
+            ss_list.str.contains("|".join(allowable_ss_elements))
+        ]
+    # Map every element label to the node ids that belong to it
+    constituent_residues: Dict[str, List[str]] = ss_list.index.groupby(
+        ss_list.values
+    )
+    constituent_residues = {
+        k: list(v) for k, v in constituent_residues.items()
+    }
+    # Number of source nodes per element label
+    residue_counts: Dict[str, int] = ss_list.groupby(ss_list).count().to_dict()
+    # Add Nodes from secondary structure list
+    h = nx.MultiGraph()
+    h.add_nodes_from(ss_list)
+    nx.set_node_attributes(h, residue_counts, "residue_counts")
+    nx.set_node_attributes(h, constituent_residues, "constituent_residues")
+    # Assign ss: the first character of the numbered label is the SS type
+    for n, d in h.nodes(data=True):
+        d["ss"] = n[0]
+    # Add graph-level metadata (same dict object as the source graph's)
+    h.graph = g.graph
+    h.graph["node_type"] = "secondary_structure"
+    # Iterate over edges in source graph and add SS-SS edges to new graph.
+    for u, v, d in g.edges(data=True):
+        try:
+            h.add_edge(
+                ss_list[u], ss_list[v], kind=d["kind"], source=f"{u}_{v}"
+            )
+        except KeyError as e:
+            # An endpoint was filtered out above, so it has no element label;
+            # such edges are skipped rather than treated as an error
+            log.debug(
+                f"Edge {u}-{v} not added to secondary structure graph. \
+                Reason: {e} not in graph"
+            )
+    # Remove self-loops if necessary.
+    # Checks for equality between nodes in a given edge.
+    if remove_self_loops:
+        edges_to_remove: List[Tuple[str]] = [
+            (u, v) for u, v in h.edges() if u == v
+        ]
+        h.remove_edges_from(edges_to_remove)
+    # Create weighted graph from h
+    if return_weighted_graph:
+        return compute_weighted_graph_from_multigraph(h)
+    return h
+def compute_line_graph(g: nx.Graph, repopulate_data: bool = True) -> nx.Graph:
+    """Computes the line graph of a graph.
+    The line graph of a graph G has a node for each edge in G and an edge
+    joining those nodes if the two edges in G share a common node. For directed
+    graphs, nodes are adjacent exactly when the edges they represent form a
+    directed path of length two.
+    The nodes of the line graph are 2-tuples of nodes in the original graph (or
+    3-tuples for multigraphs, with the key of the edge as the third element).
+    :param g: Graph to compute the line graph of.
+    :type g: nx.Graph
+    :param repopulate_data: Whether or not to map node and edge data to edges
+        and nodes of the line graph, defaults to True
+    :type repopulate_data: bool, optional
+    :return: Line graph of g.
+    :rtype: nx.Graph
+    """
+    l_g = nx.generators.line_graph(g)
+    l_g.graph = g.graph
+    if repopulate_data:
+        source_edge_data = {(u, v): d for u, v, d in g.edges(data=True)}
+        nx.set_node_attributes(l_g, source_edge_data)
+        node_list = {}
+        for u, v, d in l_g.edges(data=True):
+            node_union = u + v
+            for n in node_union:
+                if node_union.count(n) > 1:
+                    node_list[(u, v)] = n
+                    break
+        source_node_data = {k: g.nodes[v] for k, v in node_list.items()}
+        nx.set_edge_attributes(l_g, source_node_data)
+    return l_g

modeling_prot2text.py ADDED Viewed

	@@ -0,0 +1,392 @@

+from transformers import GPT2Config, AutoTokenizer, GPT2Config
+from transformers import PretrainedConfig, PreTrainedModel
+import transformers
+from typing import Optional, Tuple, Callable
+import torch
+import torch.nn as nn
+from transformers.modeling_utils import PreTrainedModel, PretrainedConfig
+from .utils import CABlock, _GPT2LMHeadModel
+from .configuration_prot2text import Prot2TextConfig
+import os
+import numpy as np
+from transformers.generation.configuration_utils import GenerationConfig
+from transformers.generation.logits_process import LogitsProcessorList
+from transformers.generation.stopping_criteria import StoppingCriteriaList
+from .pdb2graph import PDB2Graph, download_alphafold_structure
+from .graphs import *
+from .utils_dataset import *
+from graphein.protein.config import ProteinGraphConfig, DSSPConfig
+from graphein.protein.features.nodes.amino_acid import amino_acid_one_hot, meiler_embedding, expasy_protein_scale, hydrogen_bond_acceptor, hydrogen_bond_donor
+from graphein.protein.features.nodes.dssp import  phi, psi, asa, rsa, secondary_structure
+from graphein.protein.edges.distance import (add_peptide_bonds,
+                                             add_hydrogen_bond_interactions,
+                                             add_distance_threshold,
+                                            )
+from torch_geometric.nn import RGCNConv, global_mean_pool
+class EncoderRGCN(PreTrainedModel):
+    '''
+    RGCN encoder used to encode the protein structure graph.
+    The forward pass projects the node features with a linear layer, applies
+    ``n_layers`` ``RGCNConv`` layers, mean-pools the node embeddings per graph
+    and passes the result through two further linear layers.
+    :param input_dim: Size of the input node features.
+    :param hidden_dim: Width of the projection and of every RGCN layer.
+    :param n_layers: Number of ``RGCNConv`` layers.
+    :param emb_dim: Size of the output embedding.
+    :param dropout: Dropout probability of the ``dropout`` module.
+    :param num_relation: Number of relation (edge) types given to ``RGCNConv``.
+    :param prot2text_version: Version string stored on the instance.
+    '''
+    def __init__(self, input_dim, hidden_dim=512, n_layers=6, emb_dim=512, dropout=0.2, num_relation=7, prot2text_version='1.0'):
+        super(EncoderRGCN, self).__init__(PretrainedConfig(name='RGCN'))
+        self.n_layers = n_layers
+        self.output_dim = emb_dim
+        self.prot2text_version = prot2text_version
+        # Input projection: node features -> hidden_dim
+        self.fc0 = nn.Linear(input_dim, hidden_dim)
+        # NOTE: the batch-norm modules (batchnorm_final, batch_norms, batchnorm)
+        # and the dropout module are created here but are not used by forward()
+        self.batchnorm_final = nn.BatchNorm1d(hidden_dim)
+        self.batch_norms = nn.ModuleList()
+        self.batch_norms.append(nn.BatchNorm1d(hidden_dim))
+        # Stack of RGCN layers: one created up front plus n_layers - 1 in the loop
+        lst = list()
+        lst.append(RGCNConv(hidden_dim, hidden_dim, num_relations=num_relation))
+        for i in range(n_layers-1):
+            lst.append(RGCNConv(hidden_dim,hidden_dim, num_relations=num_relation))
+        self.conv = nn.ModuleList(lst)
+        # Output head applied to the pooled graph embedding
+        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
+        self.fc2 = nn.Linear(hidden_dim, self.output_dim)
+        self.dropout = nn.Dropout(p=dropout)
+        self.relu = nn.LeakyReLU()
+        self.batchnorm = nn.BatchNorm1d(hidden_dim)
+        # NOTE(review): presumably a placeholder because the encoder has no
+        # single "main" input - confirm against the PreTrainedModel API
+        self.main_input_name = 'nothing'
+    def forward(self, x:Optional[torch.FloatTensor] = None,
+                edge_index:Optional[torch.LongTensor] = None,
+                edge_type:Optional[torch.LongTensor] = None,
+                batch:Optional[torch.LongTensor] = None,
+                **kargs):
+        '''
+        Encode a batch of graphs into one embedding per graph.
+        :param x: Node features.
+        :param edge_index: Edge indices passed to every ``RGCNConv`` layer.
+        :param edge_type: Relation type of each edge, passed to every ``RGCNConv`` layer.
+        :param batch: Assignment of nodes to graphs, used by ``global_mean_pool``.
+        :param kargs: Extra keyword arguments; accepted and ignored.
+        :return: Pooled embedding with an extra dimension inserted at index 1
+            (presumably of shape ``(num_graphs, 1, emb_dim)`` - TODO confirm).
+        '''
+        # Project node features to the hidden size
+        x = self.relu(self.fc0(x))
+        # Apply the RGCN layers back to back (no activation, normalisation or
+        # dropout between them)
+        for i in range(self.n_layers):
+            x = self.conv[i](x, edge_index, edge_type)
+        # Average the node embeddings of each graph in the batch
+        out = global_mean_pool(x, batch)
+        out = self.relu(self.fc1(out))
+        out = self.relu(self.fc2(out))
+        return out.unsqueeze(1)
class Prot2TextModel(PreTrainedModel):
    '''
    Protein-to-text model: an optional RGCN structure encoder and an optional
    ESM sequence encoder feed a GPT-2 decoder through cross-attention.

    Which encoders exist is driven by ``config.rgcn`` and ``config.esm``; when
    both are enabled and ``config.cross_esm_graph`` is set, four ``CABlock``
    layers fuse the ESM states with the graph embedding.
    '''
    config_class = Prot2TextConfig
    _keys_to_ignore_on_load_missing = [r"transformer"]
    base_model_prefix = "decoder"

    def __init__(self, config):
        super().__init__(config)
        self.gpt_config = GPT2Config.from_dict(config.gpt_config)
        # if we are using RGCN to encode the protein's structure, define the RGCN encoder
        if config.rgcn:
            # The configuration class stores the value as `rgc_input_dim`,
            # while this model historically read `rgcn_input_dim`; accept both.
            rgcn_input_dim = getattr(config, "rgcn_input_dim", None)
            if rgcn_input_dim is None:
                rgcn_input_dim = config.rgc_input_dim
            self.encoder = EncoderRGCN(input_dim=rgcn_input_dim,
                                       hidden_dim=self.gpt_config.n_embd,
                                       n_layers=config.rgcn_n_layers,
                                       emb_dim=self.gpt_config.n_embd,
                                       prot2text_version=self.config.prot2text_version)
        # define the GPT2 decoder
        self.decoder = _GPT2LMHeadModel(self.gpt_config)
        # if using ESM to encode protein's sequence, define the ESM layer, the Projection layer and the fusion layer
        if config.esm:
            self.esm_config = PretrainedConfig.from_dict(config.esm_config)
            self.esm = transformers.EsmModel(self.esm_config)
            self.to_embedding = nn.Linear(self.esm_config.hidden_size, self.gpt_config.n_embd)
            if config.cross_esm_graph and config.rgcn:
                self.h = nn.ModuleList([CABlock(self.gpt_config,  layer_idx=i) for i in range(4)])
                self.ln_f = nn.LayerNorm(self.gpt_config.n_embd, eps=self.gpt_config.layer_norm_epsilon)
        self.config = config

    def get_encoder(self):
        '''Return the RGCN encoder (only defined when ``config.rgcn`` is set).'''
        return self.encoder

    def get_decoder(self):
        '''Return the GPT-2 decoder.'''
        return self.decoder

    def get_input_embeddings(self):
        '''Return the decoder's token embedding module.'''
        if hasattr(self, "transformer"):
            return self.transformer.wte
        return self.decoder.transformer.wte

    def warm_up(self, gpt_model=None, esm_model=None):
        '''
        Replace the ESM and/or GPT-2 sub-models with pretrained checkpoints.

        Args:
            gpt_model: name or path given to ``_GPT2LMHeadModel.from_pretrained``;
                the loaded decoder's embeddings are resized to
                ``gpt_config.vocab_size``.
            esm_model: name or path given to ``EsmModel.from_pretrained``.
        '''
        if esm_model is not None:
            self.esm = transformers.EsmModel.from_pretrained(esm_model)
        if gpt_model is not None:
            self.decoder = _GPT2LMHeadModel.from_pretrained(gpt_model, add_cross_attention=True, use_cache=False)
            self.decoder.resize_token_embeddings(self.gpt_config.vocab_size)
            self.decoder.config = self.gpt_config

    def forward(self,
                encoder_input_ids: Optional[torch.LongTensor] = None,
                edge_index: Optional[torch.LongTensor] = None,
                batch: Optional[torch.LongTensor] = None,
                x: Optional[torch.FloatTensor] = None,
                edge_type: Optional[torch.LongTensor] = None,
                decoder_input_ids: Optional[torch.LongTensor] = None,
                past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
                past_key_values_graph_esm: Optional[Tuple[Tuple[torch.Tensor]]] = None,
                decoder_attention_mask: Optional[torch.FloatTensor] = None,
                attention_mask: Optional[torch.FloatTensor] = None,
                token_type_ids: Optional[torch.LongTensor] = None,
                position_ids: Optional[torch.LongTensor] = None,
                head_mask: Optional[torch.FloatTensor] = None,
                inputs_embeds: Optional[torch.FloatTensor] = None,
                encoder_hidden_states: Optional[torch.Tensor] = None,
                encoder_attention_mask: Optional[torch.FloatTensor] = None,
                labels: Optional[torch.LongTensor] = None,
                use_cache: Optional[bool] = None,
                output_attentions: Optional[bool] = None,
                output_hidden_states: Optional[bool] = None,
                return_dict: Optional[bool] = None,
                get_graph_emb: Optional[bool] = False,
                **delete_args,
            ):
        '''
        Encode the protein and run the decoder.

        Args:
            encoder_input_ids, attention_mask: ESM token ids and their mask.
            x, edge_index, edge_type, batch: graph inputs of the RGCN encoder.
            decoder_input_ids, decoder_attention_mask, past_key_values,
            token_type_ids, position_ids, head_mask, inputs_embeds, labels,
            use_cache, output_attentions, output_hidden_states, return_dict:
                forwarded to the GPT-2 decoder.
            past_key_values_graph_esm: per-layer pasts for the fusion blocks.
            encoder_hidden_states, encoder_attention_mask: accepted but not
                used; the decoder receives the embedding computed here and
                ``attention_mask`` instead.
            get_graph_emb: when true, return the encoder embedding and skip
                the decoder.
            **delete_args: accepted and ignored.

        Returns:
            The decoder output, or the encoder embedding when
            ``get_graph_emb`` is true.

        Raises:
            ValueError: if no encoder is configured, if the RGCN encoder is
                configured but ``x`` is missing, or if version '1.0' receives
                a sequence whose length is not 1021.
        '''
        use_cache = use_cache if use_cache is not None else self.gpt_config.use_cache
        return_dict = return_dict if return_dict is not None else self.gpt_config.use_return_dict
        if decoder_input_ids is not None and decoder_input_ids.dim() == 3:
            decoder_input_ids = decoder_input_ids.squeeze(0)

        if not self.config.rgcn and not self.config.esm:
            raise ValueError("The model needs at least one encoder: enable `rgcn` and/or `esm` in the configuration")

        graph_emb = None
        graph_mask = None
        if self.config.rgcn:
            # Every code path below needs the graph embedding once the RGCN
            # encoder is configured, so fail early with a readable message.
            if x is None:
                raise ValueError("This model encodes the protein structure: `x`, `edge_index` and `edge_type` must be provided")
            graph_emb = self.encoder(x, edge_index, edge_type, batch)

        if self.config.esm:
            if self.config.prot2text_version=='1.0':
                if encoder_input_ids.size()[1] != 1021:
                    raise ValueError("For this version of the model you need to PAD/Truncate the amino acid sequence for the ESM model to 1021")
            # `.last_hidden_state` only exists on the dict-style output, so the
            # ESM call always asks for it regardless of the caller's `return_dict`.
            esm_emb = self.esm(input_ids=encoder_input_ids, attention_mask=attention_mask, return_dict=True).last_hidden_state
            esm_emb = self.to_embedding(esm_emb)
            if self.config.rgcn and not self.config.cross_esm_graph:
                # Concatenate the graph vector in front of the sequence states
                # and extend the mask by one visible position for it.
                graph_emb = torch.cat((graph_emb, esm_emb), dim=1)
                # `.device` works for CPU tensors too (`get_device()` returns -1 there).
                t_add = torch.ones((attention_mask.size(0), 1), device=attention_mask.device)
                attention_mask = torch.cat((t_add, attention_mask), dim=1)
            elif self.config.rgcn:
                # Cross-attention fusion of the ESM states with the graph embedding.
                if past_key_values_graph_esm is None:
                    past_key_values_graph_esm = tuple([None] * len(self.h))
                output_shape = esm_emb.size()
                for block, layer_past in zip(self.h, past_key_values_graph_esm):
                    outputs = block(
                        esm_emb,
                        layer_past=layer_past,
                        attention_mask=attention_mask,
                        encoder_hidden_states=graph_emb,
                        encoder_attention_mask=graph_mask,
                        use_cache=use_cache,
                        output_attentions=False,
                    )
                    esm_emb = outputs[0]
                esm_emb = self.ln_f(esm_emb)
                esm_emb = esm_emb.view(output_shape)
                graph_emb = esm_emb
            else:
                graph_emb = esm_emb
        else:
            attention_mask = None

        if self.config.prot2text_version=='1.0':
            attention_mask = None
        if get_graph_emb:
            return graph_emb
        transformer_outputs = self.decoder(input_ids=decoder_input_ids,
                                            past_key_values=past_key_values,
                                            attention_mask=decoder_attention_mask,
                                            token_type_ids=token_type_ids,
                                            position_ids=position_ids,
                                            head_mask=head_mask,
                                            inputs_embeds=inputs_embeds,
                                            encoder_hidden_states=graph_emb,
                                            encoder_attention_mask=attention_mask,
                                            labels=labels,
                                            use_cache=use_cache,
                                            output_attentions=output_attentions,
                                            output_hidden_states=output_hidden_states,
                                            return_dict=return_dict,
                                            )
        return transformer_outputs

    @staticmethod
    def _remove_files(*paths):
        '''Delete every path in `paths` that currently exists.'''
        for path in paths:
            if os.path.exists(path):
                os.remove(path)

    def _describe_from_alphafold(self, protein_pdbID, esmtokenizer, tokenizer, device):
        '''Download the AlphaFold structure, build its graph and decode a description.'''
        # Local import: this module does not import `partial` explicitly.
        from functools import partial

        graph_config = ProteinGraphConfig(**{
            "node_metadata_functions": [amino_acid_one_hot,
                                        expasy_protein_scale,
                                        meiler_embedding,
                                        hydrogen_bond_acceptor, hydrogen_bond_donor
                                        ],
            "edge_construction_functions": [add_peptide_bonds,
                                            add_hydrogen_bond_interactions,
                                            partial(add_distance_threshold, long_interaction_threshold=3, threshold=10.),],
            "graph_metadata_functions": [asa, phi, psi, secondary_structure, rsa],
            "dssp_config": DSSPConfig()})

        # Expand "~" explicitly: os.makedirs would otherwise create a directory
        # literally named "~" inside the current working directory.
        save_dir = os.path.expanduser("~/.tmp/pdb/")
        PATH_TO_DATA = os.path.join(save_dir, "pdb")
        OUTPUT_FOLDER = os.path.join(save_dir, "raw")
        processed_dir = os.path.join(save_dir, "processed")
        for folder in (PATH_TO_DATA, OUTPUT_FOLDER, processed_dir):
            os.makedirs(folder, exist_ok=True)

        structure_filename = download_alphafold_structure(uniprot_id=protein_pdbID, out_dir=PATH_TO_DATA)
        if structure_filename is None:
            raise ValueError("Error! the ID does not exist in AlphaFoldDB or you do not have internet connection")
        pt_name = os.path.basename(structure_filename).replace('.pdb', '.pt')
        graph_filename = os.path.join(OUTPUT_FOLDER, pt_name)
        process_filename = os.path.join(processed_dir, pt_name)

        try:
            try:
                gpdb = PDB2Graph(root = PATH_TO_DATA, output_folder = OUTPUT_FOLDER, config=graph_config, n_processors=1).create_pyg_graph(structure_filename)
                seq = esmtokenizer(gpdb.sequence, add_special_tokens=True, truncation=True, max_length=1021, padding='max_length',return_tensors="pt")
                torch.save(gpdb, graph_filename)
                gpdb.edge_type = [np.array(gpdb.edge_type.transpose(0,1))]
                gpdb.encoder_input_ids = seq['input_ids']
                gpdb.attention_mask = seq['attention_mask']
                torch.save(gpdb, process_filename)
            except Exception as err:
                # Keep the historical message but chain the real cause.
                raise ValueError('creating graphs did not work, probably the pdb file of alphaFold is damaged') from err

            inputs = gpdb.to_dict()
            # Edge types are stored one-hot; the encoder expects relation ids.
            inputs['edge_type'] = torch.cat([torch.tensor(one_hot) for one_hot in inputs['edge_type']], dim=0)
            inputs['edge_type'] = torch.argmax(inputs['edge_type'], dim=1)
            for key in ['num_nodes', 'node_id', 'name', 'sequence', 'distance_matrix', 'distance', 'coordinates']:
                inputs.pop(key, None)
            inputs['decoder_input_ids'] = inputs['encoder_input_ids'][:,0:1].clone()
            inputs['decoder_input_ids'][:,0] = tokenizer.bos_token_id
            inputs["decoder_attention_mask"] = torch.ones(inputs['decoder_input_ids'].shape[0], 1)
            inputs = {k: v.to(device=device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}

            encoder_state = {
                'hidden_states': self(**inputs, get_graph_emb=True, output_attentions=True),
                'attentions': inputs['attention_mask'],
            }
            tok_ids = self.decoder.generate(input_ids=inputs['decoder_input_ids'],
                                            encoder_outputs=encoder_state,
                                            use_cache=True,
                                            output_attentions=False,
                                            output_scores=False,
                                            return_dict_in_generate=True,
                                            encoder_attention_mask=inputs['attention_mask'],
                                            length_penalty=1.0,
                                            no_repeat_ngram_size=None,
                                            early_stopping=False,
                                            num_beams=1)
            generated = tokenizer.batch_decode(tok_ids.get('sequences'), skip_special_tokens=True)
            return generated[0].replace('<|stop_token|>', '').replace('<|graph_token|>', '')
        finally:
            # The downloaded structure and the intermediate graphs are scratch
            # files; remove them whether or not generation succeeded.
            self._remove_files(structure_filename, graph_filename, process_filename)

    def _describe_from_sequence(self, protein_sequence, x, edge_index, edge_type, esmtokenizer, tokenizer, device):
        '''Decode a description from an amino-acid sequence (plus graph tensors when RGCN is used).'''
        seq = esmtokenizer([protein_sequence], add_special_tokens=True, truncation=True, max_length=1021, padding='max_length', return_tensors="pt")
        inputs = {
            'encoder_input_ids': seq['input_ids'],
            'attention_mask': seq['attention_mask'],
        }
        if self.config.rgcn:
            # The caller-supplied graph triplet was validated by the public
            # method; forward it so the structure encoder can run.
            inputs.update(x=x, edge_index=edge_index, edge_type=edge_type)
        inputs['decoder_input_ids'] = inputs['encoder_input_ids'][:,0:1].clone()
        inputs['decoder_input_ids'][:,0] = tokenizer.bos_token_id
        inputs = {k: v.to(device=device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
        encoder_state = {'hidden_states': self(**inputs, get_graph_emb=True, output_attentions=True)}
        # NOTE(review): unlike the AlphaFold path, no encoder_attention_mask is
        # passed to the decoder here; kept as-is to preserve generated outputs.
        generated = tokenizer.batch_decode(self.decoder.generate(input_ids=inputs['decoder_input_ids'], encoder_outputs=encoder_state, use_cache=True), skip_special_tokens=True)
        return generated[0].replace('<|stop_token|>', '').replace('<|graph_token|>', '')

    @torch.no_grad()
    def generate_protein_description(self,
                                    protein_pdbID=None,
                                    protein_sequence=None,
                                    edge_index: Optional[torch.LongTensor] = None,
                                    x: Optional[torch.FloatTensor] = None,
                                    edge_type: Optional[torch.LongTensor] = None,
                                    tokenizer=None,
                                    device='cpu'
                                     ):
        '''
        Generate a textual description for one protein.

        Args:
            protein_pdbID: identifier used to download the structure from
                AlphaFoldDB; when given, the graph and the sequence are
                derived from the downloaded file.
            protein_sequence: amino-acid sequence, used when no identifier is
                given.
            x, edge_index, edge_type: graph tensors to use with
                `protein_sequence` for structure-based models.
            tokenizer: decoder tokenizer; supplies `bos_token_id` and decodes
                the generated ids.
            device: device the model and inputs are moved to.

        Returns:
            The generated description as a string.

        Raises:
            ValueError: on missing or inconsistent inputs, on a failed
                download, or when the graph cannot be built.
        '''
        if not self.config.esm:
            raise ValueError("generate_protein_description needs a model built with the ESM sequence encoder (config.esm=True)")
        if tokenizer is None:
            raise ValueError("please provide the decoder tokenizer through the `tokenizer` argument")
        if not self.config.rgcn and protein_sequence is None:
            raise ValueError(
                "The model you are trying to use is based only on protein sequence, please provide an amino-acid protein_sequence"
            )
        if self.config.rgcn and protein_pdbID is None and (x is None or edge_index is None or edge_type is None):
            raise ValueError(
                "The model you are trying to use is based on protein structure, please provide a AlphaFold ID (you must have to have internet connection using protein_pdbID, or provide the triplet inputs: x (node features), edge_index and edge_type"
            )
        if protein_pdbID is None and protein_sequence is None:
            raise ValueError(
                "you need to provide either a protein AlphaFold Id or an amino-acid sequence"
            )
        esmtokenizer = AutoTokenizer.from_pretrained(self.config.esm_model_name)
        # Both input paths run inference, so switch to eval mode for both.
        self.eval()
        self.to(device)
        if protein_pdbID is not None:
            return self._describe_from_alphafold(protein_pdbID, esmtokenizer, tokenizer, device)
        return self._describe_from_sequence(protein_sequence, x, edge_index, edge_type, esmtokenizer, tokenizer, device)

    @torch.no_grad()
    def generate(self,
                inputs: Optional[torch.Tensor] = None,
                generation_config: Optional[GenerationConfig] = None,
                logits_processor: Optional[LogitsProcessorList] = None,
                stopping_criteria: Optional[StoppingCriteriaList] = None,
                prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], "List[int]"]] = None,
                synced_gpus: Optional[bool] = None,
                assistant_model: Optional["PreTrainedModel"] = None,
                streamer: Optional["BaseStreamer"] = None,
                **kwargs,
            ):
        '''
        Encode the protein once, then delegate decoding to the GPT-2 decoder.

        `kwargs` must contain `decoder_input_ids` and whatever `forward` needs
        to compute the encoder embedding; `attention_mask` (the ESM mask) is
        handed to the decoder as `encoder_attention_mask`. Dataset and encoder
        keys, including `max_length`, are stripped before the remaining
        `kwargs` reach `decoder.generate`. `inputs` is not used.

        Returns:
            Whatever `self.decoder.generate` returns.
        '''
        encoder_state = self(**kwargs, get_graph_emb=True)
        input_ids = kwargs['decoder_input_ids']
        # NOTE(review): `decoder_attention_mask` is dropped below and never
        # reaches the decoder; left unchanged to preserve generation results.
        encoder_attention_mask = kwargs.get('attention_mask')
        if encoder_attention_mask is not None:
            if not self.config.cross_esm_graph and self.config.rgcn and self.config.esm:
                # One extra visible position for the prepended graph vector;
                # `.device` is valid on CPU, unlike `get_device()`.
                t_add = torch.ones((encoder_attention_mask.size(0), 1), device=encoder_attention_mask.device)
                encoder_attention_mask = torch.cat((t_add, encoder_attention_mask), dim=1)
            kwargs['encoder_attention_mask'] = encoder_attention_mask
        for key in ['edge_index', 'edge_type', 'x', 'encoder_input_ids', 'decoder_input_ids', 'decoder_attention_mask', 'batch', 'attention_mask', 'max_length',
                    '_num_nodes', 'node_id', 'name', 'sequence', 'distance_matrix', 'distance', 'coordinates', 'ptr', 'num_nodes',]:
            kwargs.pop(key, None)
        return self.decoder.generate(input_ids=input_ids,
                                     generation_config=generation_config,
                                     logits_processor=logits_processor,
                                     stopping_criteria=stopping_criteria,
                                     prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
                                     synced_gpus=synced_gpus,
                                     assistant_model=assistant_model,
                                     streamer=streamer,
                                     encoder_outputs={'hidden_states': encoder_state, 'attentions':0},
                                     **kwargs
                                     )

pdb2graph.py ADDED Viewed

	@@ -0,0 +1,171 @@

+import multiprocessing
+import os
+from tqdm import tqdm
+from sklearn.preprocessing import MultiLabelBinarizer
+from torch_geometric.data import Data
+import torch
+import numpy as np
+from .conversion import convert_nx_to_pyg_data
+from graphein.protein.config import ProteinGraphConfig, DSSPConfig
+from graphein.protein.features.nodes.amino_acid import amino_acid_one_hot, meiler_embedding, expasy_protein_scale, hydrogen_bond_acceptor, hydrogen_bond_donor
+from graphein.protein.features.nodes.dssp import  phi, psi, asa, rsa, secondary_structure
+from graphein.protein.edges.distance import (add_peptide_bonds,
+                                             add_hydrogen_bond_interactions,
+                                             add_disulfide_interactions,
+                                             add_ionic_interactions,
+                                             add_delaunay_triangulation,
+                                             add_distance_threshold,
+                                             add_sequence_distance_edges,
+                                             add_k_nn_edges)
+from functools import partial
+from .graphs import *
+from .utils_dataset import *
+import os
+import sys
+import subprocess
+import wget
class PDB2Graph():
    """Convert PDB structure files into torch-geometric ``Data`` graphs.

    ``root`` is the folder holding the ``.pdb`` files, ``output_folder`` the
    folder receiving one ``.pt`` file per structure, ``config`` the graph
    construction configuration handed to ``construct_graph`` and
    ``n_processors`` the maximum number of worker processes run at once by
    ``process``.
    """
    def __init__(self, root, output_folder, config, n_processors=int(multiprocessing.cpu_count())):
        self.root = root
        self.output_folder = output_folder
        self.map_secondary_structure = {'-':0, 'H':1, 'B':2, 'E':3, 'G':4, 'I':5, 'T':6, 'S':7}
        self.init_ohe_edge_type()
        self.config = config
        self.features = ['phi', 'psi', 'rsa', 'asa', 'ss', 'expasy']
        self.n_processors = n_processors
        self.raw_dir = root
        self.processed_dir = self._processed_dir()
        self.raw_file_names = self._raw_file_names()
        self.processed_file_names = self._processed_file_names()

    def _processed_dir(self):
        """Create the output folder if needed and return its path."""
        os.makedirs(self.output_folder, exist_ok=True)
        return self.output_folder

    def _raw_file_names(self):
        """Return the names of all entries of the raw folder."""
        return os.listdir(self.raw_dir)

    def _processed_file_names(self):
        """Return the ``.pt`` name expected for every ``.pdb`` file of the raw folder."""
        # Only `.pdb` files can be processed (`pdb2pathstructure` appends that
        # suffix), and splitext keeps dots that are part of the stem.
        return [self.pdb2pathdata(os.path.splitext(raw_name)[0])
                for raw_name in self.raw_file_names
                if raw_name.endswith('.pdb')]

    def create_nx_graph(self, path_to_structure):
        """Build the networkx graph of one structure file."""
        return construct_graph(self.config, pdb_path = path_to_structure)

    def create_pyg_graph(self, path_to_structure):
        """Build the ``Data`` graph of one structure file.

        The node feature matrix ``x`` is the amino-acid index followed by the
        features listed in ``self.features``; ``edge_type`` is the multi-hot
        encoding of each edge's kinds.
        """
        pyg_graph = convert_nx_to_pyg_data(self.create_nx_graph(path_to_structure))
        graph = Data(edge_index = pyg_graph.edge_index,
                    num_nodes = len(pyg_graph.node_id),
                    node_id = pyg_graph.node_id,
                    name = pyg_graph.name[0],
                    sequence = getattr(pyg_graph, f"sequence_{pyg_graph.chain_id[0]}"),
                    distance_matrix = pyg_graph.dist_mat,
                    distance = pyg_graph.distance,
                    coordinates = torch.FloatTensor(np.array(pyg_graph.coords[0])))
        # First column: index of the amino acid in its one-hot encoding.
        columns = [np.array([np.argmax(pyg_graph.amino_acid_one_hot, axis=1)]).reshape(-1,1)]
        for feat in self.features:
            if feat == "ss":
                # Secondary-structure letters become integer codes; unknown
                # letters fall back to 0.
                feature = np.array([[self.map_secondary_structure.get(feat_node, 0)]
                                    for feat_node in pyg_graph[feat]])
            else:
                feature = np.array(pyg_graph[feat])
                if feature.ndim == 1:
                    feature = feature.reshape(-1,1)
            columns.append(feature)
        x = np.concatenate(columns, axis = 1)
        graph.edge_type = self.mlb.transform(pyg_graph.kind)
        graph.x = torch.FloatTensor(x)
        return graph

    def init_ohe_edge_type(self):
        """Fit the multi-label binarizer over the seven known edge kinds."""
        edge_kinds = ['peptide_bond', 'sequence_distance_2', 'sequence_distance_3'
                      , 'distance_threshold', 'delaunay', 'hbond', 'k_nn']
        self.mlb = MultiLabelBinarizer(classes = edge_kinds)
        self.mlb.fit([edge_kinds])

    def process(self):
        """Convert the PDB files into torch geometric graphs"""
        to_be_processed = self.get_files_to_process()
        # Never keep more than `n_processors` workers alive: one process per
        # file without a bound can exhaust the machine on large folders.
        max_workers = max(1, int(self.n_processors))
        running = []
        for prot in tqdm(to_be_processed):
            if len(running) >= max_workers:
                running.pop(0).join()
            worker = multiprocessing.Process(target=self.graph_creation, args=(prot,))
            worker.start()
            running.append(worker)
        for worker in running:
            worker.join()

    def graph_creation(self, pdb):
        """Create a graph from the PDB file"""
        # Define the path_to_structure from the pdb name file
        path_to_structure = self.pdb2pathstructure(pdb)
        # Convert the structure into a graph
        g = self.create_pyg_graph(path_to_structure)
        # Save the graph
        torch.save(g, os.path.join(self.output_folder, self.pdb2pathdata(pdb)))
        return None

    def pdb2pathdata(self, pdb):
        """Return the graph file name of the structure named `pdb`."""
        return pdb+'.pt'

    def pdb2pathstructure(self, pdb):
        """Return the path of the raw structure file named `pdb`."""
        return os.path.join(self.raw_dir, pdb+'.pdb')

    def get_files_to_process(self):
        """Return the names (without extension) of the structures not converted yet."""
        already_done = set(os.listdir(self.processed_dir))
        pending = set(self.processed_file_names).difference(already_done)
        return [os.path.splitext(graph_name)[0] for graph_name in pending]
def download_alphafold_structure(
    uniprot_id: str,
    out_dir: str,
    version: int = 4
    ):
    """Fetch an AlphaFold model file into `out_dir`, reusing an existing copy.

    Args:
        uniprot_id: identifier of the protein; upper-cased before use.
        out_dir: folder where the ``.pdb`` file is looked up and downloaded.
        version: model version placed in the file name.

    Returns:
        The path of the structure file, or ``None`` when the download fails
        (an error line is printed in that case).
    """
    BASE_URL = "https://alphafold.ebi.ac.uk/files/"
    uniprot_id = uniprot_id.upper()
    model_name = f"AF-{uniprot_id}-F1-model_v{version}.pdb"
    structure_filename = os.path.join(out_dir, model_name)
    # A file already present in `out_dir` is returned without any network access.
    if os.path.exists(structure_filename):
        return structure_filename
    try:
        return wget.download(f"{BASE_URL}{model_name}", out=out_dir)
    except Exception:
        # `Exception` rather than a bare except so that Ctrl-C and
        # interpreter shutdown are not reported as download failures.
        print('Error.. could not download: ', model_name)
        return None

utils.py ADDED Viewed

	@@ -0,0 +1,742 @@

+import torch.nn as nn
+from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2MLP
+from typing import Optional, Tuple, Union, Any, Dict, List
+from transformers import Seq2SeqTrainer, GPT2LMHeadModel
+from torch.utils.data.distributed import DistributedSampler
+import torch
+from transformers.deepspeed import is_deepspeed_zero3_enabled
+from transformers.generation.logits_process import LogitsProcessorList
+from transformers.generation.stopping_criteria import StoppingCriteriaList
+from transformers.generation.utils import GreedySearchOutput, GreedySearchEncoderDecoderOutput, BeamSearchOutput, BeamSearchEncoderDecoderOutput
+from transformers.generation.beam_search import BeamScorer
+from torch_geometric.loader import DataLoader
+from torch_geometric.data import Dataset
+class _GPT2LMHeadModel(GPT2LMHeadModel):
+    def _init_(self, config):
+        super(GPT2LMHeadModel, self).init_(config)
+        self.config = config
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, encoder_outputs=None, **kwargs):
+        '''
+        This function is an edited version of the prepare_inputs_for_generation function from HuggingFace's transformers
+        https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
+        '''
+        token_type_ids = kwargs.get("token_type_ids", None)
+        # only last token for inputs_ids if past is defined in kwargs
+        if past_key_values:
+            input_ids = input_ids[:, -1].unsqueeze(-1)
+            if token_type_ids is not None:
+                token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
+        attention_mask = kwargs.get("attention_mask", None)
+        position_ids = kwargs.get("position_ids", None)
+        if self.config.prot2text_version=="1.1" or self.config.prot2text_version=="1.2":
+            encoder_attention_mask = kwargs.get("encoder_attention_mask", None)
+        elif self.config.prot2text_version=="1.0":
+            encoder_attention_mask = None
+        if attention_mask is not None and position_ids is None:
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -1].unsqueeze(-1)
+        else:
+            position_ids = None
+        model_specific_kwargs = {
+            "encoder_hidden_states": encoder_outputs['hidden_states'],
+        }
+        return {
+            "input_ids": input_ids,
+            "past_key_values": past_key_values,
+            "use_cache": kwargs.get("use_cache"),
+            "position_ids": position_ids,
+            "attention_mask": attention_mask,
+            "token_type_ids": token_type_ids,
+            "encoder_attention_mask": encoder_attention_mask,
+            **model_specific_kwargs
+        }
+    def greedy_search(
+        self,
+        input_ids: torch.LongTensor,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[Union[int, List[int]]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_scores: Optional[bool] = None,
+        return_dict_in_generate: Optional[bool] = None,
+        synced_gpus: bool = False,
+        streamer: Optional["BaseStreamer"] = None,
+        **model_kwargs,
+    ) -> Union[GreedySearchOutput, torch.LongTensor]:
+        '''
+        This function is an edited version of the greedy_search function from HuggingFace's transformers
+        https://github.com/huggingface/transformers/blob/main/src/transformers/generation/utils.py
+        '''
+        # init values
+        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+        if max_length is not None:
+            warnings.warn(
+                "`max_length` is deprecated in this function, use"
+                " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
+                UserWarning,
+            )
+            stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
+        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+        if isinstance(eos_token_id, int):
+            eos_token_id = [eos_token_id]
+        eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
+        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
+        output_attentions = (
+            output_attentions if output_attentions is not None else self.generation_config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
+        )
+        return_dict_in_generate = (
+            return_dict_in_generate
+            if return_dict_in_generate is not None
+            else self.generation_config.return_dict_in_generate
+        )
+        # init attention / hidden states / scores tuples
+        scores = () if (return_dict_in_generate and output_scores) else None
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+        if return_dict_in_generate and self.config.is_encoder_decoder:
+            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+            encoder_hidden_states = (
+                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+            )
+        # keep track of which sequences are already finished
+        unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
+        this_peer_finished = False  # used by synced_gpus only
+        while True:
+            if synced_gpus:
+                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+                # The following logic allows an early break if all peers finished generating their sequence
+                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
+                # send 0.0 if we finished, 1.0 otherwise
+                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+                # did all peers finish? the reduced sum will be 0.0 then
+                if this_peer_finished_flag.item() == 0.0:
+                    break
+            # prepare model inputs
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+            # forward pass to get next token
+            outputs = self(
+                **model_inputs,
+                return_dict=True,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+            )
+            if synced_gpus and this_peer_finished:
+                continue  # don't waste resources running the code we don't need
+            next_token_logits = outputs.logits[:, -1, :]
+            # pre-process distribution
+            next_tokens_scores = logits_processor(input_ids, next_token_logits)
+            # Store scores, attentions and hidden_states when required
+            if return_dict_in_generate:
+                if output_scores:
+                    scores += (next_tokens_scores,)
+                if output_attentions:
+                    decoder_attentions += (
+                        (outputs.decoder_attentions,) if not self.config.is_encoder_decoder else (outputs.attentions,)
+                    )
+                    if self.config.is_encoder_decoder:
+                        cross_attentions += (outputs.cross_attentions,)
+                if output_hidden_states:
+                    decoder_hidden_states += (
+                        (outputs.decoder_hidden_states,)
+                        if self.config.is_encoder_decoder
+                        else (outputs.hidden_states,)
+                    )
+            # argmax
+            next_tokens = torch.argmax(next_tokens_scores, dim=-1)
+            # finished sentences should have their next token be a padding token
+            if eos_token_id is not None:
+                if pad_token_id is None:
+                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
+            # update generated ids, model inputs, and length for next step
+            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+            if streamer is not None:
+                streamer.put(next_tokens.cpu())
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+            )
+            # if eos_token was found in one sentence, set sentence to finished
+            if eos_token_id_tensor is not None:
+                unfinished_sequences = unfinished_sequences.mul(
+                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
+                )
+                # stop when each sentence is finished
+                if unfinished_sequences.max() == 0:
+                    this_peer_finished = True
+            # stop if we exceed the maximum length
+            try:
+                if stopping_criteria(input_ids, scores):
+                    this_peer_finished = True
+            except:
+                if all(stopping_criteria(input_ids, scores)):
+                    this_peer_finished = True
+            if this_peer_finished and not synced_gpus:
+                break
+        if streamer is not None:
+            streamer.end()
+        if return_dict_in_generate:
+            if self.config.is_encoder_decoder:
+                return GreedySearchEncoderDecoderOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                )
+            else:
+                return GreedySearchDecoderOnlyOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                )
+        else:
+            return input_ids
+    def _greedy_search(
+        self,
+        input_ids: torch.LongTensor,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[Union[int, List[int]]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_scores: Optional[bool] = None,
+        return_dict_in_generate: Optional[bool] = None,
+        synced_gpus: bool = False,
+        streamer: Optional["BaseStreamer"] = None,
+        **model_kwargs,
+    ) -> Union[GreedySearchOutput, torch.LongTensor]:
+        """
+        Underscore-prefixed alias of `greedy_search`: forwards every argument unchanged and returns its result.
+        NOTE(review): presumably kept so that callers using the private `_greedy_search` name reach the
+        customised implementation of this class -- confirm against the targeted transformers version.
+        """
+        # Arguments are passed positionally, so their order must match the parameter order of
+        # `greedy_search` (its signature is declared outside this section).
+        return self.greedy_search(
+                            input_ids,
+                            logits_processor,
+                            stopping_criteria,
+                            max_length,
+                            pad_token_id,
+                            eos_token_id,
+                            output_attentions,
+                            output_hidden_states,
+                            output_scores,
+                            return_dict_in_generate,
+                            synced_gpus,
+                            streamer,
+                            **model_kwargs,
+                            )
+    def _beam_search(
+        self,
+        input_ids: torch.LongTensor,
+        beam_scorer: BeamScorer,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[Union[int, List[int]]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_scores: Optional[bool] = None,
+        return_dict_in_generate: Optional[bool] = None,
+        synced_gpus: bool = False,
+        **model_kwargs,
+    ) -> Union[BeamSearchOutput, torch.LongTensor]:
+        return self.beam_search(
+                input_ids,
+                beam_scorer,
+                logits_processor,
+                stopping_criteria,
+                max_length,
+                pad_token_id,
+                eos_token_id,
+                output_attentions,
+                output_hidden_states,
+                output_scores,
+                return_dict_in_generate,
+                synced_gpus,
+                **model_kwargs,
+            )
+    def beam_search(
+        self,
+        input_ids: torch.LongTensor,
+        beam_scorer: BeamScorer,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[Union[int, List[int]]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_scores: Optional[bool] = None,
+        return_dict_in_generate: Optional[bool] = None,
+        synced_gpus: bool = False,
+        **model_kwargs,
+    ) -> Union[BeamSearchOutput, torch.LongTensor]:
+        '''
+        This function is an edited version of the beam_search function from HuggingFace's transformers
+        https://github.com/huggingface/transformers/blob/main/src/transformers/generation/utils.py
+        '''
+        # init values
+        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+        if max_length is not None:
+            warnings.warn(
+                "`max_length` is deprecated in this function, use"
+                " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.",
+                UserWarning,
+            )
+            stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
+        if len(stopping_criteria) == 0:
+            warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning)
+        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+        if isinstance(eos_token_id, int):
+            eos_token_id = [eos_token_id]
+        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
+        output_attentions = (
+            output_attentions if output_attentions is not None else self.generation_config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
+        )
+        return_dict_in_generate = (
+            return_dict_in_generate
+            if return_dict_in_generate is not None
+            else self.generation_config.return_dict_in_generate
+        )
+        batch_size = len(beam_scorer._beam_hyps)
+        num_beams = beam_scorer.num_beams
+        batch_beam_size, cur_len = input_ids.shape
+        if num_beams * batch_size != batch_beam_size:
+            raise ValueError(
+                f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
+            )
+        # init attention / hidden states / scores tuples
+        scores = () if (return_dict_in_generate and output_scores) else None
+        beam_indices = (
+            tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None
+        )
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+        if return_dict_in_generate and self.config.is_encoder_decoder:
+            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+            encoder_hidden_states = (
+                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+            )
+        # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens
+        # of the first beam are considered to avoid sampling the exact same tokens across all beams.
+        beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
+        beam_scores[:, 1:] = -1e9
+        beam_scores = beam_scores.view((batch_size * num_beams,))
+        this_peer_finished = False  # used by synced_gpus only
+        while True:
+            if synced_gpus:
+                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+                # The following logic allows an early break if all peers finished generating their sequence
+                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
+                # send 0.0 if we finished, 1.0 otherwise
+                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+                # did all peers finish? the reduced sum will be 0.0 then
+                if this_peer_finished_flag.item() == 0.0:
+                    break
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+            outputs = self(
+                **model_inputs,
+                return_dict=True,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+            )
+            if synced_gpus and this_peer_finished:
+                cur_len = cur_len + 1
+                continue  # don't waste resources running the code we don't need
+            next_token_logits = outputs.logits[:, -1, :]
+            # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id`
+            # cannot be generated both before and after the `nn.functional.log_softmax` operation.
+            # next_token_logits = self.adjust_logits_during_generation(next_token_logits, cur_len=cur_len)
+            next_token_scores = nn.functional.log_softmax(
+                next_token_logits, dim=-1
+            )  # (batch_size * num_beams, vocab_size)
+            next_token_scores_processed = logits_processor(input_ids, next_token_scores)
+            # next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(next_token_scores)
+            next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(
+                next_token_scores_processed
+            )
+            # Store scores, attentions and hidden_states when required
+            if return_dict_in_generate:
+                if output_scores:
+                    scores += (next_token_scores_processed,)
+                if output_attentions:
+                    decoder_attentions += (
+                        (outputs.decoder_attentions,) if not self.config.is_encoder_decoder else (outputs.attentions,)
+                    )
+                    if self.config.is_encoder_decoder:
+                        cross_attentions += (outputs.cross_attentions,)
+                if output_hidden_states:
+                    decoder_hidden_states += (
+                        (outputs.decoder_hidden_states,)
+                        if self.config.is_encoder_decoder
+                        else (outputs.hidden_states,)
+                    )
+            # reshape for beam search
+            vocab_size = next_token_scores.shape[-1]
+            next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size)
+            # Sample 2 next tokens for each beam (so we have some spare tokens and match output of beam search)
+            next_token_scores, next_tokens = torch.topk(
+                next_token_scores, 2 * num_beams, dim=1, largest=True, sorted=True
+            )
+            next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
+            next_tokens = next_tokens % vocab_size
+            # stateless
+            beam_outputs = beam_scorer.process(
+                input_ids,
+                next_token_scores,
+                next_tokens,
+                next_indices,
+                pad_token_id=pad_token_id,
+                eos_token_id=eos_token_id,
+                beam_indices=beam_indices,
+            )
+            beam_scores = beam_outputs["next_beam_scores"]
+            beam_next_tokens = beam_outputs["next_beam_tokens"]
+            beam_idx = beam_outputs["next_beam_indices"]
+            input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+            )
+            if model_kwargs["past_key_values"] is not None:
+                model_kwargs["past_key_values"] = self._reorder_cache(model_kwargs["past_key_values"], beam_idx)
+            if return_dict_in_generate and output_scores:
+                beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices))))
+            # increase cur_len
+            cur_len = cur_len + 1
+            try:
+                if beam_scorer.is_done or stopping_criteria(input_ids, scores):
+                    if not synced_gpus:
+                        break
+                    else:
+                        this_peer_finished = True
+            except:
+                if beam_scorer.is_done or all(stopping_criteria(input_ids, scores)):
+                    if not synced_gpus:
+                        break
+                    else:
+                        this_peer_finished = True
+        sequence_outputs = beam_scorer.finalize(
+            input_ids,
+            beam_scores,
+            next_tokens,
+            next_indices,
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            max_length=stopping_criteria.max_length,
+            beam_indices=beam_indices,
+        )
+        if return_dict_in_generate:
+            if not output_scores:
+                sequence_outputs["sequence_scores"] = None
+            if self.config.is_encoder_decoder:
+                return BeamSearchEncoderDecoderOutput(
+                    sequences=sequence_outputs["sequences"],
+                    sequences_scores=sequence_outputs["sequence_scores"],
+                    scores=scores,
+                    beam_indices=sequence_outputs["beam_indices"],
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                )
+            else:
+                return BeamSearchDecoderOnlyOutput(
+                    sequences=sequence_outputs["sequences"],
+                    sequences_scores=sequence_outputs["sequence_scores"],
+                    scores=scores,
+                    beam_indices=sequence_outputs["beam_indices"],
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                )
+        else:
+            return sequence_outputs["sequences"]
+class CABlock(nn.Module):
+    '''
+        This function is an edited version of the gpt2 decoder block function from HuggingFace's transformers
+        https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
+        '''
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        hidden_size = config.hidden_size
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
+        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.crossattention = GPT2Attention(config, is_cross_attention=True, layer_idx=layer_idx)
+        self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.mlp = GPT2MLP(inner_dim, config)
+    def forward(
+        self,
+        hidden_states: Optional[Tuple[torch.FloatTensor]],
+        layer_past: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
+        residual = hidden_states
+        hidden_states = self.ln_cross_attn(hidden_states)
+        cross_attn_outputs = self.crossattention(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+        )
+        attn_output = cross_attn_outputs[0]
+        # residual connection
+        hidden_states = residual + attn_output
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+        return (hidden_states,)
+class Prot2TextTrainer(Seq2SeqTrainer):
+    '''
+    This function is an edited version of the Seq2SeqTrainer from HuggingFace's transformers
+    '''
+    def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader:
+        if self.args.world_size > 1:
+            eval_sampler = DistributedSampler(self.eval_dataset, num_replicas=self.args.world_size, rank=self.args.process_index)
+        else:
+            eval_sampler = None
+        return DataLoader(
+            self.eval_dataset,
+            batch_size=self.args.eval_batch_size,
+            collate_fn=None,
+            num_workers=self.args.dataloader_num_workers,
+            pin_memory=self.args.dataloader_pin_memory,
+            sampler=eval_sampler,
+        )
+    def get_train_dataloader(self) -> DataLoader:
+        if self.args.world_size > 1:
+            train_sampler = DistributedSampler(self.train_dataset, num_replicas=self.args.world_size, rank=self.args.process_index)
+        else:
+            train_sampler = None
+        return DataLoader(
+            self.train_dataset,
+            batch_size=self.args.per_device_train_batch_size,
+            collate_fn=None,
+            num_workers=self.args.dataloader_num_workers,
+            pin_memory=self.args.dataloader_pin_memory,
+            sampler=train_sampler,
+        )
+    def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[str, Union[torch.Tensor, Any]]:
+        """
+        Prepare `inputs` before feeding them to the model, converting them to tensors if they are not already and
+        handling potential state.
+        """
+        inputs = self._prepare_input(inputs)
+        if len(inputs) == 0:
+            raise ValueError(
+                "The batch received was empty, your model won't be able to train on it. Double-check that your "
+                f"training dataset contains keys expected by the model: {','.join(self._signature_columns)}."
+            )
+        if self.args.past_index >= 0 and self._past is not None:
+            inputs["mems"] = self._past
+        inputs = inputs.to_dict()
+        inputs['edge_type'] =  torch.cat([torch.tensor(inputs['edge_type'][i]) for i in range(len(inputs['edge_type']))], dim=0)
+        inputs['edge_type'] = torch.argmax(inputs['edge_type'], dim=1)
+        inputs = {k: v.to(device=self.args.device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
+        return inputs
+    def prediction_step(
+        self,
+        model: nn.Module,
+        inputs: Dict[str, Union[torch.Tensor, Any]],
+        prediction_loss_only: bool,
+        ignore_keys: Optional[List[str]] = None,
+        ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
+        """
+        Perform an evaluation step on `model` using `inputs`.
+        Subclass and override to inject custom behavior.
+        Args:
+            model (`nn.Module`):
+                The model to evaluate.
+            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
+                The inputs and targets of the model.
+                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
+                argument `labels`. Check your model's documentation for all accepted arguments.
+            prediction_loss_only (`bool`):
+                Whether or not to return the loss only.
+        Return:
+            Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
+            labels (each being optional).
+        """
+        if not self.args.predict_with_generate or prediction_loss_only:
+            return super().prediction_step(
+                model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
+            )
+        has_labels = "labels" in inputs
+        inputs = self._prepare_inputs(inputs)
+        # XXX: adapt synced_gpus for fairscale as well
+        gen_kwargs = self._gen_kwargs.copy()
+        if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None:
+            gen_kwargs["max_length"] = self.model.config.max_length
+        gen_kwargs["num_beams"] = (
+            gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams
+        )
+        default_synced_gpus = True if is_deepspeed_zero3_enabled() else False
+        gen_kwargs["synced_gpus"] = (
+            gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus
+        )
+        if "attention_mask" in inputs:
+            gen_kwargs["attention_mask"] = inputs.get("attention_mask", None)
+        if "global_attention_mask" in inputs:
+            gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None)
+        generation_inputs = None
+        gen_kwargs['x'] = inputs.get('x', None)
+        gen_kwargs['edge_index'] = inputs.get('edge_index', None)
+        gen_kwargs['edge_type'] = inputs.get('edge_type', None)
+        gen_kwargs['batch'] = inputs.get('batch', None)
+        gen_kwargs['encoder_input_ids'] = inputs.get('encoder_input_ids', None)
+        gen_kwargs['decoder_input_ids'] = inputs.get('decoder_input_ids', None)[:,0:1]
+        gen_kwargs["decoder_attention_mask"] = torch.ones(gen_kwargs['decoder_input_ids'].shape[0], 1).to(self.args.device)
+        generated_tokens = self.model.generate(
+            generation_inputs,
+            **gen_kwargs,
+        )
+        # in case the batch is shorter than max length, the output should be padded
+        if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]:
+            generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
+        elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < (
+            gen_kwargs["max_new_tokens"] + 1
+        ):
+            generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1)
+        with torch.no_grad():
+            if has_labels:
+                with self.compute_loss_context_manager():
+                    outputs = model(**inputs)
+                if self.label_smoother is not None:
+                    loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
+                else:
+                    loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
+            else:
+                loss = None
+        if self.args.prediction_loss_only:
+            return (loss, None, None)
+        if has_labels:
+            labels = inputs["labels"]
+            if gen_kwargs.get("max_length") is not None and labels.shape[-1] < gen_kwargs["max_length"]:
+                labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
+            elif gen_kwargs.get("max_new_tokens") is not None and labels.shape[-1] < (
+                gen_kwargs["max_new_tokens"] + 1
+            ):
+                labels = self._pad_tensors_to_max_len(labels, (gen_kwargs["max_new_tokens"] + 1))
+        else:
+            labels = None
+        return (loss, generated_tokens, labels)

utils_convert.py ADDED Viewed

	@@ -0,0 +1,82 @@

+import numpy as np
+from biopandas.pdb import PandasPdb
+# Column order of a PandasPdb dataframe; biopandas_mmcif2pdb reorders the converted frames to this list.
+pdb_order = [
+    "record_name",
+    "atom_number",
+    "blank_1",
+    "atom_name",
+    "alt_loc",
+    "residue_name",
+    "blank_2",
+    "chain_id",
+    "residue_number",
+    "insertion",
+    "blank_3",
+    "x_coord",
+    "y_coord",
+    "z_coord",
+    "occupancy",
+    "b_factor",
+    "blank_4",
+    "segment_id",
+    "element_symbol",
+    "charge",
+    "line_idx",
+]
+# Maps mmCIF column names (keys) to the PandasPdb column names they are renamed to (values).
+# Only the key columns are kept during conversion.
+mmcif_read = {
+    "group_PDB": "record_name",
+    "id": "atom_number",
+    "auth_atom_id": "atom_name",
+    "auth_comp_id": "residue_name",
+    "auth_asym_id": "chain_id",
+    "auth_seq_id": "residue_number",
+    "Cartn_x": "x_coord",
+    "Cartn_y": "y_coord",
+    "Cartn_z": "z_coord",
+    "occupancy": "occupancy",
+    "B_iso_or_equiv": "b_factor",
+    "type_symbol": "element_symbol",
+}
+# PandasPdb columns that are not produced by `mmcif_read`; the conversion creates them as empty strings
+# (`charge` is subsequently overwritten with NaN and `line_idx` with the frame index).
+nonefields = [
+    "blank_1",
+    "alt_loc",
+    "blank_2",
+    "insertion",
+    "blank_3",
+    "blank_4",
+    "segment_id",
+    "charge",
+    "line_idx",
+]
+def biopandas_mmcif2pdb(pandasmmcif, model_index = 1):
+    """
+    Converts the ATOM and HETATM dataframes of PandasMmcif() to PandasPdb() format.
+    """
+    pandaspdb = PandasPdb()
+    for a in ["ATOM", "HETATM"]:
+        dfa = pandasmmcif.df[a]
+        dfa = dfa.loc[dfa.pdbx_PDB_model_num == model_index]
+        if a =='ATOM':
+            if len(dfa) == 0:
+                raise ValueError(f"No model found for index: {model_index}")
+        # keep only those fields found in pdb
+        dfa = dfa[mmcif_read.keys()]
+        # rename fields
+        dfa = dfa.rename(columns=mmcif_read)
+        # add empty fields
+        for i in nonefields:
+            dfa[i] = ""
+        dfa["charge"] = np.nan
+        # reorder columns to PandasPdb order
+        dfa = dfa[pdb_order]
+        pandaspdb.df[a] = dfa
+    # update line_idx
+    pandaspdb.df["ATOM"]["line_idx"] = pandaspdb.df["ATOM"].index.values
+    pandaspdb.df["HETATM"]["line_idx"] = pandaspdb.df["HETATM"].index
+    return pandaspdb

utils_dataset.py ADDED Viewed

	@@ -0,0 +1,60 @@

+import numpy as np
+import csv
+def load_GO_annot(filename):
+    # Load GO annotations
+    onts = ['mf', 'bp', 'cc']
+    prot2annot = {}
+    goterms = {ont: [] for ont in onts}
+    gonames = {ont: [] for ont in onts}
+    with open(filename, mode='r') as tsvfile:
+        reader = csv.reader(tsvfile, delimiter='\t')
+        # molecular function
+        next(reader, None)  # skip the headers
+        goterms[onts[0]] = next(reader)
+        next(reader, None)  # skip the headers
+        gonames[onts[0]] = next(reader)
+        # biological process
+        next(reader, None)  # skip the headers
+        goterms[onts[1]] = next(reader)
+        next(reader, None)  # skip the headers
+        gonames[onts[1]] = next(reader)
+        # cellular component
+        next(reader, None)  # skip the headers
+        goterms[onts[2]] = next(reader)
+        next(reader, None)  # skip the headers
+        gonames[onts[2]] = next(reader)
+        next(reader, None)  # skip the headers
+        counts = {ont: np.zeros(len(goterms[ont]), dtype=float) for ont in onts}
+        for row in reader:
+            prot, prot_goterms = row[0], row[1:]
+            prot2annot[prot] = {ont: [] for ont in onts}
+            for i in range(3):
+                goterm_indices = [goterms[onts[i]].index(goterm) for goterm in prot_goterms[i].split(',') if goterm != '']
+                prot2annot[prot][onts[i]] = np.zeros(len(goterms[onts[i]]))
+                prot2annot[prot][onts[i]][goterm_indices] = 1.0
+                counts[onts[i]][goterm_indices] += 1.0
+    return prot2annot, goterms, gonames, counts
+def load_EC_annot(filename):
+    # Load EC annotations """
+    prot2annot = {}
+    with open(filename, mode='r') as tsvfile:
+        reader = csv.reader(tsvfile, delimiter='\t')
+        # molecular function
+        next(reader, None)  # skip the headers
+        ec_numbers = {'ec': next(reader)}
+        next(reader, None)  # skip the headers
+        counts = {'ec': np.zeros(len(ec_numbers['ec']), dtype=float)}
+        for row in reader:
+            prot, prot_ec_numbers = row[0], row[1]
+            ec_indices = [ec_numbers['ec'].index(ec_num) for ec_num in prot_ec_numbers.split(',')]
+            prot2annot[prot] = {'ec': np.zeros(len(ec_numbers['ec']), dtype=np.int64)}
+            prot2annot[prot]['ec'][ec_indices] = 1.0
+            counts['ec'][ec_indices] += 1