wenkai commited on
Commit
3e891e6
1 Parent(s): 38f9971

Update esm_scripts/extract.py

Browse files
Files changed (1) hide show
  1. esm_scripts/extract.py +65 -0
esm_scripts/extract.py CHANGED
@@ -131,6 +131,71 @@ def run(args):
131
  )
132
 
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  def main():
135
  parser = create_parser()
136
  args = parser.parse_args()
 
131
  )
132
 
133
 
134
def run_demo(model_location, fasta_file, output_dir, include, nogpu,
             repr_layers=-1, truncation_seq_length=1022, toks_per_batch=4096):
    """Extract ESM embeddings for every sequence in a FASTA file.

    Parameters
    ----------
    model_location : str
        Name or path of a pretrained ESM model, resolved by
        ``pretrained.load_model_and_alphabet``.
    fasta_file : path-like
        FASTA file of input sequences.
    output_dir : pathlib.Path
        Directory created (with parents) if missing.  NOTE(review): nothing is
        written to it in this function; presumably callers expect per-sequence
        files here — confirm against the upstream extract.py.
    include : collection of str
        Which outputs to keep per sequence; any of ``"per_tok"``, ``"mean"``,
        ``"bos"``, ``"contacts"``.
    nogpu : bool
        If True, stay on CPU even when CUDA is available.
    repr_layers : int or iterable of int, default -1
        Layer indices to extract (negative indices count from the end).
        A bare int is accepted and treated as a single layer.
    truncation_seq_length : int, default 1022
        Sequences are truncated to this length by the batch converter.
    toks_per_batch : int, default 4096
        Token budget per batch for ``get_batch_indices``.

    Returns
    -------
    list of dict
        One dict per input sequence with key ``"label"`` plus whichever of
        ``"representations"``, ``"mean_representations"``,
        ``"bos_representations"``, ``"contacts"`` were requested via *include*.

    Raises
    ------
    ValueError
        If the loaded model is an MSA Transformer (unsupported here).
    """
    model, alphabet = pretrained.load_model_and_alphabet(model_location)
    model.eval()
    if isinstance(model, MSATransformer):
        raise ValueError(
            "This script currently does not handle models with MSA input (MSA Transformer)."
        )
    if torch.cuda.is_available() and not nogpu:
        model = model.cuda()
        print("Transferred model to GPU")

    dataset = FastaBatchedDataset.from_file(fasta_file)
    batches = dataset.get_batch_indices(toks_per_batch, extra_toks_per_seq=1)
    data_loader = torch.utils.data.DataLoader(
        dataset, collate_fn=alphabet.get_batch_converter(truncation_seq_length), batch_sampler=batches
    )
    print(f"Read {fasta_file} with {len(dataset)} sequences")

    output_dir.mkdir(parents=True, exist_ok=True)
    return_contacts = "contacts" in include

    # Bug fix: the default value -1 is a bare int, which made the
    # `all(... for i in repr_layers)` check below raise TypeError whenever the
    # default was used.  Normalize a single int into a one-element list.
    if isinstance(repr_layers, int):
        repr_layers = [repr_layers]
    assert all(-(model.num_layers + 1) <= i <= model.num_layers for i in repr_layers)
    # Map negative layer indices (Python-style, from the end) to absolute ones.
    repr_layers = [(i + model.num_layers + 1) % (model.num_layers + 1) for i in repr_layers]

    # Bug fix: the original built each per-sequence `result` dict and then
    # discarded it (no save, no return).  Accumulate and return them instead.
    results = []
    with torch.no_grad():
        for batch_idx, (labels, strs, toks) in enumerate(data_loader):
            print(
                f"Processing {batch_idx + 1} of {len(batches)} batches ({toks.size(0)} sequences)"
            )
            if torch.cuda.is_available() and not nogpu:
                toks = toks.to(device="cuda", non_blocking=True)

            out = model(toks, repr_layers=repr_layers, return_contacts=return_contacts)

            # Move requested layers back to CPU so results outlive GPU tensors.
            # (The original also pulled out["logits"] to CPU but never used it;
            # that dead copy is removed.)
            representations = {
                layer: t.to(device="cpu") for layer, t in out["representations"].items()
            }
            if return_contacts:
                contacts = out["contacts"].to(device="cpu")

            for i, label in enumerate(labels):
                result = {"label": label}
                truncate_len = min(truncation_seq_length, len(strs[i]))
                # Call clone on tensors to ensure tensors are not views into a larger representation
                # See https://github.com/pytorch/pytorch/issues/1995
                if "per_tok" in include:
                    # Token 0 is BOS; slice 1..truncate_len+1 covers the residues.
                    result["representations"] = {
                        layer: t[i, 1 : truncate_len + 1].clone()
                        for layer, t in representations.items()
                    }
                if "mean" in include:
                    result["mean_representations"] = {
                        layer: t[i, 1 : truncate_len + 1].mean(0).clone()
                        for layer, t in representations.items()
                    }
                if "bos" in include:
                    result["bos_representations"] = {
                        layer: t[i, 0].clone() for layer, t in representations.items()
                    }
                if return_contacts:
                    result["contacts"] = contacts[i, : truncate_len, : truncate_len].clone()
                results.append(result)
    return results
+
199
  def main():
200
  parser = create_parser()
201
  args = parser.parse_args()