# PLTNUM/scripts/apply_foldseek_to_pdb.py
# Uploaded by sagawa ("Upload 17 files"), commit 4321e7e (verified).
import os
import random
import argparse
import glob
import pandas as pd
import multiprocessing as mp
from foldseek_util import get_struc_seq
def parse_args():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace: Holds ``pdb_dir`` (str), ``num_processes`` (int)
        and ``output_dir`` (str), each falling back to its default when the
        corresponding flag is not given.
    """
    # One (flag, type, default, help) row per supported option.
    option_specs = (
        ("--pdb_dir", str, "./pdb_files", "Directory containing PDB files."),
        (
            "--num_processes",
            int,
            2,
            "Number of processes to use for multiprocessing. Default is 2.",
        ),
        ("--output_dir", str, "./data", "Output directory."),
    )

    cli = argparse.ArgumentParser()
    for flag, value_type, fallback, description in option_specs:
        cli.add_argument(
            flag, type=value_type, default=fallback, help=description
        )
    return cli.parse_args()
def get_foldseek_seq(pdb_path):
    """Run ``get_struc_seq`` on one PDB file and return its chain "A" entry.

    Args:
        pdb_path: Path of the PDB file passed on to ``get_struc_seq``.

    Returns:
        The value stored under ``"A"`` in the mapping returned by
        ``get_struc_seq``.
        NOTE(review): the script's main section unpacks this value into three
        items (aa, foldseek, aa_foldseek) -- confirm against foldseek_util.
    """
    foldseek_binary = "bin/foldseek"
    target_chain = "A"
    # A fresh random id is drawn for every call.
    # NOTE(review): presumably this keeps concurrent workers' intermediate
    # files apart -- verify against get_struc_seq.
    job_id = random.randint(0, 10000000)

    per_chain = get_struc_seq(
        foldseek_binary,
        pdb_path,
        [target_chain],
        process_id=job_id,
    )
    return per_chain[target_chain]
def main():
    """Run Foldseek over every ``*.pdb`` file in ``--pdb_dir`` and save a CSV.

    The files are processed in a ``multiprocessing`` pool of
    ``--num_processes`` workers, each calling ``get_foldseek_seq``. The
    results are written to ``<output_dir>/foldseek_result.csv`` with the
    columns ``file``, ``aa``, ``foldseek`` and ``aa_foldseek``.

    Raises:
        SystemExit: If ``--pdb_dir`` contains no ``*.pdb`` files.
    """
    config = parse_args()

    # glob returns paths in arbitrary order; sort so the CSV rows are
    # reproducible from run to run.
    pdb_files = sorted(glob.glob(os.path.join(config.pdb_dir, "*.pdb")))
    if not pdb_files:
        # Without this guard, zip(*[]) yields nothing and the three-way
        # unpacking below fails with an unhelpful ValueError.
        raise SystemExit(
            "No .pdb files found in {!r}.".format(config.pdb_dir)
        )

    with mp.Pool(config.num_processes) as pool:
        # Pool.map returns results in input order, so output[i] belongs to
        # pdb_files[i].
        output = pool.map(get_foldseek_seq, pdb_files)

    # Each per-file result is unpacked into three values; transpose the list
    # of results into one sequence per column.
    aa, foldseek, aa_foldseek = zip(*output)
    df = pd.DataFrame(
        {
            "file": pdb_files,
            "aa": aa,
            "foldseek": foldseek,
            "aa_foldseek": aa_foldseek,
        }
    )

    # Create the output directory if needed so the finished results are not
    # lost to a missing-directory error from to_csv.
    os.makedirs(config.output_dir, exist_ok=True)
    df.to_csv(
        os.path.join(config.output_dir, "foldseek_result.csv"), index=False
    )


if __name__ == "__main__":
    main()