|
import os
|
|
import random
|
|
import argparse
|
|
import glob
|
|
import pandas as pd
|
|
import multiprocessing as mp
|
|
from foldseek_util import get_struc_seq
|
|
|
|
|
|
def parse_args(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) ``argparse`` reads ``sys.argv[1:]``, which is the
            behaviour of the original zero-argument call.

    Returns:
        argparse.Namespace: Namespace with the attributes ``pdb_dir`` (str),
        ``num_processes`` (int) and ``output_dir`` (str).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pdb_dir",
        type=str,
        default="./pdb_files",
        help="Directory containing PDB files.",
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=2,
        help="Number of processes to use for multiprocessing. Default is 2.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="./data",
        help="Output directory.",
    )
    # Forwarding argv lets callers (and tests) supply arguments explicitly
    # without touching the process-wide sys.argv.
    return parser.parse_args(argv)
|
|
|
|
|
|
def get_foldseek_seq(pdb_path, chain="A", foldseek_bin="bin/foldseek"):
    """Return the ``get_struc_seq`` result for one chain of a PDB file.

    Args:
        pdb_path: Path of the PDB file handed to ``get_struc_seq``.
        chain: Chain identifier to request and to look up in the result.
            Defaults to ``"A"``, the value that used to be hard-coded.
        foldseek_bin: Value passed as the first argument of
            ``get_struc_seq``. Defaults to ``"bin/foldseek"``, the value
            that used to be hard-coded.

    Returns:
        The entry stored under ``chain`` in the mapping returned by
        ``get_struc_seq``.

    Raises:
        KeyError: If the result of ``get_struc_seq`` has no entry for
            ``chain``; the message names the chain and the file.
    """
    seqs_by_chain = get_struc_seq(
        foldseek_bin,
        pdb_path,
        [chain],
        # A fresh random id per call, exactly as before.
        # NOTE(review): presumably this keeps concurrent workers' scratch
        # data apart inside get_struc_seq -- confirm against foldseek_util.
        process_id=random.randint(0, 10000000),
    )
    # Only the lookup is guarded so a KeyError raised inside get_struc_seq
    # itself is not mislabelled as a missing chain.
    try:
        return seqs_by_chain[chain]
    except KeyError:
        raise KeyError(
            f"chain {chain!r} not found in get_struc_seq result for {pdb_path}"
        ) from None
|
|
|
|
|
|
def main():
    """Run ``get_foldseek_seq`` over every ``*.pdb`` file and write a CSV.

    Reads the options via ``parse_args()``, collects the ``*.pdb`` files
    directly inside ``pdb_dir``, maps ``get_foldseek_seq`` over them with a
    ``multiprocessing`` pool of ``num_processes`` workers, and writes
    ``foldseek_result.csv`` (columns ``file``, ``aa``, ``foldseek``,
    ``aa_foldseek``) into ``output_dir``.
    """
    config = parse_args()

    # Create the output directory before the expensive work so a missing or
    # unwritable location fails immediately instead of after processing.
    os.makedirs(config.output_dir, exist_ok=True)

    # glob returns files in arbitrary order; sorting keeps the CSV rows
    # reproducible between runs.
    pdb_files = sorted(glob.glob(os.path.join(config.pdb_dir, "*.pdb")))
    if not pdb_files:
        print(f"No .pdb files found in {config.pdb_dir}")

    with mp.Pool(config.num_processes) as pool:
        output = pool.map(get_foldseek_seq, pdb_files)

    # zip(*[]) yields nothing, which would make the three-way unpacking
    # raise ValueError; fall back to empty columns (header-only CSV).
    if output:
        aa, foldseek, aa_foldseek = zip(*output)
    else:
        aa, foldseek, aa_foldseek = (), (), ()

    df = pd.DataFrame(
        {
            "file": pdb_files,
            "aa": aa,
            "foldseek": foldseek,
            "aa_foldseek": aa_foldseek,
        }
    )
    df.to_csv(os.path.join(config.output_dir, "foldseek_result.csv"), index=False)


if __name__ == "__main__":
    main()
|
|
|