File size: 1,403 Bytes
8918ac7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import argparse
import os
from utils import read_multi_fasta

def extract_uid(head):
    """Return the identifier stored in the second ``|``-separated header field.

    Surrounding whitespace is stripped so that the same identifier is used
    both for the "already downloaded" lookup and for the output file name.

    Args:
        head: A FASTA header line such as ``>sp|P12345|NAME``.

    Returns:
        The stripped second field of ``head``.

    Raises:
        ValueError: If ``head`` has no second ``|``-separated field.
    """
    fields = head.split("|")
    if len(fields) < 2:
        raise ValueError(f"Cannot extract a UID from FASTA header: {head!r}")
    return fields[1].strip()


def normalize_af_filenames(af_dir):
    """Rename every ``*model*`` file in ``af_dir`` to ``<uid>.pdb``.

    The UID is the second ``-``-separated field of the file name
    (presumably AlphaFold DB downloads named like
    ``AF-<uid>-F1-model_v4.pdb`` -- confirm against the download step).
    Names that contain "model" but have no second field are left untouched
    instead of aborting the whole run.

    Args:
        af_dir: Directory holding the downloaded structure files.
    """
    for name in os.listdir(af_dir):
        if "model" not in name:
            continue
        fields = name.split("-")
        if len(fields) < 2:
            # Not in the expected "<prefix>-<uid>-..." layout; nothing to do.
            continue
        os.rename(
            os.path.join(af_dir, name),
            os.path.join(af_dir, f"{fields[1]}.pdb"),
        )


def list_downloaded_uids(af_dir):
    """Return the set of file-name stems found in ``af_dir``.

    ``os.path.splitext`` removes the real extension (whatever its length)
    rather than blindly chopping four characters, and a set gives O(1)
    membership tests for the filtering step.

    Args:
        af_dir: Directory holding the downloaded structure files.

    Returns:
        A set with one stem per directory entry.
    """
    return {os.path.splitext(name)[0] for name in os.listdir(af_dir)}


def select_unfolded(seqs, downloaded_uids):
    """Keep only the sequences whose UID is not in ``downloaded_uids``.

    Args:
        seqs: Mapping of FASTA header to sequence.
        downloaded_uids: Container of UIDs that already have a structure.

    Returns:
        A dict with the same header/sequence pairs, in the same order,
        restricted to UIDs that are missing from ``downloaded_uids``.
    """
    return {
        head: seq
        for head, seq in seqs.items()
        if extract_uid(head) not in downloaded_uids
    }


def write_chunks(unfold_seqs, output_dir, chunk_size):
    """Write one ``<uid>.fasta`` per sequence into ``chunk_<k>`` folders.

    Sequence number ``i`` (0-based, in mapping order) goes to
    ``<output_dir>/chunk_<i // chunk_size>/``.

    Args:
        unfold_seqs: Mapping of FASTA header to sequence.
        output_dir: Root directory for the chunk folders.
        chunk_size: Maximum number of FASTA files per chunk folder.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    for idx, (head, seq) in enumerate(unfold_seqs.items()):
        chunk_dir = os.path.join(output_dir, f"chunk_{idx // chunk_size}")
        if idx % chunk_size == 0:
            # First entry of a new chunk: create its folder exactly once.
            os.makedirs(chunk_dir, exist_ok=True)
        fasta_path = os.path.join(chunk_dir, f"{extract_uid(head)}.fasta")
        with open(fasta_path, "w", encoding="utf-8") as handle:
            # NOTE(review): the header is written verbatim; this assumes
            # read_multi_fasta keeps the leading ">" -- confirm in utils.
            handle.write(f"{head}\n{seq}")


def main(argv=None):
    """Split the sequences that still lack a structure into FASTA chunks.

    Steps: normalise the structure file names in ``--af_dir``, collect the
    UIDs that are already present, filter the input FASTA down to the
    missing ones, and write them as single-sequence FASTA files grouped
    into ``chunk_<k>`` folders under ``--output_dir``.

    Args:
        argv: Optional argument list; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    # The three paths are mandatory: with the old ``default=None`` a missing
    # ``-d`` silently listed the current directory and later calls failed
    # with an opaque TypeError.
    parser.add_argument("-f", "--fasta_file", type=str, required=True)
    parser.add_argument("-d", "--af_dir", type=str, required=True)
    parser.add_argument("-o", "--output_dir", type=str, required=True)
    parser.add_argument("-c", "--chunk_size", type=int, default=50)
    args = parser.parse_args(argv)
    if args.chunk_size <= 0:
        # Reject before touching the filesystem (no renames on bad input).
        parser.error("--chunk_size must be a positive integer")

    normalize_af_filenames(args.af_dir)
    downloaded_uids = list_downloaded_uids(args.af_dir)

    seqs = read_multi_fasta(args.fasta_file)
    unfold_seqs = select_unfolded(seqs, downloaded_uids)
    total_seqs = len(unfold_seqs)
    print(f"Total unfold {total_seqs} sequences")

    write_chunks(unfold_seqs, args.output_dir, args.chunk_size)
    print("Done")


if __name__ == '__main__':
    main()