luost26's picture
Update
753e275
raw
history blame
4.02 kB
import os
import re
import json
import shelve
from Bio import PDB
from typing import Optional, Tuple, List
from dataclasses import dataclass, field
@dataclass
class EvalTask:
    """One evaluation job: a generated structure file paired with its reference.

    Besides the two file paths, the record carries the metadata used for
    reporting and a ``scores`` mapping that starts out empty for every
    instance.
    """

    in_path: str    # path of the generated structure file; also the database key
    ref_path: str   # path of the reference structure file
    info: dict      # metadata dictionary associated with this task
    structure: str  # structure label included in the report
    name: str
    method: str     # method label included in the report
    cdr: str        # CDR label included in the report
    ab_chains: List
    residue_first: Optional[Tuple] = None
    residue_last: Optional[Tuple] = None
    scores: dict = field(default_factory=dict)

    @staticmethod
    def _first_model(pdb_path):
        """Parse ``pdb_path`` with a quiet Bio.PDB parser and return model 0."""
        quiet_parser = PDB.PDBParser(QUIET=True)
        # The path doubles as the structure identifier.
        parsed = quiet_parser.get_structure(pdb_path, pdb_path)
        return parsed[0]

    def get_gen_biopython_model(self):
        """Return the first model parsed from ``in_path``."""
        return self._first_model(self.in_path)

    def get_ref_biopython_model(self):
        """Return the first model parsed from ``ref_path``."""
        return self._first_model(self.ref_path)

    def save_to_db(self, db: shelve.Shelf):
        """Store this task in ``db`` under its ``in_path`` key."""
        key = self.in_path
        db[key] = self

    def to_report_dict(self):
        """Return a flat dict of identifying fields merged with ``scores``.

        Entries in ``scores`` are applied last, so a score whose key collides
        with one of the fixed keys replaces that value.
        """
        report = {}
        report['method'] = self.method
        report['structure'] = self.structure
        report['cdr'] = self.cdr
        report['filename'] = os.path.basename(self.in_path)
        report.update(self.scores)
        return report
class TaskScanner:
    """Walk a directory tree and create an ``EvalTask`` for each new input file.

    Input files are named ``<digits>.pdb`` (or ``<digits>_<postfix>.pdb`` when
    a postfix is set) and must sit next to a reference file ``REF1.pdb``
    (``REF1_<postfix>.pdb``). Metadata is read from ``metadata.json`` in the
    parent of the directory holding the input file.
    """

    def __init__(self, root, postfix=None, db: Optional[shelve.Shelf]=None):
        """Create a scanner rooted at ``root``.

        Args:
            root: Directory passed to ``os.walk`` on every ``scan`` call.
            postfix: Optional literal filename suffix; falsy means no suffix.
            db: Optional mapping whose keys are treated as already-visited
                input paths, so those files are not returned again.
        """
        super().__init__()
        self.root = root
        self.postfix = postfix
        self.db = db
        self.visited = set(db.keys()) if db is not None else set()

    @staticmethod
    def _name_rules(postfix):
        """Return ``(compiled input-name pattern, reference filename)``.

        The postfix is escaped so it is matched literally, which is how it is
        used in the reference filename. The pattern is meant for
        ``fullmatch``, so it carries no ``^``/``$`` anchors.
        """
        if not postfix:
            return re.compile(r'\d+\.pdb'), 'REF1.pdb'
        literal = re.escape(str(postfix))
        return re.compile(rf'\d+_{literal}\.pdb'), f'REF1_{postfix}.pdb'

    def _get_metadata(self, fpath):
        """Load the metadata entry that describes ``fpath``.

        The directory containing ``fpath`` gives the tag, its parent holds
        ``metadata.json``, and the directory above that gives the method name.

        Returns:
            The matching item dict, extended with ``antibody_chains``,
            ``structure`` and ``method``; or ``None`` when the JSON file is
            missing or malformed, or when no item carries the tag.

        Raises:
            KeyError: If the metadata lacks ``items`` or ``identifier``.
        """
        tag_dir = os.path.dirname(fpath)
        structure_dir = os.path.dirname(tag_dir)
        json_path = os.path.join(structure_dir, 'metadata.json')
        tag_name = os.path.basename(tag_dir)
        method_name = os.path.basename(os.path.dirname(structure_dir))

        try:
            with open(json_path, 'r') as f:
                metadata = json.load(f)
        except (json.JSONDecodeError, FileNotFoundError):
            return None

        info = None
        # A dict is used as an ordered set so the chain list is deterministic
        # (first-seen order) instead of depending on hash ordering.
        chains = {}
        for item in metadata['items']:
            if item.get('tag') == tag_name:
                info = item  # the last item with a matching tag wins
            # ``residue_first`` is optional; items without it contribute no chain.
            first = item.get('residue_first')
            if first:
                chains.setdefault(first[0], None)

        if info is None:
            return None
        info['antibody_chains'] = list(chains)
        info['structure'] = metadata['identifier']
        info['method'] = method_name
        return info

    def scan(self) -> List["EvalTask"]:
        """Return tasks for input files not seen before and mark them visited.

        A file is skipped when its name does not match the input pattern, it
        is empty or cannot be stat'ed, it was already visited, its sibling
        reference file is missing, or no metadata entry can be loaded for it.
        """
        input_pattern, ref_fname = self._name_rules(self.postfix)
        tasks = []
        for parent, _, files in os.walk(self.root):
            # The reference structure lives beside the inputs.
            ref_path = os.path.join(parent, ref_fname)
            for fname in files:
                if input_pattern.fullmatch(fname) is None:
                    continue
                fpath = os.path.join(parent, fname)
                try:
                    if os.path.getsize(fpath) == 0:
                        continue
                except OSError:
                    # Dangling symlink or file removed during the walk.
                    continue
                if fpath in self.visited:
                    continue
                if not os.path.exists(ref_path):
                    continue

                # CDR information
                info = self._get_metadata(fpath)
                if info is None:
                    continue

                tasks.append(EvalTask(
                    in_path=fpath,
                    ref_path=ref_path,
                    info=info,
                    structure=info['structure'],
                    name=info['name'],
                    method=info['method'],
                    cdr=info['tag'],
                    ab_chains=info['antibody_chains'],
                    residue_first=info.get('residue_first', None),
                    residue_last=info.get('residue_last', None),
                ))
                self.visited.add(fpath)
        return tasks