Spaces:
Sleeping
Sleeping
| import argparse | |
| import os | |
| import warnings | |
| from Bio.PDB import PDBList, PDBParser, PDBIO, Select # type: ignore | |
| from Bio.PDB.PDBExceptions import PDBConstructionWarning # type: ignore | |
# Real-world PDB files routinely trigger PDBConstructionWarning while being
# parsed; install a process-wide "ignore" filter for that category.
warnings.filterwarnings('ignore', category=PDBConstructionWarning)
# Residue filter applied when the cleaned structure is written out: residues
# named HOH/WAT (water) or NAG/MAN (sugars) are skipped, everything else is kept.
class ProteinSelect(Select):
    """Biopython ``Select`` subclass that rejects water and sugar residues."""

    def accept_residue(self, residue):
        """Return True unless the residue name is HOH, WAT, NAG or MAN."""
        excluded_names = ('HOH', 'WAT', 'NAG', 'MAN')  # water and sugars
        is_excluded = residue.get_resname() in excluded_names
        return not is_excluded
def download_and_clean_pdb(pdb_id, output_file=None, data_dir='data'):
    """
    Download a PDB entry and write a cleaned copy of it.

    The entry is fetched in legacy PDB format with ``PDBList``, parsed with
    ``PDBParser`` and re-saved through ``ProteinSelect`` so that the residues
    it rejects are left out. The downloaded ``.ent`` file is removed after a
    successful save. If ``output_file`` already exists and is non-empty it is
    returned as-is and nothing is downloaded.

    Args:
        pdb_id: PDB ID to download (e.g., '3KAS'). It is lowercased and
            surrounding whitespace is ignored.
        output_file: Output filename (default: {data_dir}/{pdb_id.lower()}.pdb)
        data_dir: Directory to store downloaded files (default: 'data')

    Returns:
        str: Path to the cleaned PDB file

    Raises:
        FileNotFoundError: If the PDB file cannot be downloaded or found
        Exception: For other errors during processing
    """
    pdb_id = pdb_id.strip().lower()  # ' 3KAS ' -> '3kas'
    if output_file is None:
        output_file = f'{data_dir}/{pdb_id}.pdb'

    # Create data directory if it doesn't exist
    os.makedirs(data_dir, exist_ok=True)

    # Check if final output file exists - if yes, return immediately
    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        print(f"📁 METHOD: Using existing output file - {output_file}")
        return output_file

    # A caller-supplied output path may live outside data_dir; make sure its
    # directory exists before anything is written there.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Download .ent file using PDBList
    print(f"📡 METHOD: Downloading from PDB - {pdb_id}")
    pdbl = PDBList()
    ent_file = pdbl.retrieve_pdb_file(pdb_id, pdir=data_dir, file_format='pdb')
    if not ent_file or not os.path.exists(ent_file):
        # Fall back to the conventional file name in case the returned path
        # is missing or unusable.
        expected_ent = f'{data_dir}/pdb{pdb_id}.ent'
        if not os.path.exists(expected_ent):
            raise FileNotFoundError(f"❌ Failed to download PDB file for {pdb_id}. The PDB ID may not exist.")
        ent_file = expected_ent

    # Parse the downloaded structure
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', PDBConstructionWarning)
        parser_obj = PDBParser(QUIET=True)
        structure = parser_obj.get_structure(pdb_id, ent_file)

    # Save the clean version. Write to a temporary sibling file first and move
    # it into place, so a failed save never leaves a truncated output file
    # that the cache check above would later accept as valid.
    io = PDBIO()
    io.set_structure(structure)
    tmp_file = f'{output_file}.tmp'
    try:
        io.save(tmp_file, ProteinSelect())
        os.replace(tmp_file, output_file)
    finally:
        # Only still present when the save or the move failed.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

    # Delete the .ent file
    if os.path.exists(ent_file):
        os.remove(ent_file)
    return output_file
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, normalise the PDB ID and
    # run the download/clean step.
    parser = argparse.ArgumentParser(description='Download and clean a PDB file')
    parser.add_argument('pdb_id', type=str, help='PDB ID to download (e.g., 3KAS)')
    parser.add_argument('--output', type=str, default=None, help='Output filename (default: {data_dir}/{pdb_id}.pdb, with the PDB ID lowercased)')
    parser.add_argument('--data-dir', type=str, default='data', help='Directory to store downloaded files (default: data)')
    args = parser.parse_args()

    # Clean PDB ID - remove common prefixes like "pdb_id="
    pdb_id = args.pdb_id.strip()
    if '=' in pdb_id:
        pdb_id = pdb_id.split('=')[-1].strip()
    pdb_id = pdb_id.upper()

    try:
        output_file = download_and_clean_pdb(pdb_id, args.output, args.data_dir)
        print(f"Target {pdb_id.lower()} is downloaded and cleaned as {output_file}!")
    except Exception as e:
        # Report which ID failed, then re-raise so the traceback and the
        # non-zero exit status are preserved.
        print(f"Error processing {pdb_id}: {e}")
        raise