Spaces:

42Cummer
/

BroteinShake

Sleeping

App Files Files Community

BroteinShake / scripts /download.py

42Cummer

Upload 129 files

32c275c verified about 2 months ago

raw

history blame contribute delete

3.43 kB

	import argparse
	import os
	import warnings
	from Bio.PDB import PDBList, PDBParser, PDBIO, Select # type: ignore
	from Bio.PDB.PDBExceptions import PDBConstructionWarning # type: ignore

	# Suppress PDB construction warnings. This runs at import time and installs an
	# 'ignore' filter for PDBConstructionWarning in the process-wide warnings
	# filter list, so it also affects any code that imports this module.
	warnings.simplefilter('ignore', PDBConstructionWarning)

	# Residue filter used when writing the cleaned structure.
	class ProteinSelect(Select):
	    """Selection that drops water and two common sugar residues on save.

	    Only the residue names listed in ``_DROPPED_RESNAMES`` are rejected;
	    every other residue (including any other hetero residue) is kept.
	    """

	    # HOH / WAT are water names; NAG / MAN are sugars.
	    _DROPPED_RESNAMES = ('HOH', 'WAT', 'NAG', 'MAN')

	    def accept_residue(self, residue):
	        """Return True when ``residue`` should be written to the output file."""
	        resname = residue.get_resname()
	        is_dropped = resname in self._DROPPED_RESNAMES
	        return not is_dropped

	def download_and_clean_pdb(pdb_id, output_file=None, data_dir='data'):
	    """
	    Download a PDB entry and write a cleaned copy of it.

	    The cleaned copy is the downloaded structure saved through
	    ``ProteinSelect``. If the output file already exists and is non-empty it
	    is reused and nothing is downloaded.

	    Args:
	        pdb_id: PDB ID to download (e.g., '3KAS'). Case-insensitive;
	            surrounding whitespace is ignored.
	        output_file: Output filename
	            (default: {data_dir}/{pdb_id.lower()}.pdb)
	        data_dir: Directory to store downloaded files (default: 'data')

	    Returns:
	        str: Path to the cleaned PDB file

	    Raises:
	        FileNotFoundError: If the PDB file cannot be downloaded or found
	        Exception: For other errors during processing
	    """
	    # Normalise the ID: '  3KAS ' -> '3kas'
	    pdb_id = pdb_id.strip().lower()
	    if output_file is None:
	        output_file = f'{data_dir}/{pdb_id}.pdb'

	    # Create data directory if it doesn't exist
	    os.makedirs(data_dir, exist_ok=True)

	    # Check if final output file exists - if yes, return immediately
	    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
	        print(f"📁 METHOD: Using existing output file - {output_file}")
	        return output_file

	    # A custom output path may live outside data_dir; make sure its
	    # directory exists before anything is written there.
	    output_dir = os.path.dirname(output_file)
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)

	    # Download .ent file using PDBList
	    print(f"📡 METHOD: Downloading from PDB - {pdb_id}")
	    pdbl = PDBList()
	    ent_file = pdbl.retrieve_pdb_file(pdb_id, pdir=data_dir, file_format='pdb')

	    if not ent_file or not os.path.exists(ent_file):
	        # Fall back to the conventional file name in case the returned path
	        # is missing or does not point at an existing file.
	        expected_ent = f'{data_dir}/pdb{pdb_id}.ent'
	        if not os.path.exists(expected_ent):
	            raise FileNotFoundError(f"❌ Failed to download PDB file for {pdb_id}. The PDB ID may not exist.")
	        ent_file = expected_ent

	    # Parse and save the clean version. The structure is written to a
	    # temporary sibling file and only moved into place once saving has
	    # succeeded; otherwise a failed run could leave a truncated, non-empty
	    # output file that the existence check above would later reuse.
	    tmp_output = f'{output_file}.tmp'
	    try:
	        with warnings.catch_warnings():
	            warnings.simplefilter('ignore', PDBConstructionWarning)
	            parser_obj = PDBParser(QUIET=True)
	            structure = parser_obj.get_structure(pdb_id, ent_file)
	            io = PDBIO()
	            io.set_structure(structure)
	            io.save(tmp_output, ProteinSelect())
	        os.replace(tmp_output, output_file)
	    finally:
	        # Delete the .ent file (also on failure, so a corrupt download is
	        # not picked up again on the next call) and any leftover temp file.
	        for leftover in (tmp_output, ent_file):
	            if os.path.exists(leftover):
	                os.remove(leftover)

	    return output_file

	if __name__ == "__main__":
	    # Command-line entry point: parse arguments, normalise the PDB ID and
	    # run the download/clean step, reporting the outcome on stdout.
	    parser = argparse.ArgumentParser(description='Download and clean a PDB file')
	    parser.add_argument('pdb_id', type=str, help='PDB ID to download (e.g., 3KAS)')
	    # The help text states the default that download_and_clean_pdb really uses.
	    parser.add_argument('--output', type=str, default=None, help='Output filename (default: {data_dir}/{pdb_id}.pdb, with the ID lowercased)')
	    parser.add_argument('--data-dir', type=str, default='data', help='Directory to store downloaded files (default: data)')

	    args = parser.parse_args()

	    # Clean PDB ID - remove common prefixes like "pdb_id="
	    pdb_id = args.pdb_id.strip()
	    if '=' in pdb_id:
	        # Keep only the text after the last '='.
	        pdb_id = pdb_id.split('=')[-1].strip()
	    pdb_id = pdb_id.upper()

	    try:
	        output_file = download_and_clean_pdb(pdb_id, args.output, args.data_dir)
	        print(f"Target {pdb_id.lower()} is downloaded and cleaned as {output_file}!")
	    except Exception as e:
	        # Print a short summary, then re-raise so the traceback and a
	        # non-zero exit status are preserved.
	        print(f"Error processing {pdb_id}: {e}")
	        raise