# ASCARIS: code/add_structure.py
import ast
import json
import math
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import unipressed
from unipressed import IdMappingClient
from Bio.PDB import *
from utils import *
from add_sasa import *  # provides run_freesasa() and sasa()

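# UniProt feature columns tracked by the pipeline: the first 30 entries hold
# per-feature annotation positions (later overwritten with distances), and the
# matching '...Binary' entries flag presence/absence of each feature.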
UNIPROT_ANNOTATION_COLS = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
'activeSite',
'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
'crosslink', 'mutagenesis', 'strand',
'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain',
'caBinding', 'bindingSite', 'region',
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
'coiledCoil', 'peptide',
'transitPeptide', 'glycosylation', 'propeptide', 'disulfideBinary',
'intMetBinary', 'intramembraneBinary',
'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
'glycosylationBinary', 'propeptideBinary']
def get_pdb_ids(protein_id):
    """Map a UniProt accession to its PDB IDs via the UniProt ID-mapping service."""
try:
request = IdMappingClient.submit(
source="UniProtKB_AC-ID", dest="PDB", ids={protein_id})
pdb_list = list(request.each_result())
return [i['to'] for i in pdb_list]
except requests.exceptions.HTTPError:
return []
except unipressed.id_mapping.core.IdMappingError:
print('IdMappingError caused by UniProt API service, please try later.')
return []
except KeyError:
return []
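# Illustrative call (the accession is an example, not part of this module):
#   get_pdb_ids('P04637')  # -> a list of PDB IDs such as ['1A1U', '1AIE', ...]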
def fix_filename(filename):
    """Rename a Biopython download (e.g. 'pdb1abc.ent') to '<id>.pdb' and return the new path."""
    try:
        filename = Path(filename)
        if filename.suffix == '.pdb':
            return filename
        if filename.stem.startswith("pdb") or filename.stem.endswith("ent"):
            # Strip Biopython's 'pdb' prefix before switching the extension.
            fixed = filename.with_name(filename.stem[3:]).with_suffix('.pdb')
        else:
            fixed = filename.with_suffix('.pdb')
        filename.rename(fixed)
        return fixed
    except FileNotFoundError:
        return None
def fetch_uniprot_ids(pdb_code):
    """Return the UniProt accessions mapped to a PDB entry via the PDBe SIFTS API."""
    response = requests.get(f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_code}")
    response.raise_for_status()
    resp = response.json()
    # Response shape: {pdb_code: {'UniProt': {accession: {...}, ...}}}
    return list(list(list(resp.values())[0].values())[0].keys())
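# Illustrative call (the PDB code is an example): fetch_uniprot_ids('1a1u')
# returns the accessions SIFTS maps to that entry, e.g. ['P04637'].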
def addPDBinfo(data, path_to_output_files):
    """Collect PDB entries (ID, chain, resolution, DBREF range) for each query UniProt accession."""
    pdb_info = pd.DataFrame(columns=['uniprotID', 'pdbID', 'chain', 'resolution'])
    print('Retrieving PDB structures...\n')
up_list = data.uniprotID.to_list()
pdbs = [get_pdb_ids(i) for i in up_list]
    # Flatten the per-protein lists, drop empty hits, and deduplicate.
    pdbs = sorted({i.lower() for sublist in pdbs for i in sublist if i})
    if not pdbs:
        print('No PDB structure found for the query.')
print('\n>>Starting PDB structures download...\n')
print('\n>>Processing PDB structures...\n')
    index = 0
for search in pdbs:
print(f'Searching for {search.upper()}')
try:
pdb_url = f"https://files.rcsb.org/download/{search}.pdb"
response = requests.get(pdb_url)
response.raise_for_status() # Check for a successful response
pdb_data = response.text
pdb_parser = PDBParser(QUIET=True) # QUIET=True suppresses warnings
pdb_file_content = StringIO(pdb_data)
structure = pdb_parser.get_structure(search, pdb_file_content)
            # Keep only DBREF records and split them into whitespace-delimited fields:
            # DBREF <pdbID> <chain> <seqBegin> <seqEnd> UNP <accession> <dbIdCode> <dbSeqBegin> <dbSeqEnd>
            dbref_records = [list(filter(None, line.split(' ')))
                             for line in pdb_data.split('\n') if line.startswith('DBREF')]
            header = structure.header
            for unp in dbref_records:
                if len(unp) >= 10 and unp[5] == 'UNP' and unp[6].split('-')[0] in up_list:
                    pdb_info.at[index, 'uniprotID'] = unp[6].split('-')[0]
                    pdb_info.at[index, 'pdbID'] = unp[1].upper()
                    pdb_info.at[index, 'chain'] = unp[2].upper()
                    pdb_info.at[index, 'resolution'] = header.get('resolution', 'N/A')
                    pdb_info.at[index, 'start'] = unp[8]
                    pdb_info.at[index, 'end'] = unp[9]
                    index += 1
        except Exception:
            # Skip entries whose PDB file cannot be fetched or parsed.
            continue
    pdb_info.replace({'None': np.nan}, inplace=True)
print('PDB file processing finished..')
return pdb_info
def downloadPDB(pdbID, path_to_output_files):
    """Download a PDB structure (unless cached) and compute its FreeSASA output; return the file path."""
    pdbl = PDBList()
    pdb_dir = Path(path_to_output_files) / 'pdb_structures'
    existing_pdb = [p.stem.lower() for p in pdb_dir.glob("*")]
    if pdbID not in existing_pdb:
        downloaded = pdbl.retrieve_pdb_file(pdbID, pdir=pdb_dir, file_format="pdb")
        fix_filename(downloaded)
    else:
        print(f'PDB file for {pdbID.upper()} exists..')
    file = pdb_dir / f'{pdbID}.pdb'
    sasa_dir = Path(path_to_output_files) / 'freesasa_files'
    existing_free_sasa = [p.stem for p in sasa_dir.glob("*")]
    if pdbID not in existing_free_sasa:
        run_freesasa(file, sasa_dir / f'{pdbID}.txt', include_hetatms=True,
                     outdir=None, force_rerun=False, file_type='pdb')
    return file
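# Usage sketch (hedged): the PDB code and output directory are illustrative.
#   pdb_file = downloadPDB('1abc', Path('out'))
# The returned path points at out/pdb_structures/1abc.pdb, with the matching
# FreeSASA result written to out/freesasa_files/1abc.txt.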
def processFile(data, path_to_output_files):
    """Download each variant's PDB file, run FreeSASA, and record CA coordinates per row."""
for i in data.index:
protein = data.at[i,'uniprotID']
pdbID = data.at[i,'pdbID'].lower()
chain = data.at[i,'chain']
pos = int(data.at[i, 'pos'])
wt = data.at[i, 'wt']
url = f'https://files.rcsb.org/download/{pdbID}.pdb'
response = requests.get(url)
if response.status_code == 200:
with open(f'{path_to_output_files}/pdb_structures/{pdbID}.pdb', 'w') as f:
f.write(response.text)
print(f"Downloaded {pdbID}.pdb successfully.")
else:
print(f"Failed to download {pdbID}.pdb. Status code: {response.status_code}")
file = Path(path_to_output_files / 'pdb_structures' / f'{pdbID}.pdb')
run_freesasa(file, Path(path_to_output_files / 'freesasa_files' / f'{pdbID}.txt'), include_hetatms=True,
outdir=None, force_rerun=False, file_type='pdb')
filename = Path(path_to_output_files / 'freesasa_files' / f'{pdbID}.txt')
        data.loc[i, 'sasa'] = sasa(protein, pos, wt, 1, filename, path_to_output_files, file_type='pdb')
newCol = {}
with open(file, encoding="utf8") as f:
for line in f.readlines():
                # Fixed-column ATOM records: keep CA atoms on the requested chain
                # (or with a blank chain identifier) and store their coordinates.
                if (line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA'
                        and (line[21].upper() == chain.upper() or line[21] == ' ')):
                    coords = [line[31:38].strip(), line[39:46].strip(), line[47:54].strip()]
                    resnums_for_sasa = line[22:26].strip()
                    newCol[resnums_for_sasa] = coords
data.at[i, 'coordinates'] = json.dumps(newCol)
return data
def distance(x1, y1, z1, x2, y2, z2):
    """Euclidean distance between two points in 3D space."""
    return math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2 + (z2 - z1) ** 2)
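# Worked example: distance(0, 0, 0, 1, 2, 2) == 3.0, since sqrt(1 + 4 + 4) = 3.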
def find_distance(coordMut, coordAnnot):
    """Distance between two coordinate triples, formatted to two decimals."""
    # NaN compares unequal to everything (np.nan != np.nan is True), so test the
    # type instead of comparing against np.nan.
    if isinstance(coordMut, (list, tuple)):
        try:
            dist = distance(float(coordMut[0]), float(coordMut[1]), float(coordMut[2]),
                            float(coordAnnot[0]), float(coordAnnot[1]), float(coordAnnot[2]))
            return "%.2f" % dist
        except (ValueError, TypeError, IndexError):
            return 'nan'
    return np.nan
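# Coordinates arrive as strings parsed from fixed-column ATOM records, e.g.
#   find_distance(['0', '0', '0'], ['1', '2', '2'])  # -> '3.00'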
def domainDistance(domStart, domEnd, coordinates, mutationPosition, matchList, posOnPDB):
    """Minimum distance between the mutated residue and any residue of the domain."""
    domainDistanceList = []
    match_dict = ast.literal_eval(matchList)
    for i in range(int(domStart), int(domEnd)):
        try:
            domainPos = match_dict[str(i)]
            d = find_distance(coordinates[str(posOnPDB)], coordinates[str(domainPos)])
            if str(d) != 'nan':
                # Compare numerically; min() over the formatted strings would sort
                # lexicographically (e.g. '10.00' < '9.00').
                domainDistanceList.append(float(d))
        except KeyError:
            continue  # residue not resolved in the structure
    return min(domainDistanceList) if domainDistanceList else np.nan
def match3D(data):
    """Replace annotation-position lists with the minimum distance to the mutated residue."""
    for i in data.index:
        coordinates = ast.literal_eval(data.at[i, 'coordinates'])
        pos = str(data.at[i, 'pos'])
        matchList = data.at[i, 'MATCHDICT']
        try:
            posOnPDB = ast.literal_eval(matchList)[pos]
            coordMut = coordinates[str(posOnPDB)]
            if data.at[i, 'distance'] == -1000:
                domStart = data.at[i, 'domStart']
                domEnd = data.at[i, 'domEnd']
                data.at[i, 'distance'] = domainDistance(domStart, domEnd, coordinates, pos, matchList, posOnPDB)
        except KeyError:
            coordMut = np.nan
            data.at[i, 'distance'] = np.nan
        for col in UNIPROT_ANNOTATION_COLS[0:30]:
            allDist = []
            # 'x != np.nan' is always True; use pd.notna for the missing-value test.
            if pd.notna(data.at[i, col]) and data.at[i, col] not in ('hit', '[]', []):
                annotation_list = ast.literal_eval(data.at[i, col])
                integer_list = [int(element) for element in annotation_list if element != 'null']
                for annotPosition in integer_list:
                    coordAnnot = coordinates.get(str(annotPosition))
                    d = find_distance(coordMut, coordAnnot)
                    if str(d) != 'nan':
                        allDist.append(float(d))
                if len(allDist) > 0:
                    data.at[i, col] = min(allDist)
    return data
def domainDistanceModels(domStart, domEnd, coordinates, mutationPosition):
    """Minimum distance between the mutated residue and any domain residue (model structures)."""
    domainDistanceList = []
    for i in range(int(domStart), int(domEnd)):
        try:
            d = find_distance(coordinates[mutationPosition], coordinates[i])
            if str(d) != 'nan':
                domainDistanceList.append(float(d))
        except KeyError:
            continue  # residue not present in the model
    return min(domainDistanceList) if domainDistanceList else np.nan
def match3DModels(data):
    """Model-structure variant of match3D; coordinate dictionaries are keyed by integer position."""
    for i in data.index:
        pos = int(data.at[i, 'pos'])
        coords = data.at[i, 'coordinates']
        coordinates = coords if isinstance(coords, dict) else ast.literal_eval(coords)
        coordMut = coordinates[pos]
        if data.at[i, 'distance'] == -1000:
            domStart = data.at[i, 'domStart']
            domEnd = data.at[i, 'domEnd']
            data.at[i, 'distance'] = domainDistanceModels(domStart, domEnd, coordinates, pos)
        for col in UNIPROT_ANNOTATION_COLS[0:30]:
            allDist = []
            # 'x != np.nan' is always True; use pd.notna for the missing-value test.
            if pd.notna(data.at[i, col]) and data.at[i, col] not in ('hit', '[]', []):
                annotation_list = ast.literal_eval(data.at[i, col])
                integer_list = [int(element) for element in annotation_list]
                for annotPosition in integer_list:
                    coordAnnot = coordinates.get(annotPosition)
                    d = find_distance(coordMut, coordAnnot)
                    if str(d) != 'nan':
                        allDist.append(float(d))
                if len(allDist) > 0:
                    data.at[i, col] = min(allDist)
    return data
def selectMaxAnnot(data):
    """Sum per-annotation distances (annotTotal) and count 'hit' annotations (hitTotal) per row."""
    if len(data) > 0:
        for i in data.index:
            total = 0
            nanCounter = 0
            hitCounter = 0
            for col in UNIPROT_ANNOTATION_COLS[0:30]:
                val = data.at[i, col]
                if str(val) != 'nan' and val not in ('[]', 'hit', ''):
                    total += float(val)
                elif str(val) == 'nan' or val in ('[]', ''):
                    nanCounter += 1
                if val == 'hit':
                    hitCounter += 1
            data.at[i, 'hitTotal'] = hitCounter if hitCounter > 0 else np.nan
            data.at[i, 'annotTotal'] = total if nanCounter != 30 else np.nan
    else:
        data['annotTotal'] = np.nan
    return data
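
# Minimal usage sketch (hedged): the accession, variant, and output directory are
# illustrative only; a fuller run would join the variant table with this mapping
# before calling processFile(), match3D(), and selectMaxAnnot().
if __name__ == '__main__':
    out = Path('out')
    (out / 'pdb_structures').mkdir(parents=True, exist_ok=True)
    (out / 'freesasa_files').mkdir(parents=True, exist_ok=True)
    variants = pd.DataFrame([{'uniprotID': 'P04637', 'pos': 175, 'wt': 'R'}])
    pdb_info = addPDBinfo(variants, out)  # UniProt -> PDB mapping with DBREF ranges
    print(pdb_info.head())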