import os
import sys
import multiprocessing
from multiprocessing import Pool

import numpy as np
from moleculekit.molecule import Molecule
from scipy.spatial import KDTree
from sklearn.cluster import AgglomerativeClustering


def create_grid_fromBB(boundingBox, voxelSize=1):
    """Create a regular grid of points spanning a bounding box.

    Parameters
    ----------
    boundingBox : list
        List of the form [[xmin, ymin, zmin], [xmax, ymax, zmax]]
    voxelSize : float
        Size of the voxels in Angstrom

    Returns
    -------
    gridpoints : numpy.ndarray
        Grid point coordinates of shape (nx * ny * nz, 3)
    box_N : tuple
        Number of voxels in each dimension (nx, ny, nz)
    """
    # extend the ranges by 0.5 A so the upper edge of the box is also sampled
    xrange = np.arange(boundingBox[0][0], boundingBox[1][0] + 0.5, step=voxelSize)
    yrange = np.arange(boundingBox[0][1], boundingBox[1][1] + 0.5, step=voxelSize)
    zrange = np.arange(boundingBox[0][2], boundingBox[1][2] + 0.5, step=voxelSize)

    gridpoints = np.zeros((xrange.shape[0] * yrange.shape[0] * zrange.shape[0], 3))
    i = 0
    for x in xrange:
        for y in yrange:
            for z in zrange:
                gridpoints[i][0] = x
                gridpoints[i][1] = y
                gridpoints[i][2] = z
                i += 1
    return gridpoints, (xrange.shape[0], yrange.shape[0], zrange.shape[0])
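

# Not part of the original code: a vectorized sketch of the same grid
# construction using np.meshgrid. The helper name is hypothetical; it produces
# the same point ordering (x outermost, z innermost) as the triple loop above.
def _create_grid_fromBB_vectorized(boundingBox, voxelSize=1):
    xrange = np.arange(boundingBox[0][0], boundingBox[1][0] + 0.5, step=voxelSize)
    yrange = np.arange(boundingBox[0][1], boundingBox[1][1] + 0.5, step=voxelSize)
    zrange = np.arange(boundingBox[0][2], boundingBox[1][2] + 0.5, step=voxelSize)
    # indexing="ij" keeps the (x, y, z) axis order; reshaping in C order makes z
    # vary fastest, matching the nested loops in create_grid_fromBB
    xx, yy, zz = np.meshgrid(xrange, yrange, zrange, indexing="ij")
    gridpoints = np.stack([xx, yy, zz], axis=-1).reshape(-1, 3)
    return gridpoints, (xrange.shape[0], yrange.shape[0], zrange.shape[0])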


def get_bb(points):
    """Return the axis-aligned bounding box of a set of points (N, 3).

    Parameters
    ----------
    points : numpy.ndarray
        Set of points of shape (N, 3)

    Returns
    -------
    boundingBox : list
        List of the form [[xmin, ymin, zmin], [xmax, ymax, zmax]]
    """
    minx = np.min(points[:, 0])
    maxx = np.max(points[:, 0])
    miny = np.min(points[:, 1])
    maxy = np.max(points[:, 1])
    minz = np.min(points[:, 2])
    maxz = np.max(points[:, 2])
    bb = [[minx, miny, minz], [maxx, maxy, maxz]]
    return bb
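

# Hypothetical usage sketch (not in the original module): compute a bounding box
# for a few points and build a 1 A grid around it. Wrapped in a function so that
# importing this module has no side effects.
def _example_grid_from_points():
    points = np.array([[0.0, 0.0, 0.0], [4.0, 2.0, 1.0], [2.0, 5.0, 3.0]])
    bb = get_bb(points)
    gridpoints, box_N = create_grid_fromBB(bb, voxelSize=1)
    print(f"bounding box: {bb}")
    print(f"{gridpoints.shape[0]} grid points, box_N={box_N}")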


def get_all_protein_resids(pdb_file):
    """Return the indices of all protein CA atoms in a pdb file.

    Parameters
    ----------
    pdb_file : str
        Path to pdb file

    Returns
    -------
    resids : numpy.ndarray
        Indices of the CA atoms
    """
    try:
        prot = Molecule(pdb_file)
    except Exception:
        sys.exit("could not read file")
    prot.filter("protein")
    return prot.get("index", sel="name CA")


def get_all_metalbinding_resids(pdb_file):
    """Return the CA atom indices of all potentially metal-binding residues in a pdb file.

    Parameters
    ----------
    pdb_file : str
        Path to pdb file

    Returns
    -------
    resids : numpy.ndarray
        Indices of the CA atoms of metal-binding residue types
    """
    try:
        prot = Molecule(pdb_file)
    except Exception:
        sys.exit("could not read file")
    prot.filter("protein")
    return prot.get(
        "index",
        sel="name CA and resname HIS HID HIE HIP CYS CYX GLU GLH GLN ASP ASH ASN MET",
    )


def get_all_resids_from_list(pdb_file, resids):
    """Return the CA atom indices of the given residue ids in a pdb file.

    Parameters
    ----------
    pdb_file : str
        Path to pdb file
    resids : list
        Residue ids of the metal-binding residues

    Returns
    -------
    resids : numpy.ndarray
        Indices of the selected CA atoms
    """
    try:
        prot = Molecule(pdb_file)
    except Exception:
        sys.exit("could not read file")
    prot.filter("protein")
    # the selection language expects space-separated residue ids, not a Python list
    resid_sel = " ".join(str(r) for r in resids)
    return prot.get(
        "index",
        sel=f"name CA and resid {resid_sel}",
    )
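

# Hypothetical usage sketch of the selection helpers: "example.pdb" and the
# resid list are placeholders, not files or residues from the original project.
def _example_residue_selection(pdb_file="example.pdb"):
    ca_ids = get_all_protein_resids(pdb_file)
    metal_ca_ids = get_all_metalbinding_resids(pdb_file)
    chosen_ca_ids = get_all_resids_from_list(pdb_file, [23, 41, 97])
    print(len(ca_ids), len(metal_ca_ids), len(chosen_ca_ids))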


def compute_average_p_fast(point, cutoff=1):
    """Average the probabilities of the protein voxel centers close to a grid point.

    Uses the module-level KDTree (tree) and probability array (output_v)
    set up by get_probability_mean.

    Parameters
    ----------
    point : numpy.ndarray
        Grid point of shape (3,)
    cutoff : float
        Cutoff distance in Angstrom

    Returns
    -------
    p : float
        Mean probability of the neighboring voxel centers (0 if none are within the cutoff)
    """
    p = 0
    # query up to 15 neighbors; missing neighbors are reported with distance inf
    distances, indices = tree.query(
        point, k=15, distance_upper_bound=cutoff, workers=1
    )
    if np.min(distances) != np.inf:
        p = np.mean(output_v[indices[distances != np.inf]])
    return p


def get_probability_mean(grid, prot_centers, pvalues):
    """Compute the mean probability for each global grid point from the per-box predictions.

    Parameters
    ----------
    grid : numpy.ndarray
        Global grid points of shape (M, 3)
    prot_centers : numpy.ndarray
        Protein voxel centers of shape (N, 3)
    pvalues : numpy.ndarray
        Probability values of shape (N,)

    Returns
    -------
    mean_p : numpy.ndarray
        Mean probability for each grid point, shape (M,)
    """
    # the KDTree and probability array are stored as module-level globals so that
    # the forked worker processes can access them in compute_average_p_fast
    global output_v
    output_v = pvalues
    global prot_v
    prot_v = prot_centers
    global tree
    tree = KDTree(prot_v)

    cpuCount = multiprocessing.cpu_count()
    with Pool(cpuCount) as pool:
        results = pool.map(compute_average_p_fast, grid)
    return np.array(results)
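

# Sketch (not in the original code) of how get_probability_mean ties the global
# grid to per-voxel probabilities. The centers and pvalues are random
# placeholders standing in for real model predictions; the parallel map relies
# on the fork start method (the default on Linux), as does the original code.
def _example_probability_mean():
    rng = np.random.default_rng(0)
    prot_centers = rng.uniform(0, 10, size=(500, 3))  # placeholder voxel centers
    pvalues = rng.uniform(0, 1, size=500)             # placeholder probabilities
    gridpoints, box_N = create_grid_fromBB(get_bb(prot_centers), voxelSize=1)
    mean_p = get_probability_mean(gridpoints, prot_centers, pvalues)
    return mean_p, box_N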


def write_cubefile(bb, pvalues, box_N, outname="Metal3D_pmap.cube", gridres=1):
    """Write a probability map to a Gaussian cube file.

    The Gaussian cube specification is used; distances are converted to Bohr.

    Parameters
    ----------
    bb : list
        Bounding box of the form [[xmin, ymin, zmin], [xmax, ymax, zmax]]
    pvalues : numpy.ndarray
        Probability values of shape (nx * ny * nz,)
    box_N : tuple
        Number of voxels in each dimension (nx, ny, nz)
    outname : str
        Name of the output file
    gridres : float
        Resolution of the grid used for writing the voxels
    """
    with open(outname, "w") as cube:
        cube.write(" Metal3D Cube File\n")
        cube.write(" Outer loop: x, middle loop: y, inner loop: z\n")
        angstromToBohr = 1.89  # approximate conversion factor (1 A = 1.8897 Bohr)
        cube.write(
            f" 1 {bb[0][0]*angstromToBohr: .6f} {bb[0][1]*angstromToBohr: .6f} {bb[0][2]*angstromToBohr: .6f}\n"
        )
        cube.write(
            f"{str(box_N[0]).rjust(5)} {1.890000*gridres:.9f} 0.000000 0.000000\n"
        )
        cube.write(
            f"{str(box_N[1]).rjust(5)} 0.000000 {1.890000*gridres:.9f} 0.000000\n"
        )
        cube.write(
            f"{str(box_N[2]).rjust(5)} 0.000000 0.000000 {1.890000*gridres:.9f}\n"
        )
        cube.write(" 1 1.000000 0.000000 0.000000 0.000000\n")

        # values are written with z varying fastest, six per line, with a newline after each z-row
        o = pvalues.reshape(box_N)
        for x in range(box_N[0]):
            for y in range(box_N[1]):
                for z in range(box_N[2]):
                    cube.write(f" {o[x][y][z]: .5E}")
                    if z % 6 == 5:
                        cube.write("\n")
                cube.write("\n")


def find_unique_sites(
    pvalues, grid, writeprobes=False, probefile="probes.pdb", threshold=5, p=0.75
):
    """Find unique metal sites in the probability map.

    The probability voxels are points, and a voxel cloud may contain more than one metal.
    This function clusters the high-probability points with AgglomerativeClustering and,
    for each cluster, returns the coordinates of the point with the highest probability.
    The threshold is the maximum distance between two points in the same cluster; it can
    be adjusted to obtain more or fewer metal sites.

    Parameters
    ----------
    pvalues : numpy.ndarray
        Probability values of shape (N,)
    grid : numpy.ndarray
        Grid points of shape (N, 3)
    writeprobes : bool
        If True, write the probes to a pdb file
    probefile : str
        Name of the output file
    threshold : float
        Maximum distance between two points in the same cluster
    p : float
        Minimum probability for a point to be considered part of a site

    Returns
    -------
    message : str
        Summary of the number of sites found
    """
    points = grid[pvalues > p]
    point_p = pvalues[pvalues > p]
    if len(points) == 0:
        return "no metals found"

    clustering = AgglomerativeClustering(
        n_clusters=None, linkage="complete", distance_threshold=threshold
    ).fit(points)
    message = f"min metal p={p}, n(metals) found: {clustering.n_clusters_}"

    # for each cluster keep the point with the highest probability as the site position
    sites = []
    for i in range(clustering.n_clusters_):
        c_points = points[clustering.labels_ == i]
        c_points_p = point_p[clustering.labels_ == i]
        position = c_points[np.argmax(c_points_p)]
        sites.append((position, np.max(c_points_p)))

    if writeprobes:
        print(f"writing probes to {probefile}")
        with open(probefile, "w") as f:
            for i, site in enumerate(sites):
                f.write(
                    f"HETATM {i+1:3} ZN ZN A {i+1:3} {site[0][0]: 8.3f}{site[0][1]: 8.3f}{site[0][2]: 8.3f} {site[1]:.2f} 0.0 ZN2+\n"
                )
    return message
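

# End-to-end sketch with placeholder data (not part of the original module).
# A real run would use probabilities predicted by the Metal3D model instead of
# random values; the output filenames below are illustrative.
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    prot_centers = rng.uniform(0, 20, size=(1000, 3))  # placeholder voxel centers
    pvalues = rng.uniform(0, 1, size=1000)             # placeholder probabilities

    bb = get_bb(prot_centers)
    gridpoints, box_N = create_grid_fromBB(bb, voxelSize=1)
    mean_p = get_probability_mean(gridpoints, prot_centers, pvalues)

    write_cubefile(bb, mean_p, box_N, outname="example_pmap.cube", gridres=1)
    print(
        find_unique_sites(
            mean_p, gridpoints, writeprobes=True, probefile="example_probes.pdb", p=0.75
        )
    )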