Spaces:
Sleeping
Sleeping
File size: 5,847 Bytes
fba5f30 ce48cf1 fba5f30 9000099 ce48cf1 49f079e ce48cf1 fba5f30 ce48cf1 fba5f30 ce48cf1 fba5f30 ce48cf1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
from sklearn.preprocessing import OrdinalEncoder
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
import argparse, h5py, os, re, pkg_resources
import streamlit as st
# --- Streamlit front end ----------------------------------------------------
# These statements run at import time and render the page header, the file
# upload widget and the Qmin input.
st.title('POMFinder')
# NOTE(review): the welcome text below describes DeepStruc although the page
# title is POMFinder -- it looks copied from another app; confirm the wording.
st.write('Welcome to DeepStruc that is a Deep Generative Model which has been trained to solve a mono-metallic structure (<200 atoms) based on a PDF!')
st.write('Upload a PDF to use DeepStruc to predict the structure.')
# Define the file upload widget
pdf_file = st.file_uploader("Upload PDF file in .gr format", type=["gr"])
# Define the form to get the other parameters
# Streamlit requires min_value, max_value and value to share one numeric type;
# the bounds are therefore floats to match the float default of 0.7.
# NOTE(review): the variable is named num_structures but holds the Qmin value;
# the name is kept so existing references keep working.
num_structures = st.number_input("Qmin value", min_value=0.0, max_value=2.0, value=0.7)
#structure_index = st.number_input("Index of structure to visualize", min_value=0, value=3)
#sigma = st.number_input("Standard deviation for sampling", min_value=0.1, value=3.0)
# --- Command-line interface -------------------------------------------------
# -d/--data and -n/--nyquist are mandatory; the Q-space parameters and the
# output file name are optional and fall back to the defaults shown in --help.
parser = argparse.ArgumentParser(
    prog='POMFinder',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

# Mandatory options live in their own group so --help lists them separately.
requiredNamed = parser.add_argument_group('required named arguments')
requiredNamed.add_argument(
    "-d", "--data",
    type=str,
    default=None,
    required=True,
    help="a directory of PDFs or a file.",
)
requiredNamed.add_argument(
    "-n", "--nyquist",
    type=str,
    default="No",
    required=True,
    help="is the data nyquist sampled",
)

# Optional experimental parameters and output naming.
parser.add_argument(
    "-i", "--Qmin",
    type=float,
    default=0.7,
    help="Qmin value of the experimental PDF",
)
parser.add_argument(
    "-a", "--Qmax",
    type=float,
    default=30,
    help="Qmax value of the experimental PDF",
)
parser.add_argument(
    "-f", "--file_name",
    type=str,
    default='',
    help="Name of the output file",
)
parser.add_argument(
    "-m", "--Qdamp",
    type=float,
    default=0.04,
    help="Qdamp value of the experimental PDF",
)
def main(args=None):
    """Run the command-line pipeline: parse options, load the model, predict.

    Parameters
    ----------
    args : list of str, optional
        Argument vector handed to ``parser.parse_args``; ``None`` makes
        argparse read ``sys.argv``.

    Returns
    -------
    None
        The ranked guesses are printed by ``POMPredicter``.
    """
    options = parser.parse_args(args=args)

    # Only the fitted encoder and the classifier are needed further down.
    _labels, y_onehotenc_cat, _encoded, POMFinder = get_POMFinder()

    _r, Gr = PDF_Preparation(
        options.data,
        options.Qmin,
        options.Qmax,
        options.Qdamp,
        rmax=10,
        nyquist=options.nyquist,
    )

    POMPredicter(POMFinder, Gr, y_onehotenc_cat)
def get_POMFinder():
    """Load the packaged label database and the trained XGBoost classifier.

    Both files are looked up in the ``Backend/`` resource directory of this
    package.

    Returns
    -------
    tuple
        ``(y, y_onehotenc_cat, y_onehotenc_values, POMFinder)``:

        * ``y`` -- the ``'y'`` dataset of the HDF5 database, read into a
          NumPy array so it stays usable after the file has been closed.
        * ``y_onehotenc_cat`` -- the ``OrdinalEncoder`` fitted on ``y``
          (its ``categories_`` attribute holds the labels).
        * ``y_onehotenc_values`` -- ``y`` transformed by that encoder.
        * ``POMFinder`` -- an ``xgb.XGBClassifier`` with the stored model
          loaded.

    Raises
    ------
    KeyError
        If the database file has no dataset named ``'y'``.
    """
    # Get file paths
    # NOTE(review): the database is taken to be the first and the model the
    # second entry of the directory listing; this relies on the listing order
    # of Backend/ -- confirm, or select the files by name.
    load_files = pkg_resources.resource_listdir(__name__, 'Backend/')
    DataBase_path = pkg_resources.resource_filename(__name__, "Backend/"+load_files[0])
    POMFinder_path = pkg_resources.resource_filename(__name__, "Backend/"+load_files[1])

    # Import the Database. The labels are copied into memory inside the
    # ``with`` block so the HDF5 handle is closed instead of being leaked.
    with h5py.File(DataBase_path, "r") as hf_name:
        y = np.array(hf_name['y'])

    # A single fit_transform yields both the fitted encoder and the encoded
    # labels; fitting twice on the same data was redundant.
    enc = OrdinalEncoder()
    y_onehotenc_values = enc.fit_transform(y)
    y_onehotenc_cat = enc

    # Import POMFinder
    POMFinder = xgb.XGBClassifier()
    POMFinder.load_model(POMFinder_path)
    return y, y_onehotenc_cat, y_onehotenc_values, POMFinder
# Matches a plain (optionally signed / decimal) number. It is only ever applied
# to single characters below, where it is true exactly for a digit.
_NUMBER_RE = re.compile(r'^-?\d+(?:\.\d+)?$')


def _find_data_start(lines):
    """Return the index of the first line that looks like numeric data, or None.

    A line qualifies when it is longer than three characters (line ending
    included) and its first three characters are digit, non-digit, digit --
    e.g. ``"0.01 ..."``.
    """
    for index, line in enumerate(lines):
        if (len(line) > 3
                and _NUMBER_RE.match(line[0]) is not None
                and _NUMBER_RE.match(line[1]) is None
                and _NUMBER_RE.match(line[2]) is not None):
            return index
    return None


def PDF_Preparation(Your_PDF_Name, Qmin, Qmax, Qdamp, rmax, nyquist, plot=True):
    """Read a PDF file and turn it into the feature row expected by the model.

    Header lines preceding the numeric data are skipped in memory; the input
    file itself is never modified.

    Parameters
    ----------
    Your_PDF_Name : str or path-like
        Path of the text file holding the data in two columns (r, G(r)),
        optionally preceded by header lines.
    Qmin, Qmax, Qdamp : float
        Experimental parameters; they are prepended to the G(r) values.
    rmax : int or float
        Upper r limit of the output grid. The output holds ``rmax*10 + 1``
        points spaced by 0.1.
    nyquist : str
        ``"No"`` or ``"no"`` means the data is not yet Nyquist sampled; every
        10th point is then kept. Any other value leaves the sampling as is.
    plot : bool, optional
        When true, plot the raw data against the prepared G(r).

    Returns
    -------
    r : numpy.ndarray
        Grid ``0, 0.1, ..., rmax`` of shape ``(rmax*10 + 1,)``.
    Gr : numpy.ndarray
        Array of shape ``(1, 3 + rmax*10 + 1)``: ``Qmin, Qmax, Qdamp``
        followed by G(r) divided by its maximum.

    Raises
    ------
    ValueError
        If no numeric data line is found, or if the data does not start at
        ``r == 0`` and contains no sample at exactly ``r == 1``.
    """
    with open(Your_PDF_Name, "r") as file:
        lines = file.read().splitlines(True)

    first_data_line = _find_data_start(lines)
    if first_data_line is None:
        raise ValueError(
            "No numeric data line found in {!r}".format(Your_PDF_Name)
        )
    PDF = np.loadtxt(lines[first_data_line:], ndmin=2)

    # Work on a copy: the in-place edits below must not leak into PDF, which
    # is plotted afterwards as the untouched original data.
    r, Gr = PDF[:, 0], PDF[:, 1].copy()

    if r[0] != 0:  # In the case that the data does not start at 0.
        at_one = np.flatnonzero(r == 1)
        if at_one.size == 0:
            raise ValueError(
                "Data does not start at r = 0 and has no sample at r = 1"
            )
        # Drop everything below r = 1, keep every 10th point of the rest and
        # put ten zeros in front in place of the removed low-r part.
        # NOTE(review): presumably assumes a 0.01 spacing so that ten points
        # of the decimated grid cover 0 - 0.9; confirm.
        Gr = Gr[at_one[0]:][::10]
        Gr = np.concatenate((np.zeros(10), Gr), axis=0)

    # NOTE(review): data that went through the branch above is decimated a
    # second time here when nyquist is "No"/"no"; kept as in the original,
    # confirm that this is intended.
    if nyquist == "No" or nyquist == "no":
        Gr = Gr[::10]  # Nyquist sample data

    # Bring G(r) to exactly n_points values: truncate longer data, zero-pad
    # shorter data (the pad length follows rmax instead of a fixed 101).
    n_points = int(round(rmax * 10)) + 1
    if len(Gr) >= n_points:
        Gr = Gr[:n_points]
    else:
        Gr = np.concatenate((Gr, np.zeros((n_points - len(Gr),))), axis=0)
    Gr[:10] = 0.0
    r = np.arange(n_points) * 0.1

    # Normalise it to the data from the database
    Gr = Gr / np.max(Gr)

    # Add experimental parameters to the Gr
    Gr = np.expand_dims(
        np.concatenate((np.asarray([Qmin, Qmax, Qdamp], dtype=float), Gr), axis=0),
        axis=0,
    )

    if plot:
        # Plot the transformation to make sure everything is alright
        plt.plot(PDF[:, 0], PDF[:, 1], label="Original Data")
        plt.plot(r, Gr[0, 3:], label="Gr ready for ML")
        plt.legend()
        plt.title("Original Data vs. normalised Data")
        plt.xlabel("r (AA)")
        plt.ylabel("Gr")
        plt.show()
    return r, Gr
def POMPredicter(POMFinder, Gr, y_onehotenc_cat):
    """Rank the candidates by predicted score and print the five best guesses.

    Parameters
    ----------
    POMFinder : object
        Classifier exposing ``predict_proba``.
    Gr : array-like
        Feature array passed unchanged to ``POMFinder.predict_proba``.
    y_onehotenc_cat : object
        Fitted encoder; ``categories_[0]`` supplies the label for each index.

    Returns
    -------
    res : list of int
        Indices ordered from highest to lowest score.
    y_pred_proba : numpy.ndarray
        Column 1 of the ``predict_proba`` output.

    Raises
    ------
    IndexError
        If fewer than five scores are available.
    """
    # Column 1 of the classifier output is used as the score of each entry.
    scores = POMFinder.predict_proba(Gr)[:, 1]

    # Stable ascending sort, then reversed: best score first, ties ordered by
    # descending index.
    ranking = sorted(range(len(scores)), key=scores.__getitem__)[::-1]

    labels = y_onehotenc_cat.categories_[0]
    lead_ins = (
        "The 1st guess from the model is: ",
        "The 2nd guess from the model is: ",
        "The 3rd guess from the model is: ",
        "The 4th guess from the model is: ",
        "The 5th guess from the model is: ",
    )
    for place, lead_in in enumerate(lead_ins):
        # [2:-2] trims the first two and last two characters of the label's
        # string form before the fixed suffix is appended.
        print(lead_in, str(labels[ranking[place]])[2:-2] + "cale.xyz")

    return ranking, scores
|