# POMFinder / app.py
# AndySAnker's picture
# Update app.py
# 4b1230e verified
from sklearn.preprocessing import OrdinalEncoder
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
import argparse, h5py, os, re, zipfile
import streamlit as st
def append_cale_to_name(name):
    """Return *name* with "cale" appended when it begins with "icsd".

    Any other name is handed back unchanged.
    """
    return name + "cale" if name.startswith("icsd") else name
def get_POMFinder():
    """Load the label database and the trained XGBoost classifier.

    Returns:
        tuple: ``(y, y_onehotenc_cat, y_onehotenc_values, POMFinder)`` where
            ``y`` is the ``'y'`` dataset of the HDF5 database read into a
            NumPy array, ``y_onehotenc_cat`` is the ``OrdinalEncoder`` fitted
            on ``y``, ``y_onehotenc_values`` is the encoder's output for
            ``y`` and ``POMFinder`` is the ``XGBClassifier`` with the stored
            model loaded.

    Raises:
        KeyError: If the database file has no ``'y'`` dataset.
    """
    # Get file paths
    DataBase_path = "Backend/POMFinder_443structures_100Dataset_per_Structure_xPDF_hypercube_sampling_Grmax_Name.h5"
    POMFinder_path = "Backend/XGBoost_443structures_100PDFperStructure_xPDF_hypercube_sampling_Grmax.model"

    # Import the Database. The labels are copied into memory so the file can
    # be closed here instead of leaking an open handle on every call.
    with h5py.File(DataBase_path, "r") as hf_name:
        y = np.array(hf_name["y"])

    # A single fit_transform both fits the encoder and yields the encoded
    # values; the fitted encoder itself is what callers use for categories_.
    enc = OrdinalEncoder()
    y_onehotenc_values = enc.fit_transform(y)
    y_onehotenc_cat = enc

    # Import POMFinder
    POMFinder = xgb.XGBClassifier()
    POMFinder.load_model(POMFinder_path)
    return y, y_onehotenc_cat, y_onehotenc_values, POMFinder
def _first_data_line(lines):
    """Return the index of the first line that starts like a number, or None.

    A line qualifies when it is longer than three characters (line ending
    included) and its first three characters are digit, non-digit, digit,
    for example ``"0.0"``.
    """
    digit = re.compile(r'\d')
    for index, line in enumerate(lines):
        if (len(line) > 3
                and digit.fullmatch(line[0]) is not None
                and digit.fullmatch(line[1]) is None
                and digit.fullmatch(line[2]) is not None):
            return index
    return None


def PDF_Preparation(Your_PDF_Name, Qmin, Qmax, Qdamp, rmax, nyquist):
    """Turn a two-column PDF file into the feature row expected by the model.

    Leading lines that do not look like data are skipped, the G(r) column is
    resampled onto ``rmax*10 + 1`` points with a spacing of 0.1, the first ten
    points are zeroed, the signal is divided by its maximum and ``Qmin``,
    ``Qmax`` and ``Qdamp`` are prepended. The original and the prepared
    signal are plotted through Streamlit.

    Args:
        Your_PDF_Name: Path of the data file. If it starts with header lines
            they are removed from the file on disk.
        Qmin, Qmax, Qdamp: Scalars placed in front of the prepared signal.
        rmax: Upper end of the output grid; expected to be an integer.
        nyquist: When false, every tenth sample of the signal is kept.

    Returns:
        tuple: ``(r, Gr)`` with ``r`` the output grid of ``rmax*10 + 1``
            points and ``Gr`` an array of shape ``(1, 3 + rmax*10 + 1)``.

    Raises:
        ValueError: If the file has no line that looks like numeric data, or
            if the data does not start at 0 and contains no sample at r == 1.
    """
    with open(Your_PDF_Name, "r") as file:
        lines = file.read().splitlines(True)

    start = _first_data_line(lines)
    if start is None:
        raise ValueError(f"No numeric data rows found in {Your_PDF_Name!r}.")
    if start > 0:
        # Same on-disk effect as before (the header is stripped from the
        # file), but done in one write instead of one rewrite per line.
        with open(Your_PDF_Name, 'w') as fout:
            fout.writelines(lines[start:])
    PDF = np.loadtxt(Your_PDF_Name)

    r_raw = PDF[:, 0]
    # Work on a copy: the in-place edits below must not leak into PDF, which
    # is plotted afterwards as the "Original Data".
    Gr = PDF[:, 1].copy()

    if r_raw[0] != 0:  # In the case that the Data not start at 0.
        at_one = np.where(r_raw == 1)[0]
        if at_one.size == 0:
            raise ValueError(
                f"{Your_PDF_Name!r} does not start at r = 0 and has no sample at r = 1."
            )
        Gr = Gr[at_one[0]:]  # Drop every sample before r == 1
        Gr = Gr[::10]  # Keep every tenth sample of the rest
        Gr = np.concatenate((np.zeros(10), Gr), axis=0)  # Ten zeros in front
    if not nyquist:
        # NOTE(review): when the branch above also ran, the signal is
        # down-sampled twice - confirm this is intended.
        Gr = Gr[::10]  # Pseudo Nyquist sample Data

    n_points = rmax * 10 + 1
    if len(Gr) >= n_points:
        Gr = Gr[:n_points]  # Data beyond rmax is not used.
    else:
        # Pad with zeros up to the grid length (was hard-coded to 101).
        Gr = np.concatenate((Gr, np.zeros((n_points - len(Gr),))), axis=0)
    Gr[:10] = 0
    # Integer-based grid: same values as arange(0, rmax+0.1, 0.1) without the
    # floating-point end-point ambiguity in the number of samples.
    r = np.arange(n_points) * 0.1

    # Normalise it to the data from the database
    peak = np.max(Gr)
    if peak != 0:  # an all-zero signal is left as is instead of becoming NaN
        Gr = Gr / peak

    # Add experimental parameters to the Gr
    experimental = [np.expand_dims(value, axis=0) for value in (Qmin, Qmax, Qdamp)]
    Gr = np.expand_dims(np.concatenate((*experimental, Gr), axis=0), axis=0)

    # Plot the transformation to make sure everything is alright
    fig, ax = plt.subplots()
    ax.plot(PDF[:, 0], PDF[:, 1], label="Original Data")
    ax.plot(r, Gr[0, 3:], label="Gr ready for ML")
    ax.legend()
    ax.set_title("Original Data vs. normalised Data")
    ax.set_xlabel("r (AA)")
    ax.set_ylabel("Gr")
    st.pyplot(fig)
    return r, Gr
def POMPredicter(POMFinder, Gr, y_onehotenc_cat):
    """Score one prepared PDF and show the five best-ranked structures.

    Args:
        POMFinder: Classifier exposing ``predict_proba``.
        Gr: Feature array passed straight to ``predict_proba``.
        y_onehotenc_cat: Fitted encoder; ``categories_[0]`` supplies the
            label belonging to each index.

    Returns:
        tuple: ``(res, y_pred_proba)`` where ``res`` lists all indices of
            ``y_pred_proba`` ordered from highest to lowest value and
            ``y_pred_proba`` is column 1 of the ``predict_proba`` output.
    """
    y_pred_proba = POMFinder.predict_proba(Gr)
    # NOTE(review): column 1 is used as the per-structure score; this depends
    # on the shape predict_proba returns for this model - confirm.
    y_pred_proba = y_pred_proba[:, 1]
    # Ascending sort followed by a reversal, kept as is so that the order of
    # tied entries does not change.
    res = sorted(range(len(y_pred_proba)), key=lambda sub: y_pred_proba[sub])
    res.reverse()

    # Every message used to read "1st"; each guess now carries its own rank.
    # zip() also stops early if fewer than five candidates exist.
    for rank, index in zip(("1st", "2nd", "3rd", "4th", "5th"), res):
        # The label's string form is trimmed by two characters at each end.
        name = str(y_onehotenc_cat.categories_[0][index])[2:-2]
        file_name = append_cale_to_name(name) + ".xyz"
        st.markdown(
            f'<span style="font-size: 24px; color: green;">The {rank} guess from the model is: <b>{file_name}</b> with a probability of {100*y_pred_proba[index]:.2f} %</span> <hr/>',
            unsafe_allow_html=True,
        )
    return res, y_pred_proba
# Define a download button to download the file
def download_button(file_name, button_text, mime=None):
    """Show a Streamlit download button that serves the contents of a file.

    Args:
        file_name: Path of the file to read; also passed on as the name of
            the download.
        button_text: Label shown on the button.
        mime: MIME type handed to Streamlit. When omitted, ".zip" files are
            served as "application/zip" and everything else as "text/xyz".
    """
    with open(file_name, "rb") as f:
        payload = f.read()  # not named `bytes`, which would shadow the builtin
    if mime is None:
        mime = "application/zip" if file_name.lower().endswith(".zip") else "text/xyz"
    st.download_button(
        label=button_text,
        data=payload,
        file_name=file_name,
        mime=mime,
    )
def _ordinal(position):
    """Return the English ordinal of a positive integer, e.g. 2 -> "2nd"."""
    if 10 <= position % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(position % 10, "th")
    return f"{position}{suffix}"


def create_zip_with_txt(y_onehotenc_cat, res, y_pred_proba):
    """
    Create a zip file containing specific .xyz files and a txt file with model guesses.

    The archive "POMFinder_results.zip" receives the .xyz file of each of the
    first five entries of ``res`` (fewer if ``res`` is shorter) plus
    "model_guesses.txt", which lists those guesses with their probability in
    percent.

    Parameters:
    y_onehotenc_cat: Fitted encoder; ``categories_[0]`` maps an index to its label.
    res: Indices into ``y_pred_proba``, best guess first.
    y_pred_proba: The probability associated with each prediction (as a fraction).

    Returns:
    None. A zip file is created in the current directory.
    """
    zip_file_name = "POMFinder_results.zip"
    xyz_dir = "Backend/COD_ICSD_XYZs_POMs_unique99/"

    top = list(res[:5])
    file_names = []
    for index in top:
        # The label's string form is trimmed by two characters at each end.
        name = str(y_onehotenc_cat.categories_[0][index])[2:-2]
        file_names.append(name + ("cale.xyz" if name.startswith("icsd") else ".xyz"))

    # Probabilities are scaled by 100 so that the value matches the "%" sign,
    # and each line gets a proper ordinal (previously "2st", "3st", ...).
    guess_lines = [
        f'The {_ordinal(rank)} guess from the model is: {file_name} with a probability of {100*y_pred_proba[index]:.2f} %\n'
        for rank, (index, file_name) in enumerate(zip(top, file_names), start=1)
    ]

    # Create a zip file
    with zipfile.ZipFile(zip_file_name, 'w') as zipf:
        # Add the .xyz files
        for file_name in file_names:
            zipf.write(xyz_dir + file_name, arcname=file_name)
        # Create and add the txt file with model guesses
        with zipf.open("model_guesses.txt", "w") as f:
            f.write("".join(guess_lines).encode('utf-8'))
# ---------------------------------------------------------------------------
# Streamlit page: upload a .gr file, run the model on it and offer the five
# best-ranked structures for download. Statements run top to bottom on every
# rerun, so their order is the order of the page.
# ---------------------------------------------------------------------------
st.title('POMFinder')
st.write('Welcome to POMFinder which is a tree-based supervised learning algorithm that can predict the polyoxometalate cluster from a Pair Distribution Function.')
st.write('Upload a PDF to use POMFinder to predict the structure.')

# Define the file upload widget
pdf_file = st.file_uploader("Upload PDF file in .gr format", type=["gr"])

# Experimental parameters handed to the model. They are fixed here; the
# number_input widgets that would make them editable are left disabled.
Qmin = 0.7 #st.number_input("Qmin value of the experimental PDF", min_value=0.0, max_value=2.0, value=0.7)
Qmax = 30 #st.number_input("Qmax value of the experimental PDF", min_value=15.0, max_value=40.0, value=30.0)
Qdamp = 0.04 #st.number_input("Qdamp value of the experimental PDF", min_value=0.00, max_value=0.08, value=0.04)
nyquist = st.checkbox("Is the data nyquist sampled", value=False)

# Plain variables replace the former ArgumentParser, which defined no
# arguments and would abort the app on any stray command-line argument.
uploaded_path = "uploaded_file.gr"

if pdf_file is None:
    st.warning("Please upload a PDF file.")
else:
    # Save the contents of the upload to disk so it can be read by path
    with open(uploaded_path, "wb") as f:
        f.write(pdf_file.read())

    # Predict with POMFinder
    y, y_onehotenc_cat, y_onehotenc_values, POMFinder = get_POMFinder()
    r, Gr = PDF_Preparation(uploaded_path, Qmin, Qmax, Qdamp, rmax=10, nyquist=nyquist)
    res, y_pred_proba = POMPredicter(POMFinder, Gr, y_onehotenc_cat)

    # Download the structural database
    #download_button("COD_ICSD_XYZs_POMs_unique99.zip", "Download structural database")
    create_zip_with_txt(y_onehotenc_cat, res, y_pred_proba)
    download_button("POMFinder_results.zip", "Download top-5 predictions")

st.subheader('Cite')
st.write('If you use POMFinder, our code or results, please consider citing our paper. Thanks in advance!')
st.write('POMFinder: Identifying polyoxometalate cluster structures from pair distribution function data using explainable machine learning **2023** (https://chemrxiv.org/engage/chemrxiv/article-details/64e5fef7dd1a73847f5951b9)')
st.subheader('LICENSE')
st.write('This project is licensed under the Apache License Version 2.0, January 2004 - see the LICENSE file at https://github.com/AndySAnker/POMFinder/blob/master/LICENSE.txt for details.')
st.write("")
st.subheader('Github')
st.write('https://github.com/AndySAnker/POMFinder')
st.subheader('Questions')
st.write('ansoan@dtu.dk or etsk@chem.ku.dk')