"""Streamlit front end for POMFinder.

Loads the structure-label database and a pre-trained XGBoost classifier,
converts an uploaded ``.gr`` file into the feature vector the classifier is
given, and writes the five highest-ranked structure names to the page.
"""

import argparse
import os
import re

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pkg_resources
import streamlit as st
import xgboost as xgb
from sklearn.preprocessing import OrdinalEncoder

# Locations of the label database and the trained model, relative to the
# working directory the app is started from.
DATABASE_PATH = "Backend/POMFinder_443structures_100Dataset_per_Structure_xPDF_hypercube_sampling_Grmax_Name.h5"
MODEL_PATH = "Backend/XGBoost_443structures_100PDFperStructure_xPDF_hypercube_sampling_Grmax.model"

# Matches a single character that is a decimal digit; applied one character at
# a time by _is_data_line (pattern kept exactly as in the original heuristic).
_NUMBER_PATTERN = re.compile(r'^-?\d+(?:\.\d+)?$')

# Maximum number of leading lines inspected while looking for the first data row.
_MAX_HEADER_LINES = 1000


def get_POMFinder():
    """Load the structure labels and the trained classifier.

    Returns:
        A tuple ``(y, y_onehotenc_cat, y_onehotenc_values, POMFinder)``:
        ``y`` is the ``'y'`` dataset of the database read into a NumPy array,
        ``y_onehotenc_cat`` is the fitted ``OrdinalEncoder``,
        ``y_onehotenc_values`` is ``y`` transformed by that encoder, and
        ``POMFinder`` is the ``XGBClassifier`` loaded from ``MODEL_PATH``.

    Raises:
        KeyError: if the database has no ``'y'`` dataset.
    """
    # Read the labels fully into memory so the HDF5 file can be closed here
    # instead of leaking an open handle on every Streamlit rerun.
    with h5py.File(DATABASE_PATH, "r") as database:
        y = np.array(database["y"])

    encoder = OrdinalEncoder()
    y_onehotenc_cat = encoder.fit(y)
    y_onehotenc_values = encoder.transform(y)

    POMFinder = xgb.XGBClassifier()
    POMFinder.load_model(MODEL_PATH)
    return y, y_onehotenc_cat, y_onehotenc_values, POMFinder


def _is_data_line(line):
    """Return True if *line* looks like a numeric data row.

    Heuristic: the line has more than three characters (line ending included)
    and its first three characters are digit, non-digit, digit (e.g. ``0.0``).
    """
    return (
        len(line) > 3
        and _NUMBER_PATTERN.match(line[0]) is not None
        and _NUMBER_PATTERN.match(line[1]) is None
        and _NUMBER_PATTERN.match(line[2]) is not None
    )


def _load_pdf(path):
    """Read the numeric table from *path*, skipping leading header lines.

    The file is only read, never rewritten. Raises ValueError when none of the
    first ``_MAX_HEADER_LINES`` lines looks like a data row (this includes an
    empty file).
    """
    with open(path, "r") as handle:
        lines = handle.read().splitlines(True)

    for start, line in enumerate(lines[:_MAX_HEADER_LINES]):
        if _is_data_line(line):
            # ndmin=2 keeps the column indexing valid for a single-row file.
            return np.loadtxt(lines[start:], ndmin=2)

    raise ValueError(
        f"No numeric data found in the first {_MAX_HEADER_LINES} lines of {path!r}."
    )


def PDF_Preparation(Your_PDF_Name, Qmin, Qmax, Qdamp, rmax, nyquist):
    """Turn a PDF data file into the feature row given to the classifier.

    Args:
        Your_PDF_Name: path of a text file whose first two columns are r and G(r).
        Qmin, Qmax, Qdamp: experimental parameters prepended to the feature row.
        rmax: upper end of the output r grid; the grid has ``rmax * 10 + 1``
            points spaced 0.1 apart.
        nyquist: ``"No"`` or ``"no"`` makes the function keep only every 10th
            point; any other value leaves the sampling untouched.

    Returns:
        ``(r, Gr)`` where ``r`` is the output grid and ``Gr`` has shape
        ``(1, 3 + len(r))``: Qmin, Qmax, Qdamp followed by the processed G(r).
        A comparison plot of the raw and processed data is sent to Streamlit.

    Raises:
        ValueError: if no data rows are found, or if the data does not start
            at r = 0 and contains no point at r = 1.
    """
    PDF = _load_pdf(Your_PDF_Name)
    r = PDF[:, 0]
    # Work on a copy: the in-place edits below must not alter PDF, which is
    # plotted afterwards as the "Original Data" curve.
    Gr = np.array(PDF[:, 1], dtype=float)

    if r[0] != 0:
        # Data does not start at r = 0: keep it from r = 1 onwards, take every
        # 10th point and put 10 zeros in front (grid points 0.0 .. 0.9).
        # NOTE(review): this branch down-samples regardless of `nyquist`, so
        # with nyquist "no" the data is down-sampled twice - confirm intent.
        at_one = np.flatnonzero(np.isclose(r, 1.0))
        if at_one.size == 0:
            raise ValueError(
                "The data does not start at r = 0 and has no point at r = 1."
            )
        Gr = Gr[at_one[0]:][::10]
        Gr = np.concatenate((np.zeros(10), Gr))

    if nyquist in ("No", "no"):
        Gr = Gr[::10]  # keep every 10th point

    # Crop or zero-pad to the fixed grid length derived from rmax.
    n_points = int(round(rmax * 10)) + 1
    if len(Gr) >= n_points:
        Gr = Gr[:n_points]
    else:
        Gr = np.concatenate((Gr, np.zeros(n_points - len(Gr))))
    Gr[:10] = 0.0

    # Built from the point count so r always has exactly as many entries as Gr.
    r = np.arange(n_points) * 0.1

    # Normalise it to the data from the database
    Gr = Gr / np.max(Gr)

    # Add experimental parameters to the Gr
    Gr = np.concatenate(([Qmin, Qmax, Qdamp], Gr))[np.newaxis, :]

    # Plot the transformation to make sure everything is alright
    fig, ax = plt.subplots()
    ax.plot(PDF[:, 0], PDF[:, 1], label="Original Data")
    ax.plot(r, Gr[0, 3:], label="Gr ready for ML")
    ax.legend()
    ax.set_title("Original Data vs. normalised Data")
    ax.set_xlabel("r (AA)")
    ax.set_ylabel("Gr")
    st.pyplot(fig)

    return r, Gr


def POMPredicter(POMFinder, Gr, y_onehotenc_cat):
    """Rank the candidates for *Gr* and write the top five to the page.

    Args:
        POMFinder: classifier exposing ``predict_proba``.
        Gr: feature array as returned by ``PDF_Preparation``.
        y_onehotenc_cat: fitted encoder whose ``categories_[0]`` holds the names.

    Returns:
        ``(res, y_pred_proba)``: ``res`` lists the indices of ``y_pred_proba``
        from highest to lowest value, and ``y_pred_proba`` is column 1 of the
        classifier's ``predict_proba`` output.
    """
    # NOTE(review): column 1 of predict_proba is ranked along its first axis;
    # presumably the model output is laid out one row per structure - confirm.
    y_pred_proba = POMFinder.predict_proba(Gr)[:, 1]

    # Ascending sort followed by a reversal keeps the original ordering of ties.
    ascending = sorted(range(len(y_pred_proba)), key=lambda idx: y_pred_proba[idx])
    res = ascending[::-1]

    categories = y_onehotenc_cat.categories_[0]
    # zip stops early when fewer than five candidates exist.
    for ordinal, idx in zip(("1st", "2nd", "3rd", "4th", "5th"), res):
        # presumably trims the b'...' wrapper of a bytes repr and completes a
        # truncated file-name suffix - TODO confirm against the database labels
        name = str(categories[idx])[2:-2] + "cale.xyz"
        st.write(f"The {ordinal} guess from the model is: ", name)

    return res, y_pred_proba


st.title('POMFinder')
# NOTE(review): the texts below (introduction, citation, licence, GitHub link)
# refer to DeepStruc although this app is titled POMFinder - presumably carried
# over from the DeepStruc app; confirm and update the wording.
st.write('Welcome to DeepStruc that is a Deep Generative Model which has been trained to solve a mono-metallic structure (<200 atoms) based on a PDF!')
st.write('Upload a PDF to use DeepStruc to predict the structure.')

# Define the file upload widget
pdf_file = st.file_uploader("Upload PDF file in .gr format", type=["gr"])

# Define the form to get the other parameters
Qmin = st.number_input("Qmin value of the experimental PDF", min_value=0.0, max_value=2.0, value=0.7)
Qmax = st.number_input("Qmax value of the experimental PDF", min_value=15.0, max_value=40.0, value=30.0)
Qdamp = st.number_input("Qdamp value of the experimental PDF", min_value=0.00, max_value=0.08, value=0.04)

# The parser defines no options; the namespace is used as a settings container.
parser = argparse.ArgumentParser(prog='POMFinder', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
args = parser.parse_args()
args.data = "uploaded_file.gr"
args.nyquist = "True"
args.Qmin = Qmin
args.Qmax = Qmax
args.Qdamp = Qdamp
args.file_name = "POMFinder_results.txt"

if pdf_file is None:
    st.warning("Please upload a PDF file.")
else:
    # Get the contents of the file as bytes
    file_bytes = pdf_file.read()

    # Save the contents of the file to disk
    with open("uploaded_file.gr", "wb") as f:
        f.write(file_bytes)

    # Predict with POMFinder
    y, y_onehotenc_cat, y_onehotenc_values, POMFinder = get_POMFinder()
    st.write("POMfinder loaded")
    r, Gr = PDF_Preparation(args.data, args.Qmin, args.Qmax, args.Qdamp, rmax=10, nyquist=args.nyquist)
    st.write("Data loaded")
    res, y_pred_proba = POMPredicter(POMFinder, Gr, y_onehotenc_cat)
    st.write("Predictions is: ", res)

st.subheader('Cite')
st.write('If you use DeepStruc, our code or results, please consider citing our papers. Thanks in advance!')
st.write('DeepStruc: Towards structure solution from pair distribution function data using deep generative models **2023** (https://pubs.rsc.org/en/content/articlehtml/2022/dd/d2dd00086e)')
st.write('Characterising the atomic structure of mono-metallic nanoparticles from x-ray scattering data using conditional generative models **2020** (https://par.nsf.gov/biblio/10300745)')

st.subheader('LICENSE')
st.write('This project is licensed under the Apache License Version 2.0, January 2004 - see the LICENSE file at https://github.com/EmilSkaaning/DeepStruc/blob/main/LICENSE.md for details.')
st.write("")

st.subheader('Github')
st.write('https://github.com/EmilSkaaning/DeepStruc')

st.subheader('Questions')
st.write('andy@chem.ku.dk or etsk@chem.ku.dk')