# POMFinder
#
# Sleeping
#
# File size: 12,570 Bytes

from sklearn.preprocessing import OrdinalEncoder
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
import argparse, h5py, os, re, zipfile
import streamlit as st

def append_cale_to_name(name):
    """Return ``name`` with ``"cale"`` appended when it starts with ``"icsd"``.

    Names that do not start with ``"icsd"`` are returned unchanged.
    """
    suffix = "cale" if name.startswith("icsd") else ""
    return name + suffix
    
def get_POMFinder():
    """Load the structure-label database and the trained XGBoost classifier.

    Returns:
        tuple: ``(y, y_onehotenc_cat, y_onehotenc_values, POMFinder)`` where
            * ``y`` is the ``'y'`` dataset of the HDF5 database, read fully
              into a NumPy array (so it stays valid after the file is closed),
            * ``y_onehotenc_cat`` is the fitted ``OrdinalEncoder`` (its
              ``categories_`` attribute maps class indices back to labels),
            * ``y_onehotenc_values`` is ``y`` transformed by that encoder,
            * ``POMFinder`` is the ``xgb.XGBClassifier`` with the stored
              model loaded.

    Raises:
        KeyError: if the database file has no ``'y'`` dataset.
        OSError: if the database file cannot be opened.
    """
    # Get file paths
    DataBase_path = "Backend/POMFinder_443structures_100Dataset_per_Structure_xPDF_hypercube_sampling_Grmax_Name.h5"
    POMFinder_path = "Backend/XGBoost_443structures_100PDFperStructure_xPDF_hypercube_sampling_Grmax.model"

    # Import the Database. The labels are copied into memory inside the
    # ``with`` block so the HDF5 handle is always released.
    with h5py.File(DataBase_path, "r") as hf_name:
        y = np.array(hf_name["y"])

    # A single fit_transform both fits the encoder and yields the encoded
    # labels; the fitted encoder itself is what callers use for categories_.
    enc = OrdinalEncoder()
    y_onehotenc_values = enc.fit_transform(y)
    y_onehotenc_cat = enc

    # Import POMFinder
    POMFinder = xgb.XGBClassifier()
    POMFinder.load_model(POMFinder_path)
    return y, y_onehotenc_cat, y_onehotenc_values, POMFinder


def PDF_Preparation(Your_PDF_Name, Qmin, Qmax, Qdamp, rmax, nyquist):
    """Load a ``.gr`` file and turn it into the feature row fed to the model.

    The file's header is skipped, G(r) is resampled onto a 0.1-step grid from
    0 to ``rmax``, the first ten points are zeroed, the curve is divided by
    its maximum, and ``Qmin``, ``Qmax`` and ``Qdamp`` are prepended. The
    original and prepared curves are plotted and rendered with ``st.pyplot``.

    Unlike earlier versions, the input file is only read, never rewritten.

    Parameters:
        Your_PDF_Name: Path of the two-column (r, G(r)) text file.
        Qmin, Qmax, Qdamp: Scalar experimental parameters prepended to G(r).
        rmax: Upper limit of the r grid; the grid has ``rmax*10 + 1`` points.
        nyquist: If false, every 10th point of G(r) is kept.

    Returns:
        tuple: ``(r, Gr)`` with ``r`` the 0.1-step grid and ``Gr`` an array of
        shape ``(1, 3 + len(r))`` holding ``[Qmin, Qmax, Qdamp, G(r)...]``.

    Raises:
        ValueError: if no numeric data row is found, or if the data does not
            start at r = 0 and contains no point at r = 1.
    """
    with open(Your_PDF_Name, "r") as file:
        lines = file.read().splitlines(True)

    # A data row is recognised by its first three characters being
    # digit / non-digit / digit (e.g. "0.0"), on a line longer than 3 chars.
    is_number = re.compile(r'^-?\d+(?:\.\d+)?$').match
    first_data_row = None
    for idx, line in enumerate(lines):
        if (len(line) > 3
                and is_number(line[0]) is not None
                and is_number(line[1]) is None
                and is_number(line[2]) is not None):
            first_data_row = idx
            break
    if first_data_row is None:
        raise ValueError(f"No numeric data rows found in {Your_PDF_Name!r}.")

    # Parse everything from the first data row onwards, in memory.
    PDF = np.loadtxt(lines[first_data_row:], ndmin=2)

    n_points = int(round(rmax * 10)) + 1  # number of points on the 0.1-step grid

    r = PDF[:, 0]
    # Work on a copy: Gr is modified in place below and PDF must stay intact
    # for the "Original Data" curve in the plot.
    Gr = PDF[:, 1].copy()
    if r[0] != 0:  # In the case that the Data not start at 0.
        at_one = np.where(np.isclose(r, 1))[0]
        if at_one.size == 0:
            raise ValueError("Data does not start at r = 0 and has no point at r = 1.")
        Gr = Gr[at_one[0]:]  # Keep the data from r = 1 onwards
        Gr = Gr[::10]  # Nyquist sample the rest of the Data
        Gr = np.concatenate((np.zeros(10), Gr), axis=0)  # Pad the ten grid points below r = 1 with zeros
    # NOTE(review): when the data does not start at 0 and nyquist is False,
    # Gr is decimated a second time here - confirm this is intended.
    if not nyquist:
        Gr = Gr[::10]  # Pseudo Nyquist sample Data
    if len(Gr) >= n_points:
        Gr = Gr[:n_points]  # Data beyond rmax is not used.
    else:
        # Data stops short of rmax: pad with zeros up to the grid length.
        Gr = np.concatenate((Gr, np.zeros(n_points - len(Gr))), axis=0)

    Gr[:10] = 0
    # Same values as np.arange(0, rmax + 0.1, 0.1) but with a length that is
    # guaranteed to match Gr.
    r = np.arange(n_points) * 0.1
    # Normalise it to the data from the database
    # NOTE(review): a zero maximum yields NaN/inf here - confirm desired handling.
    Gr /= np.max(Gr)
    # Add experimental parameters to the Gr
    Gr = np.expand_dims(np.concatenate(([Qmin, Qmax, Qdamp], Gr), axis=0), axis=0)

    # Create a new figure object
    fig, ax = plt.subplots()

    # Plot the transformation to make sure everything is alright
    ax.plot(PDF[:, 0], PDF[:, 1], label="Original Data")
    ax.plot(r, Gr[0, 3:], label="Gr ready for ML")
    ax.legend()
    ax.set_title("Original Data vs. normalised Data")
    ax.set_xlabel("r (AA)")
    ax.set_ylabel("Gr")

    st.pyplot(fig)

    return r, Gr

def POMPredicter(POMFinder, Gr, y_onehotenc_cat):
    """Rank the model's classes for ``Gr`` and display the top five guesses.

    Parameters:
        POMFinder: Classifier exposing ``predict_proba``.
        Gr: Feature array passed straight to ``predict_proba``.
        y_onehotenc_cat: Fitted encoder; ``categories_[0]`` maps a class index
            to its label.

    Returns:
        tuple: ``(res, y_pred_proba)`` where ``y_pred_proba`` is column 1 of
        the ``predict_proba`` output and ``res`` is the list of its indices
        ordered from highest to lowest probability.
    """
    y_pred_proba = POMFinder.predict_proba(Gr)
    y_pred_proba = y_pred_proba[:, 1]
    # Ascending sort followed by a reverse (rather than reverse=True) keeps
    # the ordering of tied probabilities identical to earlier versions.
    res = sorted(range(len(y_pred_proba)), key=lambda sub: y_pred_proba[sub])
    res.reverse()

    # One message per rank; zip() stops early if there are fewer than five classes.
    ordinals = ("1st", "2nd", "3rd", "4th", "5th")
    for ordinal, idx in zip(ordinals, res):
        # The slice strips the two leading and two trailing characters of the
        # label's string form before the file name is built.
        name = str(y_onehotenc_cat.categories_[0][idx])[2:-2]
        file_name = append_cale_to_name(name) + ".xyz"
        st.markdown(
            f'<span style="font-size: 24px; color: green;">The {ordinal} guess from the model is: '
            f'<b>{file_name}</b> with a probability of {100*y_pred_proba[idx]:.2f} %</span> <hr/>',
            unsafe_allow_html=True,
        )

    return res, y_pred_proba

# Define a download button to download the file
def download_button(file_name, button_text, mime=None):
    """Show a Streamlit download button serving the contents of ``file_name``.

    Parameters:
        file_name: Path of the file to serve; also used as the download name.
        button_text: Label shown on the button.
        mime: MIME type of the download. When omitted, ``"application/zip"``
            is used for ``.zip`` files and ``"text/xyz"`` otherwise.
    """
    if mime is None:
        is_zip = str(file_name).lower().endswith(".zip")
        mime = "application/zip" if is_zip else "text/xyz"

    with open(file_name, "rb") as f:
        payload = f.read()

    st.download_button(
        label=button_text,
        data=payload,
        file_name=file_name,
        mime=mime,
    )

def create_zip_with_txt(y_onehotenc_cat, res, y_pred_proba):
    """
    Create a zip file containing the top-5 .xyz files and a txt file with model guesses.

    Parameters:
        y_onehotenc_cat: Fitted encoder; ``categories_[0]`` maps a class index to its label.
        res: Class indices ordered from most to least probable; the first five are used.
        y_pred_proba: Probability for each class index (fractions, written out as percentages).

    Returns:
        None. ``POMFinder_results.zip`` is created in the current directory.

    Raises:
        FileNotFoundError: if one of the selected .xyz files is missing from
            ``Backend/COD_ICSD_XYZs_POMs_unique99/``.
    """

    zip_file_name = "POMFinder_results.zip"
    xyz_dir = "Backend/COD_ICSD_XYZs_POMs_unique99/"
    ordinals = ("1st", "2nd", "3rd", "4th", "5th")

    # Resolve the file name of each of the top guesses once, so the archive
    # members and the text summary cannot disagree.
    guesses = []
    for ordinal, idx in zip(ordinals, res):
        name = str(y_onehotenc_cat.categories_[0][idx])[2:-2]
        # Files of labels starting with "icsd" carry an extra "cale" before the extension.
        stem = name + "cale" if name.startswith("icsd") else name
        guesses.append((ordinal, idx, stem + ".xyz"))

    # Create a zip file
    with zipfile.ZipFile(zip_file_name, 'w') as zipf:
        # Add the .xyz files
        for _, _, xyz_name in guesses:
            zipf.write(xyz_dir + xyz_name, arcname=xyz_name)
        # Create and add the txt file with model guesses
        with zipf.open("model_guesses.txt", "w") as f:
            for ordinal, idx, xyz_name in guesses:
                guess_text = (
                    f'The {ordinal} guess from the model is: {xyz_name} '
                    f'with a probability of {100*y_pred_proba[idx]:.2f} %\n'
                )
                f.write(guess_text.encode('utf-8'))

# ---------------------------------------------------------------------------
# Streamlit page: upload a .gr file, run POMFinder on it and offer the top-5
# predicted structures as a zip download, followed by citation/licence info.
# ---------------------------------------------------------------------------
st.title('POMFinder')
st.write('Welcome to POMFinder which is a tree-based supervised learning algorithm that can predict the polyoxometalate cluster from a Pair Distribution Function.')
st.write('Upload a PDF to use POMFinder to predict the structure.')

# Define the file upload widget
pdf_file = st.file_uploader("Upload PDF file in .gr format", type=["gr"])

# Experimental parameters are fixed; the commented-out widgets show the
# ranges that were once user-configurable.
Qmin = 0.7 #st.number_input("Qmin value of the experimental PDF", min_value=0.0, max_value=2.0, value=0.7)
Qmax = 30 #st.number_input("Qmax value of the experimental PDF", min_value=15.0, max_value=40.0, value=30.0)
Qdamp = 0.04 #st.number_input("Qdamp value of the experimental PDF", min_value=0.00, max_value=0.08, value=0.04)
nyquist = st.checkbox("Is the data nyquist sampled", value=False)

parser = argparse.ArgumentParser(prog='POMFinder', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# The parser defines no options and only supplies the settings namespace.
# parse_known_args() is used so that extra command-line arguments passed to
# the script do not make argparse exit the app.
args, _unknown_args = parser.parse_known_args()
args.data = "uploaded_file.gr"
args.nyquist = nyquist
args.Qmin = Qmin
args.Qmax = Qmax
args.Qdamp = Qdamp
args.file_name = "POMFinder_results.txt"

if pdf_file is None:
    st.warning("Please upload a PDF file.")
else:
    # Get the contents of the file as bytes
    file_bytes = pdf_file.read()

    # Save the contents of the file to disk, at the path the pipeline reads from
    with open(args.data, "wb") as f:
        f.write(file_bytes)

    #Predict with POMFinder
    y, y_onehotenc_cat, y_onehotenc_values, POMFinder = get_POMFinder()
    r, Gr = PDF_Preparation(args.data, args.Qmin, args.Qmax, args.Qdamp, rmax=10, nyquist=args.nyquist)
    res, y_pred_proba = POMPredicter(POMFinder, Gr, y_onehotenc_cat)

    # Download the structural database
    #download_button("COD_ICSD_XYZs_POMs_unique99.zip", "Download structural database")
    create_zip_with_txt(y_onehotenc_cat, res, y_pred_proba)
    download_button("POMFinder_results.zip", "Download top-5 predictions")

st.subheader('Cite')

st.write('If you use POMFinder, our code or results, please consider citing our paper. Thanks in advance!')

st.write('POMFinder: Identifying polyoxometalate cluster structures from pair distribution function data using explainable machine learning **2023** (https://chemrxiv.org/engage/chemrxiv/article-details/64e5fef7dd1a73847f5951b9)')

st.subheader('LICENSE')

st.write('This project is licensed under the Apache License Version 2.0, January 2004 - see the LICENSE file at https://github.com/AndySAnker/POMFinder/blob/master/LICENSE.txt for details.')
st.write("")

st.subheader('Github')
st.write('https://github.com/AndySAnker/POMFinder')

st.subheader('Questions')
st.write('ansoan@dtu.dk or etsk@chem.ku.dk')