POMFinder

Sleeping

App Files Files Community

POMFinder / app.py

AndySAnker

Update app.py

4b1230e verified almost 2 years ago

raw

history blame contribute delete

12.6 kB

	from sklearn.preprocessing import OrdinalEncoder
	import xgboost as xgb
	import numpy as np
	import matplotlib.pyplot as plt
	import argparse, h5py, os, re, zipfile
	import streamlit as st

	def append_cale_to_name(name: str) -> str:
	    """Return ``name`` with the suffix "cale" appended when it starts with "icsd".

	    Names that do not start with "icsd" are returned unchanged.
	    """
	    if name.startswith("icsd"):
	        return name + "cale"
	    return name

	def get_POMFinder():
	    """Load the structure-label database and the trained XGBoost classifier.

	    Returns:
	        A 4-tuple ``(y, y_onehotenc_cat, y_onehotenc_values, POMFinder)``:

	        * ``y`` -- the ``'y'`` dataset of the HDF5 database, read into a NumPy
	          array so that it stays usable after the file has been closed.
	        * ``y_onehotenc_cat`` -- the ``OrdinalEncoder`` fitted on ``y``.
	        * ``y_onehotenc_values`` -- ``y`` transformed by that encoder.
	        * ``POMFinder`` -- the ``XGBClassifier`` loaded from the model file.

	    Raises:
	        KeyError: if the HDF5 file has no ``'y'`` dataset.
	    """
	    # Get file paths
	    DataBase_path = "Backend/POMFinder_443structures_100Dataset_per_Structure_xPDF_hypercube_sampling_Grmax_Name.h5"
	    POMFinder_path = "Backend/XGBoost_443structures_100PDFperStructure_xPDF_hypercube_sampling_Grmax.model"

	    # Import the database. The labels are copied into memory inside the
	    # context manager so the HDF5 handle is closed instead of leaking.
	    with h5py.File(DataBase_path, "r") as hf_name:
	        y = np.array(hf_name["y"])

	    # fit_transform fits the encoder and encodes the labels in a single pass.
	    y_onehotenc_cat = OrdinalEncoder()
	    y_onehotenc_values = y_onehotenc_cat.fit_transform(y)

	    # Import POMFinder
	    POMFinder = xgb.XGBClassifier()
	    POMFinder.load_model(POMFinder_path)
	    return y, y_onehotenc_cat, y_onehotenc_values, POMFinder


	def _load_pdf_table(path):
	    """Return the numeric table stored in the text file ``path`` as a 2-D array.

	    Leading lines are treated as header and skipped until the first line whose
	    first two whitespace-separated fields both parse as floats. The file is
	    only read, never modified.

	    Raises:
	        ValueError: if the file contains no such line.
	    """
	    # errors="replace": header lines may hold characters outside the default
	    # encoding; they are skipped anyway, so they must not abort the read.
	    with open(path, "r", errors="replace") as file:
	        lines = file.read().splitlines()

	    for start, line in enumerate(lines):
	        fields = line.split()
	        if len(fields) < 2:
	            continue
	        try:
	            float(fields[0])
	            float(fields[1])
	        except ValueError:
	            continue
	        # ndmin=2 keeps column indexing valid for a single-row table.
	        return np.loadtxt(lines[start:], ndmin=2)

	    raise ValueError(f"No numeric two-column data found in {path!r}.")


	def _resample_gr(r, Gr, rmax, nyquist):
	    """Return ``Gr`` as a new array of ``int(round(rmax * 10)) + 1`` points.

	    Keeps every 10th point unless ``nyquist`` is true, truncates or zero-pads
	    to the target length, and sets the first 10 points to zero.
	    """
	    n_points = int(round(rmax * 10)) + 1
	    # Work on a copy: the caller still plots the untouched input afterwards.
	    Gr = np.array(Gr, dtype=float)

	    if r[0] != 0:  # In the case that the data does not start at 0.
	        # Tolerant comparison instead of an exact float match on r == 1.
	        at_one = np.flatnonzero(np.isclose(r, 1.0))
	        if at_one.size == 0:
	            raise ValueError("Data that does not start at r = 0 must contain a point at r = 1.")
	        Gr = Gr[at_one[0]:]  # Drop everything before r = 1
	        if not nyquist:
	            Gr = Gr[::10]  # Keep every 10th point, exactly once
	        Gr = np.concatenate((np.zeros(10), Gr))  # Ten zeros in front of the r = 1 point
	    elif not nyquist:
	        Gr = Gr[::10]  # Pseudo Nyquist sample the data

	    if len(Gr) >= n_points:
	        Gr = Gr[:n_points]  # Data beyond the target length is not used
	    else:
	        Gr = np.concatenate((Gr, np.zeros(n_points - len(Gr))))  # Too short: pad with zeros

	    Gr[:10] = 0.0
	    return Gr


	def PDF_Preparation(Your_PDF_Name, Qmin, Qmax, Qdamp, rmax, nyquist):
	    """Load a PDF file, convert it to the model input row and plot the result.

	    Args:
	        Your_PDF_Name: Path of a text file whose numeric part has r in the
	            first column and G(r) in the second. Header lines are skipped;
	            the file on disk is left unchanged.
	        Qmin, Qmax, Qdamp: Values placed in front of the G(r) points in the
	            returned row.
	        rmax: Upper end of the output grid; the row holds
	            ``int(round(rmax * 10)) + 1`` G(r) points.
	        nyquist: If false, only every 10th input point is kept.

	    Returns:
	        ``(r, Gr)`` where ``r`` is the output grid (multiples of 0.1 starting
	        at 0) and ``Gr`` has shape ``(1, 3 + len(r))``: Qmin, Qmax, Qdamp
	        followed by G(r) divided by its maximum.

	    Raises:
	        ValueError: if no numeric data is found, or if the data does not start
	            at r = 0 and contains no point at r = 1.

	    Side effects:
	        Draws the original and the prepared data with ``st.pyplot``.
	    """
	    PDF = _load_pdf_table(Your_PDF_Name)
	    Gr = _resample_gr(PDF[:, 0], PDF[:, 1], rmax, nyquist)
	    r = np.arange(len(Gr)) * 0.1

	    # Normalise it to the data from the database. An all-zero signal is left
	    # as is rather than divided by zero.
	    peak = np.max(Gr)
	    if peak != 0:
	        Gr /= peak

	    # Add experimental parameters to the Gr and make it a single-row 2-D array
	    Gr = np.concatenate(([Qmin, Qmax, Qdamp], Gr))[np.newaxis, :]

	    # Create a new figure object
	    fig, ax = plt.subplots()

	    # Plot the transformation to make sure everything is alright
	    ax.plot(PDF[:, 0], PDF[:, 1], label="Original Data")
	    ax.plot(r, Gr[0, 3:], label="Gr ready for ML")
	    ax.legend()
	    ax.set_title("Original Data vs. normalised Data")
	    ax.set_xlabel("r (AA)")
	    ax.set_ylabel("Gr")

	    st.pyplot(fig)

	    return r, Gr

	def POMPredicter(POMFinder, Gr, y_onehotenc_cat):
	    """Rank the candidate structures for ``Gr`` and display the top five.

	    Args:
	        POMFinder: Classifier exposing ``predict_proba``.
	        Gr: Model input passed unchanged to ``predict_proba``.
	        y_onehotenc_cat: Fitted encoder; ``categories_[0]`` maps a class index
	            to the stored structure label.

	    Returns:
	        ``(res, y_pred_proba)`` where ``res`` is the list of class indices
	        ordered from most to least probable and ``y_pred_proba`` holds the
	        probability of each class index.

	    Side effects:
	        Writes one ``st.markdown`` line per guess, for at most five guesses.
	    """
	    # NOTE(review): column 1 is used as the per-structure probability, which
	    # presumes predict_proba yields one row per candidate structure here --
	    # confirm against the installed xgboost version.
	    y_pred_proba = POMFinder.predict_proba(Gr)[:, 1]

	    # Ascending sort followed by a reversal gives descending probability.
	    res = sorted(range(len(y_pred_proba)), key=lambda sub: y_pred_proba[sub])
	    res.reverse()

	    categories = y_onehotenc_cat.categories_[0]
	    ordinals = ("1st", "2nd", "3rd", "4th", "5th")
	    # zip stops at the shorter sequence, so fewer than five candidates is fine.
	    for ordinal, index in zip(ordinals, res):
	        name = str(categories[index])[2:-2]
	        # Structure files of "icsd" entries carry an extra "cale" suffix.
	        file_name = name + ("cale.xyz" if name.startswith("icsd") else ".xyz")
	        st.markdown(f'<span style="font-size: 24px; color: green;">The {ordinal} guess from the model is: <b>{file_name}</b> with a probability of {100*y_pred_proba[index]:.2f} %</span> <hr/>',unsafe_allow_html=True)

	    return res, y_pred_proba

	# Define a download button to download the file
	def download_button(file_name, button_text, mime=None):
	    """Show a Streamlit download button that serves the file ``file_name``.

	    Args:
	        file_name: Path of the file to read; also used as the download name.
	        button_text: Label shown on the button.
	        mime: MIME type of the download. When omitted it is guessed from
	            ``file_name`` and falls back to "text/xyz" if no guess is possible.
	    """
	    import mimetypes

	    if mime is None:
	        mime = mimetypes.guess_type(file_name)[0] or "text/xyz"

	    with open(file_name, "rb") as f:
	        payload = f.read()

	    st.download_button(
	        label=button_text,
	        data=payload,
	        file_name=file_name,
	        mime=mime,
	    )

	def create_zip_with_txt(y_onehotenc_cat, res, y_pred_proba):
	    """
	    Create a zip file containing the top .xyz files and a txt file with model guesses.

	    Parameters:
	    y_onehotenc_cat: Fitted encoder; ``categories_[0]`` maps a class index to
	        the stored structure label.
	    res: Class indices ordered from most to least probable. At most the first
	        five are used.
	    y_pred_proba: Probability of each class index, as a fraction; it is
	        written to the txt file as a percentage.

	    Returns:
	    None. "POMFinder_results.zip" is created in the current directory.
	    """

	    zip_file_name = "POMFinder_results.zip"
	    xyz_dir = "Backend/COD_ICSD_XYZs_POMs_unique99/"
	    ordinals = ("1st", "2nd", "3rd", "4th", "5th")
	    categories = y_onehotenc_cat.categories_[0]

	    # Resolve each guess once; zip stops at the shorter sequence, so fewer
	    # than five candidates is fine.
	    guesses = []
	    for ordinal, index in zip(ordinals, res):
	        name = str(categories[index])[2:-2]
	        # Structure files of "icsd" entries carry an extra "cale" suffix.
	        xyz_name = name + ("cale.xyz" if name.startswith("icsd") else ".xyz")
	        guesses.append((ordinal, xyz_name, y_pred_proba[index]))

	    # Create a zip file
	    with zipfile.ZipFile(zip_file_name, 'w') as zipf:
	        # Add the .xyz files
	        for _, xyz_name, _ in guesses:
	            zipf.write(xyz_dir + xyz_name, arcname=xyz_name)
	        # Create and add the txt file with model guesses
	        with zipf.open("model_guesses.txt", "w") as f:
	            for ordinal, xyz_name, probability in guesses:
	                guess_text = f'The {ordinal} guess from the model is: {xyz_name} with a probability of {100*probability:.2f} %\n'
	                f.write(guess_text.encode('utf-8'))

	# ---- Streamlit page: upload a PDF, run the prediction, offer the results ----
	st.title('POMFinder')
	st.write('Welcome to POMFinder which is a tree-based supervised learning algorithm that can predict the polyoxometalate cluster from a Pair Distribution Function.')
	st.write('Upload a PDF to use POMFinder to predict the structure.')

	# Define the file upload widget
	pdf_file = st.file_uploader("Upload PDF file in .gr format", type=["gr"])

	# Experimental parameters are fixed; the trailing comments keep the input
	# widgets that would make them user-adjustable.
	Qmin = 0.7 #st.number_input("Qmin value of the experimental PDF", min_value=0.0, max_value=2.0, value=0.7)
	Qmax = 30 #st.number_input("Qmax value of the experimental PDF", min_value=15.0, max_value=40.0, value=30.0)
	Qdamp = 0.04 #st.number_input("Qdamp value of the experimental PDF", min_value=0.00, max_value=0.08, value=0.04)
	nyquist = st.checkbox("Is the data nyquist sampled", value=False)

	# The parser defines no options; its namespace is only used as a settings
	# container. parse_known_args ignores any extra command-line arguments
	# instead of terminating the app on them.
	parser = argparse.ArgumentParser(prog='POMFinder', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	args, _unknown_args = parser.parse_known_args()
	args.data = "uploaded_file.gr"
	args.nyquist = nyquist
	args.Qmin = Qmin
	args.Qmax = Qmax
	args.Qdamp = Qdamp
	args.file_name = "POMFinder_results.txt"

	if pdf_file is None:
	    st.warning("Please upload a PDF file.")
	else:
	    # Get the contents of the file as bytes
	    file_bytes = pdf_file.read()

	    # Save the contents of the file to disk, at the path the pipeline reads
	    with open(args.data, "wb") as f:
	        f.write(file_bytes)

	    # Predict with POMFinder
	    y, y_onehotenc_cat, y_onehotenc_values, POMFinder = get_POMFinder()
	    r, Gr = PDF_Preparation(args.data, args.Qmin, args.Qmax, args.Qdamp, rmax=10, nyquist=args.nyquist)
	    res, y_pred_proba = POMPredicter(POMFinder, Gr, y_onehotenc_cat)

	    # Bundle the top predictions and offer them for download
	    create_zip_with_txt(y_onehotenc_cat, res, y_pred_proba)
	    download_button("POMFinder_results.zip", "Download top-5 predictions")

	st.subheader('Cite')

	st.write('If you use POMFinder, our code or results, please consider citing our paper. Thanks in advance!')

	st.write('POMFinder: Identifying polyoxometalate cluster structures from pair distribution function data using explainable machine learning 2023 (https://chemrxiv.org/engage/chemrxiv/article-details/64e5fef7dd1a73847f5951b9)')

	st.subheader('LICENSE')

	st.write('This project is licensed under the Apache License Version 2.0, January 2004 - see the LICENSE file at https://github.com/AndySAnker/POMFinder/blob/master/LICENSE.txt for details.')
	st.write("")

	st.subheader('Github')
	st.write('https://github.com/AndySAnker/POMFinder')

	st.subheader('Questions')
	st.write('ansoan@dtu.dk or etsk@chem.ku.dk')