File size: 5,847 Bytes
fba5f30
 
ce48cf1
fba5f30
 
9000099
ce48cf1
49f079e
ce48cf1
 
 
 
 
 
 
 
fba5f30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce48cf1
fba5f30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce48cf1
fba5f30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce48cf1
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from sklearn.preprocessing import OrdinalEncoder
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
import argparse, h5py, os, re, pkg_resources
import streamlit as st

# --- Streamlit front-end -----------------------------------------------------
st.title('POMFinder')

# Introductory text. The previous wording described a different tool
# (DeepStruc); it now matches the title and the XGBoost classifier used below.
st.write('Welcome to POMFinder, which uses a trained XGBoost classifier to predict the structures that best match a PDF!')
st.write('Upload a PDF to use POMFinder to predict the structure.')

# File upload widget; only files with the ".gr" extension are accepted.
pdf_file = st.file_uploader("Upload PDF file in .gr format", type=["gr"])

# Qmin input. Streamlit requires min_value, max_value and value to share one
# numeric type, so the bounds are floats to match the 0.7 default (mixing
# int bounds with a float default raises an exception when the page renders).
# NOTE(review): the variable holds the Qmin value despite its name; the name
# is kept unchanged in case other code refers to it.
num_structures = st.number_input("Qmin value", min_value=0.0, max_value=2.0, value=0.7)

# Command-line interface: two required named arguments (input data and the
# Nyquist flag) followed by the optional experimental parameters.
parser = argparse.ArgumentParser(
    prog='POMFinder',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

requiredNamed = parser.add_argument_group('required named arguments')
requiredNamed.add_argument(
    "-d", "--data",
    required=True, type=str, default=None,
    help="a directory of PDFs or a file.",
)
requiredNamed.add_argument(
    "-n", "--nyquist",
    required=True, type=str, default="No",
    help="is the data nyquist sampled",
)

# (short flag, long flag, default, type, help) for each optional argument,
# listed in the order they are registered and shown in --help.
_OPTIONAL_ARGUMENTS = (
    ("-i", "--Qmin", 0.7, float, "Qmin value of the experimental PDF"),
    ("-a", "--Qmax", 30, float, "Qmax value of the experimental PDF"),
    ("-f", "--file_name", '', str, "Name of the output file"),
    ("-m", "--Qdamp", 0.04, float, "Qdamp value of the experimental PDF"),
)
for _short, _long, _default, _type, _help in _OPTIONAL_ARGUMENTS:
    parser.add_argument(_short, _long, default=_default, type=_type, help=_help)

def main(args=None):
    """Run the command-line pipeline: parse options, load the model, predict.

    Args:
        args: Optional list of argument strings handed to the module-level
            ``parser``; ``None`` lets argparse read ``sys.argv``.

    Returns:
        None. The ranked guesses are printed by ``POMPredicter``.
    """
    options = parser.parse_args(args=args)

    # Only the fitted encoder and the classifier are needed further down;
    # the remaining return values are unpacked but not used here.
    labels, label_encoder, encoded_labels, model = get_POMFinder()

    # rmax is fixed at 10 for the prepared signal.
    grid, features = PDF_Preparation(
        options.data,
        options.Qmin,
        options.Qmax,
        options.Qdamp,
        rmax=10,
        nyquist=options.nyquist,
    )

    ranking, probabilities = POMPredicter(model, features, label_encoder)



def get_POMFinder():
    """Load the label database and the trained classifier from ``Backend/``.

    Returns:
        tuple: ``(y, y_onehotenc_cat, y_onehotenc_values, POMFinder)`` where

        * ``y`` is the content of the ``'y'`` dataset of the HDF5 database,
          read into a NumPy array so the file can be closed,
        * ``y_onehotenc_cat`` is the ``OrdinalEncoder`` fitted on ``y``,
        * ``y_onehotenc_values`` is the encoder output for ``y``,
        * ``POMFinder`` is an ``xgb.XGBClassifier`` with the stored model
          loaded.
    """
    # Get file paths.
    # NOTE(review): this assumes the first entry listed in Backend/ is the
    # HDF5 database and the second is the model file. Directory listing
    # order is not guaranteed; confirm, or select the files by name.
    load_files = pkg_resources.resource_listdir(__name__, 'Backend/')
    DataBase_path = pkg_resources.resource_filename(__name__, "Backend/"+load_files[0])
    POMFinder_path = pkg_resources.resource_filename(__name__, "Backend/"+load_files[1])

    # Import the database. The dataset is copied into memory inside the
    # ``with`` block so the file handle is released instead of leaking.
    with h5py.File(DataBase_path, "r") as hf_name:
        y = np.array(hf_name.get('y'))

    # A single fit_transform both fits the encoder and encodes the labels;
    # the fitted encoder itself is what callers use to look up categories.
    enc = OrdinalEncoder()
    y_onehotenc_values = enc.fit_transform(y)
    y_onehotenc_cat = enc

    # Import POMFinder
    POMFinder = xgb.XGBClassifier()
    POMFinder.load_model(POMFinder_path)
    return y, y_onehotenc_cat, y_onehotenc_values, POMFinder


def _is_data_row(line):
    """Return True when the first two whitespace-separated fields are numbers."""
    fields = line.split()
    if len(fields) < 2:
        return False
    try:
        float(fields[0])
        float(fields[1])
    except ValueError:
        return False
    return True


def _load_pdf_table(path):
    """Read *path* and return its numeric table, skipping leading header lines."""
    with open(path, "r") as handle:
        lines = handle.readlines()
    for start, line in enumerate(lines):
        if _is_data_row(line):
            # loadtxt accepts a list of lines; ndmin=2 keeps a single data
            # row two-dimensional so column indexing still works.
            return np.loadtxt(lines[start:], ndmin=2)
    raise ValueError(
        "No numeric data rows found in %r; expected lines of 'r G(r)' values." % (Your_PDF_Name_repr(path),)
    )


def Your_PDF_Name_repr(path):
    """Return *path* as a plain string for use in error messages."""
    return str(path)


def PDF_Preparation(Your_PDF_Name, Qmin, Qmax, Qdamp, rmax, nyquist, plot=True):
    """Load a PDF file and turn it into the feature row fed to the classifier.

    The input file is only read: header lines are skipped in memory, the file
    on disk is never rewritten.

    Args:
        Your_PDF_Name: Path of a text file whose data rows start with two
            numeric columns (r and G(r)); any lines before the first such row
            are treated as header.
        Qmin, Qmax, Qdamp: Scalars placed, in this order, in front of the
            signal in the returned feature row.
        rmax: Upper end of the output grid; the signal is cut or zero-padded
            to ``round(rmax * 10) + 1`` points.
        nyquist: ``"No"`` or ``"no"`` means the data is not yet down-sampled
            and every 10th point is taken; any other value leaves the
            sampling as is.
        plot: When true, show the raw and the prepared signal with matplotlib.

    Returns:
        tuple: ``(r, Gr)`` where ``r`` is the grid ``0, 0.1, ...`` with one
        value per output point and ``Gr`` has shape ``(1, 3 + len(r))``:
        ``[Qmin, Qmax, Qdamp, signal...]``.

    Raises:
        ValueError: If the file holds no numeric rows, has fewer than two
            columns, or starts at r != 0 without a sample at r = 1.
    """
    PDF = _load_pdf_table(Your_PDF_Name)
    if PDF.shape[1] < 2:
        raise ValueError("Expected at least two columns (r, G(r)) in %r." % (str(Your_PDF_Name),))

    r = PDF[:, 0]
    # Work on a copy: the steps below modify the signal, and PDF must stay
    # intact so the "Original Data" curve really shows the original data.
    Gr = PDF[:, 1].astype(float)

    if r[0] != 0:  # In the case that the data does not start at 0.
        # Tolerant comparison instead of exact float equality on r == 1.
        at_one = np.flatnonzero(np.isclose(r, 1))
        if at_one.size == 0:
            raise ValueError("Data does not start at r = 0 and contains no sample at r = 1.")
        # Drop everything before r = 1, keep every 10th sample from there on
        # and put ten zeros in front of the kept samples.
        Gr = Gr[at_one[0]::10]
        Gr = np.concatenate((np.zeros(10), Gr), axis=0)
    # NOTE(review): when the branch above ran and nyquist is "No"/"no", the
    # signal is decimated a second time here. This matches the previous
    # behaviour and is kept as is - confirm that it is intended.
    if nyquist == "No" or nyquist == "no":
        Gr = Gr[::10]  # Nyquist sample data

    # Number of output points; previously the padding length was hard-coded
    # to 101 and therefore only correct for rmax = 10.
    n_points = int(round(rmax * 10)) + 1
    if len(Gr) >= n_points:
        Gr = Gr[:n_points]  # Data beyond rmax is not used.
    else:
        Gr = np.concatenate((Gr, np.zeros(n_points - len(Gr))), axis=0)  # Pad with zeros up to rmax.

    Gr[:10] = 0
    # Built from the point count so r always has the same length as the signal.
    r = np.arange(n_points) * 0.1

    # Normalise to a maximum of 1; an all-zero signal is left untouched
    # instead of being turned into NaNs by a division by zero.
    peak = np.max(Gr)
    if peak != 0:
        Gr = Gr / peak

    # Add experimental parameters in front of the signal and make it one row.
    Gr = np.concatenate(([Qmin, Qmax, Qdamp], Gr), axis=0)[np.newaxis, :]

    if plot:
        # Plot the transformation to make sure everything is alright
        plt.plot(PDF[:, 0], PDF[:, 1], label="Original Data")
        plt.plot(r, Gr[0, 3:], label="Gr ready for ML")
        plt.legend()
        plt.title("Original Data vs. normalised Data")
        plt.xlabel("r (AA)")
        plt.ylabel("Gr")
        plt.show()

    return r, Gr

def POMPredicter(POMFinder, Gr, y_onehotenc_cat):
    """Rank the candidates for a prepared PDF and print the best guesses.

    Args:
        POMFinder: Object with a ``predict_proba`` method (the loaded
            classifier).
        Gr: Feature array passed unchanged to ``predict_proba``.
        y_onehotenc_cat: Fitted encoder; ``categories_[0]`` is indexed with
            the ranked positions to obtain the printed names.

    Returns:
        tuple: ``(res, y_pred_proba)`` where ``y_pred_proba`` is column 1 of
        the ``predict_proba`` output and ``res`` lists its indices from the
        highest to the lowest value.
    """
    # NOTE(review): column 1 is used as the per-candidate score, which relies
    # on the layout of the array returned by the loaded model - confirm
    # against the xgboost version in use.
    y_pred_proba = POMFinder.predict_proba(Gr)[:, 1]

    # Ascending sort followed by a reversal (rather than reverse=True) keeps
    # the established order among equal scores.
    res = sorted(range(len(y_pred_proba)), key=lambda sub: y_pred_proba[sub])
    res.reverse()

    # Print up to five guesses; zip stops early when fewer candidates exist
    # instead of failing with an IndexError.
    categories = y_onehotenc_cat.categories_[0]
    for ordinal, idx in zip(("1st", "2nd", "3rd", "4th", "5th"), res):
        print(f"The {ordinal} guess from the model is: ", str(categories[idx])[2:-2]+"cale.xyz")

    return res, y_pred_proba