# -*- coding: utf-8 -*-
"""
Created on Mon Sep  4 10:38:59 2023

@author: BM109X32G-10GPU-02

Helpers for tokenising SMILES strings, one-hot encoding them against a
vocabulary stored as JSON, and running a pickled predictor on the result.
"""

import json
import math
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit import Chem
from scipy import sparse
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import (
    median_absolute_error,
    r2_score,
    mean_absolute_error,
    mean_squared_error,
)
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from tensorflow.keras import metrics, optimizers
from tensorflow.keras.callbacks import (
    ModelCheckpoint,
    EarlyStopping,
    ReduceLROnPlateau,
)
from tensorflow.keras.layers import (
    Dense,
    Input,
    Flatten,
    Conv1D,
    MaxPooling1D,
    concatenate,
)
from tensorflow.keras.models import Model, load_model
from tqdm import tqdm


def split_smiles(smiles, kekuleSmiles=True):
    """Split a SMILES string into a list of tokens.

    The string is first re-written through RDKit
    (``Chem.MolFromSmiles`` then ``Chem.MolToSmiles`` with the given
    ``kekuleSmiles`` flag). If that fails for any reason the input string is
    tokenised as given.

    Tokenisation rule: an uppercase character immediately followed by a
    lowercase character other than ``"c"`` forms one two-character token;
    every other character is a token of its own.

    Args:
        smiles: SMILES string to tokenise.
        kekuleSmiles: Forwarded to ``Chem.MolToSmiles``.

    Returns:
        List of string tokens (empty for an empty string).
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        smiles = Chem.MolToSmiles(mol, kekuleSmiles=kekuleSmiles)
    except Exception:
        # Deliberate best-effort: when RDKit cannot round-trip the input,
        # fall back to tokenising the raw string instead of failing.
        pass

    tokens = []
    last = len(smiles) - 1
    for j, ch in enumerate(smiles):
        # Second half of a two-character token that was already emitted
        # together with the preceding uppercase character.
        if j > 0 and ch.islower() and ch != "c" and smiles[j - 1].isupper():
            continue
        if j < last and ch.isupper():
            nxt = smiles[j + 1]
            if nxt.islower() and nxt != "c":
                tokens.append(ch + nxt)
                continue
        tokens.append(ch)
    return tokens


def get_maxlen(all_smiles, kekuleSmiles=True):
    """Return the largest token count over ``all_smiles`` (0 if empty).

    Args:
        all_smiles: Iterable of SMILES strings.
        kekuleSmiles: Forwarded to :func:`split_smiles`.
    """
    return max(
        (
            len(split_smiles(smi, kekuleSmiles=kekuleSmiles))
            for smi in tqdm(all_smiles)
        ),
        default=0,
    )


def get_dict(all_smiles, save_path, kekuleSmiles=True):
    """Build the token vocabulary of ``all_smiles`` and save it as JSON.

    The vocabulary always starts with the single-space token ``' '``;
    further tokens follow in order of first appearance.

    Args:
        all_smiles: Iterable of SMILES strings.
        save_path: Path of the JSON file to write.
        kekuleSmiles: Forwarded to :func:`split_smiles`.

    Returns:
        The vocabulary as a list of strings.
    """
    words = [' ']
    seen = set(words)  # O(1) membership test; ``words`` keeps the order
    for smi in tqdm(all_smiles):
        for token in split_smiles(smi, kekuleSmiles=kekuleSmiles):
            if token not in seen:
                seen.add(token)
                words.append(token)
    with open(save_path, 'w', encoding='utf-8') as js:
        json.dump(words, js)
    return words


def one_hot_coding(smi, words, kekuleSmiles=True, max_len=1000):
    """One-hot encode a SMILES string against the vocabulary ``words``.

    Row ``j`` of the result corresponds to the ``j``-th token of ``smi`` and
    column ``k`` to ``words[k]``. Tokens beyond ``max_len`` are dropped and
    tokens that are not in ``words`` leave their row all-zero.

    Args:
        smi: SMILES string to encode.
        words: Vocabulary (sequence of token strings).
        kekuleSmiles: Forwarded to :func:`split_smiles`.
        max_len: Number of rows of the output matrix.

    Returns:
        ``scipy.sparse.csr_matrix`` of shape ``(max_len, len(words))``.
    """
    # Map each token to its first position, matching ``list.index``.
    index_of = {}
    for position, word in enumerate(words):
        index_of.setdefault(word, position)

    rows = []
    cols = []
    for j, token in enumerate(split_smiles(smi, kekuleSmiles=kekuleSmiles)):
        if j >= max_len:
            break
        k = index_of.get(token)
        if k is None:
            continue  # token not in the vocabulary
        rows.append(j)
        cols.append(k)

    data = np.repeat(1, len(rows))
    return sparse.csr_matrix(
        (data, (rows, cols)), shape=(max_len, len(words))
    )


def split_dataset(dataset, ratio):
    """Split ``dataset`` into two consecutive parts without shuffling.

    Args:
        dataset: Sliceable sequence.
        ratio: Fraction of the items that goes into the first part.

    Returns:
        Tuple ``(dataset[:n], dataset[n:])`` with
        ``n = int(ratio * len(dataset))``.
    """
    n = int(ratio * len(dataset))
    return dataset[:n], dataset[n:]


def plot_confusion_matrix(cm, savename, title='Confusion Matrix',
                          classes=None):
    """Draw a confusion matrix, save it as PNG and show it.

    Args:
        cm: 2-D confusion matrix (array-like), indexed ``cm[actual][predicted]``
            as far as the axis labels of the plot are concerned.
        savename: Output path passed to ``plt.savefig``.
        title: Figure title.
        classes: Tick labels. When omitted, a module-level ``classes``
            variable is used if one exists (the behaviour this function
            previously relied on); otherwise the class indices are used.
    """
    cm = np.asarray(cm)
    n_rows, n_cols = cm.shape
    if classes is None:
        classes = globals().get("classes")
    if classes is None:
        classes = [str(i) for i in range(n_rows)]

    plt.figure(figsize=(12, 8), dpi=100)
    np.set_printoptions(precision=2)

    # Annotate every cell whose value is large enough to be worth printing.
    for y_val in range(n_rows):
        for x_val in range(n_cols):
            c = cm[y_val, x_val]
            if c > 0.001:
                plt.text(x_val, y_val, f"{c:0.2f}", color='red',
                         fontsize=15, va='center', ha='center')

    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.binary)
    plt.title(title)
    plt.colorbar()

    xlocations = np.arange(len(classes))
    plt.xticks(xlocations, classes, rotation=90)
    plt.yticks(xlocations, classes)
    plt.ylabel('Actual label')
    plt.xlabel('Predict label')

    # Minor ticks offset by half a cell so the grid runs between cells.
    tick_marks = xlocations + 0.5
    axes = plt.gca()
    axes.set_xticks(tick_marks, minor=True)
    axes.set_yticks(tick_marks, minor=True)
    axes.xaxis.set_ticks_position('none')
    axes.yaxis.set_ticks_position('none')
    plt.grid(True, which='minor', linestyle='-')
    plt.gcf().subplots_adjust(bottom=0.15)

    plt.savefig(savename, format='png')
    plt.show()


def main(sm, dict_path="dict.json", model_path="predict.dat", max_len=600):
    """Predict with the pickled model for a single SMILES string.

    Args:
        sm: SMILES string of the molecule.
        dict_path: JSON vocabulary file (as written by :func:`get_dict`).
        model_path: Pickled predictor exposing a ``predict`` method.
        max_len: Number of token rows used for the one-hot encoding.

    Returns:
        Whatever ``predict`` of the loaded model returns for the one sample.

    Raises:
        ValueError: If RDKit cannot parse ``sm``.
    """
    from tensorflow.keras import backend as K

    with open(dict_path, "r", encoding="utf-8") as f:
        words = json.load(f)

    mol = Chem.MolFromSmiles(sm)
    if mol is None:
        raise ValueError(f"Could not parse SMILES: {sm!r}")
    smi = Chem.MolToSmiles(mol)

    encoded = one_hot_coding(smi, words, max_len=max_len)
    # Shape (1, max_len, len(words)): a batch holding the single sample.
    features = np.asarray([encoded.toarray()])

    # SECURITY: unpickling executes arbitrary code; only load a model file
    # that comes from a trusted source.
    with open(model_path, "rb") as fh:
        predictor = pickle.load(fh)

    # The predictor is fed one flat feature vector per sample.
    flat = K.cast_to_floatx(features).reshape((features.shape[0], -1))
    return predictor.predict(flat)


if __name__ == "__main__":
    # Raw string: the SMILES contains backslashes (stereo bonds).
    x = main(r"CCCCCCC1=CC=C(C2(C3=CC=C(CCCCCC)C=C3)C3=CC4=C(C=C3C3=C2C=C(/C=C2\SC(=S)N(CC)C2=O)S3)C(C2=CC=C(CCCCCC)C=C2)(C2=CC=C(CCCCCC)C=C2)C2=C4SC(/C=C3\SC(=S)N(CC)C3=O)=C2)C=C1")