# -*- coding: utf-8 -*-
"""
Created on Mon Sep  4 10:38:59 2023

@author: BM109X32G-10GPU-02
"""
import json
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy import sparse
from rdkit import Chem

from sklearn.metrics import confusion_matrix
from sklearn.metrics import median_absolute_error, r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# The Keras imports support the (currently commented-out) CNN model path in main().
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense, Input, Flatten, Conv1D, MaxPooling1D, concatenate
from tensorflow.keras import metrics, optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
def split_smiles(smiles, kekuleSmiles=True):
    """Tokenize a SMILES string, keeping two-letter element symbols (e.g. Cl, Br) together."""
    try:
        mol = Chem.MolFromSmiles(smiles)
        smiles = Chem.MolToSmiles(mol, kekuleSmiles=kekuleSmiles)
    except Exception:
        pass  # fall back to tokenizing the raw input string
    splitted_smiles = []
    for j, k in enumerate(smiles):
        if len(smiles) == 1:
            return [smiles]
        if j == 0:
            # First character: merge with the next one if it completes a two-letter element.
            if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
                splitted_smiles.append(k + smiles[j + 1])
            else:
                splitted_smiles.append(k)
        elif j < len(smiles) - 1:
            if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
                splitted_smiles.append(k + smiles[j + 1])
            elif k.islower() and smiles[j - 1].isupper() and k != "c":
                pass  # already consumed as the second letter of a two-letter element
            else:
                splitted_smiles.append(k)
        else:  # last character
            if k.islower() and smiles[j - 1].isupper() and k != "c":
                pass
            else:
                splitted_smiles.append(k)
    return splitted_smiles
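
# A quick sanity check of the tokenizer (a sketch; the molecules are arbitrary examples):
#   split_smiles("CCl")      -> ["C", "Cl"]   (two-letter element kept as one token)
#   split_smiles("c1ccccc1") -> typically kekulized to "C1=CC=CC=C1" first, giving
#                               ["C", "1", "=", "C", "C", "=", "C", "C", "=", "C", "1"]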
def get_maxlen(all_smiles, kekuleSmiles=True):
    """Return the length of the longest tokenized SMILES in the corpus."""
    maxlen = 0
    for smi in tqdm(all_smiles):
        spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
        if spt is None:
            continue
        maxlen = max(maxlen, len(spt))
    return maxlen
def get_dict(all_smiles, save_path, kekuleSmiles=True):
    """Build the token vocabulary (index 0 is the padding token ' ') and save it as JSON."""
    words = [' ']
    for smi in tqdm(all_smiles):
        spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
        if spt is None:
            continue
        for w in spt:
            if w not in words:
                words.append(w)
    with open(save_path, 'w') as js:
        json.dump(words, js)
    return words
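
# Sketch of vocabulary construction ("dict.json" is the file name main() expects;
# the SMILES list is an arbitrary example):
#   words = get_dict(["CCO", "c1ccccc1"], "dict.json")
#   # -> [' ', 'C', 'O', '1', '='] or similar, depending on kekulization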
def one_hot_coding(smi, words, kekuleSmiles=True, max_len=1000):
    """Encode a SMILES string as a sparse (max_len, vocab_size) one-hot matrix."""
    coord_j = []
    coord_k = []
    spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
    if spt is None:
        return None
    for j, w in enumerate(spt):
        if j >= max_len:
            break
        try:
            k = words.index(w)
        except ValueError:
            continue  # token not in the vocabulary: leave an all-zero row
        coord_j.append(j)
        coord_k.append(k)
    data = np.repeat(1, len(coord_j))
    output = sparse.csr_matrix((data, (coord_j, coord_k)), shape=(max_len, len(words)))
    return output
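
# Example encoding (a sketch, reusing `words` from get_dict above):
#   x = one_hot_coding("CCO", words, max_len=600)
#   x.shape      -> (600, len(words))   # one row per token position, zero-padded
#   x.todense()  -> dense 0/1 matrix suitable for stacking into a feature tensor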
def split_dataset(dataset, ratio):
    """Split a dataset at the given ratio (shuffling is left commented out)."""
    # np.random.seed(111)  # fix the seed for shuffle.
    # np.random.shuffle(dataset)
    n = int(ratio * len(dataset))
    return dataset[:n], dataset[n:]
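
# With ratio=0.9, the first 90% of rows form the first split:
#   train, test = split_dataset(df, 0.9)   # df is any indexable dataset, e.g. a DataFrame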
def plot_confusion_matrix(cm, savename, classes, title='Confusion Matrix'):
    """Plot the confusion matrix `cm` for the given class labels and save it as a PNG."""
    plt.figure(figsize=(12, 8), dpi=100)
    np.set_printoptions(precision=2)
    # Write each cell's value at its grid position.
    ind_array = np.arange(len(classes))
    x, y = np.meshgrid(ind_array, ind_array)
    for x_val, y_val in zip(x.flatten(), y.flatten()):
        c = cm[y_val][x_val]
        if c > 0.001:
            plt.text(x_val, y_val, "%0.2f" % (c,), color='red', fontsize=15, va='center', ha='center')
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.binary)
    plt.title(title)
    plt.colorbar()
    xlocations = np.array(range(len(classes)))
    plt.xticks(xlocations, classes, rotation=90)
    plt.yticks(xlocations, classes)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    # Offset the minor ticks so grid lines fall between cells.
    tick_marks = np.array(range(len(classes))) + 0.5
    plt.gca().set_xticks(tick_marks, minor=True)
    plt.gca().set_yticks(tick_marks, minor=True)
    plt.gca().xaxis.set_ticks_position('none')
    plt.gca().yaxis.set_ticks_position('none')
    plt.grid(True, which='minor', linestyle='-')
    plt.gcf().subplots_adjust(bottom=0.15)
    # Save, then show, the confusion matrix.
    plt.savefig(savename, format='png')
    plt.show()
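
# Hypothetical usage (labels and file name are made up for illustration):
#   cm = confusion_matrix(y_true, y_pred)
#   plot_confusion_matrix(cm, "cm.png", classes=["drug", "non-drug"])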
def main(sm):
    """Predict a property for a single SMILES string with the pickled model."""
    with open("dict.json", "r", encoding="utf-8") as f:
        words = json.load(f)
    inchis = [sm]
    rts = [0]
    smiles, targets = [], []
    # Canonicalize the input and drop anything RDKit cannot parse.
    for i, inc in enumerate(tqdm(inchis)):
        mol = Chem.MolFromSmiles(inc)
        if mol is None:
            continue
        smi = Chem.MolToSmiles(mol)
        smiles.append(smi)
        targets.append(rts[i])
    features = []
    for smi in tqdm(smiles):
        xi = one_hot_coding(smi, words, max_len=600)
        if xi is not None:
            features.append(xi.todense())
    features = np.asarray(features)
    targets = np.asarray(targets)
    X_test = features
    Y_test = targets
    # model = RandomForestRegressor(n_estimators=100, criterion='friedman_mse')  # unused here; the fitted model is loaded from predict.dat below
    with open(r"predict.dat", "rb") as mf:
        model = pickle.load(mf)
    # model = load_model('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5')
    # Flatten each (600, vocab) one-hot matrix into a single feature vector per molecule.
    X_flat = K.cast_to_floatx(X_test).reshape((np.size(X_test, 0), np.size(X_test, 1) * np.size(X_test, 2)))
    Y_predict = model.predict(X_flat)
    # Y_predict = model.predict(X_test)
    return Y_predict
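
# Example call (a sketch; "dict.json" and "predict.dat" must sit next to this script,
# and the SMILES below is an arbitrary test molecule):
#   score = main("CC(=O)Oc1ccccc1C(=O)O")   # aspirin
#   print(score)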
def edit_dataset(drug, non_drug, task):
    """Balance the two classes and build train/test splits (`task` is currently unused)."""
    # np.random.seed(111)  # fix the seed for shuffle.
    # np.random.shuffle(non_drug)
    # Down-sample the non-drug class to the size of the drug class.
    non_drug = non_drug[0:len(drug)]
    dataset_train_drug, dataset_test_drug = split_dataset(drug, 0.9)
    # dataset_train_drug, dataset_dev_drug = split_dataset(dataset_train_drug, 0.9)
    dataset_train_no, dataset_test_no = split_dataset(non_drug, 0.9)
    # dataset_train_no, dataset_dev_no = split_dataset(dataset_train_no, 0.9)
    dataset_train = pd.concat([dataset_train_drug, dataset_train_no], axis=0)
    dataset_test = pd.concat([dataset_test_drug, dataset_test_no], axis=0)
    # dataset_dev = dataset_dev_drug + dataset_dev_no
    return dataset_train, dataset_test
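
# Hypothetical usage with two pandas DataFrames of drug and non-drug molecules
# (the `task` argument is accepted but unused):
#   train_df, test_df = edit_dataset(drug_df, non_drug_df, task="classification")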