Spaces:
Sleeping
Sleeping
# -*- coding: utf-8 -*- | |
""" | |
Created on Mon Jun 8 09:32:26 2020 | |
@author: Muammer | |
""" | |
import os | |
script_dir = os.path.dirname(os.path.abspath(__file__)) | |
import numpy as np | |
from sklearn.model_selection import train_test_split | |
from sklearn import linear_model | |
from sklearn.metrics import ( | |
f1_score, accuracy_score, confusion_matrix, classification_report, matthews_corrcoef | |
) | |
from sklearn.multiclass import OneVsRestClassifier | |
import pandas as pd | |
from tqdm import tqdm | |
import math | |
# Module-level settings. NOTE(review): these are presumably overwritten by a
# driver script before score_protein_rep() is called -- confirm with the caller.

# Label of the representation being scored; not read by the code in this file.
representation_name = ""
# Path of the representation CSV that score_protein_rep() loads via pd.read_csv.
representation_path = ""
# Dataset tag. NOTE(review): score_protein_rep() takes its own `dataset`
# argument and does not read this global -- presumably the caller passes it in.
dataset = "nc"
# Not read by the code in this file; presumably toggles per-class reporting
# elsewhere -- TODO confirm.
detailed_output = False
def convert_dataframe_to_multi_col(representation_dataframe):
    """Expand the list-valued 'Vector' column into one column per dimension.

    Parameters
    ----------
    representation_dataframe : pandas.DataFrame
        Frame with an 'Entry' column and a 'Vector' column whose cells are
        sequences (one embedding per row).

    Returns
    -------
    pandas.DataFrame
        One row per input row: the 'Entry' column followed by columns
        0..d-1 holding the vector components, on a fresh 0..n-1 index.
    """
    # The expanded vectors always get a fresh 0..n-1 index, so the entries
    # must be put on the same positional index before the index-based merge.
    # Without the reset, a frame with a non-default index (e.g. after
    # filtering) would lose or mispair rows in the merge.
    entry = representation_dataframe[['Entry']].reset_index(drop=True)
    vector = pd.DataFrame(list(representation_dataframe['Vector']))
    multi_col_representation_vector = pd.merge(
        left=entry, right=vector, left_index=True, right_index=True
    )
    return multi_col_representation_vector
def class_based_scores(c_report, c_matrix):
    """Build a per-class table with F1, support, accuracy and MCC.

    Parameters
    ----------
    c_report : dict
        Report that ``pd.DataFrame(c_report).transpose()`` turns into one row
        per class plus the rows 'accuracy', 'macro avg' and 'weighted avg',
        with at least the columns 'precision', 'recall' and 'support'
        (presumably the output of ``classification_report(output_dict=True)``).
    c_matrix : array-like of shape (n_classes, n_classes)
        Confusion matrix whose class order matches the class rows of
        ``c_report``. Assumed to follow scikit-learn's layout (rows = true
        class, columns = predicted class); the MCC value is the same for
        either orientation because the formula is symmetric in FP and FN.

    Returns
    -------
    pandas.DataFrame
        The class rows of the report without 'precision' and 'recall', plus
        an 'accuracy' column (diagonal of the row-normalised matrix) and a
        one-vs-rest 'mcc' column (float32).

    Raises
    ------
    ValueError
        If the supports and the matrix counts are inconsistent enough to make
        the squared MCC denominator negative.
    """
    report = pd.DataFrame(c_report).transpose()
    report = report.drop(['precision', 'recall'], axis=1)
    report = report.drop(labels=['accuracy', 'macro avg', 'weighted avg'], axis=0)

    # Work in float64: products of four integer counts overflow int64 on
    # large evaluation sets.
    counts = np.asarray(c_matrix, dtype=np.float64)
    true_pos = counts.diagonal()
    row_totals = counts.sum(axis=1)
    col_totals = counts.sum(axis=0)

    # A row that sums to zero has no defined accuracy; it stays NaN, but
    # without emitting a runtime warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        per_class_accuracy = true_pos / row_totals
    report['accuracy'] = pd.Series(per_class_accuracy, index=report.index)

    total = float(report['support'].sum())
    false_neg = row_totals - true_pos
    false_pos = col_totals - true_pos
    # Truncation mirrors the integer conversion of the true-negative count.
    true_neg = np.trunc(total - true_pos - false_pos - false_neg)

    numerator = true_pos * true_neg - false_pos * false_neg
    denominator_sq = (
        (true_pos + false_pos)
        * (true_pos + false_neg)
        * (true_neg + false_pos)
        * (true_neg + false_neg)
    )
    if np.any(denominator_sq < 0):
        raise ValueError(
            "report supports are inconsistent with the confusion matrix counts"
        )

    # MCC is undefined when any marginal is empty; report 0 in that case,
    # which is the convention scikit-learn's matthews_corrcoef uses.
    mcc = np.zeros(counts.shape[0], dtype='float32')
    defined = denominator_sq > 0
    mcc[defined] = numerator[defined] / np.sqrt(denominator_sq[defined])
    report['mcc'] = pd.Series(mcc, index=report.index)
    return report
def score_protein_rep(dataset):
    """Score a protein representation on drug-target protein family prediction.

    Reads the protein/class list and the fold index files relative to the
    module-level ``script_dir``, and the representation vectors from the
    module-level ``representation_path``. For each of 10 folds it trains a
    one-vs-rest, class-balanced elastic-net SGD classifier (logistic loss)
    and evaluates it on that fold's test indices.

    Parameters
    ----------
    dataset : str
        Prefix selecting the training index file
        ``../data/preprocess/indexes/<dataset>_trainindex.csv``.

    Returns
    -------
    dict
        ``{"f1": [...], "accuracy": [...], "mcc": [...]}`` with one value per
        fold: weighted F1, accuracy and Matthews correlation coefficient.

    Raises
    ------
    ValueError
        If a protein matches more than one row of the representation file.
    """
    protein_list = pd.read_csv(os.path.join(script_dir, '../data/preprocess/entry_class_nn.csv'))
    dataframe = pd.read_csv(representation_path)
    vecsize = dataframe.shape[1] - 1

    # Build the Entry -> row-position lookup once instead of scanning the
    # whole representation table for every protein.
    feature_frame = dataframe.loc[:, dataframe.columns != 'Entry']
    rows_by_entry = dataframe.groupby('Entry', sort=False).indices

    # Preallocate the feature matrix; proteins without a representation keep
    # an all-zero row and are excluded from every fold through `ne`.
    x = np.zeros((len(protein_list), vecsize), dtype=np.float64)
    y = []
    ne = []
    print("\n\nPreprocessing data for drug-target protein family prediction...\n ")
    rows = tqdm(protein_list.iterrows(), total=len(protein_list))
    for position, (index, row) in enumerate(rows):
        matches = rows_by_entry.get(row['Entry'])
        if matches is None:
            ne.append(index)
            y.append(0.0)  # placeholder label, never used for training/testing
            continue
        if len(matches) != 1:
            raise ValueError(
                "expected exactly one representation row for entry %r, found %d"
                % (row['Entry'], len(matches))
            )
        x[position] = np.array(feature_frame.iloc[matches[0]])
        y.append(row['Class'])
    y = np.array(y)
    y = y.astype(np.float64)

    train_index = pd.read_csv(os.path.join(script_dir, '../data/preprocess/indexes/' + dataset + '_trainindex.csv'))
    test_index = pd.read_csv(os.path.join(script_dir, '../data/preprocess/indexes/testindex_family.csv'))
    train_index = train_index.dropna(axis=1)
    test_index = test_index.dropna(axis=1)

    f1 = []
    accuracy = []
    mcc = []
    print('Producing protein family predictions...\n')
    for i in tqdm(range(10)):
        # random_state=i makes each fold reproducible but differently seeded.
        clf = linear_model.SGDClassifier(class_weight="balanced", loss="log_loss", penalty="elasticnet", max_iter=1000, tol=1e-3, random_state=i, n_jobs=-1)
        clf2 = OneVsRestClassifier(clf, n_jobs=-1)

        # Row i of each index file lists the sample positions of fold i.
        train_indexx = train_index.iloc[i].astype(int)
        test_indexx = test_index.iloc[i].astype(int)
        # Drop proteins that have no representation vector.
        train_indexx = train_indexx[~train_indexx.isin(ne)].to_numpy()
        test_indexx = test_indexx[~test_indexx.isin(ne)].to_numpy()

        train_X, test_X = x[train_indexx], x[test_indexx]
        train_y, test_y = y[train_indexx], y[test_indexx]

        clf2.fit(train_X, train_y)
        y_pred = clf2.predict(test_X)

        f1.append(f1_score(test_y, y_pred, average='weighted'))
        accuracy.append(accuracy_score(test_y, y_pred))
        mcc.append(matthews_corrcoef(test_y, y_pred))

    results = {
        "f1": f1,
        "accuracy": accuracy,
        "mcc": mcc,
    }
    return results