PROBE

Sleeping

File size: 5,153 Bytes

dd49f8a
 
 
 
 
 
37a12fb
acd43b4
 
dd49f8a
 
37a12fb
 
 
 
 
 
dd49f8a
 
 
 
 
 
 
 
 
 
 
37a12fb
dd49f8a
 
 
 
 
 
37a12fb
dd49f8a
 
 
 
 
 
 
 
 
 
 
 
 
 
37a12fb
 
dd49f8a
 
 
37a12fb
dd49f8a
 
7dcad68
dd49f8a
37a12fb
 
dd49f8a
 
 
 
 
 
 
 
 
37a12fb
dd49f8a
37a12fb
dd49f8a
 
 
 
37a12fb
dd49f8a
 
 
 
 
37a12fb
dd49f8a
 
37a12fb
dd49f8a
 
 
 
37a12fb
 
7dcad68
37a12fb
dd49f8a
 
b59bcc0
dd49f8a
 
37a12fb
b5647e6
37a12fb
 
dd49f8a
 
 
37a12fb
 
 
dd49f8a
 
 
 
37a12fb
dd49f8a
37a12fb
dd49f8a
 
37a12fb
dd49f8a
 
37a12fb
b59bcc0
 
 
dd49f8a
b59bcc0
37a12fb
 
dd49f8a
b59bcc0
dd49f8a
b59bcc0
 
 
37a12fb

# -*- coding: utf-8 -*-
"""
Created on Mon Jun  8 09:32:26 2020

@author: Muammer
"""
import os
script_dir = os.path.dirname(os.path.abspath(__file__))

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import (
    f1_score, accuracy_score, confusion_matrix, classification_report, matthews_corrcoef
)
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
from tqdm import tqdm
import math

# Module-level settings. The string values start out empty here; presumably the
# calling code assigns real values before running the benchmark (confirm against caller).

# Not read by any function in the reviewed part of this file.
representation_name = ""
# Passed straight to pd.read_csv() inside score_protein_rep().
representation_path = ""
# NOTE(review): score_protein_rep() takes its own `dataset` argument, which
# shadows this global inside the function.
dataset = "nc"
# Not read by any function in the reviewed part of this file.
detailed_output = False

def convert_dataframe_to_multi_col(representation_dataframe):
    """Expand the 'Vector' column into one column per vector element.

    Args:
        representation_dataframe: DataFrame with an 'Entry' column and a
            'Vector' column whose cells are list-like vectors.

    Returns:
        DataFrame with the 'Entry' column followed by integer-named columns
        (0, 1, ...) holding the vector elements. Rows keep the order and the
        index of ``representation_dataframe``.
    """
    # Both halves are put on a plain 0..n-1 index before merging so rows are
    # paired by position. Merging the caller's index against the fresh index
    # of the expanded vectors silently dropped or mispaired rows whenever the
    # input had been filtered, shuffled or otherwise re-indexed.
    entry = pd.DataFrame(representation_dataframe['Entry']).reset_index(drop=True)
    vector = pd.DataFrame(list(representation_dataframe['Vector']))
    multi_col_representation_vector = pd.merge(
        left=entry, right=vector, left_index=True, right_index=True
    )
    # Hand the caller's index back so downstream label-based lookups still work.
    multi_col_representation_vector.index = representation_dataframe.index
    return multi_col_representation_vector

def class_based_scores(c_report, c_matrix):
    """Build a per-class table of F1, support, accuracy and MCC.

    Args:
        c_report: Mapping convertible with ``pd.DataFrame(...)`` whose keys are
            the class names plus 'accuracy', 'macro avg' and 'weighted avg',
            and whose per-class values carry 'precision', 'recall', 'f1-score'
            and 'support' (the layout of scikit-learn's
            ``classification_report(..., output_dict=True)``).
        c_matrix: Square confusion matrix (rows = true class, columns =
            predicted class) in the same class order as ``c_report``.

    Returns:
        DataFrame indexed by class name with the columns 'f1-score',
        'support', 'accuracy' and 'mcc'. 'accuracy' is the diagonal of the
        row-normalised confusion matrix; 'mcc' is the one-vs-rest Matthews
        correlation coefficient, reported as 0.0 when it is undefined.
    """
    c_report = pd.DataFrame(c_report).transpose()
    c_report = c_report.drop(['precision', 'recall'], axis=1)
    c_report = c_report.drop(labels=['accuracy', 'macro avg', 'weighted avg'], axis=0)

    c_matrix = np.asarray(c_matrix)
    row_totals = c_matrix.sum(axis=1)
    col_totals = c_matrix.sum(axis=0)

    # Fraction of each true class that was predicted correctly.
    cm = c_matrix.astype('float') / row_totals[:, np.newaxis]
    accuracy = pd.Series(cm.diagonal(), index=c_report.index)
    c_report['accuracy'] = accuracy

    total = c_report['support'].sum()
    num_classes = np.shape(c_matrix)[0]
    mcc = np.zeros(shape=(num_classes,), dtype='float32')

    for j in range(num_classes):
        # Plain Python floats: the four-term product below exceeds the int64
        # range once the counts reach a few hundred thousand.
        tp = float(c_matrix[j, j])
        fn = float(row_totals[j]) - tp  # true class j, predicted as another class
        fp = float(col_totals[j]) - tp  # another class, predicted as class j
        tn = int(total - tp - fp - fn)
        denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        # A zero denominator means the class was never predicted, never
        # present, or covers every sample; report 0.0 (the convention also
        # used by sklearn.metrics.matthews_corrcoef) instead of NaN/inf.
        mcc[j] = ((tp * tn) - (fp * fn)) / denominator if denominator else 0.0

    mcc = pd.Series(mcc, index=c_report.index)
    c_report['mcc'] = mcc

    return c_report

def _align_features_with_labels(protein_list, dataframe):
    """Arrange representation vectors and class labels in ``protein_list`` order.

    Args:
        protein_list: DataFrame with 'Entry' and 'Class' columns.
        dataframe: DataFrame with an 'Entry' column; every other column is
            treated as one feature.

    Returns:
        Tuple ``(x, y, missing)``. ``x`` is a float64 array with one row per
        protein, ``y`` holds the float64 class labels, and ``missing`` lists
        the ``protein_list`` index labels that have no row in ``dataframe``.
        Missing proteins keep an all-zero vector and the placeholder label 0.0.

    Raises:
        ValueError: If an entry requested by ``protein_list`` occurs more than
            once in ``dataframe``.
    """
    vecsize = dataframe.shape[1] - 1
    feature_values = dataframe.loc[:, dataframe.columns != 'Entry'].to_numpy()

    # A single pass over the representation table builds an entry -> row
    # lookup, replacing a full-table scan for every protein.
    first_row_of = {}
    repeated = set()
    for position, entry in enumerate(dataframe['Entry']):
        if pd.isna(entry):
            continue  # NaN never compares equal, so it can never be matched
        if entry in first_row_of:
            repeated.add(entry)
        else:
            first_row_of[entry] = position

    # Pre-allocated outputs avoid re-copying the whole matrix for every row.
    n_proteins = len(protein_list)
    x = np.zeros((n_proteins, vecsize), dtype=np.float64)
    y = np.zeros(n_proteins, dtype=np.float64)
    missing = []

    rows = zip(protein_list.index, protein_list['Entry'], protein_list['Class'])
    for position, (label, entry, family) in enumerate(tqdm(rows, total=n_proteins)):
        source_row = None if pd.isna(entry) else first_row_of.get(entry)
        if source_row is None:
            missing.append(label)
            continue
        if entry in repeated:
            raise ValueError(
                "Entry %r occurs more than once in the representation file" % (entry,)
            )
        x[position] = feature_values[source_row]
        y[position] = family

    return x, y, missing


def score_protein_rep(dataset):
    """Score a protein representation on drug-target protein family prediction.

    Reads the protein/class list and the pre-computed split index files below
    the module-level ``script_dir``, and the representation vectors from the
    module-level ``representation_path``. For each of 10 splits it trains a
    one-vs-rest SGD classifier and scores its predictions on the test rows.
    Proteins without a representation vector are left out of both the
    training and the test rows.

    Args:
        dataset: Prefix of the train-index file name
            (``<dataset>_trainindex.csv``).

    Returns:
        Dict with the keys "f1" (weighted F1), "accuracy" and "mcc", each a
        list with one score per split.
    """
    protein_list = pd.read_csv(os.path.join(script_dir, '../data/preprocess/entry_class_nn.csv'))
    dataframe = pd.read_csv(representation_path)

    print("\n\nPreprocessing data for drug-target protein family prediction...\n ")
    x, y, ne = _align_features_with_labels(protein_list, dataframe)

    train_index = pd.read_csv(os.path.join(script_dir, '../data/preprocess/indexes/' + dataset + '_trainindex.csv'))
    test_index = pd.read_csv(os.path.join(script_dir, '../data/preprocess/indexes/testindex_family.csv'))
    train_index = train_index.dropna(axis=1)
    test_index = test_index.dropna(axis=1)

    f1 = []
    accuracy = []
    mcc = []

    print('Producing protein family predictions...\n')
    for i in tqdm(range(10)):
        clf = linear_model.SGDClassifier(class_weight="balanced", loss="log_loss", penalty="elasticnet", max_iter=1000, tol=1e-3, random_state=i, n_jobs=-1)
        clf2 = OneVsRestClassifier(clf, n_jobs=-1)

        # Row i of each index file lists the sample positions of split i.
        train_rows = train_index.iloc[i].astype(int)
        test_rows = test_index.iloc[i].astype(int)

        # Drop proteins that only have the zero-vector placeholder.
        train_rows = train_rows[~train_rows.isin(ne)].to_numpy()
        test_rows = test_rows[~test_rows.isin(ne)].to_numpy()

        train_X, test_X = x[train_rows], x[test_rows]
        train_y, test_y = y[train_rows], y[test_rows]

        clf2.fit(train_X, train_y)
        y_pred = clf2.predict(test_X)

        f1.append(f1_score(test_y, y_pred, average='weighted'))
        accuracy.append(accuracy_score(test_y, y_pred))
        mcc.append(matthews_corrcoef(test_y, y_pred))

    results = {
        "f1": f1,
        "accuracy": accuracy,
        "mcc": mcc,
    }

    return results