import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import pickle
import sys
#print('Reading file...')
# Path of the input CSV, taken from the first command-line argument.
if len(sys.argv) < 2:
    # Exit with a usage hint instead of a bare IndexError when no path is given.
    sys.exit("usage: {} <input.csv>".format(sys.argv[0]))
infile = sys.argv[1]
# Load the table to classify; the script later reads its 'SEQ' column.
covid19df = pd.read_csv(infile)
# Length of the k-mer "words" cut from each sequence (6 = hexamers).
kmer_size = 6
# NOTE(review): NGram is not referenced in the visible part of this script;
# presumably it mirrors a setting used when the vectorizer was trained --
# confirm before removing.
NGram = 4
#KFold_val = 10


def getKmers(sequence, size=kmer_size):
    """Split a sequence string into overlapping, lower-cased k-mer words.

    A window of ``size`` characters is slid over ``sequence`` one position
    at a time, and each window is lower-cased.

    Args:
        sequence: The string to tokenise.
        size: Window (k-mer) length. Defaults to the module-level
            ``kmer_size`` as it was when this function was defined.

    Returns:
        A list of ``len(sequence) - size + 1`` substrings ordered by start
        position, or an empty list when ``sequence`` is shorter than
        ``size``.
    """
    last_start = len(sequence) - size
    # range() is empty for a negative bound, so short inputs yield [].
    return [sequence[start:start + size].lower() for start in range(last_start + 1)]
# Tokenise every value of the 'SEQ' column into its list of k-mer words.
# Series.apply calls getKmers once per value, so no per-row Series has to be
# built as with DataFrame.apply(axis=1).
covid19df['words'] = covid19df['SEQ'].apply(getKmers)
# (A commented-out variant also popped a 'CLASS' column into test_labels;
# that path stays disabled here.)
# Join each sequence's k-mers into one space-separated string, the form that
# is passed to the vectorizer's transform() below.
covid_texts = [' '.join(words) for words in covid19df['words']]
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load .pkl artifacts that come from a trusted source.
# Load the pickled vectorizer (presumably the CountVectorizer fitted at
# training time -- confirm) and close the file handle deterministically.
with open('countVectTrain.pkl', 'rb') as vectorizer_file:
    cv = pickle.load(vectorizer_file)
# Turn the space-separated k-mer strings into the feature matrix.
X = cv.transform(covid_texts)
# load the model from disk
filename = 'corona_pred.pkl'
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)
# Predicted label for every input row.
test_pred = model.predict(X)
# Probabilities returned by the model for every input row.
pred_prob = model.predict_proba(X)
# Largest probability along axis 1 for each row, scaled to a percentage.
test_pred_prob = pred_prob.max(1)*100
# The k-mer token column is not part of the exported table, so drop it and
# give the input frame a fresh 0..n-1 index for the column-wise concat.
covid19df = covid19df.drop(columns='words')
covid19df = covid19df.reset_index(drop=True)

# Wrap the predicted labels in a single-column frame with a fresh index.
df_test_pred = pd.DataFrame(test_pred, columns=["pred_label"])
df_test_pred = df_test_pred.reset_index(drop=True)

# Wrap the winning-class percentages in a single-column frame.
# (A commented-out variant also built a "test_label" frame of true labels;
# it stays disabled here.)
df_pred_prob = pd.DataFrame(test_pred_prob, columns=["pred_prob_percentage"])

# Place the input columns, predicted label and percentage side by side and
# write the result without the index column.
output_parts = [covid19df, df_test_pred, df_pred_prob]
df_out = pd.concat(output_parts, axis=1)
df_out.to_csv('corona_pred_out.csv', index=False)
#mylist = str("Patient ID,Class\n")
#mylist = str("\nSequence ID | Class | Probability (in %) |\n---|---|---|\n" + df_out.iloc[row,0] + " | " + "" + str(df_out.iloc[row,2]) + " | " + "" + str(df_out.iloc[row,3]) + " |