|
import re
|
|
import pandas as pd
|
|
import time
|
|
from multiprocessing import Pool
|
|
import difflib
|
|
from utils import Ontology
|
|
import os
|
|
|
|
|
|
def filter(x_list):
    """Keep the GO ids from ``x_list`` that are members of ``filter_go``.

    ``filter_go`` is a module-level collection assigned in the ``__main__``
    section of this file. Note that this function shadows the ``filter``
    builtin for the rest of the module.

    Args:
        x_list: Iterable of GO identifiers.

    Returns:
        The retained identifiers, in their original order, joined with
        ``'; '``; an empty string when none are retained.
    """
    kept = [go_id for go_id in x_list if go_id in filter_go]
    return '; '.join(kept)
|
|
|
|
|
|
def fuzzy_match(texts):
    """Map each text to an entry of the module-level ``choices``.

    A text that is already contained in ``choices`` maps to itself.
    Otherwise its lower-cased form is compared against ``choices`` with
    ``difflib.get_close_matches`` (one candidate, cutoff 0.9) and the best
    candidate, if any, is used. Texts with no candidate are left out.

    Args:
        texts: Iterable of strings to normalise.

    Returns:
        Dict from original text to the matched ``choices`` entry.
    """
    matched = {}
    for raw in texts:
        if raw in choices:
            # Exact hit: no fuzzy search needed.
            matched[raw] = raw
            continue
        candidates = difflib.get_close_matches(raw.lower(), choices, n=1, cutoff=0.9)
        if candidates:
            matched[raw] = candidates[0]
    return matched
|
|
|
|
|
|
def txt_map(x, txt_dict):
    """Translate a list of texts through ``txt_dict``, dropping unknown ones.

    Args:
        x: Either a sequence of texts, or a string holding the Python
            literal of such a sequence (it is evaluated first).
        txt_dict: Mapping from raw text to its normalised form.

    Returns:
        List of ``txt_dict[item]`` for every non-empty item of ``x`` that is
        a key of ``txt_dict``, in the original order.
    """
    # isinstance (rather than ``type(x) == str``) so that str subclasses are
    # parsed too instead of being iterated character by character.
    if isinstance(x, str):
        # NOTE(review): eval() executes arbitrary expressions. If the input
        # can come from an untrusted file, ast.literal_eval would be the
        # safer choice - confirm inputs are plain literals before switching.
        x = eval(x)
    return [txt_dict[item] for item in x if item != '' and item in txt_dict]
|
|
|
|
|
|
def go_map_prob(x, GO_dict):
    """Replace the first element of each pair by its ``GO_dict`` value.

    Args:
        x: Iterable of indexable items whose element 0 is looked up in
            ``GO_dict`` and whose element 1 is carried over unchanged.
        GO_dict: Mapping applied to element 0.

    Returns:
        List of ``(GO_dict[item[0]], item[1])`` tuples for the items whose
        element 0 is a key of ``GO_dict``; other items are skipped silently.
    """
    return [(GO_dict[pair[0]], pair[1]) for pair in x if pair[0] in GO_dict]
|
|
|
|
|
|
def txt_map_prob(x, txt_dict):
    """Normalise ``(text, prob)`` pairs through ``txt_dict`` without duplicates.

    Args:
        x: Either a sequence of indexable pairs (element 0 is the text,
            element 1 is carried over), or a string holding the Python
            literal of such a sequence (it is evaluated first).
        txt_dict: Mapping from raw text to its normalised form.

    Returns:
        List of ``(txt_dict[text].lower(), prob)`` tuples in input order.
        Pairs with an empty text, a text missing from ``txt_dict``, or a
        normalised text that was already emitted are dropped, so the first
        occurrence of each normalised text wins.
    """
    # isinstance (rather than ``type(x) == str``) also accepts str subclasses.
    if isinstance(x, str):
        # NOTE(review): eval() executes arbitrary expressions. If the input
        # can come from an untrusted file, ast.literal_eval would be the
        # safer choice - confirm inputs are plain literals before switching.
        x = eval(x)

    mapped = []
    seen = set()
    for item in x:
        text = item[0]
        if text == '' or text not in txt_dict:
            continue
        # De-duplicate on the lower-cased value, i.e. on exactly what is
        # emitted; keying on the un-lowered value let case variants through
        # as duplicate output entries.
        canonical = txt_dict[text].lower()
        if canonical in seen:
            continue
        seen.add(canonical)
        mapped.append((canonical, item[1]))
    return mapped
|
|
|
|
|
|
def go_map(x, GO_dict):
    """Look up every element of ``x`` in ``GO_dict``.

    Args:
        x: Iterable of keys.
        GO_dict: Mapping used for the lookup.

    Returns:
        List of the mapped values, in input order. Elements that are not
        keys of ``GO_dict`` are reported on stdout and skipped.
    """
    mapped = []
    for term in x:
        if term not in GO_dict:
            print("{} not in GO_dict".format(term))
            continue
        mapped.append(GO_dict[term])
    return mapped
|
|
|
|
|
|
def prop(df):
    """Add a ``prop_annotations`` column built from each row's ``GO_label``.

    For every row, the sets returned by ``godb.get_anchestors`` for each id
    in ``GO_label`` are merged, and the merged set is stored as a list.
    ``godb`` is a module-level object assigned in the ``__main__`` section.

    Args:
        df: DataFrame with a ``GO_label`` column holding iterables of ids.

    Returns:
        The same DataFrame object, with ``prop_annotations`` added in place.
    """
    propagated = []
    for go_ids in df['GO_label']:
        closure = set()
        for go_id in go_ids:
            closure |= godb.get_anchestors(go_id)
        propagated.append(list(closure))
    df['prop_annotations'] = propagated
    return df
|
|
|
|
|
|
def _drop_rows_without_go(df, go_col):
    """Drop rows whose ``go_col`` list is empty, then rows containing NaN."""
    n0 = df.shape[0]
    df['len'] = df[go_col].apply(lambda x: len(x))
    df = df[df['len'] > 0]
    df = df.drop('len', axis=1)
    df = df.dropna()
    print('{}条数据,不为空的预测有{}条'.format(n0, df.shape[0]))
    return df


def pred_text_to_go(df, with_prob=False):
    """Normalise predicted texts in ``df['pred']`` and map them via ``GO_dict``.

    Steps:
      1. Split ``df['pred']`` on ``';'`` into ``pred_list`` (and, when
         ``with_prob`` is true, evaluate each piece into ``pred_list_prob``).
      2. Collect the unique non-empty texts and resolve them with
         ``fuzzy_match`` in a process pool.
      3. Rewrite the predictions through ``txt_map`` / ``txt_map_prob`` and
         map them through the module-level ``GO_dict`` into
         ``pred_list_go`` / ``pred_list_go_prob``.
      4. Drop rows whose mapped list is empty, then rows containing NaN.

    Args:
        df: DataFrame with a string column ``pred``. Columns are added to it
            in place before the filtered frame is returned.
        with_prob: When true, each ``';'``-separated piece of ``pred`` is
            evaluated as a Python expression whose element 0 is the text
            (presumably a ``(text, probability)`` tuple - confirm against
            the producer of the prediction files).

    Returns:
        The filtered DataFrame.
    """
    if with_prob:
        # NOTE(review): eval() executes arbitrary expressions read from the
        # prediction file; ast.literal_eval would be safer if every piece is
        # a plain literal - confirm before switching.
        df['pred_list_prob'] = df['pred'].apply(lambda x: [eval(i.strip()) for i in x.split(';')])
        df['pred_list'] = df['pred_list_prob'].apply(lambda x: [i[0] for i in x])
    else:
        df['pred_list'] = df['pred'].apply(lambda x: list(set([i.strip() for i in x.split(';')])))

    t0 = time.time()

    # Unique, non-empty texts across all rows.
    unique_txt = set()
    for txt in df['pred_list']:
        unique_txt.update(eval(txt) if isinstance(txt, str) else txt)
    unique_txt.discard('')
    all_txt = list(unique_txt)

    txt_dict = {}
    n = len(all_txt)
    thread = 10
    if n > 0:
        # Ceil division keeps the chunk size >= 1. The previous
        # ``int(n / thread)`` was 0 for fewer than ``thread`` texts, which
        # made ``range(0, n, 0)`` raise ValueError.
        size = (n + thread - 1) // thread
        all_txt_sep = [all_txt[i: i + size] for i in range(0, n, size)]
        with Pool(processes=thread) as pool:
            result = pool.map(fuzzy_match, all_txt_sep)
            pool.close()
            pool.join()
        for d in result:
            txt_dict.update(d)

    if with_prob:
        df['pred_list_prob'] = df['pred_list_prob'].apply(lambda x: txt_map_prob(x, txt_dict))
        print("fuzzy matching time: {}".format(time.time() - t0))
        df['pred_list_go_prob'] = df['pred_list_prob'].apply(lambda x: go_map_prob(x, GO_dict))
        df = _drop_rows_without_go(df, 'pred_list_go_prob')
    else:
        df['pred_list'] = df['pred_list'].apply(lambda x: txt_map(x, txt_dict))
        df['pred_list'] = df['pred_list'].apply(lambda x: [i.lower() for i in list(set(x))])
        print("fuzzy matching time: {}".format(time.time() - t0))
        df['pred_list_go'] = df['pred_list'].apply(lambda x: go_map(x, GO_dict))
        df = _drop_rows_without_go(df, 'pred_list_go')
    return df
|
|
|
|
|
|
def cal_f1(df):
    """Print macro-averaged recall, precision and F1 for ``df``.

    ``df['label']`` and ``df['pred_list']`` are ``';'``-separated strings.
    They are split into the columns ``label_list_go`` and ``pred_list_go``
    (added to ``df`` in place). Per-term true/false positives and false
    negatives are counted row by row. Recall is averaged over the distinct
    label terms; precision is summed over the label terms and divided by the
    number of distinct predicted terms.

    Args:
        df: DataFrame with string columns ``label`` and ``pred_list``.

    Returns:
        None. Results are written to stdout.
    """
    df['label_list_go'] = df['label'].apply(lambda x: [i.strip() for i in x.split(';')])
    df['pred_list_go'] = df['pred_list'].apply(lambda x: [i.strip() for i in x.split(';')])

    labels = list({term for row in df['label_list_go'] for term in row})
    total = len(labels)
    tp_dict = dict.fromkeys(labels, 0)
    fp_dict = dict.fromkeys(labels, 0)
    fn_dict = dict.fromkeys(labels, 0)

    pred_labels = set()
    for preds, label in zip(df['pred_list_go'], df['label_list_go']):
        # Sets give O(1) membership; the lists are still iterated so that
        # repeated terms are counted exactly as before.
        pred_set, label_set = set(preds), set(label)
        for t in label:
            if t in pred_set:
                tp_dict[t] += 1
            else:
                fn_dict[t] += 1
        for p in preds:
            if p not in label_set:
                # fp_dict may gain keys that never occur as labels.
                fp_dict[p] = fp_dict.get(p, 0) + 1
        pred_labels.update(preds)
    p_total = len(pred_labels)

    recall, pr = 0., 0.
    for x in labels:
        # The 1e-8 term avoids 0/0 for terms with no counts at all.
        recall += tp_dict[x] / (1.0 * (tp_dict[x] + fn_dict[x] + 1e-8))
        pr += tp_dict[x] / (1.0 * (tp_dict[x] + fp_dict[x] + 1e-8))

    # Guard every division: an empty frame, or a run with no true positive,
    # used to end in ZeroDivisionError instead of reporting zeros.
    r = recall / total if total else 0.0
    p = pr / p_total if p_total else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0.0

    print("preds not in labels: {}".format(len(list(fp_dict.keys())) - total))
    print("recall:{}; percision:{}; f1 score: {}".format(r, p, f1))
|
|
|
|
|
|
def cat_go(x):
    """Return the short ontology code (``'mf'``, ``'bp'``, ``'cc'``) of ``x``.

    The namespace is obtained from the module-level ``godb`` and compared
    against the module-level ``NAMESPACES`` mapping.

    Args:
        x: Identifier passed to ``godb.get_namespace``.

    Returns:
        The matching short code, or ``None`` when the lookup fails (a message
        is printed) or the namespace is not one of the three known values.
    """
    try:
        cat = godb.get_namespace(x)
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed; lookup failures stay best-effort.
        print("{} not found".format(x))
        return None

    for short_name in ('mf', 'bp', 'cc'):
        if cat == NAMESPACES[short_name]:
            return short_name
    return None
|
|
|
|
|
|
def remove_root(x):
    """Remove the three ontology root names from ``x`` in place.

    For each of ``'molecular_function'``, ``'biological_process'`` and
    ``'cellular_component'`` that is present, one ``x.remove(...)`` call is
    made.

    Args:
        x: Mutable container supporting ``in`` and ``.remove``.

    Returns:
        The same object ``x``, after modification.
    """
    for root_name in ('molecular_function', 'biological_process', 'cellular_component'):
        if root_name in x:
            x.remove(root_name)
    return x
|
|
|
|
if __name__ == "__main__":
    # Script entry point. It (1) builds the GO vocabulary and lookup tables
    # from the pre-training data, (2) collects the InterPro ids per protein,
    # and (3) for every ontology/split normalises the model predictions and
    # writes the merged frames as pickles. The names assigned here
    # (NAMESPACES, godb, filter_go, GO_dict, choices, ...) are module-level
    # globals read by the functions defined above.

    # Short ontology code -> namespace name, compared against
    # godb.get_namespace() in cat_go.
    NAMESPACES = {
        'cc': 'cellular_component',
        'mf': 'molecular_function',
        'bp': 'biological_process'
    }

    # NOTE(review): the condition is always true, so the else branch below
    # never runs; it looks like a manual switch between "rebuild the tables"
    # and "reload the pickled tables".
    if 1==1:
        data = pd.read_csv('/cluster/home/wenkai/LAVIS/data/pretrain/swissprot_domain_and_train_exp_prompt_new.csv', sep='|')
        print('数据规模:{}'.format(data.shape[0]))

        godb = Ontology(f'/cluster/home/wenkai/LAVIS/data/go1.4-basic.obo', with_rels=True)
        go_des = pd.read_csv('/cluster/home/wenkai/LAVIS/data/go_descriptions1.4.txt', sep='|', header=None)
        go_des.columns = ['id', 'text']
        go_des = go_des.dropna()
        # Ids in the description file use '_' where the rest of the code uses ':'.
        go_des['id'] = go_des['id'].apply(lambda x: re.sub('_', ':', x))
        # cat_go returns None for ids it cannot classify; those rows are
        # removed by the dropna() that follows.
        go_des['ont'] = go_des['id'].apply(lambda x: cat_go(x))
        go_des = go_des.dropna()
        go_obo_set = set(go_des['id'].tolist())
        go_des['text'] = go_des['text'].apply(lambda x: x.lower())

        data['GO_label'] = data['GO_label'].apply(lambda x: [i.strip() for i in x.split(';')])
        data = prop(data)

        # Count how many rows carry each id in their prop_annotations list.
        go_dict = {}
        for x_list in data['prop_annotations']:
            for goid in x_list:
                if goid in go_dict:
                    go_dict[goid] += 1
                else:
                    go_dict[goid] = 1
        df_stat = pd.DataFrame({'id': list(go_dict.keys()), 'count': list(go_dict.values())})
        data_gos = set(df_stat['id'].tolist())
        # Keep only descriptions of ids that occur in the data.
        go_des = go_des[go_des['id'].isin(data_gos)]
        filter_go = data_gos.intersection(go_obo_set)
        print(f"包括父节点的GO有{len(data_gos)}个,其中在go1.4.obo中出现的GO有{len(filter_go)}个")

        go_des.to_pickle('/cluster/home/wenkai/LAVIS/data/pretrain/mf_bp_cc/go_des.pkl')
        # Two-way lookup between GO id and lower-cased description text.
        id2text_dict = dict(zip(go_des['id'], go_des['text']))
        GO_dict = dict(zip(go_des['text'], go_des['id']))

        # Candidate description texts per ontology, used by fuzzy_match.
        choices_mf = list(set(go_des[go_des['ont'] == 'mf']['text']))
        choices_bp = list(set(go_des[go_des['ont'] == 'bp']['text']))
        choices_cc = list(set(go_des[go_des['ont'] == 'cc']['text']))

        choices_mf = {x.lower(): x for x in choices_mf}
        choices_bp = {x.lower(): x for x in choices_bp}
        choices_cc = {x.lower(): x for x in choices_cc}

        # filter() joins the retained ids back into a '; '-separated string,
        # so rows without any retained id compare equal to ''.
        data['GO_label'] = data['GO_label'].apply(lambda x: filter(x))
        data = data[data['GO_label'] != '']
        data['function'] = data['GO_label'].apply(lambda x: [id2text_dict[i.strip()] for i in x.split(';')])
        data['function'] = data['function'].apply(lambda x: '; '.join(x))

        # Persist the term vocabularies (all terms, then one per ontology).
        terms = pd.DataFrame({'gos': list(filter_go)})
        terms.to_pickle('/cluster/home/wenkai/LAVIS/data/pretrain/mf_bp_cc/terms.pkl')
        terms.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/pretrain/terms.pkl')

        terms_mf = pd.DataFrame({'gos': list(set(go_des[go_des['ont'] == 'mf']['id']))})
        terms_mf.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/pretrain/mf/terms.pkl')
        terms_mf.to_pickle('/cluster/home/wenkai/deepgo2/data/mf/terms.pkl')
        terms_bp = pd.DataFrame({'gos': list(set(go_des[go_des['ont'] == 'bp']['id']))})
        terms_bp.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/pretrain/bp/terms.pkl')
        terms_bp.to_pickle('/cluster/home/wenkai/deepgo2/data/bp/terms.pkl')
        terms_cc = pd.DataFrame({'gos': list(set(go_des[go_des['ont'] == 'cc']['id']))})
        terms_cc.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/pretrain/cc/terms.pkl')
        terms_cc.to_pickle('/cluster/home/wenkai/deepgo2/data/cc/terms.pkl')
    else:
        # Reload the tables written by the branch above.
        godb = Ontology(f'/cluster/home/wenkai/LAVIS/data/go1.4-basic.obo', with_rels=True)
        terms = pd.read_pickle('/cluster/home/wenkai/LAVIS/data/pretrain/mf_bp_cc/terms.pkl')
        filter_go = set(terms['gos'].tolist())

        terms_mf = pd.read_pickle('/cluster/home/wenkai/deepgo2/data/mf/terms.pkl')
        terms_bp = pd.read_pickle('/cluster/home/wenkai/deepgo2/data/bp/terms.pkl')
        terms_cc = pd.read_pickle('/cluster/home/wenkai/deepgo2/data/cc/terms.pkl')

        # NOTE(review): here choices_* are built from the 'gos' column of the
        # terms pickles, whereas the branch above builds them from
        # go_des['text'] - confirm this is intended before re-enabling.
        choices_mf = {x.lower(): x for x in terms_mf['gos'].tolist()}
        choices_bp = {x.lower(): x for x in terms_bp['gos'].tolist()}
        choices_cc = {x.lower(): x for x in terms_cc['gos'].tolist()}

        go_des = pd.read_pickle('/cluster/home/wenkai/LAVIS/data/pretrain/mf_bp_cc/go_des.pkl')
        id2text_dict = dict(zip(go_des['id'], go_des['text']))
        GO_dict = dict(zip(go_des['text'], go_des['id']))

    # InterPro ids per protein: keep rows that have an 'ipr' value and split
    # it on ';' into a list.
    df_interpro = pd.read_csv('/cluster/home/wenkai/LAVIS/data/uniprot_sprot_blip2_func_data.txt', sep='|',
                              nrows=546389,
                              header=None)
    df_interpro.columns = ['name', 'seq', 'go', 'text', 'evi', 'ipr']
    df_interpro = df_interpro[df_interpro['ipr'].notnull()]
    df_interpro['ipr'] = df_interpro['ipr'].apply(lambda x: [i.strip() for i in x.split(';')])

    # Unique InterPro ids over all proteins, saved as a one-column frame.
    iprs = []
    for x in df_interpro['ipr'].tolist():
        if len(x) > 0:
            iprs.extend(x)
    iprs = list(set(iprs))
    print("ipr个数:{}".format(len(iprs)))
    df_ipr = pd.DataFrame({'interpros': iprs})
    df_ipr.to_pickle('/cluster/home/wenkai/LAVIS/data/interpros.pkl')
    df_ipr.to_pickle('/cluster/home/wenkai/deepgozero/data/blip2/pretrain/interpros.pkl')

    # The two string literals below are disabled code kept as bare string
    # expressions; they are evaluated and discarded, so nothing in them runs.
    '''
    # test cases
    df_real = pd.read_csv('/cluster/home/wenkai/LAVIS/data/pretrain/test_2000.csv', sep='|')
    df_real[col] = df_real[col].apply(lambda x: [i.strip() for i in x.split(';')])
    #df_real[col] = df_real[col].apply(lambda x: filter(x))
    df_real = df_real[df_real[col] != '']
    print(df_real.shape)
    #df_real['GO_label'] = df_real['GO_label'].apply(lambda x: [id2text_dict[i] for i in x])
    #df_real['GO_label'] = df_real['GO_label'].apply(lambda x: [GO_dict[i] for i in x])
    df_real = prop(df_real)
    #df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: [id2text_dict[i] for i in x])
    #df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: remove_root(x))
    #df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: list(set([GO_dict[i] for i in x])))
    for ont in ['mf', 'bp', 'cc']:
        file_name = 'output_{}_test_2000'.format(ont)
        if ont == 'mf':
            choices = choices_mf
        elif ont == 'bp':
            choices = choices_bp
        elif ont == 'cc':
            choices = choices_cc
        print("对{}预测文本进行标准化...".format(file_name))
        df_pred = pd.read_csv('/cluster/home/wenkai/LAVIS/output/{}.txt'.format(file_name), sep='|', header=None, on_bad_lines='skip')
        df_pred.columns = ['name', 'pred', 'label']
        n0 = df_pred.shape[0]
        df_pred = pred_text_to_go(df_pred, with_prob=True)
        print("{}中有{}条数据未能找到相似度高的GO描述".format(file_name, n0-df_pred.shape[0]))
        #df_pred['pred_list'] = df_pred['pred_list'].apply(lambda x: '; '.join(x))
        #cal_f1(df_pred)
        df_pred[['name', 'pred_list_prob', 'label']].to_csv('/cluster/home/wenkai/LAVIS/output/{}_standard.csv'.format(file_name), sep='|', index=False)

        df_pred = pd.merge(df_pred[['name', 'pred_list_go_prob']], df_interpro[['name', 'ipr']], on='name', how='left')
        df_pred['ipr'] = df_pred['ipr'].fillna("").apply(list)
        ipr_and_pred = []
        for x, y in zip(df_pred['ipr'], df_pred['pred_list_go_prob']):
            try:
                ipr_and_pred.append(x + y)
            except:
                ipr_and_pred.append(y)
        df_pred['ipr_and_pred'] = ipr_and_pred
        print(df_real.isnull().sum())
        df_pred = pd.merge(df_pred, df_real[['name', 'protein', 'prop_annotations']], on='name', how='left')
        #df_pred = df_pred.dropna()
        print(df_pred.shape)
        df_pred[['name', 'protein', 'ipr', 'pred_list_go_prob', 'ipr_and_pred', 'prop_annotations']].to_pickle(
            '/cluster/home/wenkai/deepgozero/data/blip2/pretrain/{}/test_2000_data.pkl'.format(ont))
    '''

    '''
    df_real = pd.read_csv('/cluster/home/wenkai/LAVIS/data/pretrain/nextprot_mf.csv', sep='|')
    df_real['GO_label'] = df_real['GO_label'].apply(lambda x: [i.strip() for i in x.split(';')])
    df_real['GO_label'] = df_real['GO_label'].apply(lambda x: [id2text_dict[i] for i in x])
    df_real['GO_label'] = df_real['GO_label'].apply(lambda x: [GO_dict[i] for i in x])
    df_real = prop(df_real)
    df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: [id2text_dict[i] for i in x])
    df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: remove_root(x))
    df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: list(set([GO_dict[i] for i in x])))

    file = 'output_nextprot'
    choices = choices_mf
    df_pred = pd.read_csv('/cluster/home/wenkai/LAVIS/output/{}.txt'.format(file), sep='|', header=None, on_bad_lines='skip')
    df_pred.columns = ['name', 'pred', 'label']
    df_pred = pred_text_to_go(df_pred, with_prob=True)
    df_pred[['name', 'pred_list_prob', 'label']].to_csv('/cluster/home/wenkai/LAVIS/output/{}_standard.csv'.format(file), sep='|', index=False)

    df_pred = pd.merge(df_pred, df_real[['name', 'protein', 'prop_annotations']], on='name', how='left')
    df_pred['ipr'] = [[] for _ in range(df_pred.shape[0])]
    df_pred['ipr_and_pred'] = df_pred['pred_list_go_prob']
    df_pred[['name', 'protein', 'ipr', 'pred_list_go_prob', 'ipr_and_pred', 'prop_annotations']].to_pickle(
        '/cluster/home/wenkai/deepgozero/data/blip2/pretrain/mf/nextprot_data.pkl')
    '''

    # Per-ontology id embedded in the prediction and output file names below
    # (presumably a training-run identifier - confirm).
    cat_id = {'mf': '445772', 'bp': '496359', 'cc': '505955'}
    col = 'GO_label'
    for ont in ['mf', 'bp', 'cc']:

        # fuzzy_match reads the module-level name `choices`, so it is rebound
        # to the current ontology's candidates before pred_text_to_go runs.
        if ont == 'mf':
            choices = choices_mf
        elif ont == 'bp':
            choices = choices_bp
        elif ont == 'cc':
            choices = choices_cc
        for split in ['train', 'val', 'test']:

            # Ground truth: keep ids in filter_go, drop rows left empty.
            df_real = pd.read_csv(f'/cluster/home/wenkai/LAVIS/data/pretrain/mf_bp_cc/{split}_exp_{ont}_new.csv',
                                  sep='|')
            df_real[col] = df_real[col].apply(lambda x: [i.strip() for i in x.split(';')])
            df_real[col] = df_real[col].apply(lambda x: filter(x))
            df_real = df_real[df_real[col] != '']
            print(df_real.shape)
            # filter() returned a joined string, so split it into a list again.
            df_real['GO_label'] = df_real['GO_label'].apply(lambda x: [i.strip() for i in x.split(';')])
            # Round trip id -> text -> id through the two lookup tables.
            df_real['GO_label'] = df_real['GO_label'].apply(lambda x: [id2text_dict[i] for i in x])
            df_real['GO_label'] = df_real['GO_label'].apply(lambda x: [GO_dict[i] for i in x])
            df_real = prop(df_real)
            # Convert to text so remove_root can drop the three root names,
            # then back to de-duplicated ids.
            df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: [id2text_dict[i] for i in x])
            df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: remove_root(x))
            df_real['prop_annotations'] = df_real['prop_annotations'].apply(lambda x: list(set([GO_dict[i] for i in x])))

            # Predictions: normalise the texts and map them through GO_dict.
            df_pred = pd.read_csv(
                f'/cluster/home/wenkai/LAVIS/output/mf_bp_cc/output_{split}_{ont}_exp_{cat_id[ont]}.txt', sep='|',
                header=None, on_bad_lines='skip')
            df_pred.columns = ['name', 'pred', 'label']
            n0 = df_pred.shape[0]
            df_pred = pred_text_to_go(df_pred, with_prob=True)
            print("{}中有{}条数据未能找到相似度高的GO描述".format(ont, n0 - df_pred.shape[0]))
            df_pred[['name', 'pred_list_prob', 'label']].to_csv(
                f'/cluster/home/wenkai/LAVIS/output/mf_bp_cc/output_{split}_{ont}_{cat_id[ont]}_standard.csv', sep='|',
                index=False)

            # Attach InterPro ids; proteins without a match get an empty list
            # because fillna("") is followed by list("") == [].
            df_pred = pd.merge(df_pred[['name', 'pred_list_go_prob']], df_interpro[['name', 'ipr']], on='name', how='left')
            df_pred['ipr'] = df_pred['ipr'].fillna("").apply(list)
            ipr_and_pred = []
            for x, y in zip(df_pred['ipr'], df_pred['pred_list_go_prob']):
                # NOTE(review): bare except - if the concatenation fails for
                # any reason only the predictions are kept.
                try:
                    ipr_and_pred.append(x + y)
                except:
                    ipr_and_pred.append(y)
            df_pred['ipr_and_pred'] = ipr_and_pred

            # Attach the ground truth; the dropna() removes predictions whose
            # name has no row in df_real.
            df_pred = pd.merge(df_pred, df_real[['name', 'protein', 'prop_annotations']], on='name', how='left')
            df_pred = df_pred.dropna()
            df_pred[['name', 'protein', 'ipr', 'pred_list_go_prob', 'ipr_and_pred', 'prop_annotations']].to_pickle(
                f'/cluster/home/wenkai/deepgozero/data/blip2/pretrain/{ont}/{split}_data_{cat_id[ont]}.pkl')
            df_pred[['name', 'protein', 'ipr', 'pred_list_go_prob', 'ipr_and_pred', 'prop_annotations']].to_pickle(
                f'/cluster/home/wenkai/deepgo2/data/{ont}/{split}_data_{cat_id[ont]}.pkl')
            # The 'val' split is additionally written under the name 'valid'.
            if split == 'val':
                df_pred[['name', 'protein', 'ipr', 'pred_list_go_prob', 'ipr_and_pred', 'prop_annotations']].to_pickle(
                    f'/cluster/home/wenkai/deepgozero/data/blip2/pretrain/{ont}/valid_data_{cat_id[ont]}.pkl')
                df_pred[['name', 'protein', 'ipr', 'pred_list_go_prob', 'ipr_and_pred', 'prop_annotations']].to_pickle(
                    f'/cluster/home/wenkai/deepgo2/data/{ont}/valid_data_{cat_id[ont]}.pkl')
            print(f"{ont} {split} deepgozero propagation data completed")
|
|
|
|
|