# -*- coding: utf-8 -*-
"""
Author: Philipp Seidl
ELLIS Unit Linz, LIT AI Lab, Institute for Machine Learning
Johannes Kepler University Linz
Contact: seidl@ml.jku.at
Training
"""
from .utils import str2bool, lgamma, multinom_gk, top_k_accuracy
from .data import load_templates, load_dataset_from_csv, load_USPTO
from .model import ModelConfig, MHN, StaticQK, SeglerBaseline, Retrosim
from .molutils import convert_smiles_to_fp, FP_featurizer, smarts2appl, getTemplateFingerprint, disable_rdkit_logging
from collections import defaultdict
import argparse
import os
import numpy as np
import pandas as pd
import datetime
import sys
from time import time
import matplotlib.pyplot as plt
import torch
import multiprocessing
import warnings
from joblib import Memory
cachedir = 'data/cache/'
memory = Memory(cachedir, verbose=0, bytes_limit=80e9)
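# joblib disk cache (capped at ~80 GB) under data/cache/; used to memoize the expensive
# featurization in featurize_smiles below and, later, the template-applicability computation.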
def parse_args():
parser = argparse.ArgumentParser(description="Train MHNreact.",
epilog="--", prog="Train")
parser.add_argument('-f', type=str)
parser.add_argument('--model_type', type=str, default='mhn',
help="Model-type: choose from 'segler', 'fortunato', 'mhn' or 'staticQK', default:'mhn'")
parser.add_argument("--exp_name", type=str, default='', help="experiment name, (added as postfix to the file-names)")
parser.add_argument("-d", "--dataset_type", type=str, default='sm',
help="Input Dataset 'sm' for Scheider-USPTO-50k 'lg' for USPTO large or 'golden' or use keyword '--csv_path to specify an input file', default: 'sm'")
parser.add_argument("--csv_path", default=None, type=str, help="path to preprocessed trainings file + split columns, default: None")
parser.add_argument("--split_col", default='split', type=str, help="split column of csv, default: 'split'")
parser.add_argument("--input_col", default='prod_smiles', type=str, help="input column of csv, default: 'pro_smiles'")
parser.add_argument("--reactants_col", default='reactants_can', type=str, help="reactant colum of csv, default: 'reactants_can'")
parser.add_argument("--fp_type", type=str, default='morganc',
help="Fingerprint type for the input only!: default: 'morgan', other options: 'rdk', 'ECFP', 'ECFC', 'MxFP', 'Morgan2CBF' or a combination of fingerprints with '+'' for max-pooling and '&' for concatination e.g. maccs+morganc+topologicaltorsion+erg+atompair+pattern+rdkc+layered+mhfp, default: 'morganc'")
parser.add_argument("--template_fp_type", type=str, default='rdk',
help="Fingerprint type for the template fingerprint, default: 'rdk'")
parser.add_argument("--device", type=str, default='best',
help="Device to run the model on, preferably 'cuda:0', default: 'best' (takes the gpu with most RAM)")
parser.add_argument("--fp_size", type=int, default=4096,
help="fingerprint-size used for templates as well as for inputs, default: 4096")
parser.add_argument("--fp_radius", type=int, default=2, help="fingerprint-radius (if applicable to the fingerprint-type), default: 2")
parser.add_argument("--epochs", type=int, default=10, help='number of epochs, default: 10')
parser.add_argument("--pretrain_epochs", type=int, default=0,
help="applicability-matrix pretraining epochs if applicable (e.g. fortunato model_type), default: 0")
parser.add_argument("--save_model", type=str2bool, default=False, help="save the model, default: False")
parser.add_argument("--dropout", type=float, default=0.2, help="dropout rate for encoders, default: 0.2")
parser.add_argument("--lr", type=float, default=5e-4, help="learning-rate, dfeault: 5e-4")
parser.add_argument("--hopf_beta", type=float, default=0.05, help="hopfield beta parameter, default: 0.125")
parser.add_argument("--hopf_asso_dim", type=int, default=512, help="association dimension, default: 512")
parser.add_argument("--hopf_num_heads", type=int, default=1, help="hopfield number of heads, default: 1")
parser.add_argument("--hopf_association_activation", type=str, default='None',
help="hopfield association activation function recommended:'Tanh' or 'None', other: 'ReLU', 'SeLU', 'GeLU', or 'None' for more, see torch.nn, default: 'None'")
parser.add_argument("--norm_input", default=True, type=str2bool,
help="input-normalization, default: True")
parser.add_argument("--norm_asso", default=True, type=str2bool,
help="association-normalization, default: True")
# additional experimental hyperparams
parser.add_argument("--hopf_n_layers", default=1, type=int, help="Number of hopfield-layers, default: 1")
parser.add_argument("--mol_encoder_layers", default=1, type=int, help="Number of molecule-encoder layers, default: 1")
parser.add_argument("--temp_encoder_layers", default=1, type=int, help="Number of template-encoder layers, default: 1")
parser.add_argument("--encoder_af", default='ReLU', type=str,
help="Encoder-NN intermediate activation function (before association_activation function), default: 'ReLU'")
parser.add_argument("--hopf_pooling_operation_head", default='mean', type=str, help="Pooling operation over heads default=max, (max, min, mean, ...), default: 'mean'")
parser.add_argument("--splitting_scheme", default=None, type=str, help="Splitting_scheme for non-csv-input, default: None, other options: 'class-freq', 'random'")
parser.add_argument("--concat_rand_template_thresh", default=-1, type=int, help="Concatinates a random vector to the tempalte-fingerprint at all templates with num_training samples > this threshold; -1 (default) means deactivated")
parser.add_argument("--repl_quotient", default=10, type=float, help="Only if --concat_rand_template_thresh >= 0 - Quotient of how much should be replaced by random in template-embedding, (default: 10)")
parser.add_argument("--verbose", default=False, type=str2bool, help="If verbose, will print out more stuff, default: False")
parser.add_argument("--batch_size", default=128, type=int, help="Training batch-size, default: 128")
parser.add_argument("--eval_every_n_epochs", default=1, type=int, help="Evaluate every _ epochs (Evaluation is costly for USPTO-Lg), default: 1")
parser.add_argument("--save_preds", default=False, type=str2bool, help="Save predictions for test split at the end of training, default: False")
parser.add_argument("--wandb", default=False, type=str2bool, help="Save to wandb; login required, default: False")
parser.add_argument("--seed", default=None, type=int, help="Seed your run to make it reproducible, defualt: None")
parser.add_argument("--template_fp_type2", default=None, type=str, help="experimental template_fp_type for layer 2, default: None")
parser.add_argument("--layer2weight",default=0.2, type=float, help="hopf-layer2 weight of p, default: 0.2")
parser.add_argument("--reactant_pooling", default='max', type=str, help="reactant pooling operation over template-fingerprint, default: 'max', options: 'min','mean','lgamma'")
parser.add_argument("--ssretroeval", default=False, type=str2bool, help="single-step retro-synthesis eval, default: False")
parser.add_argument("--addval2train", default=False, type=str2bool, help="adds the validation set to the training set, default: False")
parser.add_argument("--njobs",default=-1, type=int, help="Number of jobs, default: -1 -> uses all available")
parser.add_argument("--eval_only_loss", default=False, type=str2bool, help="if only loss should be evaluated (if top-k acc may be time consuming), default: False")
parser.add_argument("--only_templates_in_batch", default=False, type=str2bool, help="while training only forwards templates that are in the batch, default: False")
parser.add_argument("--plot_res", default=False, type=str2bool, help="Plotting results for USPTO-sm/lg, default: False")
args = parser.parse_args()
if args.njobs ==-1:
args.njobs = int(multiprocessing.cpu_count())
if args.device=='best':
from .utils import get_best_gpu
try:
args.device = get_best_gpu()
except Exception:
print("couldn't determine the best GPU, using CPU instead")
args.device = 'cpu'
# some sanity checks on the model type
if (args.model_type == 'segler') & (args.pretrain_epochs>=1):
print('changing model_type to fortunato because pretrain_epochs>0')
args.model_type = 'fortunato'
if ((args.model_type == 'staticQK') or (args.model_type == 'retrosim')) & (args.epochs>1):
print('changing epochs to 1 (StaticQK is not learnable ;)')
args.epochs=1
if args.template_fp_type != args.fp_type:
print('fp_type must be the same as template_fp_type --> setting template_fp_type to fp_type')
args.template_fp_type = args.fp_type
if args.save_model & (args.fp_type=='MxFP'):
warnings.warn('Currently MxFP is not recommended when saving the model parameters (the fragment dictionary would need to be saved or recomputed, which is currently not implemented)')
return args
@memory.cache(ignore=['njobs'])
def featurize_smiles(X, fp_type='morgan', fp_size=4096, fp_radius=2, njobs=1, verbose=False):
X_fp = {}
if fp_type in ['MxFP','MACCS','Morgan2CBF','Morgan4CBF', 'Morgan6CBF', 'ErG','AtomPair','TopologicalTorsion','RDK']:
print('computing', fp_type)
if fp_type == 'MxFP':
fp_types = ['MACCS','Morgan2CBF','Morgan4CBF', 'Morgan6CBF', 'ErG','AtomPair','TopologicalTorsion','RDK']
else:
fp_types = [fp_type]
remaining = int(fp_size)
for fp_type in fp_types:
print(fp_type,end=' ')
feat = FP_featurizer(fp_types=fp_type,
max_features= (fp_size//len(fp_types)) if (fp_type != fp_types[-1]) else remaining )
X_fp[f'train_{fp_type}'] = feat.fit(X['train'])
X_fp[f'valid_{fp_type}'] = feat.transform(X['valid'])
X_fp[f'test_{fp_type}'] = feat.transform(X['test'])
remaining -= X_fp[f'train_{fp_type}'].shape[1]
#X_fp['train'].shape, X_fp['test'].shape
X_fp['train'] = np.hstack([ X_fp[f'train_{fp_type}'] for fp_type in fp_types])
X_fp['valid'] = np.hstack([ X_fp[f'valid_{fp_type}'] for fp_type in fp_types])
X_fp['test'] = np.hstack([ X_fp[f'test_{fp_type}'] for fp_type in fp_types])
else: #fp_type in ['rdk','morgan','ecfp4','pattern','morganc','rdkc']:
if verbose: print('computing', fp_type, 'folded')
for split in X.keys():
X_fp[split] = convert_smiles_to_fp(X[split], fp_size=fp_size, which=fp_type, radius=fp_radius, njobs=njobs, verbose=verbose)
return X_fp
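# Usage sketch (assumed input format): X is a dict with 'train'/'valid'/'test' lists of SMILES, e.g.
#   X_fp = featurize_smiles(X, fp_type='morganc', fp_size=4096, fp_radius=2, njobs=4)
# which returns a dict of numpy fingerprint matrices keyed by the same split names.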
def compute_template_fp(fp_len=2048, reactant_pooling='max', do_log=True):
"""Pre-Compute the template-fingerprint"""
# combine product and reactant parts into one template fingerprint
comb_template_fp = np.zeros((max(template_list.keys())+1,fp_len if reactant_pooling!='concat' else fp_len*6))
for i in template_list:
tpl = template_list[i]
try:
pr, rea = str(tpl).split('>>')
idxx = temp_part_to_fp[pr]
prod_fp = templates_fp['fp'][idxx]
except:
print('err', pr, end='\r')
prod_fp = np.zeros(fp_len)
rea_fp = templates_fp['fp'][[temp_part_to_fp[r] for r in str(rea).split('.')]] # fingerprints of all reactant parts (pooled below)
if reactant_pooling=='only_product':
rea_fp = np.zeros(fp_len)
if reactant_pooling=='max':
rea_fp = np.log(1 + rea_fp.max(0))
elif reactant_pooling=='mean':
rea_fp = np.log(1 + rea_fp.mean(0))
elif reactant_pooling=='sum':
rea_fp = np.log(1 + rea_fp.sum(0))
elif reactant_pooling=='lgamma':
rea_fp = multinom_gk(rea_fp, axis=0)
elif reactant_pooling=='concat':
rs = str(rea).split('.')
rs.sort()
for ii, r in enumerate(rs):
idx = temp_part_to_fp[r]
rea_fp = templates_fp['fp'][idx]
comb_template_fp[i, (fp_len*(ii+1)):(fp_len*(ii+2))] = np.log(1 + rea_fp)
comb_template_fp[i,:prod_fp.shape[0]] = np.log(1 + prod_fp) #- rea_fp*0.5
if reactant_pooling!='concat':
#comb_template_fp[i] = multinom_gk(np.stack([np.log(1+prod_fp), rea_fp]))
#comb_template_fp[i,fp_len:] = rea_fp
comb_template_fp[i,:rea_fp.shape[0]] = comb_template_fp[i, :rea_fp.shape[0]] - rea_fp*0.5
return comb_template_fp
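# Note: compute_template_fp reads the module-level globals `template_list`, `temp_part_to_fp` and
# `templates_fp`, which are only defined in the __main__ block below when MxFP template fingerprints are requested.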
def set_up_model(args, template_list=None):
hpn_config = ModelConfig(num_templates = int(max(template_list.keys()))+1,
#len(template_list.values()), #env.num_templates, #
dropout=args.dropout,
fingerprint_type=args.fp_type,
template_fp_type = args.template_fp_type,
fp_size = args.fp_size,
fp_radius= args.fp_radius,
device=args.device,
lr=args.lr,
hopf_beta=args.hopf_beta, #1/(128**0.5),#1/(2048**0.5),
hopf_input_size=args.fp_size,
hopf_output_size=None,
hopf_num_heads=args.hopf_num_heads,
hopf_asso_dim=args.hopf_asso_dim,
hopf_association_activation = args.hopf_association_activation, #or ReLU, Tanh works better, SELU, GELU
norm_input = args.norm_input,
norm_asso = args.norm_asso,
hopf_n_layers= args.hopf_n_layers,
mol_encoder_layers=args.mol_encoder_layers,
temp_encoder_layers=args.temp_encoder_layers,
encoder_af=args.encoder_af,
hopf_pooling_operation_head = args.hopf_pooling_operation_head,
batch_size=args.batch_size,
)
print(hpn_config.__dict__)
if args.model_type=='segler': # baseline
clf = SeglerBaseline(hpn_config)
elif args.model_type=='mhn':
clf = MHN(hpn_config, layer2weight=args.layer2weight)
elif args.model_type=='fortunato': # pretraining with applicability-matrix
clf = SeglerBaseline(hpn_config)
elif args.model_type=='staticQK': # staticQK
clf = StaticQK(hpn_config)
elif args.model_type=='retrosim': # staticQK
clf = Retrosim(hpn_config)
else:
raise NotImplementedError
return clf, hpn_config
def set_up_template_encoder(args, clf, label_to_n_train_samples=None, template_list=None):
if isinstance(clf, SeglerBaseline):
clf.templates = []
elif args.model_type=='staticQK':
clf.template_list = list(template_list.values())
clf.update_template_embedding(which=args.template_fp_type, fp_size=args.fp_size, radius=args.fp_radius, njobs=args.njobs)
elif args.model_type=='retrosim':
#clf.template_list = list(X['train'].values())
clf.fit_with_train(X_fp['train'], y['train'])
else:
import hashlib
PATH = './data/cache/'
if not os.path.exists(PATH):
os.mkdir(PATH)
fn_templ_emb = f'{PATH}templ_emb_{args.fp_size}_{args.template_fp_type}{args.fp_radius}_{len(template_list)}_{int(hashlib.sha512((str(template_list)).encode()).hexdigest(), 16)}.npy'
if (os.path.exists(fn_templ_emb)): # load the template embedding
print(f'loading tfp from file {fn_templ_emb}')
templ_emb = np.load(fn_templ_emb)
# !!! beware of different fingerprint types
clf.template_list = list(template_list.values())
if args.only_templates_in_batch:
clf.templates_np = templ_emb
clf.templates = None
else:
clf.templates = torch.from_numpy(templ_emb).float().to(clf.config.device)
else:
if args.template_fp_type=='MxFP':
clf.template_list = list(template_list.values())
clf.templates = torch.from_numpy(comb_template_fp).float().to(clf.config.device)
clf.set_templates_recursively()
elif args.template_fp_type=='Tfidf':
clf.template_list = list(template_list.values())
clf.templates = torch.from_numpy(tfidf_template_fp).float().to(clf.config.device)
clf.set_templates_recursively()
elif args.template_fp_type=='random':
clf.template_list = list(template_list.values())
clf.templates = torch.from_numpy(np.random.rand(len(template_list),args.fp_size)).float().to(clf.config.device)
clf.set_templates_recursively()
else:
clf.set_templates(list(template_list.values()), which=args.template_fp_type, fp_size=args.fp_size,
radius=args.fp_radius, learnable=False, njobs=args.njobs, only_templates_in_batch=args.only_templates_in_batch)
#if len(template_list)<100000:
np.save(fn_templ_emb, clf.templates_np if args.only_templates_in_batch else clf.templates.detach().cpu().numpy().astype(np.float16))
# concatenate the current fingerprint with a random fingerprint for templates above the threshold
if (args.concat_rand_template_thresh != -1) & (args.repl_quotient>0):
REPLACE_FACTOR = int(args.repl_quotient) # default was 8
# fold the original fingerprint
pre_comp_templates = clf.templates_np if args.only_templates_in_batch else clf.templates.detach().cpu().numpy()
# mask of labels with at least concat_rand_template_thresh training samples
l_mask = np.array([label_to_n_train_samples[k]>=args.concat_rand_template_thresh for k in template_list])
print(f'Num of templates with added rand-vect of size {pre_comp_templates.shape[1]//REPLACE_FACTOR} due to >=thresh ({args.concat_rand_template_thresh}):',l_mask.sum())
# remove the bits with the lowest variance
v = pre_comp_templates.var(0)
idx_lowest_var_half = v.argsort()[:(pre_comp_templates.shape[1]//REPLACE_FACTOR)]
# the new zero-init-vectors
pre = np.zeros([pre_comp_templates.shape[0], pre_comp_templates.shape[1]//REPLACE_FACTOR]).astype(float)
print(pre.shape, l_mask.shape, l_mask.sum()) #(616, 1700) (11790,) 519
print(pre_comp_templates.shape, len(template_list)) #(616, 17000) 616
# only the ones with >thresh will receive a random vect
pre[l_mask] = np.random.rand(l_mask.sum(), pre.shape[1])
pre_comp_templates[:,idx_lowest_var_half] = pre
#clf.templates = torch.from_numpy(pre_comp_templates).float().to(clf.config.device)
if pre_comp_templates.shape[0]<100000:
print('adding template_matrix to params')
param = torch.nn.Parameter(torch.from_numpy(pre_comp_templates).float(), requires_grad=False)
clf.register_parameter(name='templates+noise', param=param)
clf.templates = param.to(clf.config.device)
clf.set_templates_recursively()
else: #otherwise might cause memory issues
print('more than 100k templates')
if args.only_templates_in_batch:
clf.templates = None
clf.templates_np = pre_comp_templates
else:
clf.templates = torch.from_numpy(pre_comp_templates).float()
clf.set_templates_recursively()
# sets this for the first layer only!
if args.template_fp_type2=='MxFP':
print('first_layer template_fingerprint is set to MxFP')
clf.templates = torch.from_numpy(comb_template_fp).float().to(clf.config.device)
elif args.template_fp_type2=='Tfidf':
print('first_layer template_fingerprint is set to Tfidf')
clf.templates = torch.from_numpy(tfidf_template_fp).float().to(clf.config.device)
elif args.template_fp_type2=='random':
print('first_layer template_fingerprint is set to random')
clf.templates = torch.from_numpy(np.random.rand(len(template_list),args.fp_size)).float().to(clf.config.device)
elif args.template_fp_type2=='stfp':
print('first_layer template_fingerprint is set to stfp ! only works with 4096 fp_size')
tfp = getTemplateFingerprint(list(template_list.values()))
clf.templates = torch.from_numpy(tfp).float().to(clf.config.device)
return clf
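# After set_up_template_encoder, the template embeddings typically live either on the model as a torch
# tensor (clf.templates) or, with --only_templates_in_batch, as a numpy array (clf.templates_np).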
if __name__ == '__main__':
args = parse_args()
run_id = str(time()).split('.')[0]
fn_postfix = str(args.exp_name) + '_' + run_id
if args.wandb:
import wandb
wandb.init(project='mhn-react', entity='phseidl', name=args.dataset_type+'_'+args.model_type+'_'+fn_postfix, config=args.__dict__)
else:
wandb=None
if not args.verbose:
disable_rdkit_logging()
if args.seed is not None:
from .utils import seed_everything
seed_everything(args.seed)
print('seeded with',args.seed)
# load csv or data
if args.csv_path is None:
X, y = load_USPTO(which=args.dataset_type)
template_list = load_templates(which=args.dataset_type)
else:
X, y, template_list, test_reactants_can = load_dataset_from_csv(**vars(args))
if args.addval2train:
print('adding val to train')
X['train'] = [*X['train'],*X['valid']]
y['train'] = np.concatenate([y['train'],y['valid']])
splits = ['train', 'valid', 'test']
# TODO: split this up into a separate class
if args.splitting_scheme == 'class-freq':
X_all = np.concatenate([X[split] for split in splits], axis=0)
y_all = np.concatenate([y[split] for split in splits])
# sort classes by frequency / assumes the class-index is ordered by frequency (which is mildly violated)
res = y_all.argsort()
# use same split proportions
cum_split_lens = np.cumsum([len(y[split]) for split in splits]) #cumulative split length
X['train'] = X_all[res[0:cum_split_lens[0]]]
y['train'] = y_all[res[0:cum_split_lens[0]]]
X['valid'] = X_all[res[cum_split_lens[0]:cum_split_lens[1]]]
y['valid'] = y_all[res[cum_split_lens[0]:cum_split_lens[1]]]
X['test'] = X_all[res[cum_split_lens[1]:]]
y['test'] = y_all[res[cum_split_lens[1]:]]
for split in splits:
print(split, y[split].shape[0], 'samples (', y[split].max(),'max label)')
if args.splitting_scheme == 'remove_once_in_train_and_not_in_test':
print('remove_once_in_train')
from collections import Counter
cc = Counter()
cc.update(y['train'])
classes_set_only_once_in_train = set(np.array(list(cc.keys()))[ (np.array(list(cc.values())))==1])
not_in_test = set(y['train']).union(y['valid']) - (set(y['test']))
classes_set_only_once_in_train = (classes_set_only_once_in_train.intersection(not_in_test))
remove_those_mask = np.array([yii in classes_set_only_once_in_train for yii in y['train']])
X['train'] = np.array(X['train'])[~remove_those_mask]
y['train'] = np.array(y['train'])[~remove_those_mask]
print(f'{remove_those_mask.mean()*100:.2f}% of train ({remove_those_mask.sum()} samples) removed')
if args.splitting_scheme == 'random':
print('random-splitting-scheme:8-1-1')
if args.ssretroeval:
print('ssretroeval not available')
raise NotImplementedError
import numpy as np
from sklearn.model_selection import train_test_split
def _unpack(lod):
r = []
for k,v in lod.items():
[r.append(i) for i in v]
return r
X_all = _unpack(X)
y_all = np.array( _unpack(y) )
X['train'], X['test'], y['train'], y['test'] = train_test_split(X_all, y_all, test_size=0.2, random_state=70135)
X['test'], X['valid'], y['test'], y['valid'] = train_test_split(X['test'], y['test'], test_size=0.5, random_state=70135)
zero_shot = set(y['test']).difference( set(y['train']).union(set(y['valid'])) )
zero_shot_mask = np.array([yi in zero_shot for yi in y['test']])
print('zero-shot test samples:', zero_shot_mask.sum())
#y['test'][zero_shot_mask] = list(zero_shot)[0] #not right but quick
if args.model_type=='staticQK' or args.model_type=='retrosim':
print("staticQK model: caution: use the 'pattern' or 'rdk' fingerprint embedding")
fp_size = args.fp_size
radius = args.fp_radius #quite important ;)
fp_embedding = args.fp_type
X_fp = featurize_smiles(X, fp_type=args.fp_type, fp_size=args.fp_size, fp_radius=args.fp_radius, njobs=args.njobs)
if args.template_fp_type=='MxFP' or (args.template_fp_type2=='MxFP'):
temp_part_to_fp = {}
for i in template_list:
tpl = template_list[i]
for part in str(tpl).split('>>'):
for p in str(part).split('.'):
temp_part_to_fp[p]=None
for i, k in enumerate(temp_part_to_fp):
temp_part_to_fp[k] = i
fp_types = ['Morgan2CBF','Morgan4CBF', 'Morgan6CBF','AtomPair','TopologicalTorsion', 'Pattern', 'RDK']
# MACCS and ErG don't work --> errors with explicit/implicit valence
templates_fp = {}
remaining = args.fp_size
for fp_type in fp_types:
#print(fp_type, end='\t')
# if it's the last one, use up the remaining fingerprint bits
te_feat = FP_featurizer(fp_types=fp_type,
max_features=(args.fp_size//len(fp_types)) if (fp_type != fp_types[-1]) else remaining,
log_scale=False
)
templates_fp[fp_type] = te_feat.fit(list(temp_part_to_fp.keys())[:], is_smarts=True)
#print(np.unique(templates_fp[fp_type]), end='\r')
remaining -= templates_fp[fp_type].shape[1]
templates_fp['fp'] = np.hstack([ templates_fp[f'{fp_type}'] for fp_type in fp_types])
if args.template_fp_type=='MxFP' or (args.template_fp_type2=='MxFP'):
comb_template_fp = compute_template_fp(fp_len= args.fp_size, reactant_pooling=args.reactant_pooling)
if args.template_fp_type=='Tfidf' or (args.template_fp_type2 == 'Tfidf'):
print('using tfidf template-fingerprint')
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = (list(template_list.values()))
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,12), max_features=args.fp_size)
tfidf_template_fp = vectorizer.fit_transform(corpus).toarray()
tfidf_template_fp.shape
actual_fp_size = X_fp['train'].shape[1]
if actual_fp_size != args.fp_size:
args.fp_size = int(actual_fp_size)
print('Warning: fp_size has changed to', actual_fp_size)
label_to_n_train_samples = {}
n_train_samples_to_label = defaultdict(list)
n_templates = max(template_list.keys())+1 #max(max(y['train']), max(y['test']), max(y['valid']))
for i in range(n_templates):
n_train_samples = (y['train']==i).sum()
label_to_n_train_samples[i] = n_train_samples
n_train_samples_to_label[n_train_samples].append(i)
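# Build test-set masks grouped by how many training examples each template (label) has:
# one mask per count 0..10, plus a '>10' mask and, further below, a '>49' mask.
# These drive the per-nte (number of training examples) top-k evaluation during training.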
up_to = 11
n_samples = []
masks = []
ntes = range(up_to)
mask_dict = {}
for nte in ntes: # Number of training examples
split = f'nte_{nte}'
#print(split)
mask = np.zeros(y['test'].shape)
if isinstance(nte, int):
for label_with_nte in n_train_samples_to_label[nte]:
mask += (y['test'] == label_with_nte)
mask = mask>=1
masks.append(mask)
mask_dict[str(nte)] = mask
n_samples.append(mask.sum())
# for greater than 10 # >10
n_samples.append((np.array(masks).max(0)==0).sum())
mask_dict['>10'] = (np.array(masks).max(0)==0)
sum(n_samples), mask.shape
ntes = range(50) #to 49
for nte in ntes: # Number of training examples
split = f'nte_{nte}'
#print(split)
mask = np.zeros(y['test'].shape)
for label_with_nte in n_train_samples_to_label[nte]:
mask += (y['test'] == label_with_nte)
mask = mask>=1
masks.append(mask)
# for greater than 49 # >49
n_samples.append((np.array(masks).max(0)==0).sum())
mask_dict['>49'] = np.array(masks).max(0)==0
print(n_samples)
clf, hpn_config = set_up_model(args, template_list=template_list)
clf = set_up_template_encoder(args, clf, label_to_n_train_samples=label_to_n_train_samples, template_list=template_list)
if args.verbose:
print(clf.config.__dict__)
print(clf)
wda = torch.optim.AdamW(clf.parameters(), lr=args.lr, weight_decay=1e-2)
if args.wandb:
wandb.watch(clf)
# pretraining with the applicability matrix, if applicable
if args.model_type == 'fortunato' or args.pretrain_epochs>1:
print('pretraining on applicability-matrix -- loading the matrix')
_, y_appl = load_USPTO(args.dataset_type, is_appl_matrix=True)
if args.splitting_scheme == 'remove_once_in_train_and_not_in_test':
y_appl['train'] = y_appl['train'][~remove_those_mask]
# spot-check random samples that the applicability matrix is consistent with y (assert currently disabled)
splt = 'train'
for i in range(500):
i = np.random.randint(len(y[splt]))
#assert ( y_appl[splt][i].indices == y[splt][i] ).sum()==1
print('pre-training (BCE-loss)')
for epoch in range(args.pretrain_epochs):
clf.train_from_np(X_fp['train'], X_fp['train'], y_appl['train'], use_dataloader=True, is_smiles=False,
epochs=1, wandb=wandb, verbose=args.verbose, bs=args.batch_size,
permute_batches=True, shuffle=True, optimizer=wda,
only_templates_in_batch=args.only_templates_in_batch)
y_pred = clf.evaluate(X_fp['valid'], X_fp['valid'], y_appl['valid'],
split='pretrain_valid', is_smiles=False, only_loss=True,
bs=args.batch_size,wandb=wandb)
appl_acc = ((y_appl['valid'].toarray()) == (y_pred>0.5)).mean()
print(f'{epoch:2.0f} -- train_loss: {clf.hist["loss"][-1]:1.3f}, loss_valid: {clf.hist["loss_pretrain_valid"][-1]:1.3f}, appl_acc_valid: {appl_acc:1.5f}')
fn_hist = None
y_preds = None
for epoch in range(round(args.epochs / args.eval_every_n_epochs)):
if not isinstance(clf, StaticQK):
now = time()
clf.train_from_np(X_fp['train'], X_fp['train'], y['train'], use_dataloader=True, is_smiles=False,
epochs=args.eval_every_n_epochs, wandb=wandb, verbose=args.verbose, bs=args.batch_size,
permute_batches=True, shuffle=True, optimizer=wda, only_templates_in_batch=args.only_templates_in_batch)
if args.verbose: print(f'training took {(time()-now)/60:3.1f} min for {args.eval_every_n_epochs} epochs')
for split in ['valid', 'test']:
print(split, 'evaluating', end='\r')
now = time()
#only_loss = ((epoch%5)==4) if args.dataset_type=='lg' else True
y_preds = clf.evaluate(X_fp[split], X_fp[split], y[split], is_smiles=False, split=split, bs=args.batch_size, only_loss=args.eval_only_loss, wandb=wandb);
if args.verbose: print(f'eval {split} took',(time()-now)/60,'min')
if not isinstance(clf, StaticQK):
try:
print(f'{epoch:2.0f} -- train_loss: {clf.hist["loss"][-1]:1.3f}, loss_valid: {clf.hist["loss_valid"][-1]:1.3f}, val_t1acc: {clf.hist["t1_acc_valid"][-1]:1.3f}, val_t100acc: {clf.hist["t100_acc_valid"][-1]:1.3f}')
except:
pass
now = time()
ks = [1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
for nte in mask_dict: # Number of training examples
split = f'nte_{nte}'
#print(split)
mask = mask_dict[nte]
topkacc = top_k_accuracy(np.array(y['test'])[mask], y_preds[mask, :], k=ks, ret_arocc=False)
new_hist = {}
for k, tkacc in zip(ks, topkacc):
new_hist[f't{k}_acc_{split}'] = tkacc
#new_hist[(f'arocc_{split}')] = (arocc)
new_hist[f'steps_{split}'] = (clf.steps)
for k in new_hist:
clf.hist[k].append(new_hist[k])
if args.verbose: print('eval nte-test took', (time()-now)/60, 'min')
fn_hist = clf.save_hist(prefix=f'USPTO_{args.dataset_type}_{args.model_type}_', postfix=fn_postfix)
if args.save_preds:
PATH = './data/preds/'
if not os.path.exists(PATH):
os.mkdir(PATH)
pred_fn = f'{PATH}USPTO_{args.dataset_type}_test_{args.model_type}_{fn_postfix}.npy'
print('saving predictions to',pred_fn)
np.save(pred_fn,y_preds)
args.save_preds = pred_fn
if args.save_model:
model_save_path = clf.save_model(prefix=f'USPTO_{args.dataset_type}_{args.model_type}_valloss{clf.hist.get("loss_valid",[-1])[-1]:1.3f}_',name_as_conf=False, postfix=fn_postfix)
# Serialize data into file:
import json
json.dump( args.__dict__, open( f"data/model/{fn_postfix}_args.json", 'w' ) )
json.dump( hpn_config.__dict__,
open( f"data/model/{fn_postfix}_config.json", 'w' ) )
print('model saved to', model_save_path)
print(min(clf.hist.get('loss_valid',[-1])))
if args.plot_res:
from plotutils import plot_topk, plot_nte
plt.figure()
clf.plot_loss()
plt.draw()
plt.figure()
plot_topk(clf.hist, sets=['valid'])
if args.dataset_type=='sm':
baseline_val_res = {1:0.4061, 10:0.6827, 50: 0.7883, 100:0.8400}
plt.plot(list(baseline_val_res.keys()), list(baseline_val_res.values()), 'k.--')
plt.draw()
plt.figure()
best_cpt = np.array(clf.hist['loss_valid'])[::-1].argmin()+1
print(best_cpt)
try:
best_cpt = np.array(clf.hist['t10_acc_valid'])[::-1].argmax()+1
print(best_cpt)
except:
print('err with t10_acc_valid')
plot_nte(clf.hist, dataset=args.dataset_type.capitalize(), last_cpt=best_cpt, include_bar=True, model_legend=args.exp_name,
n_samples=n_samples, z=1.96)
if os.path.exists('data/figs/'):
try:
os.mkdir(f'data/figs/{args.exp_name}/')
except:
pass
plt.savefig(f'data/figs/{args.exp_name}/training_examples_vs_top100_acc_{args.dataset_type}_{hash(str(args))}.pdf')
plt.draw()
fn_hist = clf.save_hist(prefix=f'USPTO_{args.dataset_type}_{args.model_type}_', postfix=fn_postfix)
if args.ssretroeval:
print('testing on the real test set ;)')
from .data import load_templates
from .retroeval import run_templates, topkaccuracy
from .utils import sort_by_template_and_flatten
a = list(template_list.keys())
#assert list(range(len(a))) == a
templates = list(template_list.values())
#templates = [*templates, *expert_templates]
template_product_smarts = [str(s).split('>')[0] for s in templates]
# execute all templates
print('execute all templates')
test_product_smarts = [xi[0] for xi in X['test']] #added later
smarts2appl = memory.cache(smarts2appl, ignore=['njobs','nsplits', 'use_tqdm'])
appl = smarts2appl(test_product_smarts, template_product_smarts, njobs=args.njobs)
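# appl appears to be a pair of index arrays (product_idx, template_idx) listing the applicable
# (product, template) pairs; it is consumed that way below via appl[0] and appl[1].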
n_pairs = len(test_product_smarts) * len(template_product_smarts)
n_appl = len(appl[0])
print(n_pairs, n_appl, n_appl/n_pairs)
#forward
split = 'test'
print('len(X_fp[test]):',len(X_fp[split]))
y[split] = np.zeros(len(X[split])).astype(int)
clf.eval()
if y_preds is None:
y_preds = clf.evaluate(X_fp[split], X_fp[split], y[split], is_smiles=False,
split='ttest', bs=args.batch_size, only_loss=True, wandb=None);
template_scores = y_preds # this should already be the test split
####
if y_preds.shape[1]>100000:
kth = 200
print(f'only evaluating top {kth} applicable predicted templates')
# only take top kth and multiply by applicability matrix
appl_mtrx = np.zeros_like(y_preds, dtype=bool)
appl_mtrx[appl[0], appl[1]] = 1
appl_and_topkth = ([], [])
for row in range(len(y_preds)):
argpreds = (np.argpartition(-(y_preds[row]*appl_mtrx[row]), kth, axis=0)[:kth])
# if there are fewer than kth applicable templates
mask = appl_mtrx[row][argpreds]
argpreds = argpreds[mask]
#if len(argpreds)!=kth:
# print('changed to ', len(argpreds))
appl_and_topkth[0].extend([row for _ in range(len(argpreds))])
appl_and_topkth[1].extend(list(argpreds))
appl = appl_and_topkth
####
print('running the templates')
run_templates = run_templates # memory.cache() not applied here; results are already cached to tmp
prod_idx_reactants, prod_temp_reactants = run_templates(test_product_smarts, templates, appl, njobs=args.njobs)
#sorted_results = sort_by_template(template_scores, prod_idx_reactants)
#flat_results = flatten_per_product(sorted_results, remove_duplicates=True)
# now agglomerates over the same outcome
flat_results = sort_by_template_and_flatten(y_preds, prod_idx_reactants, agglo_fun=sum)
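# flat_results: per test product, a ranked list of predicted reactant sets, where the scores of
# templates that yield the same outcome are aggregated (agglo_fun=sum) before ranking.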
accs = topkaccuracy(test_reactants_can, flat_results, [*list(range(1,101)), 100000])
mtrcs2 = {f't{k}acc_ttest':accs[k-1] for k in [1,2,3,5,10,20,50,100,101]}
if wandb:
wandb.log(mtrcs2)
print('Single-step retrosynthesis-evaluation, results on ttest:')
#print([k[:-6]+'|' for k in mtrcs2.keys()])
[print(k[:-6],end='\t') for k in mtrcs2.keys()]
print()
for k,v in mtrcs2.items():
print(f'{v*100:2.2f}',end='\t')
# save the history of this experiment
EXP_DIR = 'data/experiments/'
df = pd.DataFrame([args.__dict__])
df['min_loss_valid'] = min(clf.hist.get('loss_valid', [-1]))
df['min_loss_train'] = 0 if ((args.model_type=='staticQK') or (args.model_type=='retrosim')) else min(clf.hist.get('loss',[-1]))
try:
df['max_t1_acc_valid'] = max(clf.hist.get('t1_acc_valid', [0]))
df['max_t100_acc_valid'] = max(clf.hist.get('t100_acc_valid', [0]))
except:
pass
df['hist'] = [clf.hist]
df['n_samples'] = [n_samples]
df['fn_hist'] = fn_hist if fn_hist else None
df['fn_model'] = '' if not args.save_model else model_save_path
df['date'] = str(datetime.datetime.fromtimestamp(time()))
df['cmd'] = ' '.join(sys.argv[:])
if not os.path.exists(EXP_DIR):
os.mkdir(EXP_DIR)
df.to_csv(f'{EXP_DIR}{run_id}.tsv', sep='\t')
df