"""Gradio demo that proposes penalized-logP-improving molecules with MolGen.

The user supplies a molecule as a SELFIES string. The model samples candidate
SELFIES strings, each candidate is scored against the input (penalized logP
improvement and Tanimoto similarity), and only candidates that improve the
score while staying similar to the input are returned.
"""
import functools
import os
import sys

import gradio as gr
import pandas as pd
import selfies as sf
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import RDConfig
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# NOTE(review): plogp, sf_decode and sim are defined locally below; presumably
# they were inlined from this module -- confirm before re-enabling the import.
#from src.utils import plogp, sf_decode, sim

# `sascorer` lives in RDKit's Contrib directory, so that directory must be on
# sys.path before the import below can succeed.
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
import sascorer  # noqa: E402

MODEL_NAME = "zjunlp/MolGen-large-opt"

# Sentinel score returned by plogp() when the SMILES is empty or unparsable.
INVALID_PLOGP = -100


def get_largest_ring_size(mol):
    """Return the atom count of the largest ring in *mol*, or 0 if acyclic."""
    return max((len(ring) for ring in mol.GetRingInfo().AtomRings()), default=0)


def plogp(smile):
    """Return the penalized logP of a SMILES string.

    The score is ``logP - SA score - ring penalty`` where the ring penalty is
    the number of atoms by which the largest ring exceeds six (0 otherwise).

    Returns ``INVALID_PLOGP`` (-100) when *smile* is empty or cannot be parsed
    by RDKit.
    """
    if not smile:
        return INVALID_PLOGP
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        return INVALID_PLOGP

    log_p = Descriptors.MolLogP(mol)
    sas_score = sascorer.calculateScore(mol)
    cycle_score = max(get_largest_ring_size(mol) - 6, 0)
    # No truthiness check on the components: a logP of 0.0 or an acyclic
    # molecule (largest ring size 0) is a valid input, not a failure.
    return log_p - sas_score - cycle_score


def sf_decode(selfies):
    """Decode a SELFIES string to SMILES; return '' if decoding fails."""
    try:
        return sf.decoder(selfies)
    except sf.DecoderError:
        return ''


def sim(input_smile, output_smile):
    """Return the Tanimoto similarity of two SMILES strings.

    Similarity is computed on radius-2 Morgan fingerprints. Returns ``None``
    when either SMILES is empty or cannot be parsed by RDKit.
    """
    if not (input_smile and output_smile):
        return None
    input_mol = Chem.MolFromSmiles(input_smile)
    output_mol = Chem.MolFromSmiles(output_smile)
    if input_mol is None or output_mol is None:
        return None

    input_fp = AllChem.GetMorganFingerprint(input_mol, 2)
    output_fp = AllChem.GetMorganFingerprint(output_mol, 2)
    return DataStructs.TanimotoSimilarity(input_fp, output_fp)


@functools.lru_cache(maxsize=1)
def _load_model():
    """Load the tokenizer and model once and reuse them for every request."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    return tokenizer, model


def greet(name):
    """Generate improved candidate molecules for the SELFIES string *name*.

    Samples 10 sequences from the model, de-duplicates them, and returns a
    DataFrame with columns ``candidates`` (SELFIES), ``improvement``
    (candidate plogp minus input plogp) and ``sim`` (Tanimoto similarity to
    the input). Only rows with ``improvement > 0`` and ``sim > 0.4`` are kept.
    """
    tokenizer, model = _load_model()

    sf_input = tokenizer(name, return_tensors="pt")
    molecules = model.generate(
        input_ids=sf_input["input_ids"],
        attention_mask=sf_input["attention_mask"],
        do_sample=True,
        max_length=100,
        min_length=5,
        top_k=30,
        top_p=1,
        num_return_sequences=10,
    )
    decoded = [
        tokenizer.decode(
            g, skip_special_tokens=True, clean_up_tokenization_spaces=True
        ).replace(" ", "")
        for g in molecules
    ]
    # De-duplicate while keeping generation order (a set would shuffle rows).
    sf_output = list(dict.fromkeys(decoded))

    input_sm = sf_decode(name)
    sm_output = [sf_decode(candidate) for candidate in sf_output]
    input_plogp = plogp(input_sm)

    data = pd.DataFrame({
        "candidates": sf_output,
        "improvement": [plogp(smi) - input_plogp for smi in sm_output],
        # sim() returns None for unparsable molecules; forcing float64 turns
        # those into NaN so the comparison below yields False instead of
        # raising TypeError on an all-None object column.
        "sim": pd.Series(
            [sim(smi, input_sm) for smi in sm_output], dtype="float64"
        ),
    })
    return data[(data['improvement'] > 0) & (data['sim'] > 0.4)]


examples = [
    ['[C][C][=Branch1][C][=O][N][C][C][O][C][C][O][C][C][O][C][C][Ring1][N]'],
    ['[C][C][S][C][C][S][C][C][C][S][C][C][S][C][Ring1][=C]'],
]

iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="numpy",
    title="Molecular Language Model as Multi-task Generator",
    examples=examples,
)
iface.launch()