ZJU-Fangyin committed on
Commit
514d010
•
1 Parent(s): 095dcae

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -0
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
+ #from src.utils import plogp, sf_decode, sim
4
+ import pandas as pd
5
+ from rdkit import Chem
6
+ from rdkit.Chem import AllChem
7
+ from rdkit import DataStructs
8
+ from rdkit.Chem import Descriptors
9
+ import selfies as sf
10
+ from rdkit.Chem import RDConfig
11
+ import os
12
+ import sys
13
+ sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
14
+ import sascorer
15
+
16
def get_largest_ring_size(mol):
    """Return the number of atoms in the largest ring of *mol*.

    Args:
        mol: An object exposing ``GetRingInfo().AtomRings()`` (an RDKit
            molecule in this file), where each ring is a sequence of atom
            indices.

    Returns:
        The length of the longest ring, or 0 when the molecule has no rings.
    """
    atom_rings = mol.GetRingInfo().AtomRings()
    # ``default=0`` covers the acyclic case without a separate branch.
    return max((len(ring) for ring in atom_rings), default=0)
23
+
24
def plogp(smile):
    """Return the penalized logP of a SMILES string.

    The score is ``logP - SA score - max(largest ring size - 6, 0)``.

    Args:
        smile: SMILES string of the molecule; may be empty or ``None``.

    Returns:
        The penalized logP as a float, or the sentinel ``-100`` when *smile*
        is empty or cannot be parsed into a molecule.
    """
    if not smile:
        return -100

    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        return -100

    log_p = Descriptors.MolLogP(mol)
    sas_score = sascorer.calculateScore(mol)
    # Only rings with more than six atoms are penalized.
    cycle_score = max(get_largest_ring_size(mol) - 6, 0)

    # NOTE: the score is computed unconditionally. The previous truthiness
    # check (`if log_p and sas_score and largest_ring_size`) returned the
    # -100 sentinel for every acyclic molecule (ring size 0) and for any
    # molecule whose logP is exactly 0.0, although both are valid values.
    return log_p - sas_score - cycle_score
41
+
42
def sf_decode(selfies):
    """Translate a SELFIES string with ``sf.decoder``.

    Args:
        selfies: The SELFIES string to decode.

    Returns:
        Whatever ``sf.decoder`` returns for *selfies*, or an empty string
        when the decoder raises ``sf.DecoderError``.
    """
    try:
        return sf.decoder(selfies)
    except sf.DecoderError:
        # Callers treat '' as "no valid molecule".
        return ''
48
+
49
def sim(input_smile, output_smile):
    """Return the Tanimoto similarity between two molecules.

    Both SMILES strings are parsed with RDKit, converted to Morgan
    fingerprints (second argument 2) and compared with
    ``DataStructs.TanimotoSimilarity``.

    Args:
        input_smile: SMILES string of the reference molecule.
        output_smile: SMILES string of the molecule to compare.

    Returns:
        The similarity value, or ``None`` when either string is empty or
        cannot be parsed into a molecule.
    """
    if not (input_smile and output_smile):
        return None

    input_mol = Chem.MolFromSmiles(input_smile)
    output_mol = Chem.MolFromSmiles(output_smile)
    if input_mol is None or output_mol is None:
        return None

    input_fp = AllChem.GetMorganFingerprint(input_mol, 2)
    output_fp = AllChem.GetMorganFingerprint(output_mol, 2)
    # Returned directly: a local named `sim` would shadow this function.
    return DataStructs.TanimotoSimilarity(input_fp, output_fp)
60
+
61
+
62
_MOLGEN_CHECKPOINT = "zjunlp/MolGen-large-opt"
# Holds the (tokenizer, model) pair after the first request so the checkpoint
# is not reloaded on every call.
_MOLGEN_CACHE = {}


def _load_molgen():
    """Return the cached (tokenizer, model) pair, loading it on first use."""
    if "pair" not in _MOLGEN_CACHE:
        _MOLGEN_CACHE["pair"] = (
            AutoTokenizer.from_pretrained(_MOLGEN_CHECKPOINT),
            AutoModelForSeq2SeqLM.from_pretrained(_MOLGEN_CHECKPOINT),
        )
    return _MOLGEN_CACHE["pair"]


def greet(name):
    """Generate candidate molecules for an input SELFIES string.

    The input is tokenized and 10 sequences are sampled from the model
    (``top_k=30``, ``top_p=1``, length 5-100). Duplicates are removed, each
    candidate is decoded with ``sf_decode`` and scored against the input by
    penalized-logP improvement and by similarity.

    Args:
        name: The input molecule as a SELFIES string.

    Returns:
        A pandas DataFrame with columns ``candidates``, ``improvement`` and
        ``sim``, restricted to rows with ``improvement > 0`` and
        ``sim > 0.4``.
    """
    tokenizer, model = _load_molgen()

    sf_input = tokenizer(name, return_tensors="pt")
    molecules = model.generate(
        input_ids=sf_input["input_ids"],
        attention_mask=sf_input["attention_mask"],
        do_sample=True,
        max_length=100,
        min_length=5,
        top_k=30,
        top_p=1,
        num_return_sequences=10,
    )
    decoded = [
        tokenizer.decode(
            g, skip_special_tokens=True, clean_up_tokenization_spaces=True
        ).replace(" ", "")
        for g in molecules
    ]
    # De-duplicate while keeping generation order (a set would shuffle rows).
    sf_output = list(dict.fromkeys(decoded))

    input_sm = sf_decode(name)
    sm_output = [sf_decode(candidate) for candidate in sf_output]

    input_plogp = plogp(input_sm)
    plogp_improve = [plogp(smiles) - input_plogp for smiles in sm_output]

    # sim() yields None when a molecule cannot be parsed. Store NaN instead so
    # the column stays numeric even if every entry is missing; otherwise the
    # `> 0.4` comparison below raises TypeError on an all-None column.
    similarities = (sim(smiles, input_sm) for smiles in sm_output)
    simm = [float("nan") if value is None else value for value in similarities]

    data = pd.DataFrame(
        {"candidates": sf_output, "improvement": plogp_improve, "sim": simm}
    )

    return data[(data['improvement'] > 0) & (data['sim'] > 0.4)]
97
+
98
+
99
+
100
+
101
+
102
+
103
# Example SELFIES inputs passed to the interface; each row holds the single
# text input that `greet` expects.
examples = [
    ['[C][C][=Branch1][C][=O][N][C][C][O][C][C][O][C][C][O][C][C][Ring1][N]'],
    ['[C][C][S][C][C][S][C][C][C][S][C][C][S][C][Ring1][=C]'],
]

# Build the Gradio UI around `greet` and start serving it.
iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="numpy",
    title="Molecular Language Model as Multi-task Generator",
    examples=examples,
)
iface.launch()