Spaces:
Running
Running
Commit
·
0095ae9
1
Parent(s):
b3dd269
synthesizable
Browse files
app.py
CHANGED
@@ -17,22 +17,98 @@ from rdkit import Chem
|
|
17 |
class PeptideAnalyzer:
|
18 |
def __init__(self):
|
19 |
self.bond_patterns = [
|
20 |
-
(r'OC\(=O\)', 'ester'), # Ester bond
|
21 |
(r'N\(C\)C\(=O\)', 'n_methyl'), # N-methylated peptide bond
|
22 |
(r'N[0-9]C\(=O\)', 'proline'), # Proline peptide bond
|
23 |
(r'NC\(=O\)', 'peptide'), # Standard peptide bond
|
24 |
(r'C\(=O\)N\(C\)', 'n_methyl_reverse'), # Reverse N-methylated
|
25 |
(r'C\(=O\)N[12]?', 'peptide_reverse') # Reverse peptide bond
|
26 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
# Three to one letter code mapping
|
28 |
self.three_to_one = {
|
29 |
'Ala': 'A', 'Cys': 'C', 'Asp': 'D', 'Glu': 'E',
|
30 |
'Phe': 'F', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
|
31 |
'Lys': 'K', 'Leu': 'L', 'Met': 'M', 'Asn': 'N',
|
32 |
'Pro': 'P', 'Gln': 'Q', 'Arg': 'R', 'Ser': 'S',
|
33 |
-
'Thr': 'T', 'Val': 'V', 'Trp': 'W', 'Tyr': 'Y'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
|
|
|
|
|
|
|
|
36 |
def is_peptide(self, smiles):
|
37 |
"""Check if the SMILES represents a peptide structure"""
|
38 |
mol = Chem.MolFromSmiles(smiles)
|
@@ -73,14 +149,34 @@ class PeptideAnalyzer:
|
|
73 |
is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
|
74 |
return is_cyclic, peptide_cycles, aromatic_cycles
|
75 |
|
76 |
-
def split_on_bonds(self, smiles):
|
|
|
77 |
positions = []
|
78 |
used = set()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
# Find Gly pattern first
|
80 |
gly_pattern = r'NCC\(=O\)'
|
81 |
for match in re.finditer(gly_pattern, smiles):
|
82 |
if not any(p in range(match.start(), match.end()) for p in used):
|
83 |
-
|
84 |
'start': match.start(),
|
85 |
'end': match.end(),
|
86 |
'type': 'gly',
|
@@ -91,56 +187,70 @@ class PeptideAnalyzer:
|
|
91 |
for pattern, bond_type in self.bond_patterns:
|
92 |
for match in re.finditer(pattern, smiles):
|
93 |
if not any(p in range(match.start(), match.end()) for p in used):
|
94 |
-
|
95 |
'start': match.start(),
|
96 |
'end': match.end(),
|
97 |
'type': bond_type,
|
98 |
'pattern': match.group()
|
99 |
})
|
100 |
used.update(range(match.start(), match.end()))
|
101 |
-
|
102 |
-
|
103 |
-
|
|
|
|
|
|
|
104 |
|
105 |
# Create segments
|
106 |
segments = []
|
107 |
-
|
108 |
-
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
segments.append({
|
111 |
-
'content':
|
112 |
-
'
|
|
|
|
|
113 |
})
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
segments.append({
|
125 |
-
'content':
|
126 |
-
'bond_before': '
|
127 |
-
'bond_after': next_pos['pattern']
|
128 |
})
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
segments.append({
|
133 |
-
'content': content,
|
134 |
-
'bond_before': current['pattern'],
|
135 |
-
'bond_after': next_pos['pattern']
|
136 |
-
})
|
137 |
-
|
138 |
-
# Last segment
|
139 |
-
if positions[-1]['end'] < len(smiles):
|
140 |
segments.append({
|
141 |
-
'content':
|
142 |
-
'bond_before':
|
|
|
143 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
return segments
|
145 |
|
146 |
def clean_terminal_carboxyl(self, segment):
|
@@ -164,9 +274,23 @@ class PeptideAnalyzer:
|
|
164 |
def identify_residue(self, segment):
|
165 |
"""Identify residue with Pro reconstruction"""
|
166 |
# Only clean terminal carboxyl if this is the last segment
|
|
|
|
|
167 |
content = self.clean_terminal_carboxyl(segment)
|
168 |
mods = self.get_modifications(segment)
|
169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
# Proline (P) - flexible ring numbers
|
171 |
if any([
|
172 |
# Check for any ring number in bond patterns
|
@@ -174,8 +298,8 @@ class PeptideAnalyzer:
|
|
174 |
any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
|
175 |
for n in '123456789'
|
176 |
]) or any([(segment.get('bond_before', '').startswith(f'C(=O)N{n}') and 'CCC' in content and
|
177 |
-
|
178 |
-
|
179 |
]) or any([
|
180 |
# Check ending patterns with any ring number
|
181 |
(f'CCCN{n}' in content and content.endswith('=O') and
|
@@ -192,430 +316,226 @@ class PeptideAnalyzer:
|
|
192 |
]):
|
193 |
return 'Pro', mods
|
194 |
|
195 |
-
#
|
|
|
|
|
|
|
|
|
196 |
if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
|
197 |
'c[nH]c' in content.replace(' ', ''):
|
|
|
|
|
198 |
return 'Trp', mods
|
199 |
|
200 |
-
# Lysine (K)
|
201 |
if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
|
|
|
|
|
202 |
return 'Lys', mods
|
203 |
-
|
204 |
-
# Arginine (R)
|
205 |
if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
|
|
|
|
|
206 |
return 'Arg', mods
|
207 |
|
208 |
-
if
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
elif segment.get('bond_before') and segment.get('bond_before').startswith('C(=O)N'):
|
213 |
return 'Gly', mods
|
214 |
-
|
|
|
215 |
if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content or '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content or (('N[C@H](CCC(C)C)' in content or 'N[C@@H](CCC(C)C)' in content) and segment.get('bond_before') is None):
|
|
|
|
|
216 |
return 'Leu', mods
|
217 |
|
218 |
-
|
|
|
|
|
|
|
|
|
219 |
return 'Thr', mods
|
|
|
|
|
|
|
|
|
|
|
220 |
if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(Cc\d+ccccc\d+\)', content):
|
|
|
|
|
221 |
return 'Phe', mods
|
222 |
|
223 |
-
if ('[C
|
224 |
-
'[C@@H](C(C)C)' in content or
|
225 |
-
'[C@H]C(C)C' in content
|
226 |
-
'[C@@H]C(C)C' in content
|
227 |
-
):
|
228 |
-
if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]']): # Still check not Leu
|
229 |
-
return 'Val', mods
|
230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
if any([
|
232 |
-
'CC[C
|
233 |
-
'CC[C@@H](C)' in content,
|
234 |
'[C@@H](CC)C' in content,
|
235 |
-
'[C@H](CC)C' in content,
|
236 |
-
'C(C)C[C@H]' in content and 'CC(C)C' not in content,
|
237 |
'C(C)C[C@@H]' in content and 'CC(C)C' not in content
|
238 |
]):
|
|
|
|
|
|
|
|
|
|
|
|
|
239 |
return 'Ile', mods
|
240 |
|
|
|
241 |
if ('[C@H](C)' in content or '[C@@H](C)' in content):
|
242 |
if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
|
|
|
|
|
243 |
return 'Ala', mods
|
244 |
|
245 |
-
# Tyrosine (
|
246 |
if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
|
|
|
|
|
247 |
return 'Tyr', mods
|
248 |
|
249 |
-
# Serine (
|
250 |
if '[C@H](CO)' in content or '[C@@H](CO)' in content:
|
251 |
if not ('C(C)O' in content or 'COC' in content):
|
|
|
|
|
252 |
return 'Ser', mods
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
259 |
if '[C@H](CS)' in content or '[C@@H](CS)' in content:
|
|
|
|
|
260 |
return 'Cys', mods
|
261 |
|
262 |
-
# Methionine (
|
263 |
-
if ('CCSC' in content):
|
|
|
|
|
|
|
|
|
264 |
return 'Met', mods
|
265 |
-
|
266 |
-
# Glutamine (
|
267 |
if (content == '[C@@H](CC' or content == '[C@H](CC' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CCC(=O)N' in content) or ('CCC(N)=O' in content):
|
|
|
|
|
268 |
return 'Gln', mods
|
269 |
-
|
|
|
270 |
if (content == '[C@@H](C' or content == '[C@H](C' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CC(=O)N' in content) or ('CCN(=O)' in content) or ('CC(N)=O' in content):
|
|
|
|
|
271 |
return 'Asn', mods
|
272 |
|
273 |
-
# Glutamic acid (
|
274 |
if ('CCC(=O)O' in content):
|
275 |
-
|
276 |
-
|
|
|
|
|
|
|
277 |
if ('CC(=O)O' in content):
|
|
|
|
|
278 |
return 'Asp', mods
|
279 |
|
280 |
-
# Arginine (Arg) - 3-guanidinopropyl side chain
|
281 |
-
if ('CCCNC(=N)N' in content):
|
282 |
-
return 'Arg', mods
|
283 |
-
|
284 |
-
# Histidine (His) - Imidazole side chain
|
285 |
if re.search(r'Cc\d+c\[nH\]cn\d+', content) or re.search(r'Cc\d+cnc\[nH\]\d+', content):
|
|
|
|
|
286 |
return 'His', mods
|
|
|
|
|
287 |
|
288 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
289 |
|
290 |
-
|
291 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
292 |
|
293 |
-
if re.search(r'c\d+ccccc\d+', content):
|
294 |
-
if '[C@@H](c1ccccc1)' in content or '[C@H](c1ccccc1)' in content:
|
295 |
-
return '4', mods # Base phenylglycine
|
296 |
-
if ('C[C@H](CCCC)' in content or 'C[C@@H](CCCC)' in content) and 'CC(C)' not in content:
|
297 |
-
return 'Nle', mods
|
298 |
-
|
299 |
-
# Ornithine (Orn) - 3-carbon chain with NH2
|
300 |
-
if ('C[C@H](CCCN)' in content or 'C[C@@H](CCCN)' in content) and 'CC(C)' not in content:
|
301 |
-
return 'Orn', mods
|
302 |
-
|
303 |
-
# 2-Naphthylalanine (2Nal)
|
304 |
-
if ('Cc3cc2ccccc2c3' in content):
|
305 |
-
return '2Nal', mods
|
306 |
-
|
307 |
-
# Cyclohexylalanine (Cha)
|
308 |
-
if 'N2CCCCC2' in content or 'CCCCC2' in content:
|
309 |
-
return 'Cha', mods
|
310 |
-
|
311 |
-
# Aminobutyric acid (Abu) - 2-carbon chain
|
312 |
-
if ('C[C@H](CC)' in content or 'C[C@@H](CC)' in content) and not any(p in content for p in ['CC(C)', 'CCCC', 'CCC(C)']):
|
313 |
-
return 'Abu', mods
|
314 |
-
|
315 |
-
# Pipecolic acid (Pip)
|
316 |
-
if ('N3CCCCC3' in content or 'CCCCC3' in content):
|
317 |
-
return 'Pip', mods
|
318 |
-
|
319 |
-
# Cyclohexylglycine (Chg) - direct cyclohexyl without CH2
|
320 |
-
if ('C[C@H](C1CCCCC1)' in content or 'C[C@@H](C1CCCCC1)' in content):
|
321 |
-
return 'Chg', mods
|
322 |
-
|
323 |
-
# 4-Fluorophenylalanine (4F-Phe)
|
324 |
-
if ('Cc2ccc(F)cc2' in content):
|
325 |
-
return '4F-Phe', mods
|
326 |
-
|
327 |
-
# 4-substituted phenylalanines
|
328 |
-
if 'Cc1ccc' in content:
|
329 |
-
if 'OMe' in content or 'OCc1ccc' in content:
|
330 |
-
return '0A1', mods # 4-methoxy-Phenylalanine
|
331 |
-
elif 'Clc1ccc' in content:
|
332 |
-
return '200', mods # 4-chloro-Phenylalanine
|
333 |
-
elif 'Brc1ccc' in content:
|
334 |
-
return '4BF', mods # 4-Bromo-phenylalanine
|
335 |
-
elif 'C#Nc1ccc' in content:
|
336 |
-
return '4CF', mods # 4-cyano-phenylalanine
|
337 |
-
elif 'Ic1ccc' in content:
|
338 |
-
return 'PHI', mods # 4-Iodo-phenylalanine
|
339 |
-
elif 'Fc1ccc' in content:
|
340 |
-
return 'PFF', mods # 4-Fluoro-phenylalanine
|
341 |
-
|
342 |
-
# Modified tryptophans
|
343 |
-
if 'c[nH]c2' in content:
|
344 |
-
if 'Oc2cccc2' in content:
|
345 |
-
return '0AF', mods # 7-hydroxy-tryptophan
|
346 |
-
elif 'Fc2cccc2' in content:
|
347 |
-
return '4FW', mods # 4-fluoro-tryptophan
|
348 |
-
elif 'Clc2cccc2' in content:
|
349 |
-
return '6CW', mods # 6-chloro-tryptophan
|
350 |
-
elif 'Brc2cccc2' in content:
|
351 |
-
return 'BTR', mods # 6-bromo-tryptophan
|
352 |
-
elif 'COc2cccc2' in content:
|
353 |
-
return 'MOT5', mods # 5-Methoxy-tryptophan
|
354 |
-
elif 'Cc2cccc2' in content:
|
355 |
-
return 'MTR5', mods # 5-Methyl-tryptophan
|
356 |
-
|
357 |
-
# Special amino acids
|
358 |
-
if 'CC(C)(C)[C@@H]' in content or 'CC(C)(C)[C@H]' in content:
|
359 |
-
return 'BUG', mods # Tertleucine
|
360 |
-
|
361 |
-
if 'CCCNC(=N)N' in content:
|
362 |
-
return 'CIR', mods # Citrulline
|
363 |
-
|
364 |
-
if '[SeH]' in content:
|
365 |
-
return 'CSE', mods # Selenocysteine
|
366 |
-
|
367 |
-
if '[NH3]CC[C@@H]' in content or '[NH3]CC[C@H]' in content:
|
368 |
-
return 'DAB', mods # Diaminobutyric acid
|
369 |
-
|
370 |
-
if 'C1CCCCC1' in content:
|
371 |
-
if 'C1CCCCC1[C@@H]' in content or 'C1CCCCC1[C@H]' in content:
|
372 |
-
return 'CHG', mods # Cyclohexylglycine
|
373 |
-
elif 'C1CCCCC1C[C@@H]' in content or 'C1CCCCC1C[C@H]' in content:
|
374 |
-
return 'ALC', mods # 3-cyclohexyl-alanine
|
375 |
-
|
376 |
-
# Naphthalene derivatives
|
377 |
-
if 'c1cccc2c1cccc2' in content:
|
378 |
-
if 'c1cccc2c1cccc2[C@@H]' in content or 'c1cccc2c1cccc2[C@H]' in content:
|
379 |
-
return 'NAL', mods # 2-Naphthyl-alanine
|
380 |
-
|
381 |
-
# Heteroaromatic derivatives
|
382 |
-
if 'c1cncc' in content:
|
383 |
-
return 'PYR4', mods # 3-(4-Pyridyl)-alanine
|
384 |
-
if 'c1cscc' in content:
|
385 |
-
return 'THA3', mods # 3-(3-thienyl)-alanine
|
386 |
-
if 'c1nnc' in content:
|
387 |
-
return 'TRZ4', mods # 3-(1,2,4-Triazol-1-yl)-alanine
|
388 |
-
|
389 |
-
# Modified serines and threonines
|
390 |
-
if 'OP(O)(O)O' in content:
|
391 |
-
if '[C@@H](COP' in content or '[C@H](COP' in content:
|
392 |
-
return 'SEP', mods # phosphoserine
|
393 |
-
elif '[C@@H](OP' in content or '[C@H](OP' in content:
|
394 |
-
return 'TPO', mods # phosphothreonine
|
395 |
-
|
396 |
-
# Specialized ring systems
|
397 |
-
if 'c1c2ccccc2cc2c1cccc2' in content:
|
398 |
-
return 'ANTH', mods # 3-(9-anthryl)-alanine
|
399 |
-
if 'c1csc2c1cccc2' in content:
|
400 |
-
return 'BTH3', mods # 3-(3-benzothienyl)-alanine
|
401 |
-
if '[C@]12C[C@H]3C[C@@H](C2)C[C@@H](C1)C3' in content:
|
402 |
-
return 'ADAM', mods # Adamanthane
|
403 |
-
|
404 |
-
# Fluorinated derivatives
|
405 |
-
if 'FC(F)(F)' in content:
|
406 |
-
if 'CC(F)(F)F' in content:
|
407 |
-
return 'FLA', mods # Trifluoro-alanine
|
408 |
-
if 'C(F)(F)F)c1' in content:
|
409 |
-
if 'c1ccccc1C(F)(F)F' in content:
|
410 |
-
return 'TFG2', mods # 2-(Trifluoromethyl)-phenylglycine
|
411 |
-
if 'c1cccc(c1)C(F)(F)F' in content:
|
412 |
-
return 'TFG3', mods # 3-(Trifluoromethyl)-phenylglycine
|
413 |
-
if 'c1ccc(cc1)C(F)(F)F' in content:
|
414 |
-
return 'TFG4', mods # 4-(Trifluoromethyl)-phenylglycine
|
415 |
-
|
416 |
-
# Multiple halogen patterns
|
417 |
-
if 'F' in content and 'c1' in content:
|
418 |
-
if 'c1ccc(c(c1)F)F' in content:
|
419 |
-
return 'F2F', mods # 3,4-Difluoro-phenylalanine
|
420 |
-
if 'cc(F)cc(c1)F' in content:
|
421 |
-
return 'WFP', mods # 3,5-Difluoro-phenylalanine
|
422 |
-
if 'Cl' in content and 'c1' in content:
|
423 |
-
if 'c1ccc(cc1Cl)Cl' in content:
|
424 |
-
return 'CP24', mods # 2,4-dichloro-phenylalanine
|
425 |
-
if 'c1ccc(c(c1)Cl)Cl' in content:
|
426 |
-
return 'CP34', mods # 3,4-dichloro-phenylalanine
|
427 |
-
|
428 |
-
# Hydroxy and amino derivatives
|
429 |
-
if 'O' in content and 'c1' in content:
|
430 |
-
if 'c1cc(O)cc(c1)O' in content:
|
431 |
-
return '3FG', mods # (2s)-amino(3,5-dihydroxyphenyl)-ethanoic acid
|
432 |
-
if 'c1ccc(c(c1)O)O' in content:
|
433 |
-
return 'DAH', mods # 3,4-Dihydroxy-phenylalanine
|
434 |
-
|
435 |
-
# Modified histidines
|
436 |
-
if 'c1cnc' in content:
|
437 |
-
if '[C@@H]1CN[C@@H](N1)F' in content:
|
438 |
-
return '2HF', mods # 2-fluoro-l-histidine
|
439 |
-
if 'c1cnc([nH]1)F' in content:
|
440 |
-
return '2HF1', mods # 2-fluoro-l-histidine variant
|
441 |
-
if 'c1c[nH]c(n1)F' in content:
|
442 |
-
return '2HF2', mods # 2-fluoro-l-histidine variant
|
443 |
-
|
444 |
-
if '[SeH]' in content:
|
445 |
-
return 'CSE', mods # Selenocysteine
|
446 |
-
if 'S' in content:
|
447 |
-
if 'CSCc1ccccc1' in content:
|
448 |
-
return 'BCS', mods # benzylcysteine
|
449 |
-
if 'CCSC' in content:
|
450 |
-
return 'ESC', mods # Ethionine
|
451 |
-
if 'CCS' in content:
|
452 |
-
return 'HCS', mods # homocysteine
|
453 |
-
|
454 |
-
if 'CN=[N]=N' in content:
|
455 |
-
return 'AZDA', mods # azido-alanine
|
456 |
-
if '[NH]=[C](=[NH2])=[NH2]' in content:
|
457 |
-
if 'CCC[NH]=' in content:
|
458 |
-
return 'AGM', mods # 5-methyl-arginine
|
459 |
-
if 'CC[NH]=' in content:
|
460 |
-
return 'GDPR', mods # 2-Amino-3-guanidinopropionic acid
|
461 |
-
|
462 |
-
# Others
|
463 |
-
if 'C1CCCC1' in content:
|
464 |
-
return 'CPA3', mods # 3-Cyclopentyl-alanine
|
465 |
-
if 'C1CCCCC1' in content:
|
466 |
-
if 'CC1CCCCC1' in content:
|
467 |
-
return 'ALC', mods # 3-cyclohexyl-alanine
|
468 |
-
else:
|
469 |
-
return 'CHG', mods # Cyclohexylglycine
|
470 |
-
|
471 |
-
if 'CCC[C@@H]' in content or 'CCC[C@H]' in content:
|
472 |
-
return 'NLE', mods # Norleucine
|
473 |
-
if 'CC[C@@H]' in content or 'CC[C@H]' in content:
|
474 |
-
if not any(x in content for x in ['CC(C)', 'COC', 'CN(']):
|
475 |
-
return 'ABA', mods # 2-Aminobutyric acid
|
476 |
-
if 'CCON' in content:
|
477 |
-
return 'CAN', mods # canaline
|
478 |
-
if '[C@@H]1C=C[C@@H](C=C1)' in content:
|
479 |
-
return 'ACZ', mods # cis-amiclenomycin
|
480 |
-
if 'CCC(=O)[NH3]' in content:
|
481 |
-
return 'ONL', mods # 5-oxo-l-norleucine
|
482 |
-
if 'c1ccncc1' in content:
|
483 |
-
return 'PYR4', mods # 3-(4-Pyridyl)-alanine
|
484 |
-
if 'c1ccco1' in content:
|
485 |
-
return 'FUA2', mods # (2-furyl)-alanine
|
486 |
-
|
487 |
-
if 'c1ccc' in content:
|
488 |
-
if 'c1ccc(cc1)c1ccccc1' in content:
|
489 |
-
return 'BIF', mods # 4,4-biphenylalanine
|
490 |
-
if 'c1ccc(cc1)C(=O)c1ccccc1' in content:
|
491 |
-
return 'PBF', mods # 4-benzoyl-phenylalanine
|
492 |
-
if 'c1ccc(cc1)C(C)(C)C' in content:
|
493 |
-
return 'TBP4', mods # 4-tert-butyl-phenylalanine
|
494 |
-
if 'c1ccc(cc1)[C](=[NH2])=[NH2]' in content:
|
495 |
-
return '0BN', mods # 4-carbamimidoyl-l-phenylalanine
|
496 |
-
if 'c1cccc(c1)[C](=[NH2])=[NH2]' in content:
|
497 |
-
return 'APM', mods # m-amidinophenyl-3-alanine
|
498 |
-
|
499 |
-
if 'O' in content:
|
500 |
-
if '[C@H]([C@H](C)O)O' in content:
|
501 |
-
return 'ILX', mods # 4,5-dihydroxy-isoleucine
|
502 |
-
if '[C@H]([C@@H](C)O)O' in content:
|
503 |
-
return 'ALO', mods # Allo-threonine
|
504 |
-
if '[C@H](COP(O)(O)O)' in content:
|
505 |
-
return 'SEP', mods # phosphoserine
|
506 |
-
if '[C@H]([C@@H](C)OP(O)(O)O)' in content:
|
507 |
-
return 'TPO', mods # phosphothreonine
|
508 |
-
if '[C@H](c1ccc(O)cc1)O' in content:
|
509 |
-
return 'OMX', mods # (betar)-beta-hydroxy-l-tyrosine
|
510 |
-
if '[C@H](c1ccc(c(Cl)c1)O)O' in content:
|
511 |
-
return 'OMY', mods # (betar)-3-chloro-beta-hydroxy-l-tyrosine
|
512 |
-
|
513 |
-
if 'n1' in content:
|
514 |
-
if 'n1cccn1' in content:
|
515 |
-
return 'PYZ1', mods # 3-(1-Pyrazolyl)-alanine
|
516 |
-
if 'n1nncn1' in content:
|
517 |
-
return 'TEZA', mods # 3-(2-Tetrazolyl)-alanine
|
518 |
-
if 'c2c(n1)cccc2' in content:
|
519 |
-
return 'QU32', mods # 3-(2-Quinolyl)-alanine
|
520 |
-
if 'c1cnc2c(c1)cccc2' in content:
|
521 |
-
return 'QU33', mods # 3-(3-quinolyl)-alanine
|
522 |
-
if 'c1ccnc2c1cccc2' in content:
|
523 |
-
return 'QU34', mods # 3-(4-quinolyl)-alanine
|
524 |
-
if 'c1ccc2c(c1)nccc2' in content:
|
525 |
-
return 'QU35', mods # 3-(5-Quinolyl)-alanine
|
526 |
-
if 'c1ccc2c(c1)cncc2' in content:
|
527 |
-
return 'QU36', mods # 3-(6-Quinolyl)-alanine
|
528 |
-
if 'c1cnc2c(n1)cccc2' in content:
|
529 |
-
return 'QX32', mods # 3-(2-quinoxalyl)-alanine
|
530 |
-
|
531 |
-
if 'N' in content:
|
532 |
-
if '[NH3]CC[C@@H]' in content:
|
533 |
-
return 'DAB', mods # Diaminobutyric acid
|
534 |
-
if '[NH3]C[C@@H]' in content:
|
535 |
-
return 'DPP', mods # 2,3-Diaminopropanoic acid
|
536 |
-
if '[NH3]CCCCCC[C@@H]' in content:
|
537 |
-
return 'HHK', mods # (2s)-2,8-diaminooctanoic acid
|
538 |
-
if 'CCC[NH]=[C](=[NH2])=[NH2]' in content:
|
539 |
-
return 'GBUT', mods # 2-Amino-4-guanidinobutryric acid
|
540 |
-
if '[NH]=[C](=S)=[NH2]' in content:
|
541 |
-
return 'THIC', mods # Thio-citrulline
|
542 |
-
|
543 |
-
if 'CC' in content:
|
544 |
-
if 'CCCC[C@@H]' in content:
|
545 |
-
return 'AHP', mods # 2-Aminoheptanoic acid
|
546 |
-
if 'CCC([C@@H])(C)C' in content:
|
547 |
-
return 'I2M', mods # 3-methyl-l-alloisoleucine
|
548 |
-
if 'CC[C@H]([C@@H])C' in content:
|
549 |
-
return 'IIL', mods # Allo-Isoleucine
|
550 |
-
if '[C@H](CCC(C)C)' in content:
|
551 |
-
return 'HLEU', mods # Homoleucine
|
552 |
-
if '[C@@H]([C@@H](C)O)C' in content:
|
553 |
-
return 'HLU', mods # beta-hydroxyleucine
|
554 |
-
|
555 |
-
if '[C@@H]' in content:
|
556 |
-
if '[C@@H](C[C@@H](F))' in content:
|
557 |
-
return 'FGA4', mods # 4-Fluoro-glutamic acid
|
558 |
-
if '[C@@H](C[C@@H](O))' in content:
|
559 |
-
return '3GL', mods # 4-hydroxy-glutamic-acid
|
560 |
-
if '[C@@H](C[C@H](C))' in content:
|
561 |
-
return 'LME', mods # (3r)-3-methyl-l-glutamic acid
|
562 |
-
if '[C@@H](CC[C@H](C))' in content:
|
563 |
-
return 'MEG', mods # (3s)-3-methyl-l-glutamic acid
|
564 |
-
|
565 |
-
if 'S' in content:
|
566 |
-
if 'SCC[C@@H]' in content:
|
567 |
-
return 'HSER', mods # homoserine
|
568 |
-
if 'SCCN' in content:
|
569 |
-
return 'SLZ', mods # thialysine
|
570 |
-
if 'SC(=O)' in content:
|
571 |
-
return 'CSA', mods # s-acetonylcysteine
|
572 |
-
if '[S@@](=O)' in content:
|
573 |
-
return 'SME', mods # Methionine sulfoxide
|
574 |
-
if 'S(=O)(=O)' in content:
|
575 |
-
return 'OMT', mods # Methionine sulfone
|
576 |
-
|
577 |
-
if 'C=' in content:
|
578 |
-
if 'C=C[C@@H]' in content:
|
579 |
-
return '2AG', mods # 2-Allyl-glycine
|
580 |
-
if 'C=C[C@@H]' in content:
|
581 |
-
return 'LVG', mods # vinylglycine
|
582 |
-
if 'C=Cc1ccccc1' in content:
|
583 |
-
return 'STYA', mods # Styrylalanine
|
584 |
-
|
585 |
-
if '[C@@H]1Cc2c(C1)cccc2' in content:
|
586 |
-
return 'IGL', mods # alpha-amino-2-indanacetic acid
|
587 |
-
if '[C](=[C](=O)=O)=O' in content:
|
588 |
-
return '26P', mods # 2-amino-6-oxopimelic acid
|
589 |
-
if '[C](=[C](=O)=O)=C' in content:
|
590 |
-
return '2NP', mods # l-2-amino-6-methylene-pimelic acid
|
591 |
-
if 'c1cccc2c1cc(O)cc2' in content:
|
592 |
-
return 'NAO1', mods # 5-hydroxy-1-naphthalene
|
593 |
-
if 'c1ccc2c(c1)cc(O)cc2' in content:
|
594 |
-
return 'NAO2', mods # 6-hydroxy-2-naphthalene
|
595 |
return None, mods
|
596 |
|
597 |
def get_modifications(self, segment):
|
598 |
-
"""Get modifications based on bond types"""
|
599 |
mods = []
|
600 |
-
|
601 |
-
|
602 |
-
|
603 |
-
|
604 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
605 |
return mods
|
606 |
|
607 |
def analyze_structure(self, smiles):
|
608 |
-
"""Main analysis function with
|
609 |
print("\nAnalyzing structure:", smiles)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
610 |
|
611 |
-
#
|
612 |
-
|
|
|
|
|
613 |
|
614 |
print("\nSegment Analysis:")
|
615 |
sequence = []
|
616 |
for i, segment in enumerate(segments):
|
617 |
print(f"\nSegment {i}:")
|
618 |
-
print(f"Content: {segment
|
619 |
print(f"Bond before: {segment.get('bond_before', 'None')}")
|
620 |
print(f"Bond after: {segment.get('bond_after', 'None')}")
|
621 |
|
@@ -628,11 +548,10 @@ class PeptideAnalyzer:
|
|
628 |
print(f"Identified as: {residue}")
|
629 |
print(f"Modifications: {mods}")
|
630 |
else:
|
631 |
-
print(f"Warning: Could not identify residue in segment: {segment
|
632 |
|
633 |
-
# Check if cyclic
|
634 |
-
is_cyclic, peptide_cycles, aromatic_cycles = self.is_cyclic(smiles)
|
635 |
three_letter = '-'.join(sequence)
|
|
|
636 |
one_letter = ''.join(self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence)
|
637 |
|
638 |
if is_cyclic:
|
@@ -642,13 +561,14 @@ class PeptideAnalyzer:
|
|
642 |
print(f"\nFinal sequence: {three_letter}")
|
643 |
print(f"One-letter code: {one_letter}")
|
644 |
print(f"Is cyclic: {is_cyclic}")
|
645 |
-
|
646 |
-
|
647 |
|
648 |
return {
|
649 |
'three_letter': three_letter,
|
650 |
'one_letter': one_letter,
|
651 |
-
'is_cyclic': is_cyclic
|
|
|
652 |
}
|
653 |
|
654 |
def annotate_cyclic_structure(mol, sequence):
|
|
|
17 |
class PeptideAnalyzer:
|
18 |
def __init__(self):
    """Set up the regex tables and residue-code mapping used by the analyzer.

    Attributes created:
        bond_patterns: ordered ``(regex, bond_type)`` pairs used to locate
            backbone bonds in a SMILES string.
        complex_residue_patterns: ordered ``(regex, residue_code)`` pairs
            for residues that are matched as a whole before bond splitting.
        three_to_one: mapping from residue code to its one-letter symbol.
    """
    # Ester-bond splitting (OC(=O) -> 'ester') is intentionally not part of
    # this table; the order of the remaining entries is significant because
    # the table is consumed sequentially.
    backbone_bonds = [
        (r'N\(C\)C\(=O\)', 'n_methyl'),          # N-methylated peptide bond
        (r'N[0-9]C\(=O\)', 'proline'),           # Proline peptide bond
        (r'NC\(=O\)', 'peptide'),                # Standard peptide bond
        (r'C\(=O\)N\(C\)', 'n_methyl_reverse'),  # Reverse N-methylated
        (r'C\(=O\)N[12]?', 'peptide_reverse'),   # Reverse peptide bond
    ]

    # Kpg - Lys(palmitoyl-Glu-OtBu)
    kpg_patterns = [
        (r'\[C[@]H\]\(CCCNC\(=O\)CCC\[C@@H\]\(NC\(=O\)CCCCCCCCCCCCCCCC\)C\(=O\)OC\(C\)\(C\)C\)', 'Kpg'),
        (r'CCCCCCCCCCCCCCCCC\(=O\)N\[C@H\]\(CCCC\(=O\)NCCC\[C@@H\]', 'Kpg'),
    ]
    # Cmt - full side chain, then core / start / end fragments of the
    # Cys-Mmt group as they appear in cyclic peptides.
    cmt_patterns = [
        (r'\[C[@]?H\]\(CSC\(c\d+ccccc\d+\)\(c\d+ccccc\d+\)c\d+ccc\(OC\)cc\d+\)', 'Cmt'),
        (r'CSC\(c.*?c.*?OC\)', 'Cmt'),
        (r'COc.*?ccc\(C\(SC', 'Cmt'),
        (r'c2ccccc2\)c2ccccc2\)cc', 'Cmt'),
    ]
    # Eal - Glu(OAll)
    eal_patterns = [
        (r'C=CCOC\(=O\)CC\[C@@H\]', 'Eal'),
    ]
    # Dtg - Asp(OtBu)-(Dmb)Gly
    dtg_patterns = [
        (r'CN\(Cc\d+ccc\(OC\)cc\d+OC\)C\(=O\)\[C@H\]\(CC\(=O\)OC\(C\)\(C\)C\)', 'Dtg'),
        (r'C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
        (r'N\[C@@H\]\(CC\(=O\)OC\(C\)\(C\)C\)C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
    ]

    # Three to one letter code mapping: capitalised codes first ...
    upper_codes = {
        'Ala': 'A', 'Cys': 'C', 'Asp': 'D', 'Glu': 'E',
        'Phe': 'F', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
        'Lys': 'K', 'Leu': 'L', 'Met': 'M', 'Asn': 'N',
        'Pro': 'P', 'Gln': 'Q', 'Arg': 'R', 'Ser': 'S',
        'Thr': 'T', 'Val': 'V', 'Trp': 'W', 'Tyr': 'Y',
    }
    # ... then the all-lower-case spelling of each of them, mapped to the
    # lower-case letter ...
    lower_codes = {name.lower(): letter.lower() for name, letter in upper_codes.items()}
    # ... and finally the non-standard residues with their dedicated symbols.
    special_codes = {
        'Cmt-cyclic': 'Ĉ',
        'Aib': 'Ŷ', 'Dtg': 'Ĝ', 'Cmt': 'Ĉ', 'Eal': 'Ė', 'Nml': "Ŀ", 'Nma': 'Ṃ',
        'Kpg': 'Ƙ', 'Tpb': 'Ṯ', 'Cyl': 'Ċ', 'Nle': 'Ł', 'Hph': 'Ĥ',
        'Cys-Cys': 'CC', 'cys-cys': 'cc',
    }

    self.bond_patterns = backbone_bonds
    self.complex_residue_patterns = kpg_patterns + cmt_patterns + eal_patterns + dtg_patterns
    self.three_to_one = {**upper_codes, **lower_codes, **special_codes}
|
59 |
+
|
60 |
+
def preprocess_complex_residues(self, smiles):
    """Replace complex-residue substrings of ``smiles`` with placeholders.

    Every regex in ``self.complex_residue_patterns`` is searched in table
    order. A match is kept only if it does not intersect a match that was
    already kept, so earlier table entries take priority. Kept matches that
    contain a ``[C@H]`` or ``[C@@H]`` atom are then replaced, left to right,
    by ``COMPLEX_RESIDUE_<k>`` tokens; kept matches without such an atom are
    left in place.

    Args:
        smiles: SMILES string to scan.

    Returns:
        A ``(preprocessed_smiles, protected_residues)`` tuple.
        ``protected_residues`` is a list of dicts with the keys
        ``'placeholder'``, ``'type'`` (the residue code from the pattern
        table) and ``'content'`` (the original substring), ordered by
        position. When nothing matches, ``smiles`` is returned unchanged
        together with an empty list.
    """
    complex_positions = []

    for pattern, residue_type in self.complex_residue_patterns:
        for match in re.finditer(pattern, smiles):
            start, end = match.span()
            # Proper interval-intersection test: this also rejects a match
            # that completely encloses an already accepted one, which a
            # check of only the new match's end points would let through.
            overlaps = any(
                start < pos['end'] and pos['start'] < end
                for pos in complex_positions
            )
            if overlaps:
                continue
            complex_positions.append({
                'start': start,
                'end': end,
                'type': residue_type,
                'pattern': match.group(),
            })

    if not complex_positions:
        return smiles, []

    # Process matches left to right so the output can be assembled in one pass.
    complex_positions.sort(key=lambda pos: pos['start'])

    pieces = []
    protected_residues = []
    cursor = 0  # index in the ORIGINAL string up to which output was emitted

    for pos in complex_positions:
        complex_part = pos['pattern']

        # Only substrings carrying a stereo-annotated carbon are protected.
        if '[C@H]' not in complex_part and '[C@@H]' not in complex_part:
            continue

        placeholder = f"COMPLEX_RESIDUE_{len(protected_residues)}"

        # Slicing the original string (rather than patching a running copy
        # with an offset) keeps the coordinates valid for every match.
        pieces.append(smiles[cursor:pos['start']])
        pieces.append(placeholder)
        cursor = pos['end']

        protected_residues.append({
            'placeholder': placeholder,
            'type': pos['type'],
            'content': complex_part,
        })

    pieces.append(smiles[cursor:])
    return ''.join(pieces), protected_residues
|
111 |
+
|
112 |
def is_peptide(self, smiles):
|
113 |
"""Check if the SMILES represents a peptide structure"""
|
114 |
mol = Chem.MolFromSmiles(smiles)
|
|
|
149 |
is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
|
150 |
return is_cyclic, peptide_cycles, aromatic_cycles
|
151 |
|
152 |
+
def split_on_bonds(self, smiles, protected_residues=None):
|
153 |
+
"""Split SMILES into segments based on peptide bonds, with improved handling of protected residues"""
|
154 |
positions = []
|
155 |
used = set()
|
156 |
+
|
157 |
+
# First, handle protected complex residues if any
|
158 |
+
if protected_residues:
|
159 |
+
for residue in protected_residues:
|
160 |
+
match = re.search(residue['placeholder'], smiles)
|
161 |
+
if match:
|
162 |
+
positions.append({
|
163 |
+
'start': match.start(),
|
164 |
+
'end': match.end(),
|
165 |
+
'type': 'complex',
|
166 |
+
'pattern': residue['placeholder'],
|
167 |
+
'residue_type': residue['type'],
|
168 |
+
'content': residue['content']
|
169 |
+
})
|
170 |
+
used.update(range(match.start(), match.end()))
|
171 |
+
|
172 |
+
# Find all peptide bonds
|
173 |
+
bond_positions = []
|
174 |
+
|
175 |
# Find Gly pattern first
|
176 |
gly_pattern = r'NCC\(=O\)'
|
177 |
for match in re.finditer(gly_pattern, smiles):
|
178 |
if not any(p in range(match.start(), match.end()) for p in used):
|
179 |
+
bond_positions.append({
|
180 |
'start': match.start(),
|
181 |
'end': match.end(),
|
182 |
'type': 'gly',
|
|
|
187 |
for pattern, bond_type in self.bond_patterns:
|
188 |
for match in re.finditer(pattern, smiles):
|
189 |
if not any(p in range(match.start(), match.end()) for p in used):
|
190 |
+
bond_positions.append({
|
191 |
'start': match.start(),
|
192 |
'end': match.end(),
|
193 |
'type': bond_type,
|
194 |
'pattern': match.group()
|
195 |
})
|
196 |
used.update(range(match.start(), match.end()))
|
197 |
+
|
198 |
+
bond_positions.sort(key=lambda x: x['start'])
|
199 |
+
|
200 |
+
# Combine complex residue positions and bond positions
|
201 |
+
all_positions = positions + bond_positions
|
202 |
+
all_positions.sort(key=lambda x: x['start'])
|
203 |
|
204 |
# Create segments
|
205 |
segments = []
|
206 |
+
|
207 |
+
if all_positions and all_positions[0]['start'] > 0:
|
208 |
+
segments.append({
|
209 |
+
'content': smiles[0:all_positions[0]['start']],
|
210 |
+
'bond_after': all_positions[0]['pattern'] if all_positions[0]['type'] != 'complex' else None,
|
211 |
+
'complex_after': all_positions[0]['pattern'] if all_positions[0]['type'] == 'complex' else None
|
212 |
+
})
|
213 |
+
|
214 |
+
for i in range(len(all_positions)-1):
|
215 |
+
current = all_positions[i]
|
216 |
+
next_pos = all_positions[i+1]
|
217 |
+
|
218 |
+
if current['type'] == 'complex':
|
219 |
segments.append({
|
220 |
+
'content': current['content'],
|
221 |
+
'bond_before': all_positions[i-1]['pattern'] if i > 0 and all_positions[i-1]['type'] != 'complex' else None,
|
222 |
+
'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None,
|
223 |
+
'complex_type': current['residue_type']
|
224 |
})
|
225 |
+
elif current['type'] == 'gly':
|
226 |
+
segments.append({
|
227 |
+
'content': 'NCC(=O)',
|
228 |
+
'bond_before': all_positions[i-1]['pattern'] if i > 0 and all_positions[i-1]['type'] != 'complex' else None,
|
229 |
+
'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None
|
230 |
+
})
|
231 |
+
else:
|
232 |
+
# Only create segment if there's content between this bond and next position
|
233 |
+
content = smiles[current['end']:next_pos['start']]
|
234 |
+
if content and next_pos['type'] != 'complex':
|
235 |
segments.append({
|
236 |
+
'content': content,
|
237 |
+
'bond_before': current['pattern'],
|
238 |
+
'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None
|
239 |
})
|
240 |
+
|
241 |
+
if all_positions and all_positions[-1]['end'] < len(smiles):
|
242 |
+
if all_positions[-1]['type'] == 'complex':
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
243 |
segments.append({
|
244 |
+
'content': all_positions[-1]['content'],
|
245 |
+
'bond_before': all_positions[-2]['pattern'] if len(all_positions) > 1 and all_positions[-2]['type'] != 'complex' else None,
|
246 |
+
'complex_type': all_positions[-1]['residue_type']
|
247 |
})
|
248 |
+
else:
|
249 |
+
segments.append({
|
250 |
+
'content': smiles[all_positions[-1]['end']:],
|
251 |
+
'bond_before': all_positions[-1]['pattern']
|
252 |
+
})
|
253 |
+
|
254 |
return segments
|
255 |
|
256 |
def clean_terminal_carboxyl(self, segment):
|
|
|
274 |
def identify_residue(self, segment):
|
275 |
"""Identify residue with Pro reconstruction"""
|
276 |
# Only clean terminal carboxyl if this is the last segment
|
277 |
+
if 'complex_type' in segment:
|
278 |
+
return segment['complex_type'], []
|
279 |
content = self.clean_terminal_carboxyl(segment)
|
280 |
mods = self.get_modifications(segment)
|
281 |
|
282 |
+
if content.startswith('COc1ccc(C(SC[C@@H]'):
|
283 |
+
print("DIRECT MATCH: Found Cmt at beginning")
|
284 |
+
return 'Cmt', mods
|
285 |
+
|
286 |
+
if '[C@@H]3CCCN3C2=O)(c2ccccc2)c2ccccc2)cc' in content:
|
287 |
+
print("DIRECT MATCH: Found Pro at end")
|
288 |
+
return 'Pro', mods
|
289 |
+
|
290 |
+
# Eal - Glu(OAll)
|
291 |
+
if 'CCC(=O)OCC=C' in content or 'CC(=O)OCC=C' in content or 'C=CCOC(=O)CC' in content:
|
292 |
+
return 'Eal', mods
|
293 |
+
|
294 |
# Proline (P) - flexible ring numbers
|
295 |
if any([
|
296 |
# Check for any ring number in bond patterns
|
|
|
298 |
any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
|
299 |
for n in '123456789'
|
300 |
]) or any([(segment.get('bond_before', '').startswith(f'C(=O)N{n}') and 'CCC' in content and
|
301 |
+
any(f'CCC{n}' for n in '123456789'))
|
302 |
+
for n in '123456789'
|
303 |
]) or any([
|
304 |
# Check ending patterns with any ring number
|
305 |
(f'CCCN{n}' in content and content.endswith('=O') and
|
|
|
316 |
]):
|
317 |
return 'Pro', mods
|
318 |
|
319 |
+
# D-Proline (p)
|
320 |
+
if ('N1[C@H](CCC1)' in content):
|
321 |
+
return 'pro', mods
|
322 |
+
|
323 |
+
# Tryptophan (W)
|
324 |
if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
|
325 |
'c[nH]c' in content.replace(' ', ''):
|
326 |
+
if '[C@H](CC' in content: # D-form
|
327 |
+
return 'trp', mods
|
328 |
return 'Trp', mods
|
329 |
|
330 |
+
# Lysine (K) - both patterns
|
331 |
if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
|
332 |
+
if '[C@H](CCCCN)' in content: # D-form
|
333 |
+
return 'lys', mods
|
334 |
return 'Lys', mods
|
335 |
+
|
336 |
+
# Arginine (R) - both patterns
|
337 |
if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
|
338 |
+
if '[C@H](CCCNC(=N)N)' in content: # D-form
|
339 |
+
return 'arg', mods
|
340 |
return 'Arg', mods
|
341 |
|
342 |
+
if content == 'C' and segment.get('bond_before') and segment.get('bond_after'):
|
343 |
+
# If it's surrounded by peptide bonds, it's almost certainly Gly
|
344 |
+
if ('C(=O)N' in segment['bond_before'] or 'NC(=O)' in segment['bond_before'] or 'N(C)C(=O)' in segment['bond_before']) and \
|
345 |
+
('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
|
|
|
346 |
return 'Gly', mods
|
347 |
+
|
348 |
+
# Leucine patterns (L/l)
|
349 |
if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content or '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content or (('N[C@H](CCC(C)C)' in content or 'N[C@@H](CCC(C)C)' in content) and segment.get('bond_before') is None):
|
350 |
+
if '[C@H](CC(C)C)' in content or 'CC(C)C[C@H]' in content: # D-form
|
351 |
+
return 'leu', mods
|
352 |
return 'Leu', mods
|
353 |
|
354 |
+
# Threonine patterns (T/t)
|
355 |
+
if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content or '[C@@H]([C@H](C)O)' in content or '[C@H]([C@@H](C)O)' in content:
|
356 |
+
# Check both stereochemistry patterns
|
357 |
+
if '[C@H]([C@@H](C)O)' in content: # D-form
|
358 |
+
return 'thr', mods
|
359 |
return 'Thr', mods
|
360 |
+
|
361 |
+
if re.search(r'\[C@H\]\(CCc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(CCc\d+ccccc\d+\)', content):
|
362 |
+
return 'Hph', mods
|
363 |
+
|
364 |
+
# Phenylalanine patterns (F/f)
|
365 |
if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(Cc\d+ccccc\d+\)', content):
|
366 |
+
if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content): # D-form
|
367 |
+
return 'phe', mods
|
368 |
return 'Phe', mods
|
369 |
|
370 |
+
if ('CC(C)[C@@H]' in content or 'CC(C)[C@H]' in content or
|
371 |
+
'[C@H](C(C)C)' in content or '[C@@H](C(C)C)' in content or
|
372 |
+
'C(C)C[C@H]' in content or 'C(C)C[C@@H]' in content):
|
|
|
|
|
|
|
|
|
373 |
|
374 |
+
# Make sure it's not leucine
|
375 |
+
if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]', 'CCC(=O)']):
|
376 |
+
if '[C@H]' in content and not '[C@@H]' in content: # D-form
|
377 |
+
return 'val', mods
|
378 |
+
return 'Val', mods
|
379 |
+
|
380 |
+
# Isoleucine patterns (I/i)
|
381 |
if any([
|
382 |
+
'CC[C@@H](C)' in content, '[C@@H](C)CC' in content,
|
|
|
383 |
'[C@@H](CC)C' in content,
|
|
|
|
|
384 |
'C(C)C[C@@H]' in content and 'CC(C)C' not in content
|
385 |
]):
|
386 |
+
if '[C@H]([C@@H](CC)C)' in content or '[C@H](CC)C' in content: # D-form
|
387 |
+
return 'ile', mods
|
388 |
+
elif '[C@H](C)CC' in content or '[C@H](CC)C' in content or 'CC[C@H](C)' in content:
|
389 |
+
return 'ile', mods
|
390 |
+
elif 'C(C)C[C@H]' in content and 'CC(C)C' not in content:
|
391 |
+
return 'ile', mods
|
392 |
return 'Ile', mods
|
393 |
|
394 |
+
# Alanine patterns (A/a)
|
395 |
if ('[C@H](C)' in content or '[C@@H](C)' in content):
|
396 |
if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
|
397 |
+
if '[C@H](C)' in content: # D-form
|
398 |
+
return 'ala', mods
|
399 |
return 'Ala', mods
|
400 |
|
401 |
+
# Tyrosine patterns (Y/y)
|
402 |
if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
|
403 |
+
if '[C@H](Cc1ccc(O)cc1)' in content: # D-form
|
404 |
+
return 'tyr', mods
|
405 |
return 'Tyr', mods
|
406 |
|
407 |
+
# Serine patterns (S/s)
|
408 |
if '[C@H](CO)' in content or '[C@@H](CO)' in content:
|
409 |
if not ('C(C)O' in content or 'COC' in content):
|
410 |
+
if '[C@H](CO)' in content: # D-form
|
411 |
+
return 'ser', mods
|
412 |
return 'Ser', mods
|
413 |
+
|
414 |
+
if 'CSSC' in content:
|
415 |
+
if re.search(r'\[C@@H\].*CSSC.*\[C@@H\]', content) or re.search(r'\[C@H\].*CSSC.*\[C@H\]', content):
|
416 |
+
if '[C@H]' in content and not '[C@@H]' in content: # D-form
|
417 |
+
return 'cys-cys', mods
|
418 |
+
return 'Cys-Cys', mods
|
419 |
+
|
420 |
+
if '[C@@H](N)CSSC' in content or '[C@H](N)CSSC' in content:
|
421 |
+
if '[C@H](N)CSSC' in content: # D-form
|
422 |
+
return 'cys-cys', mods
|
423 |
+
return 'Cys-Cys', mods
|
424 |
+
|
425 |
+
if 'CSSC[C@@H](C(=O)O)' in content or 'CSSC[C@H](C(=O)O)' in content:
|
426 |
+
if 'CSSC[C@H](C(=O)O)' in content: # D-form
|
427 |
+
return 'cys-cys', mods
|
428 |
+
return 'Cys-Cys', mods
|
429 |
+
|
430 |
+
# Cysteine patterns (C/c)
|
431 |
if '[C@H](CS)' in content or '[C@@H](CS)' in content:
|
432 |
+
if '[C@H](CS)' in content: # D-form
|
433 |
+
return 'cys', mods
|
434 |
return 'Cys', mods
|
435 |
|
436 |
+
# Methionine patterns (M/m)
|
437 |
+
if ('CCSC' in content) or ("CSCC" in content):
|
438 |
+
if '[C@H](CCSC)' in content: # D-form
|
439 |
+
return 'met', mods
|
440 |
+
elif '[C@H]' in content:
|
441 |
+
return 'met', mods
|
442 |
return 'Met', mods
|
443 |
+
|
444 |
+
# Glutamine patterns (Q/q)
|
445 |
if (content == '[C@@H](CC' or content == '[C@H](CC' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CCC(=O)N' in content) or ('CCC(N)=O' in content):
|
446 |
+
if '[C@H](CCC(=O)N)' in content: # D-form
|
447 |
+
return 'gln', mods
|
448 |
return 'Gln', mods
|
449 |
+
|
450 |
+
# Asparagine patterns (N/n)
|
451 |
if (content == '[C@@H](C' or content == '[C@H](C' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CC(=O)N' in content) or ('CCN(=O)' in content) or ('CC(N)=O' in content):
|
452 |
+
if '[C@H](CC(=O)N)' in content: # D-form
|
453 |
+
return 'asn', mods
|
454 |
return 'Asn', mods
|
455 |
|
456 |
+
# Glutamic acid patterns (E/e)
|
457 |
if ('CCC(=O)O' in content):
|
458 |
+
if '[C@H](CCC(=O)O)' in content: # D-form
|
459 |
+
return 'glu', mods
|
460 |
+
return 'Glu', mods
|
461 |
+
|
462 |
+
# Aspartic acid patterns (D/d)
|
463 |
if ('CC(=O)O' in content):
|
464 |
+
if '[C@H](CC(=O)O)' in content: # D-form
|
465 |
+
return 'asp', mods
|
466 |
return 'Asp', mods
|
467 |
|
|
|
|
|
|
|
|
|
|
|
468 |
if re.search(r'Cc\d+c\[nH\]cn\d+', content) or re.search(r'Cc\d+cnc\[nH\]\d+', content):
|
469 |
+
if '[C@H]' in content: # D-form
|
470 |
+
return 'his', mods
|
471 |
return 'His', mods
|
472 |
+
if 'C2(CCCC2)' in content or 'C1(CCCC1)' in content or re.search(r'C\d+\(CCCC\d+\)', content):
|
473 |
+
return 'Cyl', mods
|
474 |
|
475 |
+
if ('N[C@@H](CCCC)' in content or '[C@@H](CCCC)' in content or 'CCCC[C@@H]' in content or
|
476 |
+
'N[C@H](CCCC)' in content or '[C@H](CCCC)' in content) and 'CC(C)' not in content:
|
477 |
+
return 'Nle', mods
|
478 |
+
# Aib - alpha-aminoisobutyric acid (2-aminoisobutyric acid)
|
479 |
+
if 'C(C)(C)(N)' in content or 'C(C)(C)' in content or 'C(C)(C)' in content and ('C(=O)N' in segment['bond_before'] or 'NC(=O)' in segment['bond_before'] or 'N(C)C(=O)' in segment['bond_before']) and \
|
480 |
+
('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
|
481 |
+
return 'Aib', mods
|
482 |
|
483 |
+
# Dtg - Asp(OtBu)-(Dmb)Gly
|
484 |
+
if 'CC(=O)OC(C)(C)C' in content and 'CC1=C(C=C(C=C1)OC)OC' in content:
|
485 |
+
return 'Dtg', mods
|
486 |
+
|
487 |
+
|
488 |
+
# Kpg - Lys(palmitoyl-Glu-OtBu)
|
489 |
+
if 'CCCNC(=O)' in content and 'CCCCCCCCCCCC' in content:
|
490 |
+
return 'Kpg', mods
|
491 |
+
|
492 |
+
# Tpb - Thr(PO(OBzl)OH)
|
493 |
+
if re.search(r'\[C[@]?H\]\(C\)OP\(=O\)\(O\)', content) or 'OP(=O)(O)OCC' in content:
|
494 |
+
return 'Tpb', mods
|
495 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
496 |
return None, mods
|
497 |
|
498 |
def get_modifications(self, segment):
    """Return the modification tags detected for one backbone segment.

    Only N-methylation is currently recognised. It is reported at most
    once as ``'N-Me'``, no matter how many of the patterns below match.

    Args:
        segment: Mapping describing one segment. The keys read here are
            ``'content'`` (SMILES fragment text) and ``'bond_after'``
            (pattern text of the bond that follows the segment). Either
            key may be missing or hold ``None``.

    Returns:
        ``['N-Me']`` when an N-methyl pattern is found, otherwise ``[]``.
    """
    # Missing or None values are treated as empty text so that a segment
    # without 'content' no longer raises KeyError/TypeError.
    bond_after = segment.get('bond_after') or ''
    content = segment.get('content') or ''

    # 'N(C)' in the following bond also covers a bond that starts with
    # 'C(=O)N(C)', and the substring tests on the content also cover
    # fragments that merely end with these patterns.
    n_methylated = (
        'N(C)' in bond_after
        or 'N(C)C(=O)' in content
        or 'N(C)C1=O' in content
    )
    return ['N-Me'] if n_methylated else []
|
515 |
|
516 |
def analyze_structure(self, smiles):
|
517 |
+
"""Main analysis function with preprocessing for complex residues"""
|
518 |
print("\nAnalyzing structure:", smiles)
|
519 |
+
|
520 |
+
# Pre-process to identify complex residues first
|
521 |
+
preprocessed_smiles, protected_residues = self.preprocess_complex_residues(smiles)
|
522 |
+
|
523 |
+
if protected_residues:
|
524 |
+
print(f"Identified {len(protected_residues)} complex residues during pre-processing")
|
525 |
+
for i, residue in enumerate(protected_residues):
|
526 |
+
print(f"Complex residue {i+1}: {residue['type']}")
|
527 |
+
|
528 |
|
529 |
+
# Check if it's cyclic
|
530 |
+
is_cyclic, peptide_cycles, aromatic_cycles = self.is_cyclic(smiles)
|
531 |
+
|
532 |
+
segments = self.split_on_bonds(preprocessed_smiles, protected_residues)
|
533 |
|
534 |
print("\nSegment Analysis:")
|
535 |
sequence = []
|
536 |
for i, segment in enumerate(segments):
|
537 |
print(f"\nSegment {i}:")
|
538 |
+
print(f"Content: {segment.get('content', 'None')}")
|
539 |
print(f"Bond before: {segment.get('bond_before', 'None')}")
|
540 |
print(f"Bond after: {segment.get('bond_after', 'None')}")
|
541 |
|
|
|
548 |
print(f"Identified as: {residue}")
|
549 |
print(f"Modifications: {mods}")
|
550 |
else:
|
551 |
+
print(f"Warning: Could not identify residue in segment: {segment.get('content', 'None')}")
|
552 |
|
|
|
|
|
553 |
three_letter = '-'.join(sequence)
|
554 |
+
|
555 |
one_letter = ''.join(self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence)
|
556 |
|
557 |
if is_cyclic:
|
|
|
561 |
print(f"\nFinal sequence: {three_letter}")
|
562 |
print(f"One-letter code: {one_letter}")
|
563 |
print(f"Is cyclic: {is_cyclic}")
|
564 |
+
print(f"Peptide cycles: {peptide_cycles}")
|
565 |
+
print(f"Aromatic cycles: {aromatic_cycles}")
|
566 |
|
567 |
return {
|
568 |
'three_letter': three_letter,
|
569 |
'one_letter': one_letter,
|
570 |
+
'is_cyclic': is_cyclic,
|
571 |
+
'residues': sequence
|
572 |
}
|
573 |
|
574 |
def annotate_cyclic_structure(mol, sequence):
|