yinuozhang committed on
Commit
0095ae9
·
1 Parent(s): b3dd269

synthesizable

Browse files
Files changed (1) hide show
  1. app.py +325 -405
app.py CHANGED
@@ -17,22 +17,98 @@ from rdkit import Chem
17
  class PeptideAnalyzer:
18
  def __init__(self):
19
  self.bond_patterns = [
20
- (r'OC\(=O\)', 'ester'), # Ester bond
21
  (r'N\(C\)C\(=O\)', 'n_methyl'), # N-methylated peptide bond
22
  (r'N[0-9]C\(=O\)', 'proline'), # Proline peptide bond
23
  (r'NC\(=O\)', 'peptide'), # Standard peptide bond
24
  (r'C\(=O\)N\(C\)', 'n_methyl_reverse'), # Reverse N-methylated
25
  (r'C\(=O\)N[12]?', 'peptide_reverse') # Reverse peptide bond
26
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  # Three to one letter code mapping
28
  self.three_to_one = {
29
  'Ala': 'A', 'Cys': 'C', 'Asp': 'D', 'Glu': 'E',
30
  'Phe': 'F', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
31
  'Lys': 'K', 'Leu': 'L', 'Met': 'M', 'Asn': 'N',
32
  'Pro': 'P', 'Gln': 'Q', 'Arg': 'R', 'Ser': 'S',
33
- 'Thr': 'T', 'Val': 'V', 'Trp': 'W', 'Tyr': 'Y'
 
 
 
 
 
 
 
34
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
 
 
 
 
36
  def is_peptide(self, smiles):
37
  """Check if the SMILES represents a peptide structure"""
38
  mol = Chem.MolFromSmiles(smiles)
@@ -73,14 +149,34 @@ class PeptideAnalyzer:
73
  is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
74
  return is_cyclic, peptide_cycles, aromatic_cycles
75
 
76
- def split_on_bonds(self, smiles):
 
77
  positions = []
78
  used = set()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  # Find Gly pattern first
80
  gly_pattern = r'NCC\(=O\)'
81
  for match in re.finditer(gly_pattern, smiles):
82
  if not any(p in range(match.start(), match.end()) for p in used):
83
- positions.append({
84
  'start': match.start(),
85
  'end': match.end(),
86
  'type': 'gly',
@@ -91,56 +187,70 @@ class PeptideAnalyzer:
91
  for pattern, bond_type in self.bond_patterns:
92
  for match in re.finditer(pattern, smiles):
93
  if not any(p in range(match.start(), match.end()) for p in used):
94
- positions.append({
95
  'start': match.start(),
96
  'end': match.end(),
97
  'type': bond_type,
98
  'pattern': match.group()
99
  })
100
  used.update(range(match.start(), match.end()))
101
-
102
- # Sort by position
103
- positions.sort(key=lambda x: x['start'])
 
 
 
104
 
105
  # Create segments
106
  segments = []
107
- if positions:
108
- # First segment
109
- if positions[0]['start'] > 0:
 
 
 
 
 
 
 
 
 
 
110
  segments.append({
111
- 'content': smiles[0:positions[0]['start']],
112
- 'bond_after': positions[0]['pattern']
 
 
113
  })
114
- # Process segments
115
- for i in range(len(positions)-1):
116
- current = positions[i]
117
- next_pos = positions[i+1]
118
- if current['type'] == 'gly':
119
- segments.append({
120
- 'content': 'NCC(=O)',
121
- 'bond_before': positions[i-1]['pattern'] if i > 0 else None,
122
- 'bond_after': next_pos['pattern']
123
- })
124
  segments.append({
125
- 'content': smiles[current['start']+7:next_pos['start']],
126
- 'bond_before': 'gly_bond',
127
- 'bond_after': next_pos['pattern']
128
  })
129
- else:
130
- content = smiles[current['end']:next_pos['start']]
131
- if content:
132
- segments.append({
133
- 'content': content,
134
- 'bond_before': current['pattern'],
135
- 'bond_after': next_pos['pattern']
136
- })
137
-
138
- # Last segment
139
- if positions[-1]['end'] < len(smiles):
140
  segments.append({
141
- 'content': smiles[positions[-1]['end']:],
142
- 'bond_before': positions[-1]['pattern']
 
143
  })
 
 
 
 
 
 
144
  return segments
145
 
146
  def clean_terminal_carboxyl(self, segment):
@@ -164,9 +274,23 @@ class PeptideAnalyzer:
164
  def identify_residue(self, segment):
165
  """Identify residue with Pro reconstruction"""
166
  # Only clean terminal carboxyl if this is the last segment
 
 
167
  content = self.clean_terminal_carboxyl(segment)
168
  mods = self.get_modifications(segment)
169
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  # Proline (P) - flexible ring numbers
171
  if any([
172
  # Check for any ring number in bond patterns
@@ -174,8 +298,8 @@ class PeptideAnalyzer:
174
  any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
175
  for n in '123456789'
176
  ]) or any([(segment.get('bond_before', '').startswith(f'C(=O)N{n}') and 'CCC' in content and
177
- any(f'CCC{n}' for n in '123456789'))
178
- for n in '123456789'
179
  ]) or any([
180
  # Check ending patterns with any ring number
181
  (f'CCCN{n}' in content and content.endswith('=O') and
@@ -192,430 +316,226 @@ class PeptideAnalyzer:
192
  ]):
193
  return 'Pro', mods
194
 
195
- # Tryptophan (W) - more specific indole pattern
 
 
 
 
196
  if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
197
  'c[nH]c' in content.replace(' ', ''):
 
 
198
  return 'Trp', mods
199
 
200
- # Lysine (K)
201
  if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
 
 
202
  return 'Lys', mods
203
-
204
- # Arginine (R)
205
  if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
 
 
206
  return 'Arg', mods
207
 
208
- if ('NCC(=O)' in content) or (content == 'C'):
209
- if segment.get('bond_before') and segment.get('bond_after'):
210
- if ('C(=O)N' in segment['bond_before'] or 'C(=O)N(C)' in segment['bond_before']):
211
- return 'Gly', mods
212
- elif segment.get('bond_before') and segment.get('bond_before').startswith('C(=O)N'):
213
  return 'Gly', mods
214
-
 
215
  if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content or '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content or (('N[C@H](CCC(C)C)' in content or 'N[C@@H](CCC(C)C)' in content) and segment.get('bond_before') is None):
 
 
216
  return 'Leu', mods
217
 
218
- if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content:
 
 
 
 
219
  return 'Thr', mods
 
 
 
 
 
220
  if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(Cc\d+ccccc\d+\)', content):
 
 
221
  return 'Phe', mods
222
 
223
- if ('[C@H](C(C)C)' in content or
224
- '[C@@H](C(C)C)' in content or
225
- '[C@H]C(C)C' in content or
226
- '[C@@H]C(C)C' in content
227
- ):
228
- if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]']): # Still check not Leu
229
- return 'Val', mods
230
 
 
 
 
 
 
 
 
231
  if any([
232
- 'CC[C@H](C)' in content,
233
- 'CC[C@@H](C)' in content,
234
  '[C@@H](CC)C' in content,
235
- '[C@H](CC)C' in content,
236
- 'C(C)C[C@H]' in content and 'CC(C)C' not in content,
237
  'C(C)C[C@@H]' in content and 'CC(C)C' not in content
238
  ]):
 
 
 
 
 
 
239
  return 'Ile', mods
240
 
 
241
  if ('[C@H](C)' in content or '[C@@H](C)' in content):
242
  if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
 
 
243
  return 'Ala', mods
244
 
245
- # Tyrosine (Tyr) - 4-hydroxybenzyl side chain
246
  if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
 
 
247
  return 'Tyr', mods
248
 
249
- # Serine (Ser) - Hydroxymethyl side chain
250
  if '[C@H](CO)' in content or '[C@@H](CO)' in content:
251
  if not ('C(C)O' in content or 'COC' in content):
 
 
252
  return 'Ser', mods
253
-
254
- # Threonine (Thr) - 1-hydroxyethyl side chain
255
- if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content or '[C@@H](C)O' in content or '[C@H](C)O' in content:
256
- return 'Thr', mods
257
-
258
- # Cysteine (Cys) - Thiol side chain
 
 
 
 
 
 
 
 
 
 
 
 
259
  if '[C@H](CS)' in content or '[C@@H](CS)' in content:
 
 
260
  return 'Cys', mods
261
 
262
- # Methionine (Met) - Methylthioethyl side chain
263
- if ('CCSC' in content):
 
 
 
 
264
  return 'Met', mods
265
-
266
- # Glutamine (Gln) - Carbamoylethyl side chain
267
  if (content == '[C@@H](CC' or content == '[C@H](CC' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CCC(=O)N' in content) or ('CCC(N)=O' in content):
 
 
268
  return 'Gln', mods
269
- # Asparagine (Asn) - Carbamoylmethyl side chain
 
270
  if (content == '[C@@H](C' or content == '[C@H](C' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CC(=O)N' in content) or ('CCN(=O)' in content) or ('CC(N)=O' in content):
 
 
271
  return 'Asn', mods
272
 
273
- # Glutamic acid (Glu) - Carboxyethyl side chain
274
  if ('CCC(=O)O' in content):
275
- return 'Glu', mods
276
- # Aspartic acid (Asp) - Carboxymethyl side chain
 
 
 
277
  if ('CC(=O)O' in content):
 
 
278
  return 'Asp', mods
279
 
280
- # Arginine (Arg) - 3-guanidinopropyl side chain
281
- if ('CCCNC(=N)N' in content):
282
- return 'Arg', mods
283
-
284
- # Histidine (His) - Imidazole side chain
285
  if re.search(r'Cc\d+c\[nH\]cn\d+', content) or re.search(r'Cc\d+cnc\[nH\]\d+', content):
 
 
286
  return 'His', mods
 
 
287
 
288
- ############UAA
 
 
 
 
 
 
289
 
290
- if '[C@H](COC(C)(C)C)' in content or '[C@@H](COC(C)(C)C)' in content:
291
- return 'O-tBu', mods
 
 
 
 
 
 
 
 
 
 
292
 
293
- if re.search(r'c\d+ccccc\d+', content):
294
- if '[C@@H](c1ccccc1)' in content or '[C@H](c1ccccc1)' in content:
295
- return '4', mods # Base phenylglycine
296
- if ('C[C@H](CCCC)' in content or 'C[C@@H](CCCC)' in content) and 'CC(C)' not in content:
297
- return 'Nle', mods
298
-
299
- # Ornithine (Orn) - 3-carbon chain with NH2
300
- if ('C[C@H](CCCN)' in content or 'C[C@@H](CCCN)' in content) and 'CC(C)' not in content:
301
- return 'Orn', mods
302
-
303
- # 2-Naphthylalanine (2Nal)
304
- if ('Cc3cc2ccccc2c3' in content):
305
- return '2Nal', mods
306
-
307
- # Cyclohexylalanine (Cha)
308
- if 'N2CCCCC2' in content or 'CCCCC2' in content:
309
- return 'Cha', mods
310
-
311
- # Aminobutyric acid (Abu) - 2-carbon chain
312
- if ('C[C@H](CC)' in content or 'C[C@@H](CC)' in content) and not any(p in content for p in ['CC(C)', 'CCCC', 'CCC(C)']):
313
- return 'Abu', mods
314
-
315
- # Pipecolic acid (Pip)
316
- if ('N3CCCCC3' in content or 'CCCCC3' in content):
317
- return 'Pip', mods
318
-
319
- # Cyclohexylglycine (Chg) - direct cyclohexyl without CH2
320
- if ('C[C@H](C1CCCCC1)' in content or 'C[C@@H](C1CCCCC1)' in content):
321
- return 'Chg', mods
322
-
323
- # 4-Fluorophenylalanine (4F-Phe)
324
- if ('Cc2ccc(F)cc2' in content):
325
- return '4F-Phe', mods
326
-
327
- # 4-substituted phenylalanines
328
- if 'Cc1ccc' in content:
329
- if 'OMe' in content or 'OCc1ccc' in content:
330
- return '0A1', mods # 4-methoxy-Phenylalanine
331
- elif 'Clc1ccc' in content:
332
- return '200', mods # 4-chloro-Phenylalanine
333
- elif 'Brc1ccc' in content:
334
- return '4BF', mods # 4-Bromo-phenylalanine
335
- elif 'C#Nc1ccc' in content:
336
- return '4CF', mods # 4-cyano-phenylalanine
337
- elif 'Ic1ccc' in content:
338
- return 'PHI', mods # 4-Iodo-phenylalanine
339
- elif 'Fc1ccc' in content:
340
- return 'PFF', mods # 4-Fluoro-phenylalanine
341
-
342
- # Modified tryptophans
343
- if 'c[nH]c2' in content:
344
- if 'Oc2cccc2' in content:
345
- return '0AF', mods # 7-hydroxy-tryptophan
346
- elif 'Fc2cccc2' in content:
347
- return '4FW', mods # 4-fluoro-tryptophan
348
- elif 'Clc2cccc2' in content:
349
- return '6CW', mods # 6-chloro-tryptophan
350
- elif 'Brc2cccc2' in content:
351
- return 'BTR', mods # 6-bromo-tryptophan
352
- elif 'COc2cccc2' in content:
353
- return 'MOT5', mods # 5-Methoxy-tryptophan
354
- elif 'Cc2cccc2' in content:
355
- return 'MTR5', mods # 5-Methyl-tryptophan
356
-
357
- # Special amino acids
358
- if 'CC(C)(C)[C@@H]' in content or 'CC(C)(C)[C@H]' in content:
359
- return 'BUG', mods # Tertleucine
360
-
361
- if 'CCCNC(=N)N' in content:
362
- return 'CIR', mods # Citrulline
363
-
364
- if '[SeH]' in content:
365
- return 'CSE', mods # Selenocysteine
366
-
367
- if '[NH3]CC[C@@H]' in content or '[NH3]CC[C@H]' in content:
368
- return 'DAB', mods # Diaminobutyric acid
369
-
370
- if 'C1CCCCC1' in content:
371
- if 'C1CCCCC1[C@@H]' in content or 'C1CCCCC1[C@H]' in content:
372
- return 'CHG', mods # Cyclohexylglycine
373
- elif 'C1CCCCC1C[C@@H]' in content or 'C1CCCCC1C[C@H]' in content:
374
- return 'ALC', mods # 3-cyclohexyl-alanine
375
-
376
- # Naphthalene derivatives
377
- if 'c1cccc2c1cccc2' in content:
378
- if 'c1cccc2c1cccc2[C@@H]' in content or 'c1cccc2c1cccc2[C@H]' in content:
379
- return 'NAL', mods # 2-Naphthyl-alanine
380
-
381
- # Heteroaromatic derivatives
382
- if 'c1cncc' in content:
383
- return 'PYR4', mods # 3-(4-Pyridyl)-alanine
384
- if 'c1cscc' in content:
385
- return 'THA3', mods # 3-(3-thienyl)-alanine
386
- if 'c1nnc' in content:
387
- return 'TRZ4', mods # 3-(1,2,4-Triazol-1-yl)-alanine
388
-
389
- # Modified serines and threonines
390
- if 'OP(O)(O)O' in content:
391
- if '[C@@H](COP' in content or '[C@H](COP' in content:
392
- return 'SEP', mods # phosphoserine
393
- elif '[C@@H](OP' in content or '[C@H](OP' in content:
394
- return 'TPO', mods # phosphothreonine
395
-
396
- # Specialized ring systems
397
- if 'c1c2ccccc2cc2c1cccc2' in content:
398
- return 'ANTH', mods # 3-(9-anthryl)-alanine
399
- if 'c1csc2c1cccc2' in content:
400
- return 'BTH3', mods # 3-(3-benzothienyl)-alanine
401
- if '[C@]12C[C@H]3C[C@@H](C2)C[C@@H](C1)C3' in content:
402
- return 'ADAM', mods # Adamanthane
403
-
404
- # Fluorinated derivatives
405
- if 'FC(F)(F)' in content:
406
- if 'CC(F)(F)F' in content:
407
- return 'FLA', mods # Trifluoro-alanine
408
- if 'C(F)(F)F)c1' in content:
409
- if 'c1ccccc1C(F)(F)F' in content:
410
- return 'TFG2', mods # 2-(Trifluoromethyl)-phenylglycine
411
- if 'c1cccc(c1)C(F)(F)F' in content:
412
- return 'TFG3', mods # 3-(Trifluoromethyl)-phenylglycine
413
- if 'c1ccc(cc1)C(F)(F)F' in content:
414
- return 'TFG4', mods # 4-(Trifluoromethyl)-phenylglycine
415
-
416
- # Multiple halogen patterns
417
- if 'F' in content and 'c1' in content:
418
- if 'c1ccc(c(c1)F)F' in content:
419
- return 'F2F', mods # 3,4-Difluoro-phenylalanine
420
- if 'cc(F)cc(c1)F' in content:
421
- return 'WFP', mods # 3,5-Difluoro-phenylalanine
422
- if 'Cl' in content and 'c1' in content:
423
- if 'c1ccc(cc1Cl)Cl' in content:
424
- return 'CP24', mods # 2,4-dichloro-phenylalanine
425
- if 'c1ccc(c(c1)Cl)Cl' in content:
426
- return 'CP34', mods # 3,4-dichloro-phenylalanine
427
-
428
- # Hydroxy and amino derivatives
429
- if 'O' in content and 'c1' in content:
430
- if 'c1cc(O)cc(c1)O' in content:
431
- return '3FG', mods # (2s)-amino(3,5-dihydroxyphenyl)-ethanoic acid
432
- if 'c1ccc(c(c1)O)O' in content:
433
- return 'DAH', mods # 3,4-Dihydroxy-phenylalanine
434
-
435
- # Modified histidines
436
- if 'c1cnc' in content:
437
- if '[C@@H]1CN[C@@H](N1)F' in content:
438
- return '2HF', mods # 2-fluoro-l-histidine
439
- if 'c1cnc([nH]1)F' in content:
440
- return '2HF1', mods # 2-fluoro-l-histidine variant
441
- if 'c1c[nH]c(n1)F' in content:
442
- return '2HF2', mods # 2-fluoro-l-histidine variant
443
-
444
- if '[SeH]' in content:
445
- return 'CSE', mods # Selenocysteine
446
- if 'S' in content:
447
- if 'CSCc1ccccc1' in content:
448
- return 'BCS', mods # benzylcysteine
449
- if 'CCSC' in content:
450
- return 'ESC', mods # Ethionine
451
- if 'CCS' in content:
452
- return 'HCS', mods # homocysteine
453
-
454
- if 'CN=[N]=N' in content:
455
- return 'AZDA', mods # azido-alanine
456
- if '[NH]=[C](=[NH2])=[NH2]' in content:
457
- if 'CCC[NH]=' in content:
458
- return 'AGM', mods # 5-methyl-arginine
459
- if 'CC[NH]=' in content:
460
- return 'GDPR', mods # 2-Amino-3-guanidinopropionic acid
461
-
462
- # Others
463
- if 'C1CCCC1' in content:
464
- return 'CPA3', mods # 3-Cyclopentyl-alanine
465
- if 'C1CCCCC1' in content:
466
- if 'CC1CCCCC1' in content:
467
- return 'ALC', mods # 3-cyclohexyl-alanine
468
- else:
469
- return 'CHG', mods # Cyclohexylglycine
470
-
471
- if 'CCC[C@@H]' in content or 'CCC[C@H]' in content:
472
- return 'NLE', mods # Norleucine
473
- if 'CC[C@@H]' in content or 'CC[C@H]' in content:
474
- if not any(x in content for x in ['CC(C)', 'COC', 'CN(']):
475
- return 'ABA', mods # 2-Aminobutyric acid
476
- if 'CCON' in content:
477
- return 'CAN', mods # canaline
478
- if '[C@@H]1C=C[C@@H](C=C1)' in content:
479
- return 'ACZ', mods # cis-amiclenomycin
480
- if 'CCC(=O)[NH3]' in content:
481
- return 'ONL', mods # 5-oxo-l-norleucine
482
- if 'c1ccncc1' in content:
483
- return 'PYR4', mods # 3-(4-Pyridyl)-alanine
484
- if 'c1ccco1' in content:
485
- return 'FUA2', mods # (2-furyl)-alanine
486
-
487
- if 'c1ccc' in content:
488
- if 'c1ccc(cc1)c1ccccc1' in content:
489
- return 'BIF', mods # 4,4-biphenylalanine
490
- if 'c1ccc(cc1)C(=O)c1ccccc1' in content:
491
- return 'PBF', mods # 4-benzoyl-phenylalanine
492
- if 'c1ccc(cc1)C(C)(C)C' in content:
493
- return 'TBP4', mods # 4-tert-butyl-phenylalanine
494
- if 'c1ccc(cc1)[C](=[NH2])=[NH2]' in content:
495
- return '0BN', mods # 4-carbamimidoyl-l-phenylalanine
496
- if 'c1cccc(c1)[C](=[NH2])=[NH2]' in content:
497
- return 'APM', mods # m-amidinophenyl-3-alanine
498
-
499
- if 'O' in content:
500
- if '[C@H]([C@H](C)O)O' in content:
501
- return 'ILX', mods # 4,5-dihydroxy-isoleucine
502
- if '[C@H]([C@@H](C)O)O' in content:
503
- return 'ALO', mods # Allo-threonine
504
- if '[C@H](COP(O)(O)O)' in content:
505
- return 'SEP', mods # phosphoserine
506
- if '[C@H]([C@@H](C)OP(O)(O)O)' in content:
507
- return 'TPO', mods # phosphothreonine
508
- if '[C@H](c1ccc(O)cc1)O' in content:
509
- return 'OMX', mods # (betar)-beta-hydroxy-l-tyrosine
510
- if '[C@H](c1ccc(c(Cl)c1)O)O' in content:
511
- return 'OMY', mods # (betar)-3-chloro-beta-hydroxy-l-tyrosine
512
-
513
- if 'n1' in content:
514
- if 'n1cccn1' in content:
515
- return 'PYZ1', mods # 3-(1-Pyrazolyl)-alanine
516
- if 'n1nncn1' in content:
517
- return 'TEZA', mods # 3-(2-Tetrazolyl)-alanine
518
- if 'c2c(n1)cccc2' in content:
519
- return 'QU32', mods # 3-(2-Quinolyl)-alanine
520
- if 'c1cnc2c(c1)cccc2' in content:
521
- return 'QU33', mods # 3-(3-quinolyl)-alanine
522
- if 'c1ccnc2c1cccc2' in content:
523
- return 'QU34', mods # 3-(4-quinolyl)-alanine
524
- if 'c1ccc2c(c1)nccc2' in content:
525
- return 'QU35', mods # 3-(5-Quinolyl)-alanine
526
- if 'c1ccc2c(c1)cncc2' in content:
527
- return 'QU36', mods # 3-(6-Quinolyl)-alanine
528
- if 'c1cnc2c(n1)cccc2' in content:
529
- return 'QX32', mods # 3-(2-quinoxalyl)-alanine
530
-
531
- if 'N' in content:
532
- if '[NH3]CC[C@@H]' in content:
533
- return 'DAB', mods # Diaminobutyric acid
534
- if '[NH3]C[C@@H]' in content:
535
- return 'DPP', mods # 2,3-Diaminopropanoic acid
536
- if '[NH3]CCCCCC[C@@H]' in content:
537
- return 'HHK', mods # (2s)-2,8-diaminooctanoic acid
538
- if 'CCC[NH]=[C](=[NH2])=[NH2]' in content:
539
- return 'GBUT', mods # 2-Amino-4-guanidinobutryric acid
540
- if '[NH]=[C](=S)=[NH2]' in content:
541
- return 'THIC', mods # Thio-citrulline
542
-
543
- if 'CC' in content:
544
- if 'CCCC[C@@H]' in content:
545
- return 'AHP', mods # 2-Aminoheptanoic acid
546
- if 'CCC([C@@H])(C)C' in content:
547
- return 'I2M', mods # 3-methyl-l-alloisoleucine
548
- if 'CC[C@H]([C@@H])C' in content:
549
- return 'IIL', mods # Allo-Isoleucine
550
- if '[C@H](CCC(C)C)' in content:
551
- return 'HLEU', mods # Homoleucine
552
- if '[C@@H]([C@@H](C)O)C' in content:
553
- return 'HLU', mods # beta-hydroxyleucine
554
-
555
- if '[C@@H]' in content:
556
- if '[C@@H](C[C@@H](F))' in content:
557
- return 'FGA4', mods # 4-Fluoro-glutamic acid
558
- if '[C@@H](C[C@@H](O))' in content:
559
- return '3GL', mods # 4-hydroxy-glutamic-acid
560
- if '[C@@H](C[C@H](C))' in content:
561
- return 'LME', mods # (3r)-3-methyl-l-glutamic acid
562
- if '[C@@H](CC[C@H](C))' in content:
563
- return 'MEG', mods # (3s)-3-methyl-l-glutamic acid
564
-
565
- if 'S' in content:
566
- if 'SCC[C@@H]' in content:
567
- return 'HSER', mods # homoserine
568
- if 'SCCN' in content:
569
- return 'SLZ', mods # thialysine
570
- if 'SC(=O)' in content:
571
- return 'CSA', mods # s-acetonylcysteine
572
- if '[S@@](=O)' in content:
573
- return 'SME', mods # Methionine sulfoxide
574
- if 'S(=O)(=O)' in content:
575
- return 'OMT', mods # Methionine sulfone
576
-
577
- if 'C=' in content:
578
- if 'C=C[C@@H]' in content:
579
- return '2AG', mods # 2-Allyl-glycine
580
- if 'C=C[C@@H]' in content:
581
- return 'LVG', mods # vinylglycine
582
- if 'C=Cc1ccccc1' in content:
583
- return 'STYA', mods # Styrylalanine
584
-
585
- if '[C@@H]1Cc2c(C1)cccc2' in content:
586
- return 'IGL', mods # alpha-amino-2-indanacetic acid
587
- if '[C](=[C](=O)=O)=O' in content:
588
- return '26P', mods # 2-amino-6-oxopimelic acid
589
- if '[C](=[C](=O)=O)=C' in content:
590
- return '2NP', mods # l-2-amino-6-methylene-pimelic acid
591
- if 'c1cccc2c1cc(O)cc2' in content:
592
- return 'NAO1', mods # 5-hydroxy-1-naphthalene
593
- if 'c1ccc2c(c1)cc(O)cc2' in content:
594
- return 'NAO2', mods # 6-hydroxy-2-naphthalene
595
  return None, mods
596
 
597
  def get_modifications(self, segment):
598
- """Get modifications based on bond types"""
599
  mods = []
600
- if segment.get('bond_after'):
601
- if 'N(C)' in segment['bond_after'] or segment['bond_after'].startswith('C(=O)N(C)'):
602
- mods.append('N-Me')
603
- if 'OC(=O)' in segment['bond_after']:
604
- mods.append('O-linked')
 
 
 
 
 
 
 
 
605
  return mods
606
 
607
  def analyze_structure(self, smiles):
608
- """Main analysis function with debug output"""
609
  print("\nAnalyzing structure:", smiles)
 
 
 
 
 
 
 
 
 
610
 
611
- # Split into segments
612
- segments = self.split_on_bonds(smiles)
 
 
613
 
614
  print("\nSegment Analysis:")
615
  sequence = []
616
  for i, segment in enumerate(segments):
617
  print(f"\nSegment {i}:")
618
- print(f"Content: {segment['content']}")
619
  print(f"Bond before: {segment.get('bond_before', 'None')}")
620
  print(f"Bond after: {segment.get('bond_after', 'None')}")
621
 
@@ -628,11 +548,10 @@ class PeptideAnalyzer:
628
  print(f"Identified as: {residue}")
629
  print(f"Modifications: {mods}")
630
  else:
631
- print(f"Warning: Could not identify residue in segment: {segment['content']}")
632
 
633
- # Check if cyclic
634
- is_cyclic, peptide_cycles, aromatic_cycles = self.is_cyclic(smiles)
635
  three_letter = '-'.join(sequence)
 
636
  one_letter = ''.join(self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence)
637
 
638
  if is_cyclic:
@@ -642,13 +561,14 @@ class PeptideAnalyzer:
642
  print(f"\nFinal sequence: {three_letter}")
643
  print(f"One-letter code: {one_letter}")
644
  print(f"Is cyclic: {is_cyclic}")
645
- #print(f"Peptide cycles: {peptide_cycles}")
646
- #print(f"Aromatic cycles: {aromatic_cycles}")
647
 
648
  return {
649
  'three_letter': three_letter,
650
  'one_letter': one_letter,
651
- 'is_cyclic': is_cyclic
 
652
  }
653
 
654
  def annotate_cyclic_structure(mol, sequence):
 
17
  class PeptideAnalyzer:
18
  def __init__(self):
19
  self.bond_patterns = [
20
+ #(r'OC\(=O\)', 'ester'), # Ester bond
21
  (r'N\(C\)C\(=O\)', 'n_methyl'), # N-methylated peptide bond
22
  (r'N[0-9]C\(=O\)', 'proline'), # Proline peptide bond
23
  (r'NC\(=O\)', 'peptide'), # Standard peptide bond
24
  (r'C\(=O\)N\(C\)', 'n_methyl_reverse'), # Reverse N-methylated
25
  (r'C\(=O\)N[12]?', 'peptide_reverse') # Reverse peptide bond
26
  ]
27
+ self.complex_residue_patterns = [
28
+ # Kpg - Lys(palmitoyl-Glu-OtBu)
29
+ (r'\[C[@]H\]\(CCCNC\(=O\)CCC\[C@@H\]\(NC\(=O\)CCCCCCCCCCCCCCCC\)C\(=O\)OC\(C\)\(C\)C\)', 'Kpg'),
30
+ (r'CCCCCCCCCCCCCCCCC\(=O\)N\[C@H\]\(CCCC\(=O\)NCCC\[C@@H\]', 'Kpg'),
31
+ (r'\[C[@]?H\]\(CSC\(c\d+ccccc\d+\)\(c\d+ccccc\d+\)c\d+ccc\(OC\)cc\d+\)', 'Cmt'),
32
+ (r'CSC\(c.*?c.*?OC\)', 'Cmt'), # Core structure of Cys-Mmt group
33
+ (r'COc.*?ccc\(C\(SC', 'Cmt'), # Start of Cmt in cyclic peptides
34
+ (r'c2ccccc2\)c2ccccc2\)cc', 'Cmt'), # End of Cmt in cyclic peptides
35
+ # Glu(OAll)
36
+ (r'C=CCOC\(=O\)CC\[C@@H\]', 'Eal'),
37
+ #(r'COc\d+ccc\(C\(SC\[C@@H\]\d+.*?\)\(c\d+ccccc\d+\)c\d+ccccc\d+\)cc\d+', 'Cmt-cyclic'),
38
+
39
+ # Dtg - Asp(OtBu)-(Dmb)Gly
40
+ (r'CN\(Cc\d+ccc\(OC\)cc\d+OC\)C\(=O\)\[C@H\]\(CC\(=O\)OC\(C\)\(C\)C\)', 'Dtg'),
41
+ (r'C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
42
+ (r'N\[C@@H\]\(CC\(=O\)OC\(C\)\(C\)C\)C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
43
+ ]
44
  # Three to one letter code mapping
45
  self.three_to_one = {
46
  'Ala': 'A', 'Cys': 'C', 'Asp': 'D', 'Glu': 'E',
47
  'Phe': 'F', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
48
  'Lys': 'K', 'Leu': 'L', 'Met': 'M', 'Asn': 'N',
49
  'Pro': 'P', 'Gln': 'Q', 'Arg': 'R', 'Ser': 'S',
50
+ 'Thr': 'T', 'Val': 'V', 'Trp': 'W', 'Tyr': 'Y',
51
+ 'ala': 'a', 'cys': 'c', 'asp': 'd', 'glu': 'e',
52
+ 'phe': 'f', 'gly': 'g', 'his': 'h', 'ile': 'i',
53
+ 'lys': 'k', 'leu': 'l', 'met': 'm', 'asn': 'n',
54
+ 'pro': 'p', 'gln': 'q', 'arg': 'r', 'ser': 's',
55
+ 'thr': 't', 'val': 'v', 'trp': 'w', 'tyr': 'y', 'Cmt-cyclic': 'Ĉ',
56
+ 'Aib': 'Ŷ', 'Dtg': 'Ĝ', 'Cmt': 'Ĉ', 'Eal': 'Ė', 'Nml': "Ŀ", 'Nma': 'Ṃ',
57
+ 'Kpg': 'Ƙ', 'Tpb': 'Ṯ', 'Cyl': 'Ċ', 'Nle': 'Ł', 'Hph': 'Ĥ', 'Cys-Cys': 'CC', 'cys-cys': 'cc',
58
  }
59
+
60
+ def preprocess_complex_residues(self, smiles):
61
+ complex_positions = []
62
+
63
+ for pattern, residue_type in self.complex_residue_patterns:
64
+ for match in re.finditer(pattern, smiles):
65
+ # Only add if this position doesn't overlap with existing matches
66
+ if not any(pos['start'] <= match.start() < pos['end'] or
67
+ pos['start'] < match.end() <= pos['end'] for pos in complex_positions):
68
+ complex_positions.append({
69
+ 'start': match.start(),
70
+ 'end': match.end(),
71
+ 'type': residue_type,
72
+ 'pattern': match.group()
73
+ })
74
+
75
+ # Sort by position (to handle potential overlapping matches)
76
+ complex_positions.sort(key=lambda x: x['start'])
77
+
78
+ if not complex_positions:
79
+ return smiles, []
80
+
81
+ # Build a new SMILES string, protecting complex residues
82
+ preprocessed_smiles = smiles
83
+ offset = 0 # Track offset from replacements
84
+
85
+ protected_residues = []
86
+
87
+ for pos in complex_positions:
88
+ start = pos['start'] + offset
89
+ end = pos['end'] + offset
90
+
91
+ complex_part = preprocessed_smiles[start:end]
92
+
93
+ if not ('[C@H]' in complex_part or '[C@@H]' in complex_part):
94
+ continue
95
+
96
+ placeholder = f"COMPLEX_RESIDUE_{len(protected_residues)}"
97
+
98
+ preprocessed_smiles = preprocessed_smiles[:start] + placeholder + preprocessed_smiles[end:]
99
+
100
+ offset += len(placeholder) - (end - start)
101
+
102
+ protected_residues.append({
103
+ 'placeholder': placeholder,
104
+ 'type': pos['type'],
105
+ 'content': complex_part
106
+ })
107
 
108
+ #print(f"Protected {pos['type']}: {complex_part[:20]}... as {placeholder}")
109
+
110
+ return preprocessed_smiles, protected_residues
111
+
112
  def is_peptide(self, smiles):
113
  """Check if the SMILES represents a peptide structure"""
114
  mol = Chem.MolFromSmiles(smiles)
 
149
  is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
150
  return is_cyclic, peptide_cycles, aromatic_cycles
151
 
152
+ def split_on_bonds(self, smiles, protected_residues=None):
153
+ """Split SMILES into segments based on peptide bonds, with improved handling of protected residues"""
154
  positions = []
155
  used = set()
156
+
157
+ # First, handle protected complex residues if any
158
+ if protected_residues:
159
+ for residue in protected_residues:
160
+ match = re.search(residue['placeholder'], smiles)
161
+ if match:
162
+ positions.append({
163
+ 'start': match.start(),
164
+ 'end': match.end(),
165
+ 'type': 'complex',
166
+ 'pattern': residue['placeholder'],
167
+ 'residue_type': residue['type'],
168
+ 'content': residue['content']
169
+ })
170
+ used.update(range(match.start(), match.end()))
171
+
172
+ # Find all peptide bonds
173
+ bond_positions = []
174
+
175
  # Find Gly pattern first
176
  gly_pattern = r'NCC\(=O\)'
177
  for match in re.finditer(gly_pattern, smiles):
178
  if not any(p in range(match.start(), match.end()) for p in used):
179
+ bond_positions.append({
180
  'start': match.start(),
181
  'end': match.end(),
182
  'type': 'gly',
 
187
  for pattern, bond_type in self.bond_patterns:
188
  for match in re.finditer(pattern, smiles):
189
  if not any(p in range(match.start(), match.end()) for p in used):
190
+ bond_positions.append({
191
  'start': match.start(),
192
  'end': match.end(),
193
  'type': bond_type,
194
  'pattern': match.group()
195
  })
196
  used.update(range(match.start(), match.end()))
197
+
198
+ bond_positions.sort(key=lambda x: x['start'])
199
+
200
+ # Combine complex residue positions and bond positions
201
+ all_positions = positions + bond_positions
202
+ all_positions.sort(key=lambda x: x['start'])
203
 
204
  # Create segments
205
  segments = []
206
+
207
+ if all_positions and all_positions[0]['start'] > 0:
208
+ segments.append({
209
+ 'content': smiles[0:all_positions[0]['start']],
210
+ 'bond_after': all_positions[0]['pattern'] if all_positions[0]['type'] != 'complex' else None,
211
+ 'complex_after': all_positions[0]['pattern'] if all_positions[0]['type'] == 'complex' else None
212
+ })
213
+
214
+ for i in range(len(all_positions)-1):
215
+ current = all_positions[i]
216
+ next_pos = all_positions[i+1]
217
+
218
+ if current['type'] == 'complex':
219
  segments.append({
220
+ 'content': current['content'],
221
+ 'bond_before': all_positions[i-1]['pattern'] if i > 0 and all_positions[i-1]['type'] != 'complex' else None,
222
+ 'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None,
223
+ 'complex_type': current['residue_type']
224
  })
225
+ elif current['type'] == 'gly':
226
+ segments.append({
227
+ 'content': 'NCC(=O)',
228
+ 'bond_before': all_positions[i-1]['pattern'] if i > 0 and all_positions[i-1]['type'] != 'complex' else None,
229
+ 'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None
230
+ })
231
+ else:
232
+ # Only create segment if there's content between this bond and next position
233
+ content = smiles[current['end']:next_pos['start']]
234
+ if content and next_pos['type'] != 'complex':
235
  segments.append({
236
+ 'content': content,
237
+ 'bond_before': current['pattern'],
238
+ 'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None
239
  })
240
+
241
+ if all_positions and all_positions[-1]['end'] < len(smiles):
242
+ if all_positions[-1]['type'] == 'complex':
 
 
 
 
 
 
 
 
243
  segments.append({
244
+ 'content': all_positions[-1]['content'],
245
+ 'bond_before': all_positions[-2]['pattern'] if len(all_positions) > 1 and all_positions[-2]['type'] != 'complex' else None,
246
+ 'complex_type': all_positions[-1]['residue_type']
247
  })
248
+ else:
249
+ segments.append({
250
+ 'content': smiles[all_positions[-1]['end']:],
251
+ 'bond_before': all_positions[-1]['pattern']
252
+ })
253
+
254
  return segments
255
 
256
  def clean_terminal_carboxyl(self, segment):
 
274
  def identify_residue(self, segment):
275
  """Identify residue with Pro reconstruction"""
276
  # Only clean terminal carboxyl if this is the last segment
277
+ if 'complex_type' in segment:
278
+ return segment['complex_type'], []
279
  content = self.clean_terminal_carboxyl(segment)
280
  mods = self.get_modifications(segment)
281
 
282
+ if content.startswith('COc1ccc(C(SC[C@@H]'):
283
+ print("DIRECT MATCH: Found Cmt at beginning")
284
+ return 'Cmt', mods
285
+
286
+ if '[C@@H]3CCCN3C2=O)(c2ccccc2)c2ccccc2)cc' in content:
287
+ print("DIRECT MATCH: Found Pro at end")
288
+ return 'Pro', mods
289
+
290
+ # Eal - Glu(OAll)
291
+ if 'CCC(=O)OCC=C' in content or 'CC(=O)OCC=C' in content or 'C=CCOC(=O)CC' in content:
292
+ return 'Eal', mods
293
+
294
  # Proline (P) - flexible ring numbers
295
  if any([
296
  # Check for any ring number in bond patterns
 
298
  any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
299
  for n in '123456789'
300
  ]) or any([(segment.get('bond_before', '').startswith(f'C(=O)N{n}') and 'CCC' in content and
301
+ any(f'CCC{n}' for n in '123456789'))
302
+ for n in '123456789'
303
  ]) or any([
304
  # Check ending patterns with any ring number
305
  (f'CCCN{n}' in content and content.endswith('=O') and
 
316
  ]):
317
  return 'Pro', mods
318
 
319
+ # D-Proline (p)
320
+ if ('N1[C@H](CCC1)' in content):
321
+ return 'pro', mods
322
+
323
+ # Tryptophan (W)
324
  if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
325
  'c[nH]c' in content.replace(' ', ''):
326
+ if '[C@H](CC' in content: # D-form
327
+ return 'trp', mods
328
  return 'Trp', mods
329
 
330
+ # Lysine (K) - both patterns
331
  if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
332
+ if '[C@H](CCCCN)' in content: # D-form
333
+ return 'lys', mods
334
  return 'Lys', mods
335
+
336
+ # Arginine (R) - both patterns
337
  if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
338
+ if '[C@H](CCCNC(=N)N)' in content: # D-form
339
+ return 'arg', mods
340
  return 'Arg', mods
341
 
342
+ if content == 'C' and segment.get('bond_before') and segment.get('bond_after'):
343
+ # If it's surrounded by peptide bonds, it's almost certainly Gly
344
+ if ('C(=O)N' in segment['bond_before'] or 'NC(=O)' in segment['bond_before'] or 'N(C)C(=O)' in segment['bond_before']) and \
345
+ ('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
 
346
  return 'Gly', mods
347
+
348
+ # Leucine patterns (L/l)
349
  if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content or '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content or (('N[C@H](CCC(C)C)' in content or 'N[C@@H](CCC(C)C)' in content) and segment.get('bond_before') is None):
350
+ if '[C@H](CC(C)C)' in content or 'CC(C)C[C@H]' in content: # D-form
351
+ return 'leu', mods
352
  return 'Leu', mods
353
 
354
+ # Threonine patterns (T/t)
355
+ if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content or '[C@@H]([C@H](C)O)' in content or '[C@H]([C@@H](C)O)' in content:
356
+ # Check both stereochemistry patterns
357
+ if '[C@H]([C@@H](C)O)' in content: # D-form
358
+ return 'thr', mods
359
  return 'Thr', mods
360
+
361
+ if re.search(r'\[C@H\]\(CCc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(CCc\d+ccccc\d+\)', content):
362
+ return 'Hph', mods
363
+
364
+ # Phenylalanine patterns (F/f)
365
  if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(Cc\d+ccccc\d+\)', content):
366
+ if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content): # D-form
367
+ return 'phe', mods
368
  return 'Phe', mods
369
 
370
+ if ('CC(C)[C@@H]' in content or 'CC(C)[C@H]' in content or
371
+ '[C@H](C(C)C)' in content or '[C@@H](C(C)C)' in content or
372
+ 'C(C)C[C@H]' in content or 'C(C)C[C@@H]' in content):
 
 
 
 
373
 
374
+ # Make sure it's not leucine
375
+ if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]', 'CCC(=O)']):
376
+ if '[C@H]' in content and not '[C@@H]' in content: # D-form
377
+ return 'val', mods
378
+ return 'Val', mods
379
+
380
+ # Isoleucine patterns (I/i)
381
  if any([
382
+ 'CC[C@@H](C)' in content, '[C@@H](C)CC' in content,
 
383
  '[C@@H](CC)C' in content,
 
 
384
  'C(C)C[C@@H]' in content and 'CC(C)C' not in content
385
  ]):
386
+ if '[C@H]([C@@H](CC)C)' in content or '[C@H](CC)C' in content: # D-form
387
+ return 'ile', mods
388
+ elif '[C@H](C)CC' in content or '[C@H](CC)C' in content or 'CC[C@H](C)' in content:
389
+ return 'ile', mods
390
+ elif 'C(C)C[C@H]' in content and 'CC(C)C' not in content:
391
+ return 'ile', mods
392
  return 'Ile', mods
393
 
394
+ # Alanine patterns (A/a)
395
  if ('[C@H](C)' in content or '[C@@H](C)' in content):
396
  if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
397
+ if '[C@H](C)' in content: # D-form
398
+ return 'ala', mods
399
  return 'Ala', mods
400
 
401
+ # Tyrosine patterns (Y/y)
402
  if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
403
+ if '[C@H](Cc1ccc(O)cc1)' in content: # D-form
404
+ return 'tyr', mods
405
  return 'Tyr', mods
406
 
407
+ # Serine patterns (S/s)
408
  if '[C@H](CO)' in content or '[C@@H](CO)' in content:
409
  if not ('C(C)O' in content or 'COC' in content):
410
+ if '[C@H](CO)' in content: # D-form
411
+ return 'ser', mods
412
  return 'Ser', mods
413
+
414
+ if 'CSSC' in content:
415
+ if re.search(r'\[C@@H\].*CSSC.*\[C@@H\]', content) or re.search(r'\[C@H\].*CSSC.*\[C@H\]', content):
416
+ if '[C@H]' in content and not '[C@@H]' in content: # D-form
417
+ return 'cys-cys', mods
418
+ return 'Cys-Cys', mods
419
+
420
+ if '[C@@H](N)CSSC' in content or '[C@H](N)CSSC' in content:
421
+ if '[C@H](N)CSSC' in content: # D-form
422
+ return 'cys-cys', mods
423
+ return 'Cys-Cys', mods
424
+
425
+ if 'CSSC[C@@H](C(=O)O)' in content or 'CSSC[C@H](C(=O)O)' in content:
426
+ if 'CSSC[C@H](C(=O)O)' in content: # D-form
427
+ return 'cys-cys', mods
428
+ return 'Cys-Cys', mods
429
+
430
+ # Cysteine patterns (C/c)
431
  if '[C@H](CS)' in content or '[C@@H](CS)' in content:
432
+ if '[C@H](CS)' in content: # D-form
433
+ return 'cys', mods
434
  return 'Cys', mods
435
 
436
+ # Methionine patterns (M/m)
437
+ if ('CCSC' in content) or ("CSCC" in content):
438
+ if '[C@H](CCSC)' in content: # D-form
439
+ return 'met', mods
440
+ elif '[C@H]' in content:
441
+ return 'met', mods
442
  return 'Met', mods
443
+
444
+ # Glutamine patterns (Q/q)
445
  if (content == '[C@@H](CC' or content == '[C@H](CC' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CCC(=O)N' in content) or ('CCC(N)=O' in content):
446
+ if '[C@H](CCC(=O)N)' in content: # D-form
447
+ return 'gln', mods
448
  return 'Gln', mods
449
+
450
+ # Asparagine patterns (N/n)
451
  if (content == '[C@@H](C' or content == '[C@H](C' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CC(=O)N' in content) or ('CCN(=O)' in content) or ('CC(N)=O' in content):
452
+ if '[C@H](CC(=O)N)' in content: # D-form
453
+ return 'asn', mods
454
  return 'Asn', mods
455
 
456
+ # Glutamic acid patterns (E/e)
457
  if ('CCC(=O)O' in content):
458
+ if '[C@H](CCC(=O)O)' in content: # D-form
459
+ return 'glu', mods
460
+ return 'Glu', mods
461
+
462
+ # Aspartic acid patterns (D/d)
463
  if ('CC(=O)O' in content):
464
+ if '[C@H](CC(=O)O)' in content: # D-form
465
+ return 'asp', mods
466
  return 'Asp', mods
467
 
 
 
 
 
 
468
  if re.search(r'Cc\d+c\[nH\]cn\d+', content) or re.search(r'Cc\d+cnc\[nH\]\d+', content):
469
+ if '[C@H]' in content: # D-form
470
+ return 'his', mods
471
  return 'His', mods
472
+ if 'C2(CCCC2)' in content or 'C1(CCCC1)' in content or re.search(r'C\d+\(CCCC\d+\)', content):
473
+ return 'Cyl', mods
474
 
475
+ if ('N[C@@H](CCCC)' in content or '[C@@H](CCCC)' in content or 'CCCC[C@@H]' in content or
476
+ 'N[C@H](CCCC)' in content or '[C@H](CCCC)' in content) and 'CC(C)' not in content:
477
+ return 'Nle', mods
478
+ # Aib - alpha-aminoisobutyric acid (2-aminoisobutyric acid)
479
+ if 'C(C)(C)(N)' in content or 'C(C)(C)' in content or 'C(C)(C)' in content and ('C(=O)N' in segment['bond_before'] or 'NC(=O)' in segment['bond_before'] or 'N(C)C(=O)' in segment['bond_before']) and \
480
+ ('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
481
+ return 'Aib', mods
482
 
483
+ # Dtg - Asp(OtBu)-(Dmb)Gly
484
+ if 'CC(=O)OC(C)(C)C' in content and 'CC1=C(C=C(C=C1)OC)OC' in content:
485
+ return 'Dtg', mods
486
+
487
+
488
+ # Kpg - Lys(palmitoyl-Glu-OtBu)
489
+ if 'CCCNC(=O)' in content and 'CCCCCCCCCCCC' in content:
490
+ return 'Kpg', mods
491
+
492
+ # Tpb - Thr(PO(OBzl)OH)
493
+ if re.search(r'\[C[@]?H\]\(C\)OP\(=O\)\(O\)', content) or 'OP(=O)(O)OCC' in content:
494
+ return 'Tpb', mods
495
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
496
  return None, mods
497
 
498
  def get_modifications(self, segment):
499
+ """Get modifications based on bond types and segment content - fixed to avoid duplicates"""
500
  mods = []
501
+
502
+ # Check for N-methylation in any form, but only add it once
503
+ # Check both bonds and segment content for N-methylation patterns
504
+ if ((segment.get('bond_after') and
505
+ ('N(C)' in segment['bond_after'] or segment['bond_after'].startswith('C(=O)N(C)'))) or
506
+ ('N(C)C(=O)' in segment['content'] or 'N(C)C1=O' in segment['content']) or
507
+ (segment['content'].endswith('N(C)C(=O)') or segment['content'].endswith('N(C)C1=O'))):
508
+ mods.append('N-Me')
509
+
510
+ # Check for O-linked modifications
511
+ #if segment.get('bond_after') and 'OC(=O)' in segment['bond_after']:
512
+ #mods.append('O-linked')
513
+
514
  return mods
515
 
516
  def analyze_structure(self, smiles):
517
+ """Main analysis function with preprocessing for complex residues"""
518
  print("\nAnalyzing structure:", smiles)
519
+
520
+ # Pre-process to identify complex residues first
521
+ preprocessed_smiles, protected_residues = self.preprocess_complex_residues(smiles)
522
+
523
+ if protected_residues:
524
+ print(f"Identified {len(protected_residues)} complex residues during pre-processing")
525
+ for i, residue in enumerate(protected_residues):
526
+ print(f"Complex residue {i+1}: {residue['type']}")
527
+
528
 
529
+ # Check if it's cyclic
530
+ is_cyclic, peptide_cycles, aromatic_cycles = self.is_cyclic(smiles)
531
+
532
+ segments = self.split_on_bonds(preprocessed_smiles, protected_residues)
533
 
534
  print("\nSegment Analysis:")
535
  sequence = []
536
  for i, segment in enumerate(segments):
537
  print(f"\nSegment {i}:")
538
+ print(f"Content: {segment.get('content', 'None')}")
539
  print(f"Bond before: {segment.get('bond_before', 'None')}")
540
  print(f"Bond after: {segment.get('bond_after', 'None')}")
541
 
 
548
  print(f"Identified as: {residue}")
549
  print(f"Modifications: {mods}")
550
  else:
551
+ print(f"Warning: Could not identify residue in segment: {segment.get('content', 'None')}")
552
 
 
 
553
  three_letter = '-'.join(sequence)
554
+
555
  one_letter = ''.join(self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence)
556
 
557
  if is_cyclic:
 
561
  print(f"\nFinal sequence: {three_letter}")
562
  print(f"One-letter code: {one_letter}")
563
  print(f"Is cyclic: {is_cyclic}")
564
+ print(f"Peptide cycles: {peptide_cycles}")
565
+ print(f"Aromatic cycles: {aromatic_cycles}")
566
 
567
  return {
568
  'three_letter': three_letter,
569
  'one_letter': one_letter,
570
+ 'is_cyclic': is_cyclic,
571
+ 'residues': sequence
572
  }
573
 
574
  def annotate_cyclic_structure(mol, sequence):