Spaces:
Running
Running
Commit
·
71e2885
1
Parent(s):
2b18b2c
Adjust several L/D amino acid (aa) patterns and the Gly bond handling
Browse files
app.py
CHANGED
@@ -72,12 +72,10 @@ class PeptideAnalyzer:
|
|
72 |
|
73 |
is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
|
74 |
return is_cyclic, peptide_cycles, aromatic_cycles
|
75 |
-
|
76 |
def split_on_bonds(self, smiles):
|
77 |
-
"""Split SMILES into segments with simplified Pro handling"""
|
78 |
positions = []
|
79 |
used = set()
|
80 |
-
|
81 |
# Find Gly pattern first
|
82 |
gly_pattern = r'NCC\(=O\)'
|
83 |
for match in re.finditer(gly_pattern, smiles):
|
@@ -106,7 +104,6 @@ class PeptideAnalyzer:
|
|
106 |
|
107 |
# Create segments
|
108 |
segments = []
|
109 |
-
|
110 |
if positions:
|
111 |
# First segment
|
112 |
if positions[0]['start'] > 0:
|
@@ -114,18 +111,21 @@ class PeptideAnalyzer:
|
|
114 |
'content': smiles[0:positions[0]['start']],
|
115 |
'bond_after': positions[0]['pattern']
|
116 |
})
|
117 |
-
|
118 |
# Process segments
|
119 |
for i in range(len(positions)-1):
|
120 |
current = positions[i]
|
121 |
next_pos = positions[i+1]
|
122 |
-
|
123 |
if current['type'] == 'gly':
|
124 |
segments.append({
|
125 |
'content': 'NCC(=O)',
|
126 |
'bond_before': positions[i-1]['pattern'] if i > 0 else None,
|
127 |
'bond_after': next_pos['pattern']
|
128 |
})
|
|
|
|
|
|
|
|
|
|
|
129 |
else:
|
130 |
content = smiles[current['end']:next_pos['start']]
|
131 |
if content:
|
@@ -141,7 +141,6 @@ class PeptideAnalyzer:
|
|
141 |
'content': smiles[positions[-1]['end']:],
|
142 |
'bond_before': positions[-1]['pattern']
|
143 |
})
|
144 |
-
|
145 |
return segments
|
146 |
|
147 |
def clean_terminal_carboxyl(self, segment):
|
@@ -168,11 +167,162 @@ class PeptideAnalyzer:
|
|
168 |
content = self.clean_terminal_carboxyl(segment)
|
169 |
mods = self.get_modifications(segment)
|
170 |
|
171 |
-
#
|
172 |
-
|
173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
if '[C@@H](c1ccccc1)' in content or '[C@H](c1ccccc1)' in content:
|
175 |
return '4', mods # Base phenylglycine
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
|
177 |
# 4-substituted phenylalanines
|
178 |
if 'Cc1ccc' in content:
|
@@ -282,22 +432,6 @@ class PeptideAnalyzer:
|
|
282 |
if 'c1ccc(c(c1)O)O' in content:
|
283 |
return 'DAH', mods # 3,4-Dihydroxy-phenylalanine
|
284 |
|
285 |
-
# Cyclic amino acids
|
286 |
-
if 'C1CCCC1' in content:
|
287 |
-
return 'CPA3', mods # 3-Cyclopentyl-alanine
|
288 |
-
if 'C1CCCCC1' in content:
|
289 |
-
if 'CC1CCCCC1' in content:
|
290 |
-
return 'ALC', mods # 3-cyclohexyl-alanine
|
291 |
-
else:
|
292 |
-
return 'CHG', mods # Cyclohexylglycine
|
293 |
-
|
294 |
-
# Chain-length variants
|
295 |
-
if 'CCC[C@@H]' in content or 'CCC[C@H]' in content:
|
296 |
-
return 'NLE', mods # Norleucine
|
297 |
-
if 'CC[C@@H]' in content or 'CC[C@H]' in content:
|
298 |
-
if not any(x in content for x in ['CC(C)', 'COC', 'CN(']):
|
299 |
-
return 'ABA', mods # 2-Aminobutyric acid
|
300 |
-
|
301 |
# Modified histidines
|
302 |
if 'c1cnc' in content:
|
303 |
if '[C@@H]1CN[C@@H](N1)F' in content:
|
@@ -307,7 +441,6 @@ class PeptideAnalyzer:
|
|
307 |
if 'c1c[nH]c(n1)F' in content:
|
308 |
return '2HF2', mods # 2-fluoro-l-histidine variant
|
309 |
|
310 |
-
# Sulfur and selenium containing
|
311 |
if '[SeH]' in content:
|
312 |
return 'CSE', mods # Selenocysteine
|
313 |
if 'S' in content:
|
@@ -318,7 +451,6 @@ class PeptideAnalyzer:
|
|
318 |
if 'CCS' in content:
|
319 |
return 'HCS', mods # homocysteine
|
320 |
|
321 |
-
# Additional modifications
|
322 |
if 'CN=[N]=N' in content:
|
323 |
return 'AZDA', mods # azido-alanine
|
324 |
if '[NH]=[C](=[NH2])=[NH2]' in content:
|
@@ -326,7 +458,21 @@ class PeptideAnalyzer:
|
|
326 |
return 'AGM', mods # 5-methyl-arginine
|
327 |
if 'CC[NH]=' in content:
|
328 |
return 'GDPR', mods # 2-Amino-3-guanidinopropionic acid
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
329 |
|
|
|
|
|
|
|
|
|
|
|
330 |
if 'CCON' in content:
|
331 |
return 'CAN', mods # canaline
|
332 |
if '[C@@H]1C=C[C@@H](C=C1)' in content:
|
@@ -350,7 +496,6 @@ class PeptideAnalyzer:
|
|
350 |
if 'c1cccc(c1)[C](=[NH2])=[NH2]' in content:
|
351 |
return 'APM', mods # m-amidinophenyl-3-alanine
|
352 |
|
353 |
-
# Multiple hydroxy patterns
|
354 |
if 'O' in content:
|
355 |
if '[C@H]([C@H](C)O)O' in content:
|
356 |
return 'ILX', mods # 4,5-dihydroxy-isoleucine
|
@@ -365,7 +510,6 @@ class PeptideAnalyzer:
|
|
365 |
if '[C@H](c1ccc(c(Cl)c1)O)O' in content:
|
366 |
return 'OMY', mods # (betar)-3-chloro-beta-hydroxy-l-tyrosine
|
367 |
|
368 |
-
# Heterocyclic patterns
|
369 |
if 'n1' in content:
|
370 |
if 'n1cccn1' in content:
|
371 |
return 'PYZ1', mods # 3-(1-Pyrazolyl)-alanine
|
@@ -384,7 +528,6 @@ class PeptideAnalyzer:
|
|
384 |
if 'c1cnc2c(n1)cccc2' in content:
|
385 |
return 'QX32', mods # 3-(2-quinoxalyl)-alanine
|
386 |
|
387 |
-
# Multiple nitrogen patterns
|
388 |
if 'N' in content:
|
389 |
if '[NH3]CC[C@@H]' in content:
|
390 |
return 'DAB', mods # Diaminobutyric acid
|
@@ -397,7 +540,6 @@ class PeptideAnalyzer:
|
|
397 |
if '[NH]=[C](=S)=[NH2]' in content:
|
398 |
return 'THIC', mods # Thio-citrulline
|
399 |
|
400 |
-
# Chain modified amino acids
|
401 |
if 'CC' in content:
|
402 |
if 'CCCC[C@@H]' in content:
|
403 |
return 'AHP', mods # 2-Aminoheptanoic acid
|
@@ -410,7 +552,6 @@ class PeptideAnalyzer:
|
|
410 |
if '[C@@H]([C@@H](C)O)C' in content:
|
411 |
return 'HLU', mods # beta-hydroxyleucine
|
412 |
|
413 |
-
# Modified glutamate/aspartate patterns
|
414 |
if '[C@@H]' in content:
|
415 |
if '[C@@H](C[C@@H](F))' in content:
|
416 |
return 'FGA4', mods # 4-Fluoro-glutamic acid
|
@@ -421,7 +562,6 @@ class PeptideAnalyzer:
|
|
421 |
if '[C@@H](CC[C@H](C))' in content:
|
422 |
return 'MEG', mods # (3s)-3-methyl-l-glutamic acid
|
423 |
|
424 |
-
# Sulfur and selenium modifications
|
425 |
if 'S' in content:
|
426 |
if 'SCC[C@@H]' in content:
|
427 |
return 'HSER', mods # homoserine
|
@@ -434,7 +574,6 @@ class PeptideAnalyzer:
|
|
434 |
if 'S(=O)(=O)' in content:
|
435 |
return 'OMT', mods # Methionine sulfone
|
436 |
|
437 |
-
# Double bond containing
|
438 |
if 'C=' in content:
|
439 |
if 'C=C[C@@H]' in content:
|
440 |
return '2AG', mods # 2-Allyl-glycine
|
@@ -443,175 +582,16 @@ class PeptideAnalyzer:
|
|
443 |
if 'C=Cc1ccccc1' in content:
|
444 |
return 'STYA', mods # Styrylalanine
|
445 |
|
446 |
-
# Special cases
|
447 |
if '[C@@H]1Cc2c(C1)cccc2' in content:
|
448 |
return 'IGL', mods # alpha-amino-2-indanacetic acid
|
449 |
if '[C](=[C](=O)=O)=O' in content:
|
450 |
return '26P', mods # 2-amino-6-oxopimelic acid
|
451 |
if '[C](=[C](=O)=O)=C' in content:
|
452 |
return '2NP', mods # l-2-amino-6-methylene-pimelic acid
|
453 |
-
if 'c2cnc[nH]2' in content:
|
454 |
-
return 'HIS', mods # histidine core
|
455 |
if 'c1cccc2c1cc(O)cc2' in content:
|
456 |
return 'NAO1', mods # 5-hydroxy-1-naphthalene
|
457 |
if 'c1ccc2c(c1)cc(O)cc2' in content:
|
458 |
-
return 'NAO2', mods # 6-hydroxy-2-naphthalene
|
459 |
-
|
460 |
-
# Proline (P) - flexible ring numbers
|
461 |
-
if any([
|
462 |
-
# Check for any ring number in bond patterns
|
463 |
-
(segment.get('bond_after', '').startswith(f'N{n}C(=O)') and 'CCC' in content and
|
464 |
-
any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
|
465 |
-
for n in '123456789'
|
466 |
-
]) or any([
|
467 |
-
# Check ending patterns with any ring number
|
468 |
-
(f'CCCN{n}' in content and content.endswith('=O') and
|
469 |
-
any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
|
470 |
-
for n in '123456789'
|
471 |
-
]) or any([
|
472 |
-
# Handle CCC[C@H]n patterns
|
473 |
-
(content == f'CCC[C@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
|
474 |
-
(content == f'CCC[C@@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
|
475 |
-
# N-terminal Pro with any ring number
|
476 |
-
(f'N{n}CCC[C@H]{n}' in content) or
|
477 |
-
(f'N{n}CCC[C@@H]{n}' in content)
|
478 |
-
for n in '123456789'
|
479 |
-
]):
|
480 |
-
return 'Pro', mods
|
481 |
-
|
482 |
-
# Tryptophan (W) - more specific indole pattern
|
483 |
-
if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
|
484 |
-
'c[nH]c' in content.replace(' ', ''):
|
485 |
-
return 'Trp', mods
|
486 |
-
|
487 |
-
# Lysine (K) - both patterns
|
488 |
-
if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
|
489 |
-
return 'Lys', mods
|
490 |
-
|
491 |
-
# Arginine (R) - both patterns
|
492 |
-
if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
|
493 |
-
return 'Arg', mods
|
494 |
-
|
495 |
-
if ('C[C@H](CCCC)' in content or 'C[C@@H](CCCC)' in content) and 'CC(C)' not in content:
|
496 |
-
return 'Nle', mods
|
497 |
-
|
498 |
-
# Ornithine (Orn) - 3-carbon chain with NH2
|
499 |
-
if ('C[C@H](CCCN)' in content or 'C[C@@H](CCCN)' in content) and 'CC(C)' not in content:
|
500 |
-
return 'Orn', mods
|
501 |
-
|
502 |
-
# 2-Naphthylalanine (2Nal) - distinct from Phe pattern
|
503 |
-
if ('Cc3cc2ccccc2c3' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
|
504 |
-
return '2Nal', mods
|
505 |
-
|
506 |
-
# Cyclohexylalanine (Cha) - already in your code but moved here for clarity
|
507 |
-
if 'N2CCCCC2' in content or 'CCCCC2' in content:
|
508 |
-
return 'Cha', mods
|
509 |
-
|
510 |
-
# Aminobutyric acid (Abu) - 2-carbon chain
|
511 |
-
if ('C[C@H](CC)' in content or 'C[C@@H](CC)' in content) and not any(p in content for p in ['CC(C)', 'CCCC', 'CCC(C)']):
|
512 |
-
return 'Abu', mods
|
513 |
-
|
514 |
-
# Pipecolic acid (Pip) - 6-membered ring like Pro
|
515 |
-
if ('N3CCCCC3' in content or 'CCCCC3' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
|
516 |
-
return 'Pip', mods
|
517 |
-
|
518 |
-
# Cyclohexylglycine (Chg) - direct cyclohexyl without CH2
|
519 |
-
if ('C[C@H](C1CCCCC1)' in content or 'C[C@@H](C1CCCCC1)' in content):
|
520 |
-
return 'Chg', mods
|
521 |
-
|
522 |
-
# 4-Fluorophenylalanine (4F-Phe)
|
523 |
-
if ('Cc2ccc(F)cc2' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
|
524 |
-
return '4F-Phe', mods
|
525 |
-
|
526 |
-
# Regular residue identification
|
527 |
-
if ('NCC(=O)' in content) or (content == 'C'):
|
528 |
-
# Middle case - between bonds
|
529 |
-
if segment.get('bond_before') and segment.get('bond_after'):
|
530 |
-
if ('C(=O)N' in segment['bond_before'] or 'C(=O)N(C)' in segment['bond_before']):
|
531 |
-
return 'Gly', mods
|
532 |
-
# Terminal case - at the end
|
533 |
-
elif segment.get('bond_before') and segment.get('bond_before').startswith('C(=O)N'):
|
534 |
-
return 'Gly', mods
|
535 |
-
|
536 |
-
if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content:
|
537 |
-
return 'Leu', mods
|
538 |
-
if '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content:
|
539 |
-
return 'Leu', mods
|
540 |
-
|
541 |
-
if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content:
|
542 |
-
return 'Thr', mods
|
543 |
-
|
544 |
-
if '[C@H](Cc2ccccc2)' in content or '[C@@H](Cc2ccccc2)' in content:
|
545 |
-
return 'Phe', mods
|
546 |
-
|
547 |
-
if ('[C@H](C(C)C)' in content or # With outer parentheses
|
548 |
-
'[C@@H](C(C)C)' in content or # With outer parentheses
|
549 |
-
'[C@H]C(C)C' in content or # Without outer parentheses
|
550 |
-
'[C@@H]C(C)C' in content): # Without outer parentheses
|
551 |
-
if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]']): # Still check not Leu
|
552 |
-
return 'Val', mods
|
553 |
-
|
554 |
-
if '[C@H](COC(C)(C)C)' in content or '[C@@H](COC(C)(C)C)' in content:
|
555 |
-
return 'O-tBu', mods
|
556 |
-
|
557 |
-
if any([
|
558 |
-
'CC[C@H](C)' in content,
|
559 |
-
'CC[C@@H](C)' in content,
|
560 |
-
'C(C)C[C@H]' in content and 'CC(C)C' not in content,
|
561 |
-
'C(C)C[C@@H]' in content and 'CC(C)C' not in content
|
562 |
-
]):
|
563 |
-
return 'Ile', mods
|
564 |
-
|
565 |
-
if ('[C@H](C)' in content or '[C@@H](C)' in content):
|
566 |
-
if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
|
567 |
-
return 'Ala', mods
|
568 |
-
|
569 |
-
# Tyrosine (Tyr) - 4-hydroxybenzyl side chain
|
570 |
-
if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
|
571 |
-
return 'Tyr', mods
|
572 |
-
|
573 |
-
|
574 |
-
# Serine (Ser) - Hydroxymethyl side chain
|
575 |
-
if '[C@H](CO)' in content or '[C@@H](CO)' in content:
|
576 |
-
if not ('C(C)O' in content or 'COC' in content):
|
577 |
-
return 'Ser', mods
|
578 |
-
|
579 |
-
# Threonine (Thr) - 1-hydroxyethyl side chain
|
580 |
-
if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content or '[C@@H](C)O' in content or '[C@H](C)O' in content:
|
581 |
-
return 'Thr', mods
|
582 |
-
|
583 |
-
# Cysteine (Cys) - Thiol side chain
|
584 |
-
if '[C@H](CS)' in content or '[C@@H](CS)' in content:
|
585 |
-
return 'Cys', mods
|
586 |
-
|
587 |
-
# Methionine (Met) - Methylthioethyl side chain
|
588 |
-
if ('C[C@H](CCSC)' in content or 'C[C@@H](CCSC)' in content):
|
589 |
-
return 'Met', mods
|
590 |
-
|
591 |
-
# Asparagine (Asn) - Carbamoylmethyl side chain
|
592 |
-
if ('CC(=O)N' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
|
593 |
-
return 'Asn', mods
|
594 |
-
|
595 |
-
# Glutamine (Gln) - Carbamoylethyl side chain
|
596 |
-
if ('CCC(=O)N' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
|
597 |
-
return 'Gln', mods
|
598 |
-
|
599 |
-
# Aspartic acid (Asp) - Carboxymethyl side chain
|
600 |
-
if ('CC(=O)O' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
|
601 |
-
return 'Asp', mods
|
602 |
-
|
603 |
-
# Glutamic acid (Glu) - Carboxyethyl side chain
|
604 |
-
if ('CCC(=O)O' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
|
605 |
-
return 'Glu', mods
|
606 |
-
|
607 |
-
# Arginine (Arg) - 3-guanidinopropyl side chain
|
608 |
-
if ('CCCNC(=N)N' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
|
609 |
-
return 'Arg', mods
|
610 |
-
|
611 |
-
# Histidine (His) - Imidazole side chain
|
612 |
-
if ('Cc2cnc[nH]2' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
|
613 |
-
return 'His', mods
|
614 |
-
|
615 |
return None, mods
|
616 |
|
617 |
def get_modifications(self, segment):
|
@@ -670,109 +650,11 @@ class PeptideAnalyzer:
|
|
670 |
'one_letter': one_letter,
|
671 |
'is_cyclic': is_cyclic
|
672 |
}
|
673 |
-
|
674 |
-
"""
|
675 |
-
def annotate_cyclic_structure(mol, sequence):
|
676 |
-
'''Create annotated 2D structure with clear, non-overlapping residue labels'''
|
677 |
-
# Generate 2D coordinates
|
678 |
-
# Generate 2D coordinates
|
679 |
-
AllChem.Compute2DCoords(mol)
|
680 |
-
|
681 |
-
# Create drawer with larger size for annotations
|
682 |
-
drawer = Draw.rdMolDraw2D.MolDraw2DCairo(2000, 2000) # Even larger size
|
683 |
-
|
684 |
-
# Get residue list and reverse it to match structural representation
|
685 |
-
if sequence.startswith('cyclo('):
|
686 |
-
residues = sequence[6:-1].split('-')
|
687 |
-
else:
|
688 |
-
residues = sequence.split('-')
|
689 |
-
residues = list(reversed(residues)) # Reverse the sequence
|
690 |
-
|
691 |
-
# Draw molecule first to get its bounds
|
692 |
-
drawer.drawOptions().addAtomIndices = False
|
693 |
-
drawer.DrawMolecule(mol)
|
694 |
-
drawer.FinishDrawing()
|
695 |
-
|
696 |
-
# Convert to PIL Image
|
697 |
-
img = Image.open(BytesIO(drawer.GetDrawingText()))
|
698 |
-
draw = ImageDraw.Draw(img)
|
699 |
-
|
700 |
-
try:
|
701 |
-
# Try to use DejaVuSans as it's commonly available on Linux systems
|
702 |
-
font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 60)
|
703 |
-
small_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 60)
|
704 |
-
except OSError:
|
705 |
-
try:
|
706 |
-
# Fallback to Arial if available (common on Windows)
|
707 |
-
font = ImageFont.truetype("arial.ttf", 60)
|
708 |
-
small_font = ImageFont.truetype("arial.ttf", 60)
|
709 |
-
except OSError:
|
710 |
-
# If no TrueType fonts are available, fall back to default
|
711 |
-
print("Warning: TrueType fonts not available, using default font")
|
712 |
-
font = ImageFont.load_default()
|
713 |
-
small_font = ImageFont.load_default()
|
714 |
-
# Get molecule bounds
|
715 |
-
conf = mol.GetConformer()
|
716 |
-
positions = []
|
717 |
-
for i in range(mol.GetNumAtoms()):
|
718 |
-
pos = conf.GetAtomPosition(i)
|
719 |
-
positions.append((pos.x, pos.y))
|
720 |
-
|
721 |
-
x_coords = [p[0] for p in positions]
|
722 |
-
y_coords = [p[1] for p in positions]
|
723 |
-
min_x, max_x = min(x_coords), max(x_coords)
|
724 |
-
min_y, max_y = min(y_coords), max(y_coords)
|
725 |
-
|
726 |
-
# Calculate scaling factors
|
727 |
-
scale = 150 # Increased scale factor
|
728 |
-
center_x = 1000 # Image center
|
729 |
-
center_y = 1000
|
730 |
-
|
731 |
-
# Add residue labels in a circular arrangement around the structure
|
732 |
-
n_residues = len(residues)
|
733 |
-
radius = 700 # Distance of labels from center
|
734 |
-
|
735 |
-
# Start from the rightmost point (3 o'clock position) and go counterclockwise
|
736 |
-
# Offset by -3 positions to align with structure
|
737 |
-
offset = 0 # Adjust this value to match the structure alignment
|
738 |
-
for i, residue in enumerate(residues):
|
739 |
-
# Calculate position in a circle around the structure
|
740 |
-
# Start from 0 (3 o'clock) and go counterclockwise
|
741 |
-
angle = -(2 * np.pi * ((i + offset) % n_residues) / n_residues)
|
742 |
|
743 |
-
# Calculate label position
|
744 |
-
label_x = center_x + radius * np.cos(angle)
|
745 |
-
label_y = center_y + radius * np.sin(angle)
|
746 |
-
|
747 |
-
# Draw residue label
|
748 |
-
text = f"{i+1}. {residue}"
|
749 |
-
bbox = draw.textbbox((label_x, label_y), text, font=font)
|
750 |
-
padding = 10
|
751 |
-
draw.rectangle([bbox[0]-padding, bbox[1]-padding,
|
752 |
-
bbox[2]+padding, bbox[3]+padding],
|
753 |
-
fill='white', outline='white')
|
754 |
-
draw.text((label_x, label_y), text,
|
755 |
-
font=font, fill='black', anchor="mm")
|
756 |
-
|
757 |
-
# Add sequence at the top with white background
|
758 |
-
seq_text = f"Sequence: {sequence}"
|
759 |
-
bbox = draw.textbbox((center_x, 100), seq_text, font=small_font)
|
760 |
-
padding = 10
|
761 |
-
draw.rectangle([bbox[0]-padding, bbox[1]-padding,
|
762 |
-
bbox[2]+padding, bbox[3]+padding],
|
763 |
-
fill='white', outline='white')
|
764 |
-
draw.text((center_x, 100), seq_text,
|
765 |
-
font=small_font, fill='black', anchor="mm")
|
766 |
-
|
767 |
-
return img
|
768 |
-
|
769 |
-
"""
|
770 |
def annotate_cyclic_structure(mol, sequence):
|
771 |
-
"""Create structure visualization
|
772 |
-
# Generate 2D coordinates
|
773 |
AllChem.Compute2DCoords(mol)
|
774 |
|
775 |
-
# Create drawer with larger size for annotations
|
776 |
drawer = Draw.rdMolDraw2D.MolDraw2DCairo(2000, 2000)
|
777 |
|
778 |
# Draw molecule first
|
@@ -792,7 +674,7 @@ def annotate_cyclic_structure(mol, sequence):
|
|
792 |
print("Warning: TrueType fonts not available, using default font")
|
793 |
small_font = ImageFont.load_default()
|
794 |
|
795 |
-
#
|
796 |
seq_text = f"Sequence: {sequence}"
|
797 |
bbox = draw.textbbox((1000, 100), seq_text, font=small_font)
|
798 |
padding = 10
|
@@ -805,61 +687,50 @@ def annotate_cyclic_structure(mol, sequence):
|
|
805 |
return img
|
806 |
|
807 |
def create_enhanced_linear_viz(sequence, smiles):
|
808 |
-
"""
|
809 |
-
analyzer = PeptideAnalyzer()
|
810 |
|
811 |
-
# Create figure with two subplots
|
812 |
fig = plt.figure(figsize=(15, 10))
|
813 |
gs = fig.add_gridspec(2, 1, height_ratios=[1, 2])
|
814 |
ax_struct = fig.add_subplot(gs[0])
|
815 |
ax_detail = fig.add_subplot(gs[1])
|
816 |
|
817 |
-
# Parse sequence and get residues
|
818 |
if sequence.startswith('cyclo('):
|
819 |
residues = sequence[6:-1].split('-')
|
820 |
else:
|
821 |
residues = sequence.split('-')
|
822 |
|
823 |
-
# Get segments using analyzer
|
824 |
segments = analyzer.split_on_bonds(smiles)
|
825 |
|
826 |
-
# Debug print
|
827 |
print(f"Number of residues: {len(residues)}")
|
828 |
print(f"Number of segments: {len(segments)}")
|
829 |
|
830 |
-
# Top subplot - Basic structure
|
831 |
ax_struct.set_xlim(0, 10)
|
832 |
ax_struct.set_ylim(0, 2)
|
833 |
|
834 |
num_residues = len(residues)
|
835 |
spacing = 9.0 / (num_residues - 1) if num_residues > 1 else 9.0
|
836 |
|
837 |
-
# Draw basic structure
|
838 |
y_pos = 1.5
|
839 |
for i in range(num_residues):
|
840 |
x_pos = 0.5 + i * spacing
|
841 |
|
842 |
-
# Draw amino acid box
|
843 |
rect = patches.Rectangle((x_pos-0.3, y_pos-0.2), 0.6, 0.4,
|
844 |
facecolor='lightblue', edgecolor='black')
|
845 |
ax_struct.add_patch(rect)
|
846 |
|
847 |
-
# Draw connecting bonds if not the last residue
|
848 |
if i < num_residues - 1:
|
849 |
segment = segments[i] if i < len(segments) else None
|
850 |
if segment:
|
851 |
-
# Determine bond type from segment info
|
852 |
bond_type = 'ester' if 'O-linked' in segment.get('bond_after', '') else 'peptide'
|
853 |
is_n_methylated = 'N-Me' in segment.get('bond_after', '')
|
854 |
|
855 |
bond_color = 'red' if bond_type == 'ester' else 'black'
|
856 |
linestyle = '--' if bond_type == 'ester' else '-'
|
857 |
|
858 |
-
# Draw bond line
|
859 |
ax_struct.plot([x_pos+0.3, x_pos+spacing-0.3], [y_pos, y_pos],
|
860 |
color=bond_color, linestyle=linestyle, linewidth=2)
|
861 |
|
862 |
-
# Add bond type label
|
863 |
mid_x = x_pos + spacing/2
|
864 |
bond_label = f"{bond_type}"
|
865 |
if is_n_methylated:
|
@@ -868,16 +739,13 @@ def create_enhanced_linear_viz(sequence, smiles):
|
|
868 |
ha='center', va='bottom', fontsize=10,
|
869 |
color=bond_color)
|
870 |
|
871 |
-
# Add residue label
|
872 |
ax_struct.text(x_pos, y_pos-0.5, residues[i],
|
873 |
ha='center', va='top', fontsize=14)
|
874 |
|
875 |
-
# Bottom subplot - Detailed breakdown
|
876 |
ax_detail.set_ylim(0, len(segments)+1)
|
877 |
ax_detail.set_xlim(0, 1)
|
878 |
|
879 |
-
|
880 |
-
segment_y = len(segments) # Start from top
|
881 |
for i, segment in enumerate(segments):
|
882 |
y = segment_y - i
|
883 |
|
@@ -899,7 +767,6 @@ def create_enhanced_linear_viz(sequence, smiles):
|
|
899 |
text += "peptide"
|
900 |
color = 'red'
|
901 |
|
902 |
-
# Add segment analysis
|
903 |
ax_detail.text(0.05, y, text, fontsize=12, color=color)
|
904 |
ax_detail.text(0.5, y, f"SMILES: {segment.get('content', '')}", fontsize=10, color='gray')
|
905 |
|
@@ -910,11 +777,9 @@ def create_enhanced_linear_viz(sequence, smiles):
|
|
910 |
ax_struct.text(5, y_pos+0.3, 'Cyclic Connection',
|
911 |
ha='center', color='red', fontsize=14)
|
912 |
|
913 |
-
# Add titles and adjust layout
|
914 |
ax_struct.set_title("Peptide Structure Overview", pad=20)
|
915 |
ax_detail.set_title("Segment Analysis Breakdown", pad=20)
|
916 |
|
917 |
-
# Remove axes
|
918 |
for ax in [ax_struct, ax_detail]:
|
919 |
ax.set_xticks([])
|
920 |
ax.set_yticks([])
|
@@ -924,7 +789,7 @@ def create_enhanced_linear_viz(sequence, smiles):
|
|
924 |
return fig
|
925 |
|
926 |
class PeptideStructureGenerator:
|
927 |
-
"""
|
928 |
|
929 |
@staticmethod
|
930 |
def prepare_molecule(smiles):
|
@@ -933,7 +798,6 @@ class PeptideStructureGenerator:
|
|
933 |
if mol is None:
|
934 |
raise ValueError("Failed to create molecule from SMILES")
|
935 |
|
936 |
-
# Calculate valence for each atom
|
937 |
for atom in mol.GetAtoms():
|
938 |
atom.UpdatePropertyCache(strict=False)
|
939 |
|
@@ -951,7 +815,7 @@ class PeptideStructureGenerator:
|
|
951 |
|
952 |
@staticmethod
|
953 |
def get_etkdg_params(attempt=0):
|
954 |
-
"""Get ETKDG parameters
|
955 |
params = AllChem.ETKDGv3()
|
956 |
params.randomSeed = -1
|
957 |
params.maxIterations = 200
|
@@ -1025,13 +889,11 @@ class PeptideStructureGenerator:
|
|
1025 |
@staticmethod
def mol_to_sdf_bytes(mol):
    """Serialize an RDKit molecule as SDF text and return it as UTF-8 bytes.

    Args:
        mol: The molecule object handed to ``Chem.SDWriter.write``.

    Returns:
        bytes: The SDF text produced by the writer, encoded as UTF-8.
    """
    # The SD writer produces text, so gather its output in an in-memory
    # text buffer before converting to bytes.
    text_buffer = StringIO()
    sdf_writer = Chem.SDWriter(text_buffer)
    sdf_writer.write(mol)
    # Close the writer before reading the buffer back.
    sdf_writer.close()

    sdf_text = text_buffer.getvalue()
    return sdf_text.encode('utf-8')
|
1036 |
|
1037 |
def process_input(smiles_input=None, file_obj=None, show_linear=False,
|
@@ -1045,17 +907,14 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
|
|
1045 |
if smiles_input:
|
1046 |
smiles = smiles_input.strip()
|
1047 |
|
1048 |
-
# First check if it's a peptide using analyzer's method
|
1049 |
if not analyzer.is_peptide(smiles):
|
1050 |
return "Error: Input SMILES does not appear to be a peptide structure.", None, None
|
1051 |
|
1052 |
try:
|
1053 |
-
# Create molecule
|
1054 |
mol = Chem.MolFromSmiles(smiles)
|
1055 |
if mol is None:
|
1056 |
return "Error: Invalid SMILES notation.", None, None
|
1057 |
|
1058 |
-
# Generate 3D structures if requested
|
1059 |
if generate_3d:
|
1060 |
generator = PeptideStructureGenerator()
|
1061 |
|
@@ -1080,10 +939,8 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
|
|
1080 |
except Exception as e:
|
1081 |
return f"Error generating 3D structures: {str(e)}", None, None, None
|
1082 |
|
1083 |
-
# Use analyzer to get sequence
|
1084 |
segments = analyzer.split_on_bonds(smiles)
|
1085 |
|
1086 |
-
# Process segments and build sequence
|
1087 |
sequence_parts = []
|
1088 |
output_text = ""
|
1089 |
|
@@ -1108,7 +965,6 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
|
|
1108 |
output_text += f"Warning: Could not identify residue in segment: {segment['content']}\n"
|
1109 |
output_text += "\n"
|
1110 |
else:
|
1111 |
-
# Just build sequence without detailed analysis in output
|
1112 |
for segment in segments:
|
1113 |
residue, mods = analyzer.identify_residue(segment)
|
1114 |
if residue:
|
@@ -1117,7 +973,6 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
|
|
1117 |
else:
|
1118 |
sequence_parts.append(residue)
|
1119 |
|
1120 |
-
# Check if cyclic using analyzer's method
|
1121 |
is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
|
1122 |
three_letter = '-'.join(sequence_parts)
|
1123 |
one_letter = ''.join(analyzer.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence_parts)
|
@@ -1126,7 +981,6 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
|
|
1126 |
three_letter = f"cyclo({three_letter})"
|
1127 |
one_letter = f"cyclo({one_letter})"
|
1128 |
|
1129 |
-
# Create cyclic structure visualization
|
1130 |
img_cyclic = annotate_cyclic_structure(mol, three_letter)
|
1131 |
|
1132 |
# Create linear representation if requested
|
@@ -1139,7 +993,6 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
|
|
1139 |
img_linear = Image.open(buf)
|
1140 |
plt.close(fig_linear)
|
1141 |
|
1142 |
-
# Add summary to output
|
1143 |
summary = "Summary:\n"
|
1144 |
summary += f"Sequence: {three_letter}\n"
|
1145 |
summary += f"One-letter code: {one_letter}\n"
|
@@ -1161,7 +1014,6 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
|
|
1161 |
# Handle file input
|
1162 |
if file_obj is not None:
|
1163 |
try:
|
1164 |
-
# Handle file content
|
1165 |
if hasattr(file_obj, 'name'):
|
1166 |
with open(file_obj.name, 'r') as f:
|
1167 |
content = f.read()
|
@@ -1172,16 +1024,13 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
|
|
1172 |
for line in content.splitlines():
|
1173 |
smiles = line.strip()
|
1174 |
if smiles:
|
1175 |
-
# Check if it's a peptide
|
1176 |
if not analyzer.is_peptide(smiles):
|
1177 |
output_text += f"Skipping non-peptide SMILES: {smiles}\n"
|
1178 |
continue
|
1179 |
|
1180 |
-
# Process this SMILES
|
1181 |
segments = analyzer.split_on_bonds(smiles)
|
1182 |
sequence_parts = []
|
1183 |
|
1184 |
-
# Add segment details if requested
|
1185 |
if show_segment_details:
|
1186 |
output_text += f"\nSegment Analysis for SMILES: {smiles}\n"
|
1187 |
for i, segment in enumerate(segments):
|
@@ -1206,7 +1055,6 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
|
|
1206 |
else:
|
1207 |
sequence_parts.append(residue)
|
1208 |
|
1209 |
-
# Get cyclicity and create sequence
|
1210 |
is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
|
1211 |
sequence = f"cyclo({'-'.join(sequence_parts)})" if is_cyclic else '-'.join(sequence_parts)
|
1212 |
|
@@ -1215,7 +1063,6 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
|
|
1215 |
output_text += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
|
1216 |
if is_cyclic:
|
1217 |
output_text += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
|
1218 |
-
#output_text += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
|
1219 |
output_text += "-" * 50 + "\n"
|
1220 |
|
1221 |
return output_text, None, None
|
@@ -1298,6 +1145,5 @@ iface = gr.Interface(
|
|
1298 |
flagging_mode="never"
|
1299 |
)
|
1300 |
|
1301 |
-
# Launch the app
|
1302 |
if __name__ == "__main__":
|
1303 |
iface.launch(share=True)
|
|
|
72 |
|
73 |
is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
|
74 |
return is_cyclic, peptide_cycles, aromatic_cycles
|
75 |
+
|
76 |
def split_on_bonds(self, smiles):
|
|
|
77 |
positions = []
|
78 |
used = set()
|
|
|
79 |
# Find Gly pattern first
|
80 |
gly_pattern = r'NCC\(=O\)'
|
81 |
for match in re.finditer(gly_pattern, smiles):
|
|
|
104 |
|
105 |
# Create segments
|
106 |
segments = []
|
|
|
107 |
if positions:
|
108 |
# First segment
|
109 |
if positions[0]['start'] > 0:
|
|
|
111 |
'content': smiles[0:positions[0]['start']],
|
112 |
'bond_after': positions[0]['pattern']
|
113 |
})
|
|
|
114 |
# Process segments
|
115 |
for i in range(len(positions)-1):
|
116 |
current = positions[i]
|
117 |
next_pos = positions[i+1]
|
|
|
118 |
if current['type'] == 'gly':
|
119 |
segments.append({
|
120 |
'content': 'NCC(=O)',
|
121 |
'bond_before': positions[i-1]['pattern'] if i > 0 else None,
|
122 |
'bond_after': next_pos['pattern']
|
123 |
})
|
124 |
+
segments.append({
|
125 |
+
'content': smiles[current['start']+7:next_pos['start']],
|
126 |
+
'bond_before': 'gly_bond',
|
127 |
+
'bond_after': next_pos['pattern']
|
128 |
+
})
|
129 |
else:
|
130 |
content = smiles[current['end']:next_pos['start']]
|
131 |
if content:
|
|
|
141 |
'content': smiles[positions[-1]['end']:],
|
142 |
'bond_before': positions[-1]['pattern']
|
143 |
})
|
|
|
144 |
return segments
|
145 |
|
146 |
def clean_terminal_carboxyl(self, segment):
|
|
|
167 |
content = self.clean_terminal_carboxyl(segment)
|
168 |
mods = self.get_modifications(segment)
|
169 |
|
170 |
+
# Proline (P) - flexible ring numbers
|
171 |
+
if any([
|
172 |
+
# Check for any ring number in bond patterns
|
173 |
+
(segment.get('bond_after', '').startswith(f'N{n}C(=O)') and 'CCC' in content and
|
174 |
+
any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
|
175 |
+
for n in '123456789'
|
176 |
+
]) or any([(segment.get('bond_before', '').startswith(f'C(=O)N{n}') and 'CCC' in content and
|
177 |
+
any(f'CCC{n}' for n in '123456789'))
|
178 |
+
for n in '123456789'
|
179 |
+
]) or any([
|
180 |
+
# Check ending patterns with any ring number
|
181 |
+
(f'CCCN{n}' in content and content.endswith('=O') and
|
182 |
+
any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
|
183 |
+
for n in '123456789'
|
184 |
+
]) or any([
|
185 |
+
# Handle CCC[C@H]n patterns
|
186 |
+
(content == f'CCC[C@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
|
187 |
+
(content == f'CCC[C@@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
|
188 |
+
# N-terminal Pro with any ring number
|
189 |
+
(f'N{n}CCC[C@H]{n}' in content) or
|
190 |
+
(f'N{n}CCC[C@@H]{n}' in content)
|
191 |
+
for n in '123456789'
|
192 |
+
]):
|
193 |
+
return 'Pro', mods
|
194 |
+
|
195 |
+
# Tryptophan (W) - more specific indole pattern
|
196 |
+
if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
|
197 |
+
'c[nH]c' in content.replace(' ', ''):
|
198 |
+
return 'Trp', mods
|
199 |
+
|
200 |
+
# Lysine (K)
|
201 |
+
if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
|
202 |
+
return 'Lys', mods
|
203 |
+
|
204 |
+
# Arginine (R)
|
205 |
+
if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
|
206 |
+
return 'Arg', mods
|
207 |
+
|
208 |
+
if ('NCC(=O)' in content) or (content == 'C'):
|
209 |
+
if segment.get('bond_before') and segment.get('bond_after'):
|
210 |
+
if ('C(=O)N' in segment['bond_before'] or 'C(=O)N(C)' in segment['bond_before']):
|
211 |
+
return 'Gly', mods
|
212 |
+
elif segment.get('bond_before') and segment.get('bond_before').startswith('C(=O)N'):
|
213 |
+
return 'Gly', mods
|
214 |
+
|
215 |
+
if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content or '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content or (('N[C@H](CCC(C)C)' in content or 'N[C@@H](CCC(C)C)' in content) and segment.get('bond_before') is None):
|
216 |
+
return 'Leu', mods
|
217 |
+
|
218 |
+
if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content:
|
219 |
+
return 'Thr', mods
|
220 |
+
if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(Cc\d+ccccc\d+\)', content):
|
221 |
+
return 'Phe', mods
|
222 |
+
|
223 |
+
if ('[C@H](C(C)C)' in content or
|
224 |
+
'[C@@H](C(C)C)' in content or
|
225 |
+
'[C@H]C(C)C' in content or
|
226 |
+
'[C@@H]C(C)C' in content
|
227 |
+
):
|
228 |
+
if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]']): # Still check not Leu
|
229 |
+
return 'Val', mods
|
230 |
+
|
231 |
+
if any([
|
232 |
+
'CC[C@H](C)' in content,
|
233 |
+
'CC[C@@H](C)' in content,
|
234 |
+
'[C@@H](CC)C' in content,
|
235 |
+
'[C@H](CC)C' in content,
|
236 |
+
'C(C)C[C@H]' in content and 'CC(C)C' not in content,
|
237 |
+
'C(C)C[C@@H]' in content and 'CC(C)C' not in content
|
238 |
+
]):
|
239 |
+
return 'Ile', mods
|
240 |
+
|
241 |
+
if ('[C@H](C)' in content or '[C@@H](C)' in content):
|
242 |
+
if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
|
243 |
+
return 'Ala', mods
|
244 |
+
|
245 |
+
# Tyrosine (Tyr) - 4-hydroxybenzyl side chain
|
246 |
+
if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
|
247 |
+
return 'Tyr', mods
|
248 |
+
|
249 |
+
# Serine (Ser) - Hydroxymethyl side chain
|
250 |
+
if '[C@H](CO)' in content or '[C@@H](CO)' in content:
|
251 |
+
if not ('C(C)O' in content or 'COC' in content):
|
252 |
+
return 'Ser', mods
|
253 |
+
|
254 |
+
# Threonine (Thr) - 1-hydroxyethyl side chain
|
255 |
+
if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content or '[C@@H](C)O' in content or '[C@H](C)O' in content:
|
256 |
+
return 'Thr', mods
|
257 |
+
|
258 |
+
# Cysteine (Cys) - Thiol side chain
|
259 |
+
if '[C@H](CS)' in content or '[C@@H](CS)' in content:
|
260 |
+
return 'Cys', mods
|
261 |
+
|
262 |
+
# Methionine (Met) - Methylthioethyl side chain
|
263 |
+
if ('CCSC' in content):
|
264 |
+
return 'Met', mods
|
265 |
+
|
266 |
+
# Glutamine (Gln) - Carbamoylethyl side chain
|
267 |
+
if (content == '[C@@H](CC' or content == '[C@H](CC' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CCC(=O)N' in content) or ('CCC(N)=O' in content):
|
268 |
+
return 'Gln', mods
|
269 |
+
# Asparagine (Asn) - Carbamoylmethyl side chain
|
270 |
+
if (content == '[C@@H](C' or content == '[C@H](C' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CC(=O)N' in content) or ('CCN(=O)' in content) or ('CC(N)=O' in content):
|
271 |
+
return 'Asn', mods
|
272 |
+
|
273 |
+
# Glutamic acid (Glu) - Carboxyethyl side chain
|
274 |
+
if ('CCC(=O)O' in content):
|
275 |
+
return 'Glu', mods
|
276 |
+
# Aspartic acid (Asp) - Carboxymethyl side chain
|
277 |
+
if ('CC(=O)O' in content):
|
278 |
+
return 'Asp', mods
|
279 |
+
|
280 |
+
# Arginine (Arg) - 3-guanidinopropyl side chain
|
281 |
+
if ('CCCNC(=N)N' in content):
|
282 |
+
return 'Arg', mods
|
283 |
+
|
284 |
+
# Histidine (His) - Imidazole side chain
|
285 |
+
if re.search(r'Cc\d+c\[nH\]cn\d+', content) or re.search(r'Cc\d+cnc\[nH\]\d+', content):
|
286 |
+
return 'His', mods
|
287 |
+
|
288 |
+
# ---- Unnatural amino acids (UAA) ----
|
289 |
+
|
290 |
+
if '[C@H](COC(C)(C)C)' in content or '[C@@H](COC(C)(C)C)' in content:
|
291 |
+
return 'O-tBu', mods
|
292 |
+
|
293 |
+
if re.search(r'c\d+ccccc\d+', content):
|
294 |
if '[C@@H](c1ccccc1)' in content or '[C@H](c1ccccc1)' in content:
|
295 |
return '4', mods # Base phenylglycine
|
296 |
+
if ('C[C@H](CCCC)' in content or 'C[C@@H](CCCC)' in content) and 'CC(C)' not in content:
|
297 |
+
return 'Nle', mods
|
298 |
+
|
299 |
+
# Ornithine (Orn) - 3-carbon chain with NH2
|
300 |
+
if ('C[C@H](CCCN)' in content or 'C[C@@H](CCCN)' in content) and 'CC(C)' not in content:
|
301 |
+
return 'Orn', mods
|
302 |
+
|
303 |
+
# 2-Naphthylalanine (2Nal)
|
304 |
+
if ('Cc3cc2ccccc2c3' in content):
|
305 |
+
return '2Nal', mods
|
306 |
+
|
307 |
+
# Cyclohexylalanine (Cha)
|
308 |
+
if 'N2CCCCC2' in content or 'CCCCC2' in content:
|
309 |
+
return 'Cha', mods
|
310 |
+
|
311 |
+
# Aminobutyric acid (Abu) - 2-carbon chain
|
312 |
+
if ('C[C@H](CC)' in content or 'C[C@@H](CC)' in content) and not any(p in content for p in ['CC(C)', 'CCCC', 'CCC(C)']):
|
313 |
+
return 'Abu', mods
|
314 |
+
|
315 |
+
# Pipecolic acid (Pip)
|
316 |
+
if ('N3CCCCC3' in content or 'CCCCC3' in content):
|
317 |
+
return 'Pip', mods
|
318 |
+
|
319 |
+
# Cyclohexylglycine (Chg) - direct cyclohexyl without CH2
|
320 |
+
if ('C[C@H](C1CCCCC1)' in content or 'C[C@@H](C1CCCCC1)' in content):
|
321 |
+
return 'Chg', mods
|
322 |
+
|
323 |
+
# 4-Fluorophenylalanine (4F-Phe)
|
324 |
+
if ('Cc2ccc(F)cc2' in content):
|
325 |
+
return '4F-Phe', mods
|
326 |
|
327 |
# 4-substituted phenylalanines
|
328 |
if 'Cc1ccc' in content:
|
|
|
432 |
if 'c1ccc(c(c1)O)O' in content:
|
433 |
return 'DAH', mods # 3,4-Dihydroxy-phenylalanine
|
434 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
435 |
# Modified histidines
|
436 |
if 'c1cnc' in content:
|
437 |
if '[C@@H]1CN[C@@H](N1)F' in content:
|
|
|
441 |
if 'c1c[nH]c(n1)F' in content:
|
442 |
return '2HF2', mods # 2-fluoro-l-histidine variant
|
443 |
|
|
|
444 |
if '[SeH]' in content:
|
445 |
return 'CSE', mods # Selenocysteine
|
446 |
if 'S' in content:
|
|
|
451 |
if 'CCS' in content:
|
452 |
return 'HCS', mods # homocysteine
|
453 |
|
|
|
454 |
if 'CN=[N]=N' in content:
|
455 |
return 'AZDA', mods # azido-alanine
|
456 |
if '[NH]=[C](=[NH2])=[NH2]' in content:
|
|
|
458 |
return 'AGM', mods # 5-methyl-arginine
|
459 |
if 'CC[NH]=' in content:
|
460 |
return 'GDPR', mods # 2-Amino-3-guanidinopropionic acid
|
461 |
+
|
462 |
+
# Others
|
463 |
+
if 'C1CCCC1' in content:
|
464 |
+
return 'CPA3', mods # 3-Cyclopentyl-alanine
|
465 |
+
if 'C1CCCCC1' in content:
|
466 |
+
if 'CC1CCCCC1' in content:
|
467 |
+
return 'ALC', mods # 3-cyclohexyl-alanine
|
468 |
+
else:
|
469 |
+
return 'CHG', mods # Cyclohexylglycine
|
470 |
|
471 |
+
if 'CCC[C@@H]' in content or 'CCC[C@H]' in content:
|
472 |
+
return 'NLE', mods # Norleucine
|
473 |
+
if 'CC[C@@H]' in content or 'CC[C@H]' in content:
|
474 |
+
if not any(x in content for x in ['CC(C)', 'COC', 'CN(']):
|
475 |
+
return 'ABA', mods # 2-Aminobutyric acid
|
476 |
if 'CCON' in content:
|
477 |
return 'CAN', mods # canaline
|
478 |
if '[C@@H]1C=C[C@@H](C=C1)' in content:
|
|
|
496 |
if 'c1cccc(c1)[C](=[NH2])=[NH2]' in content:
|
497 |
return 'APM', mods # m-amidinophenyl-3-alanine
|
498 |
|
|
|
499 |
if 'O' in content:
|
500 |
if '[C@H]([C@H](C)O)O' in content:
|
501 |
return 'ILX', mods # 4,5-dihydroxy-isoleucine
|
|
|
510 |
if '[C@H](c1ccc(c(Cl)c1)O)O' in content:
|
511 |
return 'OMY', mods # (betar)-3-chloro-beta-hydroxy-l-tyrosine
|
512 |
|
|
|
513 |
if 'n1' in content:
|
514 |
if 'n1cccn1' in content:
|
515 |
return 'PYZ1', mods # 3-(1-Pyrazolyl)-alanine
|
|
|
528 |
if 'c1cnc2c(n1)cccc2' in content:
|
529 |
return 'QX32', mods # 3-(2-quinoxalyl)-alanine
|
530 |
|
|
|
531 |
if 'N' in content:
|
532 |
if '[NH3]CC[C@@H]' in content:
|
533 |
return 'DAB', mods # Diaminobutyric acid
|
|
|
540 |
if '[NH]=[C](=S)=[NH2]' in content:
|
541 |
return 'THIC', mods # Thio-citrulline
|
542 |
|
|
|
543 |
if 'CC' in content:
|
544 |
if 'CCCC[C@@H]' in content:
|
545 |
return 'AHP', mods # 2-Aminoheptanoic acid
|
|
|
552 |
if '[C@@H]([C@@H](C)O)C' in content:
|
553 |
return 'HLU', mods # beta-hydroxyleucine
|
554 |
|
|
|
555 |
if '[C@@H]' in content:
|
556 |
if '[C@@H](C[C@@H](F))' in content:
|
557 |
return 'FGA4', mods # 4-Fluoro-glutamic acid
|
|
|
562 |
if '[C@@H](CC[C@H](C))' in content:
|
563 |
return 'MEG', mods # (3s)-3-methyl-l-glutamic acid
|
564 |
|
|
|
565 |
if 'S' in content:
|
566 |
if 'SCC[C@@H]' in content:
|
567 |
return 'HSER', mods # homoserine
|
|
|
574 |
if 'S(=O)(=O)' in content:
|
575 |
return 'OMT', mods # Methionine sulfone
|
576 |
|
|
|
577 |
if 'C=' in content:
|
578 |
if 'C=C[C@@H]' in content:
|
579 |
return '2AG', mods # 2-Allyl-glycine
|
|
|
582 |
if 'C=Cc1ccccc1' in content:
|
583 |
return 'STYA', mods # Styrylalanine
|
584 |
|
|
|
585 |
if '[C@@H]1Cc2c(C1)cccc2' in content:
|
586 |
return 'IGL', mods # alpha-amino-2-indanacetic acid
|
587 |
if '[C](=[C](=O)=O)=O' in content:
|
588 |
return '26P', mods # 2-amino-6-oxopimelic acid
|
589 |
if '[C](=[C](=O)=O)=C' in content:
|
590 |
return '2NP', mods # l-2-amino-6-methylene-pimelic acid
|
|
|
|
|
591 |
if 'c1cccc2c1cc(O)cc2' in content:
|
592 |
return 'NAO1', mods # 5-hydroxy-1-naphthalene
|
593 |
if 'c1ccc2c(c1)cc(O)cc2' in content:
|
594 |
+
return 'NAO2', mods # 6-hydroxy-2-naphthalene
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
595 |
return None, mods
|
596 |
|
597 |
def get_modifications(self, segment):
|
|
|
650 |
'one_letter': one_letter,
|
651 |
'is_cyclic': is_cyclic
|
652 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
653 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
654 |
def annotate_cyclic_structure(mol, sequence):
|
655 |
+
"""Create structure visualization"""
|
|
|
656 |
AllChem.Compute2DCoords(mol)
|
657 |
|
|
|
658 |
drawer = Draw.rdMolDraw2D.MolDraw2DCairo(2000, 2000)
|
659 |
|
660 |
# Draw molecule first
|
|
|
674 |
print("Warning: TrueType fonts not available, using default font")
|
675 |
small_font = ImageFont.load_default()
|
676 |
|
677 |
+
# Header
|
678 |
seq_text = f"Sequence: {sequence}"
|
679 |
bbox = draw.textbbox((1000, 100), seq_text, font=small_font)
|
680 |
padding = 10
|
|
|
687 |
return img
|
688 |
|
689 |
def create_enhanced_linear_viz(sequence, smiles):
|
690 |
+
"""Linear visualization"""
|
691 |
+
analyzer = PeptideAnalyzer()
|
692 |
|
|
|
693 |
fig = plt.figure(figsize=(15, 10))
|
694 |
gs = fig.add_gridspec(2, 1, height_ratios=[1, 2])
|
695 |
ax_struct = fig.add_subplot(gs[0])
|
696 |
ax_detail = fig.add_subplot(gs[1])
|
697 |
|
|
|
698 |
if sequence.startswith('cyclo('):
|
699 |
residues = sequence[6:-1].split('-')
|
700 |
else:
|
701 |
residues = sequence.split('-')
|
702 |
|
|
|
703 |
segments = analyzer.split_on_bonds(smiles)
|
704 |
|
|
|
705 |
print(f"Number of residues: {len(residues)}")
|
706 |
print(f"Number of segments: {len(segments)}")
|
707 |
|
|
|
708 |
ax_struct.set_xlim(0, 10)
|
709 |
ax_struct.set_ylim(0, 2)
|
710 |
|
711 |
num_residues = len(residues)
|
712 |
spacing = 9.0 / (num_residues - 1) if num_residues > 1 else 9.0
|
713 |
|
|
|
714 |
y_pos = 1.5
|
715 |
for i in range(num_residues):
|
716 |
x_pos = 0.5 + i * spacing
|
717 |
|
|
|
718 |
rect = patches.Rectangle((x_pos-0.3, y_pos-0.2), 0.6, 0.4,
|
719 |
facecolor='lightblue', edgecolor='black')
|
720 |
ax_struct.add_patch(rect)
|
721 |
|
|
|
722 |
if i < num_residues - 1:
|
723 |
segment = segments[i] if i < len(segments) else None
|
724 |
if segment:
|
|
|
725 |
bond_type = 'ester' if 'O-linked' in segment.get('bond_after', '') else 'peptide'
|
726 |
is_n_methylated = 'N-Me' in segment.get('bond_after', '')
|
727 |
|
728 |
bond_color = 'red' if bond_type == 'ester' else 'black'
|
729 |
linestyle = '--' if bond_type == 'ester' else '-'
|
730 |
|
|
|
731 |
ax_struct.plot([x_pos+0.3, x_pos+spacing-0.3], [y_pos, y_pos],
|
732 |
color=bond_color, linestyle=linestyle, linewidth=2)
|
733 |
|
|
|
734 |
mid_x = x_pos + spacing/2
|
735 |
bond_label = f"{bond_type}"
|
736 |
if is_n_methylated:
|
|
|
739 |
ha='center', va='bottom', fontsize=10,
|
740 |
color=bond_color)
|
741 |
|
|
|
742 |
ax_struct.text(x_pos, y_pos-0.5, residues[i],
|
743 |
ha='center', va='top', fontsize=14)
|
744 |
|
|
|
745 |
ax_detail.set_ylim(0, len(segments)+1)
|
746 |
ax_detail.set_xlim(0, 1)
|
747 |
|
748 |
+
segment_y = len(segments)
|
|
|
749 |
for i, segment in enumerate(segments):
|
750 |
y = segment_y - i
|
751 |
|
|
|
767 |
text += "peptide"
|
768 |
color = 'red'
|
769 |
|
|
|
770 |
ax_detail.text(0.05, y, text, fontsize=12, color=color)
|
771 |
ax_detail.text(0.5, y, f"SMILES: {segment.get('content', '')}", fontsize=10, color='gray')
|
772 |
|
|
|
777 |
ax_struct.text(5, y_pos+0.3, 'Cyclic Connection',
|
778 |
ha='center', color='red', fontsize=14)
|
779 |
|
|
|
780 |
ax_struct.set_title("Peptide Structure Overview", pad=20)
|
781 |
ax_detail.set_title("Segment Analysis Breakdown", pad=20)
|
782 |
|
|
|
783 |
for ax in [ax_struct, ax_detail]:
|
784 |
ax.set_xticks([])
|
785 |
ax.set_yticks([])
|
|
|
789 |
return fig
|
790 |
|
791 |
class PeptideStructureGenerator:
|
792 |
+
"""Generate 3D structures of peptides using different embedding methods"""
|
793 |
|
794 |
@staticmethod
|
795 |
def prepare_molecule(smiles):
|
|
|
798 |
if mol is None:
|
799 |
raise ValueError("Failed to create molecule from SMILES")
|
800 |
|
|
|
801 |
for atom in mol.GetAtoms():
|
802 |
atom.UpdatePropertyCache(strict=False)
|
803 |
|
|
|
815 |
|
816 |
@staticmethod
|
817 |
def get_etkdg_params(attempt=0):
|
818 |
+
"""Get ETKDG parameters"""
|
819 |
params = AllChem.ETKDGv3()
|
820 |
params.randomSeed = -1
|
821 |
params.maxIterations = 200
|
|
|
889 |
@staticmethod
|
890 |
def mol_to_sdf_bytes(mol):
|
891 |
"""Write an RDKit molecule through Chem.SDWriter into an in-memory text buffer and return the buffer contents encoded as UTF-8 bytes."""
|
|
|
892 |
sio = StringIO()
|
893 |
writer = Chem.SDWriter(sio)
|
894 |
writer.write(mol)
|
895 |
writer.close()  # the writer is closed before the buffer contents are read below
|
896 |
|
|
|
897 |
return sio.getvalue().encode('utf-8')
|
898 |
|
899 |
def process_input(smiles_input=None, file_obj=None, show_linear=False,
|
|
|
907 |
if smiles_input:
|
908 |
smiles = smiles_input.strip()
|
909 |
|
|
|
910 |
if not analyzer.is_peptide(smiles):
|
911 |
return "Error: Input SMILES does not appear to be a peptide structure.", None, None
|
912 |
|
913 |
try:
|
|
|
914 |
mol = Chem.MolFromSmiles(smiles)
|
915 |
if mol is None:
|
916 |
return "Error: Invalid SMILES notation.", None, None
|
917 |
|
|
|
918 |
if generate_3d:
|
919 |
generator = PeptideStructureGenerator()
|
920 |
|
|
|
939 |
except Exception as e:
|
940 |
return f"Error generating 3D structures: {str(e)}", None, None, None
|
941 |
|
|
|
942 |
segments = analyzer.split_on_bonds(smiles)
|
943 |
|
|
|
944 |
sequence_parts = []
|
945 |
output_text = ""
|
946 |
|
|
|
965 |
output_text += f"Warning: Could not identify residue in segment: {segment['content']}\n"
|
966 |
output_text += "\n"
|
967 |
else:
|
|
|
968 |
for segment in segments:
|
969 |
residue, mods = analyzer.identify_residue(segment)
|
970 |
if residue:
|
|
|
973 |
else:
|
974 |
sequence_parts.append(residue)
|
975 |
|
|
|
976 |
is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
|
977 |
three_letter = '-'.join(sequence_parts)
|
978 |
one_letter = ''.join(analyzer.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence_parts)
|
|
|
981 |
three_letter = f"cyclo({three_letter})"
|
982 |
one_letter = f"cyclo({one_letter})"
|
983 |
|
|
|
984 |
img_cyclic = annotate_cyclic_structure(mol, three_letter)
|
985 |
|
986 |
# Create linear representation if requested
|
|
|
993 |
img_linear = Image.open(buf)
|
994 |
plt.close(fig_linear)
|
995 |
|
|
|
996 |
summary = "Summary:\n"
|
997 |
summary += f"Sequence: {three_letter}\n"
|
998 |
summary += f"One-letter code: {one_letter}\n"
|
|
|
1014 |
# Handle file input
|
1015 |
if file_obj is not None:
|
1016 |
try:
|
|
|
1017 |
if hasattr(file_obj, 'name'):
|
1018 |
with open(file_obj.name, 'r') as f:
|
1019 |
content = f.read()
|
|
|
1024 |
for line in content.splitlines():
|
1025 |
smiles = line.strip()
|
1026 |
if smiles:
|
|
|
1027 |
if not analyzer.is_peptide(smiles):
|
1028 |
output_text += f"Skipping non-peptide SMILES: {smiles}\n"
|
1029 |
continue
|
1030 |
|
|
|
1031 |
segments = analyzer.split_on_bonds(smiles)
|
1032 |
sequence_parts = []
|
1033 |
|
|
|
1034 |
if show_segment_details:
|
1035 |
output_text += f"\nSegment Analysis for SMILES: {smiles}\n"
|
1036 |
for i, segment in enumerate(segments):
|
|
|
1055 |
else:
|
1056 |
sequence_parts.append(residue)
|
1057 |
|
|
|
1058 |
is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
|
1059 |
sequence = f"cyclo({'-'.join(sequence_parts)})" if is_cyclic else '-'.join(sequence_parts)
|
1060 |
|
|
|
1063 |
output_text += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
|
1064 |
if is_cyclic:
|
1065 |
output_text += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
|
|
|
1066 |
output_text += "-" * 50 + "\n"
|
1067 |
|
1068 |
return output_text, None, None
|
|
|
1145 |
flagging_mode="never"
|
1146 |
)
|
1147 |
|
|
|
1148 |
if __name__ == "__main__":
|
1149 |
iface.launch(share=True)
|