yinuozhang committed on
Commit
71e2885
·
1 Parent(s): 2b18b2c

Adjust several L/D amino acid (aa) patterns and the Gly bond handling

Browse files
Files changed (1) hide show
  1. app.py +182 -336
app.py CHANGED
@@ -72,12 +72,10 @@ class PeptideAnalyzer:
72
 
73
  is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
74
  return is_cyclic, peptide_cycles, aromatic_cycles
75
-
76
  def split_on_bonds(self, smiles):
77
- """Split SMILES into segments with simplified Pro handling"""
78
  positions = []
79
  used = set()
80
-
81
  # Find Gly pattern first
82
  gly_pattern = r'NCC\(=O\)'
83
  for match in re.finditer(gly_pattern, smiles):
@@ -106,7 +104,6 @@ class PeptideAnalyzer:
106
 
107
  # Create segments
108
  segments = []
109
-
110
  if positions:
111
  # First segment
112
  if positions[0]['start'] > 0:
@@ -114,18 +111,21 @@ class PeptideAnalyzer:
114
  'content': smiles[0:positions[0]['start']],
115
  'bond_after': positions[0]['pattern']
116
  })
117
-
118
  # Process segments
119
  for i in range(len(positions)-1):
120
  current = positions[i]
121
  next_pos = positions[i+1]
122
-
123
  if current['type'] == 'gly':
124
  segments.append({
125
  'content': 'NCC(=O)',
126
  'bond_before': positions[i-1]['pattern'] if i > 0 else None,
127
  'bond_after': next_pos['pattern']
128
  })
 
 
 
 
 
129
  else:
130
  content = smiles[current['end']:next_pos['start']]
131
  if content:
@@ -141,7 +141,6 @@ class PeptideAnalyzer:
141
  'content': smiles[positions[-1]['end']:],
142
  'bond_before': positions[-1]['pattern']
143
  })
144
-
145
  return segments
146
 
147
  def clean_terminal_carboxyl(self, segment):
@@ -168,11 +167,162 @@ class PeptideAnalyzer:
168
  content = self.clean_terminal_carboxyl(segment)
169
  mods = self.get_modifications(segment)
170
 
171
- # UAA pattern matching section - before regular residues
172
- # Phenylglycine and derivatives
173
- if 'c1ccccc1' in content:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  if '[C@@H](c1ccccc1)' in content or '[C@H](c1ccccc1)' in content:
175
  return '4', mods # Base phenylglycine
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
  # 4-substituted phenylalanines
178
  if 'Cc1ccc' in content:
@@ -282,22 +432,6 @@ class PeptideAnalyzer:
282
  if 'c1ccc(c(c1)O)O' in content:
283
  return 'DAH', mods # 3,4-Dihydroxy-phenylalanine
284
 
285
- # Cyclic amino acids
286
- if 'C1CCCC1' in content:
287
- return 'CPA3', mods # 3-Cyclopentyl-alanine
288
- if 'C1CCCCC1' in content:
289
- if 'CC1CCCCC1' in content:
290
- return 'ALC', mods # 3-cyclohexyl-alanine
291
- else:
292
- return 'CHG', mods # Cyclohexylglycine
293
-
294
- # Chain-length variants
295
- if 'CCC[C@@H]' in content or 'CCC[C@H]' in content:
296
- return 'NLE', mods # Norleucine
297
- if 'CC[C@@H]' in content or 'CC[C@H]' in content:
298
- if not any(x in content for x in ['CC(C)', 'COC', 'CN(']):
299
- return 'ABA', mods # 2-Aminobutyric acid
300
-
301
  # Modified histidines
302
  if 'c1cnc' in content:
303
  if '[C@@H]1CN[C@@H](N1)F' in content:
@@ -307,7 +441,6 @@ class PeptideAnalyzer:
307
  if 'c1c[nH]c(n1)F' in content:
308
  return '2HF2', mods # 2-fluoro-l-histidine variant
309
 
310
- # Sulfur and selenium containing
311
  if '[SeH]' in content:
312
  return 'CSE', mods # Selenocysteine
313
  if 'S' in content:
@@ -318,7 +451,6 @@ class PeptideAnalyzer:
318
  if 'CCS' in content:
319
  return 'HCS', mods # homocysteine
320
 
321
- # Additional modifications
322
  if 'CN=[N]=N' in content:
323
  return 'AZDA', mods # azido-alanine
324
  if '[NH]=[C](=[NH2])=[NH2]' in content:
@@ -326,7 +458,21 @@ class PeptideAnalyzer:
326
  return 'AGM', mods # 5-methyl-arginine
327
  if 'CC[NH]=' in content:
328
  return 'GDPR', mods # 2-Amino-3-guanidinopropionic acid
 
 
 
 
 
 
 
 
 
329
 
 
 
 
 
 
330
  if 'CCON' in content:
331
  return 'CAN', mods # canaline
332
  if '[C@@H]1C=C[C@@H](C=C1)' in content:
@@ -350,7 +496,6 @@ class PeptideAnalyzer:
350
  if 'c1cccc(c1)[C](=[NH2])=[NH2]' in content:
351
  return 'APM', mods # m-amidinophenyl-3-alanine
352
 
353
- # Multiple hydroxy patterns
354
  if 'O' in content:
355
  if '[C@H]([C@H](C)O)O' in content:
356
  return 'ILX', mods # 4,5-dihydroxy-isoleucine
@@ -365,7 +510,6 @@ class PeptideAnalyzer:
365
  if '[C@H](c1ccc(c(Cl)c1)O)O' in content:
366
  return 'OMY', mods # (betar)-3-chloro-beta-hydroxy-l-tyrosine
367
 
368
- # Heterocyclic patterns
369
  if 'n1' in content:
370
  if 'n1cccn1' in content:
371
  return 'PYZ1', mods # 3-(1-Pyrazolyl)-alanine
@@ -384,7 +528,6 @@ class PeptideAnalyzer:
384
  if 'c1cnc2c(n1)cccc2' in content:
385
  return 'QX32', mods # 3-(2-quinoxalyl)-alanine
386
 
387
- # Multiple nitrogen patterns
388
  if 'N' in content:
389
  if '[NH3]CC[C@@H]' in content:
390
  return 'DAB', mods # Diaminobutyric acid
@@ -397,7 +540,6 @@ class PeptideAnalyzer:
397
  if '[NH]=[C](=S)=[NH2]' in content:
398
  return 'THIC', mods # Thio-citrulline
399
 
400
- # Chain modified amino acids
401
  if 'CC' in content:
402
  if 'CCCC[C@@H]' in content:
403
  return 'AHP', mods # 2-Aminoheptanoic acid
@@ -410,7 +552,6 @@ class PeptideAnalyzer:
410
  if '[C@@H]([C@@H](C)O)C' in content:
411
  return 'HLU', mods # beta-hydroxyleucine
412
 
413
- # Modified glutamate/aspartate patterns
414
  if '[C@@H]' in content:
415
  if '[C@@H](C[C@@H](F))' in content:
416
  return 'FGA4', mods # 4-Fluoro-glutamic acid
@@ -421,7 +562,6 @@ class PeptideAnalyzer:
421
  if '[C@@H](CC[C@H](C))' in content:
422
  return 'MEG', mods # (3s)-3-methyl-l-glutamic acid
423
 
424
- # Sulfur and selenium modifications
425
  if 'S' in content:
426
  if 'SCC[C@@H]' in content:
427
  return 'HSER', mods # homoserine
@@ -434,7 +574,6 @@ class PeptideAnalyzer:
434
  if 'S(=O)(=O)' in content:
435
  return 'OMT', mods # Methionine sulfone
436
 
437
- # Double bond containing
438
  if 'C=' in content:
439
  if 'C=C[C@@H]' in content:
440
  return '2AG', mods # 2-Allyl-glycine
@@ -443,175 +582,16 @@ class PeptideAnalyzer:
443
  if 'C=Cc1ccccc1' in content:
444
  return 'STYA', mods # Styrylalanine
445
 
446
- # Special cases
447
  if '[C@@H]1Cc2c(C1)cccc2' in content:
448
  return 'IGL', mods # alpha-amino-2-indanacetic acid
449
  if '[C](=[C](=O)=O)=O' in content:
450
  return '26P', mods # 2-amino-6-oxopimelic acid
451
  if '[C](=[C](=O)=O)=C' in content:
452
  return '2NP', mods # l-2-amino-6-methylene-pimelic acid
453
- if 'c2cnc[nH]2' in content:
454
- return 'HIS', mods # histidine core
455
  if 'c1cccc2c1cc(O)cc2' in content:
456
  return 'NAO1', mods # 5-hydroxy-1-naphthalene
457
  if 'c1ccc2c(c1)cc(O)cc2' in content:
458
- return 'NAO2', mods # 6-hydroxy-2-naphthalene
459
-
460
- # Proline (P) - flexible ring numbers
461
- if any([
462
- # Check for any ring number in bond patterns
463
- (segment.get('bond_after', '').startswith(f'N{n}C(=O)') and 'CCC' in content and
464
- any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
465
- for n in '123456789'
466
- ]) or any([
467
- # Check ending patterns with any ring number
468
- (f'CCCN{n}' in content and content.endswith('=O') and
469
- any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
470
- for n in '123456789'
471
- ]) or any([
472
- # Handle CCC[C@H]n patterns
473
- (content == f'CCC[C@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
474
- (content == f'CCC[C@@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
475
- # N-terminal Pro with any ring number
476
- (f'N{n}CCC[C@H]{n}' in content) or
477
- (f'N{n}CCC[C@@H]{n}' in content)
478
- for n in '123456789'
479
- ]):
480
- return 'Pro', mods
481
-
482
- # Tryptophan (W) - more specific indole pattern
483
- if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
484
- 'c[nH]c' in content.replace(' ', ''):
485
- return 'Trp', mods
486
-
487
- # Lysine (K) - both patterns
488
- if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
489
- return 'Lys', mods
490
-
491
- # Arginine (R) - both patterns
492
- if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
493
- return 'Arg', mods
494
-
495
- if ('C[C@H](CCCC)' in content or 'C[C@@H](CCCC)' in content) and 'CC(C)' not in content:
496
- return 'Nle', mods
497
-
498
- # Ornithine (Orn) - 3-carbon chain with NH2
499
- if ('C[C@H](CCCN)' in content or 'C[C@@H](CCCN)' in content) and 'CC(C)' not in content:
500
- return 'Orn', mods
501
-
502
- # 2-Naphthylalanine (2Nal) - distinct from Phe pattern
503
- if ('Cc3cc2ccccc2c3' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
504
- return '2Nal', mods
505
-
506
- # Cyclohexylalanine (Cha) - already in your code but moved here for clarity
507
- if 'N2CCCCC2' in content or 'CCCCC2' in content:
508
- return 'Cha', mods
509
-
510
- # Aminobutyric acid (Abu) - 2-carbon chain
511
- if ('C[C@H](CC)' in content or 'C[C@@H](CC)' in content) and not any(p in content for p in ['CC(C)', 'CCCC', 'CCC(C)']):
512
- return 'Abu', mods
513
-
514
- # Pipecolic acid (Pip) - 6-membered ring like Pro
515
- if ('N3CCCCC3' in content or 'CCCCC3' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
516
- return 'Pip', mods
517
-
518
- # Cyclohexylglycine (Chg) - direct cyclohexyl without CH2
519
- if ('C[C@H](C1CCCCC1)' in content or 'C[C@@H](C1CCCCC1)' in content):
520
- return 'Chg', mods
521
-
522
- # 4-Fluorophenylalanine (4F-Phe)
523
- if ('Cc2ccc(F)cc2' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
524
- return '4F-Phe', mods
525
-
526
- # Regular residue identification
527
- if ('NCC(=O)' in content) or (content == 'C'):
528
- # Middle case - between bonds
529
- if segment.get('bond_before') and segment.get('bond_after'):
530
- if ('C(=O)N' in segment['bond_before'] or 'C(=O)N(C)' in segment['bond_before']):
531
- return 'Gly', mods
532
- # Terminal case - at the end
533
- elif segment.get('bond_before') and segment.get('bond_before').startswith('C(=O)N'):
534
- return 'Gly', mods
535
-
536
- if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content:
537
- return 'Leu', mods
538
- if '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content:
539
- return 'Leu', mods
540
-
541
- if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content:
542
- return 'Thr', mods
543
-
544
- if '[C@H](Cc2ccccc2)' in content or '[C@@H](Cc2ccccc2)' in content:
545
- return 'Phe', mods
546
-
547
- if ('[C@H](C(C)C)' in content or # With outer parentheses
548
- '[C@@H](C(C)C)' in content or # With outer parentheses
549
- '[C@H]C(C)C' in content or # Without outer parentheses
550
- '[C@@H]C(C)C' in content): # Without outer parentheses
551
- if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]']): # Still check not Leu
552
- return 'Val', mods
553
-
554
- if '[C@H](COC(C)(C)C)' in content or '[C@@H](COC(C)(C)C)' in content:
555
- return 'O-tBu', mods
556
-
557
- if any([
558
- 'CC[C@H](C)' in content,
559
- 'CC[C@@H](C)' in content,
560
- 'C(C)C[C@H]' in content and 'CC(C)C' not in content,
561
- 'C(C)C[C@@H]' in content and 'CC(C)C' not in content
562
- ]):
563
- return 'Ile', mods
564
-
565
- if ('[C@H](C)' in content or '[C@@H](C)' in content):
566
- if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
567
- return 'Ala', mods
568
-
569
- # Tyrosine (Tyr) - 4-hydroxybenzyl side chain
570
- if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
571
- return 'Tyr', mods
572
-
573
-
574
- # Serine (Ser) - Hydroxymethyl side chain
575
- if '[C@H](CO)' in content or '[C@@H](CO)' in content:
576
- if not ('C(C)O' in content or 'COC' in content):
577
- return 'Ser', mods
578
-
579
- # Threonine (Thr) - 1-hydroxyethyl side chain
580
- if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content or '[C@@H](C)O' in content or '[C@H](C)O' in content:
581
- return 'Thr', mods
582
-
583
- # Cysteine (Cys) - Thiol side chain
584
- if '[C@H](CS)' in content or '[C@@H](CS)' in content:
585
- return 'Cys', mods
586
-
587
- # Methionine (Met) - Methylthioethyl side chain
588
- if ('C[C@H](CCSC)' in content or 'C[C@@H](CCSC)' in content):
589
- return 'Met', mods
590
-
591
- # Asparagine (Asn) - Carbamoylmethyl side chain
592
- if ('CC(=O)N' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
593
- return 'Asn', mods
594
-
595
- # Glutamine (Gln) - Carbamoylethyl side chain
596
- if ('CCC(=O)N' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
597
- return 'Gln', mods
598
-
599
- # Aspartic acid (Asp) - Carboxymethyl side chain
600
- if ('CC(=O)O' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
601
- return 'Asp', mods
602
-
603
- # Glutamic acid (Glu) - Carboxyethyl side chain
604
- if ('CCC(=O)O' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
605
- return 'Glu', mods
606
-
607
- # Arginine (Arg) - 3-guanidinopropyl side chain
608
- if ('CCCNC(=N)N' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
609
- return 'Arg', mods
610
-
611
- # Histidine (His) - Imidazole side chain
612
- if ('Cc2cnc[nH]2' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
613
- return 'His', mods
614
-
615
  return None, mods
616
 
617
  def get_modifications(self, segment):
@@ -670,109 +650,11 @@ class PeptideAnalyzer:
670
  'one_letter': one_letter,
671
  'is_cyclic': is_cyclic
672
  }
673
-
674
- """
675
- def annotate_cyclic_structure(mol, sequence):
676
- '''Create annotated 2D structure with clear, non-overlapping residue labels'''
677
- # Generate 2D coordinates
678
- # Generate 2D coordinates
679
- AllChem.Compute2DCoords(mol)
680
-
681
- # Create drawer with larger size for annotations
682
- drawer = Draw.rdMolDraw2D.MolDraw2DCairo(2000, 2000) # Even larger size
683
-
684
- # Get residue list and reverse it to match structural representation
685
- if sequence.startswith('cyclo('):
686
- residues = sequence[6:-1].split('-')
687
- else:
688
- residues = sequence.split('-')
689
- residues = list(reversed(residues)) # Reverse the sequence
690
-
691
- # Draw molecule first to get its bounds
692
- drawer.drawOptions().addAtomIndices = False
693
- drawer.DrawMolecule(mol)
694
- drawer.FinishDrawing()
695
-
696
- # Convert to PIL Image
697
- img = Image.open(BytesIO(drawer.GetDrawingText()))
698
- draw = ImageDraw.Draw(img)
699
-
700
- try:
701
- # Try to use DejaVuSans as it's commonly available on Linux systems
702
- font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 60)
703
- small_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 60)
704
- except OSError:
705
- try:
706
- # Fallback to Arial if available (common on Windows)
707
- font = ImageFont.truetype("arial.ttf", 60)
708
- small_font = ImageFont.truetype("arial.ttf", 60)
709
- except OSError:
710
- # If no TrueType fonts are available, fall back to default
711
- print("Warning: TrueType fonts not available, using default font")
712
- font = ImageFont.load_default()
713
- small_font = ImageFont.load_default()
714
- # Get molecule bounds
715
- conf = mol.GetConformer()
716
- positions = []
717
- for i in range(mol.GetNumAtoms()):
718
- pos = conf.GetAtomPosition(i)
719
- positions.append((pos.x, pos.y))
720
-
721
- x_coords = [p[0] for p in positions]
722
- y_coords = [p[1] for p in positions]
723
- min_x, max_x = min(x_coords), max(x_coords)
724
- min_y, max_y = min(y_coords), max(y_coords)
725
-
726
- # Calculate scaling factors
727
- scale = 150 # Increased scale factor
728
- center_x = 1000 # Image center
729
- center_y = 1000
730
-
731
- # Add residue labels in a circular arrangement around the structure
732
- n_residues = len(residues)
733
- radius = 700 # Distance of labels from center
734
-
735
- # Start from the rightmost point (3 o'clock position) and go counterclockwise
736
- # Offset by -3 positions to align with structure
737
- offset = 0 # Adjust this value to match the structure alignment
738
- for i, residue in enumerate(residues):
739
- # Calculate position in a circle around the structure
740
- # Start from 0 (3 o'clock) and go counterclockwise
741
- angle = -(2 * np.pi * ((i + offset) % n_residues) / n_residues)
742
 
743
- # Calculate label position
744
- label_x = center_x + radius * np.cos(angle)
745
- label_y = center_y + radius * np.sin(angle)
746
-
747
- # Draw residue label
748
- text = f"{i+1}. {residue}"
749
- bbox = draw.textbbox((label_x, label_y), text, font=font)
750
- padding = 10
751
- draw.rectangle([bbox[0]-padding, bbox[1]-padding,
752
- bbox[2]+padding, bbox[3]+padding],
753
- fill='white', outline='white')
754
- draw.text((label_x, label_y), text,
755
- font=font, fill='black', anchor="mm")
756
-
757
- # Add sequence at the top with white background
758
- seq_text = f"Sequence: {sequence}"
759
- bbox = draw.textbbox((center_x, 100), seq_text, font=small_font)
760
- padding = 10
761
- draw.rectangle([bbox[0]-padding, bbox[1]-padding,
762
- bbox[2]+padding, bbox[3]+padding],
763
- fill='white', outline='white')
764
- draw.text((center_x, 100), seq_text,
765
- font=small_font, fill='black', anchor="mm")
766
-
767
- return img
768
-
769
- """
770
  def annotate_cyclic_structure(mol, sequence):
771
- """Create structure visualization with just the sequence header"""
772
- # Generate 2D coordinates
773
  AllChem.Compute2DCoords(mol)
774
 
775
- # Create drawer with larger size for annotations
776
  drawer = Draw.rdMolDraw2D.MolDraw2DCairo(2000, 2000)
777
 
778
  # Draw molecule first
@@ -792,7 +674,7 @@ def annotate_cyclic_structure(mol, sequence):
792
  print("Warning: TrueType fonts not available, using default font")
793
  small_font = ImageFont.load_default()
794
 
795
- # Add just the sequence header at the top
796
  seq_text = f"Sequence: {sequence}"
797
  bbox = draw.textbbox((1000, 100), seq_text, font=small_font)
798
  padding = 10
@@ -805,61 +687,50 @@ def annotate_cyclic_structure(mol, sequence):
805
  return img
806
 
807
  def create_enhanced_linear_viz(sequence, smiles):
808
- """Create an enhanced linear representation using PeptideAnalyzer"""
809
- analyzer = PeptideAnalyzer() # Create analyzer instance
810
 
811
- # Create figure with two subplots
812
  fig = plt.figure(figsize=(15, 10))
813
  gs = fig.add_gridspec(2, 1, height_ratios=[1, 2])
814
  ax_struct = fig.add_subplot(gs[0])
815
  ax_detail = fig.add_subplot(gs[1])
816
 
817
- # Parse sequence and get residues
818
  if sequence.startswith('cyclo('):
819
  residues = sequence[6:-1].split('-')
820
  else:
821
  residues = sequence.split('-')
822
 
823
- # Get segments using analyzer
824
  segments = analyzer.split_on_bonds(smiles)
825
 
826
- # Debug print
827
  print(f"Number of residues: {len(residues)}")
828
  print(f"Number of segments: {len(segments)}")
829
 
830
- # Top subplot - Basic structure
831
  ax_struct.set_xlim(0, 10)
832
  ax_struct.set_ylim(0, 2)
833
 
834
  num_residues = len(residues)
835
  spacing = 9.0 / (num_residues - 1) if num_residues > 1 else 9.0
836
 
837
- # Draw basic structure
838
  y_pos = 1.5
839
  for i in range(num_residues):
840
  x_pos = 0.5 + i * spacing
841
 
842
- # Draw amino acid box
843
  rect = patches.Rectangle((x_pos-0.3, y_pos-0.2), 0.6, 0.4,
844
  facecolor='lightblue', edgecolor='black')
845
  ax_struct.add_patch(rect)
846
 
847
- # Draw connecting bonds if not the last residue
848
  if i < num_residues - 1:
849
  segment = segments[i] if i < len(segments) else None
850
  if segment:
851
- # Determine bond type from segment info
852
  bond_type = 'ester' if 'O-linked' in segment.get('bond_after', '') else 'peptide'
853
  is_n_methylated = 'N-Me' in segment.get('bond_after', '')
854
 
855
  bond_color = 'red' if bond_type == 'ester' else 'black'
856
  linestyle = '--' if bond_type == 'ester' else '-'
857
 
858
- # Draw bond line
859
  ax_struct.plot([x_pos+0.3, x_pos+spacing-0.3], [y_pos, y_pos],
860
  color=bond_color, linestyle=linestyle, linewidth=2)
861
 
862
- # Add bond type label
863
  mid_x = x_pos + spacing/2
864
  bond_label = f"{bond_type}"
865
  if is_n_methylated:
@@ -868,16 +739,13 @@ def create_enhanced_linear_viz(sequence, smiles):
868
  ha='center', va='bottom', fontsize=10,
869
  color=bond_color)
870
 
871
- # Add residue label
872
  ax_struct.text(x_pos, y_pos-0.5, residues[i],
873
  ha='center', va='top', fontsize=14)
874
 
875
- # Bottom subplot - Detailed breakdown
876
  ax_detail.set_ylim(0, len(segments)+1)
877
  ax_detail.set_xlim(0, 1)
878
 
879
- # Create detailed breakdown
880
- segment_y = len(segments) # Start from top
881
  for i, segment in enumerate(segments):
882
  y = segment_y - i
883
 
@@ -899,7 +767,6 @@ def create_enhanced_linear_viz(sequence, smiles):
899
  text += "peptide"
900
  color = 'red'
901
 
902
- # Add segment analysis
903
  ax_detail.text(0.05, y, text, fontsize=12, color=color)
904
  ax_detail.text(0.5, y, f"SMILES: {segment.get('content', '')}", fontsize=10, color='gray')
905
 
@@ -910,11 +777,9 @@ def create_enhanced_linear_viz(sequence, smiles):
910
  ax_struct.text(5, y_pos+0.3, 'Cyclic Connection',
911
  ha='center', color='red', fontsize=14)
912
 
913
- # Add titles and adjust layout
914
  ax_struct.set_title("Peptide Structure Overview", pad=20)
915
  ax_detail.set_title("Segment Analysis Breakdown", pad=20)
916
 
917
- # Remove axes
918
  for ax in [ax_struct, ax_detail]:
919
  ax.set_xticks([])
920
  ax.set_yticks([])
@@ -924,7 +789,7 @@ def create_enhanced_linear_viz(sequence, smiles):
924
  return fig
925
 
926
  class PeptideStructureGenerator:
927
- """A class to generate 3D structures of peptides using different embedding methods"""
928
 
929
  @staticmethod
930
  def prepare_molecule(smiles):
@@ -933,7 +798,6 @@ class PeptideStructureGenerator:
933
  if mol is None:
934
  raise ValueError("Failed to create molecule from SMILES")
935
 
936
- # Calculate valence for each atom
937
  for atom in mol.GetAtoms():
938
  atom.UpdatePropertyCache(strict=False)
939
 
@@ -951,7 +815,7 @@ class PeptideStructureGenerator:
951
 
952
  @staticmethod
953
  def get_etkdg_params(attempt=0):
954
- """Get ETKDG parameters with optional modifications based on attempt number"""
955
  params = AllChem.ETKDGv3()
956
  params.randomSeed = -1
957
  params.maxIterations = 200
@@ -1025,13 +889,11 @@ class PeptideStructureGenerator:
1025
  @staticmethod
1026
  def mol_to_sdf_bytes(mol):
1027
  """Convert RDKit molecule to SDF file bytes"""
1028
- # First write to StringIO in text mode
1029
  sio = StringIO()
1030
  writer = Chem.SDWriter(sio)
1031
  writer.write(mol)
1032
  writer.close()
1033
 
1034
- # Convert the string to bytes
1035
  return sio.getvalue().encode('utf-8')
1036
 
1037
  def process_input(smiles_input=None, file_obj=None, show_linear=False,
@@ -1045,17 +907,14 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
1045
  if smiles_input:
1046
  smiles = smiles_input.strip()
1047
 
1048
- # First check if it's a peptide using analyzer's method
1049
  if not analyzer.is_peptide(smiles):
1050
  return "Error: Input SMILES does not appear to be a peptide structure.", None, None
1051
 
1052
  try:
1053
- # Create molecule
1054
  mol = Chem.MolFromSmiles(smiles)
1055
  if mol is None:
1056
  return "Error: Invalid SMILES notation.", None, None
1057
 
1058
- # Generate 3D structures if requested
1059
  if generate_3d:
1060
  generator = PeptideStructureGenerator()
1061
 
@@ -1080,10 +939,8 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
1080
  except Exception as e:
1081
  return f"Error generating 3D structures: {str(e)}", None, None, None
1082
 
1083
- # Use analyzer to get sequence
1084
  segments = analyzer.split_on_bonds(smiles)
1085
 
1086
- # Process segments and build sequence
1087
  sequence_parts = []
1088
  output_text = ""
1089
 
@@ -1108,7 +965,6 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
1108
  output_text += f"Warning: Could not identify residue in segment: {segment['content']}\n"
1109
  output_text += "\n"
1110
  else:
1111
- # Just build sequence without detailed analysis in output
1112
  for segment in segments:
1113
  residue, mods = analyzer.identify_residue(segment)
1114
  if residue:
@@ -1117,7 +973,6 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
1117
  else:
1118
  sequence_parts.append(residue)
1119
 
1120
- # Check if cyclic using analyzer's method
1121
  is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
1122
  three_letter = '-'.join(sequence_parts)
1123
  one_letter = ''.join(analyzer.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence_parts)
@@ -1126,7 +981,6 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
1126
  three_letter = f"cyclo({three_letter})"
1127
  one_letter = f"cyclo({one_letter})"
1128
 
1129
- # Create cyclic structure visualization
1130
  img_cyclic = annotate_cyclic_structure(mol, three_letter)
1131
 
1132
  # Create linear representation if requested
@@ -1139,7 +993,6 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
1139
  img_linear = Image.open(buf)
1140
  plt.close(fig_linear)
1141
 
1142
- # Add summary to output
1143
  summary = "Summary:\n"
1144
  summary += f"Sequence: {three_letter}\n"
1145
  summary += f"One-letter code: {one_letter}\n"
@@ -1161,7 +1014,6 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
1161
  # Handle file input
1162
  if file_obj is not None:
1163
  try:
1164
- # Handle file content
1165
  if hasattr(file_obj, 'name'):
1166
  with open(file_obj.name, 'r') as f:
1167
  content = f.read()
@@ -1172,16 +1024,13 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
1172
  for line in content.splitlines():
1173
  smiles = line.strip()
1174
  if smiles:
1175
- # Check if it's a peptide
1176
  if not analyzer.is_peptide(smiles):
1177
  output_text += f"Skipping non-peptide SMILES: {smiles}\n"
1178
  continue
1179
 
1180
- # Process this SMILES
1181
  segments = analyzer.split_on_bonds(smiles)
1182
  sequence_parts = []
1183
 
1184
- # Add segment details if requested
1185
  if show_segment_details:
1186
  output_text += f"\nSegment Analysis for SMILES: {smiles}\n"
1187
  for i, segment in enumerate(segments):
@@ -1206,7 +1055,6 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
1206
  else:
1207
  sequence_parts.append(residue)
1208
 
1209
- # Get cyclicity and create sequence
1210
  is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
1211
  sequence = f"cyclo({'-'.join(sequence_parts)})" if is_cyclic else '-'.join(sequence_parts)
1212
 
@@ -1215,7 +1063,6 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False,
1215
  output_text += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
1216
  if is_cyclic:
1217
  output_text += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
1218
- #output_text += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
1219
  output_text += "-" * 50 + "\n"
1220
 
1221
  return output_text, None, None
@@ -1298,6 +1145,5 @@ iface = gr.Interface(
1298
  flagging_mode="never"
1299
  )
1300
 
1301
- # Launch the app
1302
  if __name__ == "__main__":
1303
  iface.launch(share=True)
 
72
 
73
  is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
74
  return is_cyclic, peptide_cycles, aromatic_cycles
75
+
76
  def split_on_bonds(self, smiles):
 
77
  positions = []
78
  used = set()
 
79
  # Find Gly pattern first
80
  gly_pattern = r'NCC\(=O\)'
81
  for match in re.finditer(gly_pattern, smiles):
 
104
 
105
  # Create segments
106
  segments = []
 
107
  if positions:
108
  # First segment
109
  if positions[0]['start'] > 0:
 
111
  'content': smiles[0:positions[0]['start']],
112
  'bond_after': positions[0]['pattern']
113
  })
 
114
  # Process segments
115
  for i in range(len(positions)-1):
116
  current = positions[i]
117
  next_pos = positions[i+1]
 
118
  if current['type'] == 'gly':
119
  segments.append({
120
  'content': 'NCC(=O)',
121
  'bond_before': positions[i-1]['pattern'] if i > 0 else None,
122
  'bond_after': next_pos['pattern']
123
  })
124
+ segments.append({
125
+ 'content': smiles[current['start']+7:next_pos['start']],
126
+ 'bond_before': 'gly_bond',
127
+ 'bond_after': next_pos['pattern']
128
+ })
129
  else:
130
  content = smiles[current['end']:next_pos['start']]
131
  if content:
 
141
  'content': smiles[positions[-1]['end']:],
142
  'bond_before': positions[-1]['pattern']
143
  })
 
144
  return segments
145
 
146
  def clean_terminal_carboxyl(self, segment):
 
167
  content = self.clean_terminal_carboxyl(segment)
168
  mods = self.get_modifications(segment)
169
 
170
+ # Proline (P) - flexible ring numbers
171
+ if any([
172
+ # Check for any ring number in bond patterns
173
+ (segment.get('bond_after', '').startswith(f'N{n}C(=O)') and 'CCC' in content and
174
+ any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
175
+ for n in '123456789'
176
+ ]) or any([(segment.get('bond_before', '').startswith(f'C(=O)N{n}') and 'CCC' in content and
177
+ any(f'CCC{n}' for n in '123456789'))
178
+ for n in '123456789'
179
+ ]) or any([
180
+ # Check ending patterns with any ring number
181
+ (f'CCCN{n}' in content and content.endswith('=O') and
182
+ any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
183
+ for n in '123456789'
184
+ ]) or any([
185
+ # Handle CCC[C@H]n patterns
186
+ (content == f'CCC[C@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
187
+ (content == f'CCC[C@@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
188
+ # N-terminal Pro with any ring number
189
+ (f'N{n}CCC[C@H]{n}' in content) or
190
+ (f'N{n}CCC[C@@H]{n}' in content)
191
+ for n in '123456789'
192
+ ]):
193
+ return 'Pro', mods
194
+
195
+ # Tryptophan (W) - more specific indole pattern
196
+ if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
197
+ 'c[nH]c' in content.replace(' ', ''):
198
+ return 'Trp', mods
199
+
200
+ # Lysine (K)
201
+ if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
202
+ return 'Lys', mods
203
+
204
+ # Arginine (R)
205
+ if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
206
+ return 'Arg', mods
207
+
208
+ if ('NCC(=O)' in content) or (content == 'C'):
209
+ if segment.get('bond_before') and segment.get('bond_after'):
210
+ if ('C(=O)N' in segment['bond_before'] or 'C(=O)N(C)' in segment['bond_before']):
211
+ return 'Gly', mods
212
+ elif segment.get('bond_before') and segment.get('bond_before').startswith('C(=O)N'):
213
+ return 'Gly', mods
214
+
215
+ if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content or '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content or (('N[C@H](CCC(C)C)' in content or 'N[C@@H](CCC(C)C)' in content) and segment.get('bond_before') is None):
216
+ return 'Leu', mods
217
+
218
+ if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content:
219
+ return 'Thr', mods
220
+ if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(Cc\d+ccccc\d+\)', content):
221
+ return 'Phe', mods
222
+
223
+ if ('[C@H](C(C)C)' in content or
224
+ '[C@@H](C(C)C)' in content or
225
+ '[C@H]C(C)C' in content or
226
+ '[C@@H]C(C)C' in content
227
+ ):
228
+ if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]']): # Still check not Leu
229
+ return 'Val', mods
230
+
231
+ if any([
232
+ 'CC[C@H](C)' in content,
233
+ 'CC[C@@H](C)' in content,
234
+ '[C@@H](CC)C' in content,
235
+ '[C@H](CC)C' in content,
236
+ 'C(C)C[C@H]' in content and 'CC(C)C' not in content,
237
+ 'C(C)C[C@@H]' in content and 'CC(C)C' not in content
238
+ ]):
239
+ return 'Ile', mods
240
+
241
+ if ('[C@H](C)' in content or '[C@@H](C)' in content):
242
+ if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
243
+ return 'Ala', mods
244
+
245
+ # Tyrosine (Tyr) - 4-hydroxybenzyl side chain
246
+ if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
247
+ return 'Tyr', mods
248
+
249
+ # Serine (Ser) - Hydroxymethyl side chain
250
+ if '[C@H](CO)' in content or '[C@@H](CO)' in content:
251
+ if not ('C(C)O' in content or 'COC' in content):
252
+ return 'Ser', mods
253
+
254
+ # Threonine (Thr) - 1-hydroxyethyl side chain
255
+ if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content or '[C@@H](C)O' in content or '[C@H](C)O' in content:
256
+ return 'Thr', mods
257
+
258
+ # Cysteine (Cys) - Thiol side chain
259
+ if '[C@H](CS)' in content or '[C@@H](CS)' in content:
260
+ return 'Cys', mods
261
+
262
+ # Methionine (Met) - Methylthioethyl side chain
263
+ if ('CCSC' in content):
264
+ return 'Met', mods
265
+
266
+ # Glutamine (Gln) - Carbamoylethyl side chain
267
+ if (content == '[C@@H](CC' or content == '[C@H](CC' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CCC(=O)N' in content) or ('CCC(N)=O' in content):
268
+ return 'Gln', mods
269
+ # Asparagine (Asn) - Carbamoylmethyl side chain
270
+ if (content == '[C@@H](C' or content == '[C@H](C' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CC(=O)N' in content) or ('CCN(=O)' in content) or ('CC(N)=O' in content):
271
+ return 'Asn', mods
272
+
273
+ # Glutamic acid (Glu) - Carboxyethyl side chain
274
+ if ('CCC(=O)O' in content):
275
+ return 'Glu', mods
276
+ # Aspartic acid (Asp) - Carboxymethyl side chain
277
+ if ('CC(=O)O' in content):
278
+ return 'Asp', mods
279
+
280
+ # Arginine (Arg) - 3-guanidinopropyl side chain
281
+ if ('CCCNC(=N)N' in content):
282
+ return 'Arg', mods
283
+
284
+ # Histidine (His) - Imidazole side chain
285
+ if re.search(r'Cc\d+c\[nH\]cn\d+', content) or re.search(r'Cc\d+cnc\[nH\]\d+', content):
286
+ return 'His', mods
287
+
288
+ ############UAA
289
+
290
+ if '[C@H](COC(C)(C)C)' in content or '[C@@H](COC(C)(C)C)' in content:
291
+ return 'O-tBu', mods
292
+
293
+ if re.search(r'c\d+ccccc\d+', content):
294
  if '[C@@H](c1ccccc1)' in content or '[C@H](c1ccccc1)' in content:
295
  return '4', mods # Base phenylglycine
296
+ if ('C[C@H](CCCC)' in content or 'C[C@@H](CCCC)' in content) and 'CC(C)' not in content:
297
+ return 'Nle', mods
298
+
299
+ # Ornithine (Orn) - 3-carbon chain with NH2
300
+ if ('C[C@H](CCCN)' in content or 'C[C@@H](CCCN)' in content) and 'CC(C)' not in content:
301
+ return 'Orn', mods
302
+
303
+ # 2-Naphthylalanine (2Nal)
304
+ if ('Cc3cc2ccccc2c3' in content):
305
+ return '2Nal', mods
306
+
307
+ # Cyclohexylalanine (Cha)
308
+ if 'N2CCCCC2' in content or 'CCCCC2' in content:
309
+ return 'Cha', mods
310
+
311
+ # Aminobutyric acid (Abu) - 2-carbon chain
312
+ if ('C[C@H](CC)' in content or 'C[C@@H](CC)' in content) and not any(p in content for p in ['CC(C)', 'CCCC', 'CCC(C)']):
313
+ return 'Abu', mods
314
+
315
+ # Pipecolic acid (Pip)
316
+ if ('N3CCCCC3' in content or 'CCCCC3' in content):
317
+ return 'Pip', mods
318
+
319
+ # Cyclohexylglycine (Chg) - direct cyclohexyl without CH2
320
+ if ('C[C@H](C1CCCCC1)' in content or 'C[C@@H](C1CCCCC1)' in content):
321
+ return 'Chg', mods
322
+
323
+ # 4-Fluorophenylalanine (4F-Phe)
324
+ if ('Cc2ccc(F)cc2' in content):
325
+ return '4F-Phe', mods
326
 
327
  # 4-substituted phenylalanines
328
  if 'Cc1ccc' in content:
 
432
  if 'c1ccc(c(c1)O)O' in content:
433
  return 'DAH', mods # 3,4-Dihydroxy-phenylalanine
434
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
435
  # Modified histidines
436
  if 'c1cnc' in content:
437
  if '[C@@H]1CN[C@@H](N1)F' in content:
 
441
  if 'c1c[nH]c(n1)F' in content:
442
  return '2HF2', mods # 2-fluoro-l-histidine variant
443
 
 
444
  if '[SeH]' in content:
445
  return 'CSE', mods # Selenocysteine
446
  if 'S' in content:
 
451
  if 'CCS' in content:
452
  return 'HCS', mods # homocysteine
453
 
 
454
  if 'CN=[N]=N' in content:
455
  return 'AZDA', mods # azido-alanine
456
  if '[NH]=[C](=[NH2])=[NH2]' in content:
 
458
  return 'AGM', mods # 5-methyl-arginine
459
  if 'CC[NH]=' in content:
460
  return 'GDPR', mods # 2-Amino-3-guanidinopropionic acid
461
+
462
+ # Others
463
+ if 'C1CCCC1' in content:
464
+ return 'CPA3', mods # 3-Cyclopentyl-alanine
465
+ if 'C1CCCCC1' in content:
466
+ if 'CC1CCCCC1' in content:
467
+ return 'ALC', mods # 3-cyclohexyl-alanine
468
+ else:
469
+ return 'CHG', mods # Cyclohexylglycine
470
 
471
+ if 'CCC[C@@H]' in content or 'CCC[C@H]' in content:
472
+ return 'NLE', mods # Norleucine
473
+ if 'CC[C@@H]' in content or 'CC[C@H]' in content:
474
+ if not any(x in content for x in ['CC(C)', 'COC', 'CN(']):
475
+ return 'ABA', mods # 2-Aminobutyric acid
476
  if 'CCON' in content:
477
  return 'CAN', mods # canaline
478
  if '[C@@H]1C=C[C@@H](C=C1)' in content:
 
496
  if 'c1cccc(c1)[C](=[NH2])=[NH2]' in content:
497
  return 'APM', mods # m-amidinophenyl-3-alanine
498
 
 
499
  if 'O' in content:
500
  if '[C@H]([C@H](C)O)O' in content:
501
  return 'ILX', mods # 4,5-dihydroxy-isoleucine
 
510
  if '[C@H](c1ccc(c(Cl)c1)O)O' in content:
511
  return 'OMY', mods # (betar)-3-chloro-beta-hydroxy-l-tyrosine
512
 
 
513
  if 'n1' in content:
514
  if 'n1cccn1' in content:
515
  return 'PYZ1', mods # 3-(1-Pyrazolyl)-alanine
 
528
  if 'c1cnc2c(n1)cccc2' in content:
529
  return 'QX32', mods # 3-(2-quinoxalyl)-alanine
530
 
 
531
  if 'N' in content:
532
  if '[NH3]CC[C@@H]' in content:
533
  return 'DAB', mods # Diaminobutyric acid
 
540
  if '[NH]=[C](=S)=[NH2]' in content:
541
  return 'THIC', mods # Thio-citrulline
542
 
 
543
  if 'CC' in content:
544
  if 'CCCC[C@@H]' in content:
545
  return 'AHP', mods # 2-Aminoheptanoic acid
 
552
  if '[C@@H]([C@@H](C)O)C' in content:
553
  return 'HLU', mods # beta-hydroxyleucine
554
 
 
555
  if '[C@@H]' in content:
556
  if '[C@@H](C[C@@H](F))' in content:
557
  return 'FGA4', mods # 4-Fluoro-glutamic acid
 
562
  if '[C@@H](CC[C@H](C))' in content:
563
  return 'MEG', mods # (3s)-3-methyl-l-glutamic acid
564
 
 
565
  if 'S' in content:
566
  if 'SCC[C@@H]' in content:
567
  return 'HSER', mods # homoserine
 
574
  if 'S(=O)(=O)' in content:
575
  return 'OMT', mods # Methionine sulfone
576
 
 
577
  if 'C=' in content:
578
  if 'C=C[C@@H]' in content:
579
  return '2AG', mods # 2-Allyl-glycine
 
582
  if 'C=Cc1ccccc1' in content:
583
  return 'STYA', mods # Styrylalanine
584
 
 
585
  if '[C@@H]1Cc2c(C1)cccc2' in content:
586
  return 'IGL', mods # alpha-amino-2-indanacetic acid
587
  if '[C](=[C](=O)=O)=O' in content:
588
  return '26P', mods # 2-amino-6-oxopimelic acid
589
  if '[C](=[C](=O)=O)=C' in content:
590
  return '2NP', mods # l-2-amino-6-methylene-pimelic acid
 
 
591
  if 'c1cccc2c1cc(O)cc2' in content:
592
  return 'NAO1', mods # 5-hydroxy-1-naphthalene
593
  if 'c1ccc2c(c1)cc(O)cc2' in content:
594
+ return 'NAO2', mods # 6-hydroxy-2-naphthalene
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
595
  return None, mods
596
 
597
  def get_modifications(self, segment):
 
650
  'one_letter': one_letter,
651
  'is_cyclic': is_cyclic
652
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
653
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
654
  def annotate_cyclic_structure(mol, sequence):
655
+ """Create structure visualization"""
 
656
  AllChem.Compute2DCoords(mol)
657
 
 
658
  drawer = Draw.rdMolDraw2D.MolDraw2DCairo(2000, 2000)
659
 
660
  # Draw molecule first
 
674
  print("Warning: TrueType fonts not available, using default font")
675
  small_font = ImageFont.load_default()
676
 
677
+ # Header
678
  seq_text = f"Sequence: {sequence}"
679
  bbox = draw.textbbox((1000, 100), seq_text, font=small_font)
680
  padding = 10
 
687
  return img
688
 
689
  def create_enhanced_linear_viz(sequence, smiles):
690
+ """Linear visualization"""
691
+ analyzer = PeptideAnalyzer()
692
 
 
693
  fig = plt.figure(figsize=(15, 10))
694
  gs = fig.add_gridspec(2, 1, height_ratios=[1, 2])
695
  ax_struct = fig.add_subplot(gs[0])
696
  ax_detail = fig.add_subplot(gs[1])
697
 
 
698
  if sequence.startswith('cyclo('):
699
  residues = sequence[6:-1].split('-')
700
  else:
701
  residues = sequence.split('-')
702
 
 
703
  segments = analyzer.split_on_bonds(smiles)
704
 
 
705
  print(f"Number of residues: {len(residues)}")
706
  print(f"Number of segments: {len(segments)}")
707
 
 
708
  ax_struct.set_xlim(0, 10)
709
  ax_struct.set_ylim(0, 2)
710
 
711
  num_residues = len(residues)
712
  spacing = 9.0 / (num_residues - 1) if num_residues > 1 else 9.0
713
 
 
714
  y_pos = 1.5
715
  for i in range(num_residues):
716
  x_pos = 0.5 + i * spacing
717
 
 
718
  rect = patches.Rectangle((x_pos-0.3, y_pos-0.2), 0.6, 0.4,
719
  facecolor='lightblue', edgecolor='black')
720
  ax_struct.add_patch(rect)
721
 
 
722
  if i < num_residues - 1:
723
  segment = segments[i] if i < len(segments) else None
724
  if segment:
 
725
  bond_type = 'ester' if 'O-linked' in segment.get('bond_after', '') else 'peptide'
726
  is_n_methylated = 'N-Me' in segment.get('bond_after', '')
727
 
728
  bond_color = 'red' if bond_type == 'ester' else 'black'
729
  linestyle = '--' if bond_type == 'ester' else '-'
730
 
 
731
  ax_struct.plot([x_pos+0.3, x_pos+spacing-0.3], [y_pos, y_pos],
732
  color=bond_color, linestyle=linestyle, linewidth=2)
733
 
 
734
  mid_x = x_pos + spacing/2
735
  bond_label = f"{bond_type}"
736
  if is_n_methylated:
 
739
  ha='center', va='bottom', fontsize=10,
740
  color=bond_color)
741
 
 
742
  ax_struct.text(x_pos, y_pos-0.5, residues[i],
743
  ha='center', va='top', fontsize=14)
744
 
 
745
  ax_detail.set_ylim(0, len(segments)+1)
746
  ax_detail.set_xlim(0, 1)
747
 
748
+ segment_y = len(segments)
 
749
  for i, segment in enumerate(segments):
750
  y = segment_y - i
751
 
 
767
  text += "peptide"
768
  color = 'red'
769
 
 
770
  ax_detail.text(0.05, y, text, fontsize=12, color=color)
771
  ax_detail.text(0.5, y, f"SMILES: {segment.get('content', '')}", fontsize=10, color='gray')
772
 
 
777
  ax_struct.text(5, y_pos+0.3, 'Cyclic Connection',
778
  ha='center', color='red', fontsize=14)
779
 
 
780
  ax_struct.set_title("Peptide Structure Overview", pad=20)
781
  ax_detail.set_title("Segment Analysis Breakdown", pad=20)
782
 
 
783
  for ax in [ax_struct, ax_detail]:
784
  ax.set_xticks([])
785
  ax.set_yticks([])
 
789
  return fig
790
 
791
  class PeptideStructureGenerator:
792
+ """Generate 3D structures of peptides using different embedding methods"""
793
 
794
  @staticmethod
795
  def prepare_molecule(smiles):
 
798
  if mol is None:
799
  raise ValueError("Failed to create molecule from SMILES")
800
 
 
801
  for atom in mol.GetAtoms():
802
  atom.UpdatePropertyCache(strict=False)
803
 
 
815
 
816
  @staticmethod
817
  def get_etkdg_params(attempt=0):
818
+ """Get ETKDG parameters"""
819
  params = AllChem.ETKDGv3()
820
  params.randomSeed = -1
821
  params.maxIterations = 200
 
889
  @staticmethod
890
  def mol_to_sdf_bytes(mol):
891
  """Convert RDKit molecule to SDF file bytes"""
 
892
  sio = StringIO()
893
  writer = Chem.SDWriter(sio)
894
  writer.write(mol)
895
  writer.close()
896
 
 
897
  return sio.getvalue().encode('utf-8')
898
 
899
  def process_input(smiles_input=None, file_obj=None, show_linear=False,
 
907
  if smiles_input:
908
  smiles = smiles_input.strip()
909
 
 
910
  if not analyzer.is_peptide(smiles):
911
  return "Error: Input SMILES does not appear to be a peptide structure.", None, None
912
 
913
  try:
 
914
  mol = Chem.MolFromSmiles(smiles)
915
  if mol is None:
916
  return "Error: Invalid SMILES notation.", None, None
917
 
 
918
  if generate_3d:
919
  generator = PeptideStructureGenerator()
920
 
 
939
  except Exception as e:
940
  return f"Error generating 3D structures: {str(e)}", None, None, None
941
 
 
942
  segments = analyzer.split_on_bonds(smiles)
943
 
 
944
  sequence_parts = []
945
  output_text = ""
946
 
 
965
  output_text += f"Warning: Could not identify residue in segment: {segment['content']}\n"
966
  output_text += "\n"
967
  else:
 
968
  for segment in segments:
969
  residue, mods = analyzer.identify_residue(segment)
970
  if residue:
 
973
  else:
974
  sequence_parts.append(residue)
975
 
 
976
  is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
977
  three_letter = '-'.join(sequence_parts)
978
  one_letter = ''.join(analyzer.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence_parts)
 
981
  three_letter = f"cyclo({three_letter})"
982
  one_letter = f"cyclo({one_letter})"
983
 
 
984
  img_cyclic = annotate_cyclic_structure(mol, three_letter)
985
 
986
  # Create linear representation if requested
 
993
  img_linear = Image.open(buf)
994
  plt.close(fig_linear)
995
 
 
996
  summary = "Summary:\n"
997
  summary += f"Sequence: {three_letter}\n"
998
  summary += f"One-letter code: {one_letter}\n"
 
1014
  # Handle file input
1015
  if file_obj is not None:
1016
  try:
 
1017
  if hasattr(file_obj, 'name'):
1018
  with open(file_obj.name, 'r') as f:
1019
  content = f.read()
 
1024
  for line in content.splitlines():
1025
  smiles = line.strip()
1026
  if smiles:
 
1027
  if not analyzer.is_peptide(smiles):
1028
  output_text += f"Skipping non-peptide SMILES: {smiles}\n"
1029
  continue
1030
 
 
1031
  segments = analyzer.split_on_bonds(smiles)
1032
  sequence_parts = []
1033
 
 
1034
  if show_segment_details:
1035
  output_text += f"\nSegment Analysis for SMILES: {smiles}\n"
1036
  for i, segment in enumerate(segments):
 
1055
  else:
1056
  sequence_parts.append(residue)
1057
 
 
1058
  is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
1059
  sequence = f"cyclo({'-'.join(sequence_parts)})" if is_cyclic else '-'.join(sequence_parts)
1060
 
 
1063
  output_text += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
1064
  if is_cyclic:
1065
  output_text += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
 
1066
  output_text += "-" * 50 + "\n"
1067
 
1068
  return output_text, None, None
 
1145
  flagging_mode="never"
1146
  )
1147
 
 
1148
  # Launch the interface only when this file is executed as a script.
  # NOTE(review): share=True presumably asks Gradio for a public share link - confirm this is intended.
  if __name__ == "__main__":
1149
  iface.launch(share=True)