Spaces:

ChatterjeeLab
/

SMILES2PEPTIDE

Running

App Files Files Community

yinuozhang committed 4 days ago

Commit

0095ae9

1 Parent(s): b3dd269

synthesizable

Browse files

Files changed (1) hide show

app.py +325 -405

app.py CHANGED Viewed

@@ -17,22 +17,98 @@ from rdkit import Chem
 class PeptideAnalyzer:
     def __init__(self):
         self.bond_patterns = [
-            (r'OC\(=O\)', 'ester'),  # Ester bond
             (r'N\(C\)C\(=O\)', 'n_methyl'),  # N-methylated peptide bond
             (r'N[0-9]C\(=O\)', 'proline'),  # Proline peptide bond
             (r'NC\(=O\)', 'peptide'),  # Standard peptide bond
             (r'C\(=O\)N\(C\)', 'n_methyl_reverse'),  # Reverse N-methylated
             (r'C\(=O\)N[12]?', 'peptide_reverse')  # Reverse peptide bond
         ]
         # Three to one letter code mapping
         self.three_to_one = {
             'Ala': 'A', 'Cys': 'C', 'Asp': 'D', 'Glu': 'E',
             'Phe': 'F', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
             'Lys': 'K', 'Leu': 'L', 'Met': 'M', 'Asn': 'N',
             'Pro': 'P', 'Gln': 'Q', 'Arg': 'R', 'Ser': 'S',
-            'Thr': 'T', 'Val': 'V', 'Trp': 'W', 'Tyr': 'Y'
         }
     def is_peptide(self, smiles):
         """Check if the SMILES represents a peptide structure"""
         mol = Chem.MolFromSmiles(smiles)
@@ -73,14 +149,34 @@ class PeptideAnalyzer:
         is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
         return is_cyclic, peptide_cycles, aromatic_cycles
-    def split_on_bonds(self, smiles):
         positions = []
         used = set()
         # Find Gly pattern first
         gly_pattern = r'NCC\(=O\)'
         for match in re.finditer(gly_pattern, smiles):
             if not any(p in range(match.start(), match.end()) for p in used):
-                positions.append({
                     'start': match.start(),
                     'end': match.end(),
                     'type': 'gly',
@@ -91,56 +187,70 @@ class PeptideAnalyzer:
         for pattern, bond_type in self.bond_patterns:
             for match in re.finditer(pattern, smiles):
                 if not any(p in range(match.start(), match.end()) for p in used):
-                    positions.append({
                         'start': match.start(),
                         'end': match.end(),
                         'type': bond_type,
                         'pattern': match.group()
                     })
                     used.update(range(match.start(), match.end()))
-        # Sort by position
-        positions.sort(key=lambda x: x['start'])
         # Create segments
         segments = []
-        if positions:
-            # First segment
-            if positions[0]['start'] > 0:
                 segments.append({
-                    'content': smiles[0:positions[0]['start']],
-                    'bond_after': positions[0]['pattern']
                 })
-            # Process segments
-            for i in range(len(positions)-1):
-                current = positions[i]
-                next_pos = positions[i+1]
-                if current['type'] == 'gly':
-                    segments.append({
-                        'content': 'NCC(=O)',
-                        'bond_before': positions[i-1]['pattern'] if i > 0 else None,
-                        'bond_after': next_pos['pattern']
-                    })
                     segments.append({
-                        'content': smiles[current['start']+7:next_pos['start']],
-                        'bond_before': 'gly_bond',
-                        'bond_after': next_pos['pattern']
                     })
-                else:
-                    content = smiles[current['end']:next_pos['start']]
-                    if content:
-                        segments.append({
-                            'content': content,
-                            'bond_before': current['pattern'],
-                            'bond_after': next_pos['pattern']
-                        })
-            # Last segment
-            if positions[-1]['end'] < len(smiles):
                 segments.append({
-                    'content': smiles[positions[-1]['end']:],
-                    'bond_before': positions[-1]['pattern']
                 })
         return segments
     def clean_terminal_carboxyl(self, segment):
@@ -164,9 +274,23 @@ class PeptideAnalyzer:
     def identify_residue(self, segment):
         """Identify residue with Pro reconstruction"""
         # Only clean terminal carboxyl if this is the last segment
         content = self.clean_terminal_carboxyl(segment)
         mods = self.get_modifications(segment)
         # Proline (P) - flexible ring numbers
         if any([
             # Check for any ring number in bond patterns
@@ -174,8 +298,8 @@ class PeptideAnalyzer:
             any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
             for n in '123456789'
         ]) or any([(segment.get('bond_before', '').startswith(f'C(=O)N{n}') and 'CCC' in content and
-            any(f'CCC{n}' for n in '123456789'))
-            for n in '123456789'
         ]) or any([
             # Check ending patterns with any ring number
             (f'CCCN{n}' in content and content.endswith('=O') and
@@ -192,430 +316,226 @@ class PeptideAnalyzer:
         ]):
             return 'Pro', mods
-        # Tryptophan (W) - more specific indole pattern
         if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
         'c[nH]c' in content.replace(' ', ''):
             return 'Trp', mods
-        # Lysine (K)
         if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
             return 'Lys', mods
-        # Arginine (R)
         if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
             return 'Arg', mods
-        if ('NCC(=O)' in content) or (content == 'C'):
-            if segment.get('bond_before') and segment.get('bond_after'):
-                if ('C(=O)N' in segment['bond_before'] or 'C(=O)N(C)' in segment['bond_before']):
-                    return 'Gly', mods
-            elif segment.get('bond_before') and segment.get('bond_before').startswith('C(=O)N'):
                 return 'Gly', mods
         if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content or '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content or (('N[C@H](CCC(C)C)' in content or 'N[C@@H](CCC(C)C)' in content) and segment.get('bond_before') is None):
             return 'Leu', mods
-        if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content:
             return 'Thr', mods
         if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(Cc\d+ccccc\d+\)', content):
             return 'Phe', mods
-        if ('[C@H](C(C)C)' in content or
-            '[C@@H](C(C)C)' in content or
-            '[C@H]C(C)C' in content or
-            '[C@@H]C(C)C' in content
-            ):
-            if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]']):  # Still check not Leu
-                return 'Val', mods
         if any([
-            'CC[C@H](C)' in content,
-            'CC[C@@H](C)' in content,
             '[C@@H](CC)C' in content,
-            '[C@H](CC)C' in content,
-            'C(C)C[C@H]' in content and 'CC(C)C' not in content,
             'C(C)C[C@@H]' in content and 'CC(C)C' not in content
         ]):
             return 'Ile', mods
         if ('[C@H](C)' in content or '[C@@H](C)' in content):
             if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
                 return 'Ala', mods
-        # Tyrosine (Tyr) - 4-hydroxybenzyl side chain
         if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
             return 'Tyr', mods
-        # Serine (Ser) - Hydroxymethyl side chain
         if '[C@H](CO)' in content or '[C@@H](CO)' in content:
             if not ('C(C)O' in content or 'COC' in content):
                 return 'Ser', mods
-        # Threonine (Thr) - 1-hydroxyethyl side chain
-        if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content or '[C@@H](C)O' in content or '[C@H](C)O' in content:
-            return 'Thr', mods
-        # Cysteine (Cys) - Thiol side chain
         if '[C@H](CS)' in content or '[C@@H](CS)' in content:
             return 'Cys', mods
-        # Methionine (Met) - Methylthioethyl side chain
-        if ('CCSC' in content):
             return 'Met', mods
-        # Glutamine (Gln) - Carbamoylethyl side chain
         if (content == '[C@@H](CC' or content == '[C@H](CC' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CCC(=O)N' in content) or ('CCC(N)=O' in content):
             return 'Gln', mods
-        # Asparagine (Asn) - Carbamoylmethyl side chain
         if (content == '[C@@H](C' or content == '[C@H](C' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CC(=O)N' in content) or ('CCN(=O)' in content) or ('CC(N)=O' in content):
             return 'Asn', mods
-        # Glutamic acid (Glu) - Carboxyethyl side chain
         if ('CCC(=O)O' in content):
-            return 'Glu', mods
-        # Aspartic acid (Asp) - Carboxymethyl side chain
         if ('CC(=O)O' in content):
             return 'Asp', mods
-        # Arginine (Arg) - 3-guanidinopropyl side chain
-        if ('CCCNC(=N)N' in content):
-            return 'Arg', mods
-        # Histidine (His) - Imidazole side chain
         if re.search(r'Cc\d+c\[nH\]cn\d+', content) or re.search(r'Cc\d+cnc\[nH\]\d+', content):
             return 'His', mods
-        ############UAA
-        if '[C@H](COC(C)(C)C)' in content or '[C@@H](COC(C)(C)C)' in content:
-            return 'O-tBu', mods
-        if re.search(r'c\d+ccccc\d+', content):
-            if '[C@@H](c1ccccc1)' in content or '[C@H](c1ccccc1)' in content:
-                return '4', mods  # Base phenylglycine
-        if ('C[C@H](CCCC)' in content or 'C[C@@H](CCCC)' in content) and 'CC(C)' not in content:
-            return 'Nle', mods
-        # Ornithine (Orn) - 3-carbon chain with NH2
-        if ('C[C@H](CCCN)' in content or 'C[C@@H](CCCN)' in content) and 'CC(C)' not in content:
-            return 'Orn', mods
-        # 2-Naphthylalanine (2Nal)
-        if ('Cc3cc2ccccc2c3' in content):
-            return '2Nal', mods
-        # Cyclohexylalanine (Cha)
-        if 'N2CCCCC2' in content or 'CCCCC2' in content:
-            return 'Cha', mods
-        # Aminobutyric acid (Abu) - 2-carbon chain
-        if ('C[C@H](CC)' in content or 'C[C@@H](CC)' in content) and not any(p in content for p in ['CC(C)', 'CCCC', 'CCC(C)']):
-            return 'Abu', mods
-        # Pipecolic acid (Pip)
-        if ('N3CCCCC3' in content or 'CCCCC3' in content):
-            return 'Pip', mods
-        # Cyclohexylglycine (Chg) - direct cyclohexyl without CH2
-        if ('C[C@H](C1CCCCC1)' in content or 'C[C@@H](C1CCCCC1)' in content):
-            return 'Chg', mods
-        # 4-Fluorophenylalanine (4F-Phe)
-        if ('Cc2ccc(F)cc2' in content):
-            return '4F-Phe', mods
-        # 4-substituted phenylalanines
-        if 'Cc1ccc' in content:
-            if 'OMe' in content or 'OCc1ccc' in content:
-                return '0A1', mods  # 4-methoxy-Phenylalanine
-            elif 'Clc1ccc' in content:
-                return '200', mods  # 4-chloro-Phenylalanine
-            elif 'Brc1ccc' in content:
-                return '4BF', mods  # 4-Bromo-phenylalanine
-            elif 'C#Nc1ccc' in content:
-                return '4CF', mods  # 4-cyano-phenylalanine
-            elif 'Ic1ccc' in content:
-                return 'PHI', mods  # 4-Iodo-phenylalanine
-            elif 'Fc1ccc' in content:
-                return 'PFF', mods  # 4-Fluoro-phenylalanine
-        # Modified tryptophans
-        if 'c[nH]c2' in content:
-            if 'Oc2cccc2' in content:
-                return '0AF', mods  # 7-hydroxy-tryptophan
-            elif 'Fc2cccc2' in content:
-                return '4FW', mods  # 4-fluoro-tryptophan
-            elif 'Clc2cccc2' in content:
-                return '6CW', mods  # 6-chloro-tryptophan
-            elif 'Brc2cccc2' in content:
-                return 'BTR', mods  # 6-bromo-tryptophan
-            elif 'COc2cccc2' in content:
-                return 'MOT5', mods  # 5-Methoxy-tryptophan
-            elif 'Cc2cccc2' in content:
-                return 'MTR5', mods  # 5-Methyl-tryptophan
-        # Special amino acids
-        if 'CC(C)(C)[C@@H]' in content or 'CC(C)(C)[C@H]' in content:
-            return 'BUG', mods  # Tertleucine
-        if 'CCCNC(=N)N' in content:
-            return 'CIR', mods  # Citrulline
-        if '[SeH]' in content:
-            return 'CSE', mods  # Selenocysteine
-        if '[NH3]CC[C@@H]' in content or '[NH3]CC[C@H]' in content:
-            return 'DAB', mods  # Diaminobutyric acid
-        if 'C1CCCCC1' in content:
-            if 'C1CCCCC1[C@@H]' in content or 'C1CCCCC1[C@H]' in content:
-                return 'CHG', mods  # Cyclohexylglycine
-            elif 'C1CCCCC1C[C@@H]' in content or 'C1CCCCC1C[C@H]' in content:
-                return 'ALC', mods  # 3-cyclohexyl-alanine
-        # Naphthalene derivatives
-        if 'c1cccc2c1cccc2' in content:
-            if 'c1cccc2c1cccc2[C@@H]' in content or 'c1cccc2c1cccc2[C@H]' in content:
-                return 'NAL', mods  # 2-Naphthyl-alanine
-        # Heteroaromatic derivatives
-        if 'c1cncc' in content:
-            return 'PYR4', mods  # 3-(4-Pyridyl)-alanine
-        if 'c1cscc' in content:
-            return 'THA3', mods  # 3-(3-thienyl)-alanine
-        if 'c1nnc' in content:
-            return 'TRZ4', mods  # 3-(1,2,4-Triazol-1-yl)-alanine
-        # Modified serines and threonines
-        if 'OP(O)(O)O' in content:
-            if '[C@@H](COP' in content or '[C@H](COP' in content:
-                return 'SEP', mods  # phosphoserine
-            elif '[C@@H](OP' in content or '[C@H](OP' in content:
-                return 'TPO', mods  # phosphothreonine
-        # Specialized ring systems
-        if 'c1c2ccccc2cc2c1cccc2' in content:
-            return 'ANTH', mods  # 3-(9-anthryl)-alanine
-        if 'c1csc2c1cccc2' in content:
-            return 'BTH3', mods  # 3-(3-benzothienyl)-alanine
-        if '[C@]12C[C@H]3C[C@@H](C2)C[C@@H](C1)C3' in content:
-            return 'ADAM', mods  # Adamanthane
-        # Fluorinated derivatives
-        if 'FC(F)(F)' in content:
-            if 'CC(F)(F)F' in content:
-                return 'FLA', mods  # Trifluoro-alanine
-            if 'C(F)(F)F)c1' in content:
-                if 'c1ccccc1C(F)(F)F' in content:
-                    return 'TFG2', mods  # 2-(Trifluoromethyl)-phenylglycine
-                if 'c1cccc(c1)C(F)(F)F' in content:
-                    return 'TFG3', mods  # 3-(Trifluoromethyl)-phenylglycine
-                if 'c1ccc(cc1)C(F)(F)F' in content:
-                    return 'TFG4', mods  # 4-(Trifluoromethyl)-phenylglycine
-        # Multiple halogen patterns
-        if 'F' in content and 'c1' in content:
-            if 'c1ccc(c(c1)F)F' in content:
-                return 'F2F', mods  # 3,4-Difluoro-phenylalanine
-            if 'cc(F)cc(c1)F' in content:
-                return 'WFP', mods  # 3,5-Difluoro-phenylalanine
-        if 'Cl' in content and 'c1' in content:
-            if 'c1ccc(cc1Cl)Cl' in content:
-                return 'CP24', mods  # 2,4-dichloro-phenylalanine
-            if 'c1ccc(c(c1)Cl)Cl' in content:
-                return 'CP34', mods  # 3,4-dichloro-phenylalanine
-        # Hydroxy and amino derivatives
-        if 'O' in content and 'c1' in content:
-            if 'c1cc(O)cc(c1)O' in content:
-                return '3FG', mods  # (2s)-amino(3,5-dihydroxyphenyl)-ethanoic acid
-            if 'c1ccc(c(c1)O)O' in content:
-                return 'DAH', mods  # 3,4-Dihydroxy-phenylalanine
-        # Modified histidines
-        if 'c1cnc' in content:
-            if '[C@@H]1CN[C@@H](N1)F' in content:
-                return '2HF', mods  # 2-fluoro-l-histidine
-            if 'c1cnc([nH]1)F' in content:
-                return '2HF1', mods  # 2-fluoro-l-histidine variant
-            if 'c1c[nH]c(n1)F' in content:
-                return '2HF2', mods  # 2-fluoro-l-histidine variant
-        if '[SeH]' in content:
-            return 'CSE', mods  # Selenocysteine
-        if 'S' in content:
-            if 'CSCc1ccccc1' in content:
-                return 'BCS', mods  # benzylcysteine
-            if 'CCSC' in content:
-                return 'ESC', mods  # Ethionine
-            if 'CCS' in content:
-                return 'HCS', mods  # homocysteine
-        if 'CN=[N]=N' in content:
-            return 'AZDA', mods  # azido-alanine
-        if '[NH]=[C](=[NH2])=[NH2]' in content:
-            if 'CCC[NH]=' in content:
-                return 'AGM', mods  # 5-methyl-arginine
-            if 'CC[NH]=' in content:
-                return 'GDPR', mods  # 2-Amino-3-guanidinopropionic acid
-        # Others
-        if 'C1CCCC1' in content:
-            return 'CPA3', mods  # 3-Cyclopentyl-alanine
-        if 'C1CCCCC1' in content:
-            if 'CC1CCCCC1' in content:
-                return 'ALC', mods  # 3-cyclohexyl-alanine
-            else:
-                return 'CHG', mods  # Cyclohexylglycine
-        if 'CCC[C@@H]' in content or 'CCC[C@H]' in content:
-            return 'NLE', mods  # Norleucine
-        if 'CC[C@@H]' in content or 'CC[C@H]' in content:
-            if not any(x in content for x in ['CC(C)', 'COC', 'CN(']):
-                return 'ABA', mods  # 2-Aminobutyric acid
-        if 'CCON' in content:
-            return 'CAN', mods  # canaline
-        if '[C@@H]1C=C[C@@H](C=C1)' in content:
-            return 'ACZ', mods  # cis-amiclenomycin
-        if 'CCC(=O)[NH3]' in content:
-            return 'ONL', mods  # 5-oxo-l-norleucine
-        if 'c1ccncc1' in content:
-            return 'PYR4', mods  # 3-(4-Pyridyl)-alanine
-        if 'c1ccco1' in content:
-            return 'FUA2', mods  # (2-furyl)-alanine
-        if 'c1ccc' in content:
-            if 'c1ccc(cc1)c1ccccc1' in content:
-                return 'BIF', mods  # 4,4-biphenylalanine
-            if 'c1ccc(cc1)C(=O)c1ccccc1' in content:
-                return 'PBF', mods  # 4-benzoyl-phenylalanine
-            if 'c1ccc(cc1)C(C)(C)C' in content:
-                return 'TBP4', mods  # 4-tert-butyl-phenylalanine
-            if 'c1ccc(cc1)[C](=[NH2])=[NH2]' in content:
-                return '0BN', mods  # 4-carbamimidoyl-l-phenylalanine
-            if 'c1cccc(c1)[C](=[NH2])=[NH2]' in content:
-                return 'APM', mods  # m-amidinophenyl-3-alanine
-        if 'O' in content:
-            if '[C@H]([C@H](C)O)O' in content:
-                return 'ILX', mods  # 4,5-dihydroxy-isoleucine
-            if '[C@H]([C@@H](C)O)O' in content:
-                return 'ALO', mods  # Allo-threonine
-            if '[C@H](COP(O)(O)O)' in content:
-                return 'SEP', mods  # phosphoserine
-            if '[C@H]([C@@H](C)OP(O)(O)O)' in content:
-                return 'TPO', mods  # phosphothreonine
-            if '[C@H](c1ccc(O)cc1)O' in content:
-                return 'OMX', mods  # (betar)-beta-hydroxy-l-tyrosine
-            if '[C@H](c1ccc(c(Cl)c1)O)O' in content:
-                return 'OMY', mods  # (betar)-3-chloro-beta-hydroxy-l-tyrosine
-        if 'n1' in content:
-            if 'n1cccn1' in content:
-                return 'PYZ1', mods  # 3-(1-Pyrazolyl)-alanine
-            if 'n1nncn1' in content:
-                return 'TEZA', mods  # 3-(2-Tetrazolyl)-alanine
-            if 'c2c(n1)cccc2' in content:
-                return 'QU32', mods  # 3-(2-Quinolyl)-alanine
-            if 'c1cnc2c(c1)cccc2' in content:
-                return 'QU33', mods  # 3-(3-quinolyl)-alanine
-            if 'c1ccnc2c1cccc2' in content:
-                return 'QU34', mods  # 3-(4-quinolyl)-alanine
-            if 'c1ccc2c(c1)nccc2' in content:
-                return 'QU35', mods  # 3-(5-Quinolyl)-alanine
-            if 'c1ccc2c(c1)cncc2' in content:
-                return 'QU36', mods  # 3-(6-Quinolyl)-alanine
-            if 'c1cnc2c(n1)cccc2' in content:
-                return 'QX32', mods  # 3-(2-quinoxalyl)-alanine
-        if 'N' in content:
-            if '[NH3]CC[C@@H]' in content:
-                return 'DAB', mods  # Diaminobutyric acid
-            if '[NH3]C[C@@H]' in content:
-                return 'DPP', mods  # 2,3-Diaminopropanoic acid
-            if '[NH3]CCCCCC[C@@H]' in content:
-                return 'HHK', mods  # (2s)-2,8-diaminooctanoic acid
-            if 'CCC[NH]=[C](=[NH2])=[NH2]' in content:
-                return 'GBUT', mods  # 2-Amino-4-guanidinobutryric acid
-            if '[NH]=[C](=S)=[NH2]' in content:
-                return 'THIC', mods  # Thio-citrulline
-        if 'CC' in content:
-            if 'CCCC[C@@H]' in content:
-                return 'AHP', mods  # 2-Aminoheptanoic acid
-            if 'CCC([C@@H])(C)C' in content:
-                return 'I2M', mods  # 3-methyl-l-alloisoleucine
-            if 'CC[C@H]([C@@H])C' in content:
-                return 'IIL', mods  # Allo-Isoleucine
-            if '[C@H](CCC(C)C)' in content:
-                return 'HLEU', mods  # Homoleucine
-            if '[C@@H]([C@@H](C)O)C' in content:
-                return 'HLU', mods  # beta-hydroxyleucine
-        if '[C@@H]' in content:
-            if '[C@@H](C[C@@H](F))' in content:
-                return 'FGA4', mods  # 4-Fluoro-glutamic acid
-            if '[C@@H](C[C@@H](O))' in content:
-                return '3GL', mods  # 4-hydroxy-glutamic-acid
-            if '[C@@H](C[C@H](C))' in content:
-                return 'LME', mods  # (3r)-3-methyl-l-glutamic acid
-            if '[C@@H](CC[C@H](C))' in content:
-                return 'MEG', mods  # (3s)-3-methyl-l-glutamic acid
-        if 'S' in content:
-            if 'SCC[C@@H]' in content:
-                return 'HSER', mods  # homoserine
-            if 'SCCN' in content:
-                return 'SLZ', mods  # thialysine
-            if 'SC(=O)' in content:
-                return 'CSA', mods  # s-acetonylcysteine
-            if '[S@@](=O)' in content:
-                return 'SME', mods  # Methionine sulfoxide
-            if 'S(=O)(=O)' in content:
-                return 'OMT', mods  # Methionine sulfone
-        if 'C=' in content:
-            if 'C=C[C@@H]' in content:
-                return '2AG', mods  # 2-Allyl-glycine
-            if 'C=C[C@@H]' in content:
-                return 'LVG', mods  # vinylglycine
-            if 'C=Cc1ccccc1' in content:
-                return 'STYA', mods  # Styrylalanine
-        if '[C@@H]1Cc2c(C1)cccc2' in content:
-            return 'IGL', mods  # alpha-amino-2-indanacetic acid
-        if '[C](=[C](=O)=O)=O' in content:
-            return '26P', mods  # 2-amino-6-oxopimelic acid
-        if '[C](=[C](=O)=O)=C' in content:
-            return '2NP', mods  # l-2-amino-6-methylene-pimelic acid
-        if 'c1cccc2c1cc(O)cc2' in content:
-            return 'NAO1', mods  # 5-hydroxy-1-naphthalene
-        if 'c1ccc2c(c1)cc(O)cc2' in content:
-            return 'NAO2', mods  # 6-hydroxy-2-naphthalene
         return None, mods
     def get_modifications(self, segment):
-        """Get modifications based on bond types"""
         mods = []
-        if segment.get('bond_after'):
-            if 'N(C)' in segment['bond_after'] or segment['bond_after'].startswith('C(=O)N(C)'):
-                mods.append('N-Me')
-            if 'OC(=O)' in segment['bond_after']:
-                mods.append('O-linked')
         return mods
     def analyze_structure(self, smiles):
-        """Main analysis function with debug output"""
         print("\nAnalyzing structure:", smiles)
-        # Split into segments
-        segments = self.split_on_bonds(smiles)
         print("\nSegment Analysis:")
         sequence = []
         for i, segment in enumerate(segments):
             print(f"\nSegment {i}:")
-            print(f"Content: {segment['content']}")
             print(f"Bond before: {segment.get('bond_before', 'None')}")
             print(f"Bond after: {segment.get('bond_after', 'None')}")
@@ -628,11 +548,10 @@ class PeptideAnalyzer:
                 print(f"Identified as: {residue}")
                 print(f"Modifications: {mods}")
             else:
-                print(f"Warning: Could not identify residue in segment: {segment['content']}")
-        # Check if cyclic
-        is_cyclic, peptide_cycles, aromatic_cycles = self.is_cyclic(smiles)
         three_letter = '-'.join(sequence)
         one_letter = ''.join(self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence)
         if is_cyclic:
@@ -642,13 +561,14 @@ class PeptideAnalyzer:
         print(f"\nFinal sequence: {three_letter}")
         print(f"One-letter code: {one_letter}")
         print(f"Is cyclic: {is_cyclic}")
-        #print(f"Peptide cycles: {peptide_cycles}")
-        #print(f"Aromatic cycles: {aromatic_cycles}")
         return {
             'three_letter': three_letter,
             'one_letter': one_letter,
-            'is_cyclic': is_cyclic
         }
 def annotate_cyclic_structure(mol, sequence):

 class PeptideAnalyzer:
     def __init__(self):
         self.bond_patterns = [
+            #(r'OC\(=O\)', 'ester'),  # Ester bond
             (r'N\(C\)C\(=O\)', 'n_methyl'),  # N-methylated peptide bond
             (r'N[0-9]C\(=O\)', 'proline'),  # Proline peptide bond
             (r'NC\(=O\)', 'peptide'),  # Standard peptide bond
             (r'C\(=O\)N\(C\)', 'n_methyl_reverse'),  # Reverse N-methylated
             (r'C\(=O\)N[12]?', 'peptide_reverse')  # Reverse peptide bond
         ]
+        self.complex_residue_patterns = [
+            # Kpg - Lys(palmitoyl-Glu-OtBu)
+            (r'\[C[@]H\]\(CCCNC\(=O\)CCC\[C@@H\]\(NC\(=O\)CCCCCCCCCCCCCCCC\)C\(=O\)OC\(C\)\(C\)C\)', 'Kpg'),
+            (r'CCCCCCCCCCCCCCCCC\(=O\)N\[C@H\]\(CCCC\(=O\)NCCC\[C@@H\]', 'Kpg'),
+            (r'\[C[@]?H\]\(CSC\(c\d+ccccc\d+\)\(c\d+ccccc\d+\)c\d+ccc\(OC\)cc\d+\)', 'Cmt'),
+            (r'CSC\(c.*?c.*?OC\)', 'Cmt'),        # Core structure of Cys-Mmt group
+            (r'COc.*?ccc\(C\(SC', 'Cmt'),         # Start of Cmt in cyclic peptides
+            (r'c2ccccc2\)c2ccccc2\)cc', 'Cmt'),   # End of Cmt in cyclic peptides
+            # Glu(OAll)
+            (r'C=CCOC\(=O\)CC\[C@@H\]', 'Eal'),
+            #(r'COc\d+ccc\(C\(SC\[C@@H\]\d+.*?\)\(c\d+ccccc\d+\)c\d+ccccc\d+\)cc\d+', 'Cmt-cyclic'),
+            # Dtg - Asp(OtBu)-(Dmb)Gly
+            (r'CN\(Cc\d+ccc\(OC\)cc\d+OC\)C\(=O\)\[C@H\]\(CC\(=O\)OC\(C\)\(C\)C\)', 'Dtg'),
+            (r'C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
+            (r'N\[C@@H\]\(CC\(=O\)OC\(C\)\(C\)C\)C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
+        ]
         # Three to one letter code mapping
         self.three_to_one = {
             'Ala': 'A', 'Cys': 'C', 'Asp': 'D', 'Glu': 'E',
             'Phe': 'F', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
             'Lys': 'K', 'Leu': 'L', 'Met': 'M', 'Asn': 'N',
             'Pro': 'P', 'Gln': 'Q', 'Arg': 'R', 'Ser': 'S',
+            'Thr': 'T', 'Val': 'V', 'Trp': 'W', 'Tyr': 'Y',
+            'ala': 'a', 'cys': 'c', 'asp': 'd', 'glu': 'e',
+            'phe': 'f', 'gly': 'g', 'his': 'h', 'ile': 'i',
+            'lys': 'k', 'leu': 'l', 'met': 'm', 'asn': 'n',
+            'pro': 'p', 'gln': 'q', 'arg': 'r', 'ser': 's',
+            'thr': 't', 'val': 'v', 'trp': 'w', 'tyr': 'y', 'Cmt-cyclic': 'Ĉ',
+            'Aib': 'Ŷ', 'Dtg': 'Ĝ', 'Cmt': 'Ĉ', 'Eal': 'Ė', 'Nml': "Ŀ", 'Nma': 'Ṃ',
+            'Kpg': 'Ƙ', 'Tpb': 'Ṯ', 'Cyl': 'Ċ', 'Nle': 'Ł', 'Hph': 'Ĥ', 'Cys-Cys': 'CC', 'cys-cys': 'cc',
         }
+    def preprocess_complex_residues(self, smiles):
+        complex_positions = []
+        for pattern, residue_type in self.complex_residue_patterns:
+            for match in re.finditer(pattern, smiles):
+                # Only add if this position doesn't overlap with existing matches
+                if not any(pos['start'] <= match.start() < pos['end'] or
+                          pos['start'] < match.end() <= pos['end'] for pos in complex_positions):
+                    complex_positions.append({
+                        'start': match.start(),
+                        'end': match.end(),
+                        'type': residue_type,
+                        'pattern': match.group()
+                    })
+        # Sort by position (to handle potential overlapping matches)
+        complex_positions.sort(key=lambda x: x['start'])
+        if not complex_positions:
+            return smiles, []
+        # Build a new SMILES string, protecting complex residues
+        preprocessed_smiles = smiles
+        offset = 0  # Track offset from replacements
+        protected_residues = []
+        for pos in complex_positions:
+            start = pos['start'] + offset
+            end = pos['end'] + offset
+            complex_part = preprocessed_smiles[start:end]
+            if not ('[C@H]' in complex_part or '[C@@H]' in complex_part):
+                continue
+            placeholder = f"COMPLEX_RESIDUE_{len(protected_residues)}"
+            preprocessed_smiles = preprocessed_smiles[:start] + placeholder + preprocessed_smiles[end:]
+            offset += len(placeholder) - (end - start)
+            protected_residues.append({
+                'placeholder': placeholder,
+                'type': pos['type'],
+                'content': complex_part
+            })
+            #print(f"Protected {pos['type']}: {complex_part[:20]}... as {placeholder}")
+        return preprocessed_smiles, protected_residues
     def is_peptide(self, smiles):
         """Check if the SMILES represents a peptide structure"""
         mol = Chem.MolFromSmiles(smiles)
         is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
         return is_cyclic, peptide_cycles, aromatic_cycles
+    def split_on_bonds(self, smiles, protected_residues=None):
+        """Split SMILES into segments based on peptide bonds, with improved handling of protected residues"""
         positions = []
         used = set()
+        # First, handle protected complex residues if any
+        if protected_residues:
+            for residue in protected_residues:
+                match = re.search(residue['placeholder'], smiles)
+                if match:
+                    positions.append({
+                        'start': match.start(),
+                        'end': match.end(),
+                        'type': 'complex',
+                        'pattern': residue['placeholder'],
+                        'residue_type': residue['type'],
+                        'content': residue['content']
+                    })
+                    used.update(range(match.start(), match.end()))
+        # Find all peptide bonds
+        bond_positions = []
         # Find Gly pattern first
         gly_pattern = r'NCC\(=O\)'
         for match in re.finditer(gly_pattern, smiles):
             if not any(p in range(match.start(), match.end()) for p in used):
+                bond_positions.append({
                     'start': match.start(),
                     'end': match.end(),
                     'type': 'gly',
         for pattern, bond_type in self.bond_patterns:
             for match in re.finditer(pattern, smiles):
                 if not any(p in range(match.start(), match.end()) for p in used):
+                    bond_positions.append({
                         'start': match.start(),
                         'end': match.end(),
                         'type': bond_type,
                         'pattern': match.group()
                     })
                     used.update(range(match.start(), match.end()))
+        bond_positions.sort(key=lambda x: x['start'])
+        # Combine complex residue positions and bond positions
+        all_positions = positions + bond_positions
+        all_positions.sort(key=lambda x: x['start'])
         # Create segments
         segments = []
+        if all_positions and all_positions[0]['start'] > 0:
+            segments.append({
+                'content': smiles[0:all_positions[0]['start']],
+                'bond_after': all_positions[0]['pattern'] if all_positions[0]['type'] != 'complex' else None,
+                'complex_after': all_positions[0]['pattern'] if all_positions[0]['type'] == 'complex' else None
+            })
+        for i in range(len(all_positions)-1):
+            current = all_positions[i]
+            next_pos = all_positions[i+1]
+            if current['type'] == 'complex':
                 segments.append({
+                    'content': current['content'],
+                    'bond_before': all_positions[i-1]['pattern'] if i > 0 and all_positions[i-1]['type'] != 'complex' else None,
+                    'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None,
+                    'complex_type': current['residue_type']
                 })
+            elif current['type'] == 'gly':
+                segments.append({
+                    'content': 'NCC(=O)',
+                    'bond_before': all_positions[i-1]['pattern'] if i > 0 and all_positions[i-1]['type'] != 'complex' else None,
+                    'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None
+                })
+            else:
+                # Only create segment if there's content between this bond and next position
+                content = smiles[current['end']:next_pos['start']]
+                if content and next_pos['type'] != 'complex':
                     segments.append({
+                        'content': content,
+                        'bond_before': current['pattern'],
+                        'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None
                     })
+        if all_positions and all_positions[-1]['end'] < len(smiles):
+            if all_positions[-1]['type'] == 'complex':
                 segments.append({
+                    'content': all_positions[-1]['content'],
+                    'bond_before': all_positions[-2]['pattern'] if len(all_positions) > 1 and all_positions[-2]['type'] != 'complex' else None,
+                    'complex_type': all_positions[-1]['residue_type']
                 })
+            else:
+                segments.append({
+                    'content': smiles[all_positions[-1]['end']:],
+                    'bond_before': all_positions[-1]['pattern']
+                })
         return segments
     def clean_terminal_carboxyl(self, segment):
     def identify_residue(self, segment):
         """Identify residue with Pro reconstruction"""
         # Only clean terminal carboxyl if this is the last segment
+        if 'complex_type' in segment:
+            return segment['complex_type'], []
         content = self.clean_terminal_carboxyl(segment)
         mods = self.get_modifications(segment)
+        if content.startswith('COc1ccc(C(SC[C@@H]'):
+            print("DIRECT MATCH: Found Cmt at beginning")
+            return 'Cmt', mods
+        if '[C@@H]3CCCN3C2=O)(c2ccccc2)c2ccccc2)cc' in content:
+            print("DIRECT MATCH: Found Pro at end")
+            return 'Pro', mods
+        # Eal - Glu(OAll)
+        if 'CCC(=O)OCC=C' in content or 'CC(=O)OCC=C' in content or 'C=CCOC(=O)CC' in content:
+            return 'Eal', mods
         # Proline (P) - flexible ring numbers
         if any([
             # Check for any ring number in bond patterns
             any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
             for n in '123456789'
         ]) or any([(segment.get('bond_before', '').startswith(f'C(=O)N{n}') and 'CCC' in content and
+                any(f'CCC{n}' for n in '123456789'))
+                for n in '123456789'
         ]) or any([
             # Check ending patterns with any ring number
             (f'CCCN{n}' in content and content.endswith('=O') and
         ]):
             return 'Pro', mods
+        # D-Proline (p)
+        if ('N1[C@H](CCC1)' in content):
+            return 'pro', mods
+        # Tryptophan (W)
         if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
         'c[nH]c' in content.replace(' ', ''):
+            if '[C@H](CC' in content:  # D-form
+                return 'trp', mods
             return 'Trp', mods
+        # Lysine (K) - both patterns
         if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
+            if '[C@H](CCCCN)' in content:  # D-form
+                return 'lys', mods
             return 'Lys', mods
+        # Arginine (R) - both patterns
         if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
+            if '[C@H](CCCNC(=N)N)' in content:  # D-form
+                return 'arg', mods
             return 'Arg', mods
+        if content == 'C' and segment.get('bond_before') and segment.get('bond_after'):
+            # If it's surrounded by peptide bonds, it's almost certainly Gly
+            if ('C(=O)N' in segment['bond_before'] or 'NC(=O)' in segment['bond_before'] or 'N(C)C(=O)' in segment['bond_before']) and \
+               ('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
                 return 'Gly', mods
+        # Leucine patterns (L/l)
         if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content or '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content or (('N[C@H](CCC(C)C)' in content or 'N[C@@H](CCC(C)C)' in content) and segment.get('bond_before') is None):
+            if '[C@H](CC(C)C)' in content or 'CC(C)C[C@H]' in content:  # D-form
+                return 'leu', mods
             return 'Leu', mods
+        # Threonine patterns (T/t)
+        if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content or '[C@@H]([C@H](C)O)' in content or '[C@H]([C@@H](C)O)' in content:
+            # Check both stereochemistry patterns
+            if '[C@H]([C@@H](C)O)' in content:  # D-form
+                return 'thr', mods
             return 'Thr', mods
+        if re.search(r'\[C@H\]\(CCc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(CCc\d+ccccc\d+\)', content):
+            return 'Hph', mods
+        # Phenylalanine patterns (F/f)
         if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(Cc\d+ccccc\d+\)', content):
+            if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content):  # D-form
+                return 'phe', mods
             return 'Phe', mods
+        if ('CC(C)[C@@H]' in content or 'CC(C)[C@H]' in content or
+            '[C@H](C(C)C)' in content or '[C@@H](C(C)C)' in content or
+            'C(C)C[C@H]' in content or 'C(C)C[C@@H]' in content):
+            # Make sure it's not leucine
+            if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]', 'CCC(=O)']):
+                if '[C@H]' in content and not '[C@@H]' in content:  # D-form
+                    return 'val', mods
+                return 'Val', mods
+        # Isoleucine patterns (I/i)
         if any([
+            'CC[C@@H](C)' in content, '[C@@H](C)CC' in content,
             '[C@@H](CC)C' in content,
             'C(C)C[C@@H]' in content and 'CC(C)C' not in content
         ]):
+            if '[C@H]([C@@H](CC)C)' in content or '[C@H](CC)C' in content:  # D-form
+                return 'ile', mods
+            elif '[C@H](C)CC' in content or '[C@H](CC)C' in content or 'CC[C@H](C)' in content:
+                return 'ile', mods
+            elif 'C(C)C[C@H]' in content and 'CC(C)C' not in content:
+                return 'ile', mods
             return 'Ile', mods
+        # Alanine patterns (A/a)
         if ('[C@H](C)' in content or '[C@@H](C)' in content):
             if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
+                if '[C@H](C)' in content:  # D-form
+                    return 'ala', mods
                 return 'Ala', mods
+        # Tyrosine patterns (Y/y)
         if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
+            if '[C@H](Cc1ccc(O)cc1)' in content:  # D-form
+                return 'tyr', mods
             return 'Tyr', mods
+        # Serine patterns (S/s)
         if '[C@H](CO)' in content or '[C@@H](CO)' in content:
             if not ('C(C)O' in content or 'COC' in content):
+                if '[C@H](CO)' in content:  # D-form
+                    return 'ser', mods
                 return 'Ser', mods
+        if 'CSSC' in content:
+            if re.search(r'\[C@@H\].*CSSC.*\[C@@H\]', content) or re.search(r'\[C@H\].*CSSC.*\[C@H\]', content):
+                if '[C@H]' in content and not '[C@@H]' in content:  # D-form
+                    return 'cys-cys', mods
+                return 'Cys-Cys', mods
+            if '[C@@H](N)CSSC' in content or '[C@H](N)CSSC' in content:
+                if '[C@H](N)CSSC' in content:  # D-form
+                    return 'cys-cys', mods
+                return 'Cys-Cys', mods
+            if 'CSSC[C@@H](C(=O)O)' in content or 'CSSC[C@H](C(=O)O)' in content:
+                if 'CSSC[C@H](C(=O)O)' in content:  # D-form
+                    return 'cys-cys', mods
+                return 'Cys-Cys', mods
+        # Cysteine patterns (C/c)
         if '[C@H](CS)' in content or '[C@@H](CS)' in content:
+            if '[C@H](CS)' in content:  # D-form
+                return 'cys', mods
             return 'Cys', mods
+        # Methionine patterns (M/m)
+        if ('CCSC' in content) or ("CSCC" in content):
+            if '[C@H](CCSC)' in content:  # D-form
+                return 'met', mods
+            elif '[C@H]' in content:
+                return 'met', mods
             return 'Met', mods
+        # Glutamine patterns (Q/q)
         if (content == '[C@@H](CC' or content == '[C@H](CC' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CCC(=O)N' in content) or ('CCC(N)=O' in content):
+            if '[C@H](CCC(=O)N)' in content:  # D-form
+                return 'gln', mods
             return 'Gln', mods
+        # Asparagine patterns (N/n)
         if (content == '[C@@H](C' or content == '[C@H](C' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CC(=O)N' in content) or ('CCN(=O)' in content) or ('CC(N)=O' in content):
+            if '[C@H](CC(=O)N)' in content:  # D-form
+                return 'asn', mods
             return 'Asn', mods
+        # Glutamic acid patterns (E/e)
         if ('CCC(=O)O' in content):
+            if '[C@H](CCC(=O)O)' in content:  # D-form
+                return 'glu', mods
+            return 'Glu', mods
+        # Aspartic acid patterns (D/d)
         if ('CC(=O)O' in content):
+            if '[C@H](CC(=O)O)' in content:  # D-form
+                return 'asp', mods
             return 'Asp', mods
         if re.search(r'Cc\d+c\[nH\]cn\d+', content) or re.search(r'Cc\d+cnc\[nH\]\d+', content):
+            if '[C@H]' in content:  # D-form
+                return 'his', mods
             return 'His', mods
+        if 'C2(CCCC2)' in content or 'C1(CCCC1)' in content or re.search(r'C\d+\(CCCC\d+\)', content):
+            return 'Cyl', mods
+        if ('N[C@@H](CCCC)' in content or '[C@@H](CCCC)' in content or 'CCCC[C@@H]' in content or
+            'N[C@H](CCCC)' in content or '[C@H](CCCC)' in content) and 'CC(C)' not in content:
+            return 'Nle', mods
+        # Aib - alpha-aminoisobutyric acid (2-aminoisobutyric acid)
+        if 'C(C)(C)(N)' in content or 'C(C)(C)' in content or 'C(C)(C)' in content and ('C(=O)N' in segment['bond_before'] or 'NC(=O)' in segment['bond_before'] or 'N(C)C(=O)' in segment['bond_before']) and \
+               ('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
+            return 'Aib', mods
+        # Dtg - Asp(OtBu)-(Dmb)Gly
+        if 'CC(=O)OC(C)(C)C' in content and 'CC1=C(C=C(C=C1)OC)OC' in content:
+            return 'Dtg', mods
+        # Kpg - Lys(palmitoyl-Glu-OtBu)
+        if 'CCCNC(=O)' in content and 'CCCCCCCCCCCC' in content:
+            return 'Kpg', mods
+        # Tpb - Thr(PO(OBzl)OH)
+        if re.search(r'\[C[@]?H\]\(C\)OP\(=O\)\(O\)', content) or 'OP(=O)(O)OCC' in content:
+            return 'Tpb', mods
         return None, mods
     def get_modifications(self, segment):
         """Return the modification tags that apply to one backbone segment.

         Only N-methylation is detected. The tag ``'N-Me'`` is reported at
         most once, no matter how many of the checks below match.

         Args:
             segment: Mapping describing one segment. Only the ``'content'``
                 and ``'bond_after'`` entries are read; each may be absent,
                 ``None`` or empty, in which case it is treated as an empty
                 string instead of raising ``KeyError``/``TypeError``.

         Returns:
             ``['N-Me']`` when an N-methyl marker is found in the following
             bond or in the segment text, otherwise an empty list.
         """
         bond_after = segment.get('bond_after') or ''
         content = segment.get('content') or ''

         # A bare 'N(C)' in the following bond covers both the forward
         # 'N(C)C(=O)' and the reverse 'C(=O)N(C)' bond spellings, so no
         # separate startswith() test is needed.
         if 'N(C)' in bond_after:
             return ['N-Me']

         # Inside the segment text the methylated nitrogen must be followed by
         # a carbonyl, written either inline or through ring closure 1. A
         # substring test also covers the case where the marker is a suffix.
         if 'N(C)C(=O)' in content or 'N(C)C1=O' in content:
             return ['N-Me']

         return []
     def analyze_structure(self, smiles):
+        """Main analysis function with preprocessing for complex residues"""
         print("\nAnalyzing structure:", smiles)
+        # Pre-process to identify complex residues first
+        preprocessed_smiles, protected_residues = self.preprocess_complex_residues(smiles)
+        if protected_residues:
+            print(f"Identified {len(protected_residues)} complex residues during pre-processing")
+            for i, residue in enumerate(protected_residues):
+                print(f"Complex residue {i+1}: {residue['type']}")
+        # Check if it's cyclic
+        is_cyclic, peptide_cycles, aromatic_cycles = self.is_cyclic(smiles)
+        segments = self.split_on_bonds(preprocessed_smiles, protected_residues)
         print("\nSegment Analysis:")
         sequence = []
         for i, segment in enumerate(segments):
             print(f"\nSegment {i}:")
+            print(f"Content: {segment.get('content', 'None')}")
             print(f"Bond before: {segment.get('bond_before', 'None')}")
             print(f"Bond after: {segment.get('bond_after', 'None')}")
                 print(f"Identified as: {residue}")
                 print(f"Modifications: {mods}")
             else:
+                print(f"Warning: Could not identify residue in segment: {segment.get('content', 'None')}")
         three_letter = '-'.join(sequence)
         one_letter = ''.join(self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence)
         if is_cyclic:
         print(f"\nFinal sequence: {three_letter}")
         print(f"One-letter code: {one_letter}")
         print(f"Is cyclic: {is_cyclic}")
+        print(f"Peptide cycles: {peptide_cycles}")
+        print(f"Aromatic cycles: {aromatic_cycles}")
         return {
             'three_letter': three_letter,
             'one_letter': one_letter,
+            'is_cyclic': is_cyclic,
+            'residues': sequence
         }
 def annotate_cyclic_structure(mol, sequence):