chemselfies-embed / README.md
gbyuvd's picture
Update README.md
7aedee9 verified
metadata
library_name: sentence-transformers
metrics:
  - pearson_cosine
  - spearman_cosine
  - pearson_manhattan
  - spearman_manhattan
  - pearson_euclidean
  - spearman_euclidean
  - pearson_dot
  - spearman_dot
  - pearson_max
  - spearman_max
pipeline_tag: sentence-similarity
tags:
  - sentence-transformers
  - sentence-similarity
  - feature-extraction
  - loss:MatryoshkaLoss
  - loss:CosineSimilarityLoss
  - chemistry
  - biology
  - drug-discovery
widget:
  - source_sentence: >-
      [N]  [C]  [=N]  [C]  [=N]  [C]  [=C]  [Ring1]  [=Branch1]  [S]  [C]  [=C] 
      [C]  [Branch1]  [=Branch2]  [C]  [=C]  [C]  [=C]  [C]  [=C]  [Ring1] 
      [=Branch1]  [=C]  [C]  [=C]  [Ring1]  [N]  [Ring1]  [#C]
    sentences:
      - >-
        [C]  [C]  [C]  [C]  [C@H1]  [Branch2]  [#Branch2]  [Branch2]  [N]  [C] 
        [=Branch1]  [C]  [=O]  [C@@H1]  [C]  [C]  [C]  [C]  [N]  [C] 
        [=Branch1]  [C]  [=O]  [C]  [C]  [C@H1]  [Branch2]  [=Branch1]  [S] 
        [N]  [C]  [=Branch1]  [C]  [=O]  [C@H1]  [Branch1]  [#Branch2]  [C] 
        [C]  [C]  [N]  [=C]  [Branch1]  [C]  [N]  [N]  [N]  [C]  [=Branch1] 
        [C]  [=O]  [C@H1]  [Branch1]  [#Branch1]  [C]  [C]  [Branch1]  [C]  [C] 
        [C]  [N]  [C]  [=Branch1]  [C]  [=O]  [C@H1]  [Branch1]  [#Branch1] 
        [C]  [C]  [Branch1]  [C]  [C]  [C]  [N]  [C]  [=Branch1]  [C]  [=O] 
        [C@H1]  [Branch1]  [=Branch2]  [C]  [C]  [=C]  [NH1]  [C]  [=N] 
        [Ring1]  [Branch1]  [N]  [C]  [=Branch1]  [C]  [=O]  [C@H1]  [Branch1] 
        [C]  [N]  [C]  [C]  [=C]  [C]  [=C]  [C]  [=C]  [Ring1]  [=Branch1] 
        [C]  [=Branch1]  [C]  [=O]  [N]  [C@@H1]  [Branch1]  [=Branch1]  [C] 
        [Branch1]  [C]  [C]  [C]  [C]  [=Branch1]  [C]  [=O]  [N]  [C@H1] 
        [Branch1]  [#Branch1]  [C]  [C]  [Branch1]  [C]  [C]  [C]  [C] 
        [=Branch1]  [C]  [=O]  [N]  [Ring2]  [=Branch1]  [=C]  [C]  [=Branch1] 
        [C]  [=O]  [N]  [C@@H1]  [Branch1]  [C]  [C]  [C]  [=Branch1]  [C] 
        [=O]  [N]  [C@@H1]  [Branch1]  [#Branch2]  [C]  [C]  [C]  [N]  [=C] 
        [Branch1]  [C]  [N]  [N]  [C]  [=Branch1]  [C]  [=O]  [N]  [C@@H1] 
        [Branch1]  [C]  [C]  [C]  [=Branch1]  [C]  [=O]  [N]  [C@@H1] 
        [Branch1]  [Branch2]  [C]  [C]  [C]  [=Branch1]  [C]  [=O]  [O]  [C] 
        [=Branch1]  [C]  [=O]  [N]  [C@@H1]  [Branch1]  [Branch2]  [C]  [C] 
        [C]  [Branch1]  [C]  [N]  [=O]  [C]  [=Branch1]  [C]  [=O]  [N] 
        [C@@H1]  [Branch1]  [#Branch1]  [C]  [C]  [Branch1]  [C]  [C]  [C]  [C] 
        [=Branch1]  [C]  [=O]  [N]  [C@@H1]  [Branch1]  [C]  [C]  [C] 
        [=Branch1]  [C]  [=O]  [N]  [C@@H1]  [Branch1]  [Branch2]  [C]  [C] 
        [C]  [Branch1]  [C]  [N]  [=O]  [C]  [=Branch1]  [C]  [=O]  [N] 
        [C@@H1]  [Branch1]  [Branch2]  [C]  [C]  [C]  [Branch1]  [C]  [N]  [=O] 
        [C]  [=Branch1]  [C]  [=O]  [N]  [C@@H1]  [Branch1]  [C]  [C]  [C] 
        [=Branch1]  [C]  [=O]  [N]  [C@@H1]  [Branch1]  [=Branch2]  [C]  [C] 
        [=C]  [NH1]  [C]  [=N]  [Ring1]  [Branch1]  [C]  [=Branch1]  [C]  [=O] 
        [N]  [C@@H1]  [Branch1]  [Ring1]  [C]  [O]  [C]  [=Branch1]  [C]  [=O] 
        [N]  [C@@H1]  [Branch1]  [#Branch1]  [C]  [C]  [Branch1]  [C]  [N] 
        [=O]  [C]  [=Branch1]  [C]  [=O]  [N]  [C@@H1]  [Branch1]  [#Branch2] 
        [C]  [C]  [C]  [N]  [=C]  [Branch1]  [C]  [N]  [N]  [C]  [=Branch1] 
        [C]  [=O]  [N]  [C@@H1]  [Branch1]  [=Branch1]  [C]  [C]  [C]  [C]  [N] 
        [C]  [=Branch1]  [C]  [=O]  [N]  [C@@H1]  [Branch1]  [#Branch1]  [C] 
        [C]  [Branch1]  [C]  [C]  [C]  [C]  [=Branch1]  [C]  [=O]  [N]  [C@@H1] 
        [Branch1]  [Branch1]  [C]  [C]  [C]  [C]  [C]  [=Branch1]  [C]  [=O] 
        [N]  [C@@H1]  [Branch1]  [Branch2]  [C]  [C]  [C]  [=Branch1]  [C] 
        [=O]  [O]  [C]  [=Branch1]  [C]  [=O]  [N]  [C@H1]  [Branch2]  [Ring1] 
        [Ring2]  [C]  [=Branch1]  [C]  [=O]  [N]  [C@H1]  [Branch1]  [=Branch1] 
        [C]  [Branch1]  [C]  [N]  [=O]  [C@@H1]  [Branch1]  [C]  [C]  [C]  [C] 
        [C@@H1]  [Branch1]  [C]  [C]  [C]  [C]
      - >-
        [C]  [C]  [=Branch1]  [C]  [=O]  [N]  [C@H1]  [C@H1]  [Branch2] 
        [Ring2]  [#Branch2]  [O]  [C@H1]  [C@@H1]  [Branch1]  [C]  [O]  [C@@H1] 
        [Branch1]  [Ring1]  [C]  [O]  [O]  [C@@H1]  [Branch2]  [Ring1] 
        [Branch1]  [O]  [C@H1]  [C@H1]  [Branch1]  [C]  [O]  [C@@H1]  [Branch1] 
        [C]  [O]  [C@H1]  [Branch1]  [C]  [O]  [O]  [C@@H1]  [Ring1] 
        [=Branch2]  [C]  [O]  [C@@H1]  [Ring2]  [Ring1]  [Branch1]  [O]  [O] 
        [C@H1]  [Branch1]  [Ring1]  [C]  [O]  [C@@H1]  [Branch1]  [C]  [O] 
        [C@@H1]  [Ring2]  [Ring1]  [S]  [O]  [C@@H1]  [O]  [C@H1]  [Branch1] 
        [Ring1]  [C]  [O]  [C@H1]  [Branch1]  [C]  [O]  [C@H1]  [Branch1]  [C] 
        [O]  [C@H1]  [Ring1]  [#Branch2]  [O]
      - >-
        [C]  [C]  [=C]  [C]  [=C]  [C]  [Branch2]  [Ring1]  [Ring1]  [N]  [C] 
        [=Branch1]  [C]  [=O]  [C]  [O]  [C]  [=C]  [C]  [=C]  [C]  [Branch1] 
        [C]  [C]  [=C]  [Ring1]  [#Branch1]  [=C]  [Ring2]  [Ring1]  [C]
  - source_sentence: >-
      [O]  [=C]  [Branch1]  [C]  [O]  [C]  [O]  [C]  [Branch2]  [O]  [=C]  [O] 
      [C]  [C]  [C]  [C]  [Branch1]  [C]  [C]  [C]  [Branch2]  [#Branch2] 
      [#Branch2]  [C]  [C]  [C]  [Branch1]  [C]  [C]  [C]  [Ring1]  [Branch2] 
      [C]  [C]  [=C]  [C]  [C]  [C]  [Branch1]  [C]  [C]  [Branch1]  [C]  [C] 
      [C]  [C]  [C]  [Ring1]  [Branch2]  [Branch2]  [#Branch1]  [S]  [C] 
      [=Branch1]  [C]  [=O]  [O]  [C]  [O]  [C]  [C]  [Branch1]  [C]  [O]  [C] 
      [Branch1]  [C]  [O]  [C]  [Ring1]  [Branch2]  [O]  [C]  [O]  [C] 
      [Branch1]  [C]  [C]  [C]  [Branch2]  [Ring1]  [S]  [O]  [C]  [O]  [C] 
      [C]  [Branch1]  [C]  [O]  [C]  [Branch1]  [P]  [O]  [C]  [O]  [C]  [C] 
      [Branch1]  [C]  [O]  [Branch1]  [Ring1]  [C]  [O]  [C]  [Ring1] 
      [Branch2]  [O]  [C]  [Ring1]  [P]  [O]  [C]  [Branch2]  [Ring1] 
      [#Branch2]  [O]  [C]  [O]  [C]  [Branch1]  [Branch2]  [C]  [O]  [C] 
      [=Branch1]  [C]  [=O]  [C]  [C]  [Branch1]  [C]  [O]  [C]  [Branch1]  [C] 
      [O]  [C]  [Ring1]  [=N]  [O]  [C]  [Ring2]  [Ring2]  [=Branch2]  [O]  [C] 
      [Branch1]  [C]  [O]  [C]  [C]  [Ring2]  [=Branch1]  [Branch2]  [Ring2] 
      [=Branch1]  [Ring1]  [C]  [C]  [Ring2]  [#Branch1]  [C]  [Branch1]  [C] 
      [C]  [C]  [C]  [Branch1]  [C]  [O]  [C]  [Branch1]  [C]  [O]  [C] 
      [Ring2]  [#Branch1]  [=N]  [O]
    sentences:
      - >-
        [C]  [C]  [=C]  [C]  [Branch1]  [C]  [C]  [=C]  [Branch1]  [S]  [N] 
        [C]  [=C]  [C]  [Branch1]  [C]  [F]  [=C]  [C]  [Branch1]  [C]  [F] 
        [=C]  [Ring1]  [Branch2]  [C]  [Branch1]  [C]  [C]  [=C]  [Ring1]  [P]
      - >-
        [N]  [C]  [=N]  [C]  [=N]  [C]  [=C]  [Ring1]  [=Branch1]  [N]  [=C] 
        [N]  [Ring1]  [Branch1]  [C@@H1]  [O]  [C@H1]  [Branch2]  [=Branch2] 
        [O]  [C]  [O]  [P]  [=Branch1]  [C]  [=O]  [Branch1]  [C]  [O]  [O] 
        [C@H1]  [C@@H1]  [Branch1]  [C]  [O]  [C@H1]  [Branch1]  [S]  [N]  [C] 
        [=C]  [Branch1]  [C]  [I]  [C]  [=Branch1]  [C]  [=O]  [NH1]  [C] 
        [Ring1]  [Branch2]  [=O]  [O]  [C@@H1]  [Ring1]  [#C]  [C]  [O]  [P] 
        [=Branch1]  [C]  [=O]  [Branch1]  [C]  [O]  [O]  [C@H1]  [C@@H1] 
        [Branch1]  [C]  [O]  [C@H1]  [Branch1]  [S]  [N]  [C]  [=C]  [Branch1] 
        [C]  [F]  [C]  [=Branch1]  [C]  [=O]  [NH1]  [C]  [Ring1]  [Branch2] 
        [=O]  [O]  [C@@H1]  [Ring1]  [#C]  [C]  [O]  [P]  [=Branch1]  [C]  [=O] 
        [Branch1]  [C]  [O]  [O]  [C@H1]  [C@@H1]  [Branch1]  [C]  [O]  [C@H1] 
        [Branch1]  [P]  [N]  [C]  [=N]  [C]  [=C]  [Branch1]  [C]  [N]  [N] 
        [=C]  [N]  [=C]  [Ring1]  [#Branch1]  [Ring1]  [#Branch2]  [O]  [C@@H1] 
        [Ring1]  [S]  [C]  [O]  [P]  [=Branch1]  [C]  [=O]  [Branch1]  [C]  [O] 
        [O]  [C@@H1]  [Branch1]  [C]  [O]  [C@H1]  [Ring2]  [=Branch1]  [N]  [O]
      - >-
        [C]  [C]  [C]  [Branch1]  [C]  [C]  [C]  [C]  [C]  [C]  [C]  [C]  [#C] 
        [/C]  [=C]  [\O]  [C]  [C@@H1]  [Branch1]  [C]  [O]  [C]  [O]
  - source_sentence: >-
      [O]  [=C]  [N]  [=C]  [Branch1]  [C]  [NH1-1]  [N]  [=C]  [C]  [Ring1] 
      [#Branch1]  [=N]  [C]  [NH1+1]  [Ring1]  [Branch1]  [C]  [=N]  [C]  [=C] 
      [N]  [Ring1]  [Branch1]  [C]  [C]  [=C]  [C]  [=C]  [Branch2]  [Ring2] 
      [=Branch2]  [O]  [C]  [=C]  [Ring1]  [=Branch1]  [C]  [Branch1]  [Ring1] 
      [O]  [C]  [=C]  [C]  [Branch2]  [Ring1]  [Ring2]  [O]  [C]  [Branch1] 
      [Ring2]  [C]  [Ring1]  [Branch1]  [C]  [Branch1]  [C]  [O]  [Branch1] 
      [C]  [C]  [C]  [C]  [C]  [C]  [O]  [=C]  [Ring2]  [Ring1]  [Ring1]  [C] 
      [O]  [C]  [=Branch1]  [C]  [=O]  [O]  [C]  [C]
    sentences:
      - >-
        [O]  [=C]  [Branch2]  [#Branch1]  [=Branch1]  [O]  [C]  [Branch1]  [C] 
        [C]  [C]  [C]  [=C]  [C]  [S]  [S]  [C]  [C]  [N]  [Branch2]  [Branch1] 
        [=N]  [C]  [=Branch1]  [C]  [=O]  [C]  [C]  [=C]  [N]  [=C]  [Branch1] 
        [C]  [N]  [C]  [=C]  [Ring1]  [#Branch1]  [C]  [Ring2]  [Ring1]  [C] 
        [C]  [Ring2]  [Ring1]  [#Branch1]  [O]  [C]  [=C]  [C]  [O]  [C] 
        [=Branch1]  [C]  [=O]  [C]  [=Branch1]  [N]  [=C]  [C]  [=Ring1] 
        [#Branch1]  [C]  [=C]  [Ring1]  [O]  [C]  [Ring1]  [=C]  [C]  [Branch1] 
        [Ring2]  [C]  [C]  [O]  [C]  [C]  [C]  [C]  [C]  [Ring1]  [=Branch2] 
        [C]  [C]  [Ring2]  [Ring2]  [=Branch1]  [C]  [N]  [C]  [C]  [=Branch1] 
        [Ring1]  [=C]  [C]  [C]
      - >-
        [N]  [C]  [=C]  [C]  [=C]  [Branch1]  [C]  [Cl]  [C]  [=C]  [Ring1] 
        [#Branch1]  [C]  [=O]
      - >-
        [O]  [=P]  [Branch1]  [C]  [O]  [Branch1]  [C]  [O]  [O]  [P] 
        [=Branch1]  [C]  [=O]  [Branch1]  [C]  [O]  [O]  [C]  [C]  [=C]  [C] 
        [C]  [C]  [C]  [Ring1]  [=Branch1]
  - source_sentence: >-
      [C]  [C]  [O]  [/N]  [=C]  [/C]  [N]  [Branch2]  [Ring2]  [C]  [C]  [=C] 
      [Branch1]  [C]  [F]  [C]  [=C]  [C]  [=Branch1]  [C]  [=O]  [C] 
      [Branch1]  [=Branch1]  [C]  [=Branch1]  [C]  [=O]  [O]  [=C]  [N]  [C] 
      [Ring1]  [#Branch2]  [=C]  [Ring1]  [#C]  [O]  [C]  [C@@H1]  [Ring1] 
      [=Branch1]  [C]  [C]  [C]  [Ring2]  [Ring1]  [Branch2]  [Branch1]  [C] 
      [C]  [C]  [N]  [C]  [C]  [C]  [Ring1]  [Ring1]
    sentences:
      - >-
        [O]  [=C]  [C@H1]  [Branch1]  [C]  [O]  [C@@H1]  [Branch1]  [C]  [O] 
        [C@H1]  [Branch1]  [C]  [O]  [C@H1]  [Branch1]  [C]  [S]  [C]  [O]
      - >-
        [C]  [C]  [O]  [C]  [=C]  [C]  [=C]  [C]  [=C]  [Ring1]  [=Branch1] 
        [C]  [N]  [C]  [C]  [N]  [C]  [=Branch1]  [C]  [=O]  [C]  [Ring1] 
        [#Branch1]  [C]  [C]  [=Branch1]  [C]  [=O]  [N]  [C]  [C]  [C]  [C] 
        [C]  [C]  [Ring1]  [=Branch1]
      - >-
        [C]  [C]  [=N]  [C]  [=C]  [N]  [Ring1]  [Branch1]  [C]  [C]  [N]  [C] 
        [C]  [O]  [C]  [C]  [Ring1]  [=Branch1]
  - source_sentence: >-
      [C]  [C]  [=C]  [C]  [=C]  [Branch2]  [Ring2]  [S]  [C]  [C]  [N]  [C] 
      [=Branch1]  [C]  [=O]  [C]  [=C]  [N]  [Branch1]  [C]  [C]  [C]  [=C] 
      [C]  [=C]  [Branch2]  [Ring1]  [Ring1]  [S]  [=Branch1]  [C]  [=O] 
      [=Branch1]  [C]  [=O]  [N]  [C]  [C]  [C]  [Branch1]  [C]  [C]  [C]  [C] 
      [Ring1]  [#Branch1]  [C]  [=C]  [Ring1]  [S]  [C]  [Ring2]  [Ring1] 
      [Branch1]  [=O]  [C]  [=C]  [Ring2]  [Ring1]  [P]
    sentences:
      - >-
        [C]  [N]  [C]  [=N]  [C]  [Branch2]  [Branch1]  [C]  [S]  [=Branch1] 
        [C]  [=O]  [=Branch1]  [C]  [=O]  [N]  [Branch1]  [#Branch2]  [C]  [C] 
        [C]  [C]  [N]  [C]  [C]  [Ring1]  [=Branch1]  [C]  [C]  [C]  [=C]  [C] 
        [Branch1]  [Ring1]  [C]  [#N]  [=C]  [C]  [=C]  [Ring1]  [Branch2]  [N] 
        [Branch1]  [#Branch2]  [C]  [C]  [=C]  [N]  [=C]  [N]  [Ring1] 
        [Branch1]  [C]  [C]  [Ring2]  [Ring1]  [Ring1]  [=C]  [Ring2]  [Ring2] 
        [Ring1]
      - >-
        [O]  [=C]  [Branch1]  [C]  [O]  [C]  [C]  [C]  [C]  [=C]  [C]  [=C] 
        [C]  [=C]  [C]  [=C]  [C]  [=C]
      - >-
        [C]  [C]  [C]  [C]  [C@@H1]  [C]  [N]  [Branch2]  [Ring2]  [#C]  [C@H1] 
        [Branch1]  [S]  [C]  [C]  [=C]  [C]  [=C]  [C]  [=C]  [C]  [=C]  [C] 
        [Ring1]  [=Branch1]  [=C]  [Ring1]  [#Branch2]  [C]  [N]  [C]  [C]  [C] 
        [C]  [Ring1]  [Branch1]  [C]  [N]  [C]  [Branch1]  [C]  [N]  [=N]  [C] 
        [C@@H1]  [Ring1]  [=Branch1]  [C]  [C]  [=C]  [C]  [=C]  [C]  [=C] 
        [Ring1]  [=Branch1]  [C]  [=Branch1]  [C]  [=N]  [N]  [Ring2]  [Ring2] 
        [=Branch1]  [C]  [C]  [Branch1]  [C]  [C]  [C]  [=C]  [C]  [=C] 
        [Branch1]  [#Branch1]  [C]  [C]  [Branch1]  [C]  [C]  [C]  [C]  [=C] 
        [Ring1]  [#Branch2]
model-index:
  - name: SentenceTransformer
    results:
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: combined test
          type: combined-test
        metrics:
          - type: pearson_cosine
            value: 0.960544
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.951972
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.878769
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.85873
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.881126
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.86117
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.841371
            name: Pearson Dot
          - type: spearman_dot
            value: 0.842071
            name: Spearman Dot
          - type: pearson_max
            value: 0.960544
            name: Pearson Max
          - type: spearman_max
            value: 0.951972
            name: Spearman Max
license: cc-by-nc-sa-4.0

ChemFIE-BED (ChemSELFIES Embedding)

ChemFIE-BED is a sentence-transformers model based on gbyuvd/chemselfies-base-bertmlm, fine-tuned on (for now) around 2 million pairs of valid molecules' SELFIES (Krenn et al. 2020) taken from COCONUTDB (Sorokina et al. 2021) and ChemBL34 (Zdrazil et al. 2023). It maps compounds' Self-Referencing Embedded Strings (SELFIES) into a 320-dimensional dense vector space and can potentially be used for chemical similarity, similarity search, classification, clustering, and more.

Although there is more data for the model to train on, the test metrics on unseen data of combined natural products and bioactives are already sufficient for now.

This model is the full implementation of Tom Aarsen's suggestions on the previous prototype model, now using my own pre-trained BERT and Matryoshka embeddings. For the latter, the model is trained at 320, 160, and 80 dimensions, so you can truncate the embeddings to any of these sizes depending on your needs.

For more information:

Model Details

Model Description

  • Model Type: Sentence Transformer
  • Base model: gbyuvd/chemselfies-base-bertmlm
  • Maximum Sequence Length: 512 tokens
  • Output Dimensionality: 320 dimensions
  • Similarity Function: Cosine Similarity
  • Pooling: Mean pooling
  • Training Dataset: SELFIES pairs generated from COCONUTDB and ChemBL34
  • Language: SELFIES
  • License: CC-BY-NC-SA 4.0

Full Model Architecture

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 320, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': False})
)

Usage

Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

pip install -U sentence-transformers

Then you can load this model and run inference.

from sentence_transformers import SentenceTransformer

# Specify preferred dimensions
# 320, 160, or 80
dimensions = 320

# Download the model from the 🤗 Hub
model = SentenceTransformer("gbyuvd/chembed-chemselfies-bed", truncate_dim=dimensions)

# Run inference
sentences = [
    '[C]  [C]  [=C]  [C]  [=C]  [Branch2]  [Ring2]  [S]  [C]  [C]  [N]  [C]  [=Branch1]  [C]  [=O]  [C]  [=C]  [N]  [Branch1]  [C]  [C]  [C]  [=C]  [C]  [=C]  [Branch2]  [Ring1]  [Ring1]  [S]  [=Branch1]  [C]  [=O]  [=Branch1]  [C]  [=O]  [N]  [C]  [C]  [C]  [Branch1]  [C]  [C]  [C]  [C]  [Ring1]  [#Branch1]  [C]  [=C]  [Ring1]  [S]  [C]  [Ring2]  [Ring1]  [Branch1]  [=O]  [C]  [=C]  [Ring2]  [Ring1]  [P]',
    '[O]  [=C]  [Branch1]  [C]  [O]  [C]  [C]  [C]  [C]  [=C]  [C]  [=C]  [C]  [=C]  [C]  [=C]  [C]  [=C]',
    '[C]  [N]  [C]  [=N]  [C]  [Branch2]  [Branch1]  [C]  [S]  [=Branch1]  [C]  [=O]  [=Branch1]  [C]  [=O]  [N]  [Branch1]  [#Branch2]  [C]  [C]  [C]  [C]  [N]  [C]  [C]  [Ring1]  [=Branch1]  [C]  [C]  [C]  [=C]  [C]  [Branch1]  [Ring1]  [C]  [#N]  [=C]  [C]  [=C]  [Ring1]  [Branch2]  [N]  [Branch1]  [#Branch2]  [C]  [C]  [=C]  [N]  [=C]  [N]  [Ring1]  [Branch1]  [C]  [C]  [Ring2]  [Ring1]  [Ring1]  [=C]  [Ring2]  [Ring2]  [Ring1]',
]

"""
0: CHEMBL1885710
1: CID78383937
2: CHEMBL234161
"""

embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 320]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]

Dataset

Dataset Reference Total Number of Pairs
COCONUTDB (Sorokina et al. 2021) 1,183,186
ChemBL34 (Part I) (Zdrazil et al. 2023) 1,064,858

Evaluation

Metrics

Semantic Similarity

Metric Value
pearson_cosine 0.9605
spearman_cosine 0.9520
pearson_manhattan 0.8788
spearman_manhattan 0.8587
pearson_euclidean 0.8811
spearman_euclidean 0.8612
pearson_dot 0.8414
spearman_dot 0.8421
pearson_max 0.9605
spearman_max 0.9520

Recommendations

To fully utilize the model's capabilities on a large dataset for similarity search, I'd recommend using Meta's FAISS for rapid results, or any document retrieval framework you prefer.

Training Details

Training Hyperparameters

  • optimizer : AdamW
  • eval_strategy: epoch
  • per_device_train_batch_size: 64
  • per_device_eval_batch_size: 32
  • weight_decay: 0.01
  • num_train_epochs: 1
  • warmup_ratio: 0.1
  • dataloader_num_workers: 8
  • Loss: MatryoshkaLoss with these parameters:
    {
        "loss": "CosineSimilarityLoss",
        "matryoshka_dims": [
            320,
            160,
            80
        ],
        "matryoshka_weights": [
            1,
            1,
            1
        ],
        "n_dims_per_step": -1
    }
    

Training Logs

Natural Products

Epoch Step Training Loss loss NPiso-base-test_spearman_cosine
0.2771 4099 0.0243 - -
0.5543 8198 0.0099 - -
0.8314 12297 0.0083 - -
1.0 14790 - 0.0074 0.9548

Combined I

Epoch Step Training Loss loss All-base-test_spearman_cosine
0.2737 4099 0.0111 - -
0.5474 8198 0.0086 - -
0.8212 12297 0.0077 - -
1.0 14975 - 0.0072 0.9516

Testing The Generated Embedding to Find Similar Molecules

Using Atolypene A as the query molecule, I used FAISS (Facebook AI Similarity Search) on the pre-embedded SELFIES representations of 0.5M molecules from COCONUTDB and HerbalDB to find the top-10 most similar molecules based on their cosine similarities. It took 50 minutes to generate the embeddings of said database with my laptop's NVIDIA GeForce 930M (using a batch size of 64).

Atolypene A: image/png

top 10 (returned in 3.9s with visualization):

image/png

image/png

Alternatively, you can take multiple inputs and average their embeddings to find the molecules most similar to them. For example, using 5 samples of MRSA antibiotics, including Vancomycin, Linezolid, Tigecycline, and Ceftobiprole:

image/png

Then query for similar molecules based on the averaged embedding:

image/png

Testing Generated Embeddings' Clusters

The plot below shows how the model's embeddings (at this stage) cluster different classes of compounds, compared to using MACCS fingerprints.

Using a perplexity of 20 over 5500 iterations. 2D:

image/png

image/png

3D:

image/png

image/png

For a simpler separation between two classes, active nAChR-a4b2 agonists vs. anticoagulants (perplexity = 5):

image/png

And for more data points and classes (perplexity = 7):

image/png

Framework Versions

  • Python: 3.9.13
  • Sentence Transformers: 3.0.1
  • Transformers: 4.42.4
  • PyTorch: 2.3.1+cu121
  • Accelerate: 0.33.0
  • Datasets: 2.20.0
  • Tokenizers: 0.19.1

Citation

BibTeX

ChemFIE-Base

@software{chemfie_basebertmlm,
  author = {GP Bayu},
  title = {{ChemFIE Base}: Pretraining A Lightweight BERT-like model on Molecular SELFIES},
  url = {https://huggingface.co/gbyuvd/chemselfies-base-bertmlm},
  version = {1.0},
  year = {2024},
}

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

MatryoshkaLoss

@misc{kusupati2024matryoshka,
    title={Matryoshka Representation Learning}, 
    author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
    year={2024},
    eprint={2205.13147},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}

COCONUTDB

@article{sorokina2021coconut,
  title={COCONUT online: Collection of Open Natural Products database},
  author={Sorokina, Maria and Merseburger, Peter and Rajan, Kohulan and Yirik, Mehmet Aziz and Steinbeck, Christoph},
  journal={Journal of Cheminformatics},
  volume={13},
  number={1},
  pages={2},
  year={2021},
  doi={10.1186/s13321-020-00478-9}
}

ChemBL34

@article{zdrazil2023chembl,
  title={The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods},
  author={Zdrazil, Barbara and Felix, Eloy and Hunter, Fiona and Manners, Emma J and Blackshaw, James and Corbett, Sybilla and de Veij, Marleen and Ioannidis, Harris and Lopez, David Mendez and Mosquera, Juan F and Magarinos, Maria Paula and Bosc, Nicolas and Arcila, Ricardo and Kizil{\"o}ren, Tevfik and Gaulton, Anna and Bento, A Patr{\'i}cia and Adasme, Melissa F and Monecke, Peter and Landrum, Gregory A and Leach, Andrew R},
  journal={Nucleic Acids Research},
  year={2023},
  volume={gkad1004},
  doi={10.1093/nar/gkad1004}
}

@misc{chembl34,
  title={ChemBL34},
  year={2023},
  doi={10.6019/CHEMBL.database.34}
}

Contact & Support My Work

G Bayu (gbyuvd@proton.me)

This project has been quite a journey for me. I’ve dedicated many hours to it, and I would like to improve myself, this model, and future projects. However, financial and computational constraints can be challenging.

If you find my work valuable and would like to support my journey, please consider supporting me here. Your support will help me cover costs for computational resources, data acquisition, and further development of this project. Any amount, big or small, is greatly appreciated and will enable me to continue learning and explore more.

Thank you for checking out this model. I am more than happy to receive any feedback, so that I can improve myself and the future models/projects I will be working on.