babilonczyk committed on
Commit
b48bc49
·
verified ·
1 Parent(s): 4feb434

Upload 7 files

Browse files
Files changed (7) hide show
  1. Dockerfile +14 -0
  2. LICENSE +21 -0
  3. README.md +61 -9
  4. main.py +44 -0
  5. pyproject.toml +10 -0
  6. requirements.txt +77 -0
  7. utils.py +18 -0
Dockerfile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image for serving the FastAPI app with uvicorn on port 7860.
FROM python:3.11-slim

# Create an unprivileged user (UID 1000) and run every later step as that user.
RUN useradd -m -u 1000 user
USER user
# Presumably where pip places console scripts (e.g. `uvicorn`) for a non-root
# install; added to PATH so CMD can find them. TODO confirm.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# requirements.txt is copied and installed before the rest of the sources so the
# dependency layer can be reused when only application code changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the application sources, owned by the unprivileged user.
COPY --chown=user . /app
# Serve the `app` object from main.py on all interfaces, port 7860.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Jan Piotrzkowski
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,64 @@
 
 
 
 
 
 
1
  ---
2
- title: Protein Similarity Api
3
- emoji: 🏆
4
- colorFrom: red
5
- colorTo: gray
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- short_description: About A lightweight FastAPI service for comparing protein se
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # protein-similarity-api
2
+
3
+ A lightweight, production-ready FastAPI service for comparing protein sequences using state-of-the-art transformer embeddings (ESM-2).
4
+
5
+ It provides a simple REST API to measure semantic similarity between protein sequences, returning a similarity score based on cosine distance in embedding space.
6
+
7
  ---
8
+
9
+ ## Why does this matter?
10
+
11
+ Protein similarity is at the heart of functional annotation, homology detection, and structure prediction. This API lets anyone (researcher, student, dev) compare protein sequences using powerful models.
12
+
 
 
 
13
  ---
14
 
15
+ ## Features
16
+
17
+ - Compare two protein sequences using [ESM-2](https://huggingface.co/facebook/esm2_t33_650M_UR50D)
18
+ - Returns the cosine similarity plus a five-level classification (from "very high similarity (clear homology)" down to "very low similarity (unrelated / random match)")
19
+ - Lightweight, deployable on free-tier platforms like Render or Railway
20
+ - Clean FastAPI structure, easy to extend (e.g. embeddings endpoint, caching)
21
+
22
+ ---
23
+
24
+ ## API Endpoints
25
+
26
+ ### `POST /compare`
27
+
28
+ Compare two sequences and get similarity:
29
+
30
+ #### Request:
31
+
32
+ ```json
33
+ {
34
+ "sequence_1": "MSSKVIFF...",
35
+ "sequence_2": "MTTRLIFF...",
36
+ "model": "esm_2_650m"
37
+ }
38
+ ```
39
+
40
+ #### Response:
41
+
42
+ ```json
43
+ {
44
+ "cosine_similarity": 0.57,
45
+ "classification": "moderate similarity (possible remote homolog)",
46
+ "model": "esm_2_650m"
47
+ }
48
+ ```
49
+
50
+ ## How to run it?
51
+
52
+ ```shell
53
+ python3 -m venv venv
54
+ source venv/bin/activate
55
+ pip install -r requirements.txt
56
+
57
+ uvicorn main:app --reload
58
+ ```
59
+
60
+ ## How to deploy it?
61
+
62
+ ```shell
63
+ docker build -t protein-similarity-api .
64
+ docker run -p 7860:7860 protein-similarity-api
65
+ ```
main.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from pydantic import BaseModel
3
+ from utils import compare_embeddings
4
+
5
+ from models.esm_2_650m import get_embedding as get_embedding_esm_2_650m
6
+
7
# ASGI application object; uvicorn is pointed at it as "main:app".
app = FastAPI()
8
+
9
+
10
class CompareRequest(BaseModel):
    # Request body for POST /compare.
    sequence_1: str  # first sequence handed to the embedding function
    sequence_2: str  # second sequence handed to the embedding function
    # Key looked up in model_mapping; unknown keys are answered with an
    # error payload by the /compare handler.
    model: str = "esm_2_650m"
14
+
15
+
16
# Maps the `model` value accepted by POST /compare to the function that turns
# a sequence string into an embedding.
model_mapping = {"esm_2_650m": get_embedding_esm_2_650m}
17
+
18
+
19
+ # ----------------------------------------------------------------------
20
@app.get("/")
def root():
    """Return a static message telling the caller to use POST /compare."""
    status_text = "API is running. Use POST /compare to compare protein sequences."
    return dict(message=status_text)
25
+
26
+
27
+ # ----------------------------------------------------------------------
28
@app.post("/compare")
def compare(request: CompareRequest):
    """Embed both sequences with the requested model and score their similarity.

    Args:
        request: Parsed request body carrying ``sequence_1``, ``sequence_2``
            and the ``model`` key.

    Returns:
        A dict with ``cosine_similarity`` (float), ``classification`` (str)
        and the ``model`` key that was used. If ``request.model`` is not a key
        of ``model_mapping``, a 400 response with the body
        ``{"error": "Model not supported"}`` is returned instead.
    """
    # Imported locally so this handler does not depend on a file-level import.
    from fastapi.responses import JSONResponse

    model = request.model

    # One lookup serves both the "is it supported?" check and the dispatch.
    embed = model_mapping.get(model)
    if embed is None:
        # Same payload as before, but flagged as a client error rather than
        # being returned with a success status code.
        return JSONResponse(
            status_code=400,
            content={"error": "Model not supported"},
        )

    emb1 = embed(request.sequence_1)
    emb2 = embed(request.sequence_2)

    similarity, classification = compare_embeddings(emb1, emb2)

    return {
        # float() turns a NumPy scalar into a plain JSON-serializable number.
        "cosine_similarity": float(similarity),
        "classification": classification,
        "model": model,
    }
pyproject.toml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Formatter / linter settings, all aligned on an 88-column line length.
[tool.black]
line-length = 88
target-version = ["py311"]

[tool.isort]
# Keeps isort's import layout compatible with black's formatting.
profile = "black"

# NOTE(review): flake8 does not read pyproject.toml on its own; this table only
# takes effect with a plugin such as Flake8-pyproject. Otherwise move these
# settings to .flake8, setup.cfg or tox.ini — confirm which setup is intended.
[tool.flake8]
max-line-length = 88
extend-ignore = ["E203", "W503"]
requirements.txt ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ annotated-types==0.7.0
2
+ anyio==4.9.0
3
+ black==25.1.0
4
+ certifi==2025.7.14
5
+ charset-normalizer==3.4.2
6
+ click==8.2.1
7
+ coverage==7.10.1
8
+ dnspython==2.7.0
9
+ email_validator==2.2.0
10
+ fastapi==0.116.1
11
+ fastapi-cli==0.0.8
12
+ fastapi-cloud-cli==0.1.5
13
+ filelock==3.18.0
14
+ flake8==7.3.0
15
+ fsspec==2025.7.0
16
+ h11==0.16.0
17
+ hf-xet==1.1.5
18
+ httpcore==1.0.9
19
+ httptools==0.6.4
20
+ httpx==0.28.1
21
+ huggingface-hub==0.34.3
22
+ idna==3.10
23
+ iniconfig==2.1.0
24
+ isort==6.0.1
25
+ itsdangerous==2.2.0
26
+ Jinja2==3.1.6
27
+ markdown-it-py==3.0.0
28
+ MarkupSafe==3.0.2
29
+ mccabe==0.7.0
30
+ mdurl==0.1.2
31
+ mpmath==1.3.0
32
+ mypy_extensions==1.1.0
33
+ networkx==3.5
34
+ numpy==2.3.2
35
+ orjson==3.11.1
36
+ packaging==25.0
37
+ pathspec==0.12.1
38
+ platformdirs==4.3.8
39
+ pluggy==1.6.0
40
+ pycodestyle==2.14.0
41
+ pydantic==2.11.7
42
+ pydantic-extra-types==2.10.5
43
+ pydantic-settings==2.10.1
44
+ pydantic_core==2.33.2
45
+ pyflakes==3.4.0
46
+ Pygments==2.19.2
47
+ pytest==8.4.1
48
+ pytest-cov==6.2.1
49
+ python-dotenv==1.1.1
50
+ python-multipart==0.0.20
51
+ PyYAML==6.0.2
52
+ regex==2024.11.6
53
+ requests==2.32.4
54
+ rich==14.1.0
55
+ rich-toolkit==0.14.9
56
+ rignore==0.6.4
57
+ safetensors==0.5.3
58
+ scipy==1.16.1
59
+ sentry-sdk==2.33.2
60
+ setuptools==80.9.0
61
+ shellingham==1.5.4
62
+ sniffio==1.3.1
63
+ starlette==0.47.2
64
+ sympy==1.14.0
65
+ tokenizers==0.21.4
66
+ torch==2.7.1
67
+ tqdm==4.67.1
68
+ transformers==4.54.0
69
+ typer==0.16.0
70
+ typing-inspection==0.4.1
71
+ typing_extensions==4.14.1
72
+ ujson==5.10.0
73
+ urllib3==2.5.0
74
+ uvicorn==0.35.0
75
+ uvloop==0.21.0
76
+ watchfiles==1.1.0
77
+ websockets==15.0.1
utils.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scipy.spatial.distance import cosine
2
+
3
+
4
def compare_embeddings(emb1, emb2):
    """Score two embeddings by cosine similarity and attach a coarse label.

    Args:
        emb1: First embedding, a 1-D array-like accepted by SciPy's ``cosine``.
        emb2: Second embedding, same length as ``emb1``.

    Returns:
        A ``(similarity, classification)`` tuple. ``similarity`` is
        ``1 - cosine_distance``; ``classification`` is one of five fixed
        labels chosen by threshold (0.85 / 0.70 / 0.50 / 0.30).

        If either embedding is empty or all zeros the cosine is undefined;
        the similarity is then reported as ``0.0`` (lowest label) instead of
        NaN, which could not be serialized to JSON.
    """
    # Local import: the rest of this module only imports SciPy.
    import numpy as np

    first = np.asarray(emb1)
    second = np.asarray(emb2)

    if not first.any() or not second.any():
        # Zero-norm (or empty) vector: avoid the 0/0 inside SciPy's cosine.
        similarity = 0.0
    else:
        similarity = 1 - cosine(first, second)

    # Ordered from strictest to loosest; the first threshold met wins.
    tiers = (
        (0.85, "very high similarity (clear homology)"),
        (0.70, "high similarity (likely homologous)"),
        (0.50, "moderate similarity (possible remote homolog)"),
        (0.30, "low similarity (likely not homologous)"),
    )
    classification = "very low similarity (unrelated / random match)"
    for threshold, label in tiers:
        if similarity >= threshold:
            classification = label
            break

    return similarity, classification