File size: 4,282 Bytes
b71fded
 
 
 
a9df227
dc4a259
a9df227
 
f201229
a9df227
b859fb5
 
caa89c2
b859fb5
dc4a259
a9df227
dc4a259
a9df227
 
b859fb5
a9df227
 
b71fded
b859fb5
caa89c2
 
dc4a259
a9df227
 
 
b71fded
 
 
a9df227
d60b7c4
b71fded
d60b7c4
b71fded
 
a9df227
d60b7c4
b859fb5
 
a9df227
d60b7c4
b859fb5
caa89c2
b71fded
dc4a259
a9df227
b71fded
 
a9df227
 
b71fded
caa89c2
a9df227
dc4a259
a9df227
d60b7c4
a9df227
2a861c0
a9df227
a2e895f
d60b7c4
a9df227
dc4a259
 
b859fb5
a2e895f
caa89c2
b71fded
a9df227
f28a058
d60b7c4
 
dc4a259
f28a058
b71fded
c3ec480
a9df227
2a861c0
b71fded
d60b7c4
f28a058
a9df227
 
dc4a259
 
 
 
 
 
 
 
 
a9df227
b71fded
 
c3ec480
caa89c2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import gradio as gr
import sqlite3
import json
import numpy as np
from numpy.linalg import norm
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer
import os

# Get Hugging Face Token from Environment Variables.
# This check runs at import time, so the app refuses to start without a token
# even when the database file is already present locally.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("Missing Hugging Face API token. Please set HF_TOKEN as an environment variable in Hugging Face Secrets.")

# Load the Nomic-Embed Model from Hugging Face.
# The model is loaded once at import time and shared by every request.
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run locally; keep the model id pinned to a trusted source.
EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)

# Download database from Hugging Face Datasets if it does not exist
db_filename = "hpo_genes.db"
db_repo = "UoS-HGIG/hpo_genes"
db_path = os.path.join(os.getcwd(), db_filename)

if not os.path.exists(db_path):
    # No local copy in the working directory: fetch it from the dataset repo.
    # db_path is rebound to the path returned by hf_hub_download.
    # `token` is the supported keyword; `use_auth_token` is deprecated.
    db_path = hf_hub_download(repo_id=db_repo, filename=db_filename, repo_type="dataset", token=HF_TOKEN)


def find_best_hpo_match(finding, region, threshold):
    """Return the HPO term whose stored embedding best matches the query.

    The query text is ``"<finding> in <region>"`` when ``region`` is truthy,
    otherwise just ``finding``. It is embedded with the module-level
    ``embedder`` and compared by cosine similarity against every row of the
    ``hpo_embeddings`` table in the SQLite database at ``db_path``.

    Args:
        finding: Free-text pathological finding.
        region: Optional anatomical region; falsy values are ignored.
        threshold: Minimum cosine similarity required to accept the best row.

    Returns:
        ``{"hpo_id": ..., "hpo_term": ...}`` for the highest-scoring row, or
        ``None`` when the table is empty or the best score is below
        ``threshold``.
    """
    query_text = f"{finding} in {region}" if region else finding
    query_embedding = embedder.encode(query_text)
    # The query norm is identical for every row, so compute it once.
    query_norm = norm(query_embedding)

    best_match, best_score = None, -1

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT hpo_id, hpo_name, embedding FROM hpo_embeddings")

        # Iterate the cursor directly so rows are streamed instead of the
        # whole table being materialised in memory by fetchall().
        for hpo_id, hpo_name, embedding_str in cursor:
            # Embeddings are stored as JSON text and decoded per row.
            hpo_embedding = np.array(json.loads(embedding_str))
            denominator = query_norm * norm(hpo_embedding)
            if denominator == 0:
                # A zero-length vector has no direction; cosine similarity is
                # undefined, so the row can never be the best match.
                continue
            similarity = np.dot(query_embedding, hpo_embedding) / denominator

            if similarity > best_score:
                best_score = similarity
                best_match = {"hpo_id": hpo_id, "hpo_term": hpo_name}
    finally:
        # Always release the connection, even if decoding or scoring raises.
        conn.close()

    return best_match if best_score >= threshold else None


def get_genes_for_hpo(hpo_id):
    """Return the genes annotated to ``hpo_id`` in the ``hpo_gene`` table.

    Args:
        hpo_id: HPO identifier to look up.

    Returns:
        The ``genes`` column of the first matching row split on ``", "``, or
        an empty list when there is no such row or the column is NULL/empty.
    """
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT genes FROM hpo_gene WHERE hpo_id = ?", (hpo_id,)
        ).fetchone()
    finally:
        # Close the connection even if the query raises.
        conn.close()

    # A missing row, a NULL value, or an empty string all mean "no genes";
    # calling .split() on None would raise, and "" would yield [""].
    if not row or not row[0]:
        return []
    return row[0].split(", ")


def hpo_mapper_ui(finding, region, threshold):
    """Map a finding to an HPO term and its genes for display in the UI.

    Args:
        finding: Free-text pathological finding. ``None``, empty, or
            whitespace-only input is rejected with a prompt message.
        region: Optional anatomical region; surrounding whitespace is
            stripped and an empty value is ignored.
        threshold: Minimum similarity passed to ``find_best_hpo_match``.

    Returns:
        A 3-tuple of strings ``(hpo_id, hpo_term, genes)`` where ``genes`` is
        a ``", "``-joined list. When input is missing or nothing matches, the
        first element carries a message and the other two are empty.
    """
    # Normalise first so whitespace-only text is treated as "not provided"
    # rather than being embedded as a query or appended as " in   ".
    finding = (finding or "").strip()
    region = (region or "").strip()

    if not finding:
        return "Please enter a pathological finding.", "", ""

    match = find_best_hpo_match(finding, region, threshold)
    if not match:
        return "No match found.", "", ""

    genes = get_genes_for_hpo(match["hpo_id"])
    return match["hpo_id"], match["hpo_term"], ", ".join(genes)


# Gradio UI definition: wires hpo_mapper_ui to three input widgets and three
# output text boxes. The input order matches the function's parameters
# (finding, region, threshold) and the output order matches its returned
# 3-tuple (HPO ID, HPO term, genes).
demo = gr.Interface(
    fn=hpo_mapper_ui,
    inputs=[
        gr.Textbox(label="Pathological Finding"),
        gr.Textbox(label="Anatomical Region (optional)"),
        # Similarity cut-off in [0.0, 1.0] with 0.01 steps; defaults to 0.76.
        gr.Slider(0.0, 1.0, step=0.01, value=0.76, label="Similarity Threshold")
    ],
    outputs=[
        gr.Textbox(label="HPO ID"),
        gr.Textbox(label="HPO Term"),
        gr.Textbox(label="Disease genes annotated as being associated with this HPO term"),
    ],
    title="Human Phenotype Ontology (HPO) Mapper",
    # Adjacent string literals below are concatenated into one description
    # string; it uses Markdown markup (bold, italics, links) for the
    # usage hint and the reference list.
    description=(
        'Enter a pathological finding (e.g., "chronic inflammation") and anatomical region '
        '(e.g., "terminal ileum") to map it to the closest Human Phenotype Ontology (HPO) '
        'term and retrieve genes annotated as being associated with this HPO term.\n\n'
        '**References:**\n'
        'Kadhim, A. Z., Green, Z., Nazari, I., Baker, J., George, M., Heinson, A., Stammers, M., Kipps, C., Beattie, R. M., Ashton, J. J., & Ennis, S. (2025).\n'
        'Application of generative artificial intelligence to utilise unstructured clinical data for acceleration of inflammatory bowel disease research.\n'
        '*medRxiv*. [https://doi.org/10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)\n\n'
        'Gargano, M. A., Matentzoglu, N., Coleman, B., Addo-Lartey, E. B., Anagnostopoulos, A. V., Anderton, J., Avillach, P., Bagley, A. M., Bakštein, E., Balhoff, J. P., Baynam, G., Bello, S. M., Berk, M., Bertram, H., Bishop, S., Blau, H., Bodenstein, D. F., Botas, P., Boztug, K., Čady, J., … Robinson, P. N. (2024)\n' 
        'The Human Phenotype Ontology in 2024: phenotypes around the world.\n' 
        '*Nucleic Acids Research* [https://doi.org/10.1093/nar/gkad1005](https://doi.org/10.1093/nar/gkad1005)\n\n'
        'HPO to gene mappings obtained from [Jax](https://hpo.jax.org/data/annotations)'
    )
)

# Start the Gradio server only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    demo.launch()