Spaces:
Running
Running
File size: 4,282 Bytes
b71fded a9df227 dc4a259 a9df227 f201229 a9df227 b859fb5 caa89c2 b859fb5 dc4a259 a9df227 dc4a259 a9df227 b859fb5 a9df227 b71fded b859fb5 caa89c2 dc4a259 a9df227 b71fded a9df227 d60b7c4 b71fded d60b7c4 b71fded a9df227 d60b7c4 b859fb5 a9df227 d60b7c4 b859fb5 caa89c2 b71fded dc4a259 a9df227 b71fded a9df227 b71fded caa89c2 a9df227 dc4a259 a9df227 d60b7c4 a9df227 2a861c0 a9df227 a2e895f d60b7c4 a9df227 dc4a259 b859fb5 a2e895f caa89c2 b71fded a9df227 f28a058 d60b7c4 dc4a259 f28a058 b71fded c3ec480 a9df227 2a861c0 b71fded d60b7c4 f28a058 a9df227 dc4a259 a9df227 b71fded c3ec480 caa89c2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
import gradio as gr
import sqlite3
import json
import numpy as np
from numpy.linalg import norm
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer
import os
# Read the Hugging Face access token from the environment. The app refuses
# to start when the variable is unset or empty.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError(
        "Missing Hugging Face API token. Please set HF_TOKEN as an environment variable in Hugging Face Secrets."
    )
# Sentence-embedding model used to embed the free-text query. It is loaded
# once at import time and shared by every request.
EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
embedder = SentenceTransformer(
    EMBEDDING_MODEL,
    trust_remote_code=True,
)
# Locate the SQLite database. A copy in the current working directory is
# used when present; otherwise the file is fetched from the Hugging Face
# dataset repo and db_path is set to the path returned by hf_hub_download.
db_filename = "hpo_genes.db"
db_repo = "UoS-HGIG/hpo_genes"
db_path = os.path.join(os.getcwd(), db_filename)
if not os.path.exists(db_path):
    # `token` is the current keyword; `use_auth_token` is the deprecated
    # spelling of the same argument.
    db_path = hf_hub_download(
        repo_id=db_repo,
        filename=db_filename,
        repo_type="dataset",
        token=HF_TOKEN,
    )
def find_best_hpo_match(finding, region, threshold):
    """Return the HPO term whose stored embedding best matches the query.

    The query text is ``"{finding} in {region}"`` when *region* is truthy,
    otherwise just *finding*. It is embedded with the module-level
    ``embedder`` and compared by cosine similarity against every row of the
    ``hpo_embeddings`` table in the SQLite database at ``db_path``. Each
    stored embedding is decoded from its JSON text form.

    Args:
        finding: Free-text pathological finding.
        region: Optional anatomical region; falsy values are ignored.
        threshold: Minimum cosine similarity for the best row to be accepted.

    Returns:
        ``{"hpo_id": ..., "hpo_term": ...}`` for the highest-scoring row, or
        ``None`` when the best score is below *threshold* (including when
        the table is empty).
    """
    query_text = f"{finding} in {region}" if region else finding
    query_embedding = embedder.encode(query_text)
    # The query norm is the same for every row, so compute it once.
    query_norm = norm(query_embedding)

    best_match, best_score = None, -1
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT hpo_id, hpo_name, embedding FROM hpo_embeddings")
        # Iterate the cursor directly so rows are streamed rather than all
        # loaded into memory at once.
        for hpo_id, hpo_name, embedding_str in cursor:
            hpo_embedding = np.array(json.loads(embedding_str))
            denominator = query_norm * norm(hpo_embedding)
            if denominator == 0:
                # Cosine similarity is undefined for a zero vector; skip the
                # row instead of dividing by zero and producing NaN.
                continue
            similarity = np.dot(query_embedding, hpo_embedding) / denominator
            if similarity > best_score:
                best_score = similarity
                best_match = {"hpo_id": hpo_id, "hpo_term": hpo_name}
    finally:
        # Close the connection even if decoding or scoring a row fails.
        conn.close()

    return best_match if best_score >= threshold else None
def get_genes_for_hpo(hpo_id):
    """Return the genes recorded for *hpo_id* in the ``hpo_gene`` table.

    The ``genes`` column of the first matching row is split on ``", "``.

    Args:
        hpo_id: HPO identifier to look up in the database at ``db_path``.

    Returns:
        A list of gene strings; empty when there is no row for *hpo_id* or
        the stored value is NULL or an empty string.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT genes FROM hpo_gene WHERE hpo_id = ?", (hpo_id,))
        row = cursor.fetchone()
    finally:
        # Release the connection even when the query raises.
        conn.close()

    # A missing row, a NULL column and an empty string all mean "no genes".
    if row is None or not row[0]:
        return []
    return [gene for gene in row[0].split(", ") if gene]
def hpo_mapper_ui(finding, region, threshold):
    """Gradio callback: map a finding to an HPO term and its genes.

    Args:
        finding: Free-text pathological finding from the first textbox.
        region: Optional anatomical region from the second textbox.
        threshold: Similarity threshold from the slider.

    Returns:
        A 3-tuple of strings ``(hpo_id, hpo_term, genes)`` where *genes* is
        a ``", "``-joined list. When the finding is blank or no term reaches
        the threshold, the first element carries a user-facing message and
        the other two are empty strings.
    """
    # Normalise the text inputs so None or whitespace-only values count as
    # empty and stray spaces do not end up in the embedded query text.
    finding = (finding or "").strip()
    region = (region or "").strip()

    if not finding:
        return "Please enter a pathological finding.", "", ""

    match = find_best_hpo_match(finding, region, threshold)
    if not match:
        return "No match found.", "", ""

    genes = get_genes_for_hpo(match["hpo_id"])
    return match["hpo_id"], match["hpo_term"], ", ".join(genes)
# Gradio front end: two free-text inputs plus a threshold slider feed
# hpo_mapper_ui, whose three return values fill the three output boxes.
demo = gr.Interface(
    title="Human Phenotype Ontology (HPO) Mapper",
    description=(
        # Usage blurb.
        'Enter a pathological finding (e.g., "chronic inflammation") and anatomical region '
        '(e.g., "terminal ileum") to map it to the closest Human Phenotype Ontology (HPO) '
        'term and retrieve genes annotated as being associated with this HPO term.\n\n'
        # Literature references, rendered as Markdown.
        '**References:**\n'
        'Kadhim, A. Z., Green, Z., Nazari, I., Baker, J., George, M., Heinson, A., Stammers, M., Kipps, C., Beattie, R. M., Ashton, J. J., & Ennis, S. (2025).\n'
        'Application of generative artificial intelligence to utilise unstructured clinical data for acceleration of inflammatory bowel disease research.\n'
        '*medRxiv*. [https://doi.org/10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)\n\n'
        'Gargano, M. A., Matentzoglu, N., Coleman, B., Addo-Lartey, E. B., Anagnostopoulos, A. V., Anderton, J., Avillach, P., Bagley, A. M., Bakštein, E., Balhoff, J. P., Baynam, G., Bello, S. M., Berk, M., Bertram, H., Bishop, S., Blau, H., Bodenstein, D. F., Botas, P., Boztug, K., Čady, J., … Robinson, P. N. (2024)\n'
        'The Human Phenotype Ontology in 2024: phenotypes around the world.\n'
        '*Nucleic Acids Research* [https://doi.org/10.1093/nar/gkad1005](https://doi.org/10.1093/nar/gkad1005)\n\n'
        # Data source for the HPO-to-gene table.
        'HPO to gene mappings obtained from [Jax](https://hpo.jax.org/data/annotations)'
    ),
    fn=hpo_mapper_ui,
    inputs=[
        gr.Textbox(label="Pathological Finding"),
        gr.Textbox(label="Anatomical Region (optional)"),
        gr.Slider(0.0, 1.0, step=0.01, value=0.76, label="Similarity Threshold"),
    ],
    outputs=[
        gr.Textbox(label="HPO ID"),
        gr.Textbox(label="HPO Term"),
        gr.Textbox(label="Disease genes annotated as being associated with this HPO term"),
    ],
)
# Start the Gradio server only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    demo.launch()