Spaces:
Running
Running
| import gradio as gr | |
| import sqlite3 | |
| import json | |
| import numpy as np | |
| from numpy.linalg import norm | |
| from huggingface_hub import hf_hub_download | |
| from sentence_transformers import SentenceTransformer | |
| import os | |
# The Hugging Face token is read from the environment (on Spaces this is a
# repository secret). Startup fails fast without it, because it is passed to
# the dataset download below.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("Missing Hugging Face API token. Please set HF_TOKEN as an environment variable in Hugging Face Secrets.")

# Sentence-embedding model used to embed the user's query text.
# NOTE(review): trust_remote_code=True lets the model repository supply its
# own loading code - keep the model id pinned to a trusted repository.
EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)

# SQLite database queried by the lookup functions. A copy in the current
# working directory wins; otherwise the file is fetched from the Hugging Face
# dataset repository and db_path points at the downloaded file.
db_filename = "hpo_genes.db"
db_repo = "UoS-HGIG/hpo_genes"
db_path = os.path.join(os.getcwd(), db_filename)
if not os.path.exists(db_path):
    # `token` is the supported keyword; `use_auth_token` is deprecated in
    # huggingface_hub.
    db_path = hf_hub_download(
        repo_id=db_repo,
        filename=db_filename,
        repo_type="dataset",
        token=HF_TOKEN,
    )
def find_best_hpo_match(finding, region, threshold):
    """Return the HPO term whose stored embedding best matches the query.

    The query text is ``"<finding> in <region>"`` when ``region`` is truthy,
    otherwise just ``finding``. It is embedded with the module-level
    ``embedder`` and compared, by cosine similarity, against every row of the
    ``hpo_embeddings`` table in the SQLite database at ``db_path``. Each
    stored embedding is decoded from JSON text.

    Args:
        finding: Free-text pathological finding.
        region: Optional anatomical region; falsy values are ignored.
        threshold: Minimum cosine similarity the best row must reach.

    Returns:
        ``{"hpo_id": ..., "hpo_term": ...}`` for the highest-scoring row, or
        ``None`` when the table is empty, no usable embedding exists, or the
        best score is below ``threshold``.
    """
    query_text = f"{finding} in {region}" if region else finding
    query_embedding = embedder.encode(query_text)

    # The query norm is loop-invariant, so compute it once. A zero vector has
    # no direction: every similarity would be NaN, so nothing can match.
    query_norm = norm(query_embedding)
    if query_norm == 0:
        return None

    best_match = None
    best_score = float("-inf")

    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute("SELECT hpo_id, hpo_name, embedding FROM hpo_embeddings")
        # Iterate the cursor directly so the whole table is never held in
        # memory at once.
        for hpo_id, hpo_name, embedding_str in rows:
            hpo_embedding = np.array(json.loads(embedding_str))
            hpo_norm = norm(hpo_embedding)
            if hpo_norm == 0:
                # Skip degenerate rows instead of dividing by zero.
                continue
            similarity = np.dot(query_embedding, hpo_embedding) / (query_norm * hpo_norm)
            if similarity > best_score:
                best_score = similarity
                best_match = {"hpo_id": hpo_id, "hpo_term": hpo_name}
    finally:
        # Always release the connection, even if decoding or the dot product
        # raises (for example on a malformed or mis-sized embedding).
        conn.close()

    return best_match if best_score >= threshold else None
def get_genes_for_hpo(hpo_id):
    """Return the gene symbols stored for ``hpo_id`` in the ``hpo_gene`` table.

    The ``genes`` column is read from the SQLite database at ``db_path`` and
    split on ``", "``.

    Args:
        hpo_id: HPO identifier to look up.

    Returns:
        A list of gene strings; an empty list when there is no row for
        ``hpo_id`` or the stored value is NULL or empty.
    """
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT genes FROM hpo_gene WHERE hpo_id = ?", (hpo_id,)
        ).fetchone()
    finally:
        # Close the connection even when the query itself fails.
        conn.close()

    # A missing row, a NULL column and an empty string all mean "no genes".
    if row is None or not row[0]:
        return []
    return row[0].split(", ")
def hpo_mapper_ui(finding, region, threshold):
    """Gradio callback: map a finding to an HPO term and its annotated genes.

    Args:
        finding: Pathological finding text from the UI.
        region: Optional anatomical region text from the UI.
        threshold: Minimum similarity required to accept a match.

    Returns:
        A 3-tuple of strings ``(hpo_id, hpo_term, genes)`` where ``genes`` is
        comma-separated. When no finding is given, or no term reaches the
        threshold, the first element carries a message and the other two are
        empty strings.
    """
    # Normalise the inputs first so that None or whitespace-only text is
    # treated as "not provided" rather than being embedded as-is.
    finding = (finding or "").strip()
    region = (region or "").strip()

    if not finding:
        return "Please enter a pathological finding.", "", ""

    match = find_best_hpo_match(finding, region, threshold)
    if not match:
        return "No match found.", "", ""

    genes = get_genes_for_hpo(match["hpo_id"])
    return match["hpo_id"], match["hpo_term"], ", ".join(genes)
# Input widgets, in the positional order hpo_mapper_ui receives them.
_INPUT_COMPONENTS = [
    gr.Textbox(label="Pathological Finding"),
    gr.Textbox(label="Anatomical Region (optional)"),
    gr.Slider(0.0, 1.0, step=0.01, value=0.76, label="Similarity Threshold"),
]

# Output widgets, matching the 3-tuple returned by hpo_mapper_ui.
_OUTPUT_COMPONENTS = [
    gr.Textbox(label="HPO ID"),
    gr.Textbox(label="HPO Term"),
    gr.Textbox(label="Disease genes annotated as being associated with this HPO term"),
]

# Markdown shown under the title: usage instructions followed by references.
_DESCRIPTION = (
    'Enter a pathological finding (e.g., "chronic inflammation") and anatomical region '
    '(e.g., "terminal ileum") to map it to the closest Human Phenotype Ontology (HPO) '
    'term and retrieve genes annotated as being associated with this HPO term.\n\n'
    '**References:**\n'
    'Kadhim, A. Z., Green, Z., Nazari, I., Baker, J., George, M., Heinson, A., Stammers, M., Kipps, C., Beattie, R. M., Ashton, J. J., & Ennis, S. (2025).\n'
    'Application of generative artificial intelligence to utilise unstructured clinical data for acceleration of inflammatory bowel disease research.\n'
    '*medRxiv*. [https://doi.org/10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)\n\n'
    'Gargano, M. A., Matentzoglu, N., Coleman, B., Addo-Lartey, E. B., Anagnostopoulos, A. V., Anderton, J., Avillach, P., Bagley, A. M., Bakštein, E., Balhoff, J. P., Baynam, G., Bello, S. M., Berk, M., Bertram, H., Bishop, S., Blau, H., Bodenstein, D. F., Botas, P., Boztug, K., Čady, J., … Robinson, P. N. (2024)\n'
    'The Human Phenotype Ontology in 2024: phenotypes around the world.\n'
    '*Nucleic Acids Research* [https://doi.org/10.1093/nar/gkad1005](https://doi.org/10.1093/nar/gkad1005)\n\n'
    'HPO to gene mappings obtained from [Jax](https://hpo.jax.org/data/annotations)'
)

# The Gradio app: three inputs feed hpo_mapper_ui, three text boxes show its result.
demo = gr.Interface(
    title="Human Phenotype Ontology (HPO) Mapper",
    description=_DESCRIPTION,
    fn=hpo_mapper_ui,
    inputs=_INPUT_COMPONENTS,
    outputs=_OUTPUT_COMPONENTS,
)

# Start the web server only when the file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()