{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Thanks for using the UniBioMap demo\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/xfd997700/unibiomap_demo/blob/main/demo.ipynb)\n", "[![Open In GitHub](https://img.shields.io/badge/Open%20in-GitHub-black?logo=github)](https://github.com/xfd997700/unibiomap_demo)\n", "\n", "🧪 If you want to compile **UniBioMap** yourself, please visit [UniBioMap](https://github.com/xfd997700/UniBioMap)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "⚠️ This cell is only for `Google Colab`.\n", "\n", "**DO NOT RUN THIS CELL IF YOU ARE RUNNING LOCALLY**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# if you are running locally, DO NOT RUN THIS CELL\n", "!git clone https://github.com/xfd997700/unibiomap_demo.git\n", "!rsync -av unibiomap_demo/ ./ --exclude=.git\n", "!pip install -r requirements.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load Knowledge Graph\n", "The raw graph data will be automatically downloaded on your first run." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/mnt/d/WSL2/miniforge3/envs/KG/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Found existing data files, loading...\n", "Done\n" ] } ], "source": [ "from utils import *\n", "from os.path import join\n", "import os\n", "\n", "# Load unibiokg\n", "link_root = \"database/unibiomap\"\n", "data_root = \"database/processed\"\n", "\n", "os.makedirs(data_root, exist_ok=True)\n", "node_map_path = join(data_root, \"node_map.json\")\n", "graph_path = join(data_root, \"unibiomap_simp.dgl\")\n", "link_path = join(link_root, \"unibiomap.links.tsv\")\n", "\n", "def nodemap2idmap(node_map):\n", " return {k: {vv: kk for kk, vv in v.items()} for k, v in node_map.items()}\n", "\n", "if os.path.exists(node_map_path) and os.path.exists(graph_path):\n", " print(\"Found existing data files, loading...\")\n", " with open(node_map_path, \"r\") as f:\n", " node_map = json.load(f)\n", " graph = dgl.load_graphs(graph_path)[0][0]\n", "else:\n", " print(\"Data files not found, processing from raw links...\")\n", " # simplify_edge=True 则忽略细粒度关系,简化关系类型为 头实体类型_尾实体类型,如 compound_protein\n", " if not os.path.exists(link_path):\n", " download_raw_kg(link_path)\n", "\n", " graph, node_map = process_knowledge_graph(link_path, simplify_edge=True)\n", " os.makedirs(data_root, exist_ok=True)\n", " dgl.save_graphs(graph_path, [graph])\n", " with open(node_map_path, \"w\") as f:\n", " json.dump(node_map, f)\n", "\n", "id_map = nodemap2idmap(node_map)\n", "print(\"Done\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Get Subgraph\n", "Use `DGL` to obtain the subgraph for specific nodes." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "📝 Please set up this cell" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Set the query vertex nodes for sampling the subgraph\n", "sample_dict = {\n", " \"protein\": [\"P05091\"],\n", "}\n", "# Set the sample depth\n", "depth = 1\n", "# Set the subgraph save root\n", "sub_save_root = \"results/\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Graph(num_nodes={'complex': 3, 'compound': 370, 'disease': 20, 'genetic_disorder': 1, 'go': 18, 'pathway': 109, 'phenotype': 0, 'protein': 67},\n", " num_edges={('complex', 'complex-pathway', 'pathway'): 5, ('compound', 'compound-compound', 'compound'): 11, ('compound', 'compound-disease', 'disease'): 10, ('compound', 'compound-pathway', 'pathway'): 655, ('compound', 'compound-protein', 'protein'): 557, ('disease', 'disease-genetic_disorder', 'genetic_disorder'): 0, ('disease', 'disease-pathway', 'pathway'): 288, ('go', 'go-go', 'go'): 4, ('pathway', 'pathway-go', 'go'): 1, ('phenotype', 'phenotype-genetic_disorder', 'genetic_disorder'): 0, ('phenotype', 'phenotype-phenotype', 'phenotype'): 0, ('protein', 'protein-complex', 'complex'): 14, ('protein', 'protein-disease', 'disease'): 53, ('protein', 'protein-genetic_disorder', 'genetic_disorder'): 2, ('protein', 'protein-go', 'go'): 175, ('protein', 'protein-pathway', 'pathway'): 837, ('protein', 'protein-phenotype', 'phenotype'): 0, ('protein', 'protein-protein', 'protein'): 509, ('protein', 'protein_metabolite', 'compound'): 338},\n", " metagraph=[('complex', 'pathway', 'complex-pathway'), ('pathway', 'go', 'pathway-go'), ('compound', 'compound', 'compound-compound'), ('compound', 'disease', 'compound-disease'), ('compound', 'pathway', 'compound-pathway'), ('compound', 'protein', 'compound-protein'), ('disease', 'genetic_disorder', 'disease-genetic_disorder'), ('disease', 
'pathway', 'disease-pathway'), ('protein', 'complex', 'protein-complex'), ('protein', 'disease', 'protein-disease'), ('protein', 'genetic_disorder', 'protein-genetic_disorder'), ('protein', 'go', 'protein-go'), ('protein', 'pathway', 'protein-pathway'), ('protein', 'phenotype', 'protein-phenotype'), ('protein', 'protein', 'protein-protein'), ('protein', 'compound', 'protein_metabolite'), ('go', 'go', 'go-go'), ('phenotype', 'genetic_disorder', 'phenotype-genetic_disorder'), ('phenotype', 'phenotype', 'phenotype-phenotype')])\n", "Total triplets: 3459\n" ] } ], "source": [ "# 生成子图\n", "# 使用 subgraph_by_node 函数获取子图\n", "sub_g, new2orig, node_map, _ = subgraph_by_node(graph, sample_dict, node_map, depth=depth)\n", "# 打印子图的基本信息\n", "print(sub_g)\n", "# 获取子图的实体和三元组信息\n", "id_map = nodemap2idmap(node_map)\n", "entities, triplets = report_subgraph(sub_g, id_map, save_root=sub_save_root)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Visualize Subgraph\n", "Use `networkx` and `pyvis` to visualize the subgraph as a static figure and an interactive view, respectively." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "📝 Please set up this cell" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "networkx graph generated\n" ] } ], "source": [ "# Configs\n", "# 设置每种类型节点的显示数量,-1 表示无限制\n", "display_limits = {\n", " 'complex': 10, \n", " 'compound': 10,\n", " 'disease': 10,\n", " 'genetic_disorder': 10,\n", " 'go': 10,\n", " 'pathway': 10,\n", " 'phenotype': 10,\n", " 'protein': 10,\n", "}\n", "# 设置必须显示的节点,字典的值为节点名称列表(必须与 id_map 中的名称一致)\n", "must_show = {\n", " \"protein\": [\"P05091\"],\n", " # 可为其他类型添加必显示节点\n", "}\n", "# 设置是否移除自环\n", "remove_self_loop = True\n", "\n", "G = convert_subgraph_to_networkx(sub_g, id_map, display_limits, must_show, remove_self_loop)\n", "print(\"networkx graph generated\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "for node in G.nodes(data=True):\n", " break\n", "for edge in G.edges(data=True):\n", " break" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('complex_0',\n", " {'label': 'R-HSA-9838070', 'title': 'R-HSA-9838070', 'group': 'complex'})" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "node" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "📝 Please set up this cell" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Visualization configs\n", "# 定义不同类型节点的颜色\n", "color_map = {\n", " 'complex': '#FFA07A',\n", " 'compound': '#98FB98',\n", " 'disease': '#FFD700',\n", " 'genetic_disorder': '#FF69B4',\n", " 'go': '#87CEEB',\n", " 'pathway': '#DDA0DD',\n", " 'phenotype': '#808080',\n", " 'protein': '#FF6347'\n", "}\n", "\n", "# static visual config\n", "node_size = 500\n", "font_size = 10\n", "font_color = \"black\"\n", "edge_color = \"gray\"" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "import networkx as nx\n", "# Display the NetworkX graph using matplotlib\n", "import matplotlib.pyplot as plt\n", "\n", "plt.figure(figsize=(12, 12))\n", "pos = nx.spring_layout(G, seed=42) # positions for all nodes\n", "\n", "# 获取节点颜色列表\n", "node_colors = [color_map.get(G.nodes[node]['group'], 'gray') for node in G.nodes]\n", "\n", "# 使用节点的 label 作为标签\n", "labels = nx.get_node_attributes(G, 'label')\n", "nx.draw(G, pos, labels=labels, node_size=node_size, node_color=node_colors,\n", " font_size=font_size, font_color=font_color, edge_color=edge_color)\n", "# plt.title(\"Subgraph Visualization\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from pyvis.network import Network\n", "from IPython.display import IFrame, display\n", "\n", "# 利用 pyvis 进行交互式可视化\n", "# net = Network(height='750px', width='100%', notebook=True, bgcolor='#ffffff', font_color='black')\n", "net = Network(height='750px', width='100%', notebook=True, cdn_resources='in_line', bgcolor='#ffffff', font_color='black')\n", "net.from_nx(G)\n", "# 展示图形(生成的 HTML 文件将自动打开或在 notebook 中显示)\n", "html_root = join(sub_save_root, \"dgl_subgraph.html\")\n", "net.show(html_root)\n", "print(f\"Subgraph visualization saved in {html_root}\")" ] } ], "metadata": { "kernelspec": { "display_name": "KG", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" } }, "nbformat": 4, "nbformat_minor": 2 }