{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Vector Search " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os, pandas as pd\n", "from sqlalchemy import create_engine, text" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "username = 'demo'\n", "password = 'demo'\n", "hostname = os.getenv('IRIS_HOSTNAME', 'localhost')\n", "port = '1972' \n", "namespace = 'USER'\n", "CONNECTION_STRING = f\"iris://{username}:{password}@{hostname}:{port}/{namespace}\"\n", "\n", "engine = create_engine(CONNECTION_STRING)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load knowledge graph\n", "entity_embeddings = pd.read_csv('./data/entity_embeddings.csv', index_col=0)\n", "entity_embeddings[\"embedding\"] = entity_embeddings[\"embedding\"].apply(\n", " lambda x: x[1:-1])\n", "\n", "len_label = entity_embeddings['label'].str.len().max()\n", "len_uri = entity_embeddings['uri'].str.len().max()\n", "# TODO: set varchar length dynamically as above\n", "with engine.connect() as conn:\n", " with conn.begin(): \n", " result = conn.execute(text('DROP TABLE IF EXISTS Test.EntityEmbeddings'))\n", " sql = f\"\"\"\n", " CREATE TABLE Test.EntityEmbeddings (\n", " embedding VECTOR(DOUBLE, 50),\n", " label VARCHAR(143),\n", " uri VARCHAR(38)\n", " )\n", " \"\"\"\n", " result = conn.execute(text(sql))\n", "\n", "with engine.connect() as conn:\n", " with conn.begin():\n", " for index, row in entity_embeddings.iterrows():\n", " sql = text(\"\"\"\n", " INSERT INTO Test.EntityEmbeddings \n", " (embedding, label, uri) \n", " VALUES (TO_VECTOR(:embedding), :label, :uri)\n", " \"\"\")\n", " conn.execute(sql, {\n", " 'embedding': str(row['embedding']),\n", " 'label': row['label'], \n", " 'uri': row['uri']\n", " })\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Calculate distance between entities\n", "with engine.connect() as conn:\n", " with conn.begin():\n", " sql = f\"\"\"\n", " SELECT TOP 10 e1.uri AS uri1, e2.uri AS uri2, e1.label AS label1, e2.label AS label2,\n", " VECTOR_COSINE(e1.embedding, e2.embedding) AS distance\n", " FROM Test.EntityEmbeddings e1, Test.EntityEmbeddings e2\n", " WHERE e1.uri = 'http://identifiers.org/medgen/C0002395'\n", " ORDER BY distance DESC\n", " \"\"\"\n", " result = conn.execute(text(sql))\n", " data = result.fetchall()\n", " display(data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load clinical trials\n", "\n", "relation_embeddings = pd.read_csv('./data/relation_embeddings.csv', index_col=0)\n", "relation_embeddings[\"embedding\"] = relation_embeddings[\"embedding\"].apply(\n", " lambda x: x[1:-1])\n", "\n", "len_label = relation_embeddings['label'].str.len().max()\n", "len_uri = relation_embeddings['uri'].str.len().max()\n", "# TODO: set varchar length dynamically as above\n", "with engine.connect() as conn:\n", " with conn.begin():# Load \n", " result = conn.execute(text('DROP TABLE IF EXISTS Test.RelationEmbeddings'))\n", " sql = f\"\"\"\n", " CREATE TABLE Test.RelationEmbeddings (\n", " embedding VECTOR(DOUBLE, 50),\n", " label VARCHAR(10),\n", " uri VARCHAR(38)\n", " )\n", " \"\"\"\n", " result = conn.execute(text(sql))\n", "\n", "with engine.connect() as conn:\n", " with conn.begin():\n", " for index, row in relation_embeddings.iterrows():\n", " sql = text(\"\"\"\n", " INSERT INTO Test.ClinicalTrials \n", " (embedding, label, uri) \n", " VALUES (TO_VECTOR(:embedding), :label, :uri)\n", " \"\"\")\n", " conn.execute(sql, {\n", " 'embedding': str(row['embedding']),\n", " 'label': row['label'], \n", " 'uri': row['uri']\n", " })\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load knowledge graph\n", "clinical_trials = pd.read_csv(\"clinical_trials_embeddings.csv\")\n", "clinical_trials[\"embeddings\"] = clinical_trials[\"embeddings\"].apply(lambda x: x[1:-1])\n", "display(clinical_trials.head())\n", "\n", "# TODO: set varchar length dynamically as above\n", "with engine.connect() as conn:\n", " with conn.begin():\n", " result = conn.execute(text(\"DROP TABLE IF EXISTS Test.ClinicalTrials\"))\n", " sql = f\"\"\"\n", " CREATE TABLE Test.ClinicalTrials (\n", " nct_id VARCHAR(11) PRIMARY KEY,\n", " diseases TEXT,\n", " embedding VECTOR(DOUBLE, 768)\n", " )\n", " \"\"\"\n", " result = conn.execute(text(sql))\n", "\n", "with engine.connect() as conn:\n", " with conn.begin():\n", " for index, row in clinical_trials.iterrows():\n", "\n", " sql = text(\n", " \"\"\"\n", " INSERT INTO Test.ClinicalTrials \n", " (nct_id, diseases, embedding)\n", " VALUES (:nct_id, :diseases, TO_VECTOR(:embedding))\n", " \"\"\"\n", " )\n", " conn.execute(\n", " sql,\n", " {\n", " \"nct_id\": row[\"nct_id\"],\n", " \"diseases\": row[\"desease_condition\"],\n", " \"embedding\": str(row[\"embeddings\"]),\n", " },\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# %%\n", "import pandas as pd\n", "import rdflib\n", "\n", "# Load the disease descriptions from MGDEF.RRF\n", "df_disease_descriptions = pd.read_csv(\"MGDEF.RRF\", sep=\"|\", header=0)\n", "# Rename the column '#CUI' to 'CUI'\n", "df_disease_descriptions.rename(columns={\"#CUI\": \"CUI\"}, inplace=True)\n", "# Remove the last column, it's empty\n", "df_disease_descriptions = df_disease_descriptions.iloc[:, :-1]\n", "# Filter out the rows where the SUPPRESS field is equal to 'Y'\n", "df_disease_descriptions = df_disease_descriptions[df_disease_descriptions[\"SUPPRESS\"] != \"Y\"]\n", "# Some of the rows include a \\n character, so we need to remove the rows where the CUI field contains spaces or doesn't start with 'C'\n", "df_disease_descriptions = df_disease_descriptions[df_disease_descriptions[\"CUI\"].str.startswith(\"C\") & ~df_disease_descriptions[\"CUI\"].str.contains(\" \")]\n", "# Remove the rows where the DEF field is empty\n", "df_disease_descriptions = df_disease_descriptions[df_disease_descriptions[\"DEF\"].notnull()]\n", "df_disease_descriptions['uri'] = df_disease_descriptions['CUI'].apply(lambda x: f'http://identifiers.org/medgen/{x}')\n", "\n", "with engine.connect() as conn:\n", " with conn.begin(): \n", " result = conn.execute(text('DROP TABLE IF EXISTS Test.DiseaseDescriptions'))\n", " sql = f\"\"\"\n", " CREATE TABLE Test.DiseaseDescriptions (\n", " uri VARCHAR(50),\n", " description TEXT\n", " )\n", " \"\"\"\n", " result = conn.execute(text(sql))\n", "\n", "with engine.connect() as conn:\n", " with conn.begin():\n", " for index, row in df_disease_descriptions.iterrows():\n", " print(row['DEF'])\n", " print(row['uri'])\n", " sql = text(\"\"\"\n", " INSERT INTO Test.DiseaseDescriptions \n", " (uri, description) \n", " VALUES ( :uri, :description)\n", " \"\"\")\n", " conn.execute(sql, {\n", " 'uri': row['uri'],\n", " 'description': row['DEF'], \n", " })" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "treehacks", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }