{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "9db3b813-22dc-4209-86d2-42e935f5f5dd", "metadata": {}, "outputs": [], "source": [ "from langchain_community.document_loaders.csv_loader import CSVLoader\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "import pandas as pd\n", "import langchain\n", "import os\n", "import openai\n", "import ast\n", "from langchain import OpenAI\n", "from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "from langchain.document_loaders import UnstructuredURLLoader\n", "from langchain.embeddings import OpenAIEmbeddings\n", "from langchain.vectorstores import FAISS" ] }, { "cell_type": "code", "execution_count": 2, "id": "fb48dccd-37e5-484a-a2fc-c482839b9ed9", "metadata": {}, "outputs": [], "source": [ "# loader = CSVLoader(file_path=\"trials/brief_summaries.csv\")\n", "# data = loader.load()" ] }, { "cell_type": "code", "execution_count": 3, "id": "cdcc3107-e6a8-47ca-bd89-e381fbcf9b9e", "metadata": {}, "outputs": [], "source": [ "# df= pd.read_csv(\"trials/brief_summaries.txt\", delimiter=\"|\")\n", "# df.shape" ] }, { "cell_type": "code", "execution_count": 4, "id": "c6a94c14-b197-4eff-9941-1ca52069cd5c", "metadata": {}, "outputs": [], "source": [ "# df.head(20)" ] }, { "cell_type": "code", "execution_count": 5, "id": "95c000ff-bf0c-4489-8643-93a238db41dc", "metadata": {}, "outputs": [], "source": [ "os.environ[\"OPENAI_API_KEY\"] = (\n", " \"sk-proj-CG2E98bSWs53X2eWO0Z4T3BlbkFJLm7H1vfkbua0zP548CKQ\"\n", ")" ] }, { "cell_type": "code", "execution_count": 6, "id": "e2e03936-fcce-4287-bfe6-e31f1b69f693", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/aldan/miniconda3/envs/hackupc/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The class `OpenAI` was deprecated in LangChain 0.0.10 and will be removed in 0.2.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAI`.\n", " warn_deprecated(\n" ] } ], "source": [ "llm = OpenAI(temperature=0.6, max_tokens=500)" ] }, { "cell_type": "code", "execution_count": 7, "id": "aede3d75-2441-44f6-b3cf-2f86f050da24", "metadata": {}, "outputs": [ { "ename": "FileNotFoundError", "evalue": "[Errno 2] No such file or directory: 'clinical_trials.csv'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df_trials\u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mclinical_trials.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(df_trials\u001b[38;5;241m.\u001b[39mshape)\n\u001b[1;32m 3\u001b[0m df_trials\u001b[38;5;241m.\u001b[39mhead()\n", "File \u001b[0;32m~/miniconda3/envs/hackupc/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1024\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 1011\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 1012\u001b[0m dialect,\n\u001b[1;32m 1013\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1020\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 1021\u001b[0m )\n\u001b[1;32m 1022\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m-> 1024\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/miniconda3/envs/hackupc/lib/python3.10/site-packages/pandas/io/parsers/readers.py:618\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 615\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 617\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 618\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 620\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 621\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", "File \u001b[0;32m~/miniconda3/envs/hackupc/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1618\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1615\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1618\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/miniconda3/envs/hackupc/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1878\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1876\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1877\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1878\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1879\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1880\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1881\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1882\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1883\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1884\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1885\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1886\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1887\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1888\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1889\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", "File \u001b[0;32m~/miniconda3/envs/hackupc/lib/python3.10/site-packages/pandas/io/common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 874\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 875\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 876\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 877\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 878\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 879\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'clinical_trials.csv'" ] } ], "source": [ "df_trials = pd.read_csv(\"clinical_trials.csv\")\n", "print(df_trials.shape)\n", "df_trials.head()" ] }, { "cell_type": "code", "execution_count": 15, "id": "65589fc4-4f55-4b72-9935-c3b163fde0e2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
desease_condition
0['marijuana abuse', 'substance-related disorde...
1['marijuana abuse', 'substance-related disorde...
2['tuberculosis', 'latent tuberculosis', 'infec...
3['heart failure', 'heart diseases', 'cardiovas...
4['lymphoma', 'neoplasms by histologic type', '...
\n", "
" ], "text/plain": [ " desease_condition\n", "0 ['marijuana abuse', 'substance-related disorde...\n", "1 ['marijuana abuse', 'substance-related disorde...\n", "2 ['tuberculosis', 'latent tuberculosis', 'infec...\n", "3 ['heart failure', 'heart diseases', 'cardiovas...\n", "4 ['lymphoma', 'neoplasms by histologic type', '..." ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_trials_filtered = df_trials[[\"desease_condition\"]]\n", "df_trials_filtered.head()" ] }, { "cell_type": "code", "execution_count": 16, "id": "88e28056-e340-416c-a1a9-4a6c29556dc7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"['marijuana abuse', 'substance-related disorders', 'chemically-induced disorders', 'mental disorders']\"" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_trials_filtered[\"desease_condition\"].iloc[0]" ] }, { "cell_type": "code", "execution_count": 17, "id": "5bd8f876-0480-40a5-a32f-ca7ec137a70f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\ariji\\AppData\\Local\\Temp\\ipykernel_22340\\16068817.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_trials_filtered['desease_condition']= df_trials_filtered['desease_condition'].apply(list_to_string)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
desease_condition
0marijuana abuse, substance-related disorders, ...
1marijuana abuse, substance-related disorders, ...
2tuberculosis, latent tuberculosis, infections,...
3heart failure, heart diseases, cardiovascular ...
4lymphoma, neoplasms by histologic type, neopla...
......
440512obesity, overweight, overnutrition, nutrition ...
440513obesity, overweight, overnutrition, nutrition ...
440514obesity, overweight, overnutrition, nutrition ...
440515autistic disorder, autism spectrum disorder, c...
440516autistic disorder, autism spectrum disorder, c...
\n", "

440517 rows × 1 columns

\n", "
" ], "text/plain": [ " desease_condition\n", "0 marijuana abuse, substance-related disorders, ...\n", "1 marijuana abuse, substance-related disorders, ...\n", "2 tuberculosis, latent tuberculosis, infections,...\n", "3 heart failure, heart diseases, cardiovascular ...\n", "4 lymphoma, neoplasms by histologic type, neopla...\n", "... ...\n", "440512 obesity, overweight, overnutrition, nutrition ...\n", "440513 obesity, overweight, overnutrition, nutrition ...\n", "440514 obesity, overweight, overnutrition, nutrition ...\n", "440515 autistic disorder, autism spectrum disorder, c...\n", "440516 autistic disorder, autism spectrum disorder, c...\n", "\n", "[440517 rows x 1 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def list_to_string(disease_list):\n", " disease_list = ast.literal_eval(disease_list)\n", " return \", \".join(disease_list)\n", "\n", "\n", "df_trials_filtered[\"desease_condition\"] = df_trials_filtered[\"desease_condition\"].apply(\n", " list_to_string\n", ")\n", "df_trials_filtered" ] }, { "cell_type": "code", "execution_count": 18, "id": "bbbb22e6-4883-4869-8ccc-95696bc67b1b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 marijuana abuse, substance-related disorders, ...\n", "1 marijuana abuse, substance-related disorders, ...\n", "2 tuberculosis, latent tuberculosis, infections,...\n", "3 heart failure, heart diseases, cardiovascular ...\n", "4 lymphoma, neoplasms by histologic type, neopla...\n", "Name: desease_condition, dtype: object" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_trials_filtered[\"desease_condition\"].head()" ] }, { "cell_type": "code", "execution_count": 19, "id": "7f4e7ceb-8bfd-4294-a850-8935f88b6555", "metadata": {}, "outputs": [], "source": [ "df_trials_filtered.to_csv(\"diseases.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 20, "id": "af1c5c2b-24a0-44a1-9e5d-7ee89ca4cccf", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "440517" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loader = CSVLoader(file_path=\"./diseases.csv\", encoding=\"utf-8\")\n", "data = loader.load()\n", "len(data)" ] }, { "cell_type": "code", "execution_count": null, "id": "cab89218-41ca-4048-886d-bc2c1c9b30bc", "metadata": {}, "outputs": [], "source": [ "embeddings = OpenAIEmbeddings()\n", "vectorstore = FAISS.from_documents(data, embeddings)" ] }, { "cell_type": "code", "execution_count": null, "id": "225ade4a-d004-44cc-a5ff-22ce2bfcac32", "metadata": {}, "outputs": [], "source": [ "file_path = \"vector_index.pkl\"\n", "with open(file_path, \"wb\") as f:\n", " pickle.dump(vectorstore, f)" ] }, { "cell_type": "code", "execution_count": 98, "id": "11912a93-ad02-41cb-8bce-2750c947fa74", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(440517, 2)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
desease_conditiontext
0['marijuana abuse', 'substance-related disorde...nct_id: NCT03055377\\nsummary: This is a 12-wee...
1['marijuana abuse', 'substance-related disorde...nct_id: NCT03055377\\nsummary: This is a 12-wee...
2['tuberculosis', 'latent tuberculosis', 'infec...nct_id: NCT03042754\\nsummary: Early diagnosis ...
3['heart failure', 'heart diseases', 'cardiovas...nct_id: NCT03035123\\nsummary: The EduStra-HF s...
4['lymphoma', 'neoplasms by histologic type', '...nct_id: NCT02272751\\nsummary: This study will ...
\n", "
" ], "text/plain": [ " desease_condition \\\n", "0 ['marijuana abuse', 'substance-related disorde... \n", "1 ['marijuana abuse', 'substance-related disorde... \n", "2 ['tuberculosis', 'latent tuberculosis', 'infec... \n", "3 ['heart failure', 'heart diseases', 'cardiovas... \n", "4 ['lymphoma', 'neoplasms by histologic type', '... \n", "\n", " text \n", "0 nct_id: NCT03055377\\nsummary: This is a 12-wee... \n", "1 nct_id: NCT03055377\\nsummary: This is a 12-wee... \n", "2 nct_id: NCT03042754\\nsummary: Early diagnosis ... \n", "3 nct_id: NCT03035123\\nsummary: The EduStra-HF s... \n", "4 nct_id: NCT02272751\\nsummary: This study will ... " ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_trials = pd.read_csv(\"clinical_trials.csv\")\n", "print(df_trials.shape)\n", "df_trials.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "25b31e55-2961-474d-92d8-5963f2c6bf84", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "918c078c-46fe-4d7b-9748-88c52a5b004a", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "36e83202-97ad-425d-95ae-075a1e26a34e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "5d705875-5dd7-4c71-8d94-99c101020ac0", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 2, "id": "f2818abe-1a43-4d7d-92a7-7562812bf43d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
desease_condition
0marijuana abuse, substance-related disorders, ...
1marijuana abuse, substance-related disorders, ...
2tuberculosis, latent tuberculosis, infections,...
3heart failure, heart diseases, cardiovascular ...
4lymphoma, neoplasms by histologic type, neopla...
......
440512obesity, overweight, overnutrition, nutrition ...
440513obesity, overweight, overnutrition, nutrition ...
440514obesity, overweight, overnutrition, nutrition ...
440515autistic disorder, autism spectrum disorder, c...
440516autistic disorder, autism spectrum disorder, c...
\n", "

440517 rows × 1 columns

\n", "
" ], "text/plain": [ " desease_condition\n", "0 marijuana abuse, substance-related disorders, ...\n", "1 marijuana abuse, substance-related disorders, ...\n", "2 tuberculosis, latent tuberculosis, infections,...\n", "3 heart failure, heart diseases, cardiovascular ...\n", "4 lymphoma, neoplasms by histologic type, neopla...\n", "... ...\n", "440512 obesity, overweight, overnutrition, nutrition ...\n", "440513 obesity, overweight, overnutrition, nutrition ...\n", "440514 obesity, overweight, overnutrition, nutrition ...\n", "440515 autistic disorder, autism spectrum disorder, c...\n", "440516 autistic disorder, autism spectrum disorder, c...\n", "\n", "[440517 rows x 1 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_trials_filtered = pd.read_csv(\"diseases.csv\")\n", "df_trials_filtered" ] }, { "cell_type": "code", "execution_count": 3, "id": "c89e3cf6-a376-4029-9c04-0f5e664a2237", "metadata": { "notebookRunGroups": { "groupValue": "1" } }, "outputs": [ { "data": { "text/plain": [ "(440517, 1)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = df_trials_filtered # [:100]\n", "df2.shape" ] }, { "cell_type": "code", "execution_count": 14, "id": "c5012bcf-3e25-4f21-a29c-6bdbdafbb8c7", "metadata": {}, "outputs": [], "source": [ "from openai import OpenAI\n", "\n", "client = OpenAI()" ] }, { "cell_type": "code", "execution_count": 15, "id": "40a480bd-6754-40b6-870c-42d10ce9a960", "metadata": {}, "outputs": [], "source": [ "def get_embeddings(text):\n", " response = client.embeddings.create(\n", " input=text, dimensions=128, model=\"text-embedding-3-small\"\n", " )\n", " return response.data[0].embedding" ] }, { "cell_type": "code", "execution_count": 4, "id": "ef6d6b62-de0b-4bc6-a6eb-847ab8e99da5", "metadata": { "notebookRunGroups": { "groupValue": "1" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/aldan/miniconda3/envs/hackupc/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "/home/aldan/miniconda3/envs/hackupc/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n", "/home/aldan/miniconda3/envs/hackupc/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n", "Batches: 100%|██████████| 6884/6884 [04:32<00:00, 25.25it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 14min 6s, sys: 1min 31s, total: 15min 37s\n", "Wall time: 4min 48s\n" ] }, { "data": { "text/plain": [ "(440517, 768)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "from sentence_transformers import SentenceTransformer\n", "\n", "encoder= SentenceTransformer(\"allenai-specter\")\n", "vectors= encoder.encode(df2.desease_condition, show_progress_bar=True, batch_size=64)\n", "vectors.shape" ] }, { "cell_type": "code", "execution_count": 17, "id": "7966d754-56d7-4555-a6c6-6a13772fb000", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 261 ms, sys: 13 ms, total: 274 ms\n", "Wall time: 26.8 s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ ":1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
desease_conditionembeddings
0marijuana abuse, substance-related disorders, ...[-0.05811865255236626, -0.023393018171191216, ...
1marijuana abuse, substance-related disorders, ...[-0.0581701435148716, -0.023382455110549927, 0...
2tuberculosis, latent tuberculosis, infections,...[-0.03460180386900902, -0.084668830037117, 0.2...
3heart failure, heart diseases, cardiovascular ...[-0.08236236125230789, -0.1235777735710144, 0....
4lymphoma, neoplasms by histologic type, neopla...[-0.1227850392460823, 0.07155642658472061, 0.1...
\n", "
" ], "text/plain": [ " desease_condition \\\n", "0 marijuana abuse, substance-related disorders, ... \n", "1 marijuana abuse, substance-related disorders, ... \n", "2 tuberculosis, latent tuberculosis, infections,... \n", "3 heart failure, heart diseases, cardiovascular ... \n", "4 lymphoma, neoplasms by histologic type, neopla... \n", "\n", " embeddings \n", "0 [-0.05811865255236626, -0.023393018171191216, ... \n", "1 [-0.0581701435148716, -0.023382455110549927, 0... \n", "2 [-0.03460180386900902, -0.084668830037117, 0.2... \n", "3 [-0.08236236125230789, -0.1235777735710144, 0.... \n", "4 [-0.1227850392460823, 0.07155642658472061, 0.1... " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "df2['embeddings']= df2['desease_condition'].apply(get_embeddings)\n", "df2.head()" ] }, { "cell_type": "code", "execution_count": 18, "id": "c2f99031", "metadata": {}, "outputs": [], "source": [ "df2['embeddings'] = vectors.astype('float32',casting='same_kind').tolist()" ] }, { "cell_type": "code", "execution_count": 19, "id": "952d69c7", "metadata": {}, "outputs": [], "source": [ "# Remove duplicate rows based on the 'nct_id' column\n", "df2_without_duplicates = df2.drop_duplicates(subset='nct_id', keep='first')" ] }, { "cell_type": "code", "execution_count": 24, "id": "2711980a-d1c0-441e-ae9a-531500b7b7cd", "metadata": {}, "outputs": [], "source": [ "df2_without_duplicates[:130077].to_csv(\n", " \"diseases_embeddings.csv\", index=False, header=True\n", ")" ] }, { "cell_type": "code", "execution_count": 21, "id": "fccd4f0e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
desease_conditionembeddingsnct_id
0marijuana abuse, substance-related disorders, ...[-0.8323991298675537, 1.47855544090271, 0.0013...NCT03055377
2tuberculosis, latent tuberculosis, infections,...[-0.43443307280540466, 0.9625586271286011, -0....NCT03042754
3heart failure, heart diseases, cardiovascular ...[-0.5791705250740051, 0.13008448481559753, 0.1...NCT03035123
4lymphoma, neoplasms by histologic type, neopla...[-0.1608569175004959, 0.8489153981208801, -0.5...NCT02272751
6anemia, hematologic diseases[0.21379394829273224, 0.17073844373226166, -0....NCT00931606
............
440506scoliosis, spinal curvatures, spinal diseases,...[-1.20807683467865, 0.19357842206954956, 0.314...NCT03641469
440507asphyxia neonatorum, asphyxia, death, patholog...[-0.7226205468177795, 1.0146900415420532, -0.2...NCT03621956
440509tuberculosis, helminthiasis, malnutrition, myc...[-0.7196142673492432, 0.9588190913200378, 0.08...NCT03598842
440512obesity, overweight, overnutrition, nutrition ...[-1.159234642982483, 0.5251776576042175, 0.237...NCT03574103
440515autistic disorder, autism spectrum disorder, c...[-0.8618993759155273, 0.7515497803688049, 0.08...NCT03570372
\n", "

233077 rows × 3 columns

\n", "
" ], "text/plain": [ " desease_condition \\\n", "0 marijuana abuse, substance-related disorders, ... \n", "2 tuberculosis, latent tuberculosis, infections,... \n", "3 heart failure, heart diseases, cardiovascular ... \n", "4 lymphoma, neoplasms by histologic type, neopla... \n", "6 anemia, hematologic diseases \n", "... ... \n", "440506 scoliosis, spinal curvatures, spinal diseases,... \n", "440507 asphyxia neonatorum, asphyxia, death, patholog... \n", "440509 tuberculosis, helminthiasis, malnutrition, myc... \n", "440512 obesity, overweight, overnutrition, nutrition ... \n", "440515 autistic disorder, autism spectrum disorder, c... \n", "\n", " embeddings nct_id \n", "0 [-0.8323991298675537, 1.47855544090271, 0.0013... NCT03055377 \n", "2 [-0.43443307280540466, 0.9625586271286011, -0.... NCT03042754 \n", "3 [-0.5791705250740051, 0.13008448481559753, 0.1... NCT03035123 \n", "4 [-0.1608569175004959, 0.8489153981208801, -0.5... NCT02272751 \n", "6 [0.21379394829273224, 0.17073844373226166, -0.... NCT00931606 \n", "... ... ... \n", "440506 [-1.20807683467865, 0.19357842206954956, 0.314... NCT03641469 \n", "440507 [-0.7226205468177795, 1.0146900415420532, -0.2... NCT03621956 \n", "440509 [-0.7196142673492432, 0.9588190913200378, 0.08... NCT03598842 \n", "440512 [-1.159234642982483, 0.5251776576042175, 0.237... NCT03574103 \n", "440515 [-0.8618993759155273, 0.7515497803688049, 0.08... NCT03570372 \n", "\n", "[233077 rows x 3 columns]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2_without_duplicates" ] }, { "cell_type": "code", "execution_count": 106, "id": "8ed985f4-9402-431f-bfba-1236ba16b895", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " >" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import faiss\n", "\n", "index = faiss.IndexFlatL2(dim)\n", "index" ] }, { "cell_type": "code", "execution_count": 108, "id": "71dad860-1f19-4166-a309-c9ce15f24792", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(768,)\n" ] }, { "data": { "text/plain": [ "array([-8.82369652e-03, 8.50650743e-02, 2.08267733e-03, 6.77651772e-03,\n", " -2.86661759e-02, -8.71188380e-03, 6.99447095e-02, 5.04214764e-02,\n", " 3.58386151e-02, 5.29594952e-03, -1.40875215e-02, 1.99297220e-02,\n", " 2.27009598e-03, 2.10810862e-02, 2.66138893e-02, 1.90623086e-02,\n", " 4.44708914e-02, 2.96202525e-02, 5.42085357e-02, -2.34859088e-03,\n", " -9.87798795e-02, -5.00183590e-02, -3.42465192e-02, 2.08440255e-02,\n", " 5.31156994e-02, -1.37044629e-02, 2.92537250e-02, -2.61334293e-02,\n", " -1.21854078e-04, -2.36813519e-02, -3.81283499e-02, -1.79494768e-02,\n", " -6.29265187e-03, 1.27150817e-02, 1.19849676e-06, -7.78729608e-03,\n", " -1.28973828e-04, 4.01791967e-02, 4.21229303e-02, -8.72302521e-03,\n", " 7.44823692e-03, 7.68032745e-02, 6.50246907e-03, 3.40298638e-02,\n", " -1.80711355e-02, -2.71878559e-02, 5.74751608e-02, 3.67745496e-02,\n", " -3.34868580e-02, 1.05205458e-02, 2.08975170e-02, 4.36686277e-02,\n", " 3.47612537e-02, -4.99080680e-02, 4.44446988e-02, 5.57280704e-03,\n", " -2.31200755e-02, -4.60692644e-02, -1.39789237e-02, -3.79957110e-02,\n", " 4.67903316e-02, 1.91651955e-02, -5.12171052e-02, 2.46807020e-02,\n", " -5.52081019e-02, 3.50596346e-02, -7.01438356e-03, -3.36519890e-02,\n", " -1.41502097e-02, -1.37693482e-02, 4.11427952e-02, 6.94046309e-03,\n", " -1.30138136e-02, 5.91567121e-02, 3.37168351e-02, 3.01467292e-02,\n", " -4.59552221e-02, 1.37365120e-03, 1.00179566e-02, 6.98126853e-04,\n", " 3.58139984e-02, 1.18174301e-02, 1.33722462e-02, -1.35893077e-02,\n", " 4.75908853e-02, 5.48331346e-03, -6.41460950e-03, -1.23906611e-02,\n", " 5.82688041e-02, -1.60842277e-02, -2.95833423e-04, 6.97355811e-03,\n", " 2.48331465e-02, -2.35959496e-02, -1.24989869e-02, -1.36585534e-02,\n", " 1.52637456e-02, 7.01832073e-03, 5.50601333e-02, 4.35096538e-03,\n", " 2.36319732e-02, -1.38118947e-02, -7.24233836e-02, 9.39742289e-03,\n", " -2.66901590e-02, 2.96042152e-02, 1.28761679e-02, 2.23339219e-02,\n", " 3.08373477e-03, 7.12765753e-02, 7.13613164e-03, 3.62721197e-02,\n", " -4.53250594e-02, 2.54001115e-02, -2.54253373e-02, -1.23151275e-03,\n", " -1.34750446e-02, -2.70653702e-02, -1.02220355e-02, 2.07683407e-02,\n", " -7.31003610e-03, 2.65329964e-02, -2.79857730e-03, 4.20840643e-02,\n", " 3.20205763e-02, -1.19518824e-02, -5.77116087e-02, 9.88688134e-03,\n", " 1.86814573e-02, -5.10204993e-02, -6.77110278e-04, 9.40234493e-03,\n", " -3.33383717e-02, -5.52933291e-02, 5.64148054e-02, 4.92153503e-02,\n", " 3.33690383e-02, -3.92963700e-02, -6.91099390e-02, 3.79911740e-03,\n", " -1.74410697e-02, -1.60171147e-02, 4.89675067e-02, 2.67119659e-03,\n", " 2.61192098e-02, -2.74193864e-02, 6.92490395e-03, -4.64810384e-03,\n", " -8.99862905e-04, 1.02159111e-02, -4.81114909e-02, 1.22787328e-02,\n", " -9.32844076e-03, -2.00431682e-02, -1.36102587e-02, -3.67914373e-03,\n", " -1.60810221e-02, -2.20200215e-02, 2.32051890e-02, -5.07331975e-02,\n", " -1.01248249e-02, 5.62567115e-02, -2.60966737e-03, 9.27545596e-03,\n", " 5.32410555e-02, 4.81746234e-02, -9.83138476e-03, 1.81230865e-02,\n", " -2.12969314e-02, 9.82244611e-02, -2.47648880e-02, 7.06253499e-02,\n", " 8.71159416e-03, -2.73140483e-02, 5.59884915e-03, -2.14829091e-02,\n", " -6.67077005e-02, 2.48677693e-02, -8.29503238e-02, -7.96182230e-02,\n", " 3.77488993e-02, -1.37352264e-02, -2.85069812e-02, 1.81708820e-02,\n", " -4.07746173e-02, -4.71230270e-03, -1.59605164e-02, -1.25815195e-03,\n", " -6.59954594e-03, -1.51611334e-02, 7.87123516e-02, -4.09705602e-02,\n", " 3.07933297e-02, 1.27626080e-02, -4.34489138e-02, 9.91576444e-03,\n", " 1.25470785e-02, -8.67356583e-02, 1.26097840e-03, 3.24709825e-02,\n", " -6.92409948e-02, -4.35011238e-02, -2.79313605e-02, -3.37213017e-02,\n", " -2.35359464e-02, -2.95022167e-02, 2.88009271e-02, -3.26618887e-02,\n", " 7.09307985e-03, 3.09435464e-03, -5.09097055e-02, 3.54242921e-02,\n", " 5.37336655e-02, 1.55867739e-02, 2.09988486e-02, -4.38529663e-02,\n", " 2.93767708e-03, 2.27999203e-02, 1.02668423e-02, 3.35033536e-02,\n", " -8.28316063e-02, -4.17127199e-02, -1.23034064e-02, 2.38543525e-02,\n", " -3.72257493e-02, 2.97443867e-02, 3.35034318e-02, -5.21336049e-02,\n", " 5.74519299e-03, 2.89844945e-02, -2.21337453e-02, 2.34603398e-02,\n", " 6.33142609e-03, -2.24104542e-02, 1.47326495e-02, 1.98041964e-02,\n", " 3.05697713e-02, -9.37094465e-02, -6.84579164e-02, 4.63523576e-03,\n", " 3.88860740e-02, -3.97440195e-02, -4.70216498e-02, 1.02172708e-02,\n", " -3.37972888e-03, -8.54947045e-03, 4.81354557e-02, 4.99849804e-02,\n", " 7.11378129e-03, -2.54327375e-02, -1.14872465e-02, -3.54485810e-02,\n", " 5.24284095e-02, 2.16708388e-02, -4.00698110e-02, 5.15380092e-02,\n", " -6.03203699e-02, -6.50304696e-03, -1.03860423e-02, -7.47132823e-02,\n", " 3.59848235e-03, -4.68364358e-02, -4.23019789e-02, -1.86387468e-02,\n", " -2.88047381e-02, -2.81904116e-02, 1.52729014e-02, -1.55570190e-02,\n", " 1.34619148e-02, 2.34364290e-02, 3.10326237e-02, -4.70464528e-02,\n", " -2.43550166e-02, -7.20657408e-03, -1.16065536e-02, -3.42444591e-02,\n", " -5.30204549e-03, 5.52049950e-02, 4.50828709e-02, -7.30262510e-03,\n", " 5.56289777e-02, -9.46066808e-03, -3.37345451e-02, -1.87659152e-02,\n", " 3.57284099e-02, 4.20488343e-02, 1.66770478e-03, -5.27675785e-02,\n", " 2.96422077e-04, 4.22447585e-02, 4.97253910e-02, 6.03130311e-02,\n", " 1.32281650e-02, 2.35939436e-02, -1.59284715e-02, 4.46444489e-02,\n", " -1.68315917e-02, 1.34740606e-01, -3.54593806e-02, 4.79029641e-02,\n", " 8.99049267e-03, 4.74606343e-02, 6.70041004e-03, -1.15184486e-03,\n", " 2.69540539e-03, -2.77549177e-02, -1.33260442e-02, 2.60788556e-02,\n", " 4.35438640e-02, -2.55859867e-02, 2.76670083e-02, 3.37177999e-02,\n", " 2.93240137e-02, 1.82274636e-03, -1.40310880e-02, -1.91633645e-02,\n", " 1.18790809e-02, -4.65121269e-02, -4.19883654e-02, -2.69681774e-02,\n", " -3.23035605e-02, -6.84630498e-02, 6.26784265e-02, 1.37511576e-02,\n", " -2.55833156e-02, -5.73152229e-02, 3.30126472e-02, -7.90146552e-03,\n", " -1.08651863e-02, 1.10474667e-02, 3.03509296e-03, 1.55274626e-02,\n", " 1.05599947e-02, -7.16960803e-03, -5.01419827e-02, -3.34469602e-02,\n", " 3.77239436e-02, 9.44003314e-02, -4.80610691e-02, 4.73537892e-02,\n", " 3.40655483e-02, 7.88806472e-03, -2.84915343e-02, 7.96849206e-02,\n", " 1.57442074e-02, -4.15650755e-02, 7.51048513e-03, 3.66957486e-02,\n", " -1.72730908e-01, -8.72075930e-02, 2.86346450e-02, 2.16962174e-02,\n", " -4.80199270e-02, 6.49317261e-03, 1.67240556e-02, -2.56227311e-02,\n", " 2.19670162e-02, -6.10647202e-02, -2.65449155e-02, 6.17929082e-03,\n", " -2.89566331e-02, 1.19498251e-02, -2.33849231e-02, -2.69133616e-02,\n", " -1.46602485e-02, 1.18886270e-02, 1.64973717e-02, -3.90495770e-02,\n", " -3.45575088e-03, 5.12249060e-02, -8.63745401e-04, 5.59820198e-02,\n", " 2.10017413e-02, 2.74998210e-02, 3.03551817e-04, -1.15796946e-01,\n", " -4.66962112e-03, -4.80118394e-02, -3.55160870e-02, -4.72528581e-03,\n", " -4.29739058e-02, -1.07347388e-02, -1.32423071e-02, -2.34632343e-02,\n", " 1.98413953e-02, -7.27679394e-03, 2.27117930e-02, -2.59338003e-02,\n", " 4.31442596e-02, 1.07885078e-02, -2.47129947e-02, -4.14506458e-02,\n", " 4.40958813e-02, 6.65106403e-04, -2.26945560e-02, -4.76796739e-02,\n", " 1.13289580e-02, -5.57265691e-02, 1.71151303e-03, -1.24145029e-02,\n", " -3.57853901e-03, -4.86295968e-02, -5.14956787e-02, 4.79425713e-02,\n", " -3.24050151e-02, 7.39779174e-02, 2.67242044e-02, 1.16365692e-02,\n", " 8.20766483e-03, -6.27530292e-02, -1.30661400e-02, -3.52081768e-02,\n", " 4.83807474e-02, 9.81860235e-03, 1.14539362e-01, -1.88471414e-02,\n", " 6.07751869e-02, -1.75345445e-03, 3.13236266e-02, -1.94595556e-03,\n", " 2.64345529e-03, 3.07400171e-02, -4.31060083e-02, -6.19985871e-02,\n", " 5.50477020e-03, 1.62547994e-02, -8.26352183e-03, 7.56437238e-03,\n", " -4.79784003e-03, 6.93615247e-03, 3.59064825e-02, 2.08517518e-02,\n", " 1.41595434e-02, 5.31185642e-02, 6.78585656e-03, 6.56357184e-02,\n", " -5.06135784e-02, -3.05179805e-02, 7.06539825e-02, -3.55644710e-02,\n", " -4.92612133e-03, 9.91953164e-02, 1.00235650e-02, -2.22671125e-02,\n", " -1.86746120e-02, 2.49281265e-02, -4.92450967e-03, 1.66887734e-02,\n", " 4.62210961e-02, 4.07794118e-02, 2.52511259e-02, -2.83305068e-02,\n", " -2.78001893e-02, -1.69764105e-02, 1.79186705e-02, 1.09842177e-02,\n", " 1.09969089e-02, 1.69700030e-02, -8.59475043e-03, 4.70476560e-02,\n", " 3.64770554e-02, 2.09835749e-02, 1.01236468e-02, 2.75151283e-02,\n", " 4.33402918e-02, -4.30559181e-02, -3.53547297e-02, 7.77268112e-02,\n", " -6.10819347e-02, -2.86280159e-02, 4.68054451e-02, 1.29892454e-02,\n", " -1.71940885e-02, -2.52429228e-02, 3.86423096e-02, -1.35919163e-02,\n", " -5.27431667e-02, 6.45831088e-03, 2.96409409e-02, 5.97442053e-02,\n", " 3.23252901e-02, 5.03172688e-02, -4.45654802e-02, 2.90075876e-02,\n", " -1.35373492e-02, 6.78209821e-03, -5.89249916e-02, 4.28890549e-02,\n", " -2.36034058e-02, -5.30969724e-03, 3.85405980e-02, -1.82616734e-03,\n", " 1.45543357e-02, 1.07806427e-02, -6.06855676e-02, -4.95252907e-02,\n", " 1.02004781e-02, 4.60227691e-02, -1.08090881e-02, 4.42408510e-02,\n", " 4.15152796e-02, 1.23609398e-02, 5.11957100e-03, 1.17597533e-02,\n", " -2.70090066e-02, 2.68773828e-02, -1.97812133e-02, 2.25932393e-02,\n", " -1.33560598e-02, -1.50896851e-02, -3.14053567e-03, 1.54051669e-02,\n", " 1.86488125e-02, -1.71708278e-02, -3.95283476e-03, 7.68053811e-04,\n", " -2.37891261e-04, 1.84722953e-02, 3.60381305e-02, -5.85213909e-03,\n", " 4.44293395e-02, -1.11264118e-03, -4.79441285e-02, 3.46464328e-02,\n", " -2.53370814e-02, -3.26901935e-02, -2.28975322e-02, -1.96164921e-02,\n", " -4.38152434e-04, 4.08602282e-02, -2.29470823e-02, -1.89938806e-02,\n", " -1.52037974e-04, 1.05516789e-02, 2.08601039e-02, -6.98119551e-02,\n", " 3.66246551e-02, -1.26779894e-03, -4.03217562e-02, -5.35424761e-02,\n", " 6.51817098e-02, 4.29646857e-02, 2.56071109e-02, -3.28080021e-02,\n", " 1.20534413e-02, 3.56224040e-03, -1.01593453e-02, -1.96505673e-04,\n", " 4.33485657e-02, -4.25680764e-02, 9.73126665e-03, 3.76882474e-03,\n", " -1.40319867e-02, -3.63940969e-02, -3.09983976e-02, -4.19548260e-33,\n", " 7.11604580e-02, 4.78382297e-02, 1.89297704e-03, -1.60731785e-02,\n", " 2.53787991e-02, -3.15741785e-02, -4.27713171e-02, -7.53164338e-03,\n", " 1.68679946e-03, 1.92391127e-02, -2.20667192e-04, 1.32907527e-02,\n", " 5.99487219e-03, 2.75156219e-02, -5.06000873e-03, -3.58465910e-02,\n", " 8.20948277e-03, -2.11624149e-02, -7.07996823e-03, -4.23992332e-03,\n", " -1.09853260e-01, -3.66037302e-02, 3.55480015e-02, 4.23291475e-02,\n", " 1.48312682e-02, 5.68749309e-02, 3.57767567e-02, 1.40728084e-02,\n", " -4.00471613e-02, 1.01988176e-02, 2.83056553e-02, -1.55737845e-03,\n", " 1.24238459e-02, 1.20237898e-02, -7.69484974e-03, -3.30727436e-02,\n", " -1.45808076e-02, 3.43246050e-02, 3.21143419e-02, -4.96741422e-02,\n", " -5.27968369e-02, 2.51889303e-02, -1.11904610e-02, 5.64832352e-02,\n", " 2.77636852e-02, 5.90689071e-02, -2.61273161e-02, -6.95008039e-02,\n", " -3.15576978e-02, -5.62214339e-03, -7.93884136e-03, -3.62196900e-02,\n", " -8.26047733e-03, 8.05249214e-02, -4.16241921e-02, -2.01846119e-02,\n", " -2.52235290e-02, -3.88054736e-02, -2.00710595e-02, 1.50789914e-03,\n", " -5.51338419e-02, -8.35673045e-03, -1.61523875e-02, -8.79513845e-02,\n", " -5.28004877e-02, -2.88654189e-03, -1.11697149e-02, 7.10910782e-02,\n", " 4.44932319e-02, 8.69598426e-03, -1.14432694e-02, 4.47212979e-02,\n", " 2.70624813e-02, -3.86100151e-02, -3.07358261e-02, 2.75634117e-02,\n", " 1.48464069e-02, -1.00845508e-02, 6.45884350e-02, 4.28387662e-03,\n", " 8.05836394e-02, -1.69498641e-02, 4.44465503e-02, -2.09145956e-02,\n", " -3.37407738e-02, 3.85780074e-02, -7.44559616e-02, 1.17512364e-02,\n", " 1.01964204e-02, -3.02421930e-03, 4.80608828e-02, -1.49494391e-02,\n", " 2.54592765e-02, -1.46158040e-02, 5.46646416e-02, 1.43051194e-03,\n", " 2.99116820e-02, 2.24273186e-02, -5.79927117e-03, -1.33864526e-02,\n", " -2.52460372e-02, -2.69225910e-02, 1.64003875e-02, 1.20901112e-02,\n", " 3.38429734e-02, -2.11539529e-02, 7.17787817e-02, -7.78904185e-02,\n", " -4.04084288e-02, 4.90567498e-02, -2.61603445e-02, 1.97753590e-02,\n", " 4.97209951e-02, -4.88655381e-02, -4.52128090e-02, 3.63065898e-02,\n", " 2.68440694e-02, 3.29160057e-02, -8.24410375e-03, -1.33646047e-02,\n", " -6.22822754e-02, -1.13362661e-02, -3.79339382e-02, -6.56360280e-05,\n", " -1.08087100e-02, 2.67575700e-02, 1.33866509e-02, 5.89998253e-02,\n", " -2.54666172e-02, -3.05371322e-02, -1.53249800e-02, -9.87035502e-03,\n", " 1.95337094e-07, -1.76476724e-02, 5.71432859e-02, -2.49180794e-02,\n", " 5.85253723e-02, 4.49808314e-02, -5.99673577e-02, -9.97425616e-03,\n", " 4.07801419e-02, 4.13940698e-02, 2.55707726e-02, 2.18985360e-02,\n", " -3.04434425e-03, -3.77355106e-02, -6.24866784e-02, -1.17468778e-02,\n", " -4.82194684e-02, -7.78659210e-02, -1.48841189e-02, -1.75396129e-02,\n", " -2.48471629e-02, 8.05181568e-04, -4.85844910e-03, -5.16015477e-03,\n", " 7.53483502e-03, -9.46175400e-03, -2.39896346e-02, -3.14654633e-02,\n", " 1.50111094e-02, -1.22348899e-02, 3.00448518e-02, 3.55701670e-02,\n", " 3.08971256e-02, 1.72299352e-02, 5.93419448e-02, -5.74274361e-02,\n", " -8.16087723e-02, -4.80572283e-02, -2.68838424e-02, -1.96331330e-02,\n", " -9.15831141e-03, 1.07509056e-02, 2.35639680e-02, -2.62569580e-02,\n", " 9.21937004e-02, 1.37132118e-02, -1.19096776e-02, -4.09874134e-02,\n", " 3.37628126e-02, -4.64820908e-03, -2.50304434e-02, 6.25852346e-02,\n", " -1.24449311e-02, 3.82654071e-02, -2.35330854e-02, 8.68125912e-03,\n", " 5.08641489e-02, 2.53822445e-03, 5.25634140e-02, 1.14882430e-02,\n", " 5.01894541e-02, -3.55215147e-02, -3.31749097e-02, -3.02003417e-03,\n", " -5.36288768e-02, -2.80938316e-02, -7.51279444e-02, -4.71623316e-02,\n", " 9.56887701e-35, 2.55127084e-02, -1.44770980e-04, 1.96710341e-02,\n", " -1.33620016e-02, -1.51910949e-02, -3.28495577e-02, -1.52465852e-03,\n", " -2.65272055e-02, -4.35708016e-02, -1.75950192e-02, -2.20594816e-02],\n", " dtype=float32)" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "search_query = \"clinical trials related to alzheimers\"\n", "vec = encoder.encode(search_query)\n", "print(vec.shape)\n", "vec" ] }, { "cell_type": "code", "execution_count": 109, "id": "613fd415-4194-45e6-b9f3-9a7707845ad5", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1, 768)\n" ] }, { "data": { "text/plain": [ "array([[-8.82369652e-03, 8.50650743e-02, 2.08267733e-03,\n", " 6.77651772e-03, -2.86661759e-02, -8.71188380e-03,\n", " 6.99447095e-02, 5.04214764e-02, 3.58386151e-02,\n", " 5.29594952e-03, -1.40875215e-02, 1.99297220e-02,\n", " 2.27009598e-03, 2.10810862e-02, 2.66138893e-02,\n", " 1.90623086e-02, 4.44708914e-02, 2.96202525e-02,\n", " 5.42085357e-02, -2.34859088e-03, -9.87798795e-02,\n", " -5.00183590e-02, -3.42465192e-02, 2.08440255e-02,\n", " 5.31156994e-02, -1.37044629e-02, 2.92537250e-02,\n", " -2.61334293e-02, -1.21854078e-04, -2.36813519e-02,\n", " -3.81283499e-02, -1.79494768e-02, -6.29265187e-03,\n", " 1.27150817e-02, 1.19849676e-06, -7.78729608e-03,\n", " -1.28973828e-04, 4.01791967e-02, 4.21229303e-02,\n", " -8.72302521e-03, 7.44823692e-03, 7.68032745e-02,\n", " 6.50246907e-03, 3.40298638e-02, -1.80711355e-02,\n", " -2.71878559e-02, 5.74751608e-02, 3.67745496e-02,\n", " -3.34868580e-02, 1.05205458e-02, 2.08975170e-02,\n", " 4.36686277e-02, 3.47612537e-02, -4.99080680e-02,\n", " 4.44446988e-02, 5.57280704e-03, -2.31200755e-02,\n", " -4.60692644e-02, -1.39789237e-02, -3.79957110e-02,\n", " 4.67903316e-02, 1.91651955e-02, -5.12171052e-02,\n", " 2.46807020e-02, -5.52081019e-02, 3.50596346e-02,\n", " -7.01438356e-03, -3.36519890e-02, -1.41502097e-02,\n", " -1.37693482e-02, 4.11427952e-02, 6.94046309e-03,\n", " -1.30138136e-02, 5.91567121e-02, 3.37168351e-02,\n", " 3.01467292e-02, -4.59552221e-02, 1.37365120e-03,\n", " 1.00179566e-02, 6.98126853e-04, 3.58139984e-02,\n", " 1.18174301e-02, 1.33722462e-02, -1.35893077e-02,\n", " 4.75908853e-02, 5.48331346e-03, -6.41460950e-03,\n", " -1.23906611e-02, 5.82688041e-02, -1.60842277e-02,\n", " -2.95833423e-04, 6.97355811e-03, 2.48331465e-02,\n", " -2.35959496e-02, -1.24989869e-02, -1.36585534e-02,\n", " 1.52637456e-02, 7.01832073e-03, 5.50601333e-02,\n", " 4.35096538e-03, 2.36319732e-02, -1.38118947e-02,\n", " -7.24233836e-02, 9.39742289e-03, -2.66901590e-02,\n", " 2.96042152e-02, 1.28761679e-02, 2.23339219e-02,\n", " 3.08373477e-03, 7.12765753e-02, 7.13613164e-03,\n", " 3.62721197e-02, -4.53250594e-02, 2.54001115e-02,\n", " -2.54253373e-02, -1.23151275e-03, -1.34750446e-02,\n", " -2.70653702e-02, -1.02220355e-02, 2.07683407e-02,\n", " -7.31003610e-03, 2.65329964e-02, -2.79857730e-03,\n", " 4.20840643e-02, 3.20205763e-02, -1.19518824e-02,\n", " -5.77116087e-02, 9.88688134e-03, 1.86814573e-02,\n", " -5.10204993e-02, -6.77110278e-04, 9.40234493e-03,\n", " -3.33383717e-02, -5.52933291e-02, 5.64148054e-02,\n", " 4.92153503e-02, 3.33690383e-02, -3.92963700e-02,\n", " -6.91099390e-02, 3.79911740e-03, -1.74410697e-02,\n", " -1.60171147e-02, 4.89675067e-02, 2.67119659e-03,\n", " 2.61192098e-02, -2.74193864e-02, 6.92490395e-03,\n", " -4.64810384e-03, -8.99862905e-04, 1.02159111e-02,\n", " -4.81114909e-02, 1.22787328e-02, -9.32844076e-03,\n", " -2.00431682e-02, -1.36102587e-02, -3.67914373e-03,\n", " -1.60810221e-02, -2.20200215e-02, 2.32051890e-02,\n", " -5.07331975e-02, -1.01248249e-02, 5.62567115e-02,\n", " -2.60966737e-03, 9.27545596e-03, 5.32410555e-02,\n", " 4.81746234e-02, -9.83138476e-03, 1.81230865e-02,\n", " -2.12969314e-02, 9.82244611e-02, -2.47648880e-02,\n", " 7.06253499e-02, 8.71159416e-03, -2.73140483e-02,\n", " 5.59884915e-03, -2.14829091e-02, -6.67077005e-02,\n", " 2.48677693e-02, -8.29503238e-02, -7.96182230e-02,\n", " 3.77488993e-02, -1.37352264e-02, -2.85069812e-02,\n", " 1.81708820e-02, -4.07746173e-02, -4.71230270e-03,\n", " -1.59605164e-02, -1.25815195e-03, -6.59954594e-03,\n", " -1.51611334e-02, 7.87123516e-02, -4.09705602e-02,\n", " 3.07933297e-02, 1.27626080e-02, -4.34489138e-02,\n", " 9.91576444e-03, 1.25470785e-02, -8.67356583e-02,\n", " 1.26097840e-03, 3.24709825e-02, -6.92409948e-02,\n", " -4.35011238e-02, -2.79313605e-02, -3.37213017e-02,\n", " -2.35359464e-02, -2.95022167e-02, 2.88009271e-02,\n", " -3.26618887e-02, 7.09307985e-03, 3.09435464e-03,\n", " -5.09097055e-02, 3.54242921e-02, 5.37336655e-02,\n", " 1.55867739e-02, 2.09988486e-02, -4.38529663e-02,\n", " 2.93767708e-03, 2.27999203e-02, 1.02668423e-02,\n", " 3.35033536e-02, -8.28316063e-02, -4.17127199e-02,\n", " -1.23034064e-02, 2.38543525e-02, -3.72257493e-02,\n", " 2.97443867e-02, 3.35034318e-02, -5.21336049e-02,\n", " 5.74519299e-03, 2.89844945e-02, -2.21337453e-02,\n", " 2.34603398e-02, 6.33142609e-03, -2.24104542e-02,\n", " 1.47326495e-02, 1.98041964e-02, 3.05697713e-02,\n", " -9.37094465e-02, -6.84579164e-02, 4.63523576e-03,\n", " 3.88860740e-02, -3.97440195e-02, -4.70216498e-02,\n", " 1.02172708e-02, -3.37972888e-03, -8.54947045e-03,\n", " 4.81354557e-02, 4.99849804e-02, 7.11378129e-03,\n", " -2.54327375e-02, -1.14872465e-02, -3.54485810e-02,\n", " 5.24284095e-02, 2.16708388e-02, -4.00698110e-02,\n", " 5.15380092e-02, -6.03203699e-02, -6.50304696e-03,\n", " -1.03860423e-02, -7.47132823e-02, 3.59848235e-03,\n", " -4.68364358e-02, -4.23019789e-02, -1.86387468e-02,\n", " -2.88047381e-02, -2.81904116e-02, 1.52729014e-02,\n", " -1.55570190e-02, 1.34619148e-02, 2.34364290e-02,\n", " 3.10326237e-02, -4.70464528e-02, -2.43550166e-02,\n", " -7.20657408e-03, -1.16065536e-02, -3.42444591e-02,\n", " -5.30204549e-03, 5.52049950e-02, 4.50828709e-02,\n", " -7.30262510e-03, 5.56289777e-02, -9.46066808e-03,\n", " -3.37345451e-02, -1.87659152e-02, 3.57284099e-02,\n", " 4.20488343e-02, 1.66770478e-03, -5.27675785e-02,\n", " 2.96422077e-04, 4.22447585e-02, 4.97253910e-02,\n", " 6.03130311e-02, 1.32281650e-02, 2.35939436e-02,\n", " -1.59284715e-02, 4.46444489e-02, -1.68315917e-02,\n", " 1.34740606e-01, -3.54593806e-02, 4.79029641e-02,\n", " 8.99049267e-03, 4.74606343e-02, 6.70041004e-03,\n", " -1.15184486e-03, 2.69540539e-03, -2.77549177e-02,\n", " -1.33260442e-02, 2.60788556e-02, 4.35438640e-02,\n", " -2.55859867e-02, 2.76670083e-02, 3.37177999e-02,\n", " 2.93240137e-02, 1.82274636e-03, -1.40310880e-02,\n", " -1.91633645e-02, 1.18790809e-02, -4.65121269e-02,\n", " -4.19883654e-02, -2.69681774e-02, -3.23035605e-02,\n", " -6.84630498e-02, 6.26784265e-02, 1.37511576e-02,\n", " -2.55833156e-02, -5.73152229e-02, 3.30126472e-02,\n", " -7.90146552e-03, -1.08651863e-02, 1.10474667e-02,\n", " 3.03509296e-03, 1.55274626e-02, 1.05599947e-02,\n", " -7.16960803e-03, -5.01419827e-02, -3.34469602e-02,\n", " 3.77239436e-02, 9.44003314e-02, -4.80610691e-02,\n", " 4.73537892e-02, 3.40655483e-02, 7.88806472e-03,\n", " -2.84915343e-02, 7.96849206e-02, 1.57442074e-02,\n", " -4.15650755e-02, 7.51048513e-03, 3.66957486e-02,\n", " -1.72730908e-01, -8.72075930e-02, 2.86346450e-02,\n", " 2.16962174e-02, -4.80199270e-02, 6.49317261e-03,\n", " 1.67240556e-02, -2.56227311e-02, 2.19670162e-02,\n", " -6.10647202e-02, -2.65449155e-02, 6.17929082e-03,\n", " -2.89566331e-02, 1.19498251e-02, -2.33849231e-02,\n", " -2.69133616e-02, -1.46602485e-02, 1.18886270e-02,\n", " 1.64973717e-02, -3.90495770e-02, -3.45575088e-03,\n", " 5.12249060e-02, -8.63745401e-04, 5.59820198e-02,\n", " 2.10017413e-02, 2.74998210e-02, 3.03551817e-04,\n", " -1.15796946e-01, -4.66962112e-03, -4.80118394e-02,\n", " -3.55160870e-02, -4.72528581e-03, -4.29739058e-02,\n", " -1.07347388e-02, -1.32423071e-02, -2.34632343e-02,\n", " 1.98413953e-02, -7.27679394e-03, 2.27117930e-02,\n", " -2.59338003e-02, 4.31442596e-02, 1.07885078e-02,\n", " -2.47129947e-02, -4.14506458e-02, 4.40958813e-02,\n", " 6.65106403e-04, -2.26945560e-02, -4.76796739e-02,\n", " 1.13289580e-02, -5.57265691e-02, 1.71151303e-03,\n", " -1.24145029e-02, -3.57853901e-03, -4.86295968e-02,\n", " -5.14956787e-02, 4.79425713e-02, -3.24050151e-02,\n", " 7.39779174e-02, 2.67242044e-02, 1.16365692e-02,\n", " 8.20766483e-03, -6.27530292e-02, -1.30661400e-02,\n", " -3.52081768e-02, 4.83807474e-02, 9.81860235e-03,\n", " 1.14539362e-01, -1.88471414e-02, 6.07751869e-02,\n", " -1.75345445e-03, 3.13236266e-02, -1.94595556e-03,\n", " 2.64345529e-03, 3.07400171e-02, -4.31060083e-02,\n", " -6.19985871e-02, 5.50477020e-03, 1.62547994e-02,\n", " -8.26352183e-03, 7.56437238e-03, -4.79784003e-03,\n", " 6.93615247e-03, 3.59064825e-02, 2.08517518e-02,\n", " 1.41595434e-02, 5.31185642e-02, 6.78585656e-03,\n", " 6.56357184e-02, -5.06135784e-02, -3.05179805e-02,\n", " 7.06539825e-02, -3.55644710e-02, -4.92612133e-03,\n", " 9.91953164e-02, 1.00235650e-02, -2.22671125e-02,\n", " -1.86746120e-02, 2.49281265e-02, -4.92450967e-03,\n", " 1.66887734e-02, 4.62210961e-02, 4.07794118e-02,\n", " 2.52511259e-02, -2.83305068e-02, -2.78001893e-02,\n", " -1.69764105e-02, 1.79186705e-02, 1.09842177e-02,\n", " 1.09969089e-02, 1.69700030e-02, -8.59475043e-03,\n", " 4.70476560e-02, 3.64770554e-02, 2.09835749e-02,\n", " 1.01236468e-02, 2.75151283e-02, 4.33402918e-02,\n", " -4.30559181e-02, -3.53547297e-02, 7.77268112e-02,\n", " -6.10819347e-02, -2.86280159e-02, 4.68054451e-02,\n", " 1.29892454e-02, -1.71940885e-02, -2.52429228e-02,\n", " 3.86423096e-02, -1.35919163e-02, -5.27431667e-02,\n", " 6.45831088e-03, 2.96409409e-02, 5.97442053e-02,\n", " 3.23252901e-02, 5.03172688e-02, -4.45654802e-02,\n", " 2.90075876e-02, -1.35373492e-02, 6.78209821e-03,\n", " -5.89249916e-02, 4.28890549e-02, -2.36034058e-02,\n", " -5.30969724e-03, 3.85405980e-02, -1.82616734e-03,\n", " 1.45543357e-02, 1.07806427e-02, -6.06855676e-02,\n", " -4.95252907e-02, 1.02004781e-02, 4.60227691e-02,\n", " -1.08090881e-02, 4.42408510e-02, 4.15152796e-02,\n", " 1.23609398e-02, 5.11957100e-03, 1.17597533e-02,\n", " -2.70090066e-02, 2.68773828e-02, -1.97812133e-02,\n", " 2.25932393e-02, -1.33560598e-02, -1.50896851e-02,\n", " -3.14053567e-03, 1.54051669e-02, 1.86488125e-02,\n", " -1.71708278e-02, -3.95283476e-03, 7.68053811e-04,\n", " -2.37891261e-04, 1.84722953e-02, 3.60381305e-02,\n", " -5.85213909e-03, 4.44293395e-02, -1.11264118e-03,\n", " -4.79441285e-02, 3.46464328e-02, -2.53370814e-02,\n", " -3.26901935e-02, -2.28975322e-02, -1.96164921e-02,\n", " -4.38152434e-04, 4.08602282e-02, -2.29470823e-02,\n", " -1.89938806e-02, -1.52037974e-04, 1.05516789e-02,\n", " 2.08601039e-02, -6.98119551e-02, 3.66246551e-02,\n", " -1.26779894e-03, -4.03217562e-02, -5.35424761e-02,\n", " 6.51817098e-02, 4.29646857e-02, 2.56071109e-02,\n", " -3.28080021e-02, 1.20534413e-02, 3.56224040e-03,\n", " -1.01593453e-02, -1.96505673e-04, 4.33485657e-02,\n", " -4.25680764e-02, 9.73126665e-03, 3.76882474e-03,\n", " -1.40319867e-02, -3.63940969e-02, -3.09983976e-02,\n", " -4.19548260e-33, 7.11604580e-02, 4.78382297e-02,\n", " 1.89297704e-03, -1.60731785e-02, 2.53787991e-02,\n", " -3.15741785e-02, -4.27713171e-02, -7.53164338e-03,\n", " 1.68679946e-03, 1.92391127e-02, -2.20667192e-04,\n", " 1.32907527e-02, 5.99487219e-03, 2.75156219e-02,\n", " -5.06000873e-03, -3.58465910e-02, 8.20948277e-03,\n", " -2.11624149e-02, -7.07996823e-03, -4.23992332e-03,\n", " -1.09853260e-01, -3.66037302e-02, 3.55480015e-02,\n", " 4.23291475e-02, 1.48312682e-02, 5.68749309e-02,\n", " 3.57767567e-02, 1.40728084e-02, -4.00471613e-02,\n", " 1.01988176e-02, 2.83056553e-02, -1.55737845e-03,\n", " 1.24238459e-02, 1.20237898e-02, -7.69484974e-03,\n", " -3.30727436e-02, -1.45808076e-02, 3.43246050e-02,\n", " 3.21143419e-02, -4.96741422e-02, -5.27968369e-02,\n", " 2.51889303e-02, -1.11904610e-02, 5.64832352e-02,\n", " 2.77636852e-02, 5.90689071e-02, -2.61273161e-02,\n", " -6.95008039e-02, -3.15576978e-02, -5.62214339e-03,\n", " -7.93884136e-03, -3.62196900e-02, -8.26047733e-03,\n", " 8.05249214e-02, -4.16241921e-02, -2.01846119e-02,\n", " -2.52235290e-02, -3.88054736e-02, -2.00710595e-02,\n", " 1.50789914e-03, -5.51338419e-02, -8.35673045e-03,\n", " -1.61523875e-02, -8.79513845e-02, -5.28004877e-02,\n", " -2.88654189e-03, -1.11697149e-02, 7.10910782e-02,\n", " 4.44932319e-02, 8.69598426e-03, -1.14432694e-02,\n", " 4.47212979e-02, 2.70624813e-02, -3.86100151e-02,\n", " -3.07358261e-02, 2.75634117e-02, 1.48464069e-02,\n", " -1.00845508e-02, 6.45884350e-02, 4.28387662e-03,\n", " 8.05836394e-02, -1.69498641e-02, 4.44465503e-02,\n", " -2.09145956e-02, -3.37407738e-02, 3.85780074e-02,\n", " -7.44559616e-02, 1.17512364e-02, 1.01964204e-02,\n", " -3.02421930e-03, 4.80608828e-02, -1.49494391e-02,\n", " 2.54592765e-02, -1.46158040e-02, 5.46646416e-02,\n", " 1.43051194e-03, 2.99116820e-02, 2.24273186e-02,\n", " -5.79927117e-03, -1.33864526e-02, -2.52460372e-02,\n", " -2.69225910e-02, 1.64003875e-02, 1.20901112e-02,\n", " 3.38429734e-02, -2.11539529e-02, 7.17787817e-02,\n", " -7.78904185e-02, -4.04084288e-02, 4.90567498e-02,\n", " -2.61603445e-02, 1.97753590e-02, 4.97209951e-02,\n", " -4.88655381e-02, -4.52128090e-02, 3.63065898e-02,\n", " 2.68440694e-02, 3.29160057e-02, -8.24410375e-03,\n", " -1.33646047e-02, -6.22822754e-02, -1.13362661e-02,\n", " -3.79339382e-02, -6.56360280e-05, -1.08087100e-02,\n", " 2.67575700e-02, 1.33866509e-02, 5.89998253e-02,\n", " -2.54666172e-02, -3.05371322e-02, -1.53249800e-02,\n", " -9.87035502e-03, 1.95337094e-07, -1.76476724e-02,\n", " 5.71432859e-02, -2.49180794e-02, 5.85253723e-02,\n", " 4.49808314e-02, -5.99673577e-02, -9.97425616e-03,\n", " 4.07801419e-02, 4.13940698e-02, 2.55707726e-02,\n", " 2.18985360e-02, -3.04434425e-03, -3.77355106e-02,\n", " -6.24866784e-02, -1.17468778e-02, -4.82194684e-02,\n", " -7.78659210e-02, -1.48841189e-02, -1.75396129e-02,\n", " -2.48471629e-02, 8.05181568e-04, -4.85844910e-03,\n", " -5.16015477e-03, 7.53483502e-03, -9.46175400e-03,\n", " -2.39896346e-02, -3.14654633e-02, 1.50111094e-02,\n", " -1.22348899e-02, 3.00448518e-02, 3.55701670e-02,\n", " 3.08971256e-02, 1.72299352e-02, 5.93419448e-02,\n", " -5.74274361e-02, -8.16087723e-02, -4.80572283e-02,\n", " -2.68838424e-02, -1.96331330e-02, -9.15831141e-03,\n", " 1.07509056e-02, 2.35639680e-02, -2.62569580e-02,\n", " 9.21937004e-02, 1.37132118e-02, -1.19096776e-02,\n", " -4.09874134e-02, 3.37628126e-02, -4.64820908e-03,\n", " -2.50304434e-02, 6.25852346e-02, -1.24449311e-02,\n", " 3.82654071e-02, -2.35330854e-02, 8.68125912e-03,\n", " 5.08641489e-02, 2.53822445e-03, 5.25634140e-02,\n", " 1.14882430e-02, 5.01894541e-02, -3.55215147e-02,\n", " -3.31749097e-02, -3.02003417e-03, -5.36288768e-02,\n", " -2.80938316e-02, -7.51279444e-02, -4.71623316e-02,\n", " 9.56887701e-35, 2.55127084e-02, -1.44770980e-04,\n", " 1.96710341e-02, -1.33620016e-02, -1.51910949e-02,\n", " -3.28495577e-02, -1.52465852e-03, -2.65272055e-02,\n", " -4.35708016e-02, -1.75950192e-02, -2.20594816e-02]], dtype=float32)" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "\n", "svec = np.array(vec).reshape(1, -1)\n", "print(svec.shape)\n", "svec" ] }, { "cell_type": "code", "execution_count": 110, "id": "fef30d70-6958-4259-abb6-09f8c1870a2b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[0.7731663 0.79433584]] [[330 331]]\n" ] } ], "source": [ "distances, I = index.search(svec, k=2)\n", "print(distances, I)" ] }, { "cell_type": "code", "execution_count": 111, "id": "eb00598c-9799-4697-b2a3-356bb5aae0f1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
desease_conditiontext
330['alzheimer disease', 'dementia', 'brain disea...nct_id: NCT02164643\\nsummary: A Multicenter na...
331['alzheimer disease', 'dementia', 'brain disea...nct_id: NCT02164643\\nsummary: A Multicenter na...
\n", "
" ], "text/plain": [ " desease_condition \\\n", "330 ['alzheimer disease', 'dementia', 'brain disea... \n", "331 ['alzheimer disease', 'dementia', 'brain disea... \n", "\n", " text \n", "330 nct_id: NCT02164643\\nsummary: A Multicenter na... \n", "331 nct_id: NCT02164643\\nsummary: A Multicenter na... " ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = df.iloc[I[0]]\n", "df2" ] }, { "cell_type": "code", "execution_count": 113, "id": "af5bf8e2-43b6-47af-affa-5111789371ad", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'nct_id: NCT02164643\\nsummary: A Multicenter national longitudinal cohort study including at least 800 individuals consecutively recruited from French Research Memory Centers and followed-up over 24 month and included in Memento.\\nintervention_type: Drug\\nintervention_name: Florbetapir (18F)\\nintervention_description: nan\\nkeywords: [\"Alzheimer\\'s disease\", \\'Mild Cognitive Impairment\\']'" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.iloc[1].text" ] }, { "cell_type": "code", "execution_count": null, "id": "f3899f81-e120-475c-97ed-080cb7f46510", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 5 }