{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import numpy as np\n",
    "import os\n",
    "import pandas as pd\n",
    "import sys\n",
    "\n",
    "from langchain.document_loaders.csv_loader import CSVLoader\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain.embeddings import HuggingFaceEmbeddings\n",
    "from langchain.vectorstores import Chroma\n",
    "from langchain.retrievers.tfidf import TFIDFRetriever\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'d:\\\\Projects\\\\information-retrieval'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Change the current working directory to the package root, i.e. the parent\n",
    "# of the current working directory.\n",
    "# This step is needed because of the way settings.py is defined.\n",
    "# NOTE: every execution of this cell moves one level further up, so restart\n",
    "# the kernel before running it again.\n",
    "# os.path.dirname replaces the manual split on backslashes so the cell also\n",
    "# works on platforms whose path separator is not a backslash.\n",
    "root_path = os.path.dirname(os.getcwd())\n",
    "os.chdir(root_path)\n",
    "os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# User parameters\n",
    "\n",
    "# Document Loading\n",
    "csv_data_folder = os.path.join(os.getcwd(), \"data\", \"02_intermediate\")\n",
    "\n",
    "# Document Splitting\n",
    "chunk_size = 1000\n",
    "chunk_overlap = 100\n",
    "# The sentence-boundary pattern is a raw string so that the backslash is not\n",
    "# parsed as an (invalid) string escape; the resulting value is unchanged.\n",
    "# NOTE(review): whether the lookbehind is applied as a regex depends on the\n",
    "# installed langchain version (newer releases need is_separator_regex=True) -\n",
    "# confirm.\n",
    "separators = [\"\\n\\n\", \"\\n\", r\"(?<=\\. )\", \" \", \"\"]\n",
    "\n",
    "# Text Embedding and Vector Store\n",
    "model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
    "embedding_persist_folder = os.path.join(os.getcwd(), \"data\", \"04_feature\", \"chroma\")\n",
    "\n",
    "# Retrieval\n",
    "model_folder = os.path.join(os.getcwd(), \"data\", \"06_models\")\n",
    "num_contexts_retrievals = 3\n",
    "\n",
    "# Evaluation\n",
    "raw_data_folder = os.path.join(os.getcwd(), \"data\", \"01_raw\")\n",
    "raw_data_file = \"ds_nlp_challenge_500samples.csv\"\n",
    "results_folder = os.path.join(os.getcwd(), \"data\", \"07_model_output\")\n",
    "results_file = \"ds_nlp_challenge_500samples_results.csv\"\n",
    "report_folder = os.path.join(os.getcwd(), \"data\", \"08_reporting\")\n",
    "report_file = \"retrieval_metrics_report.json\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Document Loading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the CSV files in the intermediate data folder.\n",
    "# Match on the file extension (a substring test would also pick up names such\n",
    "# as \"x.csv.bak\") and sort so the load order does not depend on the OS.\n",
    "csv_data_files = sorted(\n",
    "    file for file in os.listdir(csv_data_folder) if file.lower().endswith(\".csv\")\n",
    ")\n",
    "\n",
    "# Load every CSV file with CSVLoader and gather the documents in one list\n",
    "docs = []\n",
    "for csv_data_file in csv_data_files:\n",
    "    csv_data_path = os.path.join(csv_data_folder, csv_data_file)\n",
    "    loader = CSVLoader(file_path=csv_data_path, encoding=\"utf8\")\n",
    "    docs.extend(loader.load())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "500"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of documents produced by the CSV loader\n",
    "len(docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Document(page_content='context: The Premier League sells its television rights on a collective basis. This is in contrast to some other European Leagues, including La Liga, in which each club sells its rights individually, leading to a much higher share of the total income going to the top few clubs. The money is divided into three parts: half is divided equally between the clubs; one quarter is awarded on a merit basis based on final league position, the top club getting twenty times as much as the bottom club, and equal steps all the way down the table; the final quarter is paid out as facilities fees for games that are shown on television, with the top clubs generally receiving the largest shares of this. The income from overseas rights is divided equally between the twenty clubs.', metadata={'source': 'd:\\\\Projects\\\\information-retrieval\\\\data\\\\02_intermediate\\\\ds_nlp_challenge_500samples_contexts.csv', 'row': 0})"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the first loaded document (its page content and metadata)\n",
    "docs[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Document Splitting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chunk the loaded documents with a recursive character splitter that is\n",
    "# configured from the user parameters.\n",
    "\n",
    "splitter_kwargs = {\n",
    "    \"chunk_size\": chunk_size,\n",
    "    \"chunk_overlap\": chunk_overlap,\n",
    "    \"separators\": separators,\n",
    "}\n",
    "text_splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)\n",
    "\n",
    "splits = text_splitter.split_documents(docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "597"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of chunks obtained from the loaded documents\n",
    "len(splits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='context: The Premier League sells its television rights on a collective basis. This is in contrast to some other European Leagues, including La Liga, in which each club sells its rights individually, leading to a much higher share of the total income going to the top few clubs. The money is divided into three parts: half is divided equally between the clubs; one quarter is awarded on a merit basis based on final league position, the top club getting twenty times as much as the bottom club, and equal steps all the way down the table; the final quarter is paid out as facilities fees for games that are shown on television, with the top clubs generally receiving the largest shares of this. The income from overseas rights is divided equally between the twenty clubs.', metadata={'source': 'd:\\\\Projects\\\\information-retrieval\\\\data\\\\02_intermediate\\\\ds_nlp_challenge_500samples_contexts.csv', 'row': 0}),\n",
       " Document(page_content='context: Between the third and fourth sessions the pope announced reforms in the areas of Roman Curia, revision of Canon Law, regulations for mixed marriages involving several faiths, and birth control issues. He opened the final session of the council, concelebrating with bishops from countries where the Church was persecuted. Several texts proposed for his approval had to be changed. But all texts were finally agreed upon. The Council was concluded on 8 December 1965, the Feast of the Immaculate Conception.', metadata={'source': 'd:\\\\Projects\\\\information-retrieval\\\\data\\\\02_intermediate\\\\ds_nlp_challenge_500samples_contexts.csv', 'row': 1})]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the first two chunks\n",
    "splits[:2]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Text Embedding and Vector Store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Instantiate the pretrained text embedding model named in the user\n",
    "# parameters: it is placed on the CPU and no normalization of the embeddings\n",
    "# is requested at encoding time.\n",
    "\n",
    "model_kwargs = dict(device=\"cpu\")\n",
    "encode_kwargs = dict(normalize_embeddings=False)\n",
    "embedding = HuggingFaceEmbeddings(\n",
    "    model_name=model_name,\n",
    "    model_kwargs=model_kwargs,\n",
    "    encode_kwargs=encode_kwargs,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Semantic similarity between sentences 1 and 2: 0.8981182456324139\n",
      "Semantic similarity between sentences 2 and 3: 0.005847679808422499\n"
     ]
    }
   ],
   "source": [
    "# Example: score sentence pairs by the dot product of their embeddings.\n",
    "# NOTE(review): normalize_embeddings is False, so the dot product equals the\n",
    "# cosine similarity only if the model itself outputs unit-length vectors -\n",
    "# confirm.\n",
    "\n",
    "sentence1, sentence2, sentence3 = (\n",
    "    \"i like dogs\",\n",
    "    \"i like canines\",\n",
    "    \"the weather is ugly outside\",\n",
    ")\n",
    "\n",
    "embedding1, embedding2, embedding3 = [\n",
    "    embedding.embed_query(sentence)\n",
    "    for sentence in (sentence1, sentence2, sentence3)\n",
    "]\n",
    "\n",
    "similarity_12 = np.dot(embedding1, embedding2)\n",
    "similarity_23 = np.dot(embedding2, embedding3)\n",
    "print(\"Semantic similarity between sentences 1 and 2:\", similarity_12)\n",
    "print(\"Semantic similarity between sentences 2 and 3:\", similarity_23)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create text embeddings and store them in a Chroma vector database.\n",
    "# For more options, see:\n",
    "# https://python.langchain.com/docs/modules/data_connection/vectorstores/\n",
    "#\n",
    "# Building into a persist folder that already holds a collection adds the\n",
    "# chunks a second time, and the retrievers then return the same chunk more\n",
    "# than once. Reuse the persisted store when it exists and only embed the\n",
    "# chunks when it does not.\n",
    "\n",
    "if os.path.isdir(embedding_persist_folder) and os.listdir(embedding_persist_folder):\n",
    "    vectordb = Chroma(\n",
    "        persist_directory=embedding_persist_folder,\n",
    "        embedding_function=embedding,\n",
    "    )\n",
    "else:\n",
    "    vectordb = Chroma.from_documents(\n",
    "        documents=splits,\n",
    "        embedding=embedding,\n",
    "        persist_directory=embedding_persist_folder,\n",
    "    )\n",
    "    vectordb.persist()\n",
    "\n",
    "# A size mismatch means the persisted store is stale (for example built with\n",
    "# other splitting parameters, or filled twice): delete the folder and re-run.\n",
    "assert vectordb._collection.count() == len(splits), (\n",
    "    f\"Vector store holds {vectordb._collection.count()} chunks but {len(splits)} \"\n",
    "    f\"were expected; delete '{embedding_persist_folder}' and re-run this cell.\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1194"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of embeddings stored in the Chroma collection.\n",
    "# NOTE(review): this is expected to equal len(splits); a larger value likely\n",
    "# means the persist folder already held chunks from an earlier run - confirm.\n",
    "vectordb._collection.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='context: The Premier League sells its television rights on a collective basis. This is in contrast to some other European Leagues, including La Liga, in which each club sells its rights individually, leading to a much higher share of the total income going to the top few clubs. The money is divided into three parts: half is divided equally between the clubs; one quarter is awarded on a merit basis based on final league position, the top club getting twenty times as much as the bottom club, and equal steps all the way down the table; the final quarter is paid out as facilities fees for games that are shown on television, with the top clubs generally receiving the largest shares of this. The income from overseas rights is divided equally between the twenty clubs.', metadata={'row': 0, 'source': 'd:\\\\Projects\\\\information-retrieval\\\\data\\\\02_intermediate\\\\ds_nlp_challenge_500samples_contexts.csv'}),\n",
       " Document(page_content='context: The Premier League sells its television rights on a collective basis. This is in contrast to some other European Leagues, including La Liga, in which each club sells its rights individually, leading to a much higher share of the total income going to the top few clubs. The money is divided into three parts: half is divided equally between the clubs; one quarter is awarded on a merit basis based on final league position, the top club getting twenty times as much as the bottom club, and equal steps all the way down the table; the final quarter is paid out as facilities fees for games that are shown on television, with the top clubs generally receiving the largest shares of this. The income from overseas rights is divided equally between the twenty clubs.', metadata={'row': 250, 'source': 'd:\\\\Projects\\\\information-retrieval\\\\data\\\\02_intermediate\\\\ds_nlp_challenge_500samples_contexts.csv'}),\n",
       " Document(page_content='context: The Premier League sells its television rights on a collective basis. This is in contrast to some other European Leagues, including La Liga, in which each club sells its rights individually, leading to a much higher share of the total income going to the top few clubs. The money is divided into three parts: half is divided equally between the clubs; one quarter is awarded on a merit basis based on final league position, the top club getting twenty times as much as the bottom club, and equal steps all the way down the table; the final quarter is paid out as facilities fees for games that are shown on television, with the top clubs generally receiving the largest shares of this. The income from overseas rights is divided equally between the twenty clubs.', metadata={'row': 250, 'source': 'd:\\\\Projects\\\\information-retrieval\\\\data\\\\02_intermediate\\\\ds_nlp_challenge_500samples_contexts.csv'})]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Example: query the vector store directly for the three chunks that are\n",
    "# semantically closest to a sample question.\n",
    "\n",
    "example_query = \"Do European Leagues sell their television rights per a collective level?\"\n",
    "splits_sm = vectordb.similarity_search(example_query, k=3)\n",
    "\n",
    "splits_sm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4. Retrieval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample question used by the retrieval examples in this section\n",
    "question = \"Do European Leagues sell their television rights per a collective level?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_contexts(contexts, n_char=100):\n",
    "    \"\"\"Print a JSON preview of retrieved contexts, keyed by their source row.\n",
    "\n",
    "    Args:\n",
    "        contexts: Iterable of documents exposing ``metadata[\"row\"]`` and\n",
    "            ``page_content``.\n",
    "        n_char: Maximum number of leading characters of each document's\n",
    "            content to show.\n",
    "\n",
    "    Note:\n",
    "        The preview is keyed by row, so several contexts that share a row\n",
    "        collapse into a single entry (the last one wins).\n",
    "    \"\"\"\n",
    "    # Honour n_char instead of a fixed preview length.\n",
    "    context_contents = {\n",
    "        context.metadata[\"row\"]: context.page_content[:n_char]\n",
    "        for context in contexts\n",
    "    }\n",
    "    print(json.dumps(context_contents, indent=4))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.1 Semantic Similarity Search"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.1.1 Top k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a retriever on top of the vector db that holds the text embeddings;\n",
    "# it is asked for num_contexts_retrievals chunks per query.\n",
    "\n",
    "retriever_sm = vectordb.as_retriever(search_kwargs={\"k\": num_contexts_retrievals})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example: use the similarity retriever to get the relevant chunks for the\n",
    "# sample question\n",
    "\n",
    "contexts_sm = retriever_sm.get_relevant_documents(question)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "    \"0\": \"context: The Premier League sells its television rights on a collective basis. This is in contrast t\",\n",
      "    \"250\": \"context: The Premier League sells its television rights on a collective basis. This is in contrast t\"\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "# Preview the retrieved chunks; entries are keyed by source row, so chunks\n",
    "# sharing a row appear once\n",
    "print_contexts(contexts_sm)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.1.2 Score threshold retrieval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Retriever using the \"similarity_score_threshold\" search type with a\n",
    "# threshold of 0.5; show how many chunks it returns for the sample question.\n",
    "\n",
    "retriever_st = vectordb.as_retriever(\n",
    "    search_type=\"similarity_score_threshold\",\n",
    "    search_kwargs={\"score_threshold\": 0.5},\n",
    ")\n",
    "contexts_st = retriever_st.get_relevant_documents(question)\n",
    "\n",
    "len(contexts_st)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.1.3 Maximum Marginal Relevance\n",
    "\n",
    "<p align=\"center\">\n",
    "  <img src=\"../docs/images/MMR%20(Maximum%20Marginal%20Relevance)%20Algorithm.png\" alt=\"\" width=\"400\">\n",
    "</p>\n",
    "\n",
    "<center>Image source: DeepLearning.AI (2023). LangChain chat with your data, accessed September 2023, https://learn.deeplearning.ai/langchain-chat-with-your-data/lesson/5/retrieval  </center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "    \"0\": \"context: The Premier League sells its television rights on a collective basis. This is in contrast t\",\n",
      "    \"132\": \"context: The BBC domestic television channels do not broadcast advertisements; they are instead fund\",\n",
      "    \"305\": \"context: Under the 1995\\u20132004 National Hockey League collective bargaining agreement, teams were limi\",\n",
      "    \"95\": \"context: Most of the world's airports are owned by local, regional, or national government bodies wh\"\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "# MMR retrieval with the retriever's default search settings.\n",
    "# NOTE: retriever_mmr is re-created with an explicit k in the next cell.\n",
    "\n",
    "retriever_mmr = vectordb.as_retriever(search_type=\"mmr\")\n",
    "\n",
    "contexts_mmr = retriever_mmr.get_relevant_documents(question)\n",
    "\n",
    "print_contexts(contexts_mmr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "    \"0\": \"context: The Premier League sells its television rights on a collective basis. This is in contrast t\",\n",
      "    \"305\": \"context: Under the 1995\\u20132004 National Hockey League collective bargaining agreement, teams were limi\",\n",
      "    \"95\": \"context: Most of the world's airports are owned by local, regional, or national government bodies wh\"\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "# MMR retrieval limited to num_contexts_retrievals chunks; this assignment\n",
    "# replaces the retriever_mmr created in the previous cell.\n",
    "\n",
    "retriever_mmr = vectordb.as_retriever(search_type=\"mmr\", search_kwargs={\"k\": num_contexts_retrievals})\n",
    "\n",
    "contexts_mmr = retriever_mmr.get_relevant_documents(question)\n",
    "\n",
    "print_contexts(contexts_mmr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2 \"Lexical\" Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "    \"0\": \"context: The Premier League sells its television rights on a collective basis. This is in contrast t\",\n",
      "    \"250\": \"context: The Premier League sells its television rights on a collective basis. This is in contrast t\",\n",
      "    \"132\": \"context: The BBC domestic television channels do not broadcast advertisements; they are instead fund\"\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "# Create a retriever based on a TF-IDF vectorizer model built from the chunks,\n",
    "# configured with k=num_contexts_retrievals, and try it on the sample question.\n",
    "\n",
    "retriever_tfidf = TFIDFRetriever.from_documents(splits, k=num_contexts_retrievals)\n",
    "\n",
    "contexts_tfidf = retriever_tfidf.get_relevant_documents(question)\n",
    "\n",
    "print_contexts(contexts_tfidf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "    \"0\": \"context: The Premier League sells its television rights on a collective basis. This is in contrast t\",\n",
      "    \"250\": \"context: The Premier League sells its television rights on a collective basis. This is in contrast t\",\n",
      "    \"132\": \"context: The BBC domestic television channels do not broadcast advertisements; they are instead fund\"\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "# Save the retriever to model_folder and load it back as a round-trip check.\n",
    "# k is assigned again on the reloaded retriever before it is queried\n",
    "# (presumably because the saved files do not carry it - verify).\n",
    "\n",
    "retriever_tfidf.save_local(model_folder)\n",
    "retriever_tfidf = TFIDFRetriever.load_local(model_folder)\n",
    "retriever_tfidf.k = num_contexts_retrievals\n",
    "\n",
    "print_contexts(retriever_tfidf.get_relevant_documents(question))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Retrievers to evaluate, keyed by the name used to label their results\n",
    "retrievers = dict(\n",
    "    retriever_sm=retriever_sm,\n",
    "    retriever_mmr=retriever_mmr,\n",
    "    retriever_tfidf=retriever_tfidf,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.1 Retrieval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>question</th>\n",
       "      <th>context</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>497</th>\n",
       "      <td>497</td>\n",
       "      <td>Where are the large Martkirche located?</td>\n",
       "      <td>Another point of interest is the Old Town. In ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>498</th>\n",
       "      <td>498</td>\n",
       "      <td>When was she on the Sports Illustrated cover?</td>\n",
       "      <td>According to Italian fashion designer Roberto ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499</th>\n",
       "      <td>499</td>\n",
       "      <td>How many private institutes of technology are ...</td>\n",
       "      <td>There are 16 autonomous Indian Institutes of T...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      id                                           question  \\\n",
       "497  497            Where are the large Martkirche located?   \n",
       "498  498      When was she on the Sports Illustrated cover?   \n",
       "499  499  How many private institutes of technology are ...   \n",
       "\n",
       "                                               context  \n",
       "497  Another point of interest is the Old Town. In ...  \n",
       "498  According to Italian fashion designer Roberto ...  \n",
       "499  There are 16 autonomous Indian Institutes of T...  "
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the test dataset\n",
    "\n",
    "data_path = os.path.join(raw_data_folder, raw_data_file)\n",
    "# Build the frame in one chain (no in-place mutation) so that re-running the\n",
    "# cell always starts from the file contents.\n",
    "data = (\n",
    "    pd.read_csv(data_path, header=0, sep=',', quotechar='\"')\n",
    "    .rename(columns={\"Unnamed: 0\": \"index\"})\n",
    "    .dropna()\n",
    "    # Re-number the rows so that positional and label-based access agree\n",
    "    # after incomplete rows have been dropped.\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "data.tail(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def retrieve_question_indexes(retriever, questions, k=None):\n",
    "    \"\"\"Retrieve, for every question, the source rows of the returned contexts.\n",
    "\n",
    "    Args:\n",
    "        retriever: Object exposing ``get_relevant_documents(question)``; the\n",
    "            documents it returns must carry ``metadata[\"row\"]``.\n",
    "        questions: Iterable of questions.\n",
    "        k: Optional number of contexts to retrieve per question. When given\n",
    "            (and non-zero) it is written to ``retriever.k`` if that attribute\n",
    "            exists, otherwise to ``retriever.search_kwargs[\"k\"]``.\n",
    "\n",
    "    Returns:\n",
    "        A list with one string per question: the retrieved row indexes in\n",
    "        retrieval order, formatted like ``\"0, 250, 132\"`` (an empty string\n",
    "        when nothing was retrieved).\n",
    "    \"\"\"\n",
    "    if k:\n",
    "        # Configure the retriever that was passed in, not a notebook global.\n",
    "        if hasattr(retriever, \"k\"):\n",
    "            retriever.k = k\n",
    "        else:\n",
    "            retriever.search_kwargs[\"k\"] = k\n",
    "\n",
    "    retrieved_indexes = []\n",
    "    for question in tqdm(questions):\n",
    "        q_contexts = retriever.get_relevant_documents(question)\n",
    "        q_retrieved_indexes = [context.metadata[\"row\"] for context in q_contexts]\n",
    "        # str(list) without its surrounding brackets\n",
    "        retrieved_indexes.append(str(q_retrieved_indexes)[1:-1])\n",
    "\n",
    "    return retrieved_indexes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "retriever_sm\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 500/500 [00:28<00:00, 17.55it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "retriever_mmr\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 500/500 [00:29<00:00, 17.19it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "retriever_tfidf\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 500/500 [00:03<00:00, 131.91it/s]\n"
     ]
    }
   ],
   "source": [
    "# Get relevant contexts for the questions using the created retrievers.\n",
    "# The questions are the same for every retriever, so they are read once;\n",
    "# each retriever's results are stored in a column named after it.\n",
    "\n",
    "questions = data.question\n",
    "for name, retriever in retrievers.items():\n",
    "    print(name)\n",
    "    data[name] = retrieve_question_indexes(retriever, questions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>question</th>\n",
       "      <th>context</th>\n",
       "      <th>retriever_sm</th>\n",
       "      <th>retriever_mmr</th>\n",
       "      <th>retriever_tfidf</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>Do European Leagues sell their television righ...</td>\n",
       "      <td>The Premier League sells its television rights...</td>\n",
       "      <td>0, 250, 250</td>\n",
       "      <td>0, 305, 95</td>\n",
       "      <td>0, 250, 132</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>What does the Catholic church considered \"mixe...</td>\n",
       "      <td>Between the third and fourth sessions the pope...</td>\n",
       "      <td>393, 393, 346</td>\n",
       "      <td>393, 1, 129</td>\n",
       "      <td>225, 346, 104</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>What are some of the practices Gautama underwe...</td>\n",
       "      <td>Gautama first went to study with famous religi...</td>\n",
       "      <td>2, 2, 417</td>\n",
       "      <td>2, 93, 137</td>\n",
       "      <td>2, 111, 111</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                           question  \\\n",
       "0   0  Do European Leagues sell their television righ...   \n",
       "1   1  What does the Catholic church considered \"mixe...   \n",
       "2   2  What are some of the practices Gautama underwe...   \n",
       "\n",
       "                                             context   retriever_sm  \\\n",
       "0  The Premier League sells its television rights...    0, 250, 250   \n",
       "1  Between the third and fourth sessions the pope...  393, 393, 346   \n",
       "2  Gautama first went to study with famous religi...      2, 2, 417   \n",
       "\n",
       "  retriever_mmr retriever_tfidf  \n",
       "0    0, 305, 95     0, 250, 132  \n",
       "1   393, 1, 129   225, 346, 104  \n",
       "2    2, 93, 137     2, 111, 111  "
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows, now including one results column per retriever\n",
    "data.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save results\n",
    "\n",
    "# Create the output folder first so that to_csv does not fail when it is missing\n",
    "os.makedirs(results_folder, exist_ok=True)\n",
    "results_path = os.path.join(results_folder, results_file)\n",
    "data.to_csv(results_path, header=True, index=False, encoding=\"utf-8\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.2 Mean Reciprocal Rank"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "def mrr_score(true_indexes, retrieved_indexes):\n",
    "    \"\"\"Compute the Mean Reciprocal Rank (MRR) of ranked retrieval results.\n",
    "\n",
    "    Args:\n",
    "        true_indexes: Sized iterable with the expected index of each query\n",
    "            (for example a list or a pandas Series; values are read in order,\n",
    "            regardless of any index labels).\n",
    "        retrieved_indexes: Sized iterable, parallel to ``true_indexes``, whose\n",
    "            items are the retrieved indexes of each query, best match first.\n",
    "\n",
    "    Returns:\n",
    "        The mean over all queries of ``1 / rank`` of the expected index within\n",
    "        the retrieved indexes, counting 0 for queries whose expected index was\n",
    "        not retrieved. Returns 0.0 when there are no queries.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the two inputs do not have the same length.\n",
    "    \"\"\"\n",
    "    n = len(true_indexes)\n",
    "    if n != len(retrieved_indexes):\n",
    "        raise ValueError(\n",
    "            f\"Got {n} true indexes but {len(retrieved_indexes)} retrieved lists\"\n",
    "        )\n",
    "    if n == 0:\n",
    "        return 0.0\n",
    "\n",
    "    sum_rr = 0.0\n",
    "    # zip pairs the values positionally, so a pandas Series whose index labels\n",
    "    # are not 0..n-1 (e.g. after dropna) is still matched to the right list.\n",
    "    for true_index, retrieved in zip(true_indexes, retrieved_indexes):\n",
    "        try:\n",
    "            rank = 1 + list(retrieved).index(true_index)\n",
    "        except ValueError:\n",
    "            # Expected index not retrieved: reciprocal rank is 0.\n",
    "            continue\n",
    "        sum_rr += 1 / rank\n",
    "\n",
    "    mrr = sum_rr / n\n",
    "\n",
    "    return mrr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compute MRR score over the test dataset for all retrievers\n",
    "\n",
    "# Plain list so that the expected ids are read by position, not by row label\n",
    "true_indexes = data.id.tolist()\n",
    "\n",
    "metrics_report = {}\n",
    "for name in retrievers:\n",
    "    # Each cell holds the retrieved rows as \"i, j, k\"; an empty string means\n",
    "    # that nothing was retrieved for that question.\n",
    "    retrieved_indexes = [\n",
    "        [int(idx) for idx in str_indexes.split(\", \")] if str_indexes else []\n",
    "        for str_indexes in data[name]\n",
    "    ]\n",
    "    metrics_report[name] = round(mrr_score(true_indexes, retrieved_indexes), 4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "    \"retriever_sm\": 0.8993,\n",
      "    \"retriever_mmr\": 0.9073,\n",
      "    \"retriever_tfidf\": 0.8697\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "# Show the MRR score of each retriever as indented JSON\n",
    "print(json.dumps(metrics_report, indent = 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save report\n",
    "\n",
    "# Create the report folder first so that open() does not fail when it is missing\n",
    "os.makedirs(report_folder, exist_ok=True)\n",
    "report_path = os.path.join(report_folder, report_file)\n",
    "with open(report_path, 'w', encoding='utf-8') as f:\n",
    "    json.dump(metrics_report, f, ensure_ascii=False, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.5 ('env': venv)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "8b869e708107f036752f2720e48bd15b43027f3e3652d0d79bb10a4f67c16c27"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}