{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import json\n", "import numpy as np\n", "import os\n", "import pandas as pd\n", "import sys\n", "\n", "from langchain.document_loaders.csv_loader import CSVLoader\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "from langchain.embeddings import HuggingFaceEmbeddings\n", "from langchain.vectorstores import Chroma\n", "from langchain.retrievers.tfidf import TFIDFRetriever\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'d:\\\\Projects\\\\information-retrieval'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Change the current working directory to the package root (the parent of the\n", "# folder this notebook is started from).\n", "# This step is needed because of the way settings.py is defined.\n", "# os.path.dirname handles the platform's path separator, so the cell also runs\n", "# outside Windows (splitting on a backslash failed there with an IndexError).\n", "# NOTE: run this cell only once per kernel session; every extra run moves one\n", "# more level up the directory tree.\n", "root_path = os.path.dirname(os.getcwd())\n", "os.chdir(root_path)\n", "os.getcwd()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# User parameters\n", "\n", "# Document Loading\n", "csv_data_folder = os.path.join(os.getcwd(), \"data\", \"02_intermediate\")\n", "\n", "# Document Splitting\n", "chunk_size = 1000\n", "chunk_overlap = 100 \n", "separators = [\"\\n\\n\", \"\\n\", \"(?<=\\. 
)\", \" \", \"\"]\n", "\n", "# Text Embedding and Vector Store\n", "model_name = \"sentence-transformers/all-mpnet-base-v2\"\n", "embedding_persist_folder = os.path.join(os.getcwd(), \"data\", \"04_feature\", \"chroma\")\n", "\n", "# Retrieval\n", "model_folder = os.path.join(os.getcwd(), \"data\", \"06_models\")\n", "num_contexts_retrievals = 3\n", "\n", "# Evaluation\n", "raw_data_folder = os.path.join(os.getcwd(), \"data\", \"01_raw\")\n", "raw_data_file = \"ds_nlp_challenge_500samples.csv\"\n", "results_folder = os.path.join(os.getcwd(), \"data\", \"07_model_output\")\n", "results_file = \"ds_nlp_challenge_500samples_results.csv\"\n", "report_folder = os.path.join(os.getcwd(), \"data\", \"08_reporting\")\n", "report_file = \"retrieval_metrics_report.json\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Document Loading" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Get CSV file name\n", "csv_data_files = [file for file in os.listdir(csv_data_folder) if \".csv\" in file]\n", "\n", "# Load CSV\n", "docs = []\n", "for csv_data_file in csv_data_files:\n", " csv_data_path = os.path.join(csv_data_folder, csv_data_file)\n", " loader = CSVLoader(file_path=csv_data_path, encoding=\"utf8\")\n", " docs.extend(loader.load())" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "500" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(docs)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Document(page_content='context: The Premier League sells its television rights on a collective basis. This is in contrast to some other European Leagues, including La Liga, in which each club sells its rights individually, leading to a much higher share of the total income going to the top few clubs. 
The money is divided into three parts: half is divided equally between the clubs; one quarter is awarded on a merit basis based on final league position, the top club getting twenty times as much as the bottom club, and equal steps all the way down the table; the final quarter is paid out as facilities fees for games that are shown on television, with the top clubs generally receiving the largest shares of this. The income from overseas rights is divided equally between the twenty clubs.', metadata={'source': 'd:\\\\Projects\\\\information-retrieval\\\\data\\\\02_intermediate\\\\ds_nlp_challenge_500samples_contexts.csv', 'row': 0})" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "docs[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2. Document Splitting" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Split documents into chunks\n", "\n", "text_splitter = RecursiveCharacterTextSplitter(\n", " chunk_size=chunk_size,\n", " chunk_overlap=chunk_overlap, \n", " separators=separators,\n", ")\n", "\n", "splits = text_splitter.split_documents(docs)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "597" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(splits)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Document(page_content='context: The Premier League sells its television rights on a collective basis. This is in contrast to some other European Leagues, including La Liga, in which each club sells its rights individually, leading to a much higher share of the total income going to the top few clubs. 
The money is divided into three parts: half is divided equally between the clubs; one quarter is awarded on a merit basis based on final league position, the top club getting twenty times as much as the bottom club, and equal steps all the way down the table; the final quarter is paid out as facilities fees for games that are shown on television, with the top clubs generally receiving the largest shares of this. The income from overseas rights is divided equally between the twenty clubs.', metadata={'source': 'd:\\\\Projects\\\\information-retrieval\\\\data\\\\02_intermediate\\\\ds_nlp_challenge_500samples_contexts.csv', 'row': 0}),\n", " Document(page_content='context: Between the third and fourth sessions the pope announced reforms in the areas of Roman Curia, revision of Canon Law, regulations for mixed marriages involving several faiths, and birth control issues. He opened the final session of the council, concelebrating with bishops from countries where the Church was persecuted. Several texts proposed for his approval had to be changed. But all texts were finally agreed upon. The Council was concluded on 8 December 1965, the Feast of the Immaculate Conception.', metadata={'source': 'd:\\\\Projects\\\\information-retrieval\\\\data\\\\02_intermediate\\\\ds_nlp_challenge_500samples_contexts.csv', 'row': 1})]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "splits[:2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3. 
Text Embedding and Vector Store" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Load a pretrained text embedding model\n", "\n", "model_kwargs = {'device': 'cpu'}\n", "encode_kwargs = {'normalize_embeddings': False}\n", "embedding = HuggingFaceEmbeddings(\n", " model_name=model_name,\n", " model_kwargs=model_kwargs,\n", " encode_kwargs=encode_kwargs\n", ")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Semantic similarity between sentences 1 and 2: 0.8981182456324139\n", "Semantic similarity between sentences 2 and 3: 0.005847679808422499\n" ] } ], "source": [ "# Example: Use embeddings to compute semantic similarity\n", "\n", "sentence1 = \"i like dogs\"\n", "sentence2 = \"i like canines\"\n", "sentence3 = \"the weather is ugly outside\"\n", "\n", "embedding1 = embedding.embed_query(sentence1)\n", "embedding2 = embedding.embed_query(sentence2)\n", "embedding3 = embedding.embed_query(sentence3)\n", "\n", "print(\"Semantic similarity between sentences 1 and 2:\", np.dot(embedding1, embedding2))\n", "print(\"Semantic similarity between sentences 2 and 3:\", np.dot(embedding2, embedding3))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Create text embeddings and store them in a Chroma vector database.\n", "# For more options, see:\n", "# https://python.langchain.com/docs/modules/data_connection/vectorstores/\n", "#\n", "# Chroma.from_documents() adds the chunks to whatever is already persisted in\n", "# the folder, so running this cell more than once stores every chunk again\n", "# (likely why the collection count is 1194 for 597 chunks) and similarity\n", "# search then returns the same chunk several times.\n", "# Reuse the persisted store when it exists. Delete the folder to force a\n", "# rebuild after changing the documents, the splitting parameters or the\n", "# embedding model.\n", "\n", "if os.path.isdir(embedding_persist_folder) and os.listdir(embedding_persist_folder):\n", "    vectordb = Chroma(\n", "        persist_directory=embedding_persist_folder,\n", "        embedding_function=embedding,\n", "    )\n", "else:\n", "    vectordb = Chroma.from_documents(\n", "        documents=splits,\n", "        embedding=embedding,\n", "        persist_directory=embedding_persist_folder,\n", "    )\n", "    vectordb.persist()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1194" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vectordb._collection.count()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": 
{}, "outputs": [ { "data": { "text/plain": [ "[Document(page_content='context: The Premier League sells its television rights on a collective basis. This is in contrast to some other European Leagues, including La Liga, in which each club sells its rights individually, leading to a much higher share of the total income going to the top few clubs. The money is divided into three parts: half is divided equally between the clubs; one quarter is awarded on a merit basis based on final league position, the top club getting twenty times as much as the bottom club, and equal steps all the way down the table; the final quarter is paid out as facilities fees for games that are shown on television, with the top clubs generally receiving the largest shares of this. The income from overseas rights is divided equally between the twenty clubs.', metadata={'row': 0, 'source': 'd:\\\\Projects\\\\information-retrieval\\\\data\\\\02_intermediate\\\\ds_nlp_challenge_500samples_contexts.csv'}),\n", " Document(page_content='context: The Premier League sells its television rights on a collective basis. This is in contrast to some other European Leagues, including La Liga, in which each club sells its rights individually, leading to a much higher share of the total income going to the top few clubs. The money is divided into three parts: half is divided equally between the clubs; one quarter is awarded on a merit basis based on final league position, the top club getting twenty times as much as the bottom club, and equal steps all the way down the table; the final quarter is paid out as facilities fees for games that are shown on television, with the top clubs generally receiving the largest shares of this. 
The income from overseas rights is divided equally between the twenty clubs.', metadata={'row': 250, 'source': 'd:\\\\Projects\\\\information-retrieval\\\\data\\\\02_intermediate\\\\ds_nlp_challenge_500samples_contexts.csv'}),\n", " Document(page_content='context: The Premier League sells its television rights on a collective basis. This is in contrast to some other European Leagues, including La Liga, in which each club sells its rights individually, leading to a much higher share of the total income going to the top few clubs. The money is divided into three parts: half is divided equally between the clubs; one quarter is awarded on a merit basis based on final league position, the top club getting twenty times as much as the bottom club, and equal steps all the way down the table; the final quarter is paid out as facilities fees for games that are shown on television, with the top clubs generally receiving the largest shares of this. The income from overseas rights is divided equally between the twenty clubs.', metadata={'row': 250, 'source': 'd:\\\\Projects\\\\information-retrieval\\\\data\\\\02_intermediate\\\\ds_nlp_challenge_500samples_contexts.csv'})]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Example: Use vector store to retrieve chunks based on semantic similarity\n", "\n", "splits_sm = vectordb.similarity_search(\"Do European Leagues sell their television rights per a collective level?\", k=3)\n", "\n", "splits_sm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4. 
Retrieval" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "question = \"Do European Leagues sell their television rights per a collective level?\"" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "def print_contexts(contexts, n_char=100):\n", "    \"\"\"Print a JSON preview of retrieved contexts, keyed by their source row.\n", "\n", "    Args:\n", "        contexts: iterable of documents exposing `metadata[\"row\"]` and\n", "            `page_content`.\n", "        n_char: number of leading characters of each document to show.\n", "\n", "    Documents that share the same row collapse into a single entry, because\n", "    the preview is a dict keyed by row (the last document wins).\n", "    \"\"\"\n", "    context_contents = {}\n", "    for context in contexts:\n", "        index = context.metadata[\"row\"]\n", "        # Honour n_char; the cut-off used to be hard-coded to 100.\n", "        content = context.page_content[:n_char]\n", "        context_contents[index] = content\n", "    print(json.dumps(context_contents, indent=4))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4.1 Semantic Similarity Search" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.1.1 Top k" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# Create a retriever based on the created vector db with the text embeddings\n", "\n", "retriever_sm = vectordb.as_retriever(search_kwargs={\"k\": num_contexts_retrievals})" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# Example: Use the retriever to get relevant chunks for the question\n", "\n", "contexts_sm = retriever_sm.get_relevant_documents(question)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"0\": \"context: The Premier League sells its television rights on a collective basis. This is in contrast t\",\n", " \"250\": \"context: The Premier League sells its television rights on a collective basis. 
This is in contrast t\"\n", "}\n" ] } ], "source": [ "print_contexts(contexts_sm)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.1.2 Score threshold retrieval" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "retriever_st = vectordb.as_retriever(search_type=\"similarity_score_threshold\", search_kwargs={\"score_threshold\": 0.5})\n", "\n", "contexts_st = retriever_st.get_relevant_documents(question)\n", "\n", "len(contexts_st)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.1.3 Maximum Marginal Relevance\n", "\n", "
\n", " \n", "
\n", "\n", "\n", " | id | \n", "question | \n", "context | \n", "
---|---|---|---|
497 | \n", "497 | \n", "Where are the large Martkirche located? | \n", "Another point of interest is the Old Town. In ... | \n", "
498 | \n", "498 | \n", "When was she on the Sports Illustrated cover? | \n", "According to Italian fashion designer Roberto ... | \n", "
499 | \n", "499 | \n", "How many private institutes of technology are ... | \n", "There are 16 autonomous Indian Institutes of T... | \n", "
\n", " | id | \n", "question | \n", "context | \n", "retriever_sm | \n", "retriever_mmr | \n", "retriever_tfidf | \n", "
---|---|---|---|---|---|---|
0 | \n", "0 | \n", "Do European Leagues sell their television righ... | \n", "The Premier League sells its television rights... | \n", "0, 250, 250 | \n", "0, 305, 95 | \n", "0, 250, 132 | \n", "
1 | \n", "1 | \n", "What does the Catholic church considered \"mixe... | \n", "Between the third and fourth sessions the pope... | \n", "393, 393, 346 | \n", "393, 1, 129 | \n", "225, 346, 104 | \n", "
2 | \n", "2 | \n", "What are some of the practices Gautama underwe... | \n", "Gautama first went to study with famous religi... | \n", "2, 2, 417 | \n", "2, 93, 137 | \n", "2, 111, 111 | \n", "