In [1]:
import json
import numpy as np
import os
import pandas as pd
import sys

from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers.tfidf import TFIDFRetriever
from tqdm import tqdm

In [2]:
# Change the current working directory to the pachage root
# That's step is due to the way settings.py is defined
root_path_list = os.getcwd().split("\\")[:-1]
root_path = os.path.join(root_path_list[0], os.sep, *root_path_list[1:])
os.chdir(root_path)
os.getcwd()

'd:\\Projects\\information-retrieval'

In [4]:
# User parameters

# Document Loading
csv_data_folder = os.path.join(os.getcwd(), "data", "02_intermediate")

# Document Splitting
chunk_size = 1000
chunk_overlap = 100 
separators = ["\n\n", "\n", "(?<=\. )", " ", ""]

# Text Embedding and Vector Store
model_name = "sentence-transformers/all-mpnet-base-v2"
embedding_persist_folder = os.path.join(os.getcwd(), "data", "04_feature", "chroma")

# Retrieval
model_folder = os.path.join(os.getcwd(), "data", "06_models")
num_contexts_retrievals = 3

# Evaluation
raw_data_folder = os.path.join(os.getcwd(), "data", "01_raw")
raw_data_file = "ds_nlp_challenge_500samples.csv"
results_folder = os.path.join(os.getcwd(), "data", "07_model_output")
results_file = "ds_nlp_challenge_500samples_results.csv"
report_folder = os.path.join(os.getcwd(), "data", "08_reporting")
report_file = "retrieval_metrics_report.json"

# 1. Document Loading

In [5]:
# Get CSV file name
csv_data_files = [file for file in os.listdir(csv_data_folder) if ".csv" in file]

# Load CSV
docs = []
for csv_data_file in csv_data_files:
    csv_data_path = os.path.join(csv_data_folder, csv_data_file)
    loader = CSVLoader(file_path=csv_data_path, encoding="utf8")
    docs.extend(loader.load())

In [6]:
len(docs)

500

In [7]:
docs[0]

Document(page_content='context: The Premier League sells its television rights on a collective basis. This is in contrast to some other European Leagues, including La Liga, in which each club sells its rights individually, leading to a much higher share of the total income going to the top few clubs. The money is divided into three parts: half is divided equally between the clubs; one quarter is awarded on a merit basis based on final league position, the top club getting twenty times as much as the bottom club, and equal steps all the way down the table; the final quarter is paid out as facilities fees for games that are shown on television, with the top clubs generally receiving the largest shares of this. The income from overseas rights is divided equally between the twenty clubs.', metadata={'source': 'd:\\Projects\\information-retrieval\\data\\02_intermediate\\ds_nlp_challenge_500samples_contexts.csv', 'row': 0})

# 2. Document Splitting

In [8]:
# Split documents into chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap, 
    separators=separators,
)

splits = text_splitter.split_documents(docs)

In [9]:
len(splits)

597

In [10]:
splits[:2]

[Document(page_content='context: The Premier League sells its television rights on a collective basis. This is in contrast to some other European Leagues, including La Liga, in which each club sells its rights individually, leading to a much higher share of the total income going to the top few clubs. The money is divided into three parts: half is divided equally between the clubs; one quarter is awarded on a merit basis based on final league position, the top club getting twenty times as much as the bottom club, and equal steps all the way down the table; the final quarter is paid out as facilities fees for games that are shown on television, with the top clubs generally receiving the largest shares of this. The income from overseas rights is divided equally between the twenty clubs.', metadata={'source': 'd:\\Projects\\information-retrieval\\data\\02_intermediate\\ds_nlp_challenge_500samples_contexts.csv', 'row': 0}),
 Document(page_content='context: Between the third and fourth sess

# 3. Text Embedding and Vector Store

In [11]:
# Load a pretrained text embedding model

model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embedding = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [12]:
# Example: Use embeddings to compute semantic similarity

sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"

embedding1 = embedding.embed_query(sentence1)
embedding2 = embedding.embed_query(sentence2)
embedding3 = embedding.embed_query(sentence3)

print("Semantic similarity between sentences 1 and 2:", np.dot(embedding1, embedding2))
print("Semantic similarity between sentences 2 and 3:", np.dot(embedding2, embedding3))

Semantic similarity between sentences 1 and 2: 0.8981182456324139
Semantic similarity between sentences 2 and 3: 0.005847679808422499


In [13]:
# Create text embeddings and store in a vector database Chroma.
# For more options, see: 
# https://python.langchain.com/docs/modules/data_connection/vectorstores/

vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=embedding_persist_folder
)

vectordb.persist()

In [15]:
vectordb._collection.count()

1194

In [16]:
# Example: Use vector store to retrieve chunks based on semantic similarity

splits_sm = vectordb.similarity_search("Do European Leagues sell their television rights per a collective level?", k=3)

splits_sm

[Document(page_content='context: The Premier League sells its television rights on a collective basis. This is in contrast to some other European Leagues, including La Liga, in which each club sells its rights individually, leading to a much higher share of the total income going to the top few clubs. The money is divided into three parts: half is divided equally between the clubs; one quarter is awarded on a merit basis based on final league position, the top club getting twenty times as much as the bottom club, and equal steps all the way down the table; the final quarter is paid out as facilities fees for games that are shown on television, with the top clubs generally receiving the largest shares of this. The income from overseas rights is divided equally between the twenty clubs.', metadata={'row': 0, 'source': 'd:\\Projects\\information-retrieval\\data\\02_intermediate\\ds_nlp_challenge_500samples_contexts.csv'}),
 Document(page_content='context: The Premier League sells its tele

# 4. Retrieval

In [17]:
question = "Do European Leagues sell their television rights per a collective level?"

In [18]:
def print_contexts(contexts, n_char=100):
    context_contents = {}
    for context in contexts:
        index = context.metadata["row"]
        content = context.page_content[:100]
        context_contents[index] = content
    print(json.dumps(context_contents, indent = 4))

## 4.1 Semantic Similarity Search

### 4.1.1 Top k

In [19]:
# Create a retriever based on the created vector db with the text embeddings

retriever_sm = vectordb.as_retriever(search_kwargs={"k": num_contexts_retrievals})

In [20]:
# Example: Use the retriever to get relevant chunks for the question

contexts_sm = retriever_sm.get_relevant_documents(question)

In [21]:
print_contexts(contexts_sm)

{
    "0": "context: The Premier League sells its television rights on a collective basis. This is in contrast t",
    "250": "context: The Premier League sells its television rights on a collective basis. This is in contrast t"
}


### 4.1.2 Score threshold retrieval

In [22]:
retriever_st = vectordb.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.5})

contexts_st = retriever_st.get_relevant_documents(question)

len(contexts_st)

4

### 4.1.3 Maximum Marginal Relevance

<p align="center">
  <img src="../docs/images/MMR%20(Maximum%20Marginal%20Relevance)%20Algorithm.png" alt="" width="400">
</p>

<center>Image source: DeepLearning.AI (2023). LangChain chat with your data, accessed September 2023, https://learn.deeplearning.ai/langchain-chat-with-your-data/lesson/5/retrieval  </center>

In [23]:
retriever_mmr = vectordb.as_retriever(search_type="mmr")

contexts_mmr = retriever_mmr.get_relevant_documents(question)

print_contexts(contexts_mmr)

{
    "0": "context: The Premier League sells its television rights on a collective basis. This is in contrast t",
    "132": "context: The BBC domestic television channels do not broadcast advertisements; they are instead fund",
    "305": "context: Under the 1995\u20132004 National Hockey League collective bargaining agreement, teams were limi",
    "95": "context: Most of the world's airports are owned by local, regional, or national government bodies wh"
}


In [24]:
retriever_mmr = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": num_contexts_retrievals})

contexts_mmr = retriever_mmr.get_relevant_documents(question)

print_contexts(contexts_mmr)

{
    "0": "context: The Premier League sells its television rights on a collective basis. This is in contrast t",
    "305": "context: Under the 1995\u20132004 National Hockey League collective bargaining agreement, teams were limi",
    "95": "context: Most of the world's airports are owned by local, regional, or national government bodies wh"
}


## 4.2 "Lexical" Search

In [25]:
# Create a retriever based on the TF-IDF vectorizer model

retriever_tfidf = TFIDFRetriever.from_documents(splits, k=num_contexts_retrievals)

contexts_tfidf = retriever_tfidf.get_relevant_documents(question)

print_contexts(contexts_tfidf)

{
    "0": "context: The Premier League sells its television rights on a collective basis. This is in contrast t",
    "250": "context: The Premier League sells its television rights on a collective basis. This is in contrast t",
    "132": "context: The BBC domestic television channels do not broadcast advertisements; they are instead fund"
}


In [26]:
# Save retriever

retriever_tfidf.save_local(model_folder)
retriever_tfidf = TFIDFRetriever.load_local(model_folder)
retriever_tfidf.k = num_contexts_retrievals

print_contexts(retriever_tfidf.get_relevant_documents(question))

{
    "0": "context: The Premier League sells its television rights on a collective basis. This is in contrast t",
    "250": "context: The Premier League sells its television rights on a collective basis. This is in contrast t",
    "132": "context: The BBC domestic television channels do not broadcast advertisements; they are instead fund"
}


# 5. Evaluation

In [27]:
retrievers = {
    "retriever_sm": retriever_sm, 
    "retriever_mmr": retriever_mmr, 
    "retriever_tfidf": retriever_tfidf,
}

## 5.1 Retrieval

In [28]:
# Load the test dataset

data_path = os.path.join(raw_data_folder, raw_data_file)
data = pd.read_csv(data_path, header=0, sep=',', quotechar='"')
data.rename(columns={"Unnamed: 0": "index"}, inplace=True)
data.dropna(inplace=True)

data.tail(3)

Unnamed: 0,id,question,context
497,497,Where are the large Martkirche located?,Another point of interest is the Old Town. In ...
498,498,When was she on the Sports Illustrated cover?,According to Italian fashion designer Roberto ...
499,499,How many private institutes of technology are ...,There are 16 autonomous Indian Institutes of T...


In [29]:
def retrieve_question_indexes(retriever, questions, k=None):

    if k:
        if hasattr(retriever_sm, "k"):
            retriever.k = k
        else:
            retriever_mmr.search_kwargs["k"] = k

    retrieved_indexes = []
    for question in tqdm(questions):
        q_contexts = retriever.get_relevant_documents(question)
        q_retrieved_indexes = [context.metadata["row"] for context in q_contexts]
        retrieved_indexes.append(str(q_retrieved_indexes)[1:-1])

    return retrieved_indexes

In [30]:
# Get relevant contexts for the questions using the created retrievers

for name, retriever in retrievers.items():
    print(name)
    questions = data.question
    data[name] = retrieve_question_indexes(retriever, questions)

retriever_sm


100%|██████████| 500/500 [00:28<00:00, 17.55it/s]


retriever_mmr


100%|██████████| 500/500 [00:29<00:00, 17.19it/s]


retriever_tfidf


100%|██████████| 500/500 [00:03<00:00, 131.91it/s]


In [31]:
data.head(3)

Unnamed: 0,id,question,context,retriever_sm,retriever_mmr,retriever_tfidf
0,0,Do European Leagues sell their television righ...,The Premier League sells its television rights...,"0, 250, 250","0, 305, 95","0, 250, 132"
1,1,"What does the Catholic church considered ""mixe...",Between the third and fourth sessions the pope...,"393, 393, 346","393, 1, 129","225, 346, 104"
2,2,What are some of the practices Gautama underwe...,Gautama first went to study with famous religi...,"2, 2, 417","2, 93, 137","2, 111, 111"


In [32]:
# Save results

results_path = os.path.join(results_folder, results_file)
data.to_csv(results_path, header=True, index=False, encoding="utf-8")

## 5.2 Mean Reciprocal Rank

In [33]:
def mrr_score(true_indexes, retrieved_indexes):

    n = len(true_indexes)

    sum_rr = 0
    for idx in range(n):
        try:
            rank = 1+retrieved_indexes[idx].index(true_indexes[idx])
        except ValueError:
            rank = np.inf
        reciprocal_rank = 1 / rank
        sum_rr += reciprocal_rank

    mmr = sum_rr/n

    return mmr

In [34]:
# Compute MRR score over the test dataset for all retrievers

true_indexes = data.id

metrics_report = {}
for name, retriever in retrievers.items():
    retrieved_indexes = [[int(idx) for idx in str_indexes.split(", ")] for str_indexes in data[name]]
    metrics_report[name] = round(mrr_score(true_indexes, retrieved_indexes), 4)

In [35]:
print(json.dumps(metrics_report, indent = 4))

{
    "retriever_sm": 0.8993,
    "retriever_mmr": 0.9073,
    "retriever_tfidf": 0.8697
}


In [36]:
# Save report

report_path = os.path.join(report_folder, report_file)
with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(metrics_report, f, ensure_ascii=False, indent=4)