Spaces:

jmlon
/

DemoRag

No application file

App Files Files

xet

Community

Jorge Londono committed on Mar 29, 2024

Commit

83233f5

1 Parent(s): 849d2d9

Implemented RAG with memory

Browse files

Files changed (2) hide show

app03-chatRagLcelMem.py +108 -104
test.ipynb +32 -34

app03-chatRagLcelMem.py CHANGED Viewed

@@ -10,7 +10,10 @@ from operator import itemgetter
 from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain_core.runnables import RunnableParallel,RunnablePassthrough,RunnableLambda
 from langchain_core.output_parsers import StrOutputParser
-from langchain_core.messages import AIMessage, HumanMessage
 # HuggingFace
 from langchain_community.embeddings import HuggingFaceEmbeddings
@@ -32,6 +35,10 @@ load_dotenv()
 setid = "global"
 embeddings = HuggingFaceEmbeddings(model_name=os.getenv("EMBEDDINGS_MODEL"))
 # OpenAI
@@ -50,7 +57,7 @@ model = ChatGroq(model_name='mixtral-8x7b-32768')
 pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
 index = pc.Index(setid)
 vectorstore = PineconeVectorStore(index, embeddings, "text")
-retriever = vectorstore.as_retriever(kwargs={"k":5})            # Find 5 documents
 template_no_history = """Answer the question based only on the following context:
@@ -58,142 +65,139 @@ template_no_history = """Answer the question based only on the following context
 Question: {question}
 """
-PROMPT_NH = ChatPromptTemplate.from_template(template_no_history)
-template_with_history = """Given the following conversation history, answer the follow up question:
 Chat History:
 {chat_history}
-Question: {question}
-"""
-PROMPT_WH = ChatPromptTemplate.from_template(template_with_history)
-def pipeLog(x):
-    print("***", x)
-    return x
-setup_and_retrieval = RunnableParallel(
-    {"context": retriever, "question": RunnablePassthrough()}
-)
-def format_docs(docs):
-    return "\n\n".join(doc.page_content for doc in docs)
-rag_chain_from_docs = (
-    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
-    | PROMPT_NH
-    | model
-    | StrOutputParser()
-)
-rag_chain_with_source = RunnableParallel(
-    {"context": retriever, "question": RunnablePassthrough()}
-).assign(answer=rag_chain_from_docs)
-def rag_query(question: str, history: list[list[str]]):
-    if len(history)==0:
-        # chain = setup_and_retrieval | PROMPT_NH | model
-        # response = chain.invoke(question)
-        response = rag_chain_with_source.invoke(question)
-        sources = [ doc.metadata['source'] for doc in response['context'] ]
-        print(response, '\n', sources)
-        return response['answer'] # FAILS!!!
-    else:
-        chat_history = ""
-        for l in history:
-            chat_history += " : ".join(l)
-            chat_history += "\n"
-        chain = (
-            { "chat_history": itemgetter('chat_history'), "question": itemgetter('question') }
-            | PROMPT_WH
-            | pipeLog
-            | model
-        )
-        response = chain.invoke({ "chat_history": chat_history, "question": question })
-        return response.content
 # ----------------------------------------
-def pipeLog(s:str, x):
-    print(s, x)
-    return x
-pipe_a = RunnableLambda(lambda x: pipeLog("a:",x))
-pipe_b = RunnableLambda(lambda x: pipeLog("b:",x))
-contextualize_q_system_prompt = """Given a chat history and the latest user question \
-which might reference context in the chat history, formulate a standalone question \
-which can be understood without the chat history. Do NOT answer the question, \
-just reformulate it if needed and otherwise return it as is."""
-contextualize_q_prompt = ChatPromptTemplate.from_messages(
-    [
-        ("system", contextualize_q_system_prompt),
-        MessagesPlaceholder(variable_name="chat_history"),
-        ("human", "{question}"),
-    ]
 )
-contextualize_q_chain = contextualize_q_prompt | model | StrOutputParser()
-qa_system_prompt = """You are an assistant for question-answering tasks.
-Use the following pieces of retrieved context to answer the question.
-If you don't know the answer, just say that you don't know.
-Use three sentences maximum and keep the answer concise.
-{context}"""
-qa_prompt = ChatPromptTemplate.from_messages(
-    [
-        ("system", qa_system_prompt),
-        MessagesPlaceholder(variable_name="chat_history"),
-        ("human", "{question}"),
-    ]
-)
-def contextualized_question(input: dict):
-    if input.get("chat_history"):
-        return contextualize_q_chain
-    else:
-        return input["question"]
-rag_chain = (
-    RunnablePassthrough.assign(
-        context=pipe_b | contextualized_question | retriever | format_docs
-    )
-    | qa_prompt
-    | model
-)
-rag_chain_with_source = RunnableParallel(
-    {"xx": pipe_a, "context": itemgetter('question')|retriever, "question": itemgetter('question'), "chat_history": itemgetter('chat_history') }
-).assign(answer=rag_chain)
-def rag_query_2(question: str, history: list[list[str]]):
-    response = rag_chain_with_source.invoke({ 'question':question, 'chat_history':history })
-    print(response)
     # sources = [ doc.metadata['source'] for doc in response['context'] ]
     # print(response, '\n', sources)
     return response['answer'].content
 gr.ChatInterface(
-    rag_query_2,
     title="RAG Chatbot demo",
     description="A chatbot doing Retrieval Augmented Generation, backed by a Pinecone vector database"
     ).launch()

 from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain_core.runnables import RunnableParallel,RunnablePassthrough,RunnableLambda
 from langchain_core.output_parsers import StrOutputParser
+from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
+from langchain.prompts.prompt import PromptTemplate
+from langchain.schema import format_document
+from langchain.memory import ConversationBufferMemory
 # HuggingFace
 from langchain_community.embeddings import HuggingFaceEmbeddings
 setid = "global"
+def pipeLog(x):
+    print("***", x)
+    return x
 embeddings = HuggingFaceEmbeddings(model_name=os.getenv("EMBEDDINGS_MODEL"))
 # OpenAI
 pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
 index = pc.Index(setid)
 vectorstore = PineconeVectorStore(index, embeddings, "text")
+retriever = vectorstore.as_retriever(kwargs={"k":5})            # Find top-5 documents
 template_no_history = """Answer the question based only on the following context:
 Question: {question}
 """
+ANSWER_PROMPT = ChatPromptTemplate.from_template(template_no_history)
+template_with_history = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
 Chat History:
 {chat_history}
+Follow Up Input: {question}
+Standalone question:"""
+CONDENSE_QUESTION_PROMPT = ChatPromptTemplate.from_template(template_with_history)
+DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
+def _combine_documents(docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"):
+    doc_strings = [format_document(doc, document_prompt) for doc in docs]
+    return document_separator.join(doc_strings)
+# setup_and_retrieval = RunnableParallel(
+#     {"context": retriever, "question": RunnablePassthrough()}
+# )
+# def format_docs(docs):
+#     return "\n\n".join(doc.page_content for doc in docs)
+# rag_chain_from_docs = (
+#     RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
+#     | PROMPT_NH
+#     | model
+#     | StrOutputParser()
+# )
+# rag_chain_with_source = RunnableParallel(
+#     {"context": retriever, "question": RunnablePassthrough()}
+# ).assign(answer=rag_chain_from_docs)
+# def rag_query(question: str, history: list[list[str]]):
+#     if len(history)==0:
+#         # chain = setup_and_retrieval | PROMPT_NH | model
+#         # response = chain.invoke(question)
+#         response = rag_chain_with_source.invoke(question)
+#         sources = [ doc.metadata['source'] for doc in response['context'] ]
+#         print(response, '\n', sources)
+#         return response['answer'] # FAILS!!!
+#     else:
+#         chat_history = ""
+#         for l in history:
+#             chat_history += " : ".join(l)
+#             chat_history += "\n"
+#         chain = (
+#             { "chat_history": itemgetter('chat_history'), "question": itemgetter('question') }
+#             | PROMPT_WH
+#             | pipeLog
+#             | model
+#         )
+#         response = chain.invoke({ "chat_history": chat_history, "question": question })
+#         return response.content
 # ----------------------------------------
+# Prepare the chain to run the queries
+# Store chat history
+memory = ConversationBufferMemory(return_messages=True, output_key="answer", input_key="question")
+# Load the stored chat history into the 'chat_history' key
+loaded_memory = RunnablePassthrough.assign(
+    chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter("history"),
 )
+# Generate a standalone question
+standalone_question = {
+    "standalone_question": {
+        "question": lambda x: x["question"],
+        "chat_history": lambda x: get_buffer_string(x["chat_history"]),
+    }
+    | CONDENSE_QUESTION_PROMPT
+    | model
+    | StrOutputParser(),
+}
+# Retrieve related documents
+retrieved_documents = {
+    "docs": itemgetter("standalone_question") | retriever,
+    "question": lambda x: x["standalone_question"],
+}
+# Construct the inputs for the final prompt
+final_inputs = {
+    "context": lambda x: _combine_documents(x["docs"]),
+    "question": itemgetter("question"),
+}
+# And finally, we do the part that returns the answers
+answer = {
+    "answer": final_inputs | ANSWER_PROMPT | model,
+    "docs": itemgetter("docs"),
+}
+# The complete chain
+final_chain = loaded_memory | standalone_question | retrieved_documents | answer
+def pipeLog(s:str, x):
+    print(s, x)
+    return x
+pipe_a = RunnableLambda(lambda x: pipeLog("a:",x))
+pipe_b = RunnableLambda(lambda x: pipeLog("b:",x))
+def rag_query(question: str, history: list[list[str]]) -> str:
+    """Run a RAG query using own history, not the gradio history"""
+    inputs = { 'question':question }
+    response = final_chain.invoke(inputs)
+    # print(response)
+    memory.save_context(inputs, {"answer": response["answer"].content})
     # sources = [ doc.metadata['source'] for doc in response['context'] ]
     # print(response, '\n', sources)
     return response['answer'].content
+def test_query(question):
+    print('QUESTION:', question)
+    answer = rag_query(question, None)
+    print('ANSWER:  ', answer, '\n')
+# test_query("What is the capital of France?")
+# test_query("What is a Blockchain?")
+# test_query("What is it useful for?")
 gr.ChatInterface(
+    rag_query,
     title="RAG Chatbot demo",
     description="A chatbot doing Retrieval Augmented Generation, backed by a Pinecone vector database"
     ).launch()

test.ipynb CHANGED Viewed

@@ -20,7 +20,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -91,7 +91,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -100,9 +100,7 @@
     "\n",
     "embeddings = HuggingFaceEmbeddings(model_name=os.getenv(\"EMBEDDINGS_MODEL\"))\n",
     "    \n",
-    "pc = Pinecone(\n",
-    "        api_key=os.getenv(\"PINECONE_API_KEY\")\n",
-    "    )\n",
     "index = pc.Index(setid)\n",
     "vectorstore = PineconeVectorStore(index, embeddings, \"text\")\n",
     "retriever = vectorstore.as_retriever(kwargs={\"k\":5})            # Find 5 documents\n"
@@ -117,7 +115,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -139,14 +137,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "content='Based on the provided document, a blockchain is a distributed ledger technology that enables secure and immutable record-keeping of digital transactions. It consists of a chain of blocks, each containing a list of validated and time-stamped transactions. The key features of blockchain include transparency, immutability, security, and decentralization of recorded data in the ledger. Blockchain technology implements a decentralized fully replicated append-only ledger in a peer-to-peer network, where multiple participants, or nodes, maintain copies of the ledger. This distributed consensus mechanism ensures that no single entity has control over the entire network, making it resistant to tampering and censorship. In public blockchains, transparency is achieved by its public nature, allowing members and non-members to view and verify the transactions.'\n"
      ]
     }
    ],
@@ -175,7 +173,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -184,7 +182,7 @@
        "'Que es blockchain? : Blockchain es una cadena de bloques\\nPara que se usa : Para registrar transacciones\\n'"
       ]
      },
-     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -205,16 +203,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "AIMessage(content='En la temática de blockchain, el consenso se refiere al método por el cual todos los nodos de una red descentralizada de blockchain agreement on the current state of the blockchain. In other words, consenso is the way that the nodes reach a consensus on the validity of transactions and the order in which they were received, ensuring that all nodes have the same copy of the blockchain. Examples of consensus algorithms include Proof of Work (PoW) and Proof of Stake (PoS).')"
       ]
      },
-     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -238,7 +236,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -259,21 +257,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'context': [Document(page_content='the nodes present on the chain maintain a complete local copy of the blockchain. The\\nblockchain is an indigenous technology that has emerged for decentralized applications\\nas the outcome of complication, privacy, and security issues present in the applications\\nover half a century [3,4]. It is a peer-to-peer system that authorizes the users to maintain a\\nledger for various transactions that are reproduced, and remains identical in more than\\none location over multiple user servers [5].\\nA blockchain is essentially a block of chains, with the growing list of records referred\\nto as blocks that are joined with cryptography [4]. Each blockchain contains a hash of a\\nprevious block, and a timestamp that keeps track of the creation and modification time of', metadata={'chunk': 3.0, 'source': 'CustodyBlock-2021.txt'}),\n",
-       "  Document(page_content='customer information [14]. Blockchain is the core strength of IoT so\\xad\\nlutions to build a system with cryptographically protected records that\\nare reluctant to change and inaccuracy. Additionally, Blockchain faces\\nseveral crucial issues intrinsic to the Internet of Things, such as a large\\nnumber of IoT devices, a non-homogeneous network topology, limited\\ncomputational capacity, poor communication bandwidth, etc.', metadata={'chunk': 24.0, 'source': 'BlockchainBased-2023.txt'}),\n",
-       "  Document(page_content='as a distributed ledger technology that enables secure and immutable record-keeping of\\ndigital transactions. It comprises a chain of blocks, each containing a list of validated and\\ntime-stamped transactions. An interesting feature of blockchain is its decentralized nature,\\nwhere multiple participants, or nodes, maintain copies of the ledger. This distributed\\nconsensus mechanism ensures that no single entity has control over the entire network,\\nmaking it resistant to tampering and censorship. Thus, blockchain is ripe for contexts\\ninvolving multiple parties with a need for a reliable and trustworthy ambiance in the\\nregistering of sensitive information, since it can “allow for an audit trail of all operations\\ncarried out between peers without the need for a centralized authority” (Grima et al. 2021).\\nBlockchains can be classified as public, private/permissioned, or hybrid. Public\\nblockchain allows any interested party to be a node in the network and to participate in\\nthe consensus. Registered data can be viewed by members or non-members. In its turn,\\nprivate or permissioned blockchains only allow the participation of authorized members,\\nlimiting data access to such participants. Lastly, hybrid blockchains embed characteristics\\nof both public and private blockchains.\\nThe key features of blockchain include transparency, immutability, security, and decentralization of recorded data in the ledger data. In public blockchains, transparency is\\nachieved by its public nature, allowing members and non-members to view and verify', metadata={'chunk': 6.0, 'source': 'ExploringBC-2023.txt'}),\n",
-       "  Document(page_content='2\\n\\nBackground\\n\\n2.1\\n\\nBlockchain technology\\n\\nThe blockchain technology implements a decentralized fully replicated append-only ledger in a\\npeer-to-peer network, originally employed for the Bitcoin cryptocurrency [7]. All participating\\nnodes maintain a full local copy of the blockchain. The blockchain consists of a sequence\\nof blocks containing the transactions of the ledger. Transactions inside blocks are sorted\\nchronologically and each block contains a cryptographic hash of the previous block in the\\nchain. Nodes create new blocks as they receives transactions, which are broadcast in the\\nnetwork. Once a block is complete, they start the consensus process to convince other nodes\\nto include it in the blockchain. In the original blockchain technology employed in Bitcoin\\nthe consensus process is based on Proof-of-Work (PoW) [7]. With PoW nodes compete with\\neach other in confirming transactions and creating new blocks by solving a mathematical\\npuzzle. While solving a block is a computational intensive task, verifying its validity is easy.\\nTo incentivize such mechanism, solving a block also results in mining a certain amount of\\n\\n\\x0cS. Bonomi, M. Casini, and C. Ciccotelli\\n\\n12:3', metadata={'chunk': 5.0, 'source': 'B-CoC-2020.txt'})],\n",
        " 'question': 'What is a blockchain?',\n",
-       " 'answer': 'A blockchain is a distributed ledger technology that comprises a chain of blocks, each containing a list of validated and time-stamped transactions. It is a decentralized system where multiple participants, or nodes, maintain copies of the ledger, ensuring no single entity has control over the entire network. This mechanism makes it resistant to tampering and censorship. Blockchains can be classified as public, private/permissioned, or hybrid, and feature transparency, immutability, security, and decentralization of recorded data in the ledger data. In public blockchains, transparency is achieved by its public nature, allowing members and non-members to view and verify the transactions.'}"
       ]
      },
-     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -285,17 +283,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "('What is a blockchain?',\n",
-       " 'A blockchain is a distributed ledger technology that comprises a chain of blocks, each containing a list of validated and time-stamped transactions. It is a decentralized system where multiple participants, or nodes, maintain copies of the ledger, ensuring no single entity has control over the entire network. This mechanism makes it resistant to tampering and censorship. Blockchains can be classified as public, private/permissioned, or hybrid, and feature transparency, immutability, security, and decentralization of recorded data in the ledger data. In public blockchains, transparency is achieved by its public nature, allowing members and non-members to view and verify the transactions.')"
       ]
      },
-     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -306,17 +304,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CustodyBlock-2021.txt\n",
       "BlockchainBased-2023.txt\n",
-      "ExploringBC-2023.txt\n",
-      "B-CoC-2020.txt\n"
      ]
     }
    ],
@@ -327,19 +325,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "['CustodyBlock-2021.txt',\n",
        " 'BlockchainBased-2023.txt',\n",
-       " 'ExploringBC-2023.txt',\n",
-       " 'B-CoC-2020.txt']"
       ]
      },
-     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1212,7 +1210,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
   }
  },
  "nbformat": 4,

   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
     "\n",
     "embeddings = HuggingFaceEmbeddings(model_name=os.getenv(\"EMBEDDINGS_MODEL\"))\n",
     "    \n",
+    "pc = Pinecone( api_key=os.getenv(\"PINECONE_API_KEY\") )\n",
     "index = pc.Index(setid)\n",
     "vectorstore = PineconeVectorStore(index, embeddings, \"text\")\n",
     "retriever = vectorstore.as_retriever(kwargs={\"k\":5})            # Find 5 documents\n"
   },
   {
    "cell_type": "code",
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "content='Based on the provided document, a blockchain is a type of distributed ledger technology that implements a decentralized, fully replicated append-only ledger in a peer-to-peer network. It consists of a chain of blocks, where each block contains a list of validated and timestamped transactions. Blockchain technology is known for its secure and immutable record-keeping of digital transactions, as well as its resistance to tampering and censorship due to its decentralized nature. In a blockchain network, multiple participants, or nodes, maintain copies of the ledger, and processing and verifying transactions are the responsibility of every node. Blockchain technology can be classified as public, private/permissioned, or hybrid.' response_metadata={'token_usage': {'completion_time': 0.269, 'completion_tokens': 151, 'prompt_time': 1.5510000000000002, 'prompt_tokens': 1712, 'queue_time': None, 'total_time': 1.8200000000000003, 'total_tokens': 1863}, 'model_name': 'mixtral-8x7b-32768', 'system_fingerprint': 'fp_13a4b82d64', 'finish_reason': 'stop', 'logprobs': None}\n"
      ]
     }
    ],
   },
   {
    "cell_type": "code",
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
        "'Que es blockchain? : Blockchain es una cadena de bloques\\nPara que se usa : Para registrar transacciones\\n'"
       ]
      },
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
+       "AIMessage(content='En la tecnología de blockchain, \"consenso\" se refiere al mecanismo por el cual se llega a un acuerdo sobre el estado del registro distribuido. Hay varios algoritmos de consenso, como Proof of Work (PoW) y Proof of Stake (PoS), que se utilizan para asegurar la exactitud y la validez de las transacciones en la red blockchain. El algoritmo de consenso ayuda a evitar la duplicación de entradas y garantiza que las transacciones sean seguras y verificables.\\n\\nEn resumen, consenso en blockchain es el proceso de llegar a un acuerdo sobre el estado del registro distribuido, usando algoritmos para asegurar la exactitud y validez de las transacciones.', response_metadata={'token_usage': {'completion_time': 0.355, 'completion_tokens': 195, 'prompt_time': 0.063, 'prompt_tokens': 68, 'queue_time': None, 'total_time': 0.418, 'total_tokens': 263}, 'model_name': 'mixtral-8x7b-32768', 'system_fingerprint': 'fp_1cc6d039b0', 'finish_reason': 'stop', 'logprobs': None})"
       ]
      },
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
+       "{'context': [Document(page_content='2\\n\\nBackground\\n\\n2.1\\n\\nBlockchain technology\\n\\nThe blockchain technology implements a decentralized fully replicated append-only ledger in a\\npeer-to-peer network, originally employed for the Bitcoin cryptocurrency [7]. All participating\\nnodes maintain a full local copy of the blockchain. The blockchain consists of a sequence\\nof blocks containing the transactions of the ledger. Transactions inside blocks are sorted\\nchronologically and each block contains a cryptographic hash of the previous block in the\\nchain. Nodes create new blocks as they receives transactions, which are broadcast in the\\nnetwork. Once a block is complete, they start the consensus process to convince other nodes\\nto include it in the blockchain. In the original blockchain technology employed in Bitcoin\\nthe consensus process is based on Proof-of-Work (PoW) [7]. With PoW nodes compete with\\neach other in confirming transactions and creating new blocks by solving a mathematical\\npuzzle. While solving a block is a computational intensive task, verifying its validity is easy.\\nTo incentivize such mechanism, solving a block also results in mining a certain amount of\\n\\n\\x0cS. Bonomi, M. Casini, and C. Ciccotelli\\n\\n12:3', metadata={'chunk': 4.0, 'source': 'B-CoC-2020.txt'}),\n",
+       "  Document(page_content='2\\n\\nBackground\\n\\n2.1\\n\\nBlockchain technology\\n\\nThe blockchain technology implements a decentralized fully replicated append-only ledger in a\\npeer-to-peer network, originally employed for the Bitcoin cryptocurrency [7]. All participating\\nnodes maintain a full local copy of the blockchain. The blockchain consists of a sequence\\nof blocks containing the transactions of the ledger. Transactions inside blocks are sorted\\nchronologically and each block contains a cryptographic hash of the previous block in the\\nchain. Nodes create new blocks as they receives transactions, which are broadcast in the\\nnetwork. Once a block is complete, they start the consensus process to convince other nodes\\nto include it in the blockchain. In the original blockchain technology employed in Bitcoin\\nthe consensus process is based on Proof-of-Work (PoW) [7]. With PoW nodes compete with\\neach other in confirming transactions and creating new blocks by solving a mathematical\\npuzzle. While solving a block is a computational intensive task, verifying its validity is easy.\\nTo incentivize such mechanism, solving a block also results in mining a certain amount of\\n\\n\\x0cS. Bonomi, M. Casini, and C. Ciccotelli\\n\\n12:3', metadata={'chunk': 4.0, 'source': 'OASIcs-Tokenomics-2019-12.txt'}),\n",
+       "  Document(page_content='2.5. Components in blockchain technology\\nThe structure of a blockchain is a decentralized database consisting\\nof a chain of blocks that contain transactions, with each block linked to\\nthe previous one through cryptographic hashes, creating an immutable\\nand secure ledger of transactions as shown in Fig. 1. This structure en\\xad\\nables trust and transparency in the network by allowing participants to\\nverify and validate transactions without the need for intermediaries. The\\ncomponent used for blockchain technology are as follows:\\n(a) Node: A node in a blockchain network is a system, or it can be a\\nrouter or switch. It’s possible to create a dispersed network of\\nnodes with equal rights by using a P2P network. Processing and\\nverifying transactions are the exclusive responsibility of every\\nnode in the network [33].\\n(b) Transactions: Transactions are the smallest and most funda\\xad\\nmental part of the Blockchain. In blockchain technology, a record\\nacts as a transaction for payment history that includes the sender\\nand recipient address and a timestamp of the occurrence of a\\ntransaction. In a blockchain network, the storage, analysis, and\\nretrieval of completed transactions are important aspects of\\nmaintaining the integrity and transparency of the network [34].\\n(c) Block: The procedures for block validation are depicted by the\\nblock version number given to each block in the Blockchain. A\\ntimestamp value indicates when the particular block was\\n\\n2.4. Blockchain technology\\nBlockchain technology is a distributed ledger that is immutable,\\n4\\n\\n\\x0cSakshi et al.\\n\\nJournal of Information Security and Applications 77 (2023) 103579\\n\\nFig. 1. Blockchain Structure.', metadata={'chunk': 19.0, 'source': 'BlockchainBased-2023.txt'}),\n",
+       "  Document(page_content='The review was based on resources from four established scientific databases. A total\\nof 72 resources were found in these databases, of which 26 resources were fully analyzed\\nand provided evidence of the status of the research of blockchain-based solutions to solve\\nproblems related to the chain of custody of physical evidence and of how the current\\nliterature relates to the concept of physical evidence. The final selected resources (37%)\\nsufficiently represented a diverse range of perspectives and findings, enabling this article\\nto draw relevant conclusions and to contribute to the existing knowledge on the topic.\\nThe other sections of this paper are organized as follows. Section 2 provides the main\\nconcepts discussed in this paper, and Section 3 highlights current literature reviews focusing\\non the use of blockchain in the forensic field. Section 4 explains the research methodology.\\nSection 5 provides the results, and Section 6 the discussion. Finally, Section 7 presents the\\nlimitations and proposed future research and Section 8 concludes the paper.\\n2. Background\\nBlockchain technology has emerged as a disruptive innovation, providing a decentralized and transparent environment across various domains. Blockchain can be understood\\nas a distributed ledger technology that enables secure and immutable record-keeping of\\ndigital transactions. It comprises a chain of blocks, each containing a list of validated and\\ntime-stamped transactions. An interesting feature of blockchain is its decentralized nature,\\nwhere multiple participants, or nodes, maintain copies of the ledger. This distributed\\nconsensus mechanism ensures that no single entity has control over the entire network,\\nmaking it resistant to tampering and censorship. 
Thus, blockchain is ripe for contexts\\ninvolving multiple parties with a need for a reliable and trustworthy ambiance in the\\nregistering of sensitive information, since it can “allow for an audit trail of all operations\\ncarried out between peers without the need for a centralized authority” (Grima et al. 2021).\\nBlockchains can be classified as public, private/permissioned, or hybrid. Public\\nblockchain allows any interested party to be a node in the network and to participate in\\nthe consensus. Registered data can be viewed by members or non-members. In its turn,', metadata={'chunk': 3.0, 'source': 'ExploringBC-2023.txt'})],\n",
        " 'question': 'What is a blockchain?',\n",
+       " 'answer': 'A blockchain is a decentralized fully replicated append-only ledger in a peer-to-peer network, consisting of a chain of blocks containing transactions of the ledger. Each block contains a cryptographic hash of the previous block in the chain, creating an immutable and secure ledger of transactions. The structure enables trust and transparency in the network by allowing participants to verify and validate transactions without the need for intermediaries. It comprises components such as nodes, transactions, and blocks. Nodes maintain a full local copy of the blockchain and are responsible for processing and verifying transactions. Transactions are the smallest and most fundamental part of the blockchain, while blocks are linked to the previous one through cryptographic hashes. The procedures for block validation are depicted by the block version number given to each block in the blockchain. A timestamp value indicates when the particular block was created.'}"
       ]
      },
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "('What is a blockchain?',\n",
+       " 'A blockchain is a decentralized fully replicated append-only ledger in a peer-to-peer network, consisting of a chain of blocks containing transactions of the ledger. Each block contains a cryptographic hash of the previous block in the chain, creating an immutable and secure ledger of transactions. The structure enables trust and transparency in the network by allowing participants to verify and validate transactions without the need for intermediaries. It comprises components such as nodes, transactions, and blocks. Nodes maintain a full local copy of the blockchain and are responsible for processing and verifying transactions. Transactions are the smallest and most fundamental part of the blockchain, while blocks are linked to the previous one through cryptographic hashes. The procedures for block validation are depicted by the block version number given to each block in the blockchain. A timestamp value indicates when the particular block was created.')"
       ]
      },
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "B-CoC-2020.txt\n",
+      "OASIcs-Tokenomics-2019-12.txt\n",
       "BlockchainBased-2023.txt\n",
+      "ExploringBC-2023.txt\n"
      ]
     }
    ],
   },
   {
    "cell_type": "code",
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
+       "['B-CoC-2020.txt',\n",
+       " 'OASIcs-Tokenomics-2019-12.txt',\n",
        " 'BlockchainBased-2023.txt',\n",
+       " 'ExploringBC-2023.txt']"
       ]
      },
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
+   "version": "3.11.8"
   }
  },
  "nbformat": 4,