{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Arxiv Metadata Dataset - Loader and Retriever\n", "\n", "- Load Arxiv Metadata from Hugging Face DataSet and Load in to Qdrant\n", "- Use LangGraph to store trace info" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%pip install -qU pymupdf \n", "%pip install -qU langchain langchain-core langchain-community langchain-text-splitters \n", "%pip install -qU langchain-openai\n", "%pip install -qU langchain-groq\n", "%pip install -qU langchain-qdrant" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Parameterize some stuff\n", "\n", "QUESTION = \"What are the emerging patterns for building Systems of Agents that could provide the system the ability to evolve and improve its own processes through learning?\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# All imports consolidated here so the notebook survives Restart & Run All\n", "import os\n", "from langchain import hub\n", "from langchain_groq import ChatGroq\n", "from langchain_openai import OpenAIEmbeddings\n", "from config import COLLECTION_NAME, DATASET_NAME, OPENAI_API_KEY, QDRANT_API_KEY, QDRANT_API_URL, LANGCHAIN_HUB_PROMPT\n", "from langchain_community.document_loaders import PyMuPDFLoader\n", "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", "from langchain_qdrant import Qdrant\n", "# identify data loader for html documents" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Embedding model for indexing/querying, and the RAG prompt pulled from LangChain Hub\n", "embedding = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", "prompt = hub.pull(LANGCHAIN_HUB_PROMPT)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# URL Path is retrieved from the dataset\n", "# need to use another loader for HTML documents\n", "\n", "# iterate over retrieved records from the huggingface 
dataset\n", "URL_PATH = \"\"  # TODO: set to the document URL taken from the dataset record (was a SyntaxError: assignment had no value)\n", "loader = PyMuPDFLoader(URL_PATH, extract_images=True)\n", "docs = loader.load()\n", "\n", "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n", "splits = text_splitter.split_documents(docs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Store the chunks in Qdrant\n", "from_splits = Qdrant.from_documents(\n", " embedding=embedding,\n", " collection_name=COLLECTION_NAME,\n", " url=QDRANT_API_URL,\n", " api_key=QDRANT_API_KEY,\n", " prefer_grpc=True, \n", " documents=splits,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Retrieve Information using Metadata in Vector Store" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "qdrant = Qdrant.from_existing_collection(\n", " embedding=embedding,\n", " collection_name=COLLECTION_NAME,\n", " url=QDRANT_API_URL,\n", " api_key=QDRANT_API_KEY,\n", " prefer_grpc=True, \n", ")\n", "\n", "retriever = qdrant.as_retriever(\n", " search_type=\"similarity_score_threshold\",\n", " search_kwargs={\"score_threshold\": 0.5, \"k\": 5}\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# ChatGroq is already imported in the top imports cell; no need to re-import here\n", "from operator import itemgetter\n", "from langchain.schema.runnable import RunnablePassthrough\n", "\n", "llm = ChatGroq(model=\"llama3-70b-8192\", temperature=0.3)\n", "\n", "rag_chain = (\n", " {\"context\": itemgetter(\"question\") | retriever, \"question\": itemgetter(\"question\")}\n", " | RunnablePassthrough.assign(context=itemgetter(\"context\"))\n", " | {\"response\": prompt | llm, \"context\": itemgetter(\"context\")}\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(rag_chain.get_graph().draw_ascii())" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "response = rag_chain.invoke({\"question\" : QUESTION})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Print the generated answer; the chain's 'response' key holds an AIMessage whose .content is the text\n", "print(response[\"response\"].content)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Display the retrieved context documents that were passed to the prompt\n", "response[\"context\"]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 2 }