CheeYung commited on
Commit
065bc2a
·
1 Parent(s): 81917a3

setup supabase retriever

Browse files
Files changed (6) hide show
  1. .gitignore +1 -0
  2. agent.py +116 -0
  3. metadata.jsonl +0 -0
  4. requirements.txt +3 -1
  5. sample.ipynb +333 -0
  6. supabase.sql +30 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
agent.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import TypedDict, Annotated
3
+ from langgraph.graph import MessagesState, START, StateGraph
4
+ from langgraph.graph.message import add_messages
5
+ from langgraph.prebuilt import tools_condition, ToolNode
6
+ from langchain_core.messages import HumanMessage, SystemMessage, AnyMessage
7
+ from langchain_core.tools import tool
8
+ from langchain_community.tools.tavily_search import TavilySearchResults
9
+ from langchain_google_genai import ChatGoogleGenerativeAI
10
+
11
@tool
def add(a: int, b: int) -> int:
    """Return the sum of two integers.

    Args:
        a: first int
        b: second int
    """
    return b + a
19
+
20
@tool
def subtract(a: int, b: int) -> int:
    """Return the difference of two integers (a minus b).

    Args:
        a: first int
        b: second int
    """
    result = a - b
    return result
28
+
29
@tool
def multiply(a: int, b: int) -> int:
    """Return the product of two integers.

    Args:
        a: first int
        b: second int
    """
    return b * a
37
+
38
@tool
def power(a: int, b: int) -> int:
    """Raise the first integer to the power of the second.

    Args:
        a: first int
        b: second int
    """
    result = a ** b
    return result
46
+
47
@tool
def divide(a: int, b: int) -> float | None:
    """Divide first number by second number.

    Args:
        a: first int
        b: second int

    Returns:
        The quotient ``a / b`` as a float, or ``None`` when ``b`` is zero.
        (The original annotation said ``int``, but true division always
        produces a float, and the zero-divisor branch returns None.)
    """
    try:
        return a / b
    except ZeroDivisionError:
        # Return None rather than raising so the agent can recover gracefully.
        return None
58
+
59
@tool
def modulus(a: int, b: int) -> int:
    """Return the remainder of the first integer divided by the second.

    Args:
        a: first int
        b: second int
    """
    remainder = a % b
    return remainder
67
+
68
# Tools exposed to the agent (bound to the LLM in build_graph).
tools = [add, subtract, multiply, power, divide, modulus]
77
+
78
# Generate the AgentState and Agent graph
class AgentState(TypedDict):
    # Graph state: the running conversation. `add_messages` is the reducer,
    # so node returns are appended/merged rather than overwriting the list.
    messages: Annotated[list[AnyMessage], add_messages]
81
+
82
+
83
def build_graph():
    """Build and compile the agent graph (Gemini LLM + ToolNode loop).

    Returns:
        A compiled LangGraph runnable: START -> assistant -> (tools -> assistant)*.
    """
    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0)
    llm_with_tools = llm.bind_tools(tools)

    # Node
    def assistant(state: AgentState):
        """Assistant node: invoke the tool-enabled LLM on the message history."""
        return { "messages": [llm_with_tools.invoke(state['messages'])] }

    def retriever(state: AgentState):
        # TODO: placeholder for the Supabase retriever node — not yet wired
        # into the graph (see sample.ipynb / supabase.sql in this commit).
        return None

    builder = StateGraph(AgentState)

    # Define nodes: these do the work
    builder.add_node("assistant", assistant)
    builder.add_node("tools", ToolNode(tools))

    # Bug fix: the graph had no entry point — START was imported but never
    # connected, so the compiled graph could not reach the assistant node.
    builder.add_edge(START, "assistant")
    builder.add_conditional_edges(
        "assistant",
        tools_condition
    )
    builder.add_edge("tools", "assistant")

    # Compile graph
    return builder.compile()
108
+
109
# Smoke test: run a single GAIA-style question through the compiled graph.
if __name__ == "__main__":
    sample_question = (
        "When was a picture of St. Thomas Aquinas first added to the "
        "Wikipedia page on the Principle of double effect?"
    )
    agent = build_graph()
    result = agent.invoke({ "messages": [HumanMessage(content=sample_question)] })
    for message in result["messages"]:
        message.pretty_print()
metadata.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1,2 +1,4 @@
1
  gradio
2
- requests
 
 
 
1
  gradio
2
+ requests
3
+ langchain
4
+ langchain-google-genai
+ langgraph
+ langchain-community
sample.ipynb ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "0b73a8e4",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Get questions\n",
9
+ "\n",
10
+ "In the first part we retrieve all the questions of GAIA. The `metadata.jsonl` \n",
11
+ "file contains all the questions and answers used for validation."
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 1,
17
+ "id": "113ce3ae",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "# import and load the metadata.jsonl file\n",
22
+ "import json\n",
23
+ "\n",
24
+ "qa_lines = []\n",
25
+ "with open('metadata.jsonl', 'r') as jsonl_file:\n",
26
+ " for line in jsonl_file:\n",
27
+ " try:\n",
28
+ " json_qa = json.loads(line)\n",
29
+ " qa_lines.append(json_qa)\n",
30
+ " except json.JSONDecodeError:\n",
31
+ " print(f\"Skipping invalid JSON line: {line.strip()}\")"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 2,
37
+ "id": "37a595de",
38
+ "metadata": {},
39
+ "outputs": [
40
+ {
41
+ "data": {
42
+ "text/plain": [
43
+ "['1. Search engine', '2. Web browser', '3. PDF viewer']"
44
+ ]
45
+ },
46
+ "execution_count": 2,
47
+ "metadata": {},
48
+ "output_type": "execute_result"
49
+ }
50
+ ],
51
+ "source": [
52
+ "sample = qa_lines[22]\n",
53
+ "sample['Annotator Metadata']['Tools'].split('\\n')"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 3,
59
+ "id": "7a9f694e",
60
+ "metadata": {},
61
+ "outputs": [
62
+ {
63
+ "name": "stdout",
64
+ "output_type": "stream",
65
+ "text": [
66
+ "List of tools used in all samples:\n",
67
+ "Total number of tools used: 83\n",
68
+ " ├── web browser: 107\n",
69
+ " ├── search engine: 101\n",
70
+ " ├── calculator: 34\n",
71
+ " ├── image recognition tools: 12\n",
72
+ " ├── ne: 9\n",
73
+ " ├── pdf access: 7\n",
74
+ " ├── pdf viewer: 7\n",
75
+ " ├── a web browser: 7\n",
76
+ " ├── a search engine: 7\n",
77
+ " ├── microsoft excel: 5\n",
78
+ " ├── image recognition: 5\n",
79
+ " ├── a calculator: 5\n",
80
+ " ├── ocr: 4\n",
81
+ " ├── python: 3\n",
82
+ " ├── video recognition tools: 3\n",
83
+ " ├── microsoft excel / google sheets: 3\n",
84
+ " ├── excel: 3\n",
85
+ " ├── color recognition: 3\n",
86
+ " ├── excel file access: 3\n",
87
+ " ├── access to wikipedia: 3\n",
88
+ " ├── image recognition/ocr: 3\n",
89
+ " ├── a file interface: 3\n",
90
+ " ├── a web browser.: 2\n",
91
+ " ├── a search engine.: 2\n",
92
+ " ├── file handling: 2\n",
93
+ " ├── a speech-to-text tool: 2\n",
94
+ " ├── audio capability: 2\n",
95
+ " ├── image recognition tools (to identify and parse a figure with three axes): 1\n",
96
+ " ├── unlambda compiler (optional): 1\n",
97
+ " ├── a calculator.: 1\n",
98
+ " ├── google search: 1\n",
99
+ " ├── jsonld file access: 1\n",
100
+ " ├── video parsing: 1\n",
101
+ " ├── python compiler: 1\n",
102
+ " ├── word document access: 1\n",
103
+ " ├── tool to extract text from images: 1\n",
104
+ " ├── a word reversal tool / script: 1\n",
105
+ " ├── counter: 1\n",
106
+ " ├── xml file access: 1\n",
107
+ " ├── access to the internet archive, web.archive.org: 1\n",
108
+ " ├── text processing/diff tool: 1\n",
109
+ " ├── gif parsing tools: 1\n",
110
+ " ├── code/data analysis tools: 1\n",
111
+ " ├── pdf reader: 1\n",
112
+ " ├── markdown: 1\n",
113
+ " ├── google translate access: 1\n",
114
+ " ├── bass note data: 1\n",
115
+ " ├── text editor: 1\n",
116
+ " ├── xlsx file access: 1\n",
117
+ " ├── powerpoint viewer: 1\n",
118
+ " ├── csv file access: 1\n",
119
+ " ├── calculator (or use excel): 1\n",
120
+ " ├── computer algebra system: 1\n",
121
+ " ├── video processing software: 1\n",
122
+ " ├── audio processing software: 1\n",
123
+ " ├── computer vision: 1\n",
124
+ " ├── google maps: 1\n",
125
+ " ├── access to excel files: 1\n",
126
+ " ├── calculator (or ability to count): 1\n",
127
+ " ├── a python ide: 1\n",
128
+ " ├── spreadsheet editor: 1\n",
129
+ " ├── tools required: 1\n",
130
+ " ├── b browser: 1\n",
131
+ " ├── image recognition and processing tools: 1\n",
132
+ " ├── computer vision or ocr: 1\n",
133
+ " ├── c++ compiler: 1\n",
134
+ " ├── access to google maps: 1\n",
135
+ " ├── youtube player: 1\n",
136
+ " ├── natural language processor: 1\n",
137
+ " ├── graph interaction tools: 1\n",
138
+ " ├── bablyonian cuniform -> arabic legend: 1\n",
139
+ " ├── access to youtube: 1\n",
140
+ " ├── image search tools: 1\n",
141
+ " ├── calculator or counting function: 1\n",
142
+ " ├── a speech-to-text audio processing tool: 1\n",
143
+ " ├── access to academic journal websites: 1\n",
144
+ " ├── pdf reader/extracter: 1\n",
145
+ " ├── rubik's cube model: 1\n",
146
+ " ├── wikipedia: 1\n",
147
+ " ├── video capability: 1\n",
148
+ " ├── image processing tools: 1\n",
149
+ " ├── age recognition software: 1\n",
150
+ " ├── youtube: 1\n"
151
+ ]
152
+ }
153
+ ],
154
+ "source": [
155
+ "# list out the tools that is required by all the samples\n",
156
+ "from collections import Counter, OrderedDict\n",
157
+ "\n",
158
+ "tools = []\n",
159
+ "for qa in qa_lines:\n",
160
+ " for tool in qa['Annotator Metadata']['Tools'].split('\\n'):\n",
161
+ " tool = tool[2:].strip().lower()\n",
162
+ " if tool.startswith(\"(\"):\n",
163
+ " tool = tool[11:].strip()\n",
164
+ " \n",
165
+ " tools.append(tool)\n",
166
+ "\n",
167
+ "tools_counter = OrderedDict(sorted(Counter(tools).items(), key=lambda x: x[1], reverse=True))\n",
168
+ "print(\"List of tools used in all samples:\")\n",
169
+ "print(\"Total number of tools used:\", len(tools_counter))\n",
170
+ "for tool, count in tools_counter.items():\n",
171
+ " print(f\" ├── {tool}: {count}\")"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "markdown",
176
+ "id": "9830df82",
177
+ "metadata": {},
178
+ "source": [
179
+ "# Retrieval System\n",
180
+ "\n",
181
+ "1. Build a vector database based on the metadata.jsonl file.\n",
182
+ "2. Wrap the metadata.jsonl questions and answers into a list of documents.\n",
183
+ "3. Retrieve similar samples from the database for a given question."
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "code",
188
+ "execution_count": 4,
189
+ "id": "f242de36",
190
+ "metadata": {},
191
+ "outputs": [
192
+ {
193
+ "name": "stderr",
194
+ "output_type": "stream",
195
+ "text": [
196
+ "c:\\Users\\pehcy\\miniconda3\\envs\\agent_env\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
197
+ " from .autonotebook import tqdm as notebook_tqdm\n"
198
+ ]
199
+ }
200
+ ],
201
+ "source": [
202
+ "from langchain.tools.retriever import create_retriever_tool\n",
203
+ "from langchain_huggingface import HuggingFaceEmbeddings\n",
204
+ "from dotenv import load_dotenv\n",
205
+ "import os\n",
206
+ "\n",
207
+ "load_dotenv()\n",
208
+ "\n",
209
+ "embeddings = HuggingFaceEmbeddings(\n",
210
+ " model_name=\"sentence-transformers/all-mpnet-base-v2\",\n",
211
+ " model_kwargs= { 'device': 'cuda:0' })"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": 5,
217
+ "id": "009e47c9",
218
+ "metadata": {},
219
+ "outputs": [],
220
+ "source": [
221
+ "from langchain.vectorstores import SupabaseVectorStore\n",
222
+ "from langchain.schema.document import Document\n",
223
+ "from supabase import create_client, Client\n",
224
+ "\n",
225
+ "# connect to supabase\n",
226
+ "url: str = os.environ.get(\"SUPABASE_URL\")\n",
227
+ "key: str = os.environ.get(\"SUPABASE_SECRET_KEY\")\n",
228
+ "supabase: Client = create_client(url, key)"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": 6,
234
+ "id": "42263deb",
235
+ "metadata": {},
236
+ "outputs": [],
237
+ "source": [
238
+ "docs = []\n",
239
+ "for sample in qa_lines:\n",
240
+ " content = f\"Question: {sample['Question']}\\n\\nFinal answer: {sample['Final answer']}\"\n",
241
+ " doc = {\n",
242
+ " \"content\": content,\n",
243
+ " \"metadata\": { \"source\": sample['task_id'] },\n",
244
+ " \"embedding\": embeddings.embed_query(content)\n",
245
+ " }\n",
246
+ " docs.append(doc)\n",
247
+ "\n",
248
+ "# insert the documents to the vector database\n",
249
+ "try:\n",
250
+ " response = (\n",
251
+ " supabase.table('documents')\n",
252
+ " .insert(docs)\n",
253
+ " .execute()\n",
254
+ " )\n",
255
+ "except Exception as exception:\n",
256
+ " print(\"Error inserting data into Supabase:\", exception)"
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "execution_count": 10,
262
+ "id": "0e64a74a",
263
+ "metadata": {},
264
+ "outputs": [],
265
+ "source": [
266
+ "# add items to vector database\n",
267
+ "vector_store = SupabaseVectorStore(\n",
268
+ " client=supabase,\n",
269
+ " embedding= embeddings,\n",
270
+ " table_name=\"documents\",\n",
271
+ " query_name=\"match_documents\",\n",
272
+ ")\n",
273
+ "retriever = vector_store.as_retriever()"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": null,
279
+ "id": "ff5934c3",
280
+ "metadata": {},
281
+ "outputs": [],
282
+ "source": [
283
+ "# query = \"What did the president say about Ketanji Brown Jackson\"\n",
284
+ "# matched_docs = vector_store.similarity_search(query, 2)"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": 11,
290
+ "id": "89c2d411",
291
+ "metadata": {},
292
+ "outputs": [
293
+ {
294
+ "data": {
295
+ "text/plain": [
296
+ "Document(metadata={'source': '840bfca7-4f7b-481a-8794-c560c340185d'}, page_content='Question: On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\\n\\nFinal answer: 80GSFC21M0002')"
297
+ ]
298
+ },
299
+ "execution_count": 11,
300
+ "metadata": {},
301
+ "output_type": "execute_result"
302
+ }
303
+ ],
304
+ "source": [
305
+ "query = \"On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\"\n",
306
+ "# matched_docs = vector_store.similarity_search(query, 2)\n",
307
+ "docs = retriever.invoke(query)\n",
308
+ "docs[0]"
309
+ ]
310
+ }
311
+ ],
312
+ "metadata": {
313
+ "kernelspec": {
314
+ "display_name": "agent_env",
315
+ "language": "python",
316
+ "name": "python3"
317
+ },
318
+ "language_info": {
319
+ "codemirror_mode": {
320
+ "name": "ipython",
321
+ "version": 3
322
+ },
323
+ "file_extension": ".py",
324
+ "mimetype": "text/x-python",
325
+ "name": "python",
326
+ "nbconvert_exporter": "python",
327
+ "pygments_lexer": "ipython3",
328
+ "version": "3.12.9"
329
+ }
330
+ },
331
+ "nbformat": 4,
332
+ "nbformat_minor": 5
333
+ }
supabase.sql ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
-- Drop old function (both the previous 1536-dim signature and the current one)
drop function if exists match_documents (vector(1536), int);
drop function if exists match_documents (vector(768), int);

-- Create a function to search for documents by cosine similarity.
-- NOTE(review): the embeddings are produced by
-- sentence-transformers/all-mpnet-base-v2 (see sample.ipynb), which emits
-- 768-dimensional vectors — not 1536 (OpenAI's size). The vector dimension
-- here must match the embedding model or inserts/queries will fail.
create function match_documents (
  query_embedding vector(768),
  match_count int DEFAULT null,
  filter jsonb DEFAULT '{}'
) returns table (
  id bigint,
  content text,
  metadata jsonb,
  similarity float
)
language plpgsql
as $$
#variable_conflict use_column
begin
  return query
  select
    id,
    content,
    metadata,
    -- pgvector's <=> is cosine distance; 1 - distance = cosine similarity
    1 - (documents.embedding <=> query_embedding) as similarity
  from documents
  where metadata @> filter
  order by documents.embedding <=> query_embedding
  limit match_count;
end;
$$;