Spaces:

Benjamona97
/

sql-agent

Sleeping

App Files Files Community

Benjamona97 commited on Feb 19, 2024

Commit

13ebe63

0 Parent(s):

Add application file

Browse files

Files changed (8) hide show

.gitignore +10 -0
Dockerfile +11 -0
app.py +187 -0
chainlit.md +5 -0
modules/database/__init__.py +0 -0
modules/database/database.py +71 -0
modules/database/sqlitedatabase.py +21 -0
requirements.txt +15 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,10 @@

+.venv
+.chainlit
+*__pycache__
+.idea
+.env
+record_manager_cache.sql
+storage
+.DS_Store
+data
+*.sql

Dockerfile ADDED Viewed

	@@ -0,0 +1,11 @@

+FROM python:3.11
+WORKDIR /code
+# Install dependencies before copying the sources so this layer is reused
+# when only application code changes.
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+COPY . .
+# app.py is a Chainlit script (it only registers @cl.on_chat_start /
+# @cl.on_message handlers); there is no app/main.py and no ASGI "app" object
+# for uvicorn to import, so the server is started through the Chainlit CLI.
+CMD ["chainlit", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,187 @@

+import os
+import re
+from pathlib import Path
+from typing import List
+import chainlit as cl
+from dotenv import load_dotenv
+from langchain.pydantic_v1 import BaseModel, Field
+from langchain.tools import StructuredTool
+from langchain.indexes import SQLRecordManager, index
+from langchain.schema import Document
+from langchain.agents import initialize_agent, AgentExecutor
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores.chroma import Chroma
+from langchain_community.document_loaders import CSVLoader
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from openai import AsyncOpenAI
+# from modules.database.database import PostgresDB
+from modules.database.sqlitedatabase import Database
+"""
+Here we define some environment variables and the tools that the agent will use.
+Along with some configuration for the app to start.
+"""
+# Load variables from a .env file into the environment before any client is built.
+load_dotenv()
+# Chunking parameters for the text splitter (chunk size and overlap between chunks).
+chunk_size = 512
+chunk_overlap = 50
+# Shared clients, created once when the module is imported.
+# NOTE(review): presumably both pick up the OpenAI API key from the environment — confirm.
+embeddings_model = OpenAIEmbeddings()
+openai_client = AsyncOpenAI()
+# Directory that process_pdfs() scans for CSV files at startup.
+CSV_STORAGE_PATH = "./data"
+def remove_triple_backticks(text):
+    # Use a regular expression to replace all occurrences of triple backticks with an empty string
+    cleaned_text = re.sub(r"```", "", text)
+    return cleaned_text
+def process_pdfs(pdf_storage_path: str):
+    csv_directory = Path(pdf_storage_path)
+    docs = []  # type: List[Document]
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=50)
+    for csv_path in csv_directory.glob("*.csv"):
+        loader = CSVLoader(file_path=str(csv_path))
+        documents = loader.load()
+        docs += text_splitter.split_documents(documents)
+    documents_search = Chroma.from_documents(docs, embeddings_model)
+    namespace = "chromadb/my_documents"
+    record_manager = SQLRecordManager(
+        namespace, db_url="sqlite:///record_manager_cache.sql"
+    )
+    record_manager.create_schema()
+    index_result = index(
+        docs,
+        record_manager,
+        documents_search,
+        cleanup="incremental",
+        source_id_key="source",
+    )
+    print(f"Indexing stats: {index_result}")
+    return documents_search
+doc_search = process_pdfs(CSV_STORAGE_PATH)
+"""
+Execute SQL query tool definition along schemas.
+"""
+def execute_sql(query: str) -> str:
+    """
+    Execute SQLite queries queries against the database. Delete all markdown code and backticks from the query.
+    """
+    db = Database("./db/mydatabase.db")
+    db.connect()
+    # results = db.run_sql_to_markdown(query)
+    cleaned_query = remove_triple_backticks(query)
+    results = db.execute_query(cleaned_query)
+    return results + f"\nQuery used:\n```sql{cleaned_query}```"
+# Argument schema for the "Execute SQL" tool: one string field holding the query.
+class ExecuteSqlToolInput(BaseModel):
+    # The description below is runtime text attached to the field.
+    query: str = Field(
+        description="A SQLite query to be executed agains the database")
+# Wraps execute_sql() as a tool; name and description are the text the agent
+# sees, and args_schema declares the single "query" argument.
+execute_sql_tool = StructuredTool(
+    func=execute_sql,
+    name="Execute SQL",
+    description="useful for when you need to execute SQL queries against the database. Always use a clause LIMIT 10",
+    args_schema=ExecuteSqlToolInput
+)
+"""
+Research database tool definition along schemas.
+"""
+def research_database(user_request: str) -> str:
+    """
+    Searches for table definitions matching the user request
+    """
+    search_kwargs = {"k": 30}
+    retriever = doc_search.as_retriever(search_kwargs=search_kwargs)
+    def format_docs(docs):
+        for i, doc in enumerate(docs):
+            print(f"{i+1}. {doc.page_content}")
+        return "\n\n".join([d.page_content for d in docs])
+    results = retriever.invoke(user_request)
+    return format_docs(results)
+# Argument schema for the "Search db info" tool: one string field with the request text.
+class ResearchDatabaseToolInput(BaseModel):
+    # The description below is runtime text attached to the field.
+    user_request: str = Field(
+        description="The user query to search against the table definitions for matches.")
+# Wraps research_database() as a tool; name and description are the text the
+# agent sees, and args_schema declares the single "user_request" argument.
+research_database_tool = StructuredTool(
+    func=research_database,
+    name="Search db info",
+    description="Search for database table definitions so you can have context for building SQL queries. The queries needs to be SQLite compatible.",
+    args_schema=ResearchDatabaseToolInput
+)
+@cl.on_chat_start
+def start():
+    """
+    Build the agent for a new chat and store it in the user session.
+    The agent gets the two tools defined in this module and a GPT-4 chat model
+    with temperature 0; main() later reads it back under the "agent" key.
+    """
+    tools = [execute_sql_tool, research_database_tool]
+    llm = ChatOpenAI(model="gpt-4", temperature=0, verbose=True)
+    # The template has a single placeholder, {input}, for the user query.
+    prompt = ChatPromptTemplate.from_template(
+        """
+            You are a SQLite world class data scientist, based on user query
+            use your tools to do the job. Usually you would start by analyzing
+            for possible SQL queries the user wants to build based on your knowledge base.
+            Remember your tools are:
+            - execute_sql (bring back the results as of running the query against the database)
+            - research_database (search for table definitions so you can build a SQLite Query)
+            Remember, you are building SQLite compatible queries. If you don't know the answer don't
+            make anything up. Always ask for feedback. One last detail: always run the querys with LIMIT 10 and add
+            the SQL query as markdown to the final answer so the user knows what SQL query was used for the job and
+            can copy it for further use.
+            REMEMBER TO GENERATE ALWAYS SQLITE COMPATIBLE QUERIES.
+            User query: {input}
+        """
+    )
+    # NOTE(review): initialize_agent() does not appear to have a named "prompt"
+    # parameter; confirm this template is actually applied (customisation may
+    # need to go through agent_kwargs). The tool names in the template also
+    # differ from the registered names "Execute SQL" / "Search db info".
+    agent = initialize_agent(tools=tools, prompt=prompt,
+                             llm=llm, handle_parsing_errors=True)
+    cl.user_session.set("agent", agent)
+@cl.on_message
+async def main(message: cl.Message):
+    agent = cl.user_session.get("agent")  # type: AgentExecutor
+    res = await agent.arun(
+        message.content, callbacks=[cl.AsyncLangchainCallbackHandler()]
+    )
+    await cl.Message(content=res).send()

chainlit.md ADDED Viewed

	@@ -0,0 +1,5 @@

+To start, you can try to ask a simple question:
+```"I need all customers please!"```
+#### Then you can expand each cell execution to watch the agent's work and analyze each step through the process.

modules/database/__init__.py ADDED Viewed

File without changes

modules/database/database.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import json
+import psycopg2
+from tabulate import tabulate
+class PostgresDB:
+    """
+    A class to manage postgres connections and queries
+    """
+    def __init__(self):
+        self.conn = None
+        self.cur = None
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.cur:
+            self.cur.close()
+        if self.conn:
+            self.conn.close()
+    def connect_with_url(self, url):
+        self.conn = psycopg2.connect(url)
+        self.cur = self.conn.cursor()
+    def close(self):
+        if self.cur:
+            self.cur.close()
+        if self.conn:
+            self.conn.close()
+    def run_sql(self, sql) -> str:
+        """
+        Run a SQL query against the postgres database.
+        Returns JSON.
+        """
+        self.cur.execute(sql)
+        columns = [desc[0] for desc in self.cur.description]
+        res = self.cur.fetchall()
+        list_of_dicts = [dict(zip(columns, row)) for row in res]
+        json_result = json.dumps(list_of_dicts, indent=4)
+        return json_result
+    # method to run a sql and return markdown
+    def run_sql_to_markdown(self, sql) -> str:
+        """
+        Run a SQL query against the postgres database
+        Returns markdown table.
+        """
+        self.cur.execute(sql)
+        columns = [desc[0] for desc in self.cur.description]
+        res = self.cur.fetchall()
+        list_of_dicts = [dict(zip(columns, row)) for row in res]
+        markdown_table = self.to_markdown(list_of_dicts)
+        print(markdown_table)
+        return markdown_table
+    @staticmethod
+    def to_markdown(data) -> str:
+        """
+        Convert a list of dictionaries to markdown
+        """
+        return tabulate(data, headers="keys", tablefmt="pipe")

modules/database/sqlitedatabase.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import sqlite3
+import tabulate
+class Database:
+  def __init__(self, uri):
+    self.uri = uri
+    self.connection = None
+  def connect(self):
+    self.connection = sqlite3.connect(self.uri)
+  def execute_query(self, query):
+    cursor = self.connection.cursor()
+    cursor.execute(query)
+    result = cursor.fetchall()
+    cursor.close()
+    headers = [description[0] for description in cursor.description]
+    return tabulate.tabulate(result, headers, tablefmt="pipe")
+  def close(self):
+    self.connection.close()

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+openai==1.10.0
+psycopg==3.1.14
+psycopg2-binary==2.9.9
+python-dotenv==1.0.0
+tiktoken==0.5.2
+python-dotenv==1.0.0
+sqlalchemy[asyncio]
+chainlit==1.0.200
+langchain==0.1.4
+langchain-community==0.0.16
+langchain-openai==0.0.5
+asyncpg
+db-dtypes==1.2.0
+tabulate==0.9.0
+chromadb==0.4.22