Text-to-SQL / create_query_engine.py
Dylan-Kaneshiro's picture
Create create_query_engine.py
8fb353c
raw
history blame
1.65 kB
import sqlalchemy
from langchain.document_loaders import PyPDFLoader
import pandas as pd
from llama_index.objects import (
SQLTableNodeMapping,
ObjectIndex,
SQLTableSchema,
)
from llama_index import SQLDatabase
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index.indices.struct_store import SQLTableRetrieverQueryEngine
def read_context_pdf(file):
    """Parse a table-context PDF into ``[table_name, description]`` entries.

    The text of every page is concatenated and read as a list of
    ``;``-separated entries, each written as ``table_name: description``.

    Args:
        file: Object whose ``name`` attribute is the path of the PDF to load
            (presumably an uploaded-file handle -- confirm against caller).

    Returns:
        A list with one list of strings per non-blank entry. Each entry is
        split on its first ``:`` only, so a description may itself contain
        colons; both parts are stripped of surrounding whitespace. An entry
        that contains no ``:`` at all yields a one-element list.
    """
    loader = PyPDFLoader(file.name)
    text = "".join(page.page_content for page in loader.load())

    entries = []
    for fragment in text.split(";"):
        fragment = fragment.strip()
        if not fragment:
            # A trailing ";" (or ";;") leaves a blank fragment that names no
            # table; keeping it would hand callers an unusable [""] entry.
            continue
        # Split on the first colon only so colons inside the description
        # are preserved instead of cutting the description short.
        entries.append([part.strip() for part in fragment.split(":", 1)])
    return entries
def query(engine, sql_query):
    """Execute a raw SQL string and return the result set as a DataFrame.

    Args:
        engine: Object providing ``begin()`` as a context manager that yields
            a connection (a SQLAlchemy engine in this file).
        sql_query: SQL text; it is wrapped with ``sqlalchemy.text`` before
            being handed to pandas.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_sql_query``.
    """
    with engine.begin() as connection:
        statement = sqlalchemy.text(sql_query)
        return pd.read_sql_query(statement, connection)
def create_query_engine(context_pdf, username, password, host, port, mydatabase):
    """Build a table-retrieving text-to-SQL query engine for a PostgreSQL database.

    Args:
        context_pdf: PDF handle forwarded to ``read_context_pdf``; its text
            supplies ``table_name: description`` entries.
        username: Database user name.
        password: Database password.
        host: Database host.
        port: Database port.
        mydatabase: Name of the database to connect to.

    Returns:
        A 3-tuple ``(query_engine, engine, "Connection good")`` where
        ``query_engine`` is the ``SQLTableRetrieverQueryEngine`` and
        ``engine`` is the SQLAlchemy engine it runs on. The status string is
        returned whenever construction finishes without raising.

    Raises:
        ValueError: If a non-blank context entry lacks a table name or a
            ``:``-separated description.
    """
    from urllib.parse import quote

    # Parse context pdf
    context = read_context_pdf(context_pdf)

    # create sql engine
    # Credentials are percent-encoded so that characters such as "@", ":",
    # "/" or "%" in a password are not mistaken for URI delimiters.
    user = quote(str(username), safe="")
    secret = quote(str(password), safe="")
    pg_uri = f"postgresql+psycopg2://{user}:{secret}@{host}:{port}/{mydatabase}"
    engine = sqlalchemy.create_engine(pg_uri)
    sql_database = SQLDatabase(engine)

    # create context mapping
    table_node_mapping = SQLTableNodeMapping(sql_database)
    table_schema_objs = []
    for entry in context:
        if not any(part.strip() for part in entry):
            # Blank entry, e.g. produced by a trailing ";" in the PDF text.
            continue
        table_name = entry[0].strip()
        if len(entry) < 2 or not table_name:
            raise ValueError(
                f"Malformed context entry {entry!r}: "
                "expected 'table_name: description'"
            )
        # Re-join so a description that itself contains ":" stays intact.
        context_str = ":".join(entry[1:]).strip()
        table_schema_objs.append(
            SQLTableSchema(table_name=table_name, context_str=context_str)
        )

    obj_index = ObjectIndex.from_objects(
        table_schema_objs,
        table_node_mapping,
        VectorStoreIndex,
    )
    query_engine = SQLTableRetrieverQueryEngine(
        sql_database, obj_index.as_retriever(similarity_top_k=3)
    )
    return query_engine, engine, "Connection good"