import sqlalchemy from langchain.document_loaders import PyPDFLoader import pandas as pd from llama_index.objects import ( SQLTableNodeMapping, ObjectIndex, SQLTableSchema, ) from llama_index import SQLDatabase from llama_index.indices.vector_store.base import VectorStoreIndex from llama_index.indices.struct_store import SQLTableRetrieverQueryEngine def read_context_pdf(file): filepath = file.name loader = PyPDFLoader(filepath) pages = loader.load() content = "".join([page.page_content for page in pages]) content = [c.lstrip() for c in content.split(";")] content = [c.split(":") for c in content] return content def query(engine, sql_query): with engine.begin() as conn: df = pd.read_sql_query(sqlalchemy.text(sql_query), conn) return df def create_query_engine(context_pdf, username, password, host, port, mydatabase): # Parse context pdf context = read_context_pdf(context_pdf) # create sql engine pg_uri = f"postgresql+psycopg2://{username}:{password}@{host}:{port}/{mydatabase}" engine = sqlalchemy.create_engine(pg_uri) sql_database = SQLDatabase(engine) # create context mapping table_node_mapping = SQLTableNodeMapping(sql_database) table_schema_objs = [(SQLTableSchema(table_name=c[0], context_str=c[1])) for c in context] obj_index = ObjectIndex.from_objects( table_schema_objs, table_node_mapping, VectorStoreIndex, ) query_engine = SQLTableRetrieverQueryEngine( sql_database, obj_index.as_retriever(similarity_top_k=3) ) return query_engine, engine, "Connection good"