Spaces:
Sleeping
Sleeping
import os | |
from dotenv import load_dotenv | |
from langchain_openai import ChatOpenAI | |
import pandas as pd | |
from sqlalchemy import create_engine | |
# load environment variables from .env file | |
load_dotenv() | |
openai_key = os.getenv("OPENAI_API_KEY") | |
llm_name = "gpt-3.5-turbo" | |
model = ChatOpenAI(api_key=openai_key, model=llm_name) | |
# read csv file | |
df = pd.read_csv("Struct Data_Data Science 100K.csv") | |
from langchain.agents import create_sql_agent | |
from langchain_community.agent_toolkits.sql.toolkit import SQLDatabaseToolkit | |
from langchain_community.utilities import SQLDatabase | |
# Create db from csv file | |
# Path to your SQLite database file | |
database_file_path = "data_science_100k.db" | |
# Create an engine to connect to the SQLite database | |
# SQLite only requires the path to the database file | |
engine = create_engine(f"sqlite:///{database_file_path}") | |
file_url = "./Struct Data_Data Science 100K.csv" | |
# os.makedirs(os.path.dirname(database_file_path), exist_ok=True) | |
df = pd.read_csv(file_url) | |
df.to_sql("DataScience100k", con=engine, if_exists="replace", index=False) | |
print(f"Database created successfully! {df}") | |
# db = SQLDatabase.from_uri(f"sqlite:///{database_file_path}") | |
# toolkit = SQLDatabaseToolkit(db=db, llm=model) | |
# QUESTION = """How many data scietists are their and their avg salaries, and also how many of them are from US""" | |
# sql_agent = create_sql_agent( | |
# toolkit=toolkit, | |
# llm=model, | |
# verbose=True | |
# ) | |
# sql_agent.invoke(QUESTION) | |
# res = sql_agent.invoke(QUESTION) | |
# # print(res) | |
# Part 2 : Prepare the sql prompt | |
MSSQL_AGENT_PREFIX = """ | |
You are an agent designed to interact with a SQL database. | |
## Instructions: | |
- Given an input question, create a syntactically correct {dialect} query | |
to run, then look at the results of the query and return the answer. | |
- Unless the user specifies a specific number of examples they wish to | |
obtain, **ALWAYS** limit your query to at most {top_k} results. | |
- You can order the results by a relevant column to return the most | |
interesting examples in the database. | |
- Never query for all the columns from a specific table, only ask for | |
the relevant columns given the question. | |
- You have access to tools for interacting with the database. | |
- You MUST double check your query before executing it.If you get an error | |
while executing a query,rewrite the query and try again. | |
- DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.) | |
to the database. | |
- DO NOT MAKE UP AN ANSWER OR USE PRIOR KNOWLEDGE, ONLY USE THE RESULTS | |
OF THE CALCULATIONS YOU HAVE DONE. | |
- Your response should be in Markdown. However, **when running a SQL Query | |
in "Action Input", do not include the markdown backticks**. | |
Those are only for formatting the response, not for executing the command. | |
- ALWAYS, as part of your final answer, explain how you got to the answer | |
on a section that starts with: "Explanation:". Include the SQL query as | |
part of the explanation section. | |
- If the question does not seem related to the database, just return | |
"I don\'t know" as the answer. | |
- Only use the below tools. Only use the information returned by the | |
below tools to construct your query and final answer. | |
- Do not make up table names, only use the tables returned by any of the | |
tools below. | |
- as part of your final answer, please include the SQL query you used in json format or code format | |
## Tools: | |
""" | |
MSSQL_AGENT_FORMAT_INSTRUCTIONS = """ | |
## Use the following format: | |
Question: the input question you must answer. | |
Thought: you should always think about what to do. | |
Action: the action to take, should be one of [{tool_names}]. | |
Action Input: the input to the action. | |
Observation: the result of the action. | |
(this Thought/Action/Action Input/Observation can repeat N times) | |
Thought: I now know the final answer. | |
Final Answer: the final answer to the original input question. | |
Example of Final Answer: | |
<=== Beginning of example | |
Action: query_sql_db | |
Action Input: | |
SELECT TOP (10) [base_salary], [grade] | |
FROM salaries_2023 | |
WHERE state = 'Division' | |
Observation: | |
[(27437.0,), (27088.0,), (26762.0,), (26521.0,), (26472.0,), (26421.0,), (26408.0,)] | |
Thought:I now know the final answer | |
Final Answer: There were 27437 workers making 100,000. | |
Explanation: | |
I queried the `xyz` table for the `salary` column where the department | |
is 'IGM' and the date starts with '2020'. The query returned a list of tuples | |
with the bazse salary for each day in 2020. To answer the question, | |
I took the sum of all the salaries in the list, which is 27437. | |
I used the following query | |
```sql | |
SELECT [salary] FROM xyztable WHERE department = 'IGM' AND date LIKE '2020%'" | |
``` | |
===> End of Example | |
""" | |
db = SQLDatabase.from_uri(f"sqlite:///{database_file_path}") | |
toolkit = SQLDatabaseToolkit(db=db, llm=model) | |
# QUESTION = """How many data scietists are their and their avg salaries, and also how many of them are from US""" | |
sql_agent = create_sql_agent( | |
prefix=MSSQL_AGENT_PREFIX, | |
format_instructions=MSSQL_AGENT_FORMAT_INSTRUCTIONS, | |
toolkit=toolkit, | |
llm=model, | |
tok_k=30, | |
verbose=True | |
) | |
# res = sql_agent.invoke(QUESTION) | |
import streamlit as st | |
st.title("SQL Query AI Agent") | |
question = st.text_input("Enter your query:") | |
if st.button("Run Query"): | |
if question: | |
res = sql_agent.invoke(question) | |
st.markdown(res["output"]) | |
else: | |
st.error("Please Enter a Query.") |