Spaces:

Nikhil-Murade
/

TalentEdge-AI-Agent

Sleeping

File size: 5,354 Bytes

import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI 
import pandas as pd

from sqlalchemy import create_engine

# load environment variables from .env file
load_dotenv()

openai_key = os.getenv("OPENAI_API_KEY")

llm_name = "gpt-3.5-turbo"
model = ChatOpenAI(api_key=openai_key, model=llm_name)

# read csv file
df = pd.read_csv("Struct Data_Data Science 100K.csv")

from langchain.agents import create_sql_agent
from langchain_community.agent_toolkits.sql.toolkit import SQLDatabaseToolkit
from langchain_community.utilities import SQLDatabase

# Create db from csv file 

# Path to your SQLite database file 
database_file_path = "data_science_100k.db"

# Create an engine to connect to the SQLite database
# SQLite only requires the path to the database file
engine = create_engine(f"sqlite:///{database_file_path}")
file_url = "./Struct Data_Data Science 100K.csv"
# os.makedirs(os.path.dirname(database_file_path), exist_ok=True)

df = pd.read_csv(file_url)
df.to_sql("DataScience100k", con=engine, if_exists="replace", index=False)

print(f"Database created successfully! {df}")


# db = SQLDatabase.from_uri(f"sqlite:///{database_file_path}")
# toolkit = SQLDatabaseToolkit(db=db, llm=model)

# QUESTION = """How many data scietists are their and their avg salaries, and also how many of them are from US"""

# sql_agent = create_sql_agent(
#     toolkit=toolkit,
#     llm=model,
#     verbose=True
# )

# sql_agent.invoke(QUESTION)

# res = sql_agent.invoke(QUESTION)

# # print(res)

# Part 2 : Prepare the sql prompt
MSSQL_AGENT_PREFIX = """
You are an agent designed to interact with a SQL database.
## Instructions:
- Given an input question, create a syntactically correct {dialect} query
to run, then look at the results of the query and return the answer.
- Unless the user specifies a specific number of examples they wish to
obtain, **ALWAYS** limit your query to at most {top_k} results.
- You can order the results by a relevant column to return the most
interesting examples in the database.
- Never query for all the columns from a specific table, only ask for
the relevant columns given the question.
- You have access to tools for interacting with the database.
- You MUST double check your query before executing it.If you get an error
while executing a query,rewrite the query and try again.
- DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.)
to the database.
- DO NOT MAKE UP AN ANSWER OR USE PRIOR KNOWLEDGE, ONLY USE THE RESULTS
OF THE CALCULATIONS YOU HAVE DONE.
- Your response should be in Markdown. However, **when running  a SQL Query
in "Action Input", do not include the markdown backticks**.
Those are only for formatting the response, not for executing the command.
- ALWAYS, as part of your final answer, explain how you got to the answer
on a section that starts with: "Explanation:". Include the SQL query as
part of the explanation section.
- If the question does not seem related to the database, just return
"I don\'t know" as the answer.
- Only use the below tools. Only use the information returned by the
below tools to construct your query and final answer.
- Do not make up table names, only use the tables returned by any of the
tools below.
- as part of your final answer, please include the SQL query you used in json format or code format

## Tools:
 
"""

MSSQL_AGENT_FORMAT_INSTRUCTIONS = """

## Use the following format:

Question: the input question you must answer.
Thought: you should always think about what to do.
Action: the action to take, should be one of [{tool_names}].
Action Input: the input to the action.
Observation: the result of the action.
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer.
Final Answer: the final answer to the original input question.

Example of Final Answer:
<=== Beginning of example

Action: query_sql_db
Action Input: 
SELECT TOP (10) [base_salary], [grade] 
FROM salaries_2023

WHERE state = 'Division'

Observation:
[(27437.0,), (27088.0,), (26762.0,), (26521.0,), (26472.0,), (26421.0,), (26408.0,)]
Thought:I now know the final answer
Final Answer: There were 27437 workers making 100,000.

Explanation:
I queried the `xyz` table for the `salary` column where the department
is 'IGM' and the date starts with '2020'. The query returned a list of tuples
with the bazse salary for each day in 2020. To answer the question,
I took the sum of all the salaries in the list, which is 27437.
I used the following query

```sql
SELECT [salary] FROM xyztable WHERE department = 'IGM' AND date LIKE '2020%'"
```
===> End of Example

"""

db = SQLDatabase.from_uri(f"sqlite:///{database_file_path}")
toolkit = SQLDatabaseToolkit(db=db, llm=model)

# QUESTION = """How many data scietists are their and their avg salaries, and also how many of them are from US"""

sql_agent = create_sql_agent(
    prefix=MSSQL_AGENT_PREFIX,
    format_instructions=MSSQL_AGENT_FORMAT_INSTRUCTIONS,
    toolkit=toolkit,
    llm=model,
    tok_k=30,
    verbose=True
)

# res = sql_agent.invoke(QUESTION)


import streamlit as st 

st.title("SQL Query AI Agent")

question = st.text_input("Enter your query:")

if st.button("Run Query"):
    if question:
        res = sql_agent.invoke(question)

        st.markdown(res["output"])

else:
    st.error("Please Enter a Query.")