Spaces:

ramhemanth580
/

NL_2_SQL_Data_Analysis_Chatbot

Runtime error

App Files Files Community

ramhemanth580 committed on Apr 1

Commit

8d66574

•

1 Parent(s): f34d870

Upload 8 files

Browse files

Files changed (8) hide show

app.py +56 -0
database_schema.png +0 -0
database_table_descriptions.csv +9 -0
examples.py +149 -0
langchain_utils.py +64 -0
prompts.py +38 -0
requirements.txt +27 -0
table_details.py +114 -0

app.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import streamlit as st
+import os
+from dotenv import load_dotenv
+import google.generativeai as genai
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_utils import  get_chain
+from langchain.memory import ChatMessageHistory
+from PIL import Image
+st.title("Langchain NL2SQL Chatbot")
+# Set Google GenAI API key from Streamlit secrets
+#client = OpenAI(api_key="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")
+load_dotenv()
+genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
+llm = ChatGoogleGenerativeAI(model="gemini-pro",temperature=0,convert_system_message_to_human=True)
+# Set a default model
+if "Gemini_model" not in st.session_state:
+    st.session_state["Gemini_model"] = "gemini-pro"
+history = ChatMessageHistory()
+if "messages" not in st.session_state:
+    # print("Creating session state")
+    st.session_state.messages = []
+def invoke_chain(question,messages):
+    chain = get_chain()
+    #history = create_history(messages)
+    response = chain.invoke({"question": question,"top_k":3,"messages":history.messages})
+    # history.add_user_message(question)
+    # history.add_ai_message(response)
+    return response
+question = st.text_input("Ask a Question about the database")
+# if question :
+#     st.session_state.messages.append({"role": "user", "content": question})
+#     history.add_user_message(question)
+#     response = invoke_chain(question, st.session_state.messages)
+#     history.add_ai_message(response)
+#     st.session_state.messages.append({"role": "assistant", "content": response})
+if st.button("submit") :
+    if question  :
+        response = invoke_chain(question, st.session_state.messages)
+        st.markdown(response)
+# Set up the sidebar with a button
+st.sidebar.title("Database Info")
+if st.sidebar.button('Show Database Schema'):
+    # Display the database schema image when the button is clicked
+    image = Image.open('database_schema.PNG')
+    st.image(image, caption='Database Schema', use_column_width=True)

database_schema.png ADDED Viewed

database_table_descriptions.csv ADDED Viewed

	@@ -0,0 +1,9 @@

+Table,Description
+productlines,"Stores information about the different product lines offered by the company, including a unique name, textual description, HTML description, and image. Categorizes products into different lines."
+products,"Contains details of each product sold by the company, including code, name, product line, scale, vendor, description, stock quantity, buy price, and MSRP. Linked to the productlines table."
+offices,"Holds data on the company's sales offices, including office code, city, phone number, address, state, country, postal code, and territory. Each office is uniquely identified by its office code."
+employees,"Stores information about employees, including number, last name, first name, job title, contact info, and office code. Links to offices and maps organizational structure through the reportsTo attribute."
+customers,"Captures data on customers, including customer number, name, contact details, address, assigned sales rep, and credit limit. Central to managing customer relationships and sales processes."
+payments,"Records payments made by customers, tracking the customer number, check number, payment date, and amount. Linked to the customers table for financial tracking and account management."
+orders,"Details each sales order placed by customers, including order number, dates, status, comments, and customer number. Linked to the customers table, tracking sales transactions."
+orderdetails,"Describes individual line items for each sales order, including order number, product code, quantity, price, and order line number. Links orders to products, detailing the items sold."

examples.py ADDED Viewed

	@@ -0,0 +1,149 @@

+examples = [
+  {
+      "input": "List all customers in France with a credit limit over 20,000.",
+      "query": "SELECT * FROM customers WHERE country = 'France' AND creditLimit > 20000;"
+  },
+  {
+      "input": "Get the highest payment amount made by any customer.",
+      "query": "SELECT MAX(amount) FROM payments;"
+  },
+  {
+      "input": "Show product details for products in the 'Motorcycles' product line.",
+      "query": "SELECT * FROM products WHERE productLine = 'Motorcycles';"
+  },
+  {
+      "input": "Retrieve the names of employees who report to employee number 1002.",
+      "query": "SELECT firstName, lastName FROM employees WHERE reportsTo = 1002;"
+  },
+  {
+      "input": "List all products with a stock quantity less than 7000.",
+      "query": "SELECT productName, quantityInStock FROM products WHERE quantityInStock < 7000;"
+  },
+  {
+    'input':"what is price of `1968 Ford Mustang`",
+    "query": "SELECT `buyPrice`, `MSRP` FROM products  WHERE `productName` = '1968 Ford Mustang' LIMIT 1;"
+  },
+  {
+    "input": "List products sold by order date.",
+    "query": "SELECT productName , orderDate , DAYNAME(orderDate) AS 'DayName' FROM products INNER JOIN orderdetails ON products.productCode = orderdetails.productCode INNER JOIN Orders ON orderdetails.orderNumber = orders.orderNumber WHERE DAYNAME(Orders.orderDate) = 'MONDAY';"
+  },
+  {
+    "input": "List the order dates in descending order for orders for the 1940 Ford Pickup Truck.",
+    "query": "SELECT DISTINCT(products.productName), orders.orderDate FROM orders JOIN orderdetails ON orderdetails.orderNumber = orders.orderNumber JOIN products ON orderdetails.productCode = products.productCode WHERE productName = '1940 Ford Pickup Truck' ORDER BY orderDate DESC;"
+  },
+  {
+    "input": "List the names of customers and their corresponding order number where a particular order from that customer has a value greater than $25,000.",
+    "query": "SELECT customers.customerName, orders.orderNumber, SUM(orderdetails.priceEach * orderdetails.quantityOrdered) AS tot_value FROM customers JOIN orders ON customers.customerNumber = orders.customerNumber JOIN orderdetails ON orders.orderNumber = orderdetails.orderNumber GROUP BY customers.customerName, orders.orderNumber HAVING tot_value > 25000 ORDER BY customers.customerName;"
+  },
+  {
+    "input": "For orders containing more than two products, report those products that constitute more than 50% of the value of the order.",
+    "query": "SELECT orderNumber, productName, ProductsCount ,contribution FROM (SELECT orderNumber, productCode, (SELECT Count(*) FROM orderdetails WHERE OrderNumber = Main.orderNumber) As 'ProductsCount', quantityOrdered*priceEach As 'Product Value', (quantityOrdered*priceEach / (SELECT SUM(quantityOrdered*priceEach) FROM orderdetails WHERE orderNumber = Main.orderNumber ))*100 As 'Contribution' FROM orderdetails Main ORDER BY orderNumber) DataTable INNER JOIN Products ON Products.productCode = DataTable.productCode WHERE ProductsCount > 2 AND Contribution > 50;"
+  },
+  {
+    "input": "List all the products purchased by Herkku Gifts.",
+    "query": "SELECT productName FROM products INNER JOIN orderdetails od on products.productCode = od.productCode INNER JOIN orders o on od.orderNumber = o.orderNumber INNER JOIN customers c on o.customerNumber = c.customerNumber WHERE c.customerName = 'Herkku Gifts';"
+  },
+  {
+    "input": "Find products containing the name 'Ford'.",
+    "query": "SELECT productName AS 'Products' FROM Products WHERE productName LIKE '%Ford%';"
+  },
+  {
+    "input": "List products ending in 'ship'.",
+    "query": "SELECT productName FROM products WHERE productName LIKE '%ship';"
+  },
+  {
+    "input": "Report the number of customers in Denmark, Norway, and Sweden.",
+    "query": "SELECT customerName FROM Customers WHERE country IN ('Denmark','Norway','Sweden');"
+  },
+  {
+    "input": "What are the products with a product code in the range S700_1000 to S700_1499",
+    "query": "SELECT productCode,productName FROM Products WHERE RIGHT(productCode,4) BETWEEN 1000 AND 1499 ORDER BY RIGHT(productCode,4);"
+  },
+  {
+    "input": "Which customers have a digit in their name?",
+    "query": "SELECT customerName FROM Customers WHERE customerName RLIKE '[0-9]';"
+  },
+  {
+    "input": "List the names of employees called Dianne or Diane.",
+    "query": "SELECT CONCAT(firstName,' ',lastName) AS 'Employee Name' FROM Employees WHERE lastName RLIKE 'Dianne|Diane' OR firstName RLIKE 'Dianne|Diane';"
+  },
+  {
+    "input": "List the products containing ship or boat in their product name.",
+    "query": "SELECT productName FROM Products WHERE productName RLIKE 'ship|boat';"
+  },
+  {
+    "input": "List the products with a product code beginning with S700.",
+    "query": "SELECT productCode, productName FROM Products WHERE productCode LIKE 'S700%';"
+  },
+  {
+    "input": "Find products containing the name 'Ford'.",
+    "query": "SELECT productName As 'Products' FROM Products WHERE productName LIKE '%Ford%';"
+  },
+  {
+    "input": "List products ending in 'ship'.",
+    "query": "SELECT productName FROM products WHERE productName LIKE '%ship';"
+  },
+  {
+    "input": "Report the number of customers in Denmark, Norway, and Sweden.",
+    "query": "SELECT customerName FROM Customers WHERE country IN ('Denmark','Norway','Sweden');"
+  },
+  {
+    "input": "what is the minimum payment received ?",
+    "query": "SELECT min(amount) As 'Minimum Payment' FROM payments;"
+  }
+]
+from langchain_community.vectorstores import Chroma
+from langchain_core.example_selectors import SemanticSimilarityExampleSelector
+from langchain.embeddings import HuggingFaceEmbeddings
+import google.generativeai as genai
+import streamlit as st
+import os
+from dotenv import load_dotenv
+load_dotenv()
+load_dotenv()
+genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
+# Access the value of Huggingface_API_KEY
+HF_API_TOKEN = os.getenv("HF_API_TOKEN")
+#embeddings = HuggingFaceEmbeddings(huggingfacehub_api_token=HF_API_TOKEN,model_name="sentence-transformers/all-MiniLM-L6-v2")
+#embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+class CustomGoogleGenerativeAIEmbeddings:
+    def __init__(self, model, task_type=None):
+        # Initialize the GoogleGenerativeAIEmbeddings with the model and task type
+        self.embeddings = GoogleGenerativeAIEmbeddings(model=model, task_type=task_type)
+    def __call__(self, input):
+        # Use the embed_query method for single inputs
+        return self.embeddings.embed_query(input)
+    def embed_query(self, text):
+        # Use the embed_query method to generate an embedding for a single piece of text
+        return self.embeddings.embed_query(text)
+    def embed_documents(self, documents):
+        # Use the embed_documents method to generate embeddings for multiple pieces of text
+        return self.embeddings.embed_documents(documents)
+# Usage: build the module-level embeddings adapter and vector store that
+# get_example_selector() passes to the example selector.
+model = "models/embedding-001"  # Replace with your actual model name
+task_type = "retrieval_document"  # Replace with your actual task type if needed
+embeddings = CustomGoogleGenerativeAIEmbeddings(model=model, task_type=task_type)
+vectorstore = Chroma()
+# NOTE(review): presumably clears entries left in the default collection by
+# an earlier run so examples are not indexed twice - confirm.
+vectorstore.delete_collection()
+@st.cache_resource
+def get_example_selector():
+    example_selector = SemanticSimilarityExampleSelector.from_examples(
+        examples,
+        embeddings,
+        vectorstore,
+        k=4,
+        input_keys=["input"],
+    )
+    return example_selector

langchain_utils.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import os
+from dotenv import load_dotenv
+from operator import itemgetter
+load_dotenv()
+db_user = os.getenv("db_user")
+db_password = os.getenv("db_password")
+db_host = os.getenv("db_host")
+db_name = os.getenv("db_name")
+import google.generativeai as genai
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_community.utilities.sql_database import SQLDatabase
+from langchain.chains import create_sql_query_chain
+from langchain_openai import ChatOpenAI
+from langchain_community.tools.sql_database.tool import QuerySQLDataBaseTool
+from langchain.memory import ChatMessageHistory
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough
+from table_details import table_chain as select_table
+from prompts import final_prompt, answer_prompt
+import streamlit as st
+genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
+llm = ChatGoogleGenerativeAI(model="gemini-pro",temperature=0,convert_system_message_to_human=True)
+@st.cache_resource
+def get_chain():
+    #print("Creating chain")
+    db = SQLDatabase.from_uri(f"mysql+pymysql://{db_user}:{db_password}@{db_host}/{db_name}")
+    generate_query = create_sql_query_chain(llm, db,final_prompt)
+    execute_query = QuerySQLDataBaseTool(db=db)
+    rephrase_answer = answer_prompt | llm | StrOutputParser()
+    # chain = generate_query | execute_query
+    chain = (
+    RunnablePassthrough.assign(table_names_to_use=select_table) |
+    RunnablePassthrough.assign(query=generate_query).assign(
+        result=itemgetter("query") | execute_query
+    )
+    | rephrase_answer
+)
+    return chain
+def create_history(messages):
+    history = ChatMessageHistory()
+    for message in messages:
+        if message["role"] == "user":
+            history.add_user_message(message["content"])
+        else:
+            history.add_ai_message(message["content"])
+    return history
+def invoke_chain(question,messages):
+    chain = get_chain()
+    history = create_history(messages)
+    response = chain.invoke({"question": question,"top_k":3,"messages":history.messages})
+    history.add_user_message(question)
+    history.add_ai_message(response)
+    return response

prompts.py ADDED Viewed

	@@ -0,0 +1,38 @@

+from examples import get_example_selector
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder,FewShotChatMessagePromptTemplate,PromptTemplate
+# Template used to render one few-shot example: the question followed by
+# "SQLQuery:" as the human turn and the stored query as the AI turn.
+example_prompt = ChatPromptTemplate.from_messages(
+    [
+        ("human", "{input}\nSQLQuery:"),
+        ("ai", "{query}"),
+    ]
+)
+# Few-shot section; the examples are chosen by the selector returned from
+# get_example_selector(), which is called once when this module is imported.
+few_shot_prompt = FewShotChatMessagePromptTemplate(
+    example_prompt=example_prompt,
+    example_selector=get_example_selector(),
+    input_variables=["input","top_k"],
+)
+# Query-generation prompt: system instructions with {table_info}, the
+# few-shot examples, the prior conversation ("messages") and the question.
+# NOTE(review): the system text ends its first line with the dangling
+# fragment "Unless otherwise specificed." - looks like a truncated sentence.
+final_prompt = ChatPromptTemplate.from_messages(
+    [
+        (
+        "system", """You are a MySQL expert, Given an input question ,create a syntactically correct MySQL query to run.Unless otherwise specificed.\n\n
+         Here is the relevant table info: {table_info}\n\n
+         Below are a number of examples of questions and their corresponding SQL queries. Return the syntactically correct SQL query only and nothing else.\n\n
+        """
+        ),
+        few_shot_prompt,
+        MessagesPlaceholder(variable_name="messages"),
+        ("human", "{input}"),
+    ]
+)
+# Prompt that turns the question, the generated SQL and its result into a
+# natural-language answer.
+answer_prompt = PromptTemplate.from_template(
+    """Given the following user question, corresponding SQL query, and SQL result, answer the user question.
+Question: {question}
+SQL Query: {query}
+SQL Result: {result}
+Answer: """
+)

requirements.txt ADDED Viewed

	@@ -0,0 +1,27 @@

+streamlit
+streamlit_chat
+python-dotenv
+chromadb
+faiss-cpu
+# logging, warnings, operator, typing and ast are part of the Python
+# standard library and must not be listed as pip requirements.
+# The PIL import name is provided by the Pillow distribution.
+Pillow
+pandas
+numpy
+google-generativeai
+langchain
+langchain_community
+langchain_core
+langchain_google_genai
+# Imported by langchain_utils.py (ChatOpenAI).
+langchain_openai
+sentence-transformers==2.2.2
+mysql-connector-python
+pymysql

table_details.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import os
+from dotenv import load_dotenv
+import pandas as pd
+import streamlit as st
+from operator import itemgetter
+#from langchain.chains.openai_tools import create_extraction_chain_pydantic
+from langchain_core.pydantic_v1 import BaseModel, Field
+#from langchain_openai import ChatOpenAI
+from langchain.chains import LLMChain
+from langchain_core.prompts import ChatPromptTemplate
+import google.generativeai as genai
+from langchain_google_genai import ChatGoogleGenerativeAI
+from typing import List
+load_dotenv()
+genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
+llm = ChatGoogleGenerativeAI(model="gemini-pro",temperature=0,convert_system_message_to_human=True)
+@st.cache_data
+def get_table_details():
+    # Read the CSV file into a DataFrame
+    table_description = pd.read_csv("database_table_descriptions.csv")
+    table_docs = []
+    # Iterate over the DataFrame rows to create Document objects
+    table_details = ""
+    for index, row in table_description.iterrows():
+        table_details = table_details + "Table Name:" + row['Table'] + "\n" + "Table Description:" + row['Description'] + "\n\n"
+    return table_details
+# Schema describing a single table name.
+# NOTE(review): a plain class with the same name is declared further down in
+# this module and rebinds `Table`.
+class Table(BaseModel):
+    """Table in SQL database."""
+    name: str = Field(description="Name of table in SQL database.")
+table_details = get_table_details()
+prompt2 = ChatPromptTemplate.from_template(
+   """
+  You are a helpful Data science assistant , Your objective is to analyze the following table descriptions and Return the names of ALL the SQL tables that MIGHT be relevant to the question: {question}
+  \n\nRemember to include ALL POTENTIALLY RELEVANT tables, even if you're not sure that they're needed.and you should return the table names as a list
+  for example question : which customers made the top 5 highest payments
+  the desired answer should be ['customers','payments']
+  \n\nThe tables descriptions are:
+  Table Name:productlines
+  Table Description:Stores information about the different product lines offered by the company, including a unique name, textual description, HTML description, and image. Categorizes products into different lines.
+  Table Name:products
+  Table Description:Contains details of each product sold by the company, including code, name, product line, scale, vendor, description, stock quantity, buy price, and MSRP. Linked to the productlines table.
+  Table Name:offices
+  Table Description:Holds data on the company's sales offices, including office code, city, phone number, address, state, country, postal code, and territory. Each office is uniquely identified by its office code.
+  Table Name:employees
+  Table Description:Stores information about employees, including number, last name, first name, job title, contact info, and office code. Links to offices and maps organizational structure through the reportsTo attribute.
+  Table Name:customers
+  Table Description:Captures data on customers, including customer number, name, contact details, address, assigned sales rep, and credit limit. Central to managing customer relationships and sales processes.
+  Table Name:payments
+  Table Description:Records payments made by customers, tracking the customer number, check number, payment date, and amount. Linked to the customers table for financial tracking and account management.
+  Table Name:orders
+  Table Description:Details each sales order placed by customers, including order number, dates, status, comments, and customer number. Linked to the customers table, tracking sales transactions.
+  Table Name:orderdetails
+  Table Description:Describes individual line items for each sales order, including order number, product code, quantity, price, and order line number. Links orders to products, detailing the items sold.
+  """
+)
+from typing import List, Dict
+import ast
+# Assuming Table is a Pydantic model or similar
+# NOTE(review): this rebinds the name `Table`, replacing the BaseModel
+# subclass declared earlier in this module with a plain annotated class.
+class Table:
+    name: str
+def get_tables(output: Dict) -> List[str]:
+    # Extract the 'text' field from the output, which contains the list as a string
+    text_output = output.get('text', '')
+    try:
+        # Safely evaluate the string representation of the list
+        tables_list = ast.literal_eval(text_output)
+        # Ensure that the result is indeed a list
+        if isinstance(tables_list, list):
+            # Extract the table names if 'tables_list' is a list of Table objects
+            # If it's already a list of strings, you can return it directly
+            return [table.name if isinstance(table, Table) else table for table in tables_list]
+    except (ValueError, SyntaxError):
+        # Handle the case where the text output is not a valid list representation
+        return []
+# Table-selection chain: takes the "question" entry from the input, runs it
+# through prompt2 with the LLM, and parses the reply with get_tables.
+table_chain = {"question": itemgetter("question")} | LLMChain(llm=llm, prompt=prompt2) | get_tables
+# Earlier extraction-chain variant, kept commented out for reference:
+# table_names = "\n".join(db.get_usable_table_names())
+# table_details = get_table_details()
+# table_details_prompt = f"""Return the names of ALL the SQL tables that MIGHT be relevant to the user question. \
+# The tables are:
+# {table_details}
+# Remember to include ALL POTENTIALLY RELEVANT tables, even if you're not sure that they're needed."""
+# table_chain = {"input": itemgetter("question")} | create_extraction_chain_pydantic(Table, llm, system_message=table_details_prompt) | get_tables