Update app.py
app.py CHANGED
@@ -11,35 +11,51 @@ if 'history' not in st.session_state:
     st.session_state.history = []
 
 # OpenAI API key (ensure it is securely stored)
+# You can set the API key in your environment variables or a .env file
 openai_api_key = os.getenv("OPENAI_API_KEY")
 
+# Check if the API key is set
+if not openai_api_key:
+    st.error("OpenAI API key is not set. Please set the OPENAI_API_KEY environment variable.")
+    st.stop()
+
 # Step 1: Upload CSV data file (or use default)
+st.title("Natural Language to SQL Query App with Enhanced Insights")
+st.write("Upload a CSV file to get started, or use the default dataset.")
+
 csv_file = st.file_uploader("Upload your CSV file", type=["csv"])
 if csv_file is None:
-    data = pd.read_csv("default_data.csv") #
+    data = pd.read_csv("default_data.csv")  # Ensure this file exists in your working directory
     st.write("Using default_data.csv file.")
+    table_name = "default_table"
 else:
     data = pd.read_csv(csv_file)
+    table_name = csv_file.name.split('.')[0]
     st.write(f"Data Preview ({csv_file.name}):")
     st.dataframe(data.head())
 
 # Step 2: Load CSV data into a persistent SQLite database
 db_file = 'my_database.db'
 conn = sqlite3.connect(db_file)
-table_name = csv_file.name.split('.')[0] if csv_file else "default_table"
 data.to_sql(table_name, conn, index=False, if_exists='replace')
 
 # SQL table metadata (for validation and schema)
 valid_columns = list(data.columns)
 st.write(f"Valid columns: {valid_columns}")
 
-# Step 3: Set up the LLM
-
+# Step 3: Set up the LLM Chains
+# SQL Generation Chain
+sql_template = """
 You are an expert data scientist. Given a natural language question, the name of the table, and a list of valid columns, generate a valid SQL query that answers the question.
 
 Ensure that:
+
 - You only use the columns provided.
-
+- When performing string comparisons in the WHERE clause, make them case-insensitive by using 'COLLATE NOCASE' or the LOWER() function.
+- Do not use 'COLLATE NOCASE' in ORDER BY clauses unless sorting a string column.
+- Do not apply 'COLLATE NOCASE' to numeric columns.
+
+If the question is vague or open-ended and does not pertain to specific data retrieval, respond with "NO_SQL" to indicate that a SQL query should not be generated.
 
 Question: {question}
 
@@ -49,8 +65,93 @@ Valid columns: {columns}
 
 SQL Query:
 """
-
-
+sql_prompt = PromptTemplate(template=sql_template, input_variables=['question', 'table_name', 'columns'])
+llm = OpenAI(temperature=0, openai_api_key=openai_api_key, max_tokens=180)
+sql_generation_chain = LLMChain(llm=llm, prompt=sql_prompt)
+
+# Insights Generation Chain
+insights_template = """
+You are an expert data scientist. Based on the user's question and the SQL query result provided below, generate a concise analysis that includes key data insights and actionable recommendations. Limit the response to a maximum of 150 words.
+
+User's Question: {question}
+
+SQL Query Result:
+{result}
+
+Concise Analysis (max 150 words):
+"""
+insights_prompt = PromptTemplate(template=insights_template, input_variables=['question', 'result'])
+insights_chain = LLMChain(llm=llm, prompt=insights_prompt)
+
+# General Insights and Recommendations Chain
+general_insights_template = """
+You are an expert data scientist. Based on the entire dataset provided below, generate a concise analysis with key insights and recommendations. Limit the response to 150 words.
+
+Dataset Summary:
+{dataset_summary}
+
+Concise Analysis and Recommendations (max 150 words):
+"""
+general_insights_prompt = PromptTemplate(template=general_insights_template, input_variables=['dataset_summary'])
+general_insights_chain = LLMChain(llm=llm, prompt=general_insights_prompt)
+
+# Optional: Clean up function to remove incorrect COLLATE NOCASE usage
+def clean_sql_query(query):
+    """Removes incorrect usage of COLLATE NOCASE from the SQL query."""
+    parsed = sqlparse.parse(query)
+    statements = []
+    for stmt in parsed:
+        tokens = []
+        idx = 0
+        while idx < len(stmt.tokens):
+            token = stmt.tokens[idx]
+            if token.ttype is sqlparse.tokens.Keyword and token.value.upper() == 'COLLATE':
+                # Check if the next token is 'NOCASE'
+                next_token = stmt.tokens[idx + 2] if idx + 2 < len(stmt.tokens) else None
+                if next_token and next_token.value.upper() == 'NOCASE':
+                    # Skip 'COLLATE' and 'NOCASE' tokens
+                    idx += 3  # Skip 'COLLATE', whitespace, 'NOCASE'
+                    continue
+            tokens.append(token)
+            idx += 1
+        statements.append(''.join([str(t) for t in tokens]))
+    return ' '.join(statements)
+
+# Function to classify user query
+def classify_query(question):
+    """Classify the user query as either 'SQL' or 'INSIGHTS'."""
+    classification_template = """
+    You are an AI assistant that classifies user queries into two categories: 'SQL' for specific data retrieval queries and 'INSIGHTS' for general analytical or recommendation queries.
+
+    Determine the appropriate category for the following user question.
+
+    Question: "{question}"
+
+    Category (SQL/INSIGHTS):
+    """
+    classification_prompt = PromptTemplate(template=classification_template, input_variables=['question'])
+    classification_chain = LLMChain(llm=llm, prompt=classification_prompt)
+    category = classification_chain.run({'question': question}).strip().upper()
+    if category.startswith('SQL'):
+        return 'SQL'
+    else:
+        return 'INSIGHTS'
+
+# Function to generate dataset summary
+def generate_dataset_summary(data):
+    """Generate a summary of the dataset for general insights."""
+    summary_template = """
+    You are an expert data scientist. Based on the dataset provided below, generate a concise summary that includes the number of records, number of columns, data types, and any notable features.
+
+    Dataset:
+    {data}
+
+    Dataset Summary:
+    """
+    summary_prompt = PromptTemplate(template=summary_template, input_variables=['data'])
+    summary_chain = LLMChain(llm=llm, prompt=summary_prompt)
+    summary = summary_chain.run({'data': data.head().to_string(index=False)})
+    return summary
 
 # Define the callback function
 def process_input():
@@ -61,31 +162,77 @@ def process_input():
         # Append user message to history
         st.session_state.history.append({"role": "user", "content": user_prompt})
 
-
+        # Classify the user query
+        category = classify_query(user_prompt)
+        logging.info(f"User query classified as: {category}")
+
+        if "COLUMNS" in user_prompt.upper():
             assistant_response = f"The columns are: {', '.join(valid_columns)}"
             st.session_state.history.append({"role": "assistant", "content": assistant_response})
-
+        elif category == 'SQL':
             columns = ', '.join(valid_columns)
             generated_sql = sql_generation_chain.run({
                 'question': user_prompt,
                 'table_name': table_name,
                 'columns': columns
-            })
+            }).strip()
+
+            if generated_sql.upper() == "NO_SQL":
+                # Handle cases where no SQL should be generated
+                assistant_response = "Sure, let's discuss some general insights and recommendations based on the data."
+
+                # Generate dataset summary
+                dataset_summary = generate_dataset_summary(data)
+
+                # Generate general insights and recommendations
+                general_insights = general_insights_chain.run({
+                    'dataset_summary': dataset_summary
+                })
+
+                # Append the assistant's insights to the history
+                st.session_state.history.append({"role": "assistant", "content": general_insights})
+            else:
+                # Clean the SQL query
+                cleaned_sql = clean_sql_query(generated_sql)
+                logging.info(f"Generated SQL Query: {cleaned_sql}")
+
+                # Attempt to execute SQL query and handle exceptions
+                try:
+                    result = pd.read_sql_query(cleaned_sql, conn)
 
-
-
+                    if result.empty:
+                        assistant_response = "The query returned no results. Please try a different question."
+                        st.session_state.history.append({"role": "assistant", "content": assistant_response})
+                    else:
+                        # Convert the result to a string for the insights prompt
+                        result_str = result.head(10).to_string(index=False)  # Limit to first 10 rows
 
-
-
-
-
-
-
-
-
-
-
+                        # Generate insights and recommendations based on the query result
+                        insights = insights_chain.run({
+                            'question': user_prompt,
+                            'result': result_str
+                        })
+
+                        # Append the assistant's insights to the history
+                        st.session_state.history.append({"role": "assistant", "content": insights})
+                        # Append the result DataFrame to the history
+                        st.session_state.history.append({"role": "assistant", "content": result})
+                except Exception as e:
+                    logging.error(f"An error occurred during SQL execution: {e}")
+                    assistant_response = f"Error executing SQL query: {e}"
+                    st.session_state.history.append({"role": "assistant", "content": assistant_response})
+        else:  # INSIGHTS category
+            # Generate dataset summary
+            dataset_summary = generate_dataset_summary(data)
+
+            # Generate general insights and recommendations
+            general_insights = general_insights_chain.run({
+                'dataset_summary': dataset_summary
+            })
 
+            # Append the assistant's insights to the history
+            st.session_state.history.append({"role": "assistant", "content": general_insights})
+
     except Exception as e:
         logging.error(f"An error occurred: {e}")
         assistant_response = f"Error: {e}"
@@ -106,4 +253,4 @@ for message in st.session_state.history:
         st.markdown(f"**Assistant:** {message['content']}")
 
 # Place the input field at the bottom with the callback
-st.text_input("Enter your message:", key='user_input', on_change=process_input)
+st.text_input("Enter your message:", key='user_input', on_change=process_input)
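For reference, the hunks above call several names that are imported near the top of app.py, outside the changed region. The following is a minimal sketch of the import block the new code appears to assume; the LangChain paths match the pre-0.1 releases that exposed OpenAI, LLMChain, and PromptTemplate this way, and may differ in newer versions:

import os
import logging
import sqlite3

import pandas as pd
import sqlparse
import streamlit as st

from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate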
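The case-insensitivity rules added to sql_template correspond to two SQLite idioms that behave identically for ASCII text (stock SQLite folds case only for ASCII in both COLLATE NOCASE and LOWER()). A small self-contained sketch; the table, column, and values are illustrative, not taken from the app:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE demo (city TEXT, sales REAL)")
conn.executemany("INSERT INTO demo VALUES (?, ?)",
                 [("Paris", 10.0), ("PARIS", 5.0), ("Lyon", 2.0)])

# Both forms match 'Paris' and 'PARIS' alike, which is the behavior the
# prompt asks the LLM to produce for string comparisons in WHERE clauses.
rows_collate = conn.execute(
    "SELECT * FROM demo WHERE city = 'paris' COLLATE NOCASE").fetchall()
rows_lower = conn.execute(
    "SELECT * FROM demo WHERE LOWER(city) = LOWER('Paris')").fetchall()
assert rows_collate == rows_lower  # two matching rows each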
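The rewritten process_input depends on Streamlit's callback mechanics: an on_change handler runs before the script reruns, so it can read the submitted text from st.session_state under the widget's key and append it to a history list that survives reruns. A minimal sketch of just that pattern, separate from the app's chain logic (the box-clearing line is an optional extra the app itself does not do):

import streamlit as st

if "history" not in st.session_state:
    st.session_state.history = []

def process_input():
    # The text_input's current value is available under its key.
    user_prompt = st.session_state.user_input
    if user_prompt:
        st.session_state.history.append({"role": "user", "content": user_prompt})
    # Resetting a widget's state is allowed inside its own callback; doing it
    # after the widget is instantiated raises a StreamlitAPIException.
    st.session_state.user_input = ""

for message in st.session_state.history:
    st.markdown(f"**User:** {message['content']}")

st.text_input("Enter your message:", key="user_input", on_change=process_input)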