Spaces:
Ari
committed on
Update app.py
app.py
CHANGED
@@ -2,7 +2,7 @@ import os
 import streamlit as st
 import pandas as pd
 import sqlite3
-from
+from transformers import pipeline
 import sqlparse
 import logging

@@ -10,14 +10,8 @@ import logging
 if 'history' not in st.session_state:
     st.session_state.history = []

-#
-
-openai_api_key = os.getenv("OPENAI_API_KEY")
-
-# Check if the API key is set
-if not openai_api_key:
-    st.error("OpenAI API key is not set. Please set the OPENAI_API_KEY environment variable.")
-    st.stop()
+# Load a pre-trained GPT-2 model from Hugging Face
+llm = pipeline('text-generation', model='gpt2')

 # Step 1: Upload CSV data file (or use default)
 st.title("Natural Language to SQL Query App with Enhanced Insights")
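A note on the pipeline introduced above: transformers text-generation pipelines return a list of dicts, and by default each 'generated_text' field contains the prompt followed by the continuation; max_length also counts the prompt tokens. A minimal illustrative sketch of what that means for a caller (not part of this commit):

from transformers import pipeline

llm = pipeline('text-generation', model='gpt2')

prompt = "SQL Query:"
# max_new_tokens limits only the newly generated tokens, unlike max_length.
response = llm(prompt, max_new_tokens=40)
full_text = response[0]['generated_text']         # prompt + continuation
continuation = full_text[len(prompt):].strip()    # keep only the newly generated part
# Passing return_full_text=False instead makes the pipeline drop the prompt itself.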
@@ -43,57 +37,58 @@ data.to_sql(table_name, conn, index=False, if_exists='replace')
 valid_columns = list(data.columns)
 st.write(f"Valid columns: {valid_columns}")

-#
-    You are an expert data scientist. Given a natural language question, the name of the table, and a list of valid columns, generate a valid SQL query that answers the question.
-    User's Question: {question}
-
-    SQL Query Result:
-    {result}
-
-    Concise Analysis (max 200 words):
-    """
-insights_prompt = PromptTemplate(template=insights_template, input_variables=['question', 'result'])
-insights_chain = LLMChain(llm=llm, prompt=insights_prompt)
-
-# General Insights and Recommendations Chain
-general_insights_template = """
-You are an expert data scientist. Based on the entire dataset provided below, generate a concise analysis with key insights and recommendations. Limit the response to 150 words.
+# Function to generate SQL query using Hugging Face model
+def generate_sql_query(question, table_name, columns):
+    prompt = f"""
+    You are an expert data scientist. Given a natural language question, the name of the table, and a list of valid columns, generate a valid SQL query that answers the question.
+    Ensure that:
+    - You only use the columns provided.
+    - When performing string comparisons in the WHERE clause, make them case-insensitive by using 'COLLATE NOCASE' or the LOWER() function.
+    - Do not use 'COLLATE NOCASE' in ORDER BY clauses unless sorting a string column.
+    - Do not apply 'COLLATE NOCASE' to numeric columns.
+    If the question is vague or open-ended and does not pertain to specific data retrieval, respond with "NO_SQL" to indicate that a SQL query should not be generated.
+    Question: {question}
+    Table name: {table_name}
+    Valid columns: {columns}
+    SQL Query:
+    """
+    response = llm(prompt, max_length=180)
+    return response[0]['generated_text'].strip()
+
+# Function to generate insights using Hugging Face model
+def generate_insights(question, result):
+    prompt = f"""
+    You are an expert data scientist. Based on the user's question and the SQL query result provided below, generate a concise analysis that includes key data insights and actionable recommendations. Limit the response to a maximum of 150 words.
+    User's Question: {question}
+    SQL Query Result:
+    {result}
+    Concise Analysis (max 200 words):
+    """
+    response = llm(prompt, max_length=150)
+    return response[0]['generated_text'].strip()
+
+# Function to classify user query as SQL or Insights
+def classify_query(question):
+    prompt = f"""
+    You are an AI assistant that classifies user queries into two categories: 'SQL' for specific data retrieval queries and 'INSIGHTS' for general analytical or recommendation queries.
+    Determine the appropriate category for the following user question.
+    Question: "{question}"
+    Category (SQL/INSIGHTS):
+    """
+    response = llm(prompt, max_length=10)
+    category = response[0]['generated_text'].strip().upper()
+    return 'SQL' if 'SQL' in category else 'INSIGHTS'
+
+# Function to generate dataset summary
+def generate_dataset_summary(data):
+    summary_template = f"""
+    You are an expert data scientist. Based on the dataset provided below, generate a concise summary that includes the number of records, number of columns, data types, and any notable features.
+    Dataset:
+    {data.head().to_string(index=False)}
+    Dataset Summary:
+    """
+    response = llm(summary_template, max_length=150)
+    return response[0]['generated_text'].strip()

 # Optional: Clean up function to remove incorrect COLLATE NOCASE usage
 def clean_sql_query(query):
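For reference, a rough sketch of how the new helpers compose with the existing clean_sql_query and the SQLite connection created earlier in app.py; the question string is invented for illustration, while conn, table_name and valid_columns come from the surrounding script:

question = "What is the average value per category?"   # made-up example question
columns = ', '.join(valid_columns)

if classify_query(question) == 'SQL':
    raw_sql = generate_sql_query(question, table_name, columns)
    if raw_sql.upper() != "NO_SQL":
        sql = clean_sql_query(raw_sql)            # strip incorrect COLLATE NOCASE usage
        result = pd.read_sql_query(sql, conn)     # conn: the sqlite3 connection from above
        st.write(generate_insights(question, result.head(10).to_string(index=False)))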
@@ -117,42 +112,6 @@ def clean_sql_query(query):
         statements.append(''.join([str(t) for t in tokens]))
     return ' '.join(statements)

-# Function to classify user query
-def classify_query(question):
-    """Classify the user query as either 'SQL' or 'INSIGHTS'."""
-    classification_template = """
-    You are an AI assistant that classifies user queries into two categories: 'SQL' for specific data retrieval queries and 'INSIGHTS' for general analytical or recommendation queries.
-
-    Determine the appropriate category for the following user question.
-
-    Question: "{question}"
-
-    Category (SQL/INSIGHTS):
-    """
-    classification_prompt = PromptTemplate(template=classification_template, input_variables=['question'])
-    classification_chain = LLMChain(llm=llm, prompt=classification_prompt)
-    category = classification_chain.run({'question': question}).strip().upper()
-    if category.startswith('SQL'):
-        return 'SQL'
-    else:
-        return 'INSIGHTS'
-
-# Function to generate dataset summary
-def generate_dataset_summary(data):
-    """Generate a summary of the dataset for general insights."""
-    summary_template = """
-    You are an expert data scientist. Based on the dataset provided below, generate a concise summary that includes the number of records, number of columns, data types, and any notable features.
-
-    Dataset:
-    {data}
-
-    Dataset Summary:
-    """
-    summary_prompt = PromptTemplate(template=summary_template, input_variables=['data'])
-    summary_chain = LLMChain(llm=llm, prompt=summary_prompt)
-    summary = summary_chain.run({'data': data.head().to_string(index=False)})
-    return summary
-
 # Define the callback function
 def process_input():
     user_prompt = st.session_state['user_input']
@@ -171,11 +130,7 @@ def process_input():
             st.session_state.history.append({"role": "assistant", "content": assistant_response})
         elif category == 'SQL':
             columns = ', '.join(valid_columns)
-            generated_sql =
-                'question': user_prompt,
-                'table_name': table_name,
-                'columns': columns
-            }).strip()
+            generated_sql = generate_sql_query(user_prompt, table_name, columns)

             if generated_sql.upper() == "NO_SQL":
                 # Handle cases where no SQL should be generated
@@ -185,9 +140,7 @@ def process_input():
                 dataset_summary = generate_dataset_summary(data)

                 # Generate general insights and recommendations
-                general_insights =
-                    'dataset_summary': dataset_summary
-                })
+                general_insights = generate_insights(user_prompt, dataset_summary)

                 # Append the assistant's insights to the history
                 st.session_state.history.append({"role": "assistant", "content": general_insights})
@@ -208,10 +161,7 @@ def process_input():
                 result_str = result.head(10).to_string(index=False)  # Limit to first 10 rows

                 # Generate insights and recommendations based on the query result
-                insights =
-                    'question': user_prompt,
-                    'result': result_str
-                })
+                insights = generate_insights(user_prompt, result_str)

                 # Append the assistant's insights to the history
                 st.session_state.history.append({"role": "assistant", "content": insights})
@@ -226,9 +176,7 @@ def process_input():
                 dataset_summary = generate_dataset_summary(data)

                 # Generate general insights and recommendations
-                general_insights =
-                    'dataset_summary': dataset_summary
-                })
+                general_insights = generate_insights(user_prompt, dataset_summary)

                 # Append the assistant's insights to the history
                 st.session_state.history.append({"role": "assistant", "content": general_insights})
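For context: process_input is a Streamlit callback and is normally registered on the input widget via on_change. The widget call itself does not appear in this diff, so the line below is an assumption based on the 'user_input' key the callback reads:

# Assumed wiring (not shown in the diff): submitting the text input triggers process_input,
# which reads the question back from st.session_state['user_input'].
st.text_input("Ask a question about your data", key='user_input', on_change=process_input)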