GenBIChatbotfree

Sleeping

GenBIChatbotfree / app.py

Ari

Update app.py

b21f6bf verified 6 months ago

7.23 kB

	import os
	import streamlit as st
	import pandas as pd
	import sqlite3
	from langchain import OpenAI, LLMChain, PromptTemplate
	import sqlparse
	import logging

	# Initialize conversation history
	if 'history' not in st.session_state:
	st.session_state.history = []

	# OpenAI API key (ensure it is securely stored)
	# You can set the API key in your environment variables or a .env file
	openai_api_key = os.getenv("OPENAI_API_KEY")

	# Check if the API key is set
	if not openai_api_key:
	st.error("OpenAI API key is not set. Please set the OPENAI_API_KEY environment variable.")
	st.stop()

	# Step 1: Upload CSV data file (or use default)
	st.title("Natural Language to SQL Query App with Data Insights")
	st.write("Upload a CSV file to get started, or use the default dataset.")

	csv_file = st.file_uploader("Upload your CSV file", type=["csv"])
	if csv_file is None:
	data = pd.read_csv("default_data.csv") # Ensure this file exists in your working directory
	st.write("Using default_data.csv file.")
	table_name = "default_table"
	else:
	data = pd.read_csv(csv_file)
	table_name = csv_file.name.split('.')[0]
	st.write(f"Data Preview ({csv_file.name}):")
	st.dataframe(data.head())

	# Step 2: Load CSV data into a persistent SQLite database
	db_file = 'my_database.db'
	conn = sqlite3.connect(db_file)
	data.to_sql(table_name, conn, index=False, if_exists='replace')

	# SQL table metadata (for validation and schema)
	valid_columns = list(data.columns)
	st.write(f"Valid columns: {valid_columns}")

	# Step 3: Set up the LLM Chains
	# SQL Generation Chain
	sql_template = """
	You are an expert data scientist. Given a natural language question, the name of the table, and a list of valid columns, generate a valid SQL query that answers the question.

	Ensure that:

	- You only use the columns provided.
	- When performing string comparisons in the WHERE clause, make them case-insensitive by using 'COLLATE NOCASE' or the LOWER() function.
	- Do not use 'COLLATE NOCASE' in ORDER BY clauses unless sorting a string column.
	- Do not apply 'COLLATE NOCASE' to numeric columns.

	Question: {question}

	Table name: {table_name}

	Valid columns: {columns}

	SQL Query:
	"""
	sql_prompt = PromptTemplate(template=sql_template, input_variables=['question', 'table_name', 'columns'])
	llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
	sql_generation_chain = LLMChain(llm=llm, prompt=sql_prompt)

	# AnswerScript for generating insights based on query results
	insights_template = """
	You are an expert data scientist. Based on the user's question and the SQL query result provided below, generate a concise and informative analysis that includes data insights and actionable recommendations.

	User's Question: {question}

	SQL Query Result:
	{result}

	Analysis and Recommendations:
	"""
	insights_prompt = PromptTemplate(template=insights_template, input_variables=['question', 'result'])
	insights_chain = LLMChain(llm=llm, prompt=insights_prompt)

	# Optional: Clean up function to remove incorrect COLLATE NOCASE usage
	def clean_sql_query(query):
	"""Removes incorrect usage of COLLATE NOCASE from the SQL query."""
	parsed = sqlparse.parse(query)
	statements = []
	for stmt in parsed:
	tokens = []
	idx = 0
	while idx < len(stmt.tokens):
	token = stmt.tokens[idx]
	if (token.ttype is sqlparse.tokens.Keyword and token.value.upper() == 'COLLATE'):
	# Check if the next token is 'NOCASE'
	next_token = stmt.tokens[idx + 2] if idx + 2 < len(stmt.tokens) else None
	if next_token and next_token.value.upper() == 'NOCASE':
	# Skip 'COLLATE' and 'NOCASE' tokens
	idx += 3 # Skip 'COLLATE', whitespace, 'NOCASE'
	continue
	tokens.append(token)
	idx += 1
	statements.append(''.join([str(t) for t in tokens]))
	return ' '.join(statements)

	# Define the callback function
	def process_input():
	user_prompt = st.session_state['user_input']

	if user_prompt:
	try:
	# Append user message to history
	st.session_state.history.append({"role": "user", "content": user_prompt})

	if "columns" in user_prompt.lower():
	assistant_response = f"The columns are: {', '.join(valid_columns)}"
	st.session_state.history.append({"role": "assistant", "content": assistant_response})
	else:
	columns = ', '.join(valid_columns)
	generated_sql = sql_generation_chain.run({
	'question': user_prompt,
	'table_name': table_name,
	'columns': columns
	})

	# Clean the SQL query
	generated_sql = clean_sql_query(generated_sql)

	# Attempt to execute SQL query and handle exceptions
	try:
	result = pd.read_sql_query(generated_sql, conn)

	if result.empty:
	assistant_response = "The query returned no results. Please try a different question."
	st.session_state.history.append({"role": "assistant", "content": assistant_response})
	else:
	# Convert the result to a string for the insights prompt
	result_str = result.head(10).to_string(index=False) # Limit to first 10 rows

	# Generate insights and recommendations
	insights = insights_chain.run({
	'question': user_prompt,
	'result': result_str
	})

	# Append the assistant's insights to the history
	st.session_state.history.append({"role": "assistant", "content": insights})
	# Append the result DataFrame to the history
	st.session_state.history.append({"role": "assistant", "content": result})
	except Exception as e:
	logging.error(f"An error occurred during SQL execution: {e}")
	assistant_response = f"Error executing SQL query: {e}"
	st.session_state.history.append({"role": "assistant", "content": assistant_response})

	except Exception as e:
	logging.error(f"An error occurred: {e}")
	assistant_response = f"Error: {e}"
	st.session_state.history.append({"role": "assistant", "content": assistant_response})

	# Reset the user_input in session state
	st.session_state['user_input'] = ''

	# Display the conversation history
	for message in st.session_state.history:
	if message['role'] == 'user':
	st.markdown(f"User: {message['content']}")
	elif message['role'] == 'assistant':
	if isinstance(message['content'], pd.DataFrame):
	st.markdown("Assistant: Query Results:")
	st.dataframe(message['content'])
	else:
	st.markdown(f"Assistant: {message['content']}")

	# Place the input field at the bottom with the callback
	st.text_input("Enter your message:", key='user_input', on_change=process_input)