Spaces:

Express-Analytics
/

QueryHelper

Runtime error

App Files Files Community

anumaurya114exp committed on Dec 1, 2023

Commit

37bd4dd

1 Parent(s): 0145029

Overwrite queryHelper with queryHelper2

Browse files

Files changed (7) hide show

app.py +284 -203
config.py +2 -0
configProd.py +19 -0
constants.py +33 -0
gptManager.py +58 -0
requirements.txt +3 -2
utils.py +111 -0

app.py CHANGED Viewed

@@ -1,222 +1,258 @@
-# !pip install gradio
-# !pip install openai
-# import openai
-import gradio
-import pandas as pd
-import psycopg2
 import pandas as pd
-import openai
-import sqlite3
 import psycopg2
 import time
 import gradio as gr
 import sqlparse
 import os
-#EA_key
-openai.api_key = os.getenv("api_key")
 pd.set_option('display.max_columns', None)
 pd.set_option('display.max_rows', None)
-#database credential
-db_name = os.getenv("db_name")
-user_db = os.getenv("user_db")
-pwd_db = os.getenv("pwd_db")
-host_db = os.getenv("host_db")
-port_db = os.getenv("port_db")
-conn = psycopg2.connect(database=db_name, user = user_db, password = pwd_db, host = host_db, port = port_db)
-# sql="select master_customer_id, c.gender,c.city_name,c.state_name, c.zip_code,product_name,department,class,category,d.date_value,s.city_name as store_city,s.state_name as store_state,s.zip_code as store_zip,s.store_name,s.opened_dt,s.closed_dt, f.transaction_amt,ch.type from oyster_demo.tbl_d_customer c,oyster_demo.tbl_d_product p,oyster_demo.tbl_f_sales f,oyster_demo.tbl_d_date d, oyster_demo.tbl_d_store s,oyster_demo.tbl_d_channel ch where p.product_id=f.product_id and c.customer_id=f.customer_id and d.date_id=f.date_id and s.store_id=f.store_id and ch.channel_id=f.channel_id"
-sql2="""select * from lpdatamart.tbl_d_customer limit 10000"""
-sql3="""select * from lpdatamart.tbl_d_product limit 1000"""
-sql4="""select * from lpdatamart.tbl_f_sales limit 10000"""
-# sql5="""select * from lpdatamart.tbl_d_time limit 10000"""
-sql6="""select * from lpdatamart.tbl_d_store limit 10000"""
-sql7="""select * from lpdatamart.tbl_d_channel limit 10000"""
-sql8="""select * from lpdatamart.tbl_d_lineaction_code limit 10000"""
-sql9 = """select * from lpdatamart.tbl_d_calendar limit 10000"""
-df_customer = pd.read_sql_query(sql2, con=conn)
-df_product = pd.read_sql_query(sql3, con=conn)
-df_sales = pd.read_sql_query(sql4, con=conn)
-# df_time = pd.read_sql_query(sql5, con=conn)
-df_store = pd.read_sql_query(sql6, con=conn)
-df_channel = pd.read_sql_query(sql7, con=conn)
-df_lineaction = pd.read_sql_query(sql8, con=conn)
-df_calendar = pd.read_sql_query(sql9, con=conn)
-conn.close()
-df_customer.head(2)
-customer_col=['customer_id','customer_type', 'first_name', 'middle_name', 'household_name', 'last_name', 'personal_email', 'city', 'state', 'zip_code', 'address1', 'country', 'gender', 'phone_number', 'reward_number']
-product_col=['product_id', 'product_name', 'product_price', 'department', 'class',  'discount', 'category', 'department_desc', 'department_type', 'product_type', 'manufacturer', 'color']
-sales_col = ['store_id', 'customer_id', 'channel_id', 'product_id', 'time_id', 'date_id','order_id', 'line_action', 'discount_amount', 'shipping_amount','transaction_date', 'transaction_amount', 'transaction_type', 'qty_sold']
-# time_col = ['time_id', 'hour', 'minute', 'second', 'am_pm']
-store_col = ['store_id', 'store_number', 'store_name', 'store_designation', 'store_longitude', 'store_latitude', 'store_manager_name', 'zip_code', 'state_code', 'city', 'street_number', 'street_name', 'store_region', 'store_type', 'address1','sublocationcode', 'channel', 'company_flag', 'kiosk_physical_store', 'sublocation_code']
-channel_col = ['channel_id', 'channel_name', 'channel_code']
-lineaction_col = ['line_action_code', 'line_action_code_desc', 'load_date', 'catgory', 'sales_type']
-calendar_col = ['date_id','calendar_date','calendar_month','day_of_week','calendar_week_number','calendar_month_number','calendar_quarter_number','day_of_month','day_of_quarter','day_of_the_year','us_holiday','lp_holiday','work_day','year','ad_week','ad_week_year','ad_month','lp_day','lp_week','lp_month','lp_year','lp_quarter','event_day']
-df_customer=df_customer[customer_col]
-df_product=df_product[product_col]
-df_sales=df_sales[sales_col]
-# df_time = df_time[time_col]
-df_store = df_store[store_col]
-df_channel = df_channel[channel_col]
-df_lineaction = df_lineaction[lineaction_col]
-df_calendar = df_calendar[calendar_col]
-# df = pd.read_csv('/content/drive/MyDrive/tbl_m_querygen.csv')
-import sqlite3
-import openai
-# Connect to SQLite database
-conn1 = sqlite3.connect('chatgpt.db')
-cursor1 = conn1.cursor()
-# Connect to SQLite database
-conn2 = sqlite3.connect('chatgpt.db')
-cursor2 = conn2.cursor()
-# Connect to SQLite database
-conn3 = sqlite3.connect('chatgpt.db')
-cursor3 = conn3.cursor()
-# Connect to SQLite database
-conn4 = sqlite3.connect('chatgpt.db')
-cursor4 = conn4.cursor()
-# Connect to SQLite database
-conn5 = sqlite3.connect('chatgpt.db')
-cursor5 = conn5.cursor()
-# Connect to SQLite database
-conn5 = sqlite3.connect('chatgpt.db')
-cursor5 = conn5.cursor()
-# Connect to SQLite database
-conn6 = sqlite3.connect('chatgpt.db')
-cursor6 = conn6.cursor()
-# Connect to SQLite database
-conn7 = sqlite3.connect('chatgpt.db')
-cursor7 = conn7.cursor()
-# Connect to SQLite database
-conn8 = sqlite3.connect('chatgpt.db')
-cursor8 = conn8.cursor()
-# openai.api_key = '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'
-# Insert DataFrame into SQLite database
-df_customer.to_sql('tbl_d_customer', conn1, if_exists='replace', index=False)
-df_product.to_sql('tbl_d_product', conn2, if_exists='replace', index=False)
-df_sales.to_sql('tbl_f_sales', conn3, if_exists='replace', index=False)
-# df_time.to_sql('tbl_d_time', conn4, if_exists='replace', index=False)
-df_store.to_sql('tbl_d_store', conn5, if_exists='replace', index=False)
-df_channel.to_sql('tbl_d_channel', conn6, if_exists='replace', index=False)
-df_lineaction.to_sql('tbl_d_lineaction_code', conn7, if_exists='replace', index=False)
-df_calendar.to_sql('tbl_d_calendar', conn8, if_exists ='replace',index=False)
-# Function to get table columns from SQLite database
-def get_table_columns(table_name1, table_name2):
-    cursor1.execute("PRAGMA table_info({})".format(table_name1))
-    columns1 = cursor1.fetchall()
-    # print(columns)
-    cursor2.execute("PRAGMA table_info({})".format(table_name2))
-    columns2 = cursor2.fetchall()
-    return [column[1] for column in columns1], [column[1] for column in columns2]
-table_name1 = 'tbl_d_customer'
-table_name2 = 'tbl_d_product'
-table_name3 = 'tbl_f_sales'
-# table_name4 = 'tbl_d_time'
-table_name5 = 'tbl_d_store'
-table_name6 = 'tbl_d_channel'
-table_name7 = 'tbl_d_lineaction_code'
-table_name8 = 'tbl_d_calendar'
-columns1,columns2  = get_table_columns(table_name1,table_name2)
-# Function to generate SQL query from input text using ChatGPT
-def generate_sql_query(text):
-    # prompt = """You are a ChatGPT language model that can generate SQL queries. Please provide a natural language input text, and I will generate the corresponding SQL query and Answer the provided question if possible for you.The table name is {} and the following data:\n {} and corresponding columns are {}.\nInput: {}\nSQL Query:""".format(table_name,read_csv, columns,text)
-    messages.append({"role": "user", "content": text})
-    # print(prompt)
-    request = openai.ChatCompletion.create(
-        model="gpt-4",
-        messages=messages
-    )
-    print(request)
-    sql_query = request['choices'][0]['message']['content']
-    return sql_query
-text = "for female customer who did a transaction of more than 100 dollars in year 2020 please write sql query ?"
-schema_name = 'lpdatamart'
-prompt = """Given an input text, and You will generate the corresponding SQL query. The schema name is {}. The first table name is {} and the following data:\n {}. The second table name is {} and the following data for second table:\n {}. The third table name is {} and the following data for third table:\n {}. The fourth table name is {} and the following data for fourth table:\n {}. The fifth table name is {} and the following data for fifth table:\n {}. The sixth table name is {} and the following data for sixth table:\n {}. The seventh table name is {} and the following data for seventh table:\n {} \n""".format(schema_name,table_name1,df_customer.loc[:5], table_name2, df_product.loc[:5], table_name3, df_sales.loc[:5], table_name5, df_store.loc[:5],  table_name6, df_channel.loc[:5],table_name7, df_lineaction.loc[:5], table_name8, df_calendar.loc[:5])
-messages = [{"role": "system", "content": prompt}]
-sql_query=generate_sql_query(text)
-print("Generated SQL query: ",sql_query)
-# prompt = """Given an input text, and You will generate the corresponding SQL query. The first table name is {} and the following data:\n {}. The second table name is {} and the following data for second table:\n {}. The third table name is {} and the following data for third table:\n {}.\n""".format(table_name1,df2.loc[:5], table_name2, df3.loc[:5], table_name3, df4.loc[:5])
-prompt = """Given an input text, and You will generate the corresponding SQL query. The schema name is {}. The first table name is {} and the following data:\n {}. The second table name is {} and the following data for second table:\n {}. The third table name is {} and the following data for third table:\n {}. The fourth table name is {} and the following data for fourth table:\n {}. The fifth table name is {} and the following data for fifth table:\n {}. The sixth table name is {} and the following data for sixth table:\n {}. The seventh table name is {} and the following data for seventh table:\n {} \n""".format(schema_name,table_name1,df_customer.loc[:5], table_name2, df_product.loc[:5], table_name3, df_sales.loc[:5], table_name5, df_store.loc[:5],  table_name6, df_channel.loc[:5],table_name7, df_lineaction.loc[:5], table_name8, df_calendar.loc[:5])
-messages = [{"role": "system", "content": prompt}]
-import time
-import gradio as gr
-def CustomChatGPT(user_inp):
-    messages.append({"role": "user", "content": user_inp})
-    response = openai.ChatCompletion.create(
-        model = "gpt-4",
-        messages = messages
-    )
-    ChatGPT_reply = response["choices"][0]["message"]["content"]
-    messages.append({"role": "assistant", "content": ChatGPT_reply})
-    return ChatGPT_reply
-def respond(message, chat_history):
-    bot_message = CustomChatGPT(message)
-    chat_history.append((message, bot_message))
-    time.sleep(2)
-    return "", chat_history
-# to test the generated sql query
-def test_Sql(sql):
   sql=sql.replace(';', '')
-  sql = sql + ' ' + 'limit 5'
   sql = str(sql)
   sql = sqlparse.format(sql, reindent=True, keyword_case='upper')
-  conn = psycopg2.connect(database=db_name, user = user_db, password = pwd_db, host = host_db, port = port_db)
-  df = pd.read_sql_query(sql, con=conn)
-  conn.close()
-  return pd.DataFrame(df)
-admin = os.getenv("admin")
-paswd = os.getenv("paswd")
-def same_auth(username, password):
-    if username == admin and password == paswd:
-        return 1
 with gr.Blocks() as demo:
     with gr.Tab("Query Helper"):
         gr.Markdown("""<h1><center> Query Helper</center></h1>""")
         chatbot = gr.Chatbot()
@@ -224,12 +260,57 @@ with gr.Blocks() as demo:
         clear = gr.ClearButton([msg, chatbot])
         msg.submit(respond, [msg, chatbot], [msg, chatbot])
     with gr.Tab("Run Query"):
-        # gr.Markdown("""<h1><center> Run Query </center></h1>""")
         text_input = gr.Textbox(label = 'Input SQL Query', placeholder="Write your SQL query here ...")
         text_output = gr.Textbox(label = 'Result')
         text_button = gr.Button("RUN QUERY")
         clear = gr.ClearButton([text_input, text_output])
-        text_button.click(test_Sql, inputs=text_input, outputs=text_output)
-demo.launch(share=True, auth=same_auth)

+from openai import OpenAI
 import pandas as pd
 import psycopg2
 import time
 import gradio as gr
 import sqlparse
+import re
 import os
+import warnings
+from config import *
+from constants import *
+from utils import *
+from gptManager import ChatgptManager
+# from queryHelper import QueryHelper
 pd.set_option('display.max_columns', None)
 pd.set_option('display.max_rows', None)
+# Filter out all warning messages
+warnings.filterwarnings("ignore")
+# Wrap the credential dict so its keys can be read as attributes (dbCreds.database, ...).
+dbCreds = DataWrapper(DB_CREDS_DATA)
+dbEngine = DbEngine(dbCreds)
+# Open the database connection once at import time; it is shared by the code below.
+dbEngine.connect()
+# Map of every table in SCHEMA_NAME to its column names.
+tablesAndCols = getAllTablesInfo(dbEngine, SCHEMA_NAME)
+metadataLayout = MetaDataLayout(schemaName=SCHEMA_NAME, allTablesAndCols=tablesAndCols)
+# Start with the default table/column selection from constants.
+metadataLayout.setSelection(DEFAULT_TABLES_COLS)
+selectedTablesAndCols = metadataLayout.getSelectedTablesAndCols()
+def getSampleDataForTablesAndCols(dbEngine, schemaName, tablesAndCols, maxRows):
+  """Read up to ``maxRows`` rows from each table named in ``tablesAndCols``.
+  Returns a dict mapping table name to a DataFrame. Tables that cannot be
+  read are reported on stdout and left out of the result.
+  """
+  connection = dbEngine.connection
+  samples = {}
+  for tableName in tablesAndCols:
+    try:
+      query = f"""select * from {schemaName}.{tableName} limit {maxRows}"""
+      frame = pd.read_sql_query(query, con=connection)
+    except Exception as err:
+      print(err)
+      print(f"couldn't read table data. Table: {tableName}")
+    else:
+      samples[tableName] = frame
+  return samples
+class QueryHelper:
+  """Builds GPT prompts from table metadata and sample rows to produce SQL.
+  The helper keeps a cache of sample data (``self.sampleData``) for the
+  currently selected tables and refreshes it whenever the metadata changes.
+  """
+  def __init__(self, gptInstance, dbEngine, schemaName,
+               platform, metadataLayout, sampleDataRows,
+               gptSampleRows, getSampleDataForTablesAndCols):
+    """Store the collaborators and load the initial sample data.
+    gptInstance: object exposing ``getResponseForUserInput(userInput, systemPrompt)``.
+    dbEngine: passed through to the sample-data loader.
+    schemaName / platform: inserted verbatim into the prompts.
+    metadataLayout: object exposing ``getSelectedTablesAndCols()``.
+    sampleDataRows: row limit handed to the loader as ``maxRows``.
+    gptSampleRows: number of rows per table shown to GPT (``.head(n)``).
+    getSampleDataForTablesAndCols: loader callable returning {table: data}.
+    """
+    self.gptInstance = gptInstance
+    self.schemaName = schemaName
+    self.platform = platform
+    self.metadataLayout = metadataLayout
+    self.sampleDataRows = sampleDataRows
+    self.gptSampleRows = gptSampleRows
+    self.getSampleDataForTablesAndCols = getSampleDataForTablesAndCols
+    self.dbEngine = dbEngine
+    self._onMetadataChange()
+  def _onMetadataChange(self):
+    """Reload ``self.sampleData`` for the currently selected tables."""
+    selectedTablesAndCols = self.metadataLayout.getSelectedTablesAndCols()
+    self.sampleData = self.getSampleDataForTablesAndCols(dbEngine=self.dbEngine, schemaName=self.schemaName,
+                                                         tablesAndCols=selectedTablesAndCols, maxRows=self.sampleDataRows)
+  def getMetadata(self):
+    """Return the metadata layout currently in use."""
+    return self.metadataLayout
+  def updateMetadata(self, metadataLayout):
+    """Replace the metadata layout and refresh the cached sample data."""
+    self.metadataLayout = metadataLayout
+    self._onMetadataChange()
+  def modifySqlQueryEnteredByUser(self, userSqlQuery):
+    """Ask GPT to correct ``userSqlQuery`` for the configured platform and return its reply."""
+    platform = self.platform
+    userPrompt = f"Please correct the following sql query, also it has to be run on {platform}. sql query is \n {userSqlQuery}."
+    systemPrompt = ""
+    return self.gptInstance.getResponseForUserInput(userPrompt, systemPrompt)
+  def filteredSampleDataForProspects(self, prospectTablesAndCols):
+    """Return cached sample data for the prospect tables (all columns are kept).
+    Tables without cached sample data are skipped instead of raising KeyError;
+    the injected loader may leave out tables it failed to read.
+    """
+    sampleData = self.sampleData
+    return {table: sampleData[table] for table in prospectTablesAndCols if table in sampleData}
+  def getQueryForUserInput(self, userInput):
+    """Generate SQL for ``userInput`` in two GPT calls: pick tables, then write the query."""
+    selectedTablesAndCols = self.metadataLayout.getSelectedTablesAndCols()
+    prospectTablesAndCols = self.getProspectiveTablesAndCols(userInput, selectedTablesAndCols)
+    print("getting prospects", prospectTablesAndCols)
+    prospectTablesData = self.filteredSampleDataForProspects(prospectTablesAndCols)
+    systemPromptForQueryGeneration = self.getSystemPromptForQueryGeneration(prospectTablesData, gptSampleRows=self.gptSampleRows)
+    return self.gptInstance.getResponseForUserInput(userInput, systemPromptForQueryGeneration)
+  def getProspectiveTablesAndCols(self, userInput, selectedTablesAndCols):
+    """Ask GPT which selected tables/columns are relevant to ``userInput``.
+    The reply is not parsed as JSON: a table or column counts as relevant
+    when its name appears anywhere in the reply text (substring match).
+    Returns {table: [columns]} restricted to ``selectedTablesAndCols``.
+    """
+    systemPromptForProspectColumns = self.getSystemPromptForProspectColumns(selectedTablesAndCols)
+    prospectiveTablesColsText = self.gptInstance.getResponseForUserInput(userInput, systemPromptForProspectColumns)
+    prospectTablesAndCols = {}
+    for table, columns in selectedTablesAndCols.items():
+      if table in prospectiveTablesColsText:
+        prospectTablesAndCols[table] = [column for column in columns if column in prospectiveTablesColsText]
+    return prospectTablesAndCols
+  @staticmethod
+  def _flattenPrompt(prompt):
+    """Collapse newlines/backslashes to spaces and turn the XXXX end marker into four spaces."""
+    return prompt.replace("\n"," ").replace("\\"," ").replace("  "," ").replace("XXXX", "    ")
+  def getSystemPromptForQueryGeneration(self, prospectTablesData, gptSampleRows):
+    """Build the system prompt containing the first ``gptSampleRows`` rows of each prospect table."""
+    schemaName = self.schemaName
+    platform = self.platform
+    prompt = f"""Given an input text, generate the corresponding SQL query for given details. Schema Name is {schemaName}. And sql platform is {platform}.\n following is sample data"""
+    for tableName in prospectTablesData.keys():
+        prompt += f"table name is {tableName}, table data is {prospectTablesData[tableName].head(gptSampleRows)}"
+    prompt += "XXXX"
+    return self._flattenPrompt(prompt)
+  def getSystemPromptForProspectColumns(self, selectedTablesAndCols):
+    """Build the system prompt listing every selected table with its column names."""
+    schemaName = self.schemaName
+    platform = self.platform
+    prompt = f"""Given an input text, User wants to know which all tables and columns would be possibily to have the desired data. Output them as json. Schema Name is {schemaName}. And sql platform is {platform}.\n"""
+    for tableName in selectedTablesAndCols.keys():
+        prompt += f"table name {tableName} {', '.join(selectedTablesAndCols[tableName])}"
+    prompt += "XXXX"
+    return self._flattenPrompt(prompt)
+# Build the OpenAI client and the GPT wrapper used for every prompt.
+openAIClient = OpenAI(api_key=OPENAI_API_KEY)
+gptInstance = ChatgptManager(openAIClient, model=GPT_MODEL)
+# Module-level helper shared by all gradio callbacks below; constructing it
+# also loads sample data for the selected tables (see QueryHelper.__init__).
+queryHelper = QueryHelper(gptInstance=gptInstance,
+                          schemaName=SCHEMA_NAME,platform=PLATFORM,
+                          metadataLayout=metadataLayout,
+                          sampleDataRows=SAMPLE_ROW_MAX,
+                          gptSampleRows=GPT_SAMPLE_ROWS,
+                         dbEngine=dbEngine,
+                         getSampleDataForTablesAndCols=getSampleDataForTablesAndCols)
+def checkAuth(username, password):
+  """Return True only when both credentials equal the configured ADMIN / PASSWD."""
+  credentialsMatch = (username == ADMIN) and (password == PASSWD)
+  return True if credentialsMatch else False
+# Function to save history of chat
+def respond(message, chatHistory):
+  """Gradio chat handler: append (message, generated query) to the history.
+  Returns an empty string (clears the input box) and the updated history.
+  """
+  generatedQuery = queryHelper.getQueryForUserInput(message)
+  chatHistory.append((message, generatedQuery))
+  time.sleep(2)
+  return "", chatHistory
+# Function to test the generated sql query
+def isDataQuery(sql_query):
+    """Return True when ``sql_query`` contains no data- or schema-modifying keyword.
+    The check is a case-insensitive whole-word search, so identifiers such as
+    ``update_date`` do not trigger it, but a keyword inside a string literal does.
+    Besides DML (INSERT/UPDATE/DELETE/MERGE) it also rejects DDL and privilege
+    statements, which modify the database just as much.
+    """
+    blocked_keywords = ('INSERT', 'UPDATE', 'DELETE', 'MERGE',
+                        'DROP', 'TRUNCATE', 'ALTER', 'CREATE', 'GRANT', 'REVOKE')
+    pattern = r'\b(?:' + '|'.join(blocked_keywords) + r')\b'
+    # No blocked keyword found means it is likely a plain data query.
+    return re.search(pattern, sql_query.upper()) is None
+def testSQL(sql):
+  """Run a user-supplied read-only query and return its result.
+  Semicolons are stripped, ``limit 5`` is appended unless "limit" appears in
+  the last 15 characters, and the query is refused when isDataQuery rejects it.
+  Returns a DataFrame on success, otherwise a message string (a refusal, or
+  a GPT-suggested correction when the query raised an error).
+  """
+  sql = sql.replace(';', '')
+  if 'limit' not in sql[-15:].lower():
+    sql = sql + ' ' + 'limit 5'
+  sql = sqlparse.format(sql, reindent=True, keyword_case='upper')
+  print(sql)
+  if not isDataQuery(sql):
+    return "Sorry not allowed to run. As the query modifies the data."
+  try:
+    # connect() only reconnects when the connection is missing or closed.
+    dbEngine.connect()
+    conn = dbEngine.connection
+    df = pd.read_sql_query(sql, con=conn)
+    return pd.DataFrame(df)
+  except Exception as e:
+    # A failed statement leaves the psycopg2 transaction aborted; without a
+    # rollback every later query on this shared connection would fail too.
+    conn = dbEngine.connection
+    if conn is not None and conn.closed == 0:
+      conn.rollback()
+    print(f"Error occured during running the query {sql}.\n and the error is {str(e)}")
+    prompt = f"Please correct the following sql query, also it has to be run on {PLATFORM}. sql query is \n {sql}. the error occured is {str(e)}."
+    modifiedSql = queryHelper.modifySqlQueryEnteredByUser(prompt)
+    return f"The query you entered throws some error. Here is modified version. Please try this.\n {modifiedSql}"
+def onSelectedTablesChange(tablesSelected):
+  """Return one component per schema table, visible only for the selected tables."""
+  print(f"Selected tables : {tablesSelected}")
+  layout = queryHelper.getMetadata()
+  return [
+    gr.Textbox(f"Textbox {name}", visible=(name in tablesSelected), label=f"{name}")
+    for name in list(layout.getAllTablesCols())
+  ]
+def onSelectedColumnsChange(*tableBoxes):
+  """Apply the column selection from the UI and reload the helper's sample data.
+  ``tableBoxes`` holds one value per schema table, in the order of
+  ``getAllTablesCols()``; only non-empty lists count as a selection.
+  Returns a status string for the result textbox.
+  """
+  metadataLayout = queryHelper.getMetadata()
+  allTablesList = list(metadataLayout.getAllTablesCols().keys())
+  print("Getting selected tables and columns from gradio")
+  tablesAndCols = {
+    table: tableBox
+    for tableBox, table in zip(tableBoxes, allTablesList)
+    if isinstance(tableBox, list) and len(tableBox) != 0
+  }
+  # setSelection only adds or overwrites entries, so clear the old selection
+  # first; otherwise deselected tables would stay in GPT's context.
+  metadataLayout.resetSelection()
+  metadataLayout.setSelection(tablesAndCols=tablesAndCols)
+  print("metadata updated")
+  print("Updating queryHelper state, and sample data")
+  queryHelper.updateMetadata(metadataLayout)
+  return "Columns udpated"
+def onResetToDefaultSelection():
+  """Restore the default table/column selection and reload sample data.
+  Returns one component per schema table, visible only for default tables.
+  """
+  metadataLayout = queryHelper.getMetadata()
+  # The constant is DEFAULT_TABLES_COLS; the previous name DefaultTablesAndCols
+  # is not defined anywhere and raised NameError.
+  tablesSelected = list(DEFAULT_TABLES_COLS.keys())
+  tableBoxes = []
+  for table in metadataLayout.getAllTablesCols().keys():
+    tableBoxes.append(gr.Textbox(f"Textbox {table}", visible=(table in tablesSelected), label=f"{table}"))
+  metadataLayout.resetSelection()
+  metadataLayout.setSelection(DEFAULT_TABLES_COLS)
+  queryHelper.updateMetadata(metadataLayout)
+  return tableBoxes
 with gr.Blocks() as demo:
+    # screen 1 : Chatbot for question answering to generate sql query from user input in english
     with gr.Tab("Query Helper"):
         gr.Markdown("""<h1><center> Query Helper</center></h1>""")
         chatbot = gr.Chatbot()
         clear = gr.ClearButton([msg, chatbot])
         msg.submit(respond, [msg, chatbot], [msg, chatbot])
+    # screen 2 : To run sql query against database
     with gr.Tab("Run Query"):
+        gr.Markdown("""<h1><center> Run Query </center></h1>""")
         text_input = gr.Textbox(label = 'Input SQL Query', placeholder="Write your SQL query here ...")
         text_output = gr.Textbox(label = 'Result')
         text_button = gr.Button("RUN QUERY")
         clear = gr.ClearButton([text_input, text_output])
+        text_button.click(testSQL, inputs=text_input, outputs=text_output)
+     # screen 3 : To set creds, schema, tables and columns
+    with gr.Tab("Setup"):
+        # Heading now names this tab; it was copy-pasted as "Run Query".
+        gr.Markdown("""<h1><center> Setup </center></h1>""")
+        text_input = gr.Textbox(label = 'schema name', value= SCHEMA_NAME)
+        allTablesAndCols = queryHelper.getMetadata().getAllTablesCols()
+        selectedTablesAndCols = queryHelper.getMetadata().getSelectedTablesAndCols()
+        allTablesList = list(allTablesAndCols.keys())
+        selectedTablesList = list(selectedTablesAndCols.keys())
+        dropDown = gr.Dropdown(
+             allTablesList, value=selectedTablesList, multiselect=True, label="Selected Tables", info="Select Tables from available tables of the schema"
+        )
+        refreshTables = gr.Button("Refresh selected tables")
+        # One column picker per table, kept in allTablesList order because
+        # onSelectedColumnsChange pairs the values with tables by position.
+        tableBoxes = []
+        for tableName in allTablesList:
+            columnsDropDown = gr.Dropdown(
+                allTablesAndCols[tableName],
+                # Only selected tables start visible; unselected ones have no stored columns.
+                visible=(tableName in selectedTablesList),
+                value=selectedTablesAndCols.get(tableName, None),
+                multiselect=True, label=tableName, info="Select columns of a table"
+            )
+            tableBoxes.append(columnsDropDown)
+        refreshTables.click(onSelectedTablesChange, inputs=dropDown, outputs=tableBoxes)
+        columnsTextBox = gr.Textbox(label = 'Result')
+        refreshColumns = gr.Button("Refresh selected columns and Reload Data")
+        refreshColumns.click(onSelectedColumnsChange, inputs=tableBoxes, outputs=columnsTextBox)
+        resetToDefaultSelection = gr.Button("Reset to Default")
+        resetToDefaultSelection.click(onResetToDefaultSelection, inputs=None, outputs=tableBoxes)
+demo.launch(share=True, debug=True, ssl_verify=False, auth=checkAuth)
+dbEngine.connect()

config.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # from configLocal import *
2	+ from configProd import *

configProd.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import os
+# All secrets are read from environment variables; os.getenv returns None when one is unset.
+OPENAI_API_KEY = os.getenv("api_key")
+# Database credentials
+dbName = os.getenv("db_name")
+userDB = os.getenv("user_db")
+pwdDB = os.getenv("pwd_db")
+host = os.getenv("host_db")
+port = os.getenv("port_db")
+# Chat model name handed to the GPT manager.
+GPT_MODEL = "gpt-4"
+# GPT_MODEL = "gpt-3.5-turbo-1106"
+# Gradio login credentials
+ADMIN = os.getenv("admin")
+PASSWD = os.getenv("paswd")
+# Keys match the keyword arguments used for psycopg2.connect in utils.DbEngine.connect.
+DB_CREDS_DATA = ({"database":dbName, "user":userDB, "password":pwdDB, "host":host, "port":port})

constants.py ADDED Viewed

	@@ -0,0 +1,33 @@

+# Only the names listed here are exported by `from constants import *`.
+__all__ = ["SCHEMA_NAME", "GPT_SAMPLE_ROWS", "PLATFORM", "SAMPLE_ROW_MAX", "DEFAULT_TABLES_COLS", "QUERY_TIMEOUT"]
+# Constants
+# Database schema whose tables are inspected and queried.
+SCHEMA_NAME = "lpdatamart"
+# Number of sample rows per table included in the GPT prompt.
+GPT_SAMPLE_ROWS = 5
+# SQL platform name inserted into the GPT prompts.
+PLATFORM = "Amazon Redshift"
+# Maximum number of sample rows fetched per table.
+SAMPLE_ROW_MAX = 50
+# NOTE(review): exported but not referenced in the code shown here - confirm it is enforced somewhere.
+QUERY_TIMEOUT = 20 #timeout in seconds
+# Desired columns for each table
+customer_col=['customer_id','customer_type', 'first_name', 'middle_name', 'household_name', 'last_name', 'personal_email', 'city', 'state', 'zip_code', 'address1', 'country', 'gender', 'phone_number', 'reward_number']
+product_col=['product_id', 'product_name', 'product_price', 'department', 'class',  'discount', 'category', 'department_desc', 'department_type', 'product_type', 'manufacturer', 'color']
+sales_col = ['store_id', 'customer_id', 'channel_id', 'product_id', 'time_id', 'date_id','order_id', 'line_action', 'discount_amount', 'shipping_amount','transaction_date', 'transaction_amount', 'transaction_type', 'qty_sold']
+store_col = ['store_id', 'store_number', 'store_name', 'store_designation', 'store_longitude', 'store_latitude', 'store_manager_name', 'zip_code', 'state_code', 'city', 'street_number', 'street_name', 'store_region', 'store_type', 'address1','sublocationcode', 'channel', 'company_flag', 'kiosk_physical_store', 'sublocation_code']
+channel_col = ['channel_id', 'channel_name', 'channel_code']
+lineaction_col = ['line_action_code', 'line_action_code_desc', 'load_date', 'catgory', 'sales_type']
+calendar_col = ['date_id','calendar_date','calendar_month','day_of_week','calendar_week_number','calendar_month_number','calendar_quarter_number','day_of_month','day_of_quarter','day_of_the_year','us_holiday','lp_holiday','work_day','year','ad_week','ad_week_year','ad_month','lp_day','lp_week','lp_month','lp_year','lp_quarter','event_day']
+browse_col = ['cookie_id', 'session_id', 'customer_id', 'email_key', 'reward_number', 'date_id', 'time_id', 'category_id', 'browse_action_id', 'product_id', 'style_id', 'order_id']
+time_col = ['time_id', 'time_of_day']
+browse_action_col = ["browse_action_id", "browse_action"]
+browse_category_col = ['category_id', 'category_code', 'category']
+style_col = ["sku", "style", "source_file", "load_date"]
+email_col = ['event_id', 'customer_id', 'time_id', 'date_id', 'email_key']
+event_col = ['event_id', 'event_type', 'event_description', 'event_detail', 'start_date', 'end_date', 'event_code', 'event_category']
+# Default selection: maps each table name to the column list defined above.
+DEFAULT_TABLES_COLS = {"tbl_d_customer":customer_col, "tbl_d_product":product_col, "tbl_f_sales":sales_col,
+          "tbl_d_store":store_col, "tbl_d_channel":channel_col, "tbl_d_lineaction_code":lineaction_col,
+                  "tbl_d_calendar":calendar_col, 'tbl_f_browse':browse_col, 'tbl_d_time': time_col, 'tbl_d_browse_action': browse_action_col,
+                  'tbl_d_browse_category':browse_category_col, 'tbl_d_style':style_col, 'tbl_f_emailing': email_col, 'tbl_d_event':event_col}

gptManager.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import re
+class ChatgptManager:
+  """Small wrapper around an OpenAI-style chat-completions client.
+  Every request starts a fresh conversation: one system message, one user
+  message, and the assistant reply appended afterwards to ``self.messages``.
+  """
+  def __init__(self, openAIClient, model="gpt-3.5-turbo-1106", tokenLimit=8000):
+    """openAIClient must expose ``chat.completions.create``; tokenLimit caps the word count of a request."""
+    self.client = openAIClient
+    self.tokenLimit = tokenLimit
+    self.model = model
+    # Initialised here so the token helpers also work before the first request.
+    self.messages = []
+  def getResponseForUserInput(self, userInput, systemPrompt):
+    """Send ``systemPrompt`` + ``userInput`` to the model and return the reply text.
+    Raises ValueError when either message would push the conversation past
+    ``tokenLimit``.
+    """
+    self.messages = []
+    newMessage = {"role":"system", "content":systemPrompt}
+    if self.isTokeLimitExceeding(newMessage):
+      raise ValueError("System Prompt Too long.")
+    self.messages.append(newMessage)
+    userMessage = {"role":"user", "content":userInput}
+    if self.isTokeLimitExceeding(userMessage):
+      raise ValueError("Token Limit exceeding. With user input")
+    self.messages.append(userMessage)
+    # temperature=0 keeps the generated SQL as repeatable as the API allows.
+    completion = self.client.chat.completions.create(
+      model=self.model,
+      messages=self.messages,
+      temperature=0,
+    )
+    gptResponse = completion.choices[0].message.content
+    self.messages.append({"role": "assistant", "content": gptResponse})
+    return gptResponse
+  def isTokeLimitExceeding(self, newMessage=None, truncate=True, throwError=True):
+    """Return True when the messages plus ``newMessage`` exceed ``tokenLimit``.
+    ``truncate`` and ``throwError`` are accepted for compatibility but unused.
+    """
+    return self.getTokenCount(newMessage=newMessage) > self.tokenLimit
+  def getTokenCount(self, newMessage=None):
+    """Approximate token count of the conversation, including ``newMessage``.
+    Words matched by a regex are counted as a rough stand-in for real tokens.
+    """
+    messages = self.messages[:]
+    if newMessage is not None:
+      messages.append(newMessage)
+    # A reply's content can be None; treat it as empty text instead of failing the join.
+    combinedContent = " ".join(msg["content"] or "" for msg in messages)
+    return len(re.findall(r'\b\w+\b', combinedContent))

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 pandas
 psycopg2
-openai
-sqlparse

 pandas
 psycopg2
+openai==1.3.5
+sqlparse
+gradio==3.50.1

utils.py ADDED Viewed

	@@ -0,0 +1,111 @@

+import psycopg2
+class DataWrapper:
+  """Exposes the keys of a dict (or the items of a list) as instance attributes."""
+  def __init__(self, data):
+    """A dict sets attribute=value pairs; a list sets each item as an attribute holding None."""
+    if isinstance(data, dict):
+      self.__dict__.update(data)
+    elif isinstance(data, list):
+      self.__dict__.update(dict.fromkeys(data))
+  def addKey(self, key, val=None):
+    """Add or overwrite a single attribute."""
+    self.__dict__[key] = val
+  def __repr__(self):
+    return repr(self.__dict__)
+class MetaDataLayout:
+  """Holds all tables/columns of a schema together with the selected subset."""
+  def __init__(self, schemaName, allTablesAndCols):
+    self.schemaName = schemaName
+    self.datalayout = dict(schema=schemaName, selectedTables={}, allTables=allTablesAndCols)
+  def setSelection(self, tablesAndCols):
+    """
+    Add or overwrite selected tables.
+    tablesAndCols : {"table1":["col1", "col2"], "table2":["cola","colb"]}
+    Tables that are not part of the schema are reported and ignored.
+    """
+    knownTables = self.datalayout['allTables']
+    chosen = self.datalayout['selectedTables']
+    for name in tablesAndCols:
+      if name not in knownTables.keys():
+        print(f"Table {name} doesn't exists in the schema")
+        continue
+      chosen[name] = tablesAndCols[name]
+  def resetSelection(self):
+    """Forget every selected table."""
+    self.datalayout['selectedTables'] = {}
+  def getSelectedTablesAndCols(self):
+    """Return the live {table: columns} dict of selected tables."""
+    return self.datalayout['selectedTables']
+  def getAllTablesCols(self):
+    """Return the {table: columns} dict for the whole schema."""
+    return self.datalayout['allTables']
+class DbEngine:
+  """Owns a single psycopg2 connection built from a credentials object."""
+  def __init__(self, dbCreds):
+    """dbCreds must expose database, user, password, host and port attributes."""
+    self.dbCreds = dbCreds
+    self.connection = None
+  def connect(self):
+    """Open the connection if there is none yet or the current one is closed."""
+    dbCreds = self.dbCreds
+    if self.connection is None or self.connection.closed != 0:
+      self.connection = psycopg2.connect(database=dbCreds.database, user = dbCreds.user,
+                  password = dbCreds.password, host = dbCreds.host,
+                  port = dbCreds.port)
+  def disconnect(self):
+    """Close the connection when it is open; otherwise do nothing."""
+    if self.connection is not None and self.connection.closed == 0:
+      self.connection.close()
+  def execute_query(self, query):
+    """Execute ``query`` and return all rows.
+    On failure the transaction is rolled back before the error is re-raised,
+    so the shared connection stays usable for later queries.
+    """
+    try:
+      with self.connection.cursor() as cursor:
+        cursor.execute(query)
+        return cursor.fetchall()
+    except Exception:
+      if self.connection is not None and self.connection.closed == 0:
+        self.connection.rollback()
+      raise
+def executeQuery(dbEngine, query):
+  """Run ``query`` through ``dbEngine.execute_query`` and return its rows."""
+  return dbEngine.execute_query(query)
+def executeColumnsQuery(dbEngine, columnQuery):
+  """Execute ``columnQuery`` and return the result's column names from the cursor description."""
+  with dbEngine.connection.cursor() as cur:
+    cur.execute(columnQuery)
+    names = [column[0] for column in cur.description]
+  return names
+def closeDbEngine(dbEngine):
+  """Disconnect ``dbEngine``; convenience wrapper around its ``disconnect`` method."""
+  dbEngine.disconnect()
+  return None
+def getAllTablesInfo(dbEngine, schemaName):
+  """Return {table name: [column names]} for every table in ``schemaName``.
+  Table names come from information_schema.tables; the columns of each table
+  are read from a zero-row select on that table.
+  """
+  allTablesQuery = f"""SELECT table_name FROM information_schema.tables
+    WHERE table_schema = '{schemaName}'"""
+  tableRows = executeQuery(dbEngine, allTablesQuery)
+  columnsByTable = {}
+  for row in tableRows:
+    name = row[0]
+    columnsByTable[name] = executeColumnsQuery(dbEngine, f"""Select * FROM {schemaName}.{name} LIMIT 0""")
+  return columnsByTable
+def getSampleDataForTablesAndCols(dbEngine, schemaName, tablesAndCols, maxRows):
+    """Read up to ``maxRows`` rows from each table named in ``tablesAndCols``.
+    Returns a dict mapping table name to a DataFrame. Tables that cannot be
+    read are reported on stdout and left out of the result.
+    """
+    # Imported here because this module does not import pandas at the top; the
+    # missing name was previously hidden by a bare except, so every read
+    # failed silently and the result was always empty.
+    import pandas as pd
+    data = {}
+    conn = dbEngine.connection
+    for table in tablesAndCols.keys():
+      try:
+        sqlQuery = f"""select * from {schemaName}.{table} limit {maxRows}"""
+        data[table] = pd.read_sql_query(sqlQuery, con=conn)
+      except Exception as e:
+        # Best effort: report the failure and continue with the other tables.
+        print(e)
+        print(f"couldn't read table data. Table: {table}")
+    return data