Spaces:
Sleeping
Sleeping
Upload 12 files
Browse files
- .env.example +2 -0
- app.py +175 -0
- helpers/__pycache__/llm.cpython-310.pyc +0 -0
- helpers/__pycache__/llm.cpython-39.pyc +0 -0
- helpers/__pycache__/utils.cpython-310.pyc +0 -0
- helpers/__pycache__/utils.cpython-39.pyc +0 -0
- helpers/__pycache__/vis.cpython-310.pyc +0 -0
- helpers/__pycache__/vis.cpython-39.pyc +0 -0
- helpers/llm.py +144 -0
- helpers/utils.py +51 -0
- helpers/vis.py +149 -0
- requirements.txt +16 -0
.env.example
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
OPENAI_API_KEY=sk-enter_your_api_codehere
|
2 |
+
GROQ_API_KEY="gsk_api"
|
app.py
ADDED
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
from helpers.utils import clicked, describe_dataframe, to_show, checkbox_clicked, additional_clicked_fun
|
4 |
+
from helpers.llm import first_look_function, eda_selection_generator, individual_eda, aaa_sample_generator, aaa_answer_generator
|
5 |
+
from helpers.llm import filled_eda_prompt, filled_aaa_prompt
|
6 |
+
from langchain_groq import ChatGroq
|
7 |
+
#from langchain_community.chat_models import ChatOllama
|
8 |
+
#from langchain_openai import ChatOpenAI
|
9 |
+
from langchain_experimental.agents import create_pandas_dataframe_agent
|
10 |
+
from langchain.prompts import PromptTemplate
|
11 |
+
from langchain.chains import LLMChain
|
12 |
+
from langchain_core.output_parsers import JsonOutputParser
|
13 |
+
from helpers.vis import chart_generator, vis_generator
|
14 |
+
import os
|
15 |
+
from dotenv import load_dotenv
|
16 |
+
load_dotenv()
|
17 |
+
|
18 |
+
groq_api_key = os.getenv("GROQ_API_KEY")
|
19 |
+
|
20 |
+
# Initialization — seed every st.session_state key exactly once; session
# state survives Streamlit's top-to-bottom script re-runs.
if 'clicked' not in st.session_state:
    st.session_state.clicked = {'begin_button': False}

if 'eda_selection' not in st.session_state:
    # LLM-suggested EDA steps, filled in after a dataframe is analyzed.
    st.session_state.eda_selection = []

if 'data_exist' not in st.session_state:
    st.session_state.data_exist = False

if 'checkbox_menu' not in st.session_state:
    # Sidebar toggles controlling which main-page sections render; all on by default.
    st.session_state.checkbox_menu = {
        'show_data_button': True,
        'eda_button': True,
        'va_button': True,
        'aaa_button': True
    }

if 'column_names' not in st.session_state:
    st.session_state.column_names = None

if 'df_details' not in st.session_state:
    st.session_state.df_details = None

if 'refreshed' not in st.session_state:
    # Per-button click counters; also passed into @st.cache_data helpers as
    # cache-busting arguments (see helpers/llm.py).
    st.session_state.refreshed = {
        'peda_clicked' : 0,
        'aaa_clicked': 0,
        'va_clicked': 0
    }

st.set_page_config(page_title="InfoVoyager", page_icon="📈")

# LLM Declaration — Groq-hosted Llama3 70B; previously-tried models kept for reference.
llm = ChatGroq(temperature=0, model_name="llama3-70b-8192")
#llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")
#llm = ChatGroq(temperature=0, model_name="Llama2-70b-4096")
#llm = ChatOllama(model="llama2", request_timeout=30.0)
#llm = ChatOpenAI(model_name='gpt-4-0125-preview')

st.markdown("<h1 style='text-align: center;'>InfoVoyager 📈</h1>", unsafe_allow_html=True)
st.text('Crafting Insights from Data with Llama3-70b via GROQ')

# Entry button, centered in the middle of three columns.
col1, col2, col3 = st.columns(3)
with col2:
    st.button("Let's navigate our data voyage with AI at our side", on_click=clicked, args=['begin_button'])
|
66 |
+
|
67 |
+
# Main flow: everything below is gated on the entry button having been pressed.
if st.session_state.clicked['begin_button']:
    with st.expander('Upload your .csv data here'):
        data = st.file_uploader(' ', type ='csv')
    if data is not None:
        st.session_state.data_exist = True
        df = pd.read_csv(data, low_memory=False)
        st.session_state.column_names = df.columns

        # Agent that lets the LLM execute pandas operations against df.
        pd_agent = create_pandas_dataframe_agent(llm, df, verbose=True)

        # --- Section 1: raw data preview --------------------------------
        if st.session_state.checkbox_menu['show_data_button']:
            st.divider()
            st.subheader('Show Data')

            show_selection = ['First few rows', 'Last few rows', 'Random']
            show_selected = st.selectbox('Select type of EDA to perform on this dataset:', options=show_selection)
            rows_to_show = st.number_input('How many rows to show?', format='%d', step=1, value = 5)
            st.write(to_show(df, show_selected, rows_to_show))

        # --- Section 2: LLM-suggested EDA -------------------------------
        if st.session_state.checkbox_menu['eda_button']:
            st.divider()
            st.subheader('Exploratory Data Analysis')

            df_details = describe_dataframe(df)
            st.session_state.df_details = df_details

            eda_chain = LLMChain(llm=llm, prompt=filled_eda_prompt)

            eda_selection = eda_selection_generator(eda_chain, df_details)
            st.session_state.eda_selection = eda_selection

            # The LLM is prompted to answer as "- step." bullets; split on the
            # bullet separator and drop the preamble before the first bullet.
            eda_list = eda_selection.split('.\n-')[1:]
            eda_list.insert(0, '[Default] Perform default EDA')

            st.markdown('#### EDA to Perform')

            eda_selected = st.selectbox('Based on the dataframe, here are the most common EDA steps to perform:', options=eda_list)

            if st.button('Perform EDA', on_click=additional_clicked_fun, args=['peda_clicked']):
                # NOTE(review): `prompt` is built here but never used below — confirm before removing.
                prompt = PromptTemplate.from_template(eda_selected)
                with st.chat_message('assistant'):
                    if eda_selected != '[Default] Perform default EDA':
                        # Click counter acts as a cache-buster so repeat clicks re-run the agent.
                        individual_eda(pd_agent, eda_selected, st.session_state.refreshed['peda_clicked'])
                    else:
                        first_look_function(df, pd_agent)

        # --- Section 3: natural-language visualization ------------------
        if st.session_state.checkbox_menu['va_button']:
            st.divider()
            st.subheader("Visualization")
            user_question_vis = st.text_area("Tell me what you want to visualize/ investigate!")
            st.button('Start Visualization', on_click=additional_clicked_fun, args=['va_clicked'])
            if user_question_vis and st.session_state.refreshed['va_clicked']:
                with st.spinner("Generating Chart Type"):
                    chart = chart_generator(llm, df, user_question_vis)
                    st.write("Chart to be Produced:")
                    st.write(chart)
                with st.spinner("Performing Feature Engineering and Charting!"):
                    # Only the first suggested chart spec is rendered.
                    vis_generator(chart[0], llm, df)


        # --- Section 4: free-form Q&A against the dataframe -------------
        if st.session_state.checkbox_menu['aaa_button']:
            st.divider()
            st.subheader("Ask AI Anything")
            st.write('Hint: Check sidebar for Prompt Inspiration')
            user_prompt = st.text_area('Enter your question here!')
            st.button('Ask your question', on_click=additional_clicked_fun, args=['aaa_clicked'])
            if user_prompt and st.session_state.refreshed['aaa_clicked']:
                aaa_answer_generator(pd_agent, user_prompt, st.session_state.refreshed['aaa_clicked'])


# Sidebar: guide text, section toggles, and quick-reference expanders.
with st.sidebar:
    if st.session_state.clicked['begin_button']:
        st.header('Guide')
        st.write('1. To begin, enter data in .csv format.')
        if st.session_state.data_exist == True:
            st.write('2. Choose what do you want to do?')
            show_data_button = st.checkbox('Show Data', True, on_change=checkbox_clicked, args=['show_data_button'])
            eda_button = st.checkbox('Exploratory Data Analysis', True, on_change=checkbox_clicked, args=['eda_button'])

            va_button = st.checkbox('Visualization', True, on_change=checkbox_clicked, args=['va_button'])
            aaa_button = st.checkbox('Ask AI Anything!', True, on_change=checkbox_clicked, args=['aaa_button'])

            st.divider()
            if show_data_button:
                with st.expander('Columns Names'):
                    st.markdown("Navigation: [Show Data](#show-data)", unsafe_allow_html=True)
                    st.subheader('Columns Names')
                    st.write(st.session_state.column_names)

            if eda_button:
                if len(st.session_state.eda_selection) != 0:
                    with st.expander('EDA: Suggested Steps'):
                        st.markdown("Navigation: [EDA](#exploratory-data-analysis)", unsafe_allow_html=True)
                        # st.button("Refresh EDA Suggestions", on_click=additional_clicked_fun, args=['eda_selection_clicked'])
                        st.write(st.session_state.eda_selection)


            if aaa_button:
                with st.expander('Prompt Inspiration'):
                    st.markdown("Navigation: [Ask AI Anything](#ask-ai-anything)", unsafe_allow_html=True)
                    aaa_chain = LLMChain(llm=llm, prompt=filled_aaa_prompt)
                    _dataframe_details = st.session_state.df_details
                    _eda_selection = st.session_state.eda_selection
                    aaa_samples = aaa_sample_generator(aaa_chain, _dataframe_details, _eda_selection)
                    st.write(aaa_samples)
|
172 |
+
|
173 |
+
|
174 |
+
|
175 |
+
|
helpers/__pycache__/llm.cpython-310.pyc
ADDED
Binary file (4.56 kB). View file
|
|
helpers/__pycache__/llm.cpython-39.pyc
ADDED
Binary file (4.61 kB). View file
|
|
helpers/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (1.79 kB). View file
|
|
helpers/__pycache__/utils.cpython-39.pyc
ADDED
Binary file (1.85 kB). View file
|
|
helpers/__pycache__/vis.cpython-310.pyc
ADDED
Binary file (4.94 kB). View file
|
|
helpers/__pycache__/vis.cpython-39.pyc
ADDED
Binary file (5.06 kB). View file
|
|
helpers/llm.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from langchain.prompts import PromptTemplate, PipelinePromptTemplate
|
3 |
+
from langchain_community.callbacks import StreamlitCallbackHandler
|
4 |
+
|
5 |
+
# Prompt for the "default EDA" walkthrough. NOTE(review): only {salutation}
# is templated, and first_look_function below drives the agent with its own
# hard-coded prompts instead of this template — confirm whether it is still needed.
first_look_prompt = '''
{salutation}, You need to explore the dataframe in a few indicated steps below. Please indicate clearly what is the steps being done.
1. Data Overview:
1.1. Show first five rows of the data
1.2. Show the columns name
1.3. Show the missing values and duplicated for each column
1.4. Show Data summary: df.describe()
1.5. Calculate correlation in the data
1.6. Identify potential outliers
1.7. Identify potential new features to include
'''

first_look_template = PromptTemplate.from_template(first_look_prompt)
|
18 |
+
|
19 |
+
def text_runner(_agent, df, text):
    """Send `text` to the pandas agent and echo both the prompt and the answer.

    Args:
        _agent: LangChain pandas dataframe agent (leading underscore keeps it
            out of Streamlit's cache hashing in cached callers).
        df: unused in the body — kept for call-site compatibility.
        text: natural-language instruction forwarded to the agent.
    """
    st.write(text)
    st.write(_agent.run(text))
|
22 |
+
|
23 |
+
def function_runner(_agent, text, function):
    """Display a label and an already-computed value; no agent call is made.

    Args:
        _agent: unused in the body — kept so the signature parallels text_runner.
        text: label shown above the value.
        function: pre-computed result to display (e.g. df.describe()).
    """
    st.write(text)
    st.write(function)
|
26 |
+
|
27 |
+
@st.cache_data
def first_look_function(df, _agent):
    """Run the default EDA pass: a fixed sequence of agent prompts plus a
    locally-computed df.describe(), each echoed to the page.

    Cached by Streamlit on `df` contents (`_agent` is excluded from hashing).

    Returns:
        None — all output goes straight to the Streamlit page.
    """
    st.write('**Data Overview**')
    text_runner(_agent, df, "Show columns name")
    text_runner(_agent, df, "Show the missing values and duplicated for each column")
    # describe() is cheap locally, so it bypasses the agent.
    function_runner(_agent, "Show data summary", df.describe())
    text_runner(_agent, df, "Identify potential outliers")
    text_runner(_agent, df, "Identify potential new features to include")

    return None
|
37 |
+
|
38 |
+
|
39 |
+
# NOTE(review): `sb_template` is defined but not referenced anywhere visible
# in this module — confirm it is unused before removing.
sb_template = PromptTemplate.from_template(
    "Output simple one liner steps for: {question}"
)
|
42 |
+
|
43 |
+
# --- EDA-suggestion prompt, assembled from three sub-templates -----------
eda_template = '''
{intro}

{do_not_list}

{dataframe_description}
'''
eda_prompt = PromptTemplate.from_template(eda_template)

# Instruction header: short bullet steps in a fixed format (the app later
# splits the model's answer on '.\n-').
intro_eda_template = '''
Give me step by step idea for an EDA provided that this is the details of the dataframe.
The answer should be in bullet form, each step should be less than 5 words.
Example format of the list (start with '-', ends with '.'):
- Identify missing values.
'''

# Guardrails: what the suggestion list must NOT contain.
do_not_eda_template = '''
- Do not show backend work such as import libraries, load dataframe.
- Do not provide the answer to the EDA, i.e. x columns, y rows.
- Do not provide any suggestion related to visualization.
- Provide not more than 8 concrete/ not repetitive steps.
- Do not show Feature Engineering steps
- Do not generate something that we couldn't answer based on the existing dataframe, i.e. corr values when there is no numerical columns in the dataframe
'''

# Carries the output of helpers.utils.describe_dataframe via {dataframe_details}.
dataframe_description_template = '''
Here is the details of the dataframe: {dataframe_details}
'''

intro_eda_prompt = PromptTemplate.from_template(intro_eda_template)
do_not_eda_prompt = PromptTemplate.from_template(do_not_eda_template)
dataframe_description_eda_prompt = PromptTemplate.from_template(dataframe_description_template)

input_eda_prompts = [
    ("intro", intro_eda_prompt),
    ("do_not_list", do_not_eda_prompt),
    ("dataframe_description", dataframe_description_eda_prompt),
]

# Final composed prompt; its only remaining input is {dataframe_details}.
filled_eda_prompt = PipelinePromptTemplate(
    final_prompt=eda_prompt, pipeline_prompts=input_eda_prompts
)
|
85 |
+
|
86 |
+
@st.cache_data
def eda_selection_generator(_eda_chain, _df_details):
    """Ask the EDA chain for suggested EDA steps for this dataframe.

    Cached by Streamlit on `_df_details`; the chain itself is excluded from
    cache hashing via the leading underscore.

    Returns:
        The chain's raw text answer (a bullet list of steps).
    """
    chain_output = _eda_chain.invoke({'dataframe_details': _df_details})
    return chain_output['text']
|
89 |
+
|
90 |
+
@st.cache_data
def individual_eda(_pd_agent, _eda_selected, peda_click_count):
    """Run one user-selected EDA step through the pandas agent, streaming the
    agent's intermediate thoughts into the page via a Streamlit callback.

    `peda_click_count` is never read in the body: it only participates in the
    st.cache_data key, so another click of "Perform EDA" re-runs the agent
    instead of returning the cached output.
    """
    st_callback = StreamlitCallbackHandler(st.container())
    st.write(_pd_agent.run(_eda_selected, callbacks=[st_callback]))
|
94 |
+
|
95 |
+
|
96 |
+
# --- "Ask AI Anything" sample-question prompt, assembled from three parts ---
aaa_template = '''
{intro}

{dataframe_description}

{do_not_list}
'''
aaa_prompt = PromptTemplate.from_template(aaa_template)

# Give me a list of possible questions that Pandas agent can answer well about the dataframe.
# NOTE(review): the main instruction above appears commented out, leaving the
# intro template with formatting rules only — confirm this is intentional.
intro_aaa_template = '''

Each sentence should be less than 6 words long and clear.
Provide not more than 8 concrete/ not repetitive questions.
'''

# Carries the output of describe_dataframe via {dataframe_details}.
dataframe_description_aaa_template = '''
Here is the details of the dataframe: {dataframe_details}
'''

# Guardrails: avoid repeating the EDA suggestions and unanswerable questions.
do_not_aaa_template = '''
- DO NOT provide any list that is already captured before in the double quotation "{eda_selection}".
- Do not provide list that cannot be answered by pandas agent.
- Do not provide questions about number of rows/ columns, missing values
'''

intro_aaa_prompt = PromptTemplate.from_template(intro_aaa_template)
dataframe_description_aaa_prompt = PromptTemplate.from_template(dataframe_description_aaa_template)
# Renamed from `do_not_eda_prompt`: the old name silently shadowed the EDA
# guardrail prompt defined earlier in this module (harmless only because the
# EDA pipeline was already assembled above, but confusing to maintainers).
do_not_aaa_prompt = PromptTemplate.from_template(do_not_aaa_template)

input_aaa_prompts = [
    ("intro", intro_aaa_prompt),
    ("dataframe_description", dataframe_description_aaa_prompt),
    ("do_not_list", do_not_aaa_prompt),
]

# Final composed prompt; inputs: {dataframe_details}, {eda_selection}.
filled_aaa_prompt = PipelinePromptTemplate(
    final_prompt=aaa_prompt, pipeline_prompts=input_aaa_prompts
)
|
135 |
+
|
136 |
+
@st.cache_data
def aaa_sample_generator(_aaa_chain, _dataframe_details, _eda_selection):
    """Generate sample questions for the "Ask AI Anything" sidebar expander.

    Cached by Streamlit on the dataframe details and EDA selection; the chain
    is excluded from cache hashing via the leading underscore.
    """
    chain_inputs = {
        'dataframe_details': _dataframe_details,
        'eda_selection': _eda_selection,
    }
    return _aaa_chain.invoke(chain_inputs)['text']
|
139 |
+
|
140 |
+
@st.cache_data
def aaa_answer_generator(_pd_agent, _user_prompt, refreshed):
    """Answer a free-form user question through the pandas agent, streaming
    intermediate agent steps into the page via a Streamlit callback.

    `refreshed` is never read in the body: it only participates in the
    st.cache_data key so a repeat click re-runs the agent.
    """
    st_callback = StreamlitCallbackHandler(st.container())
    answer_to_user = _pd_agent.run(_user_prompt, callbacks=[st_callback])
    st.write(answer_to_user)
|
helpers/utils.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
def clicked(button):
    """Streamlit on_click callback: mark `button` as pressed in session state."""
    st.session_state.clicked[button] = True
|
6 |
+
|
7 |
+
def checkbox_clicked(button):
    """Streamlit on_change callback: toggle the section flag for `button`.

    Flips the boolean in st.session_state.checkbox_menu so the main page
    shows/hides the corresponding section.
    """
    # `not x` replaces the original `x == False`, which is the same toggle
    # written unidiomatically.
    st.session_state.checkbox_menu[button] = not st.session_state.checkbox_menu[button]
|
9 |
+
|
10 |
+
def additional_clicked_fun(button):
    """Streamlit on_click callback: bump the click counter for `button`.

    Counters live in st.session_state.refreshed and are also passed into
    @st.cache_data helpers (helpers/llm.py) as cache-busting arguments.
    """
    st.session_state.refreshed[button] += 1
|
12 |
+
|
13 |
+
@st.cache_data
def describe_dataframe(df):
    """Build a single-string natural-language summary of `df` for LLM prompts.

    Each column contributes one clause (dtype, missing-value note, and
    max/min/mean for numeric columns); clauses are joined with '; ' and
    prefixed with the overall row/column counts.
    """
    def _describe_column(name):
        # One clause per column; richer stats only for numeric dtypes.
        series = df[name]
        null_info = (
            "has some missing values" if series.isnull().sum() > 0
            else "has no missing values"
        )
        if pd.api.types.is_numeric_dtype(series):
            return (
                f"{name} (numeric) - type: {series.dtype}, {null_info}, "
                f"max: {series.max()}, min: {series.min()}, mean: {series.mean():.2f}"
            )
        # Non-numeric columns (categorical, datetime, ...) get the basic clause.
        return f"{name} - type: {series.dtype}, {null_info}"

    detailed_description = "; ".join(_describe_column(name) for name in df.columns)

    return (
        f"The dataset has {len(df)} rows and {len(df.columns)} columns. "
        f"Column details: {detailed_description}."
    )
|
40 |
+
|
41 |
+
@st.cache_data
def to_show(df, show_selected, rows_to_show):
    """Return a `rows_to_show`-row preview of `df` for the chosen view.

    Args:
        df: source DataFrame.
        show_selected: one of 'First few rows', 'Last few rows', 'Random'.
        rows_to_show: number of rows to preview.

    Returns:
        The DataFrame slice produced by head/tail/sample.
    """
    # Map each menu label to a *lazy* accessor. The original built a dict of
    # materialized frames, which computed all three views on every call and
    # crashed inside df.sample() whenever rows_to_show > len(df) — even when
    # the user had only asked for head or tail.
    switch_dic = {
        'First few rows': df.head,
        'Last few rows': df.tail,
        'Random': df.sample,
    }
    st.write(f'There are {len(df)} rows and {len(df.columns)} columns.')
    # columns = [col for col in df.columns]
    # st.write('Column Names')
    # st.write(columns)

    return switch_dic[show_selected](rows_to_show)
|
helpers/vis.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.prompts.pipeline import PipelinePromptTemplate
|
2 |
+
from langchain.prompts import (
|
3 |
+
PromptTemplate
|
4 |
+
)
|
5 |
+
from langchain.chains.llm import LLMChain
|
6 |
+
from langchain_core.output_parsers import JsonOutputParser
|
7 |
+
import streamlit as st
|
8 |
+
import importlib
|
9 |
+
|
10 |
+
def prompt_generator_chart_type():
    """Build the pipeline prompt that asks the LLM for Plotly chart specs.

    The assembled prompt instructs the model to answer ONLY with JSON of the
    form {"charts": [{"title", "chartType", "parameters"}, ...]}.

    Expects inputs: num_charts, data, topic.

    Returns:
        PipelinePromptTemplate combining the system and human messages.
    """
    # Doubled braces {{ }} render literal braces in the JSON example.
    # Two prompt typos fixed: "charType." -> "chartType." (instruction 2) and
    # "There should 4" -> "There should be 4" (instruction 4).
    system_template = """
The following is a conversation between a Human and an AI assistant expert on data visualization with perfect Python 3 syntax. The human will provide a sample dataset for the AI to use as the source. The real dataset that the human will use with the response of the AI is going to have several more rows. The AI assistant will only reply in the following JSON format:

{{
"charts": [{{'title': string, 'chartType': string, 'parameters': {{...}}}}, ... ]
}}

Instructions:

1. chartType must only contain methods of plotly.express from the Python library Plotly.
2. The format for chartType string: plotly.express.chartType.
3. For each chartType, parameters must contain the value to be used for all parameters of that plotly.express method.
4. There should be 4 parameters for each chart.
5. Do not include "data_frame" in the parameters.
6. Features in 'parameters' should not contain a space character. Joining more than one word should be done by using '_'.
7. There should be {num_charts} charts in total.
"""
    system_message_prompt = PromptTemplate.from_template(system_template)

    human_template = """
Human:
This is the dataset:

{data}

Create chart that analyze this specific topic: {topic}
"""
    human_message_prompt = PromptTemplate.from_template(human_template)

    full_template = """{system_prompt}

{human_prompt}
"""
    full_prompt = PromptTemplate.from_template(full_template)

    input_prompts = [
        ("system_prompt", system_message_prompt),
        ("human_prompt", human_message_prompt),
    ]
    pipeline_prompt = PipelinePromptTemplate(
        final_prompt=full_prompt, pipeline_prompts=input_prompts, input_variables=['num_charts', 'data', 'topic']
    )
    return pipeline_prompt
|
54 |
+
|
55 |
+
|
56 |
+
def prompt_generator_feature_engineering():
    """Build the pipeline prompt that asks the LLM for raw pandas code.

    The generated code must transform `df` so that every variable named in the
    chart spec's 'parameters' exists as a column, and must end with
    'final_df = df' (consumed by vis_generator via exec).

    Expects inputs: data, column_names, vis_specs.

    Returns:
        PipelinePromptTemplate combining the system and human messages.
    """
    system_template = """
Instructions:
1. Read the visualization specs as given to you. Check on all variables in 'parameters'.
2. If any of the variables in 'parameters' does not appear as a column in the original dataset, return pandas function which transforms the original dataset into a new dataset containing ALL variables in parameters.
3. Return this pandas operations in string form. Only return the string to execute without any explanation!
4. If there are >1 line of code, split them with ';'
5. Sometimes you need to rename the column to ensure ALL variables in 'parameters' are represented exactly in the final_df dataset.
6. Always end the answer with 'final_df = df'

Assumptions:
1. Assume that original dataframe is given as 'df'
2. Assume that the columns in the original dataframe might not have the right dtypes. Adjust it first to accept the right dtypes.

Do not do this:
1. Do not use python``` code here ``` format. Directly return pandas function in text format.
"""
    system_message_prompt = PromptTemplate.from_template(system_template)

    # NOTE(review): "Please perform sorting of the data!" below looks like a
    # leftover instruction unrelated to the spec — confirm it is intentional.
    human_template = """
Human:
This is the dataset:
{data}
Please perform sorting of the data!

This is the column names in the original dataset:
{column_names}

This is the visualization specs:
{vis_specs}
"""
    human_message_prompt = PromptTemplate.from_template(human_template)

    full_template = """{system_prompt}

{human_prompt}
"""
    full_prompt = PromptTemplate.from_template(full_template)

    input_prompts = [
        ("system_prompt", system_message_prompt),
        ("human_prompt", human_message_prompt),
    ]
    pipeline_prompt = PipelinePromptTemplate(
        final_prompt=full_prompt, pipeline_prompts=input_prompts, input_variables=['data', 'column_names', 'vis_specs']
    )
    return pipeline_prompt
|
103 |
+
|
104 |
+
def chart_generator(llm, df, user_question_vis):
    """Ask the LLM for chart spec(s) answering the user's question about `df`.

    Args:
        llm: the chat model used by the chain.
        df: source DataFrame; only the first 10 rows are sent as a sample.
        user_question_vis: the user's natural-language visualization request.

    Returns:
        List of chart dicts ({'title', 'chartType', 'parameters'}) parsed
        from the model's JSON answer.
    """
    chart_type_chain = LLMChain(llm=llm,
                                prompt=prompt_generator_chart_type(),
                                # Model is instructed to answer in JSON only.
                                output_parser=JsonOutputParser(),
                                output_key='vis_specs'
                                )

    # num_charts is fixed at 1; callers still index chart[0].
    chart_types = chart_type_chain.run({
        "data": df.head(10),
        "topic": user_question_vis,
        "num_charts": 1
    })

    return chart_types['charts']
|
118 |
+
|
119 |
+
def vis_generator(chart, llm, df):
    """Render one chart spec produced by chart_generator.

    Asks the LLM for pandas feature-engineering code, executes it against a
    copy of the namespace, then calls the requested plotly.express function
    with the spec's parameters and the engineered dataframe.

    Args:
        chart: dict with 'chartType' and 'parameters' keys.
        llm: chat model for the feature-engineering chain.
        df: the uploaded DataFrame.
    """
    import pandas as pd  # local: made available to the generated code below

    params = chart['parameters']
    fe_chain = LLMChain(llm=llm, prompt=prompt_generator_feature_engineering(), output_key='final_output')
    fe_code = fe_chain.run({
        "data": df.head(10),
        "column_names": df.columns,
        "vis_specs": chart
    })
    # st.write(fe_code)

    # SECURITY: fe_code is LLM-generated Python executed verbatim — run this
    # app only with trusted inputs, or sandbox this step.
    #
    # Run the code in an explicit namespace: inside a function, bare exec()
    # cannot rebind the caller's locals, so the original `final_df = df`
    # fallback silently discarded any rename/reassignment the generated code
    # performed, and `pd` was not in scope for the generated code at all.
    namespace = {'df': df, 'pd': pd}
    final_df = None
    try:
        exec(fe_code, namespace)
        st.write('Successfully Executed Feature Engineering Script')
        # Prefer the frame the generated code produced (it is instructed to
        # end with `final_df = df`); fall back to its possibly-rebound `df`.
        final_df = namespace.get('final_df', namespace.get('df'))
    except Exception as e:
        st.write(f"Error during Feature Engineering Execution: {e}")

    if final_df is not None:
        params['data_frame'] = final_df

        # Resolve e.g. 'plotly.express.scatter' to the px function by name.
        chart_type = chart['chartType']
        px_module = importlib.import_module("plotly.express")
        chart_function = getattr(px_module, chart_type.split('.')[-1])
        fig = chart_function(**params)

        st.plotly_chart(fig, use_container_width=True)

    else:
        st.write("final_df was not defined.")
|
requirements.txt
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit==1.32.2
|
2 |
+
python-dotenv==1.0.1
|
3 |
+
pandas
|
4 |
+
langchain-openai==0.1.1
|
5 |
+
langchain
|
6 |
+
langchain-experimental
|
7 |
+
tabulate==0.9.0
|
8 |
+
langchainhub
|
9 |
+
duckduckgo-search==5.2.2
|
10 |
+
google-search-results==2.4.2
|
11 |
+
matplotlib==3.8.4
|
12 |
+
seaborn==0.13.2
|
13 |
+
langchain-community
|
14 |
+
plotly==5.21.0
|
15 |
+
python-dotenv
|
16 |
+
langchain-groq
|