NEXAS committed on
Commit
f322558
1 Parent(s): 848fcb5

Upload 12 files

Browse files
.env.example ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ OPENAI_API_KEY=sk-enter_your_api_code_here
2
+ GROQ_API_KEY="gsk_api"
app.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""InfoVoyager — Streamlit entry point for LLM-assisted data exploration.

Flow: upload a CSV, then (gated by sidebar checkboxes) preview the data, run
LLM-suggested EDA steps, generate Plotly visualizations, or ask free-form
questions via a LangChain pandas-dataframe agent backed by Groq.
"""
import streamlit as st
import pandas as pd
from helpers.utils import clicked, describe_dataframe, to_show, checkbox_clicked, additional_clicked_fun
from helpers.llm import first_look_function, eda_selection_generator, individual_eda, aaa_sample_generator, aaa_answer_generator
from helpers.llm import filled_eda_prompt, filled_aaa_prompt
from langchain_groq import ChatGroq
#from langchain_community.chat_models import ChatOllama
#from langchain_openai import ChatOpenAI
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_core.output_parsers import JsonOutputParser  # NOTE(review): imported but never used in this file
from helpers.vis import chart_generator, vis_generator
import os
from dotenv import load_dotenv
load_dotenv()

# NOTE(review): read but never passed to ChatGroq below — presumably
# langchain-groq picks GROQ_API_KEY up from the environment itself; confirm.
groq_api_key = os.getenv("GROQ_API_KEY")

# ---- Session-state initialization (persists across Streamlit reruns) ----

# Whether the "begin" button has been pressed.
if 'clicked' not in st.session_state:
    st.session_state.clicked = {'begin_button': False}

# Raw text of the LLM-suggested EDA steps (also surfaced in the sidebar).
if 'eda_selection' not in st.session_state:
    st.session_state.eda_selection = []

# True once a CSV has been uploaded; gates the sidebar controls.
if 'data_exist' not in st.session_state:
    st.session_state.data_exist = False

# Which main-page sections are enabled (toggled by sidebar checkboxes).
if 'checkbox_menu' not in st.session_state:
    st.session_state.checkbox_menu = {
        'show_data_button': True,
        'eda_button': True,
        'va_button': True,
        'aaa_button': True
    }

# Column index of the uploaded dataframe, shown in the sidebar.
if 'column_names' not in st.session_state:
    st.session_state.column_names = None

# Textual dataframe summary produced by describe_dataframe.
if 'df_details' not in st.session_state:
    st.session_state.df_details = None

# Click counters; passed into cached helpers, presumably as cache-busting
# keys so the agent re-runs per click — confirm against helpers.llm.
if 'refreshed' not in st.session_state:
    st.session_state.refreshed = {
        'peda_clicked' : 0,
        'aaa_clicked': 0,
        'va_clicked': 0
    }

st.set_page_config(page_title="InfoVoyager", page_icon="📈")

# LLM Declaration
llm = ChatGroq(temperature=0, model_name="llama3-70b-8192")
#llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")
#llm = ChatGroq(temperature=0, model_name="Llama2-70b-4096")
#llm = ChatOllama(model="llama2", request_timeout=30.0)
#llm = ChatOpenAI(model_name='gpt-4-0125-preview')

st.markdown("<h1 style='text-align: center;'>InfoVoyager 📈</h1>", unsafe_allow_html=True)
st.text('Crafting Insights from Data with Llama3-70b via GROQ')

# Centered "begin" button; clicked() flips session_state.clicked['begin_button'].
col1, col2, col3 = st.columns(3)
with col2:
    st.button("Let's navigate our data voyage with AI at our side", on_click=clicked, args=['begin_button'])

if st.session_state.clicked['begin_button']:
    with st.expander('Upload your .csv data here'):
        data = st.file_uploader(' ', type ='csv')
    if data is not None:
        st.session_state.data_exist = True
        df = pd.read_csv(data, low_memory=False)
        st.session_state.column_names = df.columns

        # Agent that turns natural-language questions into pandas operations.
        pd_agent = create_pandas_dataframe_agent(llm, df, verbose=True)

        # ---- Show Data section ----
        if st.session_state.checkbox_menu['show_data_button']:
            st.divider()
            st.subheader('Show Data')

            show_selection = ['First few rows', 'Last few rows', 'Random']
            show_selected = st.selectbox('Select type of EDA to perform on this dataset:', options=show_selection)
            rows_to_show = st.number_input('How many rows to show?', format='%d', step=1, value = 5)
            st.write(to_show(df, show_selected, rows_to_show))

        # ---- Exploratory Data Analysis section ----
        if st.session_state.checkbox_menu['eda_button']:
            st.divider()
            st.subheader('Exploratory Data Analysis')

            df_details = describe_dataframe(df)
            st.session_state.df_details = df_details

            eda_chain = LLMChain(llm=llm, prompt=filled_eda_prompt)

            eda_selection = eda_selection_generator(eda_chain, df_details)
            st.session_state.eda_selection = eda_selection

            # Model answers '- step.' bullets; split on '.\n-' to get a list.
            eda_list = eda_selection.split('.\n-')[1:]
            eda_list.insert(0, '[Default] Perform default EDA')

            st.markdown('#### EDA to Perform')

            eda_selected = st.selectbox('Based on the dataframe, here are the most common EDA steps to perform:', options=eda_list)

            if st.button('Perform EDA', on_click=additional_clicked_fun, args=['peda_clicked']):
                prompt = PromptTemplate.from_template(eda_selected)  # NOTE(review): assigned but never used
                with st.chat_message('assistant'):
                    if eda_selected != '[Default] Perform default EDA':
                        individual_eda(pd_agent, eda_selected, st.session_state.refreshed['peda_clicked'])
                    else:
                        first_look_function(df, pd_agent)

        # ---- Visualization section ----
        if st.session_state.checkbox_menu['va_button']:
            st.divider()
            st.subheader("Visualization")
            user_question_vis = st.text_area("Tell me what you want to visualize/ investigate!")
            st.button('Start Visualization', on_click=additional_clicked_fun, args=['va_clicked'])
            if user_question_vis and st.session_state.refreshed['va_clicked']:
                with st.spinner("Generating Chart Type"):
                    chart = chart_generator(llm, df, user_question_vis)
                    st.write("Chart to be Produced:")
                    st.write(chart)
                with st.spinner("Performing Feature Engineering and Charting!"):
                    # Only the first suggested chart spec is rendered.
                    vis_generator(chart[0], llm, df)

        # ---- Ask AI Anything section ----
        if st.session_state.checkbox_menu['aaa_button']:
            st.divider()
            st.subheader("Ask AI Anything")
            st.write('Hint: Check sidebar for Prompt Inspiration')
            user_prompt = st.text_area('Enter your question here!')
            st.button('Ask your question', on_click=additional_clicked_fun, args=['aaa_clicked'])
            if user_prompt and st.session_state.refreshed['aaa_clicked']:
                aaa_answer_generator(pd_agent, user_prompt, st.session_state.refreshed['aaa_clicked'])

# ---- Sidebar: guide, section toggles, and contextual helpers ----
with st.sidebar:
    if st.session_state.clicked['begin_button']:
        st.header('Guide')
        st.write('1. To begin, enter data in .csv format.')
        if st.session_state.data_exist == True:
            st.write('2. Choose what do you want to do?')
            show_data_button = st.checkbox('Show Data', True, on_change=checkbox_clicked, args=['show_data_button'])
            eda_button = st.checkbox('Exploratory Data Analysis', True, on_change=checkbox_clicked, args=['eda_button'])

            va_button = st.checkbox('Visualization', True, on_change=checkbox_clicked, args=['va_button'])
            aaa_button = st.checkbox('Ask AI Anything!', True, on_change=checkbox_clicked, args=['aaa_button'])

            st.divider()
            if show_data_button:
                with st.expander('Columns Names'):
                    st.markdown("Navigation: [Show Data](#show-data)", unsafe_allow_html=True)
                    st.subheader('Columns Names')
                    st.write(st.session_state.column_names)

            if eda_button:
                if len(st.session_state.eda_selection) != 0:
                    with st.expander('EDA: Suggested Steps'):
                        st.markdown("Navigation: [EDA](#exploratory-data-analysis)", unsafe_allow_html=True)
                        # st.button("Refresh EDA Suggestions", on_click=additional_clicked_fun, args=['eda_selection_clicked'])
                        st.write(st.session_state.eda_selection)

            if aaa_button:
                with st.expander('Prompt Inspiration'):
                    st.markdown("Navigation: [Ask AI Anything](#ask-ai-anything)", unsafe_allow_html=True)
                    aaa_chain = LLMChain(llm=llm, prompt=filled_aaa_prompt)
                    _dataframe_details = st.session_state.df_details
                    _eda_selection = st.session_state.eda_selection
                    aaa_samples = aaa_sample_generator(aaa_chain, _dataframe_details, _eda_selection)
                    st.write(aaa_samples)
172
+
173
+
174
+
175
+
helpers/__pycache__/llm.cpython-310.pyc ADDED
Binary file (4.56 kB). View file
 
helpers/__pycache__/llm.cpython-39.pyc ADDED
Binary file (4.61 kB). View file
 
helpers/__pycache__/utils.cpython-310.pyc ADDED
Binary file (1.79 kB). View file
 
helpers/__pycache__/utils.cpython-39.pyc ADDED
Binary file (1.85 kB). View file
 
helpers/__pycache__/vis.cpython-310.pyc ADDED
Binary file (4.94 kB). View file
 
helpers/__pycache__/vis.cpython-39.pyc ADDED
Binary file (5.06 kB). View file
 
helpers/llm.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain.prompts import PromptTemplate, PipelinePromptTemplate
3
+ from langchain_community.callbacks import StreamlitCallbackHandler
4
+
5
+ first_look_prompt = '''
6
+ {salutation}, You need to explore the dataframe in a few indicated steps below. Please indicate clearly what is the steps being done.
7
+ 1. Data Overview:
8
+ 1.1. Show first five rows of the data
9
+ 1.2. Show the columns name
10
+ 1.3. Show the missing values and duplicated for each column
11
+ 1.4. Show Data summary: df.describe()
12
+ 1.5. Calculate correlation in the data
13
+ 1.6. Identify potential outliers
14
+ 1.7. Identify potential new features to include
15
+ '''
16
+
17
+ first_look_template = PromptTemplate.from_template(first_look_prompt)
18
+
19
+ def text_runner(_agent, df, text):
20
+ st.write(text)
21
+ st.write(_agent.run(text))
22
+
23
+ def function_runner(_agent, text, function):
24
+ st.write(text)
25
+ st.write(function)
26
+
27
+ @st.cache_data
28
+ def first_look_function(df, _agent):
29
+ st.write('**Data Overview**')
30
+ text_runner(_agent, df, "Show columns name")
31
+ text_runner(_agent, df, "Show the missing values and duplicated for each column")
32
+ function_runner(_agent, "Show data summary", df.describe())
33
+ text_runner(_agent, df, "Identify potential outliers")
34
+ text_runner(_agent, df, "Identify potential new features to include")
35
+
36
+ return None
37
+
38
+
39
+ sb_template = PromptTemplate.from_template(
40
+ "Output simple one liner steps for: {question}"
41
+ )
42
+
43
+ eda_template = '''
44
+ {intro}
45
+
46
+ {do_not_list}
47
+
48
+ {dataframe_description}
49
+ '''
50
+ eda_prompt = PromptTemplate.from_template(eda_template)
51
+
52
+ intro_eda_template = '''
53
+ Give me step by step idea for an EDA provided that this is the details of the dataframe.
54
+ The answer should be in bullet form, each step should be less than 5 words.
55
+ Example format of the list (start with '-', ends with '.'):
56
+ - Identify missing values.
57
+ '''
58
+
59
+ do_not_eda_template = '''
60
+ - Do not show backend work such as import libraries, load dataframe.
61
+ - Do not provide the answer to the EDA, i.e. x columns, y rows.
62
+ - Do not provide any suggestion related to visualization.
63
+ - Provide not more than 8 concrete/ not repetitive steps.
64
+ - Do not show Feature Engineering steps
65
+ - Do not generate something that we couldn't answer based on the existing dataframe, i.e. corr values when there is no numerical columns in the dataframe
66
+ '''
67
+
68
+ dataframe_description_template = '''
69
+ Here is the details of the dataframe: {dataframe_details}
70
+ '''
71
+
72
+ intro_eda_prompt = PromptTemplate.from_template(intro_eda_template)
73
+ do_not_eda_prompt = PromptTemplate.from_template(do_not_eda_template)
74
+ dataframe_description_eda_prompt = PromptTemplate.from_template(dataframe_description_template)
75
+
76
+ input_eda_prompts = [
77
+ ("intro", intro_eda_prompt),
78
+ ("do_not_list", do_not_eda_prompt),
79
+ ("dataframe_description", dataframe_description_eda_prompt),
80
+ ]
81
+
82
+ filled_eda_prompt = PipelinePromptTemplate(
83
+ final_prompt=eda_prompt, pipeline_prompts=input_eda_prompts
84
+ )
85
+
86
+ @st.cache_data
87
+ def eda_selection_generator(_eda_chain, _df_details):
88
+ return _eda_chain.invoke({'dataframe_details': _df_details})['text']
89
+
90
+ @st.cache_data
91
+ def individual_eda(_pd_agent, _eda_selected, peda_click_count):
92
+ st_callback = StreamlitCallbackHandler(st.container())
93
+ st.write(_pd_agent.run(_eda_selected, callbacks=[st_callback]))
94
+
95
+
96
+ aaa_template = '''
97
+ {intro}
98
+
99
+ {dataframe_description}
100
+
101
+ {do_not_list}
102
+ '''
103
+ aaa_prompt = PromptTemplate.from_template(aaa_template)
104
+
105
+ # Give me a list of possible questions that Pandas agent can answer well about the dataframe.
106
+ intro_aaa_template = '''
107
+
108
+ Each sentence should be less than 6 words long and clear.
109
+ Provide not more than 8 concrete/ not repetitive questions.
110
+ '''
111
+
112
+ dataframe_description_aaa_template = '''
113
+ Here is the details of the dataframe: {dataframe_details}
114
+ '''
115
+
116
+ do_not_aaa_template = '''
117
+ - DO NOT provide any list that is already captured before in the double quotation "{eda_selection}".
118
+ - Do not provide list that cannot be answered by pandas agent.
119
+ - Do not provide questions about number of rows/ columns, missing values
120
+ '''
121
+
122
+ intro_aaa_prompt = PromptTemplate.from_template(intro_aaa_template)
123
+ dataframe_description_aaa_prompt = PromptTemplate.from_template(dataframe_description_aaa_template)
124
+ do_not_eda_prompt = PromptTemplate.from_template(do_not_aaa_template)
125
+
126
+ input_aaa_prompts = [
127
+ ("intro", intro_aaa_prompt),
128
+ ("dataframe_description", dataframe_description_aaa_prompt),
129
+ ("do_not_list", do_not_eda_prompt),
130
+ ]
131
+
132
+ filled_aaa_prompt = PipelinePromptTemplate(
133
+ final_prompt=aaa_prompt, pipeline_prompts=input_aaa_prompts
134
+ )
135
+
136
+ @st.cache_data
137
+ def aaa_sample_generator(_aaa_chain, _dataframe_details, _eda_selection):
138
+ return _aaa_chain.invoke({'dataframe_details': _dataframe_details, 'eda_selection': _eda_selection})['text']
139
+
140
+ @st.cache_data
141
+ def aaa_answer_generator(_pd_agent, _user_prompt, refreshed):
142
+ st_callback = StreamlitCallbackHandler(st.container())
143
+ answer_to_user = _pd_agent.run(_user_prompt, callbacks=[st_callback])
144
+ st.write(answer_to_user)
helpers/utils.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+
4
def clicked(button):
    """Session-state callback: mark *button* (a key of st.session_state.clicked) as pressed."""
    st.session_state.clicked[button] = True
6
+
7
def checkbox_clicked(button):
    """Toggle the section flag *button* in st.session_state.checkbox_menu.

    Registered as an ``on_change`` callback for the sidebar checkboxes, so
    every change flips the stored boolean.
    """
    # `not` instead of the original `== False` comparison: identical result
    # for booleans, but states the toggle intent directly.
    st.session_state.checkbox_menu[button] = not st.session_state.checkbox_menu[button]
9
+
10
def additional_clicked_fun(button):
    """Increment the click counter *button* in st.session_state.refreshed.

    The counters are passed to cached helpers, presumably as cache-busting
    keys so a new click forces a fresh run — confirm against helpers.llm.
    """
    st.session_state.refreshed[button] += 1
12
+
13
@st.cache_data
def describe_dataframe(df):
    """Summarize *df* as one plain-English string for LLM prompts.

    Covers overall shape plus, per column: dtype, a missing-value note, and
    (for numeric columns) max/min/mean. Cached by Streamlit on the dataframe.
    """
    parts = []
    for col in df.columns:
        dtype = df[col].dtype
        # Coarse missing-value note; the exact count is not reported.
        missing = "has some missing values" if df[col].isnull().sum() > 0 else "has no missing values"
        if pd.api.types.is_numeric_dtype(df[col]):
            parts.append(
                f"{col} (numeric) - type: {dtype}, {missing}, "
                f"max: {df[col].max()}, min: {df[col].min()}, mean: {df[col].mean():.2f}"
            )
        # Add more conditions for other data types (e.g., categorical, datetime) as needed
        else:
            parts.append(f"{col} - type: {dtype}, {missing}")

    joined = "; ".join(parts)
    return f"The dataset has {len(df)} rows and {len(df.columns)} columns. Column details: {joined}."
40
+
41
@st.cache_data
def to_show(df, show_selected, rows_to_show):
    """Return the requested preview slice of *df* and print its size.

    Parameters
    ----------
    df : pandas.DataFrame
    show_selected : str
        One of 'First few rows', 'Last few rows' or 'Random'.
    rows_to_show : int
        Number of rows to include in the preview.

    Side effect: writes a one-line row/column summary to the Streamlit page.
    """
    # Dispatch on bound methods so only the requested slice is computed.
    # The original built head/tail/sample eagerly, so df.sample() could raise
    # ValueError (rows_to_show > len(df)) even when the user asked for head/tail.
    switch_dic = {
        'First few rows': df.head,
        'Last few rows': df.tail,
        'Random': df.sample,
    }
    st.write(f'There are {len(df)} rows and {len(df.columns)} columns.')
    return switch_dic[show_selected](rows_to_show)
helpers/vis.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.prompts.pipeline import PipelinePromptTemplate
2
+ from langchain.prompts import (
3
+ PromptTemplate
4
+ )
5
+ from langchain.chains.llm import LLMChain
6
+ from langchain_core.output_parsers import JsonOutputParser
7
+ import streamlit as st
8
+ import importlib
9
+
10
def prompt_generator_chart_type():
    """Build the pipeline prompt that asks the LLM for Plotly chart specs.

    Returns a PipelinePromptTemplate with input variables 'num_charts',
    'data' and 'topic'; the system half pins the answer to a JSON object of
    the form {"charts": [{"title", "chartType", "parameters"}, ...]}, which
    chart_generator parses with JsonOutputParser.
    """
    # System half: the output contract. Doubled braces {{ }} are literal
    # braces to the template engine; only {num_charts} is a variable.
    system_template = """
The following is a conversation between a Human and an AI assistant expert on data visualization with perfect Python 3 syntax. The human will provide a sample dataset for the AI to use as the source. The real dataset that the human will use with the response of the AI is going to have several more rows. The AI assistant will only reply in the following JSON format:

{{
"charts": [{{'title': string, 'chartType': string, 'parameters': {{...}}}}, ... ]
}}

Instructions:

1. chartType must only contain methods of plotly.express from the Python library Plotly.
2. The format for charType string: plotly.express.chartType.
3. For each chartType, parameters must contain the value to be used for all parameters of that plotly.express method.
4. There should 4 parameters for each chart.
5. Do not include "data_frame" in the parameters.
6. Features in 'parameters' should not contain a space character. Joining more than one word should be done by using '_'.
7. There should be {num_charts} charts in total.
"""
    system_message_prompt = PromptTemplate.from_template(system_template)

    # Human half: sample rows plus the user's topic of interest.
    human_template = """
Human:
This is the dataset:

{data}

Create chart that analyze this specific topic: {topic}
"""
    human_message_prompt = PromptTemplate.from_template(human_template)

    # Glue the two halves together, system first.
    full_template = """{system_prompt}

{human_prompt}
"""
    full_prompt = PromptTemplate.from_template(full_template)

    input_prompts = [
        ("system_prompt", system_message_prompt),
        ("human_prompt", human_message_prompt),
    ]
    pipeline_prompt = PipelinePromptTemplate(
        final_prompt=full_prompt, pipeline_prompts=input_prompts, input_variables=['num_charts','data', 'topic']
    )
    return pipeline_prompt
54
+
55
+
56
def prompt_generator_feature_engineering():
    """Build the pipeline prompt asking the LLM for pandas feature-engineering code.

    Returns a PipelinePromptTemplate with input variables 'data',
    'column_names' and 'vis_specs'; the model must reply with raw pandas
    statements (';'-separated) ending in 'final_df = df', which vis_generator
    then exec()s against the uploaded dataframe.
    """
    # System half: rules for the code the model must emit.
    system_template = """
Instructions:
1. Read the visualization specs as given to you. Check on all variables in 'parameters'.
2. If any of the variables in 'parameters' does not appear as a column in the original dataset, return pandas function which transforms the original dataset into a new dataset containing ALL variables in parameters.
3. Return this pandas operations in string form. Only return the string to execute without any explanation!
4. If there are >1 line of code, split them with ';'
5. Sometimes you need to rename the column to ensure ALL variables in 'parameters' are represented exactly in the final_df dataset.
6. Always end the answer with 'final_df = df'

Assumptions:
1. Assume that original dataframe is given as 'df'
2. Assume that the columns in the original dataframe might not have the right dtypes. Adjust it first to accept the right dtypes.

Do not do this:
1. Do not use python``` code here ``` format. Directly return pandas function in text format.
"""
    system_message_prompt = PromptTemplate.from_template(system_template)

    # Human half: sample rows, original column names, and the chart spec to satisfy.
    human_template = """
Human:
This is the dataset:
{data}
Please perform sorting of the data!

This is the column names in the original dataset:
{column_names}

This is the visualization specs:
{vis_specs}
"""
    human_message_prompt = PromptTemplate.from_template(human_template)

    # Glue the two halves together, system first.
    full_template = """{system_prompt}

{human_prompt}
"""
    full_prompt = PromptTemplate.from_template(full_template)

    input_prompts = [
        ("system_prompt", system_message_prompt),
        ("human_prompt", human_message_prompt),
    ]
    pipeline_prompt = PipelinePromptTemplate(
        final_prompt=full_prompt, pipeline_prompts=input_prompts, input_variables=['data', 'column_names', 'vis_specs']
    )
    return pipeline_prompt
103
+
104
def chart_generator(llm, df, user_question_vis):
    """Ask the LLM for Plotly chart specs answering *user_question_vis*.

    Sends the first 10 rows of *df* as context and returns the parsed
    'charts' list (dicts with 'title', 'chartType', 'parameters').
    """
    spec_chain = LLMChain(
        llm=llm,
        prompt=prompt_generator_chart_type(),
        output_parser=JsonOutputParser(),
        output_key='vis_specs',
    )

    payload = {
        "data": df.head(10),
        "topic": user_question_vis,
        "num_charts": 1,
    }
    parsed = spec_chain.run(payload)

    return parsed['charts']
118
+
119
def vis_generator(chart, llm, df):
    """Run LLM-generated feature engineering for *chart*, then render it with Plotly.

    Parameters
    ----------
    chart : dict
        One spec from chart_generator: {'title', 'chartType', 'parameters'}.
    llm : language model passed to the feature-engineering chain.
    df : pandas.DataFrame to transform and plot.

    Side effects: writes status messages and the final chart to the Streamlit page.
    """
    import pandas as pd  # local: the exec'd snippet may reference `pd`, which this module does not import

    params = chart['parameters']
    fe_chain = LLMChain(llm=llm, prompt=prompt_generator_feature_engineering(), output_key='final_output')
    fe_code = fe_chain.run({
        "data": df.head(10),
        "column_names": df.columns,
        "vis_specs": chart
    })
    # st.write(fe_code)

    final_df = None
    # SECURITY: fe_code is LLM-generated Python executed verbatim — only run
    # this with trusted models and data; consider sandboxing/validation.
    # BUG FIX: use an explicit namespace. Inside a function, bare exec(fe_code)
    # cannot rebind the local `df`, so the engineered frame was silently
    # discarded and the original df plotted instead.
    exec_ns = {"df": df.copy(), "pd": pd}
    try:
        exec(fe_code, exec_ns)
        st.write('Successfully Executed Feature Engineering Script')
        # The prompt instructs the snippet to end with 'final_df = df'.
        final_df = exec_ns.get("final_df", exec_ns.get("df"))
    except Exception as e:
        st.write(f"Error during Feature Engineering Execution: {e}")

    if final_df is not None:
        params['data_frame'] = final_df

        # Resolve e.g. 'plotly.express.bar' -> plotly.express.bar and call it.
        chart_type = chart['chartType']
        px_module = importlib.import_module("plotly.express")
        chart_function = getattr(px_module, chart_type.split('.')[-1])
        fig = chart_function(**params)

        st.plotly_chart(fig, use_container_width=True)

    else:
        st.write("final_df was not defined.")
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit==1.32.2
2
+ python-dotenv==1.0.1
3
+ pandas
4
+ langchain-openai==0.1.1
5
+ langchain
6
+ langchain-experimental
7
+ tabulate==0.9.0
8
+ langchainhub
9
+ duckduckgo-search==5.2.2
10
+ google-search-results==2.4.2
11
+ matplotlib==3.8.4
12
+ seaborn==0.13.2
13
+ langchain-community
14
+ plotly==5.21.0
15
+ # python-dotenv intentionally omitted here: already pinned above as python-dotenv==1.0.1
16
+ langchain-groq