# Page header captured along with the source (not program code): Spaces: Sleeping
import streamlit as st
from langchain.prompts import PromptTemplate, PipelinePromptTemplate
from langchain_community.callbacks import StreamlitCallbackHandler
# Prompt text for a guided first look at the dataframe.
# {salutation} is the only template variable; the numbered steps below are
# part of the prompt text itself.
first_look_prompt = '''
{salutation}, You need to explore the dataframe in a few indicated steps below. Please indicate clearly what is the steps being done.
1. Data Overview:
1.1. Show first five rows of the data
1.2. Show the columns name
1.3. Show the missing values and duplicated for each column
1.4. Show Data summary: df.describe()
1.5. Calculate correlation in the data
1.6. Identify potential outliers
1.7. Identify potential new features to include
'''
# Template built from first_look_prompt; it expects a `salutation` value.
first_look_template = PromptTemplate.from_template(first_look_prompt)
def text_runner(_agent, df, text):
    """Write ``text`` to the Streamlit page, then the agent's answer to it.

    Args:
        _agent: Object exposing ``run(text)``; its return value is written
            to the page as-is.
        df: Not used in the body; kept so existing call sites stay valid.
        text: Instruction shown to the user and passed to ``_agent.run``.
    """
    st.write(text)
    st.write(_agent.run(text))
def function_runner(_agent, text, function):
    """Write a caption followed by an already-computed value to the page.

    Args:
        _agent: Not used in the body; kept so the signature parallels
            ``text_runner`` and existing call sites stay valid.
        text: Caption written first.
        function: Value written second. Despite the name it is not called
            here; callers pass a computed result.
    """
    st.write(text)
    st.write(function)
def first_look_function(df, _agent):
    """Render the "Data Overview" section on the Streamlit page.

    Steps run in a fixed order: column names, missing/duplicated values,
    ``df.describe()``, outliers, then candidate new features. Every step
    except the summary is answered by ``_agent``; the summary is computed
    directly from ``df``.

    Args:
        df: Dataframe-like object providing ``describe()``.
        _agent: Agent forwarded to ``text_runner`` / ``function_runner``.

    Returns:
        None.
    """
    st.write('**Data Overview**')
    text_runner(_agent, df, "Show columns name")
    text_runner(_agent, df, "Show the missing values and duplicated for each column")
    # The summary comes straight from pandas rather than from the agent.
    function_runner(_agent, "Show data summary", df.describe())
    text_runner(_agent, df, "Identify potential outliers")
    text_runner(_agent, df, "Identify potential new features to include")
# Template asking for short one-line steps for a given {question}.
sb_template = PromptTemplate.from_template(
    "Output simple one liner steps for: {question}"
)
# Skeleton of the EDA-suggestion prompt. Each placeholder is filled by the
# sub-prompt of the same name in the pipeline assembled further down.
eda_template = '''
{intro}
{do_not_list}
{dataframe_description}
'''
eda_prompt = PromptTemplate.from_template(eda_template)
# Opening instruction of the EDA prompt: what to produce and in which format.
intro_eda_template = '''
Give me step by step idea for an EDA provided that this is the details of the dataframe.
The answer should be in bullet form, each step should be less than 5 words.
Example format of the list (start with '-', ends with '.'):
- Identify missing values.
'''
# Restrictions on the EDA steps the model may suggest.
do_not_eda_template = '''
- Do not show backend work such as import libraries, load dataframe.
- Do not provide the answer to the EDA, i.e. x columns, y rows.
- Do not provide any suggestion related to visualization.
- Provide not more than 8 concrete/ not repetitive steps.
- Do not show Feature Engineering steps
- Do not generate something that we couldn't answer based on the existing dataframe, i.e. corr values when there is no numerical columns in the dataframe
'''
# Carries the dataframe description; {dataframe_details} is supplied at call time.
dataframe_description_template = '''
Here is the details of the dataframe: {dataframe_details}
'''
# One PromptTemplate per section of the EDA prompt.
intro_eda_prompt = PromptTemplate.from_template(intro_eda_template)
do_not_eda_prompt = PromptTemplate.from_template(do_not_eda_template)
dataframe_description_eda_prompt = PromptTemplate.from_template(
    dataframe_description_template
)
# (placeholder name in eda_template, sub-prompt that fills it)
input_eda_prompts = [
    ("intro", intro_eda_prompt),
    ("do_not_list", do_not_eda_prompt),
    ("dataframe_description", dataframe_description_eda_prompt),
]
# Final EDA prompt: the sub-prompts above are composed into eda_prompt.
filled_eda_prompt = PipelinePromptTemplate(
    final_prompt=eda_prompt, pipeline_prompts=input_eda_prompts
)
def eda_selection_generator(_eda_chain, _df_details):
    """Invoke the EDA chain and return the ``'text'`` entry of its result.

    Args:
        _eda_chain: Object exposing ``invoke(dict)`` whose result supports
            ``['text']`` lookup.
        _df_details: Dataframe description passed to the chain under the
            ``'dataframe_details'`` key.

    Returns:
        Whatever the chain stored under ``'text'``.
    """
    # NOTE(review): the leading underscores look like Streamlit's convention
    # for arguments excluded from cache hashing - confirm against the caller.
    response = _eda_chain.invoke({'dataframe_details': _df_details})
    return response['text']
def individual_eda(_pd_agent, _eda_selected, peda_click_count):
    """Run one selected EDA step through the agent and write the answer.

    A fresh Streamlit container is handed to ``StreamlitCallbackHandler`` and
    the handler is passed to the agent run as a callback.

    Args:
        _pd_agent: Object exposing ``run(prompt, callbacks=[...])``.
        _eda_selected: The EDA step text sent to the agent.
        peda_click_count: Not used in the body. Presumably a rerun or cache
            key supplied by the caller - TODO confirm.
    """
    st_callback = StreamlitCallbackHandler(st.container())
    answer = _pd_agent.run(_eda_selected, callbacks=[st_callback])
    st.write(answer)
# Skeleton of the sample-question ("aaa") prompt. Each placeholder is filled
# by the sub-prompt of the same name in the pipeline assembled further down.
aaa_template = '''
{intro}
{dataframe_description}
{do_not_list}
'''
aaa_prompt = PromptTemplate.from_template(aaa_template)
# Give me a list of possible questions that Pandas agent can answer well about the dataframe.
# NOTE(review): the sentence above reads like a disabled opening line of the
# intro below - confirm whether it was meant to be part of the prompt.
intro_aaa_template = '''
Each sentence should be less than 6 words long and clear.
Provide not more than 8 concrete/ not repetitive questions.
'''
# Carries the dataframe description; {dataframe_details} is supplied at call time.
dataframe_description_aaa_template = '''
Here is the details of the dataframe: {dataframe_details}
'''
# Restrictions on the generated questions; {eda_selection} holds the
# previously produced EDA list that must not be repeated.
do_not_aaa_template = '''
- DO NOT provide any list that is already captured before in the double quotation "{eda_selection}".
- Do not provide list that cannot be answered by pandas agent.
- Do not provide questions about number of rows/ columns, missing values
'''
# One PromptTemplate per section of the sample-question prompt.
intro_aaa_prompt = PromptTemplate.from_template(intro_aaa_template)
dataframe_description_aaa_prompt = PromptTemplate.from_template(
    dataframe_description_aaa_template
)
# Bound to its own name so that `do_not_eda_prompt`, which holds the EDA
# restrictions, is not overwritten with the question restrictions.
do_not_aaa_prompt = PromptTemplate.from_template(do_not_aaa_template)
# (placeholder name in aaa_template, sub-prompt that fills it)
input_aaa_prompts = [
    ("intro", intro_aaa_prompt),
    ("dataframe_description", dataframe_description_aaa_prompt),
    ("do_not_list", do_not_aaa_prompt),
]
# Final sample-question prompt: the sub-prompts above are composed into aaa_prompt.
filled_aaa_prompt = PipelinePromptTemplate(
    final_prompt=aaa_prompt, pipeline_prompts=input_aaa_prompts
)
def aaa_sample_generator(_aaa_chain, _dataframe_details, _eda_selection):
    """Invoke the sample-question chain and return its ``'text'`` entry.

    Args:
        _aaa_chain: Object exposing ``invoke(dict)`` whose result supports
            ``['text']`` lookup.
        _dataframe_details: Passed to the chain as ``'dataframe_details'``.
        _eda_selection: Passed to the chain as ``'eda_selection'``.

    Returns:
        Whatever the chain stored under ``'text'``.
    """
    chain_inputs = {
        'dataframe_details': _dataframe_details,
        'eda_selection': _eda_selection,
    }
    return _aaa_chain.invoke(chain_inputs)['text']
def aaa_answer_generator(_pd_agent, _user_prompt, refreshed):
    """Answer a user question with the agent and write the result.

    A fresh Streamlit container is handed to ``StreamlitCallbackHandler`` and
    the handler is passed to the agent run as a callback.

    Args:
        _pd_agent: Object exposing ``run(prompt, callbacks=[...])``.
        _user_prompt: The question sent to the agent.
        refreshed: Not used in the body. Presumably a rerun or cache key
            supplied by the caller - TODO confirm.
    """
    st_callback = StreamlitCallbackHandler(st.container())
    answer_to_user = _pd_agent.run(_user_prompt, callbacks=[st_callback])
    st.write(answer_to_user)