Spaces:
Sleeping
Sleeping
File size: 4,988 Bytes
f322558 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import streamlit as st
from langchain.prompts import PromptTemplate, PipelinePromptTemplate
from langchain_community.callbacks import StreamlitCallbackHandler
# Prompt text for a first-pass walkthrough of the dataframe. `{salutation}` is
# the only placeholder; the numbered checklist is sent to the model verbatim.
# NOTE(review): first_look_function (defined further down) does not reference
# this template and runs only steps 1.2, 1.3, 1.4, 1.6 and 1.7 -- confirm
# whether steps 1.1 and 1.5 are meant to be executed as well.
first_look_prompt = '''
{salutation}, You need to explore the dataframe in a few indicated steps below. Please indicate clearly what is the steps being done.
1. Data Overview:
1.1. Show first five rows of the data
1.2. Show the columns name
1.3. Show the missing values and duplicated for each column
1.4. Show Data summary: df.describe()
1.5. Calculate correlation in the data
1.6. Identify potential outliers
1.7. Identify potential new features to include
'''
# Template object built from the text above.
first_look_template = PromptTemplate.from_template(first_look_prompt)
def text_runner(_agent, df, text):
    """Echo ``text`` on the page, then render the agent's reply to it.

    Args:
        _agent: Object exposing ``run(text)``; whatever it returns is written
            to the page unchanged.
        df: Not used by this function; kept in the signature so existing
            calls keep working.
        text: Instruction shown to the user and forwarded to the agent.
    """
    st.write(text)
    reply = _agent.run(text)
    st.write(reply)
def function_runner(_agent, text, function):
    """Write a caption followed by an already-computed value.

    Args:
        _agent: Not used by this function; kept for signature parity with
            ``text_runner``.
        text: Caption written first.
        function: Value written second. Despite the name it is displayed
            as-is and never called here.
    """
    for item in (text, function):
        st.write(item)
@st.cache_data
def first_look_function(df, _agent):
    """Render the "Data Overview" section for ``df``.

    Asks the agent about column names and missing/duplicated values, shows
    ``df.describe()`` directly, then asks the agent about outliers and
    candidate new features. Everything is written to the Streamlit page;
    nothing is returned.

    Args:
        df: Dataframe to summarise; must provide ``describe()``.
        _agent: Agent handed to ``text_runner`` / ``function_runner``. The
            leading underscore keeps it out of the ``st.cache_data`` key.

    Returns:
        None.
    """
    st.write('**Data Overview**')

    # Agent-answered questions that come before the summary table.
    for question in (
        "Show columns name",
        "Show the missing values and duplicated for each column",
    ):
        text_runner(_agent, df, question)

    # The summary comes straight from pandas rather than from the agent.
    function_runner(_agent, "Show data summary", df.describe())

    # Agent-answered questions that follow the summary table.
    for question in (
        "Identify potential outliers",
        "Identify potential new features to include",
    ):
        text_runner(_agent, df, question)

    return None
# Single-variable prompt: asks for short one-line steps for `{question}`.
sb_template = PromptTemplate.from_template("Output simple one liner steps for: {question}")
# Skeleton of the EDA-suggestion prompt: three slots, each filled by one of
# the section prompts defined below (`intro`, `do_not_list`,
# `dataframe_description`), in this order.
eda_template = '''
{intro}
{do_not_list}
{dataframe_description}
'''
# Final prompt later handed to PipelinePromptTemplate as `final_prompt`.
eda_prompt = PromptTemplate.from_template(eda_template)
# Section text for the `intro` slot: states the task (step-by-step EDA ideas)
# and the required bullet format. Contains no template variables.
intro_eda_template = '''
Give me step by step idea for an EDA provided that this is the details of the dataframe.
The answer should be in bullet form, each step should be less than 5 words.
Example format of the list (start with '-', ends with '.'):
- Identify missing values.
'''
# Section text for the `do_not_list` slot: constraints on what the model may
# include in its list. Contains no template variables.
do_not_eda_template = '''
- Do not show backend work such as import libraries, load dataframe.
- Do not provide the answer to the EDA, i.e. x columns, y rows.
- Do not provide any suggestion related to visualization.
- Provide not more than 8 concrete/ not repetitive steps.
- Do not show Feature Engineering steps
- Do not generate something that we couldn't answer based on the existing dataframe, i.e. corr values when there is no numerical columns in the dataframe
'''
# Section text for the `dataframe_description` slot; `{dataframe_details}` is
# the only value the caller supplies for the EDA prompt.
dataframe_description_template = '''
Here is the details of the dataframe: {dataframe_details}
'''
# One PromptTemplate per section of the EDA prompt.
intro_eda_prompt = PromptTemplate.from_template(intro_eda_template)
do_not_eda_prompt = PromptTemplate.from_template(do_not_eda_template)
dataframe_description_eda_prompt = PromptTemplate.from_template(dataframe_description_template)

# (slot name in eda_template, prompt whose output fills that slot)
input_eda_prompts = [
    ("intro", intro_eda_prompt),
    ("do_not_list", do_not_eda_prompt),
    ("dataframe_description", dataframe_description_eda_prompt),
]

# Complete EDA prompt: the section prompts feed the slots of eda_prompt.
filled_eda_prompt = PipelinePromptTemplate(
    pipeline_prompts=input_eda_prompts,
    final_prompt=eda_prompt,
)
@st.cache_data
def eda_selection_generator(_eda_chain, _df_details):
    """Run the EDA chain on the dataframe details and return its text output.

    Args:
        _eda_chain: Chain exposing ``invoke(dict)`` whose result is indexable
            by ``'text'``.
        _df_details: Value passed to the chain as ``dataframe_details``.

    Returns:
        The ``'text'`` entry of the chain's result.

    NOTE(review): both parameters start with an underscore, which
    ``st.cache_data`` excludes from its cache key, so later calls with
    different arguments would presumably get the first cached result --
    confirm this is intended.
    """
    chain_input = {'dataframe_details': _df_details}
    result = _eda_chain.invoke(chain_input)
    return result['text']
@st.cache_data
def individual_eda(_pd_agent, _eda_selected, peda_click_count):
    """Run one selected EDA step through the agent and write the answer.

    Args:
        _pd_agent: Agent exposing ``run(prompt, callbacks=[...])``.
        _eda_selected: Prompt text forwarded to the agent.
        peda_click_count: Not read in the body; it is the only parameter
            without a leading underscore, so it is the only argument that
            takes part in the ``st.cache_data`` key.
    """
    # Callback handler bound to a fresh container on the page.
    output_area = st.container()
    st_callback = StreamlitCallbackHandler(output_area)
    answer = _pd_agent.run(_eda_selected, callbacks=[st_callback])
    st.write(answer)
# Skeleton of the follow-up-question ("aaa") prompt: three slots, each filled
# by one of the section prompts defined below.
aaa_template = '''
{intro}
{dataframe_description}
{do_not_list}
'''
# Final prompt later handed to PipelinePromptTemplate as `final_prompt`.
aaa_prompt = PromptTemplate.from_template(aaa_template)
# NOTE(review): the next line reads like the opening instruction of
# intro_aaa_template that was commented out; without it the intro never states
# what should be produced -- confirm whether it should be part of the prompt.
# Give me a list of possible questions that Pandas agent can answer well about the dataframe.
# Section text for the `intro` slot: length and count limits for the questions.
intro_aaa_template = '''
Each sentence should be less than 6 words long and clear.
Provide not more than 8 concrete/ not repetitive questions.
'''
# Section text for the `dataframe_description` slot; takes `{dataframe_details}`.
dataframe_description_aaa_template = '''
Here is the details of the dataframe: {dataframe_details}
'''
# Section text for the `do_not_list` slot; `{eda_selection}` carries the list
# that the model is told not to repeat.
do_not_aaa_template = '''
- DO NOT provide any list that is already captured before in the double quotation "{eda_selection}".
- Do not provide list that cannot be answered by pandas agent.
- Do not provide questions about number of rows/ columns, missing values
'''
# One PromptTemplate per section of the follow-up-question ("aaa") prompt.
intro_aaa_prompt = PromptTemplate.from_template(intro_aaa_template)
dataframe_description_aaa_prompt = PromptTemplate.from_template(dataframe_description_aaa_template)
# Given its own *_aaa_* name so it does not rebind the module-level
# do_not_eda_prompt that belongs to the EDA pipeline.
do_not_aaa_prompt = PromptTemplate.from_template(do_not_aaa_template)

# (slot name in aaa_template, prompt whose output fills that slot)
input_aaa_prompts = [
    ("intro", intro_aaa_prompt),
    ("dataframe_description", dataframe_description_aaa_prompt),
    ("do_not_list", do_not_aaa_prompt),
]

# Complete prompt; the section templates leave `dataframe_details` and
# `eda_selection` as the values the caller must supply.
filled_aaa_prompt = PipelinePromptTemplate(
    final_prompt=aaa_prompt, pipeline_prompts=input_aaa_prompts
)
@st.cache_data
def aaa_sample_generator(_aaa_chain, _dataframe_details, _eda_selection):
    """Run the follow-up-question chain and return its text output.

    Args:
        _aaa_chain: Chain exposing ``invoke(dict)`` whose result is indexable
            by ``'text'``.
        _dataframe_details: Value passed to the chain as ``dataframe_details``.
        _eda_selection: Value passed to the chain as ``eda_selection``.

    Returns:
        The ``'text'`` entry of the chain's result.

    NOTE(review): every parameter starts with an underscore, which
    ``st.cache_data`` excludes from its cache key, so later calls with
    different arguments would presumably get the first cached result --
    confirm this is intended.
    """
    chain_input = {
        'dataframe_details': _dataframe_details,
        'eda_selection': _eda_selection,
    }
    result = _aaa_chain.invoke(chain_input)
    return result['text']
@st.cache_data
def aaa_answer_generator(_pd_agent, _user_prompt, refreshed):
    """Send the user's question to the agent and write the answer to the page.

    Args:
        _pd_agent: Agent exposing ``run(prompt, callbacks=[...])``.
        _user_prompt: Question text forwarded to the agent.
        refreshed: Not read in the body; it is the only parameter without a
            leading underscore, so it is the only argument that takes part in
            the ``st.cache_data`` key.
    """
    # Callback handler bound to a fresh container on the page.
    st_callback = StreamlitCallbackHandler(st.container())
    answer_to_user = _pd_agent.run(_user_prompt, callbacks=[st_callback])
    # The stray trailing "|" after this call was a syntax error and is removed.
    st.write(answer_to_user)