# File size: 4,988 Bytes
# f322558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import streamlit as st
from langchain.prompts import PromptTemplate, PipelinePromptTemplate
from langchain_community.callbacks import StreamlitCallbackHandler

# Prompt text for a first-pass exploration of a dataframe. `{salutation}` is the
# only placeholder; the numbered list enumerates the overview steps requested.
first_look_prompt = '''
    {salutation}, You need to explore the dataframe in a few indicated steps below. Please indicate clearly what is the steps being done.
    1. Data Overview: 
    1.1. Show first five rows of the data
    1.2. Show the columns name
    1.3. Show the missing values and duplicated for each column
    1.4. Show Data summary: df.describe()
    1.5. Calculate correlation in the data
    1.6. Identify potential outliers
    1.7. Identify potential new features to include
'''

# Template object built from the prompt text above.
first_look_template = PromptTemplate.from_template(first_look_prompt)

def text_runner(_agent, df, text):
    """Display a prompt, then display the agent's response to it.

    Args:
        _agent: Object exposing ``run(text)``; its return value is rendered.
        df: Not used by this function.
        text: Prompt written to the page and then passed to the agent.
    """
    st.write(text)
    reply = _agent.run(text)
    st.write(reply)

def function_runner(_agent, text, function):
    """Display a caption followed by an already-computed value.

    Args:
        _agent: Not used by this function.
        text: Caption written first.
        function: Value written after the caption. Despite the name it is
            rendered as-is and never called.
    """
    for item in (text, function):
        st.write(item)

@st.cache_data
def first_look_function(df, _agent):
    """Render the "Data Overview" section for a dataframe.

    Writes a heading, asks the agent two questions, shows ``df.describe()``
    directly, then asks the agent two more questions.

    Args:
        df: Dataframe being explored; must provide ``describe()``.
        _agent: Agent forwarded to ``text_runner`` / ``function_runner``. The
            leading underscore keeps Streamlit from hashing it for the cache.

    Returns:
        None. All output goes through ``st.write``.
    """
    st.write('**Data Overview**')

    # Questions answered by the agent before the summary table.
    for question in (
        "Show columns name",
        "Show the missing values and duplicated for each column",
    ):
        text_runner(_agent, df, question)

    # The summary is computed locally rather than requested from the agent.
    function_runner(_agent, "Show data summary", df.describe())

    # Questions answered by the agent after the summary table.
    for question in (
        "Identify potential outliers",
        "Identify potential new features to include",
    ):
        text_runner(_agent, df, question)


# Prompt asking for short one-line steps for the given `{question}`.
sb_template = PromptTemplate.from_template("Output simple one liner steps for: {question}")

# Skeleton of the EDA prompt: three named slots, each filled by one of the
# sub-prompts defined below.
eda_template = '''
{intro}

{do_not_list}

{dataframe_description}
'''
eda_prompt = PromptTemplate.from_template(eda_template)

# Opening instruction: asks for short bulleted EDA steps and shows the
# expected bullet format.
intro_eda_template = '''
Give me step by step idea for an EDA provided that this is the details of the dataframe. 
The answer should be in bullet form, each step should be less than 5 words. 
Example format of the list (start with '-', ends with '.'): 
- Identify missing values.
'''

# Restrictions on what the generated step list may contain.
do_not_eda_template = '''
- Do not show backend work such as import libraries, load dataframe.
- Do not provide the answer to the EDA, i.e. x columns, y rows. 
- Do not provide any suggestion related to visualization.
- Provide not more than 8 concrete/ not repetitive steps.
- Do not show Feature Engineering steps
- Do not generate something that we couldn't answer based on the existing dataframe, i.e. corr values when there is no numerical columns in the dataframe
'''

# Section carrying the caller-supplied `{dataframe_details}` value.
dataframe_description_template = '''
Here is the details of the dataframe: {dataframe_details}
'''

intro_eda_prompt = PromptTemplate.from_template(intro_eda_template)
do_not_eda_prompt = PromptTemplate.from_template(do_not_eda_template)
dataframe_description_eda_prompt = PromptTemplate.from_template(dataframe_description_template)

# (slot name, sub-prompt) pairs; the slot names match the placeholders in
# eda_template.
input_eda_prompts = [
    ("intro", intro_eda_prompt),
    ("do_not_list", do_not_eda_prompt),
    ("dataframe_description", dataframe_description_eda_prompt),
]

# Pipeline prompt that feeds the sub-prompts into eda_prompt.
filled_eda_prompt = PipelinePromptTemplate(
    final_prompt=eda_prompt, pipeline_prompts=input_eda_prompts
)

@st.cache_data
def eda_selection_generator(_eda_chain, _df_details):
    """Invoke the EDA chain with the dataframe details and return its text.

    Args:
        _eda_chain: Object exposing ``invoke(dict)`` whose result can be
            indexed with ``'text'``.
        _df_details: Value passed to the chain as ``dataframe_details``.

    Returns:
        The ``'text'`` entry of the chain's result.
    """
    # NOTE(review): both parameters are underscore-prefixed, which Streamlit
    # leaves out of the cache key -- confirm that reusing one cached result
    # for different dataframe details is intended.
    payload = {'dataframe_details': _df_details}
    result = _eda_chain.invoke(payload)
    return result['text']

@st.cache_data
def individual_eda(_pd_agent, _eda_selected, peda_click_count):
    """Run one selected EDA step through the agent and display the result.

    Args:
        _pd_agent: Agent exposing ``run(prompt, callbacks=...)``.
        _eda_selected: Prompt passed to the agent.
        peda_click_count: Not read in the body; it is the only parameter
            without a leading underscore, so presumably it serves as the
            cache key -- verify against the caller.
    """
    # The callback handler renders into a fresh container.
    handler = StreamlitCallbackHandler(st.container())
    outcome = _pd_agent.run(_eda_selected, callbacks=[handler])
    st.write(outcome)


# Skeleton of the question-suggestion ("aaa") prompt: three named slots, each
# filled by one of the sub-prompts defined below.
aaa_template = '''
{intro}

{dataframe_description}

{do_not_list}
'''
aaa_prompt = PromptTemplate.from_template(aaa_template)

# NOTE(review): the sentence below reads like the leading instruction of the
# intro, yet it sits outside the template, so the intro currently holds only
# the formatting constraints -- confirm this is intended.
# Give me a list of possible questions that Pandas agent can answer well about the dataframe.
intro_aaa_template = '''

Each sentence should be less than 6 words long and clear. 
Provide not more than 8 concrete/ not repetitive questions.
'''

# Section carrying the caller-supplied `{dataframe_details}` value.
dataframe_description_aaa_template = '''
Here is the details of the dataframe: {dataframe_details}
'''

# Restrictions on the suggested questions; `{eda_selection}` is the text the
# suggestions must not repeat.
do_not_aaa_template = '''
- DO NOT provide any list that is already captured before in the double quotation "{eda_selection}".
- Do not provide list that cannot be answered by pandas agent.
- Do not provide questions about number of rows/ columns, missing values
'''

intro_aaa_prompt = PromptTemplate.from_template(intro_aaa_template)
dataframe_description_aaa_prompt = PromptTemplate.from_template(dataframe_description_aaa_template)
# Bound to its own name: assigning this to `do_not_eda_prompt` would overwrite
# the EDA restriction prompt at module level.
do_not_aaa_prompt = PromptTemplate.from_template(do_not_aaa_template)

# (slot name, sub-prompt) pairs; the slot names match the placeholders in
# aaa_template.
input_aaa_prompts = [
    ("intro", intro_aaa_prompt),
    ("dataframe_description", dataframe_description_aaa_prompt),
    ("do_not_list", do_not_aaa_prompt),
]

# Pipeline prompt that feeds the sub-prompts into aaa_prompt.
filled_aaa_prompt = PipelinePromptTemplate(
    final_prompt=aaa_prompt, pipeline_prompts=input_aaa_prompts
)

@st.cache_data
def aaa_sample_generator(_aaa_chain, _dataframe_details, _eda_selection):
    """Invoke the question-suggestion chain and return its text.

    Args:
        _aaa_chain: Object exposing ``invoke(dict)`` whose result can be
            indexed with ``'text'``.
        _dataframe_details: Value passed to the chain as ``dataframe_details``.
        _eda_selection: Value passed to the chain as ``eda_selection``.

    Returns:
        The ``'text'`` entry of the chain's result.
    """
    # NOTE(review): every parameter is underscore-prefixed, which Streamlit
    # leaves out of the cache key -- confirm that reusing one cached result
    # for different inputs is intended.
    payload = {
        'dataframe_details': _dataframe_details,
        'eda_selection': _eda_selection,
    }
    result = _aaa_chain.invoke(payload)
    return result['text']

@st.cache_data
def aaa_answer_generator(_pd_agent, _user_prompt, refreshed):
    """Run the user's prompt through the agent and display the answer.

    Args:
        _pd_agent: Agent exposing ``run(prompt, callbacks=...)``.
        _user_prompt: Prompt passed to the agent.
        refreshed: Not read in the body; it is the only parameter without a
            leading underscore, so presumably it serves as the cache key --
            verify against the caller.
    """
    # The callback handler renders into a fresh container.
    handler = StreamlitCallbackHandler(st.container())
    st.write(_pd_agent.run(_user_prompt, callbacks=[handler]))