jsds003 committed on
Commit
27a10b6
·
1 Parent(s): 3135c28

Added the application's contents to the initial commit

Browse files
Files changed (2) hide show
  1. requirements.txt +3 -1
  2. src/streamlit_app.py +305 -37
requirements.txt CHANGED
@@ -1,3 +1,5 @@
1
  altair
2
  pandas
3
- streamlit
 
 
 
1
  altair
2
  pandas
3
+ streamlit
4
+ transformers
5
+ pygwalker
src/streamlit_app.py CHANGED
@@ -1,40 +1,308 @@
1
- import altair as alt
2
- import numpy as np
3
  import pandas as pd
 
4
  import streamlit as st
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pandas as pd
2
+ from transformers import pipeline
3
  import streamlit as st
4
+ from pygwalker.api.streamlit import StreamlitRenderer
5
+ import re
6
+ from typing import List, Any
7
 
8
@st.cache_resource
def getPipeline():
    """Create the text-generation pipeline used by the app.

    The function is decorated with ``st.cache_resource``, which Streamlit
    documents as keeping the returned object and handing the same instance
    back on later calls instead of running the body again.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        ``"text-generation"`` task and the model id below.
    """
    model_id = "nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1"
    generator = pipeline("text-generation", model=model_id)
    return generator
11
+
12
+
13
@st.cache_resource
def get_pyg_renderer(df: pd.DataFrame):
    """Build the PyGWalker ``StreamlitRenderer`` for ``df``.

    Args:
        df: The DataFrame to explore.

    Returns:
        A ``StreamlitRenderer`` wrapping ``df``.
    """
    # Render the frame that was passed in. The previous body ignored ``df`` and
    # read ``st.session_state.df``, so the returned renderer could disagree
    # with the argument the caller supplied.
    return StreamlitRenderer(df)


# Shared text-generation pipeline; the agent functions in this module call it.
pipe = getPipeline()
18
+
19
def FileSummaryHelper(df: pd.DataFrame) -> str:
    """Gather baseline information about the dataset.

    Args:
        df: The DataFrame to summarise.

    Returns:
        A multi-line text block listing, for every column, its label, dtype
        and percentage of missing values, followed by the total row count and
        the number of duplicated rows.
    """
    # DataFrame.items() yields one (label, Series) pair per physical column, so
    # a frame with repeated column labels is summarised column by column
    # instead of failing on ``df[label].dtype`` (which is a DataFrame there).
    colSummaries = [
        f"'{label}' | Data Type: {series.dtype} | "
        f"Missing Percentage: {series.isna().mean() * 100:.2f}%"
        for label, series in df.items()
    ]
    colTypesAndNulls = "\n".join(colSummaries)

    # keep=False flags every member of a duplicate group, not only the repeats.
    duplicateVals = df.duplicated(keep=False).sum()
    totalVals = len(df)

    return (
        "\n"
        "The columns of the data have the following datatypes and missing value percentages:\n"
        f"{colTypesAndNulls}\n"
        "\n"
        f"The dataset has {totalVals} total rows.\n"
        "\n"
        f"The dataset has {duplicateVals} duplicated rows.\n"
    )
39
+
40
def FileDescriptionAgent(userDesc:str, df: pd.DataFrame) -> str:
    """Generate a short description of the dataset from an initial analysis.

    Args:
        userDesc: Optional free-text description supplied by the user. When it
            is empty, nothing about it is added to the prompt.
        df: The DataFrame to describe.

    Returns:
        The ``generated_text`` of the first item returned by the module-level
        ``pipe`` for the assembled chat messages.
    """
    userDesc = "" if not userDesc else "I have described the dataset as follows: " + userDesc
    fileSummary = FileSummaryHelper(df)

    # Column labels are not guaranteed to be strings (e.g. integer headers);
    # str.join would raise TypeError on them, so convert each label first.
    columnList = ", ".join(str(label) for label in df.columns)

    prompt = (
        f" You are given a DataFrame `df` with columns: {columnList}\n"
        f"{fileSummary}\n"
        f"{userDesc}\n"
        "\n"
        "Qualitatively describe the dataset in 2-3 concise sentences. "
        "Your response must only include the description with no explanations before or after."
    )

    messages = [
        {"role": "system", "content": "detailed thinking off. You are an insightful Data Analyst."},
        {"role": "user", "content": prompt},
    ]

    response = pipe(messages, temperature = 0.2, max_new_tokens = 1024, return_full_text=False)[0]['generated_text']

    return response
61
+
62
def AnlaysisQuestionAgent(summary:str):
    """Ask the model for analytical questions about the summarised dataset.

    Args:
        summary: Text description of the dataset, sent as the user message.

    Returns:
        A list of question strings parsed from the model's numbered list.
        Empty fragments are dropped, so the list may contain fewer (or more)
        than three entries depending on what the model produced.
    """
    system_prompt = (
        "detailed thinking off. You are an inquisitive Data Analyst.\n"
        "Given the following summary of a dataset, create a list of 3 analytical questions, following these rules:\n"
        "\n"
        "Rules\n"
        "-----\n"
        "1. The questions must be answerable through simple Pandas operations with only the given data.\n"
        "2. Your response must only include the three questions in a numbered list. Do not include explanations or caveats before or after.\n"
        "3. Ensure the output list is formated: 1. question1, 2. question2, 3. question3\n"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": summary},
    ]

    response = pipe(messages, temperature = 0.2, max_new_tokens = 1024, return_full_text=False)[0]['generated_text']

    # A list marker is a 1-2 digit number followed by "." that sits at the very
    # start of the text or right after whitespace/a comma, and is not followed
    # by another digit. This keeps decimals ("3.5"), years ("2020.") and
    # identifiers ending in a digit ("column1.") inside the question text.
    parts = re.split(r'(?:^|(?<=[\s,]))\d{1,2}\.(?!\d)\s*', response)

    result = []
    for part in parts:
        # Strip first, then filter: text before the first marker is often just
        # a newline and must not turn into an empty question. A trailing comma
        # is the separator of the inline "1. q1, 2. q2" format.
        cleaned = part.strip().rstrip(",").rstrip()
        if cleaned:
            result.append(cleaned)

    return result
85
+
86
def CodeGeneratorTool(cols: List[str], query: str) -> str:
    """Build the prompt asking the LLM for pandas-only code (no plotting).

    Args:
        cols: Column names of the DataFrame, listed in the prompt.
        query: The question the generated code should answer.

    Returns:
        The prompt text: the column list, the quoted query, and three rules
        (pandas only, assign to ``result``, single python code fence).
    """
    column_list = ", ".join(cols)
    prompt_lines = [
        "",
        f"Given DataFrame `df` with columns: {column_list}",
        "Write Python code (pandas **only**, no plotting) to answer:",
        f'"{query}"',
        "",
        "Rules",
        "-----",
        "1. Use pandas operations on `df` only.",
        "2. Assign the final result to `result`.",
        "3. Wrap the snippet in a single ```python code fence (no extra prose).",
        "",
    ]
    return "\n".join(prompt_lines)
100
+
101
def CodeExecutionHelper(code: str, df: pd.DataFrame):
    """Execute the generated code and return its result or an error message.

    Args:
        code: Python source to run. It can refer to ``pd`` and ``df`` and is
            expected to assign its answer to a variable named ``result``.
        df: The DataFrame exposed to the code as ``df``.

    Returns:
        The value bound to ``result`` after execution, ``None`` when the code
        never assigns ``result``, or the string
        ``"Error executing code: <exception>"`` when execution raises.
    """
    # SECURITY: ``code`` is model-generated text and exec() runs it with the
    # full privileges of this process. Nothing here sandboxes it; review before
    # exposing the app to untrusted users or data.

    # A single dict serves as both globals and locals. With separate dicts,
    # lambdas and functions defined by the snippet resolve ``df``/``pd`` in the
    # (empty) globals and fail with NameError.
    env = {"pd": pd, "df": df}
    try:
        exec(code, env)
    except Exception as exc:
        # Broad on purpose: arbitrary generated code can raise anything, and
        # the caller reports the message back to the user.
        return f"Error executing code: {exc}"
    return env.get("result", None)
110
+
111
def CodeExtractorHelper(text: str) -> str:
    """Return the contents of the first ```python fenced block in ``text``.

    Args:
        text: Raw model output.

    Returns:
        The text between the first "```python" marker and the next "```",
        stripped of surrounding whitespace. An empty string when the opening
        marker is missing or the fence is never closed.
    """
    _, opening, remainder = text.partition("```python")
    if not opening:
        return ""

    snippet, closing, _ = remainder.partition("```")
    if not closing:
        return ""

    return snippet.strip()
122
+
123
def ToolSelectorAgent(query: str, df: pd.DataFrame):
    """Ask the model for pandas code that answers ``query``.

    The user prompt comes from ``CodeGeneratorTool`` (given the columns of
    ``df``), the chat is sent through the module-level ``pipe``, and the
    reply is reduced to its first python code block by ``CodeExtractorHelper``.

    Args:
        query: The user's question.
        df: DataFrame whose column names are listed in the prompt.

    Returns:
        The extracted code, or an empty string when the reply has no complete
        python code fence.
    """
    user_text = CodeGeneratorTool(df.columns.tolist(), query)
    system_text = (
        "detailed thinking off. You are a Python data-analysis expert who writes clean, efficient code. "
        "Solve the given problem with optimal pandas operations. Be concise and focused. "
        "Your response must contain ONLY a properly-closed ```python code block with no explanations before or after. "
        "Ensure your solution is correct, handles edge cases, and follows best practices for data analysis."
    )
    chat = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]

    outputs = pipe(chat, temperature = 0.2, max_new_tokens = 1024, return_full_text=False)
    generated = outputs[0]['generated_text']
    return CodeExtractorHelper(generated)
139
+
140
def ReasoningPromptGenerator(query: str, result: Any) -> str:
    """Build the prompt asking the model to explain ``result`` for ``query``.

    Args:
        query: The question the user asked.
        result: The value produced by the executed code, or an error string
            starting with "Error executing code".

    Returns:
        The prompt text containing the quoted query, a text form of the
        result, and the instruction to explain it in 2-3 sentences.
    """
    failed = isinstance(result, str) and result.startswith("Error executing code")

    # Error messages are passed through whole; any other value is stringified
    # and capped at 300 characters (presumably to keep the prompt short).
    shown = result if failed else str(result)[:300]

    return (
        "\n"
        f'The user asked: "{query}".\n'
        f"The result value is: {shown}\n"
        "Explain in 2-3 concise sentences what this tells about the data (no mention of charts)."
    )
155
+
156
def ReasoningAgent(query: str, result: Any):
    """Run the reasoning prompt and return the explanation and the model's thinking.

    Args:
        query: The question the user asked.
        result: The value (or error string) produced for that question.

    Returns:
        A ``(response, thinking)`` tuple. When the generated text contains
        "</think>", ``thinking`` is everything before the first marker and
        ``response`` everything after it. Otherwise ``response`` is the whole
        generated text and ``thinking`` is an empty string.
    """
    prompt = ReasoningPromptGenerator(query, result)

    messages = [
        {"role": "system", "content": "detailed thinking on. You are an insightful data analyst"},
        {"role": "user", "content": prompt},
    ]

    response = pipe(messages, temperature = 0.2, max_new_tokens = 1024, return_full_text=False)[0]['generated_text']

    # Start with an empty trace so the return below never touches an unbound
    # name when the model's output has no "</think>" marker.
    thinking = ""
    if "</think>" in response:
        thinking, response = response.split("</think>", 1)
    return response, thinking
175
+
176
def ResponseBuilderTool(question:str)->str:
    """Answer ``question`` about the session DataFrame and format the reply.

    Generates code with ``ToolSelectorAgent``, runs it with
    ``CodeExecutionHelper`` against ``st.session_state.df``, asks
    ``ReasoningAgent`` for an explanation, and assembles the pieces.

    Args:
        question: The user's question.

    Returns:
        A Markdown/HTML string: a result header, an optional collapsed
        "Reasoning" section, the explanation text with backticks removed, and
        a collapsed "View code" section.
    """
    import html  # function-scope import keeps this block self-contained

    code = ToolSelectorAgent(question, st.session_state.df)
    result = CodeExecutionHelper(code, st.session_state.df)
    reasoning_txt, raw_thinking = ReasoningAgent(question, result)
    reasoning_txt = reasoning_txt.replace("`", "")

    # Build assistant response
    if isinstance(result, pd.DataFrame):
        header = f"Result: {len(result)} rows"
    elif isinstance(result, pd.Series):
        header = "Result series"
    else:
        header = f"Result: {result}"

    # The thinking trace and the code are model output placed inside raw HTML
    # that the caller renders with unsafe_allow_html=True. Escape them so
    # characters such as "<" in ``df[df.a < 5]`` display as text instead of
    # being parsed as markup.
    thinking_html = ""
    if raw_thinking:
        # NOTE(review): the trace presumably starts with a "<think>" tag (only
        # the closing tag is handled upstream); drop one so it is not shown as
        # literal text once escaped - confirm against real model output.
        trace = raw_thinking.replace("<think>", "", 1)
        thinking_html = (
            '<details class="thinking">'
            '<summary>🧠 Reasoning</summary>'
            f'<pre>{html.escape(trace)}</pre>'
            '</details>'
        )

    # Code accordion using HTML <pre><code> markup
    code_html = (
        '<details class="code">'
        '<summary>View code</summary>'
        '<pre><code class="language-python">'
        f'{html.escape(code)}'
        '</code></pre>'
        '</details>'
    )

    # Combine thinking, explanation, and code accordion
    return f"{header}\n\n{thinking_html}{reasoning_txt}\n\n{code_html}"
211
+
212
+
213
def _render_chat_history():
    """Draw every stored chat message in order inside one container."""
    with st.container():
        for msg in st.session_state.messages:
            with st.chat_message(msg["role"]):
                st.markdown(msg["content"], unsafe_allow_html=True)


def main():
    """Streamlit app: upload a CSV, explore it, and discuss it in the sidebar.

    Flow, driven by ``st.session_state.num_question_asked``:

    * ``0`` - show up to three suggested questions as buttons plus a chat
      input; the first choice is stored and the script reruns.
    * ``1`` - show the history, answer the stored first question, rerun.
    * otherwise - show the history and answer each new chat input.
    """
    st.set_page_config(layout="wide")
    st.title("Analytics Agent")

    file = st.file_uploader("Choose CSV", type=["csv"])
    if not file:
        # Everything below reads the uploaded data or its summary from the
        # session state, so there is nothing to render before an upload.
        return

    if ("df" not in st.session_state) or (st.session_state.get("current_file") != file.name):
        st.session_state.df = pd.read_csv(file)
        st.session_state.current_file = file.name
        # A different file invalidates the previous conversation and the
        # suggested questions; drop them so they are rebuilt for this dataset.
        for stale_key in ("first_question", "num_question_asked", "messages", "analsyis_questions"):
            if stale_key in st.session_state:
                del st.session_state[stale_key]
        with st.spinner("Summarizing..."):
            st.session_state.file_summary = FileDescriptionAgent("", st.session_state.df)

    st.markdown("### Data Summary:")
    st.text(st.session_state.file_summary)

    pygApp = get_pyg_renderer(st.session_state.df)
    pygApp.explorer(default_tab="data")

    # CSS comments use /* ... */; the previous "# ..." text was not valid CSS.
    st.markdown(
        "<style>\n"
        'section[data-testid="stSidebar"] {\n'
        "width: 500px !important; /* sidebar width */\n"
        "}\n"
        "</style>\n",
        unsafe_allow_html=True,
    )

    with st.sidebar:
        st.markdown("## Analysis Discussion:")

        if "first_question" not in st.session_state:
            st.session_state.first_question = ""

        if "num_question_asked" not in st.session_state:
            st.session_state.num_question_asked = 0

        if "messages" not in st.session_state:
            st.session_state.messages = []

        if st.session_state.num_question_asked == 0:
            with st.spinner("Preparing Anlaysis..."):
                if "analsyis_questions" not in st.session_state:
                    st.session_state.analsyis_questions = AnlaysisQuestionAgent(st.session_state.file_summary)

            # Iterate over whatever was parsed instead of indexing [0..2]: the
            # model may return fewer than three questions. Blank entries are
            # skipped and at most three buttons are shown.
            suggestions = [q for q in st.session_state.analsyis_questions if q][:3]

            picked = None
            with st.container():
                for idx, question in enumerate(suggestions):
                    # Explicit keys keep the widgets distinct even if two
                    # suggested questions have identical text.
                    if st.button(question, key=f"suggested_question_{idx}"):
                        picked = question

            chat = st.chat_input("Something else...")
            if chat:
                picked = chat

            if picked is not None:
                st.session_state.first_question = picked
                st.session_state.num_question_asked += 1
                st.session_state.messages.append({"role": "user", "content": picked})
                st.rerun()

        elif st.session_state.num_question_asked == 1:
            _render_chat_history()
            with st.spinner("Working …"):
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": ResponseBuilderTool(st.session_state.first_question),
                })
            st.session_state.num_question_asked += 1
            st.rerun()

        else:
            _render_chat_history()
            if user_q := st.chat_input("Ask about your data…"):
                st.session_state.messages.append({"role": "user", "content": user_q})
                with st.spinner("Working …"):
                    st.session_state.messages.append({
                        "role": "assistant",
                        "content": ResponseBuilderTool(user_q),
                    })
                st.session_state.num_question_asked += 1
                st.rerun()


if __name__ == "__main__":
    main()
308
+