Spaces:
Sleeping
Sleeping
Atharva Thakur
committed on
Commit
•
3a7810d
1
Parent(s):
ce41758
QA module added
Browse files- .gitignore +2 -1
- Experimentation/dataCodeTest.py +7 -6
- Modules/code_debugger.py +21 -0
- Modules/code_runner.py +22 -0
- Modules/data_QA.py +74 -12
- Modules/{data_code_run.py → data_code_gen.py} +49 -16
- Modules/data_visualizer.py +74 -6
- Modules/llm_summary.py +15 -7
- Modules/output_interpreter.py +27 -0
- app.py +6 -3
.gitignore
CHANGED
@@ -15,7 +15,8 @@ original_data.csv
|
|
15 |
#code files
|
16 |
code.py
|
17 |
data.pdf
|
18 |
-
|
|
|
19 |
#Env variables
|
20 |
.env
|
21 |
# Distribution / packaging
|
|
|
15 |
#code files
|
16 |
code.py
|
17 |
data.pdf
|
18 |
+
data.txt
|
19 |
+
file.pdf
|
20 |
#Env variables
|
21 |
.env
|
22 |
# Distribution / packaging
|
Experimentation/dataCodeTest.py
CHANGED
@@ -4,19 +4,20 @@ import pandas as pd
|
|
4 |
|
5 |
sys.path.append("..")
|
6 |
|
7 |
-
from Modules.
|
|
|
8 |
|
9 |
# data = pd.read_csv("test_data.csv")
|
10 |
|
11 |
-
code_runner =
|
12 |
|
13 |
-
message =
|
14 |
-
The 'Target' column holds failure prediction values as 0 (no failure) and 1 (failure). the name of the dataset is test_data.csv .'''
|
15 |
|
16 |
response= code_runner.generate_code(message)
|
17 |
# print("Response:", response)
|
18 |
|
19 |
|
20 |
-
|
21 |
|
22 |
-
|
|
|
|
4 |
|
5 |
sys.path.append("..")
|
6 |
|
7 |
+
from Modules.data_code_gen import DataCodeGen
|
8 |
+
from Modules.python_interpreter import PythonInterpreter, run_interpreter
|
9 |
|
10 |
# data = pd.read_csv("test_data.csv")
|
11 |
|
12 |
+
code_runner = DataCodeGen()
|
13 |
|
14 |
+
message = "give me a estimate of how many had a failure of any kind"
|
|
|
15 |
|
16 |
response= code_runner.generate_code(message)
|
17 |
# print("Response:", response)
|
18 |
|
19 |
|
20 |
+
python_code = code_runner.extract_code(response)
|
21 |
|
22 |
+
interpreter_code_output = run_interpreter(python_code)
|
23 |
+
print("Python code output:\n", interpreter_code_output)
|
Modules/code_debugger.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from litellm import completion
|
2 |
+
from dotenv import load_dotenv
|
3 |
+
import os
|
4 |
+
|
5 |
+
load_dotenv() # take environment variables from .env.
|
6 |
+
def code_debugger(python_code, error_message):
    """Ask the Gemini model (via litellm) to repair a failing Python program.

    The conversation primes the model to answer with a single fenced Python
    code block, then hands it the failing code and the error it produced.

    Args:
        python_code: Source text of the program that failed.
        error_message: The error text produced when that program was run.

    Returns:
        The raw text of the model's first choice. The caller is responsible
        for extracting the code block from it.

    Raises:
        RuntimeError: If neither GOOGLE_API_KEY nor GEMINI_API_KEY is set.
    """
    # Assigning None into os.environ raises an opaque TypeError, so only copy
    # the key when it exists and fail with a clear message otherwise.
    api_key = os.getenv("GOOGLE_API_KEY")
    if api_key:
        os.environ['GEMINI_API_KEY'] = api_key
    elif not os.getenv("GEMINI_API_KEY"):
        raise RuntimeError(
            "GOOGLE_API_KEY is not set; add it to the environment or the .env file."
        )

    # The first instruction used to ask for '''python fences, contradicting
    # the assistant turn below (and the ```python marker the extraction step
    # in this project splits on). Both turns now name the same fence.
    messages = [
        {"role": "user", "content": "You are a computer with the ability to run any code you want when you are given a prompt and return a response with a plan of what code you want to run. You should start your response with the python program, The commands you provide should be in a single code block encapsulated in ```python and ``` for Python and should be valid Python programs."},
        {"role": "assistant", "content": "I am a computer with the ability to run any code I want when I am given a prompt and return a response with a python program. I will start my response with python program. The commands I provide should be in a single code block encapulated in ```python and ``` and should be a valid Python program."},
        {"role": "user", "content": "Your are given a python code that has an error. you have to solve that error"},
        {"role": "assistant", "content": "my job is write the correct python code to solve the error."},
        {"role": "user", "content": f"Here is the python code and the associated error\n python code:-{python_code} \n error message:- {error_message}"},
    ]

    output = completion(model="gemini/gemini-pro", messages=messages)
    return output.choices[0].message.content
|
Modules/code_runner.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import subprocess
|
2 |
+
def run_script(script_path="code.py"):
    """Run a Python script in a child process and report how it went.

    Args:
        script_path: Path of the script to execute. Defaults to ``code.py``,
            the file name the rest of the project writes generated code to.

    Returns:
        A ``(status, text)`` tuple:
          * ``(0, stdout)`` when the script exits with return code 0,
          * ``(1, stderr)`` when it exits with a non-zero return code,
          * ``(2, message)`` when the child process could not be started.
        ``text`` is always a string.
    """
    # Local import keeps this block self-contained; the module only imports
    # subprocess at the top.
    import sys

    # Use the interpreter that is running this process so the script sees the
    # same environment and packages; a bare "python" may be missing from PATH
    # or resolve to a different installation.
    interpreter = sys.executable or "python"

    try:
        result = subprocess.run(
            [interpreter, script_path],
            capture_output=True,  # Capture stdout and stderr
            text=True,  # Get the output as string
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print(f"Failed to run the script: {e}")
        # Return text, not the exception object, so every branch yields a str.
        return 2, str(e)

    # The return code tells us whether the script itself failed.
    if result.returncode != 0:
        print("Script failed with error:")
        print(result.stderr)
        return 1, result.stderr

    print("Script completed successfully:")
    print(result.stdout)
    return 0, result.stdout
|
Modules/data_QA.py
CHANGED
@@ -4,15 +4,18 @@ from langchain_experimental.agents import create_csv_agent
|
|
4 |
import pandas as pd
|
5 |
from dotenv import load_dotenv
|
6 |
import os
|
7 |
-
|
8 |
-
from Modules.
|
|
|
|
|
9 |
from Modules.python_interpreter import PythonInterpreter, run_interpreter
|
|
|
10 |
|
11 |
load_dotenv() # take environment variables from .env.
|
12 |
|
13 |
class DataQA:
|
14 |
def __init__(self):
|
15 |
-
|
16 |
# def ask_csv(self):
|
17 |
# GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
18 |
# llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
|
@@ -22,14 +25,73 @@ class DataQA:
|
|
22 |
# response = csv_agent.invoke(question)
|
23 |
# st.write(response)
|
24 |
|
25 |
-
def ask_csv(self):
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
st.code(python_code)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
if st.button("Run Code") and python_code:
|
34 |
-
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
import pandas as pd
|
5 |
from dotenv import load_dotenv
|
6 |
import os
|
7 |
+
from Modules.code_runner import run_script
|
8 |
+
from Modules.code_debugger import code_debugger
|
9 |
+
from Modules.output_interpreter import output_interpreter
|
10 |
+
from Modules.data_code_gen import DataCodeGen
|
11 |
from Modules.python_interpreter import PythonInterpreter, run_interpreter
|
12 |
+
import subprocess
|
13 |
|
14 |
load_dotenv() # take environment variables from .env.
|
15 |
|
16 |
class DataQA:
|
17 |
def __init__(self):
|
18 |
+
pass
|
19 |
# def ask_csv(self):
|
20 |
# GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
21 |
# llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
|
|
|
25 |
# response = csv_agent.invoke(question)
|
26 |
# st.write(response)
|
27 |
|
28 |
+
# def ask_csv(self):
|
29 |
+
# question = st.text_input("Ask your question:")
|
30 |
+
# code_runner = DataCodeRun()
|
31 |
+
# if question:
|
32 |
+
# response= code_runner.generate_code(question)
|
33 |
+
# plan, python_code = code_runner.extract_code(response)
|
34 |
+
# st.write(plan)
|
35 |
+
# st.code(python_code)
|
36 |
+
# if st.button("Run Code") and python_code:
|
37 |
+
# interpreter_code_output = run_interpreter(python_code)
|
38 |
+
# print("Python code output:\n", interpreter_code_output)
|
39 |
+
|
40 |
+
# @st.cache_data(experimental_allow_widgets=True)
|
41 |
+
|
42 |
+
|
43 |
+
|
44 |
+
def answer_query(self):
    """Answer a free-text question about the dataset using generated code.

    Flow, as wired up in this method:
      1. Read the question from a Streamlit text input.
      2. Ask ``DataCodeGen`` for a response and extract the Python code from it.
      3. When "Run Code" is pressed, write the code to ``code.py`` and run it
         with ``run_script``; on failure, ask ``code_debugger`` for a fix and
         retry, up to ``max_attempts`` times.
      4. Only if a run succeeded, pass the question to ``output_interpreter``
         and display its answer.
      5. Always remove ``code.py`` afterwards.

    Returns:
        None. All results are rendered through Streamlit.
    """
    query = st.text_input("Ask your question:")
    if not query:
        return

    code_gen = DataCodeGen()
    response = code_gen.generate_code(query)
    python_code = code_gen.extract_code(response)
    st.code(python_code)

    def save_and_run_code(python_code):
        """Write the code to code.py and run it; return (status, text)."""
        try:
            with open("code.py", "w") as f:
                f.write(python_code)
            print("Python code saved as code.py")
        except Exception as e:
            print("Error:", str(e))
            return 1, str(e)  # Return an error code and message
        return run_script()

    # The button is evaluated first so it is rendered even when no code was
    # extracted from the model response.
    if not (st.button("Run Code") and python_code):
        return

    max_attempts = 5
    succeeded = False
    try:
        for _ in range(max_attempts):
            return_code, return_message = save_and_run_code(python_code)
            if return_code == 0:
                print("Code executed successfully!")
                succeeded = True
                break

            # The run failed: ask the model for a corrected program.
            response = code_debugger(python_code, return_message)
            python_code = code_gen.extract_code(response)
            st.code(python_code)  # Display the corrected code
            if not python_code:
                # Nothing usable came back; further retries would only
                # re-send an empty program to the model.
                break
        else:
            print("Exceeded maximum number of attempts. The code could not be executed successfully.")

        if not succeeded:
            # Do not interpret data.txt here: it would be missing or left
            # over from an earlier question, giving a misleading answer.
            st.error("The generated code could not be executed successfully.")
            return

        # Process final output
        answer = output_interpreter(query)
        st.write(answer)
    finally:
        # Clean up the generated script even when something above raised;
        # it may not exist if writing it failed.
        try:
            os.remove("code.py")
        except FileNotFoundError:
            pass
|
97 |
+
|
Modules/{data_code_run.py → data_code_gen.py}
RENAMED
@@ -3,26 +3,62 @@ from litellm import completion
|
|
3 |
from dotenv import load_dotenv
|
4 |
import os
|
5 |
from Modules.python_interpreter import PythonInterpreter, run_interpreter
|
6 |
-
|
7 |
|
8 |
load_dotenv() # take environment variables from .env.
|
9 |
|
10 |
-
class
|
11 |
def __init__(self):
|
12 |
pass
|
13 |
|
14 |
-
@st.cache_data(experimental_allow_widgets=True)
|
15 |
-
def generate_code(_,message):
|
16 |
-
os.environ['GEMINI_API_KEY'] = "AIzaSyAPlmL2oeRaldWRf2viQINPd92_vm3QN6o"
|
17 |
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
output = completion(
|
21 |
model="gemini/gemini-pro",
|
22 |
messages=[
|
23 |
-
{"role": "user", "content": "You are a
|
24 |
-
{"role": "assistant", "content": "I am a
|
25 |
-
{"role": "user", "content":
|
|
|
|
|
|
|
26 |
]
|
27 |
)
|
28 |
|
@@ -31,9 +67,6 @@ class DataCodeRun:
|
|
31 |
|
32 |
|
33 |
def extract_code(self,response):
|
34 |
-
plan = response.split("```python")[0]
|
35 |
-
plan = plan.replace("'", "")
|
36 |
-
plan = plan.replace('`', "")
|
37 |
# else:
|
38 |
# print(response.choices[0].message.content)
|
39 |
# # Extract plan from the response
|
@@ -44,15 +77,15 @@ class DataCodeRun:
|
|
44 |
|
45 |
if "```python" in response:
|
46 |
python_code = response.split("```python")[1].split("```")[0].strip()
|
47 |
-
return
|
48 |
elif "```" in response:
|
49 |
python_code = response.split("```")[1].split("```")[0].strip()
|
50 |
print("Code found in the response but not Left out the word python:", python_code)
|
51 |
-
return
|
52 |
elif "```python" in response.choices[0].message.content:
|
53 |
python_code = response.choices[0].message.content.split(
|
54 |
"```python")[1].split("```")[0].strip()
|
55 |
-
return
|
56 |
|
57 |
|
58 |
# if python_code:
|
|
|
3 |
from dotenv import load_dotenv
|
4 |
import os
|
5 |
from Modules.python_interpreter import PythonInterpreter, run_interpreter
|
6 |
+
import pandas as pd
|
7 |
|
8 |
load_dotenv() # take environment variables from .env.
|
9 |
|
10 |
+
class DataCodeGen:
|
11 |
def __init__(self):
|
12 |
pass
|
13 |
|
|
|
|
|
|
|
14 |
|
15 |
+
def get_data_info(self, file_path='./data.csv'):
    """Build a plain-text description of a CSV dataset for use in a prompt.

    Args:
        file_path: CSV file to describe. Defaults to ``./data.csv``, the
            path this method previously had hard-coded.

    Returns:
        A multi-line string listing the file path, column names, per-column
        dtypes, row and column counts, and the number of unique values in
        each column (NaN counts as one value, as ``Series.unique`` reports it).

    Raises:
        Whatever ``pandas.read_csv`` raises for a missing or malformed file.
    """
    df = pd.read_csv(file_path)

    # str() guards against non-string column labels, which would break join().
    column_names = ", ".join(str(col) for col in df.columns)
    data_types = ", ".join(f"{col}: {dtype}" for col, dtype in df.dtypes.items())
    num_rows, num_cols = df.shape

    # Example values used to be collected here as well, but they were never
    # included in the returned text, so that work has been dropped.
    unique_values_info = [
        f"{col}: {len(df[col].unique())} unique values" for col in df.columns
    ]

    lines = [
        "Dataset Information:",
        f"Dataset file path: {file_path}",
        f"Columns: {column_names}",
        f"Data Types: {data_types}",
        f"Number of Rows: {num_rows}",
        f"Number of Columns: {num_cols}",
        f"Unique Values per Column: {'; '.join(unique_values_info)}",
    ]
    # Every line, including the last, ends with a newline as before.
    return "".join(line + "\n" for line in lines)
|
48 |
+
|
49 |
+
@st.cache_data(experimental_allow_widgets=True)
|
50 |
+
def generate_code(_self,query):
|
51 |
+
os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
|
52 |
+
data_info= _self.get_data_info()
|
53 |
output = completion(
|
54 |
model="gemini/gemini-pro",
|
55 |
messages=[
|
56 |
+
{"role": "user", "content": "You are a data analyst with the ability to run any code you want when you are given a prompt and return a response with a plan of what code you want to run. You should start your response with the python program, The commands you provide should be in a single code block encapsulated in '''python and ''' for Python and should be valid Python programs."},
|
57 |
+
{"role": "assistant", "content": "I am a data analyst with the ability to run any code I want when I am given a prompt and return a response with a python program. I will start my response with python program. The commands I provide should be in a single code block encapulated in ```python and ``` and should be a valid Python program."},
|
58 |
+
{"role": "user", "content": "Your job is write the python code the answer for the given query regarding a dataset. The python should find the correct answer the query, also generate a visualization if necessary and store it in `file.pdf`. Store the answer to query and information regarding the visualization in `data.txt`. Even if the given task is to plot a graph you have to include textual information regarding the graphs like the labels and values in `data.txt`."},
|
59 |
+
{"role": "assistant", "content": "My job is write the python code that will find the answer for the given query regarding a dataset. The python should find the correct answer the query, also generate a visualization if necessary and store it in `file.pdf`. I have to store the answer to query along with label and value shown in the visualization in `data.txt`. Even if I have to just plot a graph I will include textual information regarding the graphs like the labels and values in `data.txt`."},
|
60 |
+
{"role": "user", "content": f"Here is some information about the dataset.\n {data_info}"},
|
61 |
+
{"role": "user", "content": f"Given query - {query}"},
|
62 |
]
|
63 |
)
|
64 |
|
|
|
67 |
|
68 |
|
69 |
def extract_code(self,response):
|
|
|
|
|
|
|
70 |
# else:
|
71 |
# print(response.choices[0].message.content)
|
72 |
# # Extract plan from the response
|
|
|
77 |
|
78 |
if "```python" in response:
|
79 |
python_code = response.split("```python")[1].split("```")[0].strip()
|
80 |
+
return python_code
|
81 |
elif "```" in response:
|
82 |
python_code = response.split("```")[1].split("```")[0].strip()
|
83 |
print("Code found in the response but not Left out the word python:", python_code)
|
84 |
+
return python_code
|
85 |
elif "```python" in response.choices[0].message.content:
|
86 |
python_code = response.choices[0].message.content.split(
|
87 |
"```python")[1].split("```")[0].strip()
|
88 |
+
return python_code
|
89 |
|
90 |
|
91 |
# if python_code:
|
Modules/data_visualizer.py
CHANGED
@@ -1,6 +1,9 @@
|
|
1 |
import streamlit as st
|
2 |
import re
|
3 |
-
|
|
|
|
|
|
|
4 |
from litellm import completion
|
5 |
from dotenv import load_dotenv
|
6 |
import os
|
@@ -13,17 +16,18 @@ class DataVisualizer:
|
|
13 |
self.data = data
|
14 |
st.subheader("Data Visualizer")
|
15 |
|
16 |
-
|
|
|
17 |
message = f'''
|
18 |
You are a data analyst working with a given dataset. Below is the information about the dataset:
|
19 |
========
|
20 |
-
{
|
21 |
========
|
22 |
|
23 |
Here is a sample of the data:
|
24 |
-
{
|
25 |
|
26 |
-
Number of rows in the dataset: {
|
27 |
|
28 |
Your task:
|
29 |
Suggest 5 visualizations that can be made in bullet points
|
@@ -73,4 +77,68 @@ class DataVisualizer:
|
|
73 |
try:
|
74 |
exec(code_block)
|
75 |
except Exception as e:
|
76 |
-
print(e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
import re
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
import seaborn as sns
|
7 |
from litellm import completion
|
8 |
from dotenv import load_dotenv
|
9 |
import os
|
|
|
16 |
self.data = data
|
17 |
st.subheader("Data Visualizer")
|
18 |
|
19 |
+
@st.cache_data(experimental_allow_widgets=True)
|
20 |
+
def suggestions(_self):
|
21 |
message = f'''
|
22 |
You are a data analyst working with a given dataset. Below is the information about the dataset:
|
23 |
========
|
24 |
+
{_self.data.describe(include='all')}
|
25 |
========
|
26 |
|
27 |
Here is a sample of the data:
|
28 |
+
{_self.data.head()}
|
29 |
|
30 |
+
Number of rows in the dataset: {_self.data.shape[0]}
|
31 |
|
32 |
Your task:
|
33 |
Suggest 5 visualizations that can be made in bullet points
|
|
|
77 |
try:
|
78 |
exec(code_block)
|
79 |
except Exception as e:
|
80 |
+
print(e)
|
81 |
+
|
82 |
+
def visualize_data(self):
    """Render one user-selected chart of ``self.data`` in the Streamlit page.

    The user picks a plot type and, where relevant, the column(s) to plot.
    Supported types: histogram, box plot, pie chart, scatter plot and
    correlation heatmap. A warning is shown instead of a chart when the
    data has no suitable columns.

    Returns:
        None. The chart (or a warning) is rendered through Streamlit.
    """
    def show(fig):
        """Render the figure, then close it so reruns do not pile up figures."""
        st.pyplot(fig)
        plt.close(fig)

    plot_type = st.selectbox('Choose a type of plot', ['Histogram', 'Box Plot', 'Pie Chart', 'Scatter Plot', 'Heatmap'])
    numeric_columns = self.data.select_dtypes(include=[np.number]).columns

    if plot_type == 'Histogram':
        if numeric_columns.empty:
            st.warning('No numeric columns in the data to visualize.')
            return
        column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
        fig, ax = plt.subplots()
        # Drop missing values first, as the box plot does; NaN breaks the
        # automatic bin-range detection.
        ax.hist(self.data[column_to_visualize].dropna())
        ax.set_title(f'Histogram of {column_to_visualize}')
        ax.set_xlabel(column_to_visualize)
        ax.set_ylabel('Frequency')
        show(fig)

    elif plot_type == 'Box Plot':
        if numeric_columns.empty:
            st.warning('No numeric columns in the data to visualize.')
            return
        column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
        fig, ax = plt.subplots()
        ax.boxplot(self.data[column_to_visualize].dropna())
        ax.set_title(f'Box Plot of {column_to_visualize}')
        ax.set_ylabel(column_to_visualize)
        show(fig)

    elif plot_type == 'Pie Chart':
        nonnumeric_columns = self.data.select_dtypes(include=['object']).columns
        if nonnumeric_columns.empty:
            st.warning('No non numeric columns in the data to visualize.')
            return
        column_to_visualize = st.selectbox('Choose a column to visualize', nonnumeric_columns)
        fig, ax = plt.subplots()
        self.data[column_to_visualize].value_counts().plot(kind='pie', ax=ax, autopct='%1.1f%%', textprops={'fontsize': 'small'})
        ax.set_title(f'Pie Chart of {column_to_visualize}')
        ax.set_ylabel('')
        show(fig)

    elif plot_type == 'Scatter Plot':
        # A scatter plot needs two distinct numeric columns; without them
        # the selectors below would have nothing valid to offer.
        if len(numeric_columns) < 2:
            st.warning('At least two numeric columns are needed for a scatter plot.')
            return
        left, right = st.columns(2)
        with left:
            x_col = st.selectbox('Choose values on X axis', numeric_columns)
        with right:
            y_col = st.selectbox('Choose values on Y axis', numeric_columns)
        if x_col == y_col:
            st.warning('Please select two different columns for scatter plot.')
            return
        fig, ax = plt.subplots()
        ax.scatter(self.data[x_col], self.data[y_col])
        ax.set_title(f'Scatter Plot of {x_col} vs {y_col}')
        ax.set_xlabel(x_col)
        ax.set_ylabel(y_col)
        show(fig)

    elif plot_type == 'Heatmap':
        # An empty correlation matrix cannot be drawn, so warn instead.
        if numeric_columns.empty:
            st.warning('No numeric columns in the data to visualize.')
            return
        corr = self.data[numeric_columns].corr()
        fig, ax = plt.subplots()
        sns.heatmap(corr, annot=True, ax=ax)
        ax.set_title('Correlation Heatmap')
        show(fig)
|
Modules/llm_summary.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
from litellm import completion
|
2 |
from dotenv import load_dotenv
|
3 |
import os
|
@@ -6,7 +7,7 @@ import pandas as pd
|
|
6 |
load_dotenv() # take environment variables from .env.
|
7 |
os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
|
8 |
|
9 |
-
|
10 |
def LLM_summary():
|
11 |
file_path = './data.csv'
|
12 |
df = pd.read_csv(file_path)
|
@@ -21,16 +22,23 @@ def LLM_summary():
|
|
21 |
|
22 |
# Get number of rows and columns
|
23 |
num_rows, num_cols = df.shape
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
# Construct the dataset information string
|
26 |
info_string = f"Dataset Information:\n"
|
27 |
info_string += f"Columns: {column_names}\n"
|
28 |
info_string += f"Data Types: {data_types}\n"
|
29 |
info_string += f"Number of Rows: {num_rows}\n"
|
30 |
info_string += f"Number of Columns: {num_cols}\n"
|
|
|
|
|
31 |
|
32 |
-
|
33 |
-
|
34 |
message = f'''
|
35 |
You are a data analyser agent working with a given dataset.
|
36 |
Below is the info about the dataset -
|
@@ -39,8 +47,8 @@ def LLM_summary():
|
|
39 |
========
|
40 |
|
41 |
Your task -
|
42 |
-
Write a summary report of the dataset. You have to explain what the dataset is about.
|
43 |
-
You have to
|
44 |
|
45 |
|
46 |
Do not infer any data based on previous training, strictly use only source text given below as input.
|
@@ -53,6 +61,6 @@ def LLM_summary():
|
|
53 |
]
|
54 |
)
|
55 |
|
56 |
-
|
57 |
|
58 |
|
|
|
1 |
+
import streamlit as st
|
2 |
from litellm import completion
|
3 |
from dotenv import load_dotenv
|
4 |
import os
|
|
|
7 |
load_dotenv() # take environment variables from .env.
|
8 |
os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
|
9 |
|
10 |
+
@st.cache_data(experimental_allow_widgets=True)
|
11 |
def LLM_summary():
|
12 |
file_path = './data.csv'
|
13 |
df = pd.read_csv(file_path)
|
|
|
22 |
|
23 |
# Get number of rows and columns
|
24 |
num_rows, num_cols = df.shape
|
25 |
+
unique_values_info = []
|
26 |
+
example_values_info = []
|
27 |
+
for col in df.columns:
|
28 |
+
unique_values = df[col].unique()
|
29 |
+
unique_values_info.append(f"{col}: {len(unique_values)} unique values")
|
30 |
+
example_values = df[col].head(5).tolist() # Get first 5 values as examples
|
31 |
+
example_values_info.append(f"{col}: {example_values}")
|
32 |
+
|
33 |
# Construct the dataset information string
|
34 |
info_string = f"Dataset Information:\n"
|
35 |
info_string += f"Columns: {column_names}\n"
|
36 |
info_string += f"Data Types: {data_types}\n"
|
37 |
info_string += f"Number of Rows: {num_rows}\n"
|
38 |
info_string += f"Number of Columns: {num_cols}\n"
|
39 |
+
info_string += f"Unique Values per Column: {'; '.join(unique_values_info)}\n"
|
40 |
+
info_string += f"Example Values per Column: {'; '.join(example_values_info)}\n"
|
41 |
|
|
|
|
|
42 |
message = f'''
|
43 |
You are a data analyser agent working with a given dataset.
|
44 |
Below is the info about the dataset -
|
|
|
47 |
========
|
48 |
|
49 |
Your task -
|
50 |
+
Write a detailed and beautiful summary report of the dataset. You have to explain what the dataset is about.
|
51 |
+
You also have to questions that could be asked regarding the dataset so that we could gain some insights.
|
52 |
|
53 |
|
54 |
Do not infer any data based on previous training, strictly use only source text given below as input.
|
|
|
61 |
]
|
62 |
)
|
63 |
|
64 |
+
st.write(output.choices[0].message.content)
|
65 |
|
66 |
|
Modules/output_interpreter.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from litellm import completion
|
2 |
+
from dotenv import load_dotenv
|
3 |
+
import os
|
4 |
+
|
5 |
+
load_dotenv() # take environment variables from .env.
|
6 |
+
|
7 |
+
def read_file(filename, encoding="utf-8"):
    """Return the text content of a file, or an error description.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Returns:
        The file's text. Bytes that are not valid in ``encoding`` are
        replaced with U+FFFD rather than failing the whole read. If the
        file cannot be opened or read, a string of the form
        ``"Error: <reason>"`` is returned instead of raising.
    """
    try:
        with open(filename, "r", encoding=encoding, errors="replace") as file:
            return file.read()
    except (OSError, ValueError, LookupError) as e:
        # Best-effort contract kept from the original: report the problem
        # as text so the caller never has to handle an exception.
        return f"Error: {str(e)}"
|
14 |
+
|
15 |
+
def output_interpreter(query):
    """Turn the generated script's text output into an answer to the query.

    Reads ``data.txt`` through ``read_file`` and asks the Gemini model (via
    litellm) to summarise it as an answer to ``query``.

    Args:
        query: The user's original question about the dataset.

    Returns:
        The text of the model's first choice.

    Raises:
        RuntimeError: If neither GOOGLE_API_KEY nor GEMINI_API_KEY is set.
    """
    # Assigning None into os.environ raises an opaque TypeError, so only copy
    # the key when it exists and fail with a clear message otherwise.
    api_key = os.getenv("GOOGLE_API_KEY")
    if api_key:
        os.environ['GEMINI_API_KEY'] = api_key
    elif not os.getenv("GEMINI_API_KEY"):
        raise RuntimeError(
            "GOOGLE_API_KEY is not set; add it to the environment or the .env file."
        )

    data = read_file("data.txt")
    output = completion(
        model="gemini/gemini-pro",
        messages=[
            {"role": "user", "content": f"You are a data analyst. you were given a query - {query}\n After a python code to get the answer to query you got the following info - {data}. Summarize your findings and write a proper answer for the query."},
        ],
    )

    return output.choices[0].message.content
|
app.py
CHANGED
@@ -14,6 +14,7 @@ from Modules.data_transformer import DataTransformer
|
|
14 |
from Modules.data_visualizer import DataVisualizer
|
15 |
from Modules.data_QA import DataQA
|
16 |
from Modules.MLtoolkit import MLToolkit
|
|
|
17 |
|
18 |
|
19 |
#---SKLEARN-IMPORT---
|
@@ -45,12 +46,14 @@ def main():
|
|
45 |
data = pd.read_csv("data.csv")
|
46 |
data_analyzer = DataAnalyzer(data)
|
47 |
data_analyzer.show_eda()
|
|
|
|
|
48 |
data_analyzer.show_count_plots()
|
49 |
|
50 |
data_visualizer = DataVisualizer(data)
|
51 |
data_visualizer.suggestions()
|
52 |
-
data_visualizer.generate_viz()
|
53 |
-
|
54 |
|
55 |
# --- DATA CLEANING ---
|
56 |
if selected == "Data Cleaning":
|
@@ -68,7 +71,7 @@ def main():
|
|
68 |
if selected == "Q/A":
|
69 |
try:
|
70 |
data_QA = DataQA()
|
71 |
-
data_QA.
|
72 |
except Exception as e:
|
73 |
# Handle the exception (e.g., logging, printing an error message, etc.)
|
74 |
print(f"An error occurred: {e}")
|
|
|
14 |
from Modules.data_visualizer import DataVisualizer
|
15 |
from Modules.data_QA import DataQA
|
16 |
from Modules.MLtoolkit import MLToolkit
|
17 |
+
from Modules.llm_summary import LLM_summary
|
18 |
|
19 |
|
20 |
#---SKLEARN-IMPORT---
|
|
|
46 |
data = pd.read_csv("data.csv")
|
47 |
data_analyzer = DataAnalyzer(data)
|
48 |
data_analyzer.show_eda()
|
49 |
+
LLM_summary()
|
50 |
+
|
51 |
data_analyzer.show_count_plots()
|
52 |
|
53 |
data_visualizer = DataVisualizer(data)
|
54 |
data_visualizer.suggestions()
|
55 |
+
# data_visualizer.generate_viz()
|
56 |
+
data_visualizer.visualize_data()
|
57 |
|
58 |
# --- DATA CLEANING ---
|
59 |
if selected == "Data Cleaning":
|
|
|
71 |
if selected == "Q/A":
|
72 |
try:
|
73 |
data_QA = DataQA()
|
74 |
+
data_QA.answer_query()
|
75 |
except Exception as e:
|
76 |
# Handle the exception (e.g., logging, printing an error message, etc.)
|
77 |
print(f"An error occurred: {e}")
|