Atharva Thakur committed
Commit 3a7810d
1 Parent(s): ce41758

QA module added

.gitignore CHANGED
@@ -15,7 +15,8 @@ original_data.csv
 #code files
 code.py
 data.pdf
-
+data.txt
+file.pdf
 #Env variables
 .env
 # Distribution / packaging
Experimentation/dataCodeTest.py CHANGED
@@ -4,19 +4,20 @@ import pandas as pd
 
 sys.path.append("..")
 
-from Modules.data_code_run import DataCodeRun
+from Modules.data_code_gen import DataCodeGen
+from Modules.python_interpreter import PythonInterpreter, run_interpreter
 
 # data = pd.read_csv("test_data.csv")
 
-code_runner = DataCodeRun()
+code_runner = DataCodeGen()
 
-message = '''generate the code to find the relation between 'Air temperature [K]' and 'Target' columns of the given dataset.
-The 'Target' column holds failure prediction values as 0 (no failure) and 1 (failure). the name of the dataset is test_data.csv .'''
+message = "give me a estimate of how many had a failure of any kind"
 
 response= code_runner.generate_code(message)
 # print("Response:", response)
 
 
-plan, python_code = code_runner.extract_code(response)
+python_code = code_runner.extract_code(response)
 
-print(python_code)
+interpreter_code_output = run_interpreter(python_code)
+print("Python code output:\n", interpreter_code_output)
Modules/code_debugger.py ADDED
@@ -0,0 +1,21 @@
+from litellm import completion
+from dotenv import load_dotenv
+import os
+
+load_dotenv() # take environment variables from .env.
+def code_debugger(python_code,error_message):
+    os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
+    output = completion(
+        model="gemini/gemini-pro",
+        messages=[
+            {"role": "user", "content": "You are a computer with the ability to run any code you want when you are given a prompt and return a response with a plan of what code you want to run. You should start your response with the python program, The commands you provide should be in a single code block encapsulated in '''python and ''' for Python and should be valid Python programs."},
+            {"role": "assistant", "content": "I am a computer with the ability to run any code I want when I am given a prompt and return a response with a python program. I will start my response with python program. The commands I provide should be in a single code block encapulated in ```python and ``` and should be a valid Python program."},
+            {"role": "user", "content": "Your are given a python code that has an error. you have to solve that error"},
+            {"role": "assistant", "content": "my job is write the correct python code to solve the error."},
+            {"role": "user", "content": f"Here is the python code and the associated error\n python code:-{python_code} \n error message:- {error_message}"},
+        ]
+    )
+
+    response = output.choices[0].message.content
+
+    return response
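
The helper above just returns the raw LLM reply. A minimal usage sketch, assuming GOOGLE_API_KEY is present in .env as the module expects; the broken snippet and error string below are made up for illustration:

    from Modules.code_debugger import code_debugger

    broken_code = "import pandas as pd\nprint(df.head())"   # 'df' is never defined
    error_message = "NameError: name 'df' is not defined"

    # Returns the model's full reply; the corrected program still has to be
    # pulled out of its ```python fence (e.g. with DataCodeGen.extract_code).
    reply = code_debugger(broken_code, error_message)
    print(reply)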
Modules/code_runner.py ADDED
@@ -0,0 +1,22 @@
+import subprocess
+def run_script():
+    try:
+        # Run the script
+        result = subprocess.run(
+            ['python', 'code.py'],
+            capture_output=True, # Capture stdout and stderr
+            text=True # Get the output as string
+        )
+
+        # Check the return code to determine if an error occurred
+        if result.returncode != 0:
+            print("Script failed with error:")
+            print(result.stderr)
+            return 1,result.stderr
+        else:
+            print("Script completed successfully:")
+            print(result.stdout)
+            return 0,result.stdout
+    except Exception as e:
+        print(f"Failed to run the script: {e}")
+        return 2,e
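
run_script always executes the hard-coded code.py in the working directory and reports a (status, output) pair: 0 with stdout on success, 1 with stderr on a non-zero exit, 2 with the exception if the subprocess could not be started. A rough sketch of that contract; the throwaway code.py written here is only for illustration:

    from Modules.code_runner import run_script

    # Write a trivial script for the runner to execute.
    with open("code.py", "w") as f:
        f.write('print("hello from generated code")')

    status, output = run_script()
    if status == 0:
        print("stdout:", output)          # normal completion
    else:
        print("failure detail:", output)  # stderr (status 1) or the exception (status 2)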
Modules/data_QA.py CHANGED
@@ -4,15 +4,18 @@ from langchain_experimental.agents import create_csv_agent
 import pandas as pd
 from dotenv import load_dotenv
 import os
-
-from Modules.data_code_run import DataCodeRun
+from Modules.code_runner import run_script
+from Modules.code_debugger import code_debugger
+from Modules.output_interpreter import output_interpreter
+from Modules.data_code_gen import DataCodeGen
 from Modules.python_interpreter import PythonInterpreter, run_interpreter
+import subprocess
 
 load_dotenv() # take environment variables from .env.
 
 class DataQA:
     def __init__(self):
-        print("dataQA")
+        pass
     # def ask_csv(self):
    #     GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
    #     llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
@@ -22,14 +25,73 @@ class DataQA:
    #     response = csv_agent.invoke(question)
    #     st.write(response)
 
-    def ask_csv(self):
-        question = st.text_input("Ask your question:")
-        code_runner = DataCodeRun()
-        if question:
-            response= code_runner.generate_code(question)
-            plan, python_code = code_runner.extract_code(response)
-            st.write(plan)
+    # def ask_csv(self):
+    #     question = st.text_input("Ask your question:")
+    #     code_runner = DataCodeRun()
+    #     if question:
+    #         response= code_runner.generate_code(question)
+    #         plan, python_code = code_runner.extract_code(response)
+    #         st.write(plan)
+    #         st.code(python_code)
+    #         if st.button("Run Code") and python_code:
+    #             interpreter_code_output = run_interpreter(python_code)
+    #             print("Python code output:\n", interpreter_code_output)
+
+    # @st.cache_data(experimental_allow_widgets=True)
+
+
+
+    def answer_query(self):
+        query = st.text_input("Ask your question:")
+        if query:
+            # message = '''generate the code to find the relation between 'Air temperature [K]' and 'Target' columns of the given dataset. The 'Target' column holds failure prediction values as 0 (no failure) and 1 (failure). the name of the dataset is test_data.csv .
+            # '''
+            # get the info about the dataset.
+            # call to code gen
+            code_gen = DataCodeGen()
+            response = code_gen.generate_code(query)
+            # st.write(response)
+            python_code = code_gen.extract_code(response)
             st.code(python_code)
+
+            def save_and_run_code(python_code):
+                try:
+                    with open("code.py", "w") as f:
+                        f.write(python_code)
+                    print("Python code saved as code.py")
+                except Exception as e:
+                    print("Error:", str(e))
+                    return 1, str(e) # Return an error code and message
+
+                return run_script()
+
+            def debug_code(python_code, error_message):
+                return code_debugger(python_code, error_message)
             if st.button("Run Code") and python_code:
-                interpreter_code_output = run_interpreter(python_code)
-                print("Python code output:\n", interpreter_code_output)
+                # Maximum number of attempts
+                max_attempts = 5
+                attempts = 0
+
+                while attempts < max_attempts:
+                    return_code, return_message = save_and_run_code(python_code)
+
+                    if return_code == 0:
+                        print("Code executed successfully!")
+                        break # Exit the loop if the code runs without errors
+
+                    # If there was an error, debug the code
+                    response = debug_code(python_code, return_message)
+                    python_code = code_gen.extract_code(response)
+                    attempts += 1 # Increment the attempts counter
+                    st.write(python_code) # Display the corrected code
+
+                if attempts == max_attempts:
+                    print("Exceeded maximum number of attempts. The code could not be executed successfully.")
+
+                # Process final output
+                answer = output_interpreter(query)
+                st.write(answer)
+
+                # Clean up by removing the code file
+                os.remove("code.py")
+
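
answer_query above is wired to Streamlit widgets; the underlying generate → save → run → debug → interpret loop can be read independently of the UI. A non-Streamlit sketch of the same flow, assuming data.csv and a valid GOOGLE_API_KEY are in place; answer_offline is a hypothetical name, not part of this commit:

    from Modules.data_code_gen import DataCodeGen
    from Modules.code_runner import run_script
    from Modules.code_debugger import code_debugger
    from Modules.output_interpreter import output_interpreter

    def answer_offline(query, max_attempts=5):
        gen = DataCodeGen()
        python_code = gen.extract_code(gen.generate_code(query))
        for _ in range(max_attempts):
            with open("code.py", "w") as f:
                f.write(python_code)
            status, message = run_script()
            if status == 0:
                break
            # Feed the failing code and its traceback back to the LLM and retry.
            python_code = gen.extract_code(code_debugger(python_code, message))
        # The generated script is expected to have written its findings to data.txt.
        return output_interpreter(query)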
Modules/{data_code_run.py → data_code_gen.py} RENAMED
@@ -3,26 +3,62 @@ from litellm import completion
 from dotenv import load_dotenv
 import os
 from Modules.python_interpreter import PythonInterpreter, run_interpreter
-
+import pandas as pd
 
 load_dotenv() # take environment variables from .env.
 
-class DataCodeRun:
+class DataCodeGen:
     def __init__(self):
         pass
 
-    @st.cache_data(experimental_allow_widgets=True)
-    def generate_code(_,message):
-        os.environ['GEMINI_API_KEY'] = "AIzaSyAPlmL2oeRaldWRf2viQINPd92_vm3QN6o"
 
-        # message = '''generate the code to find the relation between 'Air temperature [K]' and 'Target' columns of the given dataset. The 'Target' column holds failure prediction values as 0 (no failure) and 1 (failure). the name of the dataset is test_data.csv .
-        # '''
+    def get_data_info(self):
+        file_path = './data.csv'
+        df = pd.read_csv(file_path)
+
+        # Get column names
+        column_names = ", ".join(df.columns.tolist())
+
+        # Get data types
+        data_types = ", ".join([f"{col}: {dtype}" for col, dtype in df.dtypes.items()])
+
+        # Get number of rows and columns
+        num_rows, num_cols = df.shape
+
+        # Get unique values and example values for each column
+        unique_values_info = []
+        example_values_info = []
+        for col in df.columns:
+            unique_values = df[col].unique()
+            unique_values_info.append(f"{col}: {len(unique_values)} unique values")
+            example_values = df[col].head(5).tolist() # Get first 5 values as examples
+            example_values_info.append(f"{col}: {example_values}")
+
+        # Construct the dataset information string
+        info_string = f"Dataset Information:\n"
+        info_string += f"Dataset file path: {file_path}\n"
+        info_string += f"Columns: {column_names}\n"
+        info_string += f"Data Types: {data_types}\n"
+        info_string += f"Number of Rows: {num_rows}\n"
+        info_string += f"Number of Columns: {num_cols}\n"
+        info_string += f"Unique Values per Column: {'; '.join(unique_values_info)}\n"
+        # info_string += f"Example Values per Column: {'; '.join(example_values_info)}\n"
+
+        return info_string
+
+    @st.cache_data(experimental_allow_widgets=True)
+    def generate_code(_self,query):
+        os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
+        data_info= _self.get_data_info()
         output = completion(
             model="gemini/gemini-pro",
             messages=[
-                {"role": "user", "content": "You are a computer with the ability to run any code you want when you are given a prompt and return a response with a plan of what code you want to run. You should start your response with a plan that describes what the code is going do in detail, The commands you provide should be in a single code block encapsulated in '''python and ''' for Python and should be valid Python programs."},
-                {"role": "assistant", "content": "I am a computer with the ability to run any code I want when I am given a prompt and return a response with a plan of what code I want to run I will start my response with a plan that would be encapsulated in ```plan and ```. Afterwards, The commands I provide should be in a single code block encapulated in ```python and ``` and should be a valid Python program."},
-                {"role": "user", "content": message}
+                {"role": "user", "content": "You are a data analyst with the ability to run any code you want when you are given a prompt and return a response with a plan of what code you want to run. You should start your response with the python program, The commands you provide should be in a single code block encapsulated in '''python and ''' for Python and should be valid Python programs."},
+                {"role": "assistant", "content": "I am a data analyst with the ability to run any code I want when I am given a prompt and return a response with a python program. I will start my response with python program. The commands I provide should be in a single code block encapulated in ```python and ``` and should be a valid Python program."},
+                {"role": "user", "content": "Your job is write the python code the answer for the given query regarding a dataset. The python should find the correct answer the query, also generate a visualization if necessary and store it in `file.pdf`. Store the answer to query and information regarding the visualization in `data.txt`. Even if the given task is to plot a graph you have to include textual information regarding the graphs like the labels and values in `data.txt`."},
+                {"role": "assistant", "content": "My job is write the python code that will find the answer for the given query regarding a dataset. The python should find the correct answer the query, also generate a visualization if necessary and store it in `file.pdf`. I have to store the answer to query along with label and value shown in the visualization in `data.txt`. Even if I have to just plot a graph I will include textual information regarding the graphs like the labels and values in `data.txt`."},
+                {"role": "user", "content": f"Here is some information about the dataset.\n {data_info}"},
+                {"role": "user", "content": f"Given query - {query}"},
             ]
         )
 
@@ -31,9 +67,6 @@ class DataCodeRun:
 
 
     def extract_code(self,response):
-        plan = response.split("```python")[0]
-        plan = plan.replace("'", "")
-        plan = plan.replace('`', "")
         # else:
        #     print(response.choices[0].message.content)
        #     # Extract plan from the response
@@ -44,15 +77,15 @@ class DataCodeRun:
 
         if "```python" in response:
             python_code = response.split("```python")[1].split("```")[0].strip()
-            return plan,python_code
+            return python_code
         elif "```" in response:
             python_code = response.split("```")[1].split("```")[0].strip()
             print("Code found in the response but not Left out the word python:", python_code)
-            return plan,python_code
+            return python_code
         elif "```python" in response.choices[0].message.content:
             python_code = response.choices[0].message.content.split(
                 "```python")[1].split("```")[0].strip()
-            return plan,python_code
+            return python_code
 
 
         # if python_code:
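
extract_code makes no API call of its own; it just pulls the first fenced block out of the reply string. A quick check of the happy path, using a fabricated reply text:

    from Modules.data_code_gen import DataCodeGen

    gen = DataCodeGen()
    reply = "Here is the program:\n```python\nimport pandas as pd\nprint(pd.read_csv('./data.csv').shape)\n```"
    print(gen.extract_code(reply))
    # -> import pandas as pd
    #    print(pd.read_csv('./data.csv').shape)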
Modules/data_visualizer.py CHANGED
@@ -1,6 +1,9 @@
 import streamlit as st
 import re
-
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
 from litellm import completion
 from dotenv import load_dotenv
 import os
@@ -13,17 +16,18 @@ class DataVisualizer:
         self.data = data
         st.subheader("Data Visualizer")
 
-    def suggestions(self):
+    @st.cache_data(experimental_allow_widgets=True)
+    def suggestions(_self):
         message = f'''
         You are a data analyst working with a given dataset. Below is the information about the dataset:
         ========
-        {self.data.describe(include='all')}
+        {_self.data.describe(include='all')}
         ========
 
         Here is a sample of the data:
-        {self.data.head()}
+        {_self.data.head()}
 
-        Number of rows in the dataset: {self.data.shape[0]}
+        Number of rows in the dataset: {_self.data.shape[0]}
 
         Your task:
         Suggest 5 visualizations that can be made in bullet points
@@ -73,4 +77,68 @@
             try:
                 exec(code_block)
             except Exception as e:
-                print(e)
+                print(e)
+
+    def visualize_data(self):
+        plot_type = st.selectbox('Choose a type of plot', ['Histogram', 'Box Plot', 'Pie Chart', 'Scatter Plot', 'Heatmap'])
+
+        if plot_type == 'Histogram':
+            numeric_columns = self.data.select_dtypes(include=[np.number]).columns
+            if numeric_columns.empty:
+                st.warning('No numeric columns in the data to visualize.')
+            else:
+                column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
+                fig, ax = plt.subplots()
+                ax.hist(self.data[column_to_visualize])
+                ax.set_title(f'Histogram of {column_to_visualize}')
+                ax.set_xlabel(column_to_visualize)
+                ax.set_ylabel('Frequency')
+                st.pyplot(fig)
+
+        elif plot_type == 'Box Plot':
+            numeric_columns = self.data.select_dtypes(include=[np.number]).columns
+            if numeric_columns.empty:
+                st.warning('No numeric columns in the data to visualize.')
+            else:
+                column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
+                fig, ax = plt.subplots()
+                ax.boxplot(self.data[column_to_visualize].dropna())
+                ax.set_title(f'Box Plot of {column_to_visualize}')
+                ax.set_ylabel(column_to_visualize)
+                st.pyplot(fig)
+
+        elif plot_type == 'Pie Chart':
+            nonnumeric_columns = self.data.select_dtypes(include=['object']).columns
+            if nonnumeric_columns.empty:
+                st.warning('No non numeric columns in the data to visualize.')
+            else:
+                column_to_visualize = st.selectbox('Choose a column to visualize', nonnumeric_columns)
+                fig, ax = plt.subplots()
+                self.data[column_to_visualize].value_counts().plot(kind='pie', ax=ax, autopct='%1.1f%%', textprops={'fontsize': 'small'})
+                ax.set_title(f'Pie Chart of {column_to_visualize}')
+                ax.set_ylabel('')
+                st.pyplot(fig)
+
+        elif plot_type == 'Scatter Plot':
+            left, right = st.columns(2)
+            with left:
+                x_col = st.selectbox('Choose values on X axis', self.data.select_dtypes(include=[np.number]).columns)
+            with right:
+                y_col = st.selectbox('Choose values on Y axis', self.data.select_dtypes(include=[np.number]).columns)
+            if x_col == y_col:
+                st.warning('Please select two different columns for scatter plot.')
+            else:
+                fig, ax = plt.subplots()
+                ax.scatter(self.data[x_col], self.data[y_col])
+                ax.set_title(f'Scatter Plot of {x_col} vs {y_col}')
+                ax.set_xlabel(x_col)
+                ax.set_ylabel(y_col)
+                st.pyplot(fig)
+
+        elif plot_type == 'Heatmap':
+            numeric_data = self.data.select_dtypes(include=[np.number])
+            corr = numeric_data.corr()
+            fig, ax = plt.subplots()
+            sns.heatmap(corr, annot=True, ax=ax)
+            ax.set_title('Correlation Heatmap')
+            st.pyplot(fig)
Modules/llm_summary.py CHANGED
@@ -1,3 +1,4 @@
+import streamlit as st
 from litellm import completion
 from dotenv import load_dotenv
 import os
@@ -6,7 +7,7 @@ import pandas as pd
 load_dotenv() # take environment variables from .env.
 os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
 
-
+@st.cache_data(experimental_allow_widgets=True)
 def LLM_summary():
     file_path = './data.csv'
     df = pd.read_csv(file_path)
@@ -21,16 +22,23 @@ def LLM_summary():
 
     # Get number of rows and columns
     num_rows, num_cols = df.shape
-
+    unique_values_info = []
+    example_values_info = []
+    for col in df.columns:
+        unique_values = df[col].unique()
+        unique_values_info.append(f"{col}: {len(unique_values)} unique values")
+        example_values = df[col].head(5).tolist() # Get first 5 values as examples
+        example_values_info.append(f"{col}: {example_values}")
+
     # Construct the dataset information string
     info_string = f"Dataset Information:\n"
    info_string += f"Columns: {column_names}\n"
    info_string += f"Data Types: {data_types}\n"
    info_string += f"Number of Rows: {num_rows}\n"
    info_string += f"Number of Columns: {num_cols}\n"
+    info_string += f"Unique Values per Column: {'; '.join(unique_values_info)}\n"
+    info_string += f"Example Values per Column: {'; '.join(example_values_info)}\n"
 
-
-
     message = f'''
     You are a data analyser agent working with a given dataset.
    Below is the info about the dataset -
@@ -39,8 +47,8 @@ def LLM_summary():
    ========
 
    Your task -
-    Write a summary report of the dataset. You have to explain what the dataset is about.
-    You have to tell point-wise insights could be gained from the dataset
+    Write a detailed and beautiful summary report of the dataset. You have to explain what the dataset is about.
+    You also have to questions that could be asked regarding the dataset so that we could gain some insights.
 
 
    Do not infer any data based on previous training, strictly use only source text given below as input.
@@ -53,6 +61,6 @@ def LLM_summary():
        ]
    )
 
-    return output.choices[0].message.content
+    st.write(output.choices[0].message.content)
 
 
Modules/output_interpreter.py ADDED
@@ -0,0 +1,27 @@
+from litellm import completion
+from dotenv import load_dotenv
+import os
+
+load_dotenv() # take environment variables from .env.
+
+def read_file(filename):
+    try:
+        with open(filename, "r") as file:
+            data = file.read()
+        return data
+    except Exception as e:
+        return f"Error: {str(e)}"
+
+def output_interpreter(query):
+
+    os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
+    data = read_file("data.txt")
+    output = completion(
+        model="gemini/gemini-pro",
+        messages=[
+            {"role": "user", "content": f"You are a data analyst. you were given a query - {query}\n After a python code to get the answer to query you got the following info - {data}. Summarize your findings and write a proper answer for the query."},
+        ]
+    )
+
+    response = output.choices[0].message.content
+    return response
app.py CHANGED
@@ -14,6 +14,7 @@ from Modules.data_transformer import DataTransformer
 from Modules.data_visualizer import DataVisualizer
 from Modules.data_QA import DataQA
 from Modules.MLtoolkit import MLToolkit
+from Modules.llm_summary import LLM_summary
 
 
 #---SKLEARN-IMPORT---
@@ -45,12 +46,14 @@ def main():
         data = pd.read_csv("data.csv")
         data_analyzer = DataAnalyzer(data)
         data_analyzer.show_eda()
+        LLM_summary()
+
         data_analyzer.show_count_plots()
 
         data_visualizer = DataVisualizer(data)
         data_visualizer.suggestions()
-        data_visualizer.generate_viz()
-        # data_visualizer.visualize_data()
+        # data_visualizer.generate_viz()
+        data_visualizer.visualize_data()
 
     # --- DATA CLEANING ---
     if selected == "Data Cleaning":
@@ -68,7 +71,7 @@ def main():
     if selected == "Q/A":
         try:
             data_QA = DataQA()
-            data_QA.ask_csv()
+            data_QA.answer_query()
         except Exception as e:
             # Handle the exception (e.g., logging, printing an error message, etc.)
             print(f"An error occurred: {e}")