Spaces:

AtharvaThakur
/

Insights

Sleeping

App Files Files Community

Atharva Thakur committed on May 26, 2024

Commit

3a7810d

1 Parent(s): ce41758

QA module added

Browse files

Files changed (10) hide show

.gitignore +2 -1
Experimentation/dataCodeTest.py +7 -6
Modules/code_debugger.py +21 -0
Modules/code_runner.py +22 -0
Modules/data_QA.py +74 -12
Modules/{data_code_run.py → data_code_gen.py} +49 -16
Modules/data_visualizer.py +74 -6
Modules/llm_summary.py +15 -7
Modules/output_interpreter.py +27 -0
app.py +6 -3

.gitignore CHANGED Viewed

@@ -15,7 +15,8 @@ original_data.csv
 #code files
 code.py
 data.pdf
 #Env variables
 .env
 # Distribution / packaging

 #code files
 code.py
 data.pdf
+data.txt
+file.pdf
 #Env variables
 .env
 # Distribution / packaging

Experimentation/dataCodeTest.py CHANGED Viewed

@@ -4,19 +4,20 @@ import pandas as pd
 sys.path.append("..")
-from Modules.data_code_run import DataCodeRun
 # data = pd.read_csv("test_data.csv")
-code_runner = DataCodeRun()
-message = '''generate the code to find the relation between 'Air temperature [K]' and 'Target' columns of the given dataset.
-The 'Target' column holds failure prediction values as 0 (no failure) and 1 (failure). the name of the dataset is test_data.csv .'''
 response= code_runner.generate_code(message)
 # print("Response:", response)
-plan, python_code = code_runner.extract_code(response)
-print(python_code)

 sys.path.append("..")
+from Modules.data_code_gen import DataCodeGen
+from Modules.python_interpreter import PythonInterpreter, run_interpreter
 # data = pd.read_csv("test_data.csv")
+code_runner = DataCodeGen()
+message = "give me a estimate of how many had a failure of any kind"
 response= code_runner.generate_code(message)
 # print("Response:", response)
+python_code = code_runner.extract_code(response)
+interpreter_code_output = run_interpreter(python_code)
+print("Python code output:\n", interpreter_code_output)

Modules/code_debugger.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from litellm import completion
+from dotenv import load_dotenv
+import os
+load_dotenv()  # take environment variables from .env.
+def code_debugger(python_code,error_message):
+    """Ask the Gemini model (through litellm) to repair a failing Python program.
+
+    Args:
+        python_code: Source text of the program that failed.
+        error_message: Error output reported when the program was run.
+
+    Returns:
+        The text content of the model's first reply choice. The prompt asks
+        for the corrected program inside a single ```python fenced block.
+
+    Raises:
+        RuntimeError: If neither GOOGLE_API_KEY nor GEMINI_API_KEY is set.
+    """
+    # os.environ only accepts strings, so assigning os.getenv(...) directly
+    # raises TypeError when GOOGLE_API_KEY is missing. Copy it only when set.
+    api_key = os.getenv("GOOGLE_API_KEY")
+    if api_key:
+        os.environ['GEMINI_API_KEY'] = api_key
+    elif not os.getenv("GEMINI_API_KEY"):
+        raise RuntimeError("GOOGLE_API_KEY is not set; cannot call the Gemini API.")
+    output = completion(
+        model="gemini/gemini-pro",
+        messages=[
+                # The fence named here must be the backtick fence: the assistant
+                # turn below promises it, so both turns now agree.
+                {"role": "user", "content": "You are a computer with the ability to run any code you want when you are given a prompt and return a response with a plan of what code you want to run. You should start your response with the python program, The commands you provide should be in a single code block encapsulated in ```python and ``` for Python and should be valid Python programs."},
+                {"role": "assistant", "content": "I am a computer with the ability to run any code I want when I am given a prompt and return a response with a python program. I will start my response with python program. The commands I provide should be in a single code block encapulated in ```python and ``` and should be a valid Python program."},
+                {"role": "user", "content": "Your are given a python code that has an error. you have to solve that error"},
+                {"role": "assistant", "content": "my job is write the correct python code to solve the error."},
+                {"role": "user", "content": f"Here is the python code and the associated error\n python code:-{python_code} \n error message:- {error_message}"},
+            ]
+    )
+    response = output.choices[0].message.content
+    return response

Modules/code_runner.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import subprocess
+def run_script(script_path='code.py', timeout=None):
+    """Run a Python script in a child process and report how it went.
+
+    Args:
+        script_path: Path of the script to execute. Defaults to 'code.py'.
+        timeout: Optional limit in seconds passed to subprocess.run; None
+            (the default) waits indefinitely.
+
+    Returns:
+        A ``(status, message)`` tuple:
+        ``(0, stdout)`` when the script exits with return code 0,
+        ``(1, stderr)`` when it exits with any other return code,
+        ``(2, error text)`` when the script could not be run at all
+        (including a timeout).
+    """
+    import sys  # local import so the block does not depend on file-level imports
+
+    try:
+        # Run the script with the interpreter that is running this process,
+        # so it sees the same installed packages; a bare 'python' may be
+        # missing from PATH or point at a different environment.
+        result = subprocess.run(
+            [sys.executable or 'python', script_path],
+            capture_output=True,  # Capture stdout and stderr
+            text=True,            # Get the output as string
+            timeout=timeout,
+        )
+    except Exception as e:
+        # Deliberately broad: this function reports failures through its
+        # return value and never raises to the caller.
+        print(f"Failed to run the script: {e}")
+        # Return text, like the other two outcomes, rather than the exception object.
+        return 2,str(e)
+    # Check the return code to determine if an error occurred
+    if result.returncode != 0:
+        print("Script failed with error:")
+        print(result.stderr)
+        return 1,result.stderr
+    print("Script completed successfully:")
+    print(result.stdout)
+    return 0,result.stdout

Modules/data_QA.py CHANGED Viewed

@@ -4,15 +4,18 @@ from langchain_experimental.agents import create_csv_agent
 import pandas as pd
 from dotenv import load_dotenv
 import os
-from Modules.data_code_run import DataCodeRun
 from Modules.python_interpreter import PythonInterpreter, run_interpreter
 load_dotenv()  # take environment variables from .env.
 class DataQA:
     def __init__(self):
-        print("dataQA")
     # def ask_csv(self):
     #     GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
     #     llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
@@ -22,14 +25,73 @@ class DataQA:
     #         response = csv_agent.invoke(question)
     #         st.write(response)
-    def ask_csv(self):
-        question = st.text_input("Ask your question:")
-        code_runner = DataCodeRun()
-        if question:
-            response= code_runner.generate_code(question)
-            plan, python_code = code_runner.extract_code(response)
-            st.write(plan)
             st.code(python_code)
             if st.button("Run Code") and python_code:
-                interpreter_code_output = run_interpreter(python_code)
-                print("Python code output:\n", interpreter_code_output)

 import pandas as pd
 from dotenv import load_dotenv
 import os
+from Modules.code_runner import run_script
+from Modules.code_debugger import code_debugger
+from Modules.output_interpreter import output_interpreter
+from Modules.data_code_gen import DataCodeGen
 from Modules.python_interpreter import PythonInterpreter, run_interpreter
+import subprocess
 load_dotenv()  # take environment variables from .env.
 class DataQA:
     def __init__(self):
+        pass
     # def ask_csv(self):
     #     GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
     #     llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
     #         response = csv_agent.invoke(question)
     #         st.write(response)
+    # def ask_csv(self):
+    #     question = st.text_input("Ask your question:")
+    #     code_runner = DataCodeRun()
+    #     if question:
+    #         response= code_runner.generate_code(question)
+    #         plan, python_code = code_runner.extract_code(response)
+    #         st.write(plan)
+    #         st.code(python_code)
+    #         if st.button("Run Code") and python_code:
+    #             interpreter_code_output = run_interpreter(python_code)
+    #             print("Python code output:\n", interpreter_code_output)
+    # @st.cache_data(experimental_allow_widgets=True)
+    def answer_query(self):
+        """Drive the Q/A page: turn a typed question into code, run it, show the answer.
+
+        Flow, as implemented below:
+        1. Read the question from a Streamlit text input.
+        2. Ask DataCodeGen for a response, extract the Python code, display it.
+        3. When the "Run Code" button is pressed, write the code to code.py and
+           run it with run_script(); on failure ask code_debugger() for a fix
+           and retry, up to 5 attempts.
+        4. Pass the question to output_interpreter() and display its answer,
+           then delete code.py.
+        """
+        query = st.text_input("Ask your question:")
+        if query:
+        # message = '''generate the code to find the relation between 'Air temperature [K]' and 'Target' columns of the given dataset. The 'Target' column holds failure prediction values as 0 (no failure) and 1 (failure). the name of the dataset is test_data.csv .
+        # '''
+        # get the info about the dataset.
+        # call to code gen
+            code_gen = DataCodeGen()
+            response = code_gen.generate_code(query)
+            # st.write(response)
+            python_code = code_gen.extract_code(response)
             st.code(python_code)
+            # Helper: persist the code to code.py, then execute it.
+            # Returns run_script()'s (status, message) tuple, or (1, error text)
+            # when the file could not be written.
+            def save_and_run_code(python_code):
+                try:
+                    with open("code.py", "w") as f:
+                        f.write(python_code)
+                    print("Python code saved as code.py")
+                except Exception as e:
+                    print("Error:", str(e))
+                    return 1, str(e)  # Return an error code and message
+                return run_script()
+            # Helper: thin wrapper that forwards to code_debugger().
+            def debug_code(python_code, error_message):
+                return code_debugger(python_code, error_message)
             if st.button("Run Code") and python_code:
+                # Maximum number of attempts
+                max_attempts = 5
+                attempts = 0
+                while attempts < max_attempts:
+                    return_code, return_message = save_and_run_code(python_code)
+                    if return_code == 0:
+                        print("Code executed successfully!")
+                        break  # Exit the loop if the code runs without errors
+                    # If there was an error, debug the code
+                    response = debug_code(python_code, return_message)
+                    python_code = code_gen.extract_code(response)
+                    attempts += 1  # Increment the attempts counter
+                    st.write(python_code)  # Display the corrected code
+                # attempts only reaches max_attempts when every run failed,
+                # because a successful run breaks out before the increment.
+                if attempts == max_attempts:
+                    print("Exceeded maximum number of attempts. The code could not be executed successfully.")
+                # Process final output
+                # NOTE(review): this runs even when all attempts failed — confirm
+                # that interpreting the output in that case is intended.
+                answer = output_interpreter(query)
+                st.write(answer)
+                # Clean up by removing the code file
+                # NOTE(review): os.remove raises if code.py does not exist, e.g.
+                # if the write in save_and_run_code failed — verify.
+                os.remove("code.py")

Modules/{data_code_run.py → data_code_gen.py} RENAMED Viewed

@@ -3,26 +3,62 @@ from litellm import completion
 from dotenv import load_dotenv
 import os
 from Modules.python_interpreter import PythonInterpreter, run_interpreter
 load_dotenv()  # take environment variables from .env.
-class DataCodeRun:
     def __init__(self):
         pass
-    @st.cache_data(experimental_allow_widgets=True)
-    def generate_code(_,message):
-        os.environ['GEMINI_API_KEY'] = "AIzaSyAPlmL2oeRaldWRf2viQINPd92_vm3QN6o"
-        # message = '''generate the code to find the relation between 'Air temperature [K]' and 'Target' columns of the given dataset. The 'Target' column holds failure prediction values as 0 (no failure) and 1 (failure). the name of the dataset is test_data.csv .
-        # '''
         output = completion(
             model="gemini/gemini-pro",
             messages=[
-                    {"role": "user", "content": "You are a computer with the ability to run any code you want when you are given a prompt and return a response with a plan of what code you want to run. You should start your response with a plan that describes what the code is going do in detail, The commands you provide should be in a single code block encapsulated in '''python and ''' for Python and should be valid Python programs."},
-                    {"role": "assistant", "content": "I am a computer with the ability to run any code I want when I am given a prompt and return a response with a plan of what code I want to run I will start my response with a plan that would be encapsulated in ```plan and ```. Afterwards, The commands I provide should be in a single code block encapulated in ```python and ``` and should be a valid Python program."},
-                    {"role": "user", "content": message}
                 ]
         )
@@ -31,9 +67,6 @@ class DataCodeRun:
     def extract_code(self,response):
-        plan = response.split("```python")[0]
-        plan = plan.replace("'", "")
-        plan = plan.replace('`', "")
         # else:
         #     print(response.choices[0].message.content)
         #     # Extract plan from the response
@@ -44,15 +77,15 @@ class DataCodeRun:
         if "```python" in response:
             python_code = response.split("```python")[1].split("```")[0].strip()
-            return plan,python_code
         elif "```" in response:
             python_code = response.split("```")[1].split("```")[0].strip()
             print("Code found in the response but not Left out the word python:", python_code)
-            return plan,python_code
         elif "```python" in response.choices[0].message.content:
             python_code = response.choices[0].message.content.split(
                 "```python")[1].split("```")[0].strip()
-            return plan,python_code
         # if python_code:

 from dotenv import load_dotenv
 import os
 from Modules.python_interpreter import PythonInterpreter, run_interpreter
+import pandas as pd
 load_dotenv()  # take environment variables from .env.
+class DataCodeGen:
     def __init__(self):
         pass
+    def get_data_info(self, file_path='./data.csv'):
+        """Build a plain-text description of a CSV dataset.
+
+        Args:
+            file_path: Path of the CSV file to describe. Defaults to
+                './data.csv'.
+
+        Returns:
+            A multi-line string listing the file path, column names, column
+            data types, row and column counts, and the number of unique
+            values in each column.
+        """
+        df = pd.read_csv(file_path)
+        column_names = ", ".join(df.columns.tolist())
+        data_types = ", ".join(f"{col}: {dtype}" for col, dtype in df.dtypes.items())
+        num_rows, num_cols = df.shape
+        # Only the unique-value counts appear in the output, so nothing else
+        # is computed per column.
+        unique_values_info = "; ".join(
+            f"{col}: {len(df[col].unique())} unique values" for col in df.columns
+        )
+        # Construct the dataset information string
+        return (
+            "Dataset Information:\n"
+            f"Dataset file path: {file_path}\n"
+            f"Columns: {column_names}\n"
+            f"Data Types: {data_types}\n"
+            f"Number of Rows: {num_rows}\n"
+            f"Number of Columns: {num_cols}\n"
+            f"Unique Values per Column: {unique_values_info}\n"
+        )
+    @st.cache_data(experimental_allow_widgets=True)
+    def generate_code(_self,query):
+        os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
+        data_info= _self.get_data_info()
         output = completion(
             model="gemini/gemini-pro",
             messages=[
+                    {"role": "user", "content": "You are a data analyst with the ability to run any code you want when you are given a prompt and return a response with a plan of what code you want to run. You should start your response with the python program, The commands you provide should be in a single code block encapsulated in '''python and ''' for Python and should be valid Python programs."},
+                    {"role": "assistant", "content": "I am a data analyst with the ability to run any code I want when I am given a prompt and return a response with a python program. I will start my response with python program. The commands I provide should be in a single code block encapulated in ```python and ``` and should be a valid Python program."},
+                    {"role": "user", "content": "Your job is write the python code the answer for the given query regarding a dataset. The python should find the correct answer the query, also generate a visualization if necessary and store it in `file.pdf`. Store the answer to query and information regarding the visualization in `data.txt`. Even if the given task is to plot a graph you have to include textual information regarding the graphs like the labels and values in `data.txt`."},
+                    {"role": "assistant", "content": "My job is write the python code that will find the answer for the given query regarding a dataset. The python should find the correct answer the query, also generate a visualization if necessary and store it in `file.pdf`. I have to store the answer to query along with label and value shown in the visualization in `data.txt`. Even if I have to just plot a graph I will include textual information regarding the graphs like the labels and values in `data.txt`."},
+                    {"role": "user", "content": f"Here is some information about the dataset.\n {data_info}"},
+                    {"role": "user", "content": f"Given query - {query}"},
                 ]
         )
     def extract_code(self,response):
         # else:
         #     print(response.choices[0].message.content)
         #     # Extract plan from the response
         if "```python" in response:
             python_code = response.split("```python")[1].split("```")[0].strip()
+            return python_code
         elif "```" in response:
             python_code = response.split("```")[1].split("```")[0].strip()
             print("Code found in the response but not Left out the word python:", python_code)
+            return python_code
         elif "```python" in response.choices[0].message.content:
             python_code = response.choices[0].message.content.split(
                 "```python")[1].split("```")[0].strip()
+            return python_code
         # if python_code:

Modules/data_visualizer.py CHANGED Viewed

@@ -1,6 +1,9 @@
 import streamlit as st
 import re
 from litellm import completion
 from dotenv import load_dotenv
 import os
@@ -13,17 +16,18 @@ class DataVisualizer:
         self.data = data
         st.subheader("Data Visualizer")
-    def suggestions(self):
         message = f'''
         You are a data analyst working with a given dataset. Below is the information about the dataset:
         ========
-        {self.data.describe(include='all')}
         ========
         Here is a sample of the data:
-        {self.data.head()}
-        Number of rows in the dataset: {self.data.shape[0]}
         Your task:
         Suggest 5 visualizations that can be made in bullet points
@@ -73,4 +77,68 @@ class DataVisualizer:
             try:
                 exec(code_block)
             except Exception as e:
-                print(e)

 import streamlit as st
 import re
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
 from litellm import completion
 from dotenv import load_dotenv
 import os
         self.data = data
         st.subheader("Data Visualizer")
+    @st.cache_data(experimental_allow_widgets=True)
+    def suggestions(_self):
         message = f'''
         You are a data analyst working with a given dataset. Below is the information about the dataset:
         ========
+        {_self.data.describe(include='all')}
         ========
         Here is a sample of the data:
+        {_self.data.head()}
+        Number of rows in the dataset: {_self.data.shape[0]}
         Your task:
         Suggest 5 visualizations that can be made in bullet points
             try:
                 exec(code_block)
             except Exception as e:
+                print(e)
+    def visualize_data(self):
+        """Render one user-selected plot of ``self.data`` in the Streamlit app.
+
+        The user first picks a plot type, then the column(s) to draw.
+        Histogram, box plot, scatter plot and heatmap use the numeric
+        columns; the pie chart uses the ``object``-dtype columns. When no
+        suitable column exists a warning is shown instead of a plot.
+        """
+        plot_type = st.selectbox('Choose a type of plot', ['Histogram', 'Box Plot', 'Pie Chart', 'Scatter Plot', 'Heatmap'])
+        if plot_type == 'Pie Chart':
+            nonnumeric_columns = self.data.select_dtypes(include=['object']).columns
+            if nonnumeric_columns.empty:
+                st.warning('No non numeric columns in the data to visualize.')
+                return
+            column_to_visualize = st.selectbox('Choose a column to visualize', nonnumeric_columns)
+            fig, ax = plt.subplots()
+            self.data[column_to_visualize].value_counts().plot(kind='pie', ax=ax, autopct='%1.1f%%', textprops={'fontsize': 'small'})
+            ax.set_title(f'Pie Chart of {column_to_visualize}')
+            ax.set_ylabel('')
+            self._render_figure(fig)
+            return
+        # Every remaining plot type needs numeric data. Checking once here also
+        # protects the scatter plot and heatmap, which would otherwise be built
+        # from an empty selection.
+        numeric_columns = self.data.select_dtypes(include=[np.number]).columns
+        if numeric_columns.empty:
+            st.warning('No numeric columns in the data to visualize.')
+            return
+        if plot_type == 'Histogram':
+            column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
+            fig, ax = plt.subplots()
+            ax.hist(self.data[column_to_visualize])
+            ax.set_title(f'Histogram of {column_to_visualize}')
+            ax.set_xlabel(column_to_visualize)
+            ax.set_ylabel('Frequency')
+            self._render_figure(fig)
+        elif plot_type == 'Box Plot':
+            column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
+            fig, ax = plt.subplots()
+            # Missing values are dropped before the box plot is drawn.
+            ax.boxplot(self.data[column_to_visualize].dropna())
+            ax.set_title(f'Box Plot of {column_to_visualize}')
+            ax.set_ylabel(column_to_visualize)
+            self._render_figure(fig)
+        elif plot_type == 'Scatter Plot':
+            left, right = st.columns(2)
+            with left:
+                x_col = st.selectbox('Choose values on X axis', numeric_columns)
+            with right:
+                y_col = st.selectbox('Choose values on Y axis', numeric_columns)
+            if x_col == y_col:
+                st.warning('Please select two different columns for scatter plot.')
+            else:
+                fig, ax = plt.subplots()
+                ax.scatter(self.data[x_col], self.data[y_col])
+                ax.set_title(f'Scatter Plot of {x_col} vs {y_col}')
+                ax.set_xlabel(x_col)
+                ax.set_ylabel(y_col)
+                self._render_figure(fig)
+        elif plot_type == 'Heatmap':
+            corr = self.data.select_dtypes(include=[np.number]).corr()
+            fig, ax = plt.subplots()
+            sns.heatmap(corr, annot=True, ax=ax)
+            ax.set_title('Correlation Heatmap')
+            self._render_figure(fig)
+    @staticmethod
+    def _render_figure(fig):
+        """Show *fig* in the app, then close it so figures do not pile up in pyplot."""
+        try:
+            st.pyplot(fig)
+        finally:
+            # Figures created with plt.subplots() stay registered with pyplot
+            # until they are closed explicitly.
+            plt.close(fig)

Modules/llm_summary.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from litellm import completion
 from dotenv import load_dotenv
 import os
@@ -6,7 +7,7 @@ import pandas as pd
 load_dotenv()  # take environment variables from .env.
 os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
 def LLM_summary():
     file_path = './data.csv'
     df = pd.read_csv(file_path)
@@ -21,16 +22,23 @@ def LLM_summary():
     # Get number of rows and columns
     num_rows, num_cols = df.shape
     # Construct the dataset information string
     info_string = f"Dataset Information:\n"
     info_string += f"Columns: {column_names}\n"
     info_string += f"Data Types: {data_types}\n"
     info_string += f"Number of Rows: {num_rows}\n"
     info_string += f"Number of Columns: {num_cols}\n"
     message = f'''
     You are a data analyser agent working with a given dataset.
     Below is the info about the dataset -
@@ -39,8 +47,8 @@ def LLM_summary():
     ========
     Your task -
-    Write a summary report of the dataset. You have to explain what the dataset is about.
-    You have to tell point-wise insights could be gained from the dataset
     Do not infer any data based on previous training, strictly use only source text given below as input.
@@ -53,6 +61,6 @@ def LLM_summary():
             ]
     )
-    return output.choices[0].message.content

+import streamlit as st
 from litellm import completion
 from dotenv import load_dotenv
 import os
 load_dotenv()  # take environment variables from .env.
 os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
+@st.cache_data(experimental_allow_widgets=True)
 def LLM_summary():
     file_path = './data.csv'
     df = pd.read_csv(file_path)
     # Get number of rows and columns
     num_rows, num_cols = df.shape
+    unique_values_info = []
+    example_values_info = []
+    for col in df.columns:
+            unique_values = df[col].unique()
+            unique_values_info.append(f"{col}: {len(unique_values)} unique values")
+            example_values = df[col].head(5).tolist()  # Get first 5 values as examples
+            example_values_info.append(f"{col}: {example_values}")
     # Construct the dataset information string
     info_string = f"Dataset Information:\n"
     info_string += f"Columns: {column_names}\n"
     info_string += f"Data Types: {data_types}\n"
     info_string += f"Number of Rows: {num_rows}\n"
     info_string += f"Number of Columns: {num_cols}\n"
+    info_string += f"Unique Values per Column: {'; '.join(unique_values_info)}\n"
+    info_string += f"Example Values per Column: {'; '.join(example_values_info)}\n"
     message = f'''
     You are a data analyser agent working with a given dataset.
     Below is the info about the dataset -
     ========
     Your task -
+    Write a detailed and beautiful summary report of the dataset. You have to explain what the dataset is about.
+    You also have to questions that could be asked regarding the dataset so that we could gain some insights.
     Do not infer any data based on previous training, strictly use only source text given below as input.
             ]
     )
+    st.write(output.choices[0].message.content)

Modules/output_interpreter.py ADDED Viewed

	@@ -0,0 +1,27 @@

+from litellm import completion
+from dotenv import load_dotenv
+import os
+load_dotenv()  # take environment variables from .env.
+def read_file(filename):
+    """Return the full text of *filename*.
+
+    If the file cannot be opened or read, the exception is not raised;
+    a string of the form "Error: <exception text>" is returned instead.
+    """
+    try:
+        with open(filename, "r") as handle:
+            contents = handle.read()
+    except Exception as exc:
+        return f"Error: {str(exc)}"
+    return contents
+def output_interpreter(query):
+    """Ask the Gemini model (through litellm) to phrase an answer to *query*.
+
+    The contents of data.txt are read with read_file() and placed in the
+    prompt together with the query.
+
+    Args:
+        query: The user's question.
+
+    Returns:
+        The text content of the model's first reply choice.
+
+    Raises:
+        RuntimeError: If neither GOOGLE_API_KEY nor GEMINI_API_KEY is set.
+    """
+    # os.environ only accepts strings, so assigning os.getenv(...) directly
+    # raises TypeError when GOOGLE_API_KEY is missing. Copy it only when set.
+    api_key = os.getenv("GOOGLE_API_KEY")
+    if api_key:
+        os.environ['GEMINI_API_KEY'] = api_key
+    elif not os.getenv("GEMINI_API_KEY"):
+        raise RuntimeError("GOOGLE_API_KEY is not set; cannot call the Gemini API.")
+    # NOTE(review): read_file returns an "Error: ..." string when data.txt is
+    # unreadable, and that string is sent to the model as if it were data.
+    data = read_file("data.txt")
+    output = completion(
+        model="gemini/gemini-pro",
+        messages=[
+                {"role": "user", "content": f"You are a data analyst. you were given a query - {query}\n After a python code to get the answer to query you got the following info - {data}. Summarize your findings and write a proper answer for the query."},
+            ]
+    )
+    response = output.choices[0].message.content
+    return response

app.py CHANGED Viewed

@@ -14,6 +14,7 @@ from Modules.data_transformer import DataTransformer
 from Modules.data_visualizer import DataVisualizer
 from Modules.data_QA import DataQA
 from Modules.MLtoolkit import MLToolkit
 #---SKLEARN-IMPORT---
@@ -45,12 +46,14 @@ def main():
             data = pd.read_csv("data.csv")
             data_analyzer = DataAnalyzer(data)
             data_analyzer.show_eda()
             data_analyzer.show_count_plots()
             data_visualizer = DataVisualizer(data)
             data_visualizer.suggestions()
-            data_visualizer.generate_viz()
-            # data_visualizer.visualize_data()
         # --- DATA CLEANING ---
         if selected == "Data Cleaning":
@@ -68,7 +71,7 @@ def main():
         if selected == "Q/A":
             try:
                 data_QA = DataQA()
-                data_QA.ask_csv()
             except Exception as e:
                 # Handle the exception (e.g., logging, printing an error message, etc.)
                 print(f"An error occurred: {e}")

 from Modules.data_visualizer import DataVisualizer
 from Modules.data_QA import DataQA
 from Modules.MLtoolkit import MLToolkit
+from Modules.llm_summary import LLM_summary
 #---SKLEARN-IMPORT---
             data = pd.read_csv("data.csv")
             data_analyzer = DataAnalyzer(data)
             data_analyzer.show_eda()
+            LLM_summary()
             data_analyzer.show_count_plots()
             data_visualizer = DataVisualizer(data)
             data_visualizer.suggestions()
+            # data_visualizer.generate_viz()
+            data_visualizer.visualize_data()
         # --- DATA CLEANING ---
         if selected == "Data Cleaning":
         if selected == "Q/A":
             try:
                 data_QA = DataQA()
+                data_QA.answer_query()
             except Exception as e:
                 # Handle the exception (e.g., logging, printing an error message, etc.)
                 print(f"An error occurred: {e}")