Spaces:

AtharvaThakur
/

Insights

Sleeping

App Files Files Community

Atharva Thakur committed on Apr 4, 2024

Commit

e3fe4bf

1 Parent(s): b42e2e7

Some more experiments with dataparty

Browse files

Files changed (5) hide show

code.py +10 -18
data_code_run.py +4 -3
test.py +30 -24
test2.py +2 -2
test3.py +4 -0

code.py CHANGED Viewed

@@ -1,24 +1,16 @@
-import matplotlib.pyplot as plt
 import seaborn as sns
-from sklearn.datasets import load_iris
-# Load the iris dataset
-iris = load_iris()
-# Visualize the distribution of sepal length
-sns.histplot(iris['data'][:, 0])
-plt.xlabel('Sepal Length')
-plt.ylabel('Count')
-plt.title('Distribution of Sepal Length')
-plt.show()
-# Visualize the relationship between sepal length and sepal width
-sns.scatterplot(iris['data'][:, 0], iris['data'][:, 1])
-plt.xlabel('Sepal Length')
-plt.ylabel('Sepal Width')
-plt.title('Relationship between Sepal Length and Sepal Width')
 plt.show()
-# Visualize the relationship between sepal length, sepal width, and petal length
-sns.pairplot(iris['data'], hue=iris['target'])
-plt.show()

+import pandas as pd
 import seaborn as sns
+import matplotlib.pyplot as plt
+# Load the dataset
+df = pd.read_csv('test_data.csv')
+# Check the correlation between 'Air temperature [K]' and 'Target'
+corr = df['Air temperature [K]'].corr(df['Target'])
+# Plot the scatter plot
+sns.scatterplot(x='Air temperature [K]', y='Target', data=df)
 plt.show()
+# Print the correlation coefficient
+print('Correlation coefficient:', corr)

data_code_run.py CHANGED Viewed

@@ -8,13 +8,14 @@ from python_interpreter import PythonInterpreter, run_interpreter
 load_dotenv()  # take environment variables from .env.
 class DataCodeRun:
-    def __init__(self, data):
-        self.data = data
     def run_code(self):
         os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
-        message = "write code to plot visualizations for iris dataset"
         output = completion(
             model="gemini/gemini-pro",
             messages=[

 load_dotenv()  # take environment variables from .env.
 class DataCodeRun:
+    def __init__(self):
+        pass
     def run_code(self):
         os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
+        message = '''generate the code to find the relation between 'Air temperature [K]' and 'Target' columns of the given dataset. The 'Target' column holds failure prediction values as 0 (no failure) and 1 (failure). the name of the dataset is test_data.csv .
+        '''
         output = completion(
             model="gemini/gemini-pro",
             messages=[

test.py CHANGED Viewed

@@ -9,41 +9,47 @@ load_dotenv()  # take environment variables from .env.
 os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
 file_path = './test_data.csv'
-data = pd.read_csv(file_path)
-string_data= data.to_string(index=False)
-data_info = '''Machine Predictive Maintenance Classification Dataset
-Since real predictive maintenance datasets are generally difficult to obtain and in particular difficult to publish, we present and provide a synthetic dataset that reflects real predictive maintenance encountered in the industry to the best of our knowledge.
-The dataset consists of 100 data points stored as rows with 14 features in columns
-UID: unique identifier ranging from 1 to 100
-productID: consisting of a letter L, M, or H for low (50% of all products), medium (30%), and high (20%) as product quality variants and a variant-specific serial number
-air temperature [K]: generated using a random walk process later normalized to a standard deviation of 2 K around 300 K
-process temperature [K]: generated using a random walk process normalized to a standard deviation of 1 K, added to the air temperature plus 10 K.
-rotational speed [rpm]: calculated from powepower of 2860 W, overlaid with a normally distributed noise
-torque [Nm]: torque values are normally distributed around 40 Nm with an Ïƒ = 10 Nm and no negative values.
-tool wear [min]: The quality variants H/M/L add 5/3/2 minutes of tool wear to the used tool in the process. and a
-'machine failure' label that indicates, whether the machine has failed in this particular data point for any of the following failure modes are true.'''
 # print(string_data)
 message = f'''
 You are a data analyser agent working with a given dataset.
 Below is the info about the dataset -
 ========
-{data_info}
 ========
 Your task -
-give me the percentage of no failures.
 Do not infer any data based on previous training, strictly use only source text given below as input.
-========
-{string_data}
-========
 '''
 output = completion(
     model="gemini/gemini-pro",

 os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
 file_path = './test_data.csv'
+df = pd.read_csv(file_path)
+string_data= df.to_string(index=False)
+# Get column names
+column_names = ", ".join(df.columns.tolist())
+# Get data types
+data_types = ", ".join([f"{col}: {dtype}" for col, dtype in df.dtypes.items()])
+# Get number of rows and columns
+num_rows, num_cols = df.shape
+# Construct the dataset information string
+info_string = f"Dataset Information:\n"
+info_string += f"Columns: {column_names}\n"
+info_string += f"Data Types: {data_types}\n"
+info_string += f"Number of Rows: {num_rows}\n"
+info_string += f"Number of Columns: {num_cols}\n"
 # print(string_data)
+request = "I want find relation between Air Temperature and Target"
 message = f'''
 You are a data analyser agent working with a given dataset.
 Below is the info about the dataset -
 ========
+{info_string}
 ========
 Your task -
+write a proper prompt to tell another agent to generate code to fulfill the below request by the user.
+You have to give all the details about the columns involved and only the required info about the dataset needed to fulfil the request.
+failues are given as 0 and 1 in target column.
+Request :
+=======
+{request}
+=======
 Do not infer any data based on previous training, strictly use only source text given below as input.
 '''
 output = completion(
     model="gemini/gemini-pro",

test2.py CHANGED Viewed

@@ -10,8 +10,8 @@ load_dotenv()  # take environment variables from .env.
 GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
 llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
-csv_agent = create_csv_agent(llm,"data.csv", verbose=True)
-question = "number of row which have which have values greater than 0 in column Pregnancies"
 if question:
     response = csv_agent.run(question)
     print(response)

 GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
 llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
+csv_agent = create_csv_agent(llm,"test_data.csv", verbose=True)
+question = "what is the relation between air temperature and target"
 if question:
     response = csv_agent.run(question)
     print(response)

test3.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from data_code_run import DataCodeRun
+d = DataCodeRun()
+d.run_code()