Atharva Thakur committed on
Commit
e3fe4bf
1 Parent(s): b42e2e7

Some more experiments with dataparty

Browse files
Files changed (5) hide show
  1. code.py +10 -18
  2. data_code_run.py +4 -3
  3. test.py +30 -24
  4. test2.py +2 -2
  5. test3.py +4 -0
code.py CHANGED
@@ -1,24 +1,16 @@
1
- import matplotlib.pyplot as plt
2
  import seaborn as sns
3
- from sklearn.datasets import load_iris
4
 
5
- # Load the iris dataset
6
- iris = load_iris()
7
 
8
- # Visualize the distribution of sepal length
9
- sns.histplot(iris['data'][:, 0])
10
- plt.xlabel('Sepal Length')
11
- plt.ylabel('Count')
12
- plt.title('Distribution of Sepal Length')
13
- plt.show()
14
 
15
- # Visualize the relationship between sepal length and sepal width
16
- sns.scatterplot(iris['data'][:, 0], iris['data'][:, 1])
17
- plt.xlabel('Sepal Length')
18
- plt.ylabel('Sepal Width')
19
- plt.title('Relationship between Sepal Length and Sepal Width')
20
  plt.show()
21
 
22
- # Visualize the relationship between sepal length, sepal width, and petal length
23
- sns.pairplot(iris['data'], hue=iris['target'])
24
- plt.show()
 
1
+ import pandas as pd
2
  import seaborn as sns
3
+ import matplotlib.pyplot as plt
4
 
5
+ # Load the dataset
6
+ df = pd.read_csv('test_data.csv')
7
 
8
+ # Check the correlation between 'Air temperature [K]' and 'Target'
9
+ corr = df['Air temperature [K]'].corr(df['Target'])
 
 
 
 
10
 
11
+ # Plot the scatter plot
12
+ sns.scatterplot(x='Air temperature [K]', y='Target', data=df)
 
 
 
13
  plt.show()
14
 
15
+ # Print the correlation coefficient
16
+ print('Correlation coefficient:', corr)
 
data_code_run.py CHANGED
@@ -8,13 +8,14 @@ from python_interpreter import PythonInterpreter, run_interpreter
8
  load_dotenv() # take environment variables from .env.
9
 
10
  class DataCodeRun:
11
- def __init__(self, data):
12
- self.data = data
13
 
14
  def run_code(self):
15
  os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
16
 
17
- message = "write code to plot visualizations for iris dataset"
 
18
  output = completion(
19
  model="gemini/gemini-pro",
20
  messages=[
 
8
  load_dotenv() # take environment variables from .env.
9
 
10
  class DataCodeRun:
11
+ def __init__(self):
12
+ pass
13
 
14
  def run_code(self):
15
  os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
16
 
17
+ message = '''generate the code to find the relation between 'Air temperature [K]' and 'Target' columns of the given dataset. The 'Target' column holds failure prediction values as 0 (no failure) and 1 (failure). the name of the dataset is test_data.csv .
18
+ '''
19
  output = completion(
20
  model="gemini/gemini-pro",
21
  messages=[
test.py CHANGED
@@ -9,41 +9,47 @@ load_dotenv() # take environment variables from .env.
9
  os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
10
 
11
  file_path = './test_data.csv'
12
- data = pd.read_csv(file_path)
13
-
14
- string_data= data.to_string(index=False)
15
-
16
- data_info = '''Machine Predictive Maintenance Classification Dataset
17
- Since real predictive maintenance datasets are generally difficult to obtain and in particular difficult to publish, we present and provide a synthetic dataset that reflects real predictive maintenance encountered in the industry to the best of our knowledge.
18
-
19
- The dataset consists of 100 data points stored as rows with 14 features in columns
20
-
21
- UID: unique identifier ranging from 1 to 100
22
- productID: consisting of a letter L, M, or H for low (50% of all products), medium (30%), and high (20%) as product quality variants and a variant-specific serial number
23
- air temperature [K]: generated using a random walk process later normalized to a standard deviation of 2 K around 300 K
24
- process temperature [K]: generated using a random walk process normalized to a standard deviation of 1 K, added to the air temperature plus 10 K.
25
- rotational speed [rpm]: calculated from powepower of 2860 W, overlaid with a normally distributed noise
26
- torque [Nm]: torque values are normally distributed around 40 Nm with an σ = 10 Nm and no negative values.
27
- tool wear [min]: The quality variants H/M/L add 5/3/2 minutes of tool wear to the used tool in the process. and a
28
- 'machine failure' label that indicates, whether the machine has failed in this particular data point for any of the following failure modes are true.'''
 
 
29
 
30
  # print(string_data)
31
-
32
  message = f'''
33
  You are a data analyser agent working with a given dataset.
34
  Below is the info about the dataset -
35
 
36
  ========
37
- {data_info}
38
  ========
39
 
40
  Your task -
41
- give me the percentage of no failures.
42
-
 
 
 
 
 
 
43
  Do not infer any data based on previous training, strictly use only source text given below as input.
44
- ========
45
- {string_data}
46
- ========
47
  '''
48
  output = completion(
49
  model="gemini/gemini-pro",
 
9
  os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
10
 
11
  file_path = './test_data.csv'
12
+ df = pd.read_csv(file_path)
13
+
14
+ string_data= df.to_string(index=False)
15
+
16
+ # Get column names
17
+ column_names = ", ".join(df.columns.tolist())
18
+
19
+ # Get data types
20
+ data_types = ", ".join([f"{col}: {dtype}" for col, dtype in df.dtypes.items()])
21
+
22
+ # Get number of rows and columns
23
+ num_rows, num_cols = df.shape
24
+
25
+ # Construct the dataset information string
26
+ info_string = f"Dataset Information:\n"
27
+ info_string += f"Columns: {column_names}\n"
28
+ info_string += f"Data Types: {data_types}\n"
29
+ info_string += f"Number of Rows: {num_rows}\n"
30
+ info_string += f"Number of Columns: {num_cols}\n"
31
 
32
  # print(string_data)
33
+ request = "I want find relation between Air Temperature and Target"
34
  message = f'''
35
  You are a data analyser agent working with a given dataset.
36
  Below is the info about the dataset -
37
 
38
  ========
39
+ {info_string}
40
  ========
41
 
42
  Your task -
43
+ write a proper prompt to tell another agent to generate code to fulfill the below request by the user.
44
+ You have to give all the details about the columns involved and only the required info about the dataset needed to fulfil the request.
45
+ failues are given as 0 and 1 in target column.
46
+
47
+ Request :
48
+ =======
49
+ {request}
50
+ =======
51
  Do not infer any data based on previous training, strictly use only source text given below as input.
52
+
 
 
53
  '''
54
  output = completion(
55
  model="gemini/gemini-pro",
test2.py CHANGED
@@ -10,8 +10,8 @@ load_dotenv() # take environment variables from .env.
10
 
11
  GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
12
  llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
13
- csv_agent = create_csv_agent(llm,"data.csv", verbose=True)
14
- question = "number of row which have which have values greater than 0 in column Pregnancies"
15
  if question:
16
  response = csv_agent.run(question)
17
  print(response)
 
10
 
11
  GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
12
  llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
13
+ csv_agent = create_csv_agent(llm,"test_data.csv", verbose=True)
14
+ question = "what is the relation between air temperature and target"
15
  if question:
16
  response = csv_agent.run(question)
17
  print(response)
test3.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from data_code_run import DataCodeRun
2
+
3
+ d = DataCodeRun()
4
+ d.run_code()