FelixPhilip
/

DeepFundingOracle

Model card Files Files and versions

DeepFundingOracle / Oracle /DataSmolAgent.py

FelixPhilip's picture

Oracle and smolagent

46b5521 5 months ago

history blame contribute delete

4.03 kB

	import pandas as pd
	import numpy as np
	from smolagents import HfApiModel,tool,CodeAgent
	from transformers import AutoTokenizer, AutoModelForCausalLM

	@tool
	def clean_data(df: pd.DataFrame) -> pd.DataFrame:
	    """
	    Strip whitespace from column names and drop rows that are completely empty.

	    The input DataFrame is not modified; a new DataFrame is returned. Column
	    labels that are not strings (for example integer labels) are kept unchanged.

	    Args:
	        df (pd.DataFrame): The input DataFrame containing the raw data.

	    Returns:
	        pd.DataFrame: A cleaned DataFrame with stripped column names and without completely empty rows.
	    """
	    # rename() builds a new frame, so the caller's column index stays as it was.
	    # Only string labels are stripped: the Index.str accessor raises on a
	    # non-string index and yields NaN for non-string labels in a mixed one.
	    renamed = df.rename(
	        columns=lambda label: label.strip() if isinstance(label, str) else label
	    )
	    return renamed.dropna(how="all")

	@tool
	def extract_features(df: pd.DataFrame) -> pd.DataFrame:
	    """
	    Dynamically extract features from the DataFrame.

	    All work is done on a copy, so the DataFrame passed in is not modified.

	    For numeric columns:
	    - If all values are non-negative, a log-transformed version (log_<column>) is created.

	    For columns whose name contains "date" or "time":
	    - The column is parsed as datetimes and <column>_year, <column>_month and <column>_day are added.
	    - A column whose values cannot be parsed at all is left unchanged.

	    For object columns that cannot be parsed as dates:
	    - They are encoded as categorical numeric codes (<column>_cat).

	    Args:
	        df (pd.DataFrame): The input DataFrame containing the raw data.

	    Returns:
	        pd.DataFrame: A new DataFrame containing the original data plus the dynamically engineered features.
	    """
	    features = df.copy()

	    # Snapshot the labels before adding anything, so derived columns such as
	    # "log_time_spent" are never mistaken for date columns further down.
	    source_cols = list(features.columns)

	    # Numeric columns: log transformation.
	    # NaN compares as False, so a column with missing values gets no log feature.
	    for col in features.select_dtypes(include=[np.number]).columns.to_list():
	        if (features[col] >= 0).all():
	            features[f"log_{col}"] = np.log1p(features[col])

	    # Date-like columns extraction (matched by name).
	    for col in source_cols:
	        # str() keeps non-string labels (e.g. integers) from raising here.
	        lowered = str(col).lower()
	        if "date" not in lowered and "time" not in lowered:
	            continue
	        try:
	            parsed = pd.to_datetime(features[col], errors="coerce")
	        except Exception:
	            # Best effort: a column pandas cannot convert is left untouched.
	            continue
	        if not pd.api.types.is_datetime64_any_dtype(parsed):
	            # No datetime dtype means no usable .dt accessor; keep the original values.
	            continue
	        if features[col].notna().any() and not parsed.notna().any():
	            # Nothing parsed: the name matched (e.g. "candidate" contains "date")
	            # but the values are not dates. Overwriting would wipe the column.
	            continue
	        # Assign only after every check passed, so a failure never leaves the
	        # column half-converted.
	        features[col] = parsed
	        features[f"{col}_year"] = parsed.dt.year
	        features[f"{col}_month"] = parsed.dt.month
	        features[f"{col}_day"] = parsed.dt.day

	    # Non-numeric processing: encode as categorical numeric codes.
	    for col in features.select_dtypes(include=["object"]).columns.to_list():
	        try:
	            pd.to_datetime(features[col], errors="raise")
	        except Exception:
	            # Failing to parse is the signal that the column is not date-like,
	            # so it is treated as categorical.
	            features[f"{col}_cat"] = features[col].astype("category").cat.codes

	    return features

	@tool
	def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
	    """
	    Save the DataFrame to a CSV file and return the file path.

	    Missing parent directories of the target path are created first, so a nested
	    path such as "results/output.csv" can be used directly. The index is not written.

	    Args:
	        df (pd.DataFrame): The DataFrame to save.
	        filename (str): The name or path of the output CSV file.

	    Returns:
	        str: The file path of the saved CSV, exactly as it was passed in.
	    """
	    from pathlib import Path

	    # DataFrame.to_csv does not create directories; without this, a path whose
	    # parent directory does not exist fails with OSError.
	    Path(filename).parent.mkdir(parents=True, exist_ok=True)
	    df.to_csv(filename, index=False)
	    return filename

	class DataSmolAgent(CodeAgent):
	    """
	    A data processing agent that cleans and extracts features from the provided DataFrame.

	    run() calls the registered tools directly, in a fixed order (clean_data,
	    extract_features, optionally save_to_csv); it does not call the base class run().
	    """

	    # Checkpoint loaded for both the tokenizer and the model unless overridden.
	    MODEL_ID = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

	    def __init__(self, df: pd.DataFrame, model_id: str = MODEL_ID):
	        """
	        Store the DataFrame, load the tokenizer and model, and register the tools.

	        Args:
	            df (pd.DataFrame): The DataFrame to process.
	            model_id (str): Identifier of the checkpoint to load with transformers.
	        """
	        self.df = df
	        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
	        self.model = AutoModelForCausalLM.from_pretrained(model_id)
	        # NOTE(review): a raw transformers model is passed as the agent's model.
	        # Presumably CodeAgent expects one of the smolagents model wrappers; this
	        # only matters if the base class run() is ever used. TODO confirm.
	        super().__init__(
	            tools=[
	                clean_data,
	                extract_features,
	                save_to_csv,
	            ],
	            model=self.model,
	            additional_authorized_imports=["pandas", "numpy"],
	        )

	    @staticmethod
	    def _unwrap(output):
	        """Return the payload of a tool call, unwrapping a `.result` attribute if present."""
	        # A DataFrame with a column named "result" exposes it as an attribute;
	        # checking the type first keeps the whole frame from being replaced by
	        # that single column.
	        if isinstance(output, pd.DataFrame):
	            return output
	        return getattr(output, "result", output)

	    def run(
	        self,
	        prompt: str,
	        output_csv: bool = False,
	        output_path: str = "processed_output.csv",
	    ) -> pd.DataFrame:
	        """
	        Clean the stored DataFrame, extract features, and optionally save the result.

	        Args:
	            prompt (str): Not used by this method; kept so existing callers keep working.
	            output_csv (bool): When True, the processed DataFrame is also written to a CSV file.
	            output_path (str): Path of the CSV file written when output_csv is True.

	        Returns:
	            pd.DataFrame: The processed DataFrame, which is also stored in self.df.
	        """
	        self.df = self._unwrap(self.tools["clean_data"](df=self.df))
	        self.df = self._unwrap(self.tools["extract_features"](df=self.df))

	        if output_csv:
	            csv_output = self.tools["save_to_csv"](df=self.df, filename=output_path)
	            print(f"CSV saved at: {csv_output}")

	        return self.df