Spaces:

sairaj2
/

openenv-datacleaner

Sleeping

App Files Files Community

openenv-datacleaner / task.py

sairaj2

Upload folder using huggingface_hub

188937b verified 7 days ago

raw

history blame contribute delete

2.51 kB

	import pandas as pd
	import numpy as np
	from typing import Dict, Any


	def generate_task(dataset_size: int = 1000, dirt_level: float = 0.3) -> pd.DataFrame:
	    """Generate a deliberately dirty dataset for the AutoClean AI task.

	    The frame has the columns ``id``, ``age``, ``income``, ``gender``,
	    ``join_date``, ``score``, ``comments`` and ``category``. Dirt is then
	    layered on in this order:

	    1. Missing values: every cell of every column (including ``id`` and
	       ``join_date``) is blanked with probability ``dirt_level * 0.4``.
	    2. Duplicates: a ``dirt_level * 0.25`` fraction of the rows is sampled
	       and appended.
	    3. Outliers: for each of ``age``, ``income`` and ``score``,
	       ``int(dataset_size * dirt_level * 0.1)`` rows are overwritten with
	       ten times the column mean.
	    4. Type noise: roughly 10% of ``age`` values become strings and roughly
	       15% of ``income`` values get a ``$`` prefix. This step does not
	       depend on ``dirt_level``.

	    Finally the rows are shuffled and the index is reset.

	    Note:
	        The function calls ``np.random.seed(42)``, so it resets NumPy's
	        *global* random state on every call. Identical arguments therefore
	        yield identical frames, and any later use of ``np.random`` in the
	        same process continues from that reseeded state.

	    Args:
	        dataset_size: Number of rows generated before duplicates are added.
	        dirt_level: Dirt intensity. ``dirt_level * 0.4`` is used as a
	            probability, so it must lie in ``[0, 1]``.

	    Returns:
	        The shuffled dirty DataFrame, with a fresh ``0..n-1`` index.

	    Raises:
	        ValueError: If ``dataset_size`` is negative or ``dirt_level`` gives
	            a missing-value probability outside ``[0, 1]``.
	    """
	    # Fail early with a readable message instead of a cryptic error raised
	    # from inside NumPy once generation has already started.
	    if dataset_size < 0:
	        raise ValueError(f"dataset_size must be non-negative, got {dataset_size}")
	    missing_prob = dirt_level * 0.4
	    if not 0.0 <= missing_prob <= 1.0:
	        raise ValueError(
	            "dirt_level * 0.4 is used as a probability and must be within "
	            f"[0, 1], got dirt_level={dirt_level}"
	        )

	    np.random.seed(42)

	    # The order of the random draws below determines the generated values;
	    # keep it stable so the dataset stays reproducible.
	    data = {
	        'id': np.arange(dataset_size),
	        'age': np.random.normal(35, 12, dataset_size).astype(int),
	        'income': np.random.lognormal(10, 1, dataset_size).astype(int),
	        'gender': np.random.choice(['Male', 'Female', 'male', 'female', 'M', 'F', None], dataset_size,
	                                   p=[0.3, 0.3, 0.1, 0.1, 0.05, 0.05, 0.1]),
	        'join_date': pd.date_range('2020-01-01', periods=dataset_size).tolist(),
	        'score': np.random.normal(50, 15, dataset_size),
	        # NOTE(review): ' ' is listed twice, which doubles its weight in the
	        # uniform draw; possibly two different blank strings were intended.
	        'comments': np.random.choice(['Good', 'Excellent', 'Bad', 'Average', ' ', None, ' '], dataset_size),
	        'category': np.random.choice(['A', 'B', 'C', 'D', None], dataset_size, p=[0.25, 0.25, 0.25, 0.2, 0.05])
	    }

	    df = pd.DataFrame(data)

	    # Add missing values: an independent coin flip per cell.
	    mask = np.random.choice([True, False], size=df.shape, p=[missing_prob, 1 - missing_prob])
	    df = df.mask(mask)

	    # Add duplicates by appending a sampled subset of the existing rows.
	    duplicates = df.sample(frac=dirt_level * 0.25, random_state=42)
	    df = pd.concat([df, duplicates], ignore_index=True)

	    # Add outliers: the count is based on dataset_size, not on the row
	    # count after duplication.
	    numeric_cols = ['age', 'income', 'score']
	    for col in numeric_cols:
	        outliers_idx = np.random.choice(df.index, size=int(dataset_size * dirt_level * 0.1), replace=False)
	        df.loc[outliers_idx, col] = df[col].mean() * 10

	    # Mess up data types so the columns end up holding mixed Python types.
	    df['age'] = df['age'].apply(lambda x: str(x) if np.random.random() < 0.1 else x)
	    df['income'] = df['income'].apply(lambda x: f"${x}" if np.random.random() < 0.15 else x)

	    # Shuffle so the appended duplicates are not grouped at the end.
	    return df.sample(frac=1, random_state=42).reset_index(drop=True)


	def get_task_description() -> Dict[str, Any]:
	    """Return the static description of the AutoClean AI cleaning task.

	    Returns:
	        A newly built dict on every call with the keys ``name``, ``goal``,
	        ``success_threshold``, ``max_steps`` and ``allowed_actions`` (a list
	        of action-name strings).
	    """
	    return {
	        "name": "AutoClean AI Data Cleaning Challenge",
	        "goal": "Maximize the dataset cleanliness score by applying optimal cleaning operations",
	        "success_threshold": 0.95,
	        "max_steps": 50,
	        "allowed_actions": [
	            "fill_missing",
	            "remove_duplicates",
	            "normalize",
	            "fix_types",
	            "remove_outliers",
	            "drop_column",
	            "encode_categorical",
	            "handle_text",
	        ],
	    }