Spaces:

ayushsahu45
/

Multi-AI-Analytics-Platform

Running

App Files Files Community

Multi-AI-Analytics-Platform / data /data_loader.py

ayushsahu45

Upload 2 files

31d3380 verified 10 days ago

raw

history blame contribute delete

12.3 kB

	# import pandas as pd
	# import numpy as np
	# from PIL import Image
	# from pathlib import Path
	# from typing import List, Dict, Any, Union, Tuple, Optional
	# import os
	# import json


	# class DataLoader:
	# def __init__(self):
	# self.supported_image_formats = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp']
	# self.supported_text_formats = ['.txt', '.csv', '.json', '.xlsx', '.xls']

	# def load_csv(self, file_path: Union[str, Path]) -> pd.DataFrame:
	# return pd.read_csv(file_path)

	# def load_excel(self, file_path: Union[str, Path], sheet_name: Union[str, int] = 0) -> pd.DataFrame:
	# return pd.read_excel(file_path, sheet_name=sheet_name)

	# def load_json(self, file_path: Union[str, Path]) -> pd.DataFrame:
	# return pd.read_json(file_path)

	# def load_image(self, file_path: Union[str, Path]) -> Image.Image:
	# return Image.open(file_path).convert('RGB')

	# def load_images_from_folder(self, folder_path: Union[str, Path]) -> List[Tuple[str, Image.Image]]:
	# folder = Path(folder_path)
	# images = []
	# for ext in self.supported_image_formats:
	# for file_path in folder.glob(f"*{ext}"):
	# try:
	# img = Image.open(file_path).convert('RGB')
	# images.append((str(file_path), img))
	# except Exception as e:
	# print(f"Error loading {file_path}: {e}")
	# return images

	# def load_text_file(self, file_path: Union[str, Path]) -> str:
	# with open(file_path, 'r', encoding='utf-8') as f:
	# return f.read()

	# def detect_file_type(self, file_path: Union[str, Path]) -> str:
	# path = Path(file_path)
	# suffix = path.suffix.lower()
	# if suffix in self.supported_image_formats:
	# return "image"
	# elif suffix == '.csv':
	# return "csv"
	# elif suffix in ['.xlsx', '.xls']:
	# return "excel"
	# elif suffix == '.json':
	# return "json"
	# elif suffix == '.txt':
	# return "text"
	# else:
	# return "unknown"

	# def auto_load(self, file_path: Union[str, Path]) -> Tuple[Any, str]:
	# file_type = self.detect_file_type(file_path)
	# if file_type == "csv":
	# return self.load_csv(file_path), "dataframe"
	# elif file_type == "excel":
	# return self.load_excel(file_path), "dataframe"
	# elif file_type == "json":
	# return self.load_json(file_path), "dataframe"
	# elif file_type == "image":
	# return self.load_image(file_path), "image"
	# elif file_type == "text":
	# return self.load_text_file(file_path), "text"
	# else:
	# raise ValueError(f"Unsupported file type: {file_type}")

	# def get_data_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
	# summary = {
	# "row_count": int(len(df)),
	# "columns": df.columns.tolist(),
	# "features": int(len(df.columns)),
	# "dtypes": df.dtypes.astype(str).to_dict(),
	# "missing_values": df.isnull().sum().to_dict(),
	# "missing_percent": (df.isnull().sum() / len(df) * 100).round(2).to_dict(),
	# "numeric_columns": df.select_dtypes(include=[np.number]).columns.tolist(),
	# "categorical_columns": df.select_dtypes(include=['object', 'category']).columns.tolist(),
	# "duplicate_rows": int(df.duplicated().sum()),
	# }

	# numeric_df = df.select_dtypes(include=[np.number])
	# if not numeric_df.empty:
	# summary["numeric_summary"] = {
	# "mean": numeric_df.mean().round(4).to_dict(),
	# "std": numeric_df.std().round(4).to_dict(),
	# "min": numeric_df.min().to_dict(),
	# "max": numeric_df.max().to_dict(),
	# "median": numeric_df.median().to_dict(),
	# }

	# return summary

	# def preprocess_dataframe(
	# self,
	# df: pd.DataFrame,
	# drop_non_numeric: bool = True,
	# fill_strategy: str = "median"
	# ) -> pd.DataFrame:
	# df = df.copy()

	# df = df.dropna(axis=1, how='all')

	# for col in df.columns:
	# if df[col].dtype == 'object':
	# try:
	# df[col] = pd.to_numeric(df[col])
	# except (ValueError, TypeError):
	# if drop_non_numeric:
	# df = df.drop(columns=[col])
	# else:
	# df = pd.get_dummies(df, columns=[col], drop_first=True)

	# for col in df.columns:
	# if df[col].isnull().any():
	# if df[col].dtype in ['int64', 'float64', 'int32', 'float32']:
	# if fill_strategy == "median":
	# df[col] = df[col].fillna(df[col].median())
	# elif fill_strategy == "mean":
	# df[col] = df[col].fillna(df[col].mean())
	# else:
	# df[col] = df[col].fillna(0)
	# elif df[col].dtype == 'bool':
	# df[col] = df[col].fillna(False)
	# else:
	# mode_val = df[col].mode()
	# df[col] = df[col].fillna(mode_val.iloc[0] if not mode_val.empty else "unknown")

	# return df

	# def split_features_target(
	# self, df: pd.DataFrame, target_column: str
	# ) -> Tuple[pd.DataFrame, pd.Series]:
	# if target_column not in df.columns:
	# raise ValueError(f"Target column '{target_column}' not found in dataframe")
	# X = df.drop(columns=[target_column])
	# y = df[target_column]
	# return X, y

	# def get_class_distribution(self, series: pd.Series) -> Dict[str, int]:
	# return series.value_counts().to_dict()

	# def detect_task_type(self, series: pd.Series) -> str:
	# """Auto-detect whether classification or regression is appropriate."""
	# if series.dtype == 'object' or series.nunique() <= 20:
	# return "classification"
	# return "regression"



	import pandas as pd
	import numpy as np
	from PIL import Image
	from pathlib import Path
	from typing import List, Dict, Any, Union, Tuple, Optional
	import os
	import json


	class DataLoader:
	    """Load CSV/Excel/JSON/image/text files and prepare DataFrames for modelling.

	    Attributes:
	        supported_image_formats: File suffixes that are treated as images.
	        supported_text_formats: File suffixes of the text/tabular formats.
	    """

	    # Suffix -> file-type label for every non-image format detect_file_type knows.
	    _NON_IMAGE_TYPES = {
	        '.csv': "csv",
	        '.xlsx': "excel",
	        '.xls': "excel",
	        '.json': "json",
	        '.txt': "text",
	    }

	    def __init__(self):
	        self.supported_image_formats = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp']
	        self.supported_text_formats = ['.txt', '.csv', '.json', '.xlsx', '.xls']

	    def load_csv(self, file_path: Union[str, Path]) -> pd.DataFrame:
	        """Read a CSV file into a DataFrame with ``pd.read_csv`` defaults."""
	        return pd.read_csv(file_path)

	    def load_excel(self, file_path: Union[str, Path], sheet_name: Union[str, int] = 0) -> pd.DataFrame:
	        """Read one sheet of an Excel workbook into a DataFrame.

	        Args:
	            file_path: Location of the workbook.
	            sheet_name: Sheet name or zero-based sheet index (default: first sheet).
	        """
	        return pd.read_excel(file_path, sheet_name=sheet_name)

	    def load_json(self, file_path: Union[str, Path]) -> pd.DataFrame:
	        """Read a JSON file into a DataFrame with ``pd.read_json`` defaults."""
	        return pd.read_json(file_path)

	    def load_image(self, file_path: Union[str, Path]) -> "Image.Image":
	        """Open an image file and return a copy converted to RGB mode.

	        The source image is used as a context manager so the underlying
	        file is closed once the converted copy has been produced.
	        """
	        with Image.open(file_path) as source:
	            return source.convert('RGB')

	    def load_images_from_folder(self, folder_path: Union[str, Path]) -> List[Tuple[str, "Image.Image"]]:
	        """Load every supported image found directly inside a folder.

	        File names are matched against ``supported_image_formats`` without
	        regard to case, so ``PHOTO.JPG`` is picked up just like ``photo.jpg``
	        (the same rule ``detect_file_type`` applies). Sub-folders are not
	        searched.

	        Args:
	            folder_path: Directory to scan.

	        Returns:
	            ``(path, RGB image)`` tuples in sorted path order. Files that fail
	            to load are reported with ``print`` and skipped.
	        """
	        folder = Path(folder_path)
	        extensions = tuple(ext.lower() for ext in self.supported_image_formats)
	        images = []
	        for candidate in sorted(folder.glob("*")):
	            if not candidate.name.lower().endswith(extensions):
	                continue
	            try:
	                images.append((str(candidate), self.load_image(candidate)))
	            except Exception as e:
	                # Best-effort bulk load: one unreadable file must not abort the rest.
	                print(f"Error loading {candidate}: {e}")
	        return images

	    def load_text_file(self, file_path: Union[str, Path]) -> str:
	        """Return the full contents of a UTF-8 encoded text file."""
	        with open(file_path, 'r', encoding='utf-8') as f:
	            return f.read()

	    def detect_file_type(self, file_path: Union[str, Path]) -> str:
	        """Classify a path by its (case-insensitive) file suffix.

	        Returns:
	            One of ``"image"``, ``"csv"``, ``"excel"``, ``"json"``, ``"text"``
	            or ``"unknown"``.
	        """
	        suffix = Path(file_path).suffix.lower()
	        if suffix in self.supported_image_formats:
	            return "image"
	        return self._NON_IMAGE_TYPES.get(suffix, "unknown")

	    def auto_load(self, file_path: Union[str, Path]) -> Tuple[Any, str]:
	        """Load a file with the loader that matches its detected type.

	        Returns:
	            ``(data, kind)`` where ``kind`` is ``"dataframe"`` for CSV, Excel
	            and JSON files, ``"image"`` for images and ``"text"`` for text files.

	        Raises:
	            ValueError: If the file suffix is not one of the supported types.
	        """
	        loaders = {
	            "csv": (self.load_csv, "dataframe"),
	            "excel": (self.load_excel, "dataframe"),
	            "json": (self.load_json, "dataframe"),
	            "image": (self.load_image, "image"),
	            "text": (self.load_text_file, "text"),
	        }
	        file_type = self.detect_file_type(file_path)
	        if file_type not in loaders:
	            # Name the offending suffix; the detected type is always "unknown" here.
	            raise ValueError(
	                f"Unsupported file type: {Path(file_path).suffix!r} ({file_path})"
	            )
	        loader, kind = loaders[file_type]
	        return loader(file_path), kind

	    def get_data_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
	        """Build a dictionary of descriptive statistics for a DataFrame.

	        Returns:
	            A dict with the keys ``row_count``, ``columns``, ``features``,
	            ``dtypes``, ``missing_values``, ``missing_percent``,
	            ``numeric_columns``, ``categorical_columns`` and
	            ``duplicate_rows``. When the frame has numeric data, a
	            ``numeric_summary`` entry with per-column mean, std, min, max and
	            median is added.
	        """
	        row_total = len(df)
	        null_counts = df.isnull().sum()
	        if row_total:
	            missing_percent = (null_counts / row_total * 100).round(2)
	        else:
	            # A frame without rows has nothing missing; avoid 0/0 -> NaN.
	            missing_percent = null_counts.astype(float)

	        numeric_df = df.select_dtypes(include=[np.number])

	        summary = {
	            "row_count": int(row_total),
	            "columns": df.columns.tolist(),
	            "features": int(len(df.columns)),
	            "dtypes": df.dtypes.astype(str).to_dict(),
	            "missing_values": null_counts.to_dict(),
	            "missing_percent": missing_percent.to_dict(),
	            "numeric_columns": numeric_df.columns.tolist(),
	            "categorical_columns": df.select_dtypes(include=['object', 'category']).columns.tolist(),
	            "duplicate_rows": int(df.duplicated().sum()),
	        }

	        if not numeric_df.empty:
	            summary["numeric_summary"] = {
	                "mean": numeric_df.mean().round(4).to_dict(),
	                "std": numeric_df.std().round(4).to_dict(),
	                "min": numeric_df.min().to_dict(),
	                "max": numeric_df.max().to_dict(),
	                "median": numeric_df.median().to_dict(),
	            }

	        return summary

	    @staticmethod
	    def _fill_missing(column: pd.Series, fill_strategy: str) -> pd.Series:
	        """Return ``column`` with its missing entries replaced.

	        Boolean columns are filled with ``False``; numeric columns with the
	        median, the mean, or ``0`` for any other ``fill_strategy``; all other
	        columns with their most frequent value (``"unknown"`` if none exists).
	        """
	        dtype = column.dtype
	        # Checked before the numeric test because bool also counts as numeric.
	        if pd.api.types.is_bool_dtype(dtype):
	            return column.fillna(False)

	        if pd.api.types.is_numeric_dtype(dtype):
	            if fill_strategy == "median":
	                fill_value = column.median()
	            elif fill_strategy == "mean":
	                fill_value = column.mean()
	            else:
	                fill_value = 0
	            if pd.api.types.is_integer_dtype(dtype):
	                if float(fill_value).is_integer():
	                    fill_value = int(fill_value)
	                else:
	                    # Integer columns cannot hold a fractional fill value.
	                    column = column.astype("Float64")
	            return column.fillna(fill_value)

	        mode_val = column.mode()
	        return column.fillna(mode_val.iloc[0] if not mode_val.empty else "unknown")

	    def preprocess_dataframe(
	        self,
	        df: pd.DataFrame,
	        drop_non_numeric: bool = True,
	        fill_strategy: str = "median"
	    ) -> pd.DataFrame:
	        """Return a cleaned copy of ``df``; the input frame is not modified.

	        Steps, in order:
	          1. Drop columns in which every value is missing.
	          2. Convert object/string columns to numbers where possible; columns
	             that cannot be converted are dropped or one-hot encoded.
	          3. Fill the remaining missing values column by column.

	        Args:
	            df: Frame to clean.
	            drop_non_numeric: If true, drop text columns that cannot be
	                converted to numbers; otherwise one-hot encode them with
	                ``pd.get_dummies(..., drop_first=True)``.
	            fill_strategy: ``"median"`` or ``"mean"`` for numeric columns;
	                any other value fills numeric gaps with ``0``.
	        """
	        df = df.copy()

	        # Drop fully empty columns
	        df = df.dropna(axis=1, how='all')

	        # df.columns is evaluated once, so reassigning df inside the loop is safe.
	        for col in df.columns:
	            dtype = df[col].dtype
	            is_text = pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
	            if not is_text:
	                continue
	            try:
	                df[col] = pd.to_numeric(df[col])
	            except (ValueError, TypeError):
	                if drop_non_numeric:
	                    df = df.drop(columns=[col])
	                else:
	                    df = pd.get_dummies(df, columns=[col], drop_first=True)

	        # Fill missing values
	        for col in df.columns:
	            if df[col].isnull().any():
	                df[col] = self._fill_missing(df[col], fill_strategy)

	        return df

	    def split_features_target(
	        self, df: pd.DataFrame, target_column: str
	    ) -> Tuple[pd.DataFrame, pd.Series]:
	        """Split a frame into the feature columns and the target column.

	        Returns:
	            ``(X, y)`` where ``X`` is ``df`` without ``target_column`` and
	            ``y`` is that column.

	        Raises:
	            ValueError: If ``target_column`` is not a column of ``df``.
	        """
	        if target_column not in df.columns:
	            raise ValueError(f"Target column '{target_column}' not found in dataframe")
	        X = df.drop(columns=[target_column])
	        y = df[target_column]
	        return X, y

	    def get_class_distribution(self, series: pd.Series) -> Dict[Any, int]:
	        """Return a mapping of each distinct value in ``series`` to its count."""
	        return series.value_counts().to_dict()

	    def detect_task_type(self, series: pd.Series) -> str:
	        """Auto-detect whether classification or regression is appropriate.

	        Object, string, categorical and boolean targets are always treated as
	        classification; any other target is classification only when it has
	        at most 20 distinct values, otherwise regression.
	        """
	        dtype = series.dtype
	        label_like = (
	            pd.api.types.is_object_dtype(dtype)
	            or pd.api.types.is_string_dtype(dtype)
	            or pd.api.types.is_bool_dtype(dtype)
	            or isinstance(dtype, pd.CategoricalDtype)
	        )
	        if label_like or series.nunique() <= 20:
	            return "classification"
	        return "regression"