# data_clean_simple.py
import pandas as pd
import streamlit as st
import re
from functools import lru_cache

# Dictionary of common strategies for data cleaning
cleaning_strategies = {
    "date": "For date columns, consider forward-fill or backward-fill from existing dates, or converting to datetime format.",
    "numeric": "For numeric columns, consider filling missing values with the mean or median of the column.",
    "text": "For text columns, consider filling missing values with a placeholder like 'Unknown' or the most frequent value.",
    "categorical": "For categorical columns, consider filling missing values with the mode or a placeholder like 'Unknown'."
}

@lru_cache(maxsize=32)
def get_column_type(column_name):
    column_lower = column_name.lower()
    if any(keyword in column_lower for keyword in ['date', 'year', 'time', 'timestamp']):
        return 'date'
    elif any(keyword in column_lower for keyword in ['price', 'amount', 'cost', 'quantity', 'value']):
        return 'numeric'
    elif any(keyword in column_lower for keyword in ['name', 'description', 'text', 'comment']):
        return 'text'
    elif any(keyword in column_lower for keyword in ['category', 'type', 'label', 'class']):
        return 'categorical'
    else:
        if re.search(r'(num|amt|count|age|height|weight|total|\d+)', column_lower):
            return 'numeric'
        return 'text'
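
# Illustrative examples (assumed column names, not tied to any specific dataset):
#   get_column_type("order_date")    -> 'date'
#   get_column_type("unit_price")    -> 'numeric'
#   get_column_type("customer_name") -> 'text'
#   get_column_type("product_type")  -> 'categorical'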

def suggest_fill_strategies(column_name, examples):
    column_type = get_column_type(column_name)
    # Create a basic analysis of the data
    valid_examples = [ex for ex in examples if pd.notna(ex) and ex != '']
    # Build a simple suggestion based on column type and examples
    suggestion = cleaning_strategies.get(column_type, cleaning_strategies['text'])
    if valid_examples:
        suggestion += f"\n\nExample values: {', '.join(map(str, valid_examples[:3]))}."
        # For numeric data, add statistics (coerce non-numeric examples to NaN and drop them)
        if column_type == 'numeric':
            numeric_values = pd.to_numeric(pd.Series(valid_examples), errors='coerce').dropna()
            if not numeric_values.empty:
                suggestion += f"\n\nConsider replacing missing values with the mean ({numeric_values.mean():.2f}) or median ({numeric_values.median():.2f})."
    return suggestion

def clean_data(file_path, progress_callback=None):
    # Support CSV and TSV files
    # Load data and drop duplicates
    # Clean data with progress updates
    if file_path.endswith('.tsv'):
        df = pd.read_csv(file_path, sep='\t').drop_duplicates().copy()
    else:
        df = pd.read_csv(file_path).drop_duplicates().copy()
    suggestions_log = []
    total_columns = len(df.columns)
    # Convert column types
    for i, col in enumerate(df.columns):
        # Update progress if callback provided
        if progress_callback:
            progress = i / total_columns
            progress_callback(progress)
        if df[col].dtype == 'object':
            df[col] = df[col].str.strip().str.lower()  # Normalize text
            # Replace newline and carriage-return characters with spaces
            df[col] = df[col].str.replace('\n', ' ', regex=False).str.replace('\r', ' ', regex=False)
            if any(keyword in col.lower() for keyword in ['date', 'year', 'time', 'timestamp']):
                df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
                # Normalize 4-digit year ranges (e.g., 2000-2001, 2000--2001, 20002001)
                df[col] = df[col].str.replace(
                    r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
                )
            # Remove currency symbols and commas
            if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
                df[col] = df[col].str.replace(r'[$,]', '', regex=True)
            # Try to convert to numeric where possible
            if col.lower().find('id') == -1:  # Skip ID columns, which should remain as strings
                try:
                    df[col] = pd.to_numeric(df[col])
                except (ValueError, TypeError):
                    pass  # Leave columns that are not fully numeric unchanged
        # Check for missing or weird values
        null_count = df[col].isnull().sum()
        empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
        pattern_matches = df[col].astype(str).str.contains(r'none|null|n/a|na|\?+|missing|unknown',
                                                           na=False, case=False, regex=True).sum()
        # Generate suggestions for missing or weird values
        if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
            # Get non-null, non-empty examples for analysis - handle empty dataframes
            try:
                good_df = df[col][
                    df[col].notnull() &
                    (df[col].astype(str) != "") &
                    ~df[col].astype(str).str.contains(r'none|null|n/a|na|\?+|missing|unknown',
                                                      na=False, case=False, regex=True)
                ]
                if len(good_df) > 0:
                    unique_values = good_df.drop_duplicates()
                    sample_size = min(5, len(unique_values))
                    good_examples = unique_values.sample(n=sample_size, random_state=1).tolist()
                else:
                    good_examples = []
            except Exception:
                good_examples = []
            # Generate suggestions
            suggestion = suggest_fill_strategies(col, good_examples)
            suggestions_log.append((col, suggestion))
    # Final progress update
    if progress_callback:
        progress_callback(1.0)
    # Reset index for consistency
    df = df.reset_index(drop=True)
    return df, suggestions_log

def display_suggestions_report(suggestions_log):
    if suggestions_log:
        st.subheader("🤖 Data Cleaning Suggestions")
        for col, suggestion in suggestions_log:
            st.markdown(f"**Column:** `{col}`")
            if suggestion:
                st.code(suggestion, language="python")
            else:
                st.write("No suggestions or response error.")
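
# --- Usage sketch (illustrative, not part of the original app) ---
# A minimal way to wire the helpers above into a Streamlit page.
# "example.csv" is an assumed local file path; swap in a real CSV or TSV file.
if __name__ == "__main__":
    st.title("Data Cleaner")
    progress_bar = st.progress(0.0)
    cleaned_df, suggestions = clean_data("example.csv", progress_callback=progress_bar.progress)
    st.dataframe(cleaned_df)
    display_suggestions_report(suggestions)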