# SPARKNOVA — data_engine.py
# (uploaded by Tamannathakur via "Upload 7 files", commit 75bedb4 verified)
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import os
import tempfile
def clean_numeric(df):
    """Return a copy of ``df`` with string columns that look numeric converted.

    Two conversions are attempted on each object/string column:

    * Percent columns ("12%") become fractions (0.12) when more than half
      of the rows parse as numbers.
    * Currency / thousands-separated columns ("1,000", "₹2,000", "$3")
      become floats under the same >50% threshold.

    Columns that do not meet the threshold are left untouched.
    """
    df = df.copy()
    n_rows = len(df)
    if n_rows == 0:
        # Bug fix: a frame with columns but zero rows previously raised
        # ZeroDivisionError in the parse-ratio checks below.
        return df
    for col in df.columns:
        if not (pd.api.types.is_string_dtype(df[col]) or df[col].dtype == object):
            continue
        s = df[col].astype(str).str.strip()
        if s.str.contains("%", na=False).any():
            numeric_vals = pd.to_numeric(s.str.replace("%", "", regex=False), errors="coerce")
            if numeric_vals.notna().sum() / n_rows > 0.5:
                # Store percents as fractions so downstream math is uniform.
                df[col] = numeric_vals / 100.0
            # A column containing '%' is never treated as currency.
            continue
        cleaned = (s.str.replace(",", "", regex=False)
                    .str.replace("₹", "", regex=False)
                    .str.replace("$", "", regex=False))
        numeric_vals = pd.to_numeric(cleaned, errors="coerce")
        if numeric_vals.notna().sum() / n_rows > 0.5:
            df[col] = numeric_vals
    return df
def run_analysis(analysis_type, selected_columns, uploaded_df):
    """Run one of the canned analyses and return ``(report_text, table)``.

    Parameters
    ----------
    analysis_type : str or None
        One of "Summary", "Describe", "Top 5 Rows", "Bottom 5 Rows",
        "Missing Values", "Highest Correlation", "Group & Aggregate",
        "Calculate Expressions", or "None".
    selected_columns : list[str]
        Columns to analyze.  Ignored by the whole-dataset analyses.
    uploaded_df : pandas.DataFrame or None
        The currently loaded dataset.

    Returns
    -------
    tuple[str, pandas.DataFrame or None]
        A human-readable report and an optional DataFrame to display
        (only the head/tail analyses return one).
    """
    if uploaded_df is None:
        return "Please upload a dataset first.", None
    if analysis_type == "None" or analysis_type is None:
        return "", None

    # These analyses always operate on the full dataset, regardless of the
    # column selection in the UI.
    whole_dataset_analyses = ["Summary", "Top 5 Rows", "Bottom 5 Rows", "Missing Values"]
    if analysis_type in whole_dataset_analyses:
        df_to_analyze = uploaded_df
    else:
        if not selected_columns:
            return f"Please select columns for {analysis_type} analysis.", None
        df_to_analyze = uploaded_df[selected_columns]

    try:
        if analysis_type == "Summary":
            numeric_cols = uploaded_df.select_dtypes(include=[np.number]).columns
            categorical_cols = uploaded_df.select_dtypes(include=['object', 'category']).columns
            result = (f"Dataset Summary:\nRows: {len(uploaded_df):,}\n"
                      f"Columns: {len(uploaded_df.columns)}\n"
                      f"Numeric Columns: {len(numeric_cols)}\n"
                      f"Text Columns: {len(categorical_cols)}\n\n")
            if len(numeric_cols) > 0:
                result += "Numeric Columns: " + ", ".join(numeric_cols.tolist()) + "\n"
            if len(categorical_cols) > 0:
                result += "Text Columns: " + ", ".join(categorical_cols.tolist())
            return result, None

        elif analysis_type == "Describe":
            result = "Column Description:\n" + "=" * 30 + "\n\n"
            for col in selected_columns:
                if col not in df_to_analyze.columns:
                    continue
                result += f"Column: {col}\n"
                if pd.api.types.is_numeric_dtype(df_to_analyze[col]):
                    stats = df_to_analyze[col].describe()
                    result += (f" Type: Numeric\n Count: {stats['count']:.0f}\n Mean: {stats['mean']:.3f}\n"
                               f" Std: {stats['std']:.3f}\n Min: {stats['min']:.3f}\n 25%: {stats['25%']:.3f}\n"
                               f" 50%: {stats['50%']:.3f}\n 75%: {stats['75%']:.3f}\n Max: {stats['max']:.3f}\n\n")
                else:
                    unique_count = df_to_analyze[col].nunique()
                    null_count = df_to_analyze[col].isnull().sum()
                    # mode() can be empty when the column is all-NaN.
                    mode_series = df_to_analyze[col].mode()
                    most_common = mode_series.iloc[0] if len(mode_series) > 0 else "N/A"
                    result += (f" Type: Categorical/Text\n Unique Values: {unique_count}\n"
                               f" Missing Values: {null_count}\n Most Common: {most_common}\n")
                    top_values = df_to_analyze[col].value_counts().head(5)
                    result += " Top Values:\n"
                    for val, count in top_values.items():
                        result += f" {val}: {count} times\n"
                    result += "\n"
            return result, None

        elif analysis_type == "Top 5 Rows":
            return "Top 5 Rows - See data table below", df_to_analyze.head(5)

        elif analysis_type == "Bottom 5 Rows":
            return "Bottom 5 Rows - See data table below", df_to_analyze.tail(5)

        elif analysis_type == "Missing Values":
            result = "Missing Values Analysis:\n" + "=" * 30 + "\n\n"
            # Placeholder strings that should count as missing even though
            # pandas does not treat them as NaN.
            patterns = ['UNKNOWN', 'unknown', 'ERROR', 'error', 'NULL', 'null', 'NA', 'na', 'N/A',
                        'Not Given', 'not given', 'NOT GIVEN', '', ' ', '-', '?', 'NaN', 'nan',
                        'None', 'none', 'NONE', '#N/A', 'n/a', 'N.A.', 'n.a.']
            # Bug fix: the old per-pattern equality loop counted a cell once
            # per case variant ('UNKNOWN' and 'unknown' both matched the same
            # lowered value), inflating the totals.  A lowered set counts
            # each cell at most once.
            lowered_patterns = {p.lower() for p in patterns if p != ''}
            total_rows = len(uploaded_df)
            for col in uploaded_df.columns:
                nan_count = uploaded_df[col].isnull().sum()
                pseudo_missing_count = 0
                non_null_data = uploaded_df[col].dropna()
                if len(non_null_data) > 0:
                    col_str = non_null_data.astype(str).str.strip()
                    empty_count = (col_str == '').sum()
                    pattern_count = col_str.str.lower().isin(lowered_patterns).sum()
                    pseudo_missing_count = empty_count + pattern_count
                total_missing = nan_count + pseudo_missing_count
                # Bug fix: guard against a zero-row frame (division by zero).
                missing_percent = (total_missing / total_rows * 100) if total_rows else 0.0
                if total_missing > 0:
                    details = []
                    if nan_count > 0:
                        details.append(f"{nan_count} NaN")
                    if pseudo_missing_count > 0:
                        details.append(f"{pseudo_missing_count} text-missing")
                    detail_str = f" ({', '.join(details)})"
                else:
                    detail_str = ""
                result += f"{col}: {total_missing} missing ({missing_percent:.2f}%){detail_str}\n"
            return result, None

        elif analysis_type == "Highest Correlation":
            numeric_cols = df_to_analyze.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) < 2:
                return "Need at least 2 numeric columns for correlation analysis.", None
            corr_matrix = df_to_analyze[numeric_cols].corr()
            result = "Highest Correlations:\n" + "=" * 25 + "\n\n"
            correlations = []
            # Upper triangle only — each pair reported once.
            for i in range(len(corr_matrix.columns)):
                for j in range(i + 1, len(corr_matrix.columns)):
                    col1, col2 = corr_matrix.columns[i], corr_matrix.columns[j]
                    corr_val = corr_matrix.iloc[i, j]
                    # Constant columns yield NaN correlations; skip them so
                    # the sort stays well-defined.
                    if pd.notna(corr_val):
                        correlations.append((abs(corr_val), col1, col2, corr_val))
            correlations.sort(reverse=True)
            for _, col1, col2, corr_val in correlations[:10]:
                # Bug fix: names were concatenated with no separator.
                result += f"{col1} vs {col2}: {corr_val:.3f}\n"
            return result, None

        elif analysis_type == "Group & Aggregate":
            # selected_columns is guaranteed non-empty here (checked above).
            categorical_cols = [col for col in selected_columns if not pd.api.types.is_numeric_dtype(df_to_analyze[col])]
            numeric_cols = [col for col in selected_columns if pd.api.types.is_numeric_dtype(df_to_analyze[col])]
            if categorical_cols and numeric_cols:
                group_col = categorical_cols[0]
                agg_col = numeric_cols[0]
                grouped = df_to_analyze.groupby(group_col)[agg_col].agg(['count', 'mean', 'sum']).round(2)
                result = "Group & Aggregate Analysis:\n" + "=" * 35 + "\n\n"
                result += f"Grouped by: {group_col}\nAggregated: {agg_col}\n\n"
                result += grouped.to_string()
            elif categorical_cols:
                group_col = categorical_cols[0]
                grouped = df_to_analyze[group_col].value_counts()
                result = "Group Count Analysis:\n" + "=" * 25 + "\n\n"
                result += grouped.to_string()
            else:
                result = "Please select at least one categorical column for grouping."
            return result, None

        elif analysis_type == "Calculate Expressions":
            numeric_cols = df_to_analyze.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) >= 2:
                col1, col2 = numeric_cols[0], numeric_cols[1]
                df_calc = df_to_analyze.copy()
                df_calc['Sum'] = df_calc[col1] + df_calc[col2]
                df_calc['Difference'] = df_calc[col1] - df_calc[col2]
                result = "Calculated Expressions:\n" + "=" * 30 + "\n\n"
                result += f"Using columns: {col1} and {col2}\n\n"
                result += f"New calculated columns:\nSum = {col1} + {col2}\nDifference = {col1} - {col2}\n\n"
                result += "Sample results:\n"
                result += df_calc[['Sum', 'Difference']].head().to_string()
            else:
                result = "Need at least 2 numeric columns for calculations."
            return result, None

        else:
            return f"Analysis type '{analysis_type}' is under development.", None
    except Exception as e:
        return f"Error in analysis: {str(e)}", None
def create_chart_explanation(viz_type, df_to_plot, selected_columns, fig_data=None):
    """Build a short textual explanation for a generated chart.

    Handles "Bar Chart" (two-column form) and "Line Chart" (when the melted
    counts frame ``fig_data`` is supplied); every other case falls through
    to a generic one-liner.  Explanation building is best-effort: any
    data error is swallowed so it can never break chart rendering.
    """
    try:
        if viz_type == "Bar Chart" and len(selected_columns) >= 2:
            x_col, y_col = selected_columns[0], selected_columns[1]
            if pd.api.types.is_numeric_dtype(df_to_plot[y_col]):
                max_val_idx = df_to_plot[y_col].idxmax()
                max_category = df_to_plot.loc[max_val_idx, x_col]
                max_value = df_to_plot[y_col].max()
                y_mean = df_to_plot[y_col].mean()
            else:
                # Non-numeric Y: summarize per-category counts instead.
                grouped = df_to_plot.groupby(x_col)[y_col].count()
                max_category = grouped.idxmax()
                max_value = grouped.max()
                y_mean = grouped.mean()
            return (f"BAR CHART: {y_col} by {x_col}\n"
                    f"Highest: {max_category} ({max_value:.2f})\n"
                    f"Average: {y_mean:.2f}\n"
                    f"Categories: {df_to_plot[x_col].nunique()}")
        elif viz_type == "Line Chart" and fig_data is not None:
            max_combo = fig_data.loc[fig_data['Count'].idxmax()]
            min_combo = fig_data.loc[fig_data['Count'].idxmin()]
            return (f"LINE CHART: Distribution\n"
                    f"Highest: {max_combo[selected_columns[1]]} in {max_combo[selected_columns[0]]} ({max_combo['Count']})\n"
                    f"Lowest: {min_combo[selected_columns[1]]} in {min_combo[selected_columns[0]]} ({min_combo['Count']})\n"
                    f"Total: {len(df_to_plot)}")
    except Exception:
        # Bug fix: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt.  Data errors still fall back to the generic text.
        pass
    return f"{viz_type} visualization\nShows data patterns and relationships"
def create_visualization(viz_type, selected_columns, uploaded_df):
    """Create a plotly figure for the requested visualization type.

    Parameters
    ----------
    viz_type : str
        One of "Bar Chart", "Pie Chart", "Scatter Plot", "Line Chart",
        "Histogram", "Heat Map", "Box Plot", or "None".
    selected_columns : list[str]
        Columns to plot; branch behavior depends on how many are given
        and their dtypes.
    uploaded_df : pandas.DataFrame or None
        The currently loaded dataset.

    Returns
    -------
    tuple
        (figure, explanation_text, figure) on success — the figure appears
        twice because the caller binds it to two outputs (presumably a
        display component and a state slot; confirm against the UI wiring).
        (None, message, None) on invalid input or error.
    """
    if uploaded_df is None or viz_type == "None":
        return None, "", None
    if not selected_columns:
        return None, "Please select columns for visualization.", None
    df_to_plot = uploaded_df[selected_columns]
    try:
        if viz_type == "Bar Chart":
            if len(selected_columns) >= 2:
                x_col, y_col = selected_columns[0], selected_columns[1]
                # Optional third column drives the color grouping.
                color_col = selected_columns[2] if len(selected_columns) > 2 else None
                # Handle different data type combinations
                if pd.api.types.is_numeric_dtype(df_to_plot[y_col]):
                    # Numeric Y-axis: use as-is (only the first 100 rows are
                    # plotted, so very large datasets are truncated here).
                    plot_data = df_to_plot.head(100)
                    fig = px.bar(plot_data, x=x_col, y=y_col, color=color_col, title=f"{y_col} by {x_col}")
                else:
                    # Non-numeric Y-axis: count occurrences
                    if pd.api.types.is_numeric_dtype(df_to_plot[x_col]):
                        # If X is numeric, group and count Y values
                        grouped = df_to_plot.groupby(x_col)[y_col].count().reset_index()
                        grouped.columns = [x_col, f'Count of {y_col}']
                        fig = px.bar(grouped, x=x_col, y=f'Count of {y_col}', title=f"Count of {y_col} by {x_col}")
                    else:
                        # Both categorical: cross-tabulation
                        crosstab = pd.crosstab(df_to_plot[x_col], df_to_plot[y_col])
                        crosstab_reset = crosstab.reset_index().melt(id_vars=[x_col], var_name=y_col, value_name='Count')
                        fig = px.bar(crosstab_reset, x=x_col, y='Count', color=y_col, title=f"{y_col} distribution by {x_col}")
                explanation = create_chart_explanation(viz_type, df_to_plot, selected_columns)
            else:
                # Single column: histogram for numeric data, otherwise the
                # 15 most frequent values as a bar chart.
                col = selected_columns[0]
                if pd.api.types.is_numeric_dtype(df_to_plot[col]):
                    fig = px.histogram(df_to_plot, x=col, title=f"Distribution of {col}")
                else:
                    value_counts = df_to_plot[col].value_counts().head(15)
                    fig = px.bar(x=value_counts.index, y=value_counts.values, title=f"Top Values in {col}")
                explanation = f"Chart showing distribution of {col}"
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig
        elif viz_type == "Pie Chart":
            col = selected_columns[0]
            if len(selected_columns) >= 2 and pd.api.types.is_numeric_dtype(df_to_plot[selected_columns[1]]):
                # Two columns with numeric second: pie of summed values per category.
                grouped_data = df_to_plot.groupby(col)[selected_columns[1]].sum().reset_index()
                fig = px.pie(grouped_data, values=selected_columns[1], names=col, title=f"Total {selected_columns[1]} by {col}")
                legend_title = f"{col} Categories"
            else:
                # Single column: pie of the 10 most frequent values.
                value_counts = df_to_plot[col].value_counts().head(10)
                fig = px.pie(values=value_counts.values, names=value_counts.index, title=f"Distribution of {col}")
                legend_title = f"{col} Values"
            # Legend is placed outside the pie, vertically centered on the right.
            fig.update_layout(
                width=800,
                height=500,
                showlegend=True,
                legend=dict(
                    title=dict(text=legend_title, font=dict(size=14, color="black")),
                    orientation="v",
                    yanchor="middle",
                    y=0.5,
                    xanchor="left",
                    x=1.05,
                    font=dict(size=12)
                )
            )
            explanation = f"PIE CHART: {col} Distribution\nShows proportion of each category\nUse to understand category distribution patterns"
            return fig, explanation, fig
        elif viz_type == "Scatter Plot":
            if len(selected_columns) >= 2:
                x_col, y_col = selected_columns[0], selected_columns[1]
                color_col = selected_columns[2] if len(selected_columns) > 2 else None
                # Check if both columns are suitable for scatter plot
                if not (pd.api.types.is_numeric_dtype(df_to_plot[x_col]) and pd.api.types.is_numeric_dtype(df_to_plot[y_col])):
                    return None, f"Scatter plot requires numeric data. {x_col} and {y_col} must be numeric.", None
                fig = px.scatter(df_to_plot, x=x_col, y=y_col, color=color_col, title=f"{y_col} vs {x_col}")
                explanation = f"Scatter plot showing relationship between {x_col} and {y_col}"
            else:
                return None, "Scatter plot requires at least 2 columns.", None
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig
        elif viz_type == "Line Chart":
            if len(selected_columns) >= 2:
                x_col, y_col = selected_columns[0], selected_columns[1]
                if pd.api.types.is_numeric_dtype(df_to_plot[y_col]):
                    # Numeric Y: sort by X and plot trend
                    sorted_data = df_to_plot.sort_values(x_col)
                    fig = px.line(sorted_data, x=x_col, y=y_col, title=f"Trend of {y_col} over {x_col}", markers=True)
                    explanation = f"Line chart showing trend of {y_col} over {x_col}"
                else:
                    # Non-numeric Y: create cross-tabulation
                    crosstab = pd.crosstab(df_to_plot[x_col], df_to_plot[y_col])
                    melted = pd.melt(crosstab.reset_index(), id_vars=[x_col], var_name=y_col, value_name='Count')
                    fig = px.line(melted, x=x_col, y='Count', color=y_col, title=f"Distribution of {y_col} across {x_col}", markers=True)
                    # The melted counts frame feeds the detailed explanation text.
                    explanation = create_chart_explanation(viz_type, df_to_plot, selected_columns, melted)
            else:
                return None, "Line chart requires at least 2 columns.", None
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig
        elif viz_type == "Histogram":
            col = selected_columns[0]
            if pd.api.types.is_numeric_dtype(df_to_plot[col]):
                fig = px.histogram(df_to_plot, x=col, title=f"Distribution of {col}", nbins=30)
                explanation = f"Histogram showing distribution of {col}"
            else:
                return None, f"Histogram requires numeric data. Try Bar Chart instead.", None
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig
        elif viz_type == "Heat Map":
            if len(selected_columns) >= 2:
                numeric_cols = [col for col in selected_columns if pd.api.types.is_numeric_dtype(df_to_plot[col])]
                if len(numeric_cols) >= 2:
                    # Two or more numeric columns: correlation heatmap.
                    corr_matrix = df_to_plot[numeric_cols].corr()
                    fig = px.imshow(corr_matrix, text_auto=True, aspect="auto", title="Correlation Heatmap", color_continuous_scale='RdBu')
                    explanation = f"Heatmap showing correlations between numeric columns"
                else:
                    # Otherwise fall back to a categorical cross-tabulation heatmap.
                    x_col, y_col = selected_columns[0], selected_columns[1]
                    crosstab = pd.crosstab(df_to_plot[x_col], df_to_plot[y_col])
                    fig = px.imshow(crosstab.values, x=crosstab.columns, y=crosstab.index, text_auto=True, aspect="auto", title=f"Cross-tabulation: {y_col} vs {x_col}")
                    explanation = f"Heatmap showing cross-tabulation between {x_col} and {y_col}"
            else:
                return None, "Heat map requires at least 2 columns.", None
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig
        elif viz_type == "Box Plot":
            if len(selected_columns) >= 1:
                y_col = selected_columns[0]
                if not pd.api.types.is_numeric_dtype(df_to_plot[y_col]):
                    return None, f"Box plot requires numeric Y-axis. {y_col} is not numeric.", None
                # Optional second column groups the boxes.
                x_col = selected_columns[1] if len(selected_columns) > 1 else None
                fig = px.box(df_to_plot, x=x_col, y=y_col, title=f"Box Plot of {y_col}" + (f" by {x_col}" if x_col else ""))
                explanation = f"Box plot showing distribution of {y_col}" + (f" grouped by {x_col}" if x_col else "")
            else:
                return None, "Box plot requires at least 1 column.", None
            fig.update_layout(width=800, height=500)
            return fig, explanation, fig
        else:
            return None, f"Visualization type '{viz_type}' is under development.", None
    except Exception as e:
        return None, f"Error creating visualization: {str(e)}", None
def handle_missing_data(method, selected_columns, constant_value, uploaded_df, change_history):
    """Apply a missing-data strategy to the selected columns.

    Parameters
    ----------
    method : str
        One of "Forward Fill", "Backward Fill", "Constant Fill", "Mean Fill",
        "Median Fill", "Mode Fill", "Drop Columns", "Clean All Missing",
        or "None".
    selected_columns : list[str]
        Columns to process; unknown names are silently skipped.
    constant_value : str
        Fill value for "Constant Fill" (defaults to "Unknown" when blank).
    uploaded_df : pandas.DataFrame or None
    change_history : list[pandas.DataFrame]
        Undo stack; a snapshot of the pre-change frame is appended.

    Returns
    -------
    tuple[str, pandas.DataFrame, list]
        (status message, updated DataFrame, updated history).
    """
    if uploaded_df is None:
        return "Please upload a dataset first.", uploaded_df, change_history
    if method == "None":
        return "", uploaded_df, change_history
    if not selected_columns:
        return "Please select columns to apply data handling.", uploaded_df, change_history

    # Placeholder strings treated as missing in object columns.  Defined once
    # instead of being duplicated per method as before.
    patterns = ['UNKNOWN', 'unknown', 'ERROR', 'error', 'NULL', 'null', 'NA', 'na', 'N/A',
                'Not Given', 'not given', 'NOT GIVEN', '', ' ', '-', '?', 'NaN', 'nan',
                'None', 'none', 'NONE', '#N/A', 'n/a', 'N.A.', 'n.a.']

    def _blank_out_pseudo_missing(series):
        """Convert placeholder strings in an object column to NaN."""
        if series.dtype == 'object':
            series = series.replace(patterns, np.nan).replace('', np.nan)
        return series

    try:
        if method == "Clean All Missing":
            # Bug fix: checked before snapshotting so an unavailable method
            # no longer pushes a no-op entry onto the undo history.
            return "Clean All Missing is not available", uploaded_df, change_history

        change_history.append(uploaded_df.copy())  # snapshot for undo
        df_copy = uploaded_df.copy()
        processed_columns = []
        dropped_columns = []

        for col in selected_columns:
            if col not in df_copy.columns:
                continue
            if method == "Forward Fill":
                # NOTE(review): a debugging hack that silently skipped any
                # column literally named 'title' was removed here.
                df_copy[col] = _blank_out_pseudo_missing(df_copy[col]).ffill()
                processed_columns.append(col)
            elif method == "Backward Fill":
                df_copy[col] = _blank_out_pseudo_missing(df_copy[col]).bfill()
                processed_columns.append(col)
            elif method == "Constant Fill":
                fill_val = constant_value.strip() if constant_value else "Unknown"
                df_copy[col] = _blank_out_pseudo_missing(df_copy[col]).fillna(fill_val)
                processed_columns.append(col)
            elif method in ("Mean Fill", "Median Fill"):
                # Non-numeric columns are coerced; unparseable cells become
                # NaN and are then filled with the statistic.
                if pd.api.types.is_numeric_dtype(df_copy[col]):
                    numeric_col = df_copy[col]
                else:
                    numeric_col = pd.to_numeric(df_copy[col], errors='coerce')
                if not numeric_col.isna().all():
                    stat = numeric_col.mean() if method == "Mean Fill" else numeric_col.median()
                    df_copy[col] = numeric_col.fillna(stat)
                    processed_columns.append(col)
            elif method == "Mode Fill":
                # Mode is computed over values that are neither NaN nor a
                # placeholder, then used to replace both.
                valid_values = df_copy[col][~df_copy[col].isin(patterns) & df_copy[col].notna()]
                if len(valid_values) > 0:
                    mode_value = valid_values.mode()
                    if len(mode_value) > 0:
                        most_common = mode_value.iloc[0]
                        df_copy[col] = df_copy[col].replace(patterns, most_common).fillna(most_common)
                        processed_columns.append(col)
            elif method == "Drop Columns":
                df_copy = df_copy.drop(columns=[col])
                dropped_columns.append(col)

        uploaded_df = df_copy
        if processed_columns:
            result = f"Applied {method} to: {', '.join(processed_columns)}"
            for col in processed_columns:
                if col in uploaded_df.columns:
                    after_missing = uploaded_df[col].isnull().sum()
                    result += f"\n- {col}: {after_missing} missing values remaining"
        elif dropped_columns:
            result = f"Dropped columns: {', '.join(dropped_columns)}"
        else:
            result = "No columns processed - check column selection or data types"
        return result, uploaded_df, change_history
    except Exception as e:
        return f"Error: {str(e)}", uploaded_df, change_history
def undo_last_change(uploaded_df, change_history):
    """Pop the most recent snapshot off the undo stack and restore it."""
    if not change_history:
        return "No changes to undo.", uploaded_df, change_history
    restored = change_history.pop()
    rows, cols = restored.shape
    message = f"Undid last change. Dataset now has {rows} rows × {cols} columns"
    return message, restored, change_history
def undo_all_changes(original_df, change_history):
    """Restore the pristine dataset copy and wipe the undo stack."""
    if original_df is None:
        return "No original dataset to restore.", None, change_history
    restored = original_df.copy()
    rows, cols = restored.shape
    message = f"Dataset restored to original state ({rows} rows × {cols} columns)"
    return message, restored, []
def download_dataset(uploaded_df, dataset_name):
    """Write the current dataset to a CSV in the temp dir and return its path.

    The output name is ``<base>_modified.csv`` where ``<base>`` is the
    dataset name with a *trailing* .csv/.xlsx/.xls extension removed.
    Returns None when no dataset is loaded.
    """
    if uploaded_df is None:
        return None
    if dataset_name:
        # Bug fix: the old str.replace-based stripping removed ".csv" etc.
        # anywhere in the name (mangling e.g. "my.csv.backup"); only a
        # trailing known extension is stripped now.
        base_name, ext = os.path.splitext(dataset_name)
        if ext.lower() not in ('.csv', '.xlsx', '.xls'):
            base_name = dataset_name
        filename = f"{base_name}_modified.csv"
    else:
        filename = "modified_dataset.csv"
    filepath = os.path.join(tempfile.gettempdir(), filename)
    uploaded_df.to_csv(filepath, index=False)
    return filepath
def display_data_format(format_type, selected_columns, uploaded_df):
    """Return up to the first 100 rows as a DataFrame preview.

    Only the "DataFrame" format yields output; any other format type (or a
    missing dataset) yields None.  When columns are selected, the preview
    is restricted to them.
    """
    if uploaded_df is None or format_type == "None":
        return None
    has_selection = selected_columns and len(selected_columns) > 0
    subset = uploaded_df[selected_columns] if has_selection else uploaded_df
    if format_type != "DataFrame":
        return None
    return subset.head(100)
def display_text_format(format_type, selected_columns, uploaded_df):
    """Render the first 20 rows as JSON or a Python-dict string.

    Returns "" for unsupported format types and missing datasets.  (Bug
    fix: the function previously fell off the end and returned None for
    unknown formats while every other path returned a string.)
    """
    if uploaded_df is None or format_type == "None":
        return ""
    if selected_columns and len(selected_columns) > 0:
        df_to_show = uploaded_df[selected_columns]
    else:
        df_to_show = uploaded_df
    if format_type == "JSON":
        return df_to_show.head(20).to_json(orient='records', indent=2)
    if format_type == "Dictionary":
        return str(df_to_show.head(20).to_dict(orient='records'))
    return ""