Spaces:
Sleeping
Sleeping
| # app.py | |
| import streamlit as st | |
| import pandas as pd | |
| from io import StringIO, BytesIO | |
| import os | |
| from data_clean_final import clean_data, display_llm_report | |
| import tempfile | |
# Page setup: set_page_config is the first Streamlit command this script issues.
st.set_page_config(
    page_title="LLM Data Cleaner",
    page_icon=":bar_chart:",
    layout="wide",
)
st.title("LLM-Powered Data Cleaner")

# Introductory blurb rendered as Markdown directly under the title.
_INTRO_MARKDOWN = """
Upload a CSV, TSV or Excel file, and we'll clean it up for you using AI-augmented techniques.
The system will automatically:
- Fix formatting issues
- Handle missing values
- Standardize data entries
- Provide AI-powered suggestions for data improvements
Then, you can download the cleaned data for your analysis.
"""
st.markdown(_INTRO_MARKDOWN)
# --- Helpers used by the upload flow below ---------------------------------


def _null_percentage(df: pd.DataFrame) -> float:
    """Return the share of null cells in *df* as a percentage (2 decimals).

    A frame with zero rows or zero columns has no cells at all, so 0.0 is
    returned instead of dividing by zero (which would display as "nan%").
    """
    total_cells = df.shape[0] * df.shape[1]
    if total_cells == 0:
        return 0.0
    null_cells = int(df.isnull().sum().sum())
    return round(null_cells / total_cells * 100, 2)


def _build_download(df: pd.DataFrame, file_type: str) -> tuple:
    """Serialize *df* in the format matching the uploaded file's extension.

    Args:
        df: The cleaned data to export.
        file_type: Lower-cased extension including the dot (".csv", ".tsv",
            ".xlsx").

    Returns:
        A ``(payload_bytes, download_name, mime_type)`` tuple ready to be
        handed to ``st.download_button``.
    """
    if file_type == ".tsv":
        payload = df.to_csv(index=False, sep="\t").encode("utf-8")
        # "text/tab-separated-values" is the registered media type for TSV.
        return payload, "cleaned_data.tsv", "text/tab-separated-values"

    if file_type == ".xlsx":
        buffer = BytesIO()
        with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
            df.to_excel(writer, index=False)
        # Read the buffer only after the writer has been closed, so the
        # workbook is complete.
        return (
            buffer.getvalue(),
            "cleaned_data.xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )

    # ".csv" and any unexpected extension fall back to CSV so the download
    # button always has a defined payload.
    payload = df.to_csv(index=False).encode("utf-8")
    return payload, "cleaned_data.csv", "text/csv"


# --- Upload flow -----------------------------------------------------------

# File uploader
uploaded_file = st.file_uploader("Choose a file", type=["csv", "tsv", "xlsx"])

if uploaded_file:
    # getvalue() yields the complete upload as bytes regardless of the
    # buffer's current read position.
    file_bytes = uploaded_file.getvalue()
    file_name = uploaded_file.name
    file_type = os.path.splitext(file_name)[1].lower()

    # Clean data and get LLM suggestions
    with st.spinner("Cleaning your data..."):
        # clean_data is called with a filesystem path, so the upload is
        # written to a temporary file that keeps the original extension.
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_type) as temp_file:
            temp_file.write(file_bytes)
            temp_file_path = temp_file.name
        # The handle is closed at this point, so every byte is flushed to
        # disk before clean_data opens the path.
        try:
            cleaned_df, suggestions = clean_data(temp_file_path)
        except Exception as e:
            # Report the failure the same way the original-data preview does
            # and halt this run: nothing below can work without cleaned_df.
            st.error(f"Error cleaning data: {e}")
            st.stop()
        else:
            st.success("Data cleaned successfully!")
        finally:
            # Clean up the temporary file whether or not cleaning succeeded.
            try:
                os.unlink(temp_file_path)
            except FileNotFoundError:
                # Already removed; nothing left to clean up.
                pass

    # Show original data (re-parsed from the uploaded bytes, not the temp file)
    st.subheader("Original Data")
    try:
        if file_type == ".tsv":
            original_df = pd.read_csv(BytesIO(file_bytes), sep="\t")
        elif file_type == ".xlsx":
            original_df = pd.read_excel(BytesIO(file_bytes))
        else:
            original_df = pd.read_csv(BytesIO(file_bytes))
        st.dataframe(original_df.head(10), use_container_width=True)
    except Exception as e:
        # A preview failure is not fatal: the cleaned data is still shown.
        st.error(f"Error loading original data: {str(e)}")

    # Show cleaned data
    st.subheader("Cleaned Data Preview")
    st.dataframe(cleaned_df.head(10), use_container_width=True)

    # Data statistics
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Rows", len(cleaned_df))
    with col2:
        st.metric("Total Columns", len(cleaned_df.columns))
    with col3:
        null_percentage = _null_percentage(cleaned_df)
        st.metric("Null Values (%)", f"{null_percentage}%")

    # Show suggestions from LLM
    display_llm_report(suggestions)

    # Convert to downloadable format (same format as the upload)
    cleaned_file, download_name, mime_type = _build_download(cleaned_df, file_type)

    # Download button
    st.download_button(
        label="📁 Download Cleaned Data",
        data=cleaned_file,
        file_name=download_name,
        mime=mime_type,
    )