CSV-Parquet-Convertors

Sleeping

File size: 10,705 Bytes

import os
import tempfile
from io import BytesIO

import chardet
import gradio as gr
import pandas as pd

def detect_encoding(file_bytes):
    """Guess the text encoding of raw file bytes using chardet.

    Only the first 10,000 bytes are inspected to keep detection fast.

    Args:
        file_bytes: Raw content of the file as ``bytes``.

    Returns:
        The encoding name reported by chardet, or ``'utf-8'`` when chardet
        reports no encoding (``None``) or plain ASCII.
    """
    result = chardet.detect(file_bytes[:10000])
    encoding = result.get('encoding')
    # ASCII is a strict subset of UTF-8, and only a sample was inspected:
    # bytes past the sample may be non-ASCII, so UTF-8 is the safer answer.
    # A missing result is likewise mapped to UTF-8 so callers always get a
    # usable codec name.
    if not encoding or encoding.lower() == 'ascii':
        return 'utf-8'
    return encoding

def _read_csv_with_fallback(file_bytes, encoding):
    """Parse CSV bytes, retrying with common encodings if decoding fails.

    Returns a ``(DataFrame, encoding_used)`` pair, or ``(None, encoding)``
    when none of the candidate encodings could decode the data.
    """
    try:
        return pd.read_csv(BytesIO(file_bytes), encoding=encoding), encoding
    except (UnicodeError, LookupError):
        # UnicodeError: the bytes do not match the codec.
        # LookupError: the codec name is unknown to Python.
        pass

    # UTF-8 is tried first because it is strict; latin1 accepts every byte
    # sequence and would silently garble UTF-8 text if it were tried earlier.
    for candidate in ('utf-8', 'latin1', 'iso-8859-1', 'cp1252'):
        try:
            return pd.read_csv(BytesIO(file_bytes), encoding=candidate), candidate
        except UnicodeDecodeError:
            continue
    return None, encoding


def convert_file(input_file, conversion_type, encoding_option):
    """Convert an uploaded file between CSV and Parquet.

    Args:
        input_file: ``None``, a file path (``str`` or path-like), or a
            file-like object exposing ``read()`` and ``name``.
        conversion_type: ``"CSV to Parquet"`` or ``"Parquet to CSV"``.
        encoding_option: ``"Auto-detect"`` or an explicit codec name. For CSV
            input it selects the decoding; for CSV output it selects the
            encoding written, with ``"Auto-detect"`` meaning UTF-8.

    Returns:
        A ``(output_path, message)`` tuple. On success ``output_path`` points
        to the converted file and ``message`` holds a summary plus a preview
        of the first 10 rows. On failure ``output_path`` is ``None`` and
        ``message`` describes the problem. Exceptions are not propagated.
    """
    try:
        # Check if a file was uploaded
        if input_file is None:
            return None, "Please upload a file."

        # Accept either a path or an already-open file-like object
        if isinstance(input_file, (str, os.PathLike)):
            file_name = os.fspath(input_file)
            with open(file_name, "rb") as f:
                file_bytes = f.read()
        else:
            file_bytes = input_file.read()
            file_name = input_file.name

        file_extension = os.path.splitext(file_name)[1].lower()

        # Conversion: CSV to Parquet
        if conversion_type == "CSV to Parquet":
            if file_extension != ".csv":
                return None, "For CSV to Parquet conversion, please upload a CSV file."

            # Detection only makes sense for text input, so it lives here
            if encoding_option == "Auto-detect":
                requested_encoding = detect_encoding(file_bytes)
            else:
                requested_encoding = encoding_option

            df, encoding = _read_csv_with_fallback(file_bytes, requested_encoding)
            if df is None:
                return None, f"Failed to decode the CSV file. Auto-detected encoding was '{encoding}'. Please try selecting a specific encoding."

            # A fresh directory per call keeps concurrent conversions from
            # overwriting each other's output. The directory is not removed
            # here because the caller still has to serve the file.
            output_file = os.path.join(tempfile.mkdtemp(prefix="converted_"), "output.parquet")
            df.to_parquet(output_file, index=False)
            converted_format = "Parquet"

        # Conversion: Parquet to CSV
        elif conversion_type == "Parquet to CSV":
            if file_extension != ".parquet":
                return None, "For Parquet to CSV conversion, please upload a Parquet file."

            # The input is binary, so there is nothing to detect; running a
            # text-encoding detector on it would yield an arbitrary codec.
            encoding = "utf-8" if encoding_option == "Auto-detect" else encoding_option

            df = pd.read_parquet(BytesIO(file_bytes))
            output_file = os.path.join(tempfile.mkdtemp(prefix="converted_"), "output.csv")
            df.to_csv(output_file, index=False, encoding=encoding)
            converted_format = "CSV"
        else:
            return None, "Invalid conversion type selected."

        # Generate a preview of the top 10 rows
        preview = df.head(10).to_string(index=False)
        info_message = (
            f"Input file: {file_name}\n"
            f"Converted file format: {converted_format}\n"
            f"Encoding used: {encoding}\n"
            f"Total rows: {len(df)}\n"
            f"Total columns: {len(df.columns)}\n\n"
            f"Preview (Top 10 Rows):\n{preview}"
        )
        return output_file, info_message

    except Exception as e:
        # Top-level boundary for the UI callback: report the failure as text
        # instead of letting the exception escape.
        return None, f"Error during conversion: {str(e)}"

# Stylesheet for the app, passed to gr.Blocks(css=...) when the UI is built.
# The class selectors here are matched by class names that the layout code
# assigns through `elem_classes` or writes into inline HTML snippets.
# NOTE(review): `.error-message` and `.success-message` are not referenced by
# anything visible in this file - confirm whether they are still needed.
custom_css = """
body {
    background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
    font-family: 'Poppins', 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}

.gradio-container {
    max-width: 950px;
    margin: 40px auto;
    padding: 30px;
    background-color: #ffffff;
    border-radius: 16px;
    box-shadow: 0 10px 25px rgba(0,0,0,0.1);
}

h1 {
    color: #3a4149;
    font-size: 2.5rem;
    text-align: center;
    margin-bottom: 5px;
    font-weight: 600;
}

h2 {
    color: #5a6570;
    font-size: 1.2rem;
    text-align: center;
    margin-bottom: 25px;
    font-weight: 400;
}

.header-icon {
    font-size: 3rem;
    text-align: center;
    margin-bottom: 10px;
    color: #4285f4;
}

.instruction-box {
    background-color: #f8f9fa;
    border-left: 4px solid #4285f4;
    padding: 15px;
    margin-bottom: 25px;
    border-radius: 6px;
}

.instruction-step {
    margin: 8px 0;
    padding-left: 10px;
}

.file-box {
    border: 2px dashed #ddd;
    border-radius: 12px;
    padding: 20px;
    transition: all 0.3s ease;
}

.file-box:hover {
    border-color: #4285f4;
    box-shadow: 0 5px 15px rgba(66, 133, 244, 0.15);
}

.conversion-radio label {
    padding: 10px 15px;
    margin: 5px;
    border-radius: 8px;
    border: 1px solid #eaeaea;
    transition: all 0.2s ease;
}

.conversion-radio input:checked + label {
    background-color: #e8f0fe;
    border-color: #4285f4;
    color: #4285f4;
}

.convert-button {
    background: linear-gradient(to right, #4285f4, #34a853) !important;
    color: white !important;
    border: none !important;
    padding: 12px 25px !important;
    font-size: 16px !important;
    font-weight: 500 !important;
    border-radius: 30px !important;
    cursor: pointer;
    margin: 20px auto !important;
    display: block !important;
    box-shadow: 0 4px 12px rgba(66, 133, 244, 0.25) !important;
}

.convert-button:hover {
    box-shadow: 0 6px 16px rgba(66, 133, 244, 0.4) !important;
    transform: translateY(-2px);
}

.footer {
    text-align: center;
    margin-top: 30px;
    color: #70757a;
    font-size: 0.9rem;
}

.preview-box {
    background-color: #f8f9fa;
    border-radius: 8px;
    padding: 15px;
    font-family: monospace;
    white-space: pre-wrap;
    max-height: 400px;
    overflow-y: auto;
}

.info-tag {
    display: inline-block;
    background-color: #e8f0fe;
    color: #4285f4;
    padding: 4px 10px;
    border-radius: 20px;
    font-size: 0.85rem;
    margin-right: 8px;
    margin-bottom: 8px;
}

.divider {
    height: 1px;
    background: linear-gradient(to right, transparent, #ddd, transparent);
    margin: 25px 0;
}

.error-message {
    color: #d93025;
    background-color: #fce8e6;
    padding: 10px;
    border-radius: 8px;
    margin-top: 10px;
    font-size: 0.9rem;
}

.success-message {
    color: #188038;
    background-color: #e6f4ea;
    padding: 10px;
    border-radius: 8px;
    margin-top: 10px;
    font-size: 0.9rem;
}
"""

# Build the UI: a header, a two-column body (help text on the left, the
# converter controls and results on the right) and a footer.
with gr.Blocks(css=custom_css, title="DataFormat Converter") as demo:
    gr.HTML('<div class="header-icon">📊</div>')
    gr.Markdown("# DataFormat Converter")
    gr.Markdown("## Seamlessly convert between CSV and Parquet formats with just a few clicks")

    gr.HTML('<div class="divider"></div>')

    with gr.Row():
        # Left column: static usage instructions and background information
        with gr.Column():
            gr.HTML("""
            <div class="instruction-box">
                <h3>How It Works</h3>
                <div class="instruction-step">1. Upload your CSV or Parquet file</div>
                <div class="instruction-step">2. Select the conversion direction</div>
                <div class="instruction-step">3. Choose encoding (or leave as auto-detect)</div>
                <div class="instruction-step">4. Click "Convert" and download your transformed file</div>
            </div>
            
            <div class="info-section">
                <div class="info-tag">Fast Conversion</div>
                <div class="info-tag">Data Preview</div>
                <div class="info-tag">Multi-Encoding Support</div>
                <div class="info-tag">Maintains Structure</div>
            </div>
            """)

            gr.HTML("""
            <div style="margin-top: 25px;">
                <h3>Why Convert?</h3>
                <p>Parquet files offer significant advantages for data storage and analysis:</p>
                <ul>
                    <li>Smaller file size (up to 87% reduction)</li>
                    <li>Faster query performance</li>
                    <li>Column-oriented storage</li>
                    <li>Better compression</li>
                </ul>
                <p>CSV files are useful for:</p>
                <ul>
                    <li>Universal compatibility</li>
                    <li>Human readability</li>
                    <li>Simple integration with many tools</li>
                </ul>
            </div>
            """)

        # Right column: inputs, the convert button and the results
        with gr.Column():
            # The "file-box" class is attached to a real container. Separate
            # gr.HTML components holding an opening and a closing <div> are
            # rendered as independent fragments and cannot wrap the
            # components placed between them.
            with gr.Column(elem_classes=["file-box"]):
                input_file = gr.File(label="Upload Your File")
                conversion_type = gr.Radio(
                    choices=["CSV to Parquet", "Parquet to CSV"],
                    label="Select Conversion Type",
                    value="CSV to Parquet",
                    elem_classes=["conversion-radio"]
                )
                encoding_option = gr.Dropdown(
                    choices=["Auto-detect", "utf-8", "latin1", "iso-8859-1", "cp1252", "utf-16"],
                    value="Auto-detect",
                    label="Select CSV Encoding"
                )
                convert_button = gr.Button("Convert Now", elem_classes=["convert-button"])

            with gr.Accordion("Conversion Results", open=False):
                output_file = gr.File(label="Download Converted File")

            with gr.Accordion("Data Preview", open=True):
                preview = gr.Textbox(
                    label="File Information and Preview",
                    lines=15,
                    elem_classes=["preview-box"]
                )

    gr.HTML('<div class="divider"></div>')

    gr.HTML("""
    <div class="footer">
        <p>DataFormat Converter © 2025 | Built with Gradio | An efficient tool for data professionals</p>
    </div>
    """)

    # Run the conversion on click; the two return values of convert_file
    # feed the download component and the preview textbox respectively.
    convert_button.click(
        fn=convert_file,
        inputs=[input_file, conversion_type, encoding_option],
        outputs=[output_file, preview]
    )

    def update_encoding_visibility(conversion_type):
        """Show the encoding dropdown only for the CSV to Parquet direction."""
        return gr.update(visible=(conversion_type == "CSV to Parquet"))

    conversion_type.change(
        fn=update_encoding_visibility,
        inputs=conversion_type,
        outputs=encoding_option
    )

# Start the web server only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()