Spaces:
Running
Running
import os
import tempfile
from io import BytesIO

import chardet
import gradio as gr
import pandas as pd
def detect_encoding(file_bytes):
    """Guess the text encoding of raw file content using chardet.

    Only the first 10,000 bytes are inspected so detection stays fast on
    large uploads.

    Args:
        file_bytes: Raw bytes of the file to inspect.

    Returns:
        The encoding name reported by chardet, or ``"utf-8"`` when chardet
        reports no encoding (its result is ``None`` when it cannot decide,
        e.g. for empty input).
    """
    result = chardet.detect(file_bytes[:10000])
    # Never hand ``None`` to callers: it would show up verbatim in
    # user-facing messages such as "Encoding used: None".
    return result["encoding"] or "utf-8"
def _read_upload(input_file):
    """Return ``(file_bytes, file_name)`` for a file-like object or a path."""
    if hasattr(input_file, "read"):
        file_bytes = input_file.read()
        # File-like objects without a ``name`` (e.g. BytesIO) get an empty
        # name, so they fail the extension check with a clear message
        # instead of being passed to open().
        return file_bytes, str(getattr(input_file, "name", ""))
    file_name = os.fspath(input_file)
    with open(file_name, "rb") as f:
        return f.read(), file_name


def _read_csv_with_fallback(file_bytes, encoding):
    """Parse CSV bytes; return ``(df, encoding_used)`` or ``(None, encoding)``."""
    # utf-8 is retried first because a detector that only saw an ASCII sample
    # will mis-report a UTF-8 file. latin1 goes last: it decodes any byte
    # sequence, so nothing placed after it could ever be reached.
    candidates = [encoding]
    candidates += [enc for enc in ("utf-8", "cp1252", "latin1") if enc != encoding]
    for candidate in candidates:
        try:
            return pd.read_csv(BytesIO(file_bytes), encoding=candidate), candidate
        except (UnicodeError, LookupError):
            # UnicodeError covers decode failures (including a missing UTF-16
            # BOM); LookupError covers codec names Python does not know.
            continue
    return None, encoding


def _new_output_path(file_name):
    """Return a path for ``file_name`` inside a fresh per-call temp directory."""
    # A unique directory per call keeps concurrent conversions from
    # overwriting each other's result file.
    return os.path.join(tempfile.mkdtemp(prefix="dataformat_"), file_name)


def convert_file(input_file, conversion_type, encoding_option):
    """Convert an uploaded file between CSV and Parquet.

    Args:
        input_file: The upload, either a file-like object or a path; ``None``
            when nothing was uploaded.
        conversion_type: ``"CSV to Parquet"`` or ``"Parquet to CSV"``.
        encoding_option: ``"Auto-detect"`` or an explicit encoding name. For
            CSV input it selects the encoding used to read the file; for CSV
            output it selects the encoding used to write it, and
            ``"Auto-detect"`` then means UTF-8 because a binary Parquet file
            has no text encoding to detect.

    Returns:
        A ``(output_path, message)`` tuple. ``output_path`` is the converted
        file, or ``None`` on failure, in which case ``message`` explains why.
        On success ``message`` holds a summary and a preview of the first
        10 rows.
    """
    if input_file is None:
        return None, "Please upload a file."

    try:
        file_bytes, file_name = _read_upload(input_file)
        file_extension = file_name.lower().rsplit(".", 1)[-1]

        if conversion_type == "CSV to Parquet":
            if file_extension != "csv":
                return None, "For CSV to Parquet conversion, please upload a CSV file."

            if encoding_option == "Auto-detect":
                # Guard against a detector that reports no encoding.
                encoding = detect_encoding(file_bytes) or "utf-8"
            else:
                encoding = encoding_option

            df, encoding_used = _read_csv_with_fallback(file_bytes, encoding)
            if df is None:
                return None, f"Failed to decode the CSV file. Auto-detected encoding was '{encoding}'. Please try selecting a specific encoding."
            encoding = encoding_used

            output_file = _new_output_path("output.parquet")
            df.to_parquet(output_file, index=False)
            converted_format = "Parquet"

        elif conversion_type == "Parquet to CSV":
            if file_extension != "parquet":
                return None, "For Parquet to CSV conversion, please upload a Parquet file."

            # Do not run encoding detection on binary Parquet bytes: the guess
            # would be meaningless and could make to_csv fail or emit mojibake.
            encoding = "utf-8" if encoding_option == "Auto-detect" else encoding_option

            df = pd.read_parquet(BytesIO(file_bytes))
            output_file = _new_output_path("output.csv")
            df.to_csv(output_file, index=False, encoding=encoding)
            converted_format = "CSV"

        else:
            return None, "Invalid conversion type selected."

        # Generate a preview of the top 10 rows
        preview = df.head(10).to_string(index=False)
        info_message = (
            f"Input file: {file_name}\n"
            f"Converted file format: {converted_format}\n"
            f"Encoding used: {encoding}\n"
            f"Total rows: {len(df)}\n"
            f"Total columns: {len(df.columns)}\n\n"
            f"Preview (Top 10 Rows):\n{preview}"
        )
        return output_file, info_message
    except Exception as e:
        # UI boundary: report any failure as a message instead of raising.
        return None, f"Error during conversion: {e}"
# Enhanced custom CSS for a more visually appealing interface | |
custom_css = """ | |
body { | |
background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%); | |
font-family: 'Poppins', 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; | |
} | |
.gradio-container { | |
max-width: 950px; | |
margin: 40px auto; | |
padding: 30px; | |
background-color: #ffffff; | |
border-radius: 16px; | |
box-shadow: 0 10px 25px rgba(0,0,0,0.1); | |
} | |
h1 { | |
color: #3a4149; | |
font-size: 2.5rem; | |
text-align: center; | |
margin-bottom: 5px; | |
font-weight: 600; | |
} | |
h2 { | |
color: #5a6570; | |
font-size: 1.2rem; | |
text-align: center; | |
margin-bottom: 25px; | |
font-weight: 400; | |
} | |
.header-icon { | |
font-size: 3rem; | |
text-align: center; | |
margin-bottom: 10px; | |
color: #4285f4; | |
} | |
.instruction-box { | |
background-color: #f8f9fa; | |
border-left: 4px solid #4285f4; | |
padding: 15px; | |
margin-bottom: 25px; | |
border-radius: 6px; | |
} | |
.instruction-step { | |
margin: 8px 0; | |
padding-left: 10px; | |
} | |
.file-box { | |
border: 2px dashed #ddd; | |
border-radius: 12px; | |
padding: 20px; | |
transition: all 0.3s ease; | |
} | |
.file-box:hover { | |
border-color: #4285f4; | |
box-shadow: 0 5px 15px rgba(66, 133, 244, 0.15); | |
} | |
.conversion-radio label { | |
padding: 10px 15px; | |
margin: 5px; | |
border-radius: 8px; | |
border: 1px solid #eaeaea; | |
transition: all 0.2s ease; | |
} | |
.conversion-radio input:checked + label { | |
background-color: #e8f0fe; | |
border-color: #4285f4; | |
color: #4285f4; | |
} | |
.convert-button { | |
background: linear-gradient(to right, #4285f4, #34a853) !important; | |
color: white !important; | |
border: none !important; | |
padding: 12px 25px !important; | |
font-size: 16px !important; | |
font-weight: 500 !important; | |
border-radius: 30px !important; | |
cursor: pointer; | |
margin: 20px auto !important; | |
display: block !important; | |
box-shadow: 0 4px 12px rgba(66, 133, 244, 0.25) !important; | |
} | |
.convert-button:hover { | |
box-shadow: 0 6px 16px rgba(66, 133, 244, 0.4) !important; | |
transform: translateY(-2px); | |
} | |
.footer { | |
text-align: center; | |
margin-top: 30px; | |
color: #70757a; | |
font-size: 0.9rem; | |
} | |
.preview-box { | |
background-color: #f8f9fa; | |
border-radius: 8px; | |
padding: 15px; | |
font-family: monospace; | |
white-space: pre-wrap; | |
max-height: 400px; | |
overflow-y: auto; | |
} | |
.info-tag { | |
display: inline-block; | |
background-color: #e8f0fe; | |
color: #4285f4; | |
padding: 4px 10px; | |
border-radius: 20px; | |
font-size: 0.85rem; | |
margin-right: 8px; | |
margin-bottom: 8px; | |
} | |
.divider { | |
height: 1px; | |
background: linear-gradient(to right, transparent, #ddd, transparent); | |
margin: 25px 0; | |
} | |
.error-message { | |
color: #d93025; | |
background-color: #fce8e6; | |
padding: 10px; | |
border-radius: 8px; | |
margin-top: 10px; | |
font-size: 0.9rem; | |
} | |
.success-message { | |
color: #188038; | |
background-color: #e6f4ea; | |
padding: 10px; | |
border-radius: 8px; | |
margin-top: 10px; | |
font-size: 0.9rem; | |
} | |
""" | |
# Build the UI: an explanatory left column, and a right column holding the
# upload controls, the download slot and the text preview.
with gr.Blocks(css=custom_css, title="DataFormat Converter") as demo:
    gr.HTML('<div class="header-icon">📊</div>')
    gr.Markdown("# DataFormat Converter")
    gr.Markdown("## Seamlessly convert between CSV and Parquet formats with just a few clicks")
    gr.HTML('<div class="divider"></div>')

    with gr.Row():
        with gr.Column():
            gr.HTML("""
            <div class="instruction-box">
                <h3>How It Works</h3>
                <div class="instruction-step">1. Upload your CSV or Parquet file</div>
                <div class="instruction-step">2. Select the conversion direction</div>
                <div class="instruction-step">3. Choose encoding (or leave as auto-detect)</div>
                <div class="instruction-step">4. Click "Convert" and download your transformed file</div>
            </div>
            <div class="info-section">
                <div class="info-tag">Fast Conversion</div>
                <div class="info-tag">Data Preview</div>
                <div class="info-tag">Multi-Encoding Support</div>
                <div class="info-tag">Maintains Structure</div>
            </div>
            """)
            gr.HTML("""
            <div style="margin-top: 25px;">
                <h3>Why Convert?</h3>
                <p>Parquet files offer significant advantages for data storage and analysis:</p>
                <ul>
                    <li>Smaller file size (up to 87% reduction)</li>
                    <li>Faster query performance</li>
                    <li>Column-oriented storage</li>
                    <li>Better compression</li>
                </ul>
                <p>CSV files are useful for:</p>
                <ul>
                    <li>Universal compatibility</li>
                    <li>Human readability</li>
                    <li>Simple integration with many tools</li>
                </ul>
            </div>
            """)

        with gr.Column():
            # A pair of gr.HTML('<div ...>') / gr.HTML('</div>') calls cannot
            # wrap other components: each gr.HTML renders as its own element,
            # so the opening div would close immediately and stay empty. A
            # layout container carrying the CSS class wraps the controls.
            with gr.Group(elem_classes=["file-box"]):
                input_file = gr.File(label="Upload Your File")
                conversion_type = gr.Radio(
                    choices=["CSV to Parquet", "Parquet to CSV"],
                    label="Select Conversion Type",
                    value="CSV to Parquet",
                    elem_classes=["conversion-radio"],
                )
                encoding_option = gr.Dropdown(
                    choices=["Auto-detect", "utf-8", "latin1", "iso-8859-1", "cp1252", "utf-16"],
                    value="Auto-detect",
                    label="Select CSV Encoding",
                )
                convert_button = gr.Button("Convert Now", elem_classes=["convert-button"])

            with gr.Accordion("Conversion Results", open=False):
                output_file = gr.File(label="Download Converted File")
            with gr.Accordion("Data Preview", open=True):
                preview = gr.Textbox(
                    label="File Information and Preview",
                    lines=15,
                    elem_classes=["preview-box"],
                )

    gr.HTML('<div class="divider"></div>')
    gr.HTML("""
    <div class="footer">
        <p>DataFormat Converter © 2025 | Built with Gradio | An efficient tool for data professionals</p>
    </div>
    """)

    # Run the conversion and fill both the download slot and the preview box.
    convert_button.click(
        fn=convert_file,
        inputs=[input_file, conversion_type, encoding_option],
        outputs=[output_file, preview],
    )

    def update_encoding_visibility(conversion_type):
        """Show the encoding dropdown only for the CSV to Parquet direction."""
        return gr.update(visible=conversion_type == "CSV to Parquet")

    conversion_type.change(
        fn=update_encoding_visibility,
        inputs=conversion_type,
        outputs=encoding_option,
    )

if __name__ == "__main__":
    demo.launch()