Datasets-Convertor

Running

App Files Files Community

openfree commited on 2 days ago

Commit

6bc8109

verified ·

1 Parent(s): be1f0be

Delete app-backup.py

Browse files

Files changed (1) hide show

app-backup.py +0 -179

app-backup.py DELETED Viewed

@@ -1,179 +0,0 @@
-import gradio as gr
-import pandas as pd
-import json
-from io import BytesIO
-import requests
-def dataset_converter(input_file, conversion_type, parquet_url):
-    # Initialize variables for file data and extension
-    file_bytes = None
-    file_name = None
-    file_extension = None
-    # Read the input file if provided
-    if input_file is not None:
-        try:
-            file_bytes = input_file.read()
-            file_name = input_file.name
-        except AttributeError:
-            file_name = input_file
-            with open(file_name, "rb") as f:
-                file_bytes = f.read()
-        file_extension = file_name.lower().split('.')[-1]
-    # Conversion: CSV to Parquet
-    if conversion_type == "CSV to Parquet":
-        if input_file is None or file_extension != "csv":
-            raise ValueError("For CSV to Parquet conversion, please upload a CSV file. 📄")
-        df = pd.read_csv(BytesIO(file_bytes))
-        output_file = "output.parquet"
-        df.to_parquet(output_file, index=False)
-        converted_format = "Parquet"
-        preview_str = df.head(10).to_string(index=False)
-    # Conversion: Parquet to CSV
-    elif conversion_type == "Parquet to CSV":
-        if input_file is None or file_extension != "parquet":
-            raise ValueError("For Parquet to CSV conversion, please upload a Parquet file. 📄")
-        df = pd.read_parquet(BytesIO(file_bytes))
-        output_file = "output.csv"
-        df.to_csv(output_file, index=False)
-        converted_format = "CSV"
-        preview_str = df.head(10).to_string(index=False)
-    # Conversion: CSV to JSONL
-    elif conversion_type == "CSV to JSONL":
-        if input_file is None or file_extension != "csv":
-            raise ValueError("For CSV to JSONL conversion, please upload a CSV file. 📄")
-        # Read CSV with latin1 encoding
-        df = pd.read_csv(BytesIO(file_bytes), encoding='latin1')
-        output_file = "metadata.jsonl"
-        total_data = []
-        for index, row in df.iterrows():
-            data = {}
-            file_name_val = None  # Initialize file_name for each row
-            for column in df.columns:
-                if column == 'file_name':
-                    file_name_val = row[column]
-                data[column] = row[column]
-            row_data = {"file_name": file_name_val, "ground_truth": json.dumps(data)}
-            total_data.append(row_data)
-        # Write JSONL output (using write mode so previous data is overwritten)
-        with open(output_file, 'w', encoding='utf-8') as f:
-            for row_data in total_data:
-                f.write(json.dumps(row_data) + '\n')
-        converted_format = "JSONL"
-        preview_str = df.head(10).to_string(index=False)
-    # Conversion: Parquet to JSONL
-    elif conversion_type == "Parquet to JSONL":
-        # Use uploaded file if available; otherwise try the provided URL
-        if input_file is not None:
-            df = pd.read_parquet(BytesIO(file_bytes))
-        elif parquet_url:
-            response = requests.get(parquet_url)
-            response.raise_for_status()  # Ensure the request was successful
-            df = pd.read_parquet(BytesIO(response.content))
-            file_name = "from_url.parquet"
-        else:
-            raise ValueError("For Parquet to JSONL conversion, please upload a file or provide a URL. 🌐")
-        output_file = "output.jsonl"
-        # Recursive function to decode bytes to UTF-8 strings
-        def recursive_sanitize(val):
-            if isinstance(val, bytes):
-                return val.decode("utf-8", errors="replace")
-            elif isinstance(val, dict):
-                return {k: recursive_sanitize(v) for k, v in val.items()}
-            elif isinstance(val, list):
-                return [recursive_sanitize(item) for item in val]
-            else:
-                return val
-        records = df.to_dict(orient="records")
-        with open(output_file, "w", encoding="utf-8") as f:
-            for record in records:
-                sanitized_record = recursive_sanitize(record)
-                f.write(json.dumps(sanitized_record, ensure_ascii=False) + "\n")
-        converted_format = "JSONL"
-        preview_str = df.head(10).to_string(index=False)
-    else:
-        raise ValueError("Invalid conversion type selected. ⚠️")
-    info_message = (
-        f"Input file: {file_name if file_name is not None else 'N/A'}\n"
-        f"Converted file format: {converted_format}\n\n"
-        f"Preview (Top 10 Rows):\n{preview_str}\n\n"
-        "Community: https://discord.gg/openfreeai 🚀"
-    )
-    return output_file, info_message
-# Custom CSS for a modern and sleek look
-custom_css = """
-body {
-    background-color: #f4f4f4;
-    font-family: 'Helvetica Neue', Arial, sans-serif;
-}
-.gradio-container {
-    max-width: 900px;
-    margin: 40px auto;
-    padding: 20px;
-    background-color: #ffffff;
-    border-radius: 12px;
-    box-shadow: 0 8px 16px rgba(0,0,0,0.1);
-}
-h1, h2 {
-    color: #333333;
-}
-.gradio-input, .gradio-output {
-    margin-bottom: 20px;
-}
-.gradio-button {
-    background-color: #4CAF50 !important;
-    color: white !important;
-    border: none !important;
-    padding: 10px 20px !important;
-    font-size: 16px !important;
-    border-radius: 6px !important;
-    cursor: pointer;
-}
-.gradio-button:hover {
-    background-color: #45a049 !important;
-}
-"""
-with gr.Blocks(css=custom_css, title="Datasets Convertor") as demo:
-    gr.Markdown("# Datasets Convertor 🚀")
-    gr.Markdown(
-        "Upload a CSV or Parquet file (or provide a Parquet file URL for Parquet to JSONL conversion) "
-        "and select the conversion type. The app converts the file to the desired format and displays a preview of the top 10 rows. ✨"
-    )
-    with gr.Row():
-        with gr.Column(scale=1):
-            input_file = gr.File(label="Upload CSV or Parquet File 📄")
-        with gr.Column(scale=1):
-            conversion_type = gr.Radio(
-                choices=["CSV to Parquet", "Parquet to CSV", "CSV to JSONL", "Parquet to JSONL"],
-                label="Conversion Type 🔄"
-            )
-    # Optional URL input for Parquet to JSONL conversion
-    parquet_url = gr.Textbox(label="Parquet File URL (Optional) 🌐", placeholder="Enter URL if not uploading a file")
-    convert_button = gr.Button("Convert ⚡", elem_classes=["gradio-button"])
-    with gr.Row():
-        output_file = gr.File(label="Converted File 💾")
-        preview = gr.Textbox(label="Preview (Top 10 Rows) 🔍", lines=15)
-    convert_button.click(
-        fn=dataset_converter,
-        inputs=[input_file, conversion_type, parquet_url],
-        outputs=[output_file, preview]
-    )
-    gr.Markdown("**Join our Community:** [https://discord.gg/openfreeai](https://discord.gg/openfreeai) 🤝")
-demo.launch()