openfree committed on
Commit
6bc8109
·
verified ·
1 Parent(s): be1f0be

Delete app-backup.py

Browse files
Files changed (1) hide show
  1. app-backup.py +0 -179
app-backup.py DELETED
@@ -1,179 +0,0 @@
1
- import gradio as gr
2
- import pandas as pd
3
- import json
4
- from io import BytesIO
5
- import requests
6
-
7
def _sanitize_for_json(val):
    """Recursively convert *val* into a value that ``json.dumps`` accepts.

    Handles the cell types that pandas hands back and the standard JSON
    encoder rejects: raw bytes, NumPy scalars/arrays and date/time objects.
    Containers are walked recursively; anything else is returned unchanged.
    """
    # Function-scope import keeps this block self-contained.
    from datetime import date, time

    if isinstance(val, bytes):
        return val.decode("utf-8", errors="replace")
    if isinstance(val, dict):
        return {key: _sanitize_for_json(item) for key, item in val.items()}
    if isinstance(val, (list, tuple)):
        return [_sanitize_for_json(item) for item in val]
    if val is pd.NaT:
        # Missing timestamp: emit JSON null rather than the string "NaT".
        return None
    if isinstance(val, (date, time)):
        # ``datetime`` and ``pd.Timestamp`` are ``date`` subclasses.
        return val.isoformat()
    # NumPy scalars and arrays both expose ``tolist()``, which yields plain
    # Python objects; recurse in case the result still holds bytes etc.
    to_list = getattr(val, "tolist", None)
    if callable(to_list):
        return _sanitize_for_json(to_list())
    return val


def dataset_converter(input_file, conversion_type, parquet_url):
    """Convert an uploaded dataset file and return the result plus a summary.

    Args:
        input_file: Either an object exposing ``read()`` and ``name``, a
            filesystem path, or ``None`` when nothing was uploaded.
        conversion_type: One of ``"CSV to Parquet"``, ``"Parquet to CSV"``,
            ``"CSV to JSONL"`` or ``"Parquet to JSONL"``.
        parquet_url: URL of a Parquet file; only consulted for
            ``"Parquet to JSONL"`` when ``input_file`` is ``None``.

    Returns:
        A ``(output_file, info_message)`` tuple: the path of the file that
        was written (relative to the current working directory) and a text
        summary containing a preview of the first 10 rows.

    Raises:
        ValueError: If the required input is missing, has the wrong
            extension, or ``conversion_type`` is not recognised.
    """
    file_bytes = None
    file_name = None
    file_extension = None

    # Read the input file if provided
    if input_file is not None:
        try:
            file_bytes = input_file.read()
            file_name = input_file.name
        except AttributeError:
            # Not a file-like object: treat it as a path on disk.
            file_name = input_file
            with open(file_name, "rb") as f:
                file_bytes = f.read()
        file_extension = file_name.lower().split('.')[-1]

    # Conversion: CSV to Parquet
    if conversion_type == "CSV to Parquet":
        if input_file is None or file_extension != "csv":
            raise ValueError("For CSV to Parquet conversion, please upload a CSV file. 📄")
        df = pd.read_csv(BytesIO(file_bytes))
        output_file = "output.parquet"
        df.to_parquet(output_file, index=False)
        converted_format = "Parquet"

    # Conversion: Parquet to CSV
    elif conversion_type == "Parquet to CSV":
        if input_file is None or file_extension != "parquet":
            raise ValueError("For Parquet to CSV conversion, please upload a Parquet file. 📄")
        df = pd.read_parquet(BytesIO(file_bytes))
        output_file = "output.csv"
        df.to_csv(output_file, index=False)
        converted_format = "CSV"

    # Conversion: CSV to JSONL
    elif conversion_type == "CSV to JSONL":
        if input_file is None or file_extension != "csv":
            raise ValueError("For CSV to JSONL conversion, please upload a CSV file. 📄")
        # Read CSV with latin1 encoding
        df = pd.read_csv(BytesIO(file_bytes), encoding='latin1')
        output_file = "metadata.jsonl"
        total_data = []
        for _, row in df.iterrows():
            # iterrows() yields NumPy scalars when every column shares one
            # numeric dtype; json.dumps rejects e.g. numpy.int64, so convert
            # each cell to a plain Python value first.
            data = {column: _sanitize_for_json(row[column]) for column in df.columns}
            total_data.append(
                {"file_name": data.get("file_name"), "ground_truth": json.dumps(data)}
            )
        # Write only after every row serialized, so a failure does not
        # leave a half-written file behind; "w" overwrites earlier output.
        with open(output_file, 'w', encoding='utf-8') as f:
            for row_data in total_data:
                f.write(json.dumps(row_data) + '\n')
        converted_format = "JSONL"

    # Conversion: Parquet to JSONL
    elif conversion_type == "Parquet to JSONL":
        # Use uploaded file if available; otherwise try the provided URL
        if input_file is not None:
            df = pd.read_parquet(BytesIO(file_bytes))
        elif parquet_url:
            # Without a timeout a stalled server would block this request
            # forever.
            response = requests.get(parquet_url, timeout=30)
            response.raise_for_status()  # Ensure the request was successful
            df = pd.read_parquet(BytesIO(response.content))
            file_name = "from_url.parquet"
        else:
            raise ValueError("For Parquet to JSONL conversion, please upload a file or provide a URL. 🌐")

        output_file = "output.jsonl"
        records = df.to_dict(orient="records")
        with open(output_file, "w", encoding="utf-8") as f:
            for record in records:
                sanitized_record = _sanitize_for_json(record)
                f.write(json.dumps(sanitized_record, ensure_ascii=False) + "\n")
        converted_format = "JSONL"

    else:
        raise ValueError("Invalid conversion type selected. ⚠️")

    # Every branch previews the same way, so build the preview once here.
    preview_str = df.head(10).to_string(index=False)

    info_message = (
        f"Input file: {file_name if file_name is not None else 'N/A'}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview_str}\n\n"
        "Community: https://discord.gg/openfreeai 🚀"
    )
    return output_file, info_message
111
-
112
# Stylesheet handed to gr.Blocks below: page background, a centred card for
# the app container, and a green call-to-action button.
custom_css = """
body {
background-color: #f4f4f4;
font-family: 'Helvetica Neue', Arial, sans-serif;
}
.gradio-container {
max-width: 900px;
margin: 40px auto;
padding: 20px;
background-color: #ffffff;
border-radius: 12px;
box-shadow: 0 8px 16px rgba(0,0,0,0.1);
}
h1, h2 {
color: #333333;
}
.gradio-input, .gradio-output {
margin-bottom: 20px;
}
.gradio-button {
background-color: #4CAF50 !important;
color: white !important;
border: none !important;
padding: 10px 20px !important;
font-size: 16px !important;
border-radius: 6px !important;
cursor: pointer;
}
.gradio-button:hover {
background-color: #45a049 !important;
}
"""

# Build the page: header, inputs (file + conversion type + optional URL),
# the convert button, and the two outputs fed by dataset_converter.
with gr.Blocks(title="Datasets Convertor", css=custom_css) as demo:
    gr.Markdown("# Datasets Convertor 🚀")
    gr.Markdown(
        "Upload a CSV or Parquet file (or provide a Parquet file URL for Parquet to JSONL conversion) "
        "and select the conversion type. The app converts the file to the desired format and displays a preview of the top 10 rows. ✨"
    )

    # Inputs sit side by side: upload on the left, conversion choice on the right.
    with gr.Row():
        with gr.Column(scale=1):
            input_file = gr.File(label="Upload CSV or Parquet File 📄")
        with gr.Column(scale=1):
            conversion_type = gr.Radio(
                label="Conversion Type 🔄",
                choices=[
                    "CSV to Parquet",
                    "Parquet to CSV",
                    "CSV to JSONL",
                    "Parquet to JSONL",
                ],
            )

    # The URL is only consulted for Parquet to JSONL when no file is uploaded.
    parquet_url = gr.Textbox(
        placeholder="Enter URL if not uploading a file",
        label="Parquet File URL (Optional) 🌐",
    )

    convert_button = gr.Button("Convert ⚡", elem_classes=["gradio-button"])

    # Outputs: the downloadable converted file next to the text summary.
    with gr.Row():
        output_file = gr.File(label="Converted File 💾")
        preview = gr.Textbox(lines=15, label="Preview (Top 10 Rows) 🔍")

    # Input order must match dataset_converter's positional parameters.
    convert_button.click(
        dataset_converter,
        inputs=[input_file, conversion_type, parquet_url],
        outputs=[output_file, preview],
    )

    gr.Markdown("**Join our Community:** [https://discord.gg/openfreeai](https://discord.gg/openfreeai) 🤝")

demo.launch()