cstr committed on
Commit
b83fe04
1 Parent(s): cd02a2c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -11
app.py CHANGED
@@ -41,20 +41,25 @@ def convert_parquet_to_jsonl_polars(input_file, output_dir, override=False):
41
  print(f"Data written to {output_file_path}")
42
 
43
  def convert_parquet_to_jsonl(parquet_filename, jsonl_filename):
44
- # Read the parquet file
45
- df = pd.read_parquet(parquet_filename)
46
-
47
- # Convert the dataframe to a JSON string and handle Unicode characters and forward slashes
48
- json_str = df.to_json(orient='records', lines=True, force_ascii=False)
49
 
50
- # Replace escaped forward slashes if needed
51
- json_str = json_str.replace('\\/', '/')
 
52
 
53
- # Write the modified JSON string to the JSONL file
54
- with open(jsonl_filename, 'w', encoding='utf-8') as file:
55
- file.write(json_str)
56
 
57
- print(f"Data saved to {jsonl_filename}")
 
 
 
 
 
 
58
 
59
  # Function to count lines in a JSONL file
60
  def count_lines_in_jsonl(file_path):
 
41
  print(f"Data written to {output_file_path}")
42
 
43
  def convert_parquet_to_jsonl(parquet_filename, jsonl_filename):
44
+ try:
45
+ # Read the parquet file
46
+ df = pd.read_parquet(parquet_filename)
47
+ logger.info(f"Read Parquet file {parquet_filename} successfully.")
 
48
 
49
+ # Convert the dataframe to a JSON string and handle Unicode characters and forward slashes
50
+ json_str = df.to_json(orient='records', lines=True, force_ascii=False)
51
+ logger.info(f"Converted Parquet file to JSON string.")
52
 
53
+ # Replace escaped forward slashes if needed
54
+ json_str = json_str.replace('\\/', '/')
 
55
 
56
+ # Write the modified JSON string to the JSONL file
57
+ with open(jsonl_filename, 'w', encoding='utf-8') as file:
58
+ file.write(json_str)
59
+ logger.info(f"Data saved to {jsonl_filename}")
60
+ except Exception as e:
61
+ logger.error(f"Failed to convert Parquet to JSONL: {e}")
62
+ raise
63
 
64
  # Function to count lines in a JSONL file
65
  def count_lines_in_jsonl(file_path):