Spaces:
Runtime error
Runtime error
# TODO: this module needs to be cleaned up.
import json | |
import pandas as pd | |
def split_json_file(input_filepath, lines_per_file=50, output_dir='translate_data'):
    """
    Split a line-oriented file into numbered chunk files.

    The input is consumed line by line and written out as
    ``<output_dir>/english_1.json``, ``<output_dir>/english_2.json``, ...,
    each holding at most ``lines_per_file`` consecutive lines. Lines are
    copied verbatim; they are not parsed as JSON.

    param input_filepath: The path to the input JSON file.
    param lines_per_file: The maximum number of lines per output file (>= 1).
    param output_dir: Directory receiving the chunk files; created if missing.
    raises ValueError: If ``lines_per_file`` is smaller than 1.
    """
    # Function-scope imports: the module header does not import these names.
    import os
    from itertools import islice

    # A non-positive chunk size can never make progress; fail loudly instead
    # of silently producing no output.
    if lines_per_file < 1:
        raise ValueError(
            f'lines_per_file must be at least 1, got {lines_per_file!r}'
        )

    # Opening a file inside a missing directory raises FileNotFoundError, so
    # make sure the target directory exists first.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Explicit UTF-8 keeps non-ASCII text intact regardless of the platform's
    # default encoding.
    with open(input_filepath, 'r', encoding='utf-8') as input_file:
        file_counter = 1
        while True:
            # Pull only the next chunk so the whole input never has to sit in
            # memory at once.
            chunk = list(islice(input_file, lines_per_file))
            if not chunk:
                break

            output_filename = os.path.join(
                output_dir, f'english_{file_counter}.json'
            )
            with open(output_filename, 'w', encoding='utf-8') as output_file:
                output_file.writelines(chunk)
            print(f'Created {output_filename}')

            file_counter += 1
def merge_and_save(list1, list2, dict1, dict2, filename='output.csv'):
    """
    Merge two lists and two dictionaries row-wise and save them as a CSV file.

    Row ``i`` of the output holds the i-th element of each list and the i-th
    (key, value) pair of each dictionary, in the dictionaries' iteration
    order. The columns are always:
    ['list1', 'list2', 'keys dict1', 'vals dict1', 'keys dict2', 'vals dict2']

    Inputs may differ in length: the table has as many rows as the longest
    input, and cells with no corresponding element are left missing (pandas
    writes them as empty fields). Empty inputs produce a header-only file.

    Parameters:
    - list1 (list): First list to merge, contributing to column 'list1'.
    - list2 (list): Second list to merge, contributing to column 'list2'.
    - dict1 (dict): First dictionary; keys and values become separate columns.
    - dict2 (dict): Second dictionary; keys and values become separate columns.
    - filename (str): Filename for the saved CSV file.
    """
    # Function-scope import: the module header does not import itertools.
    from itertools import zip_longest

    columns = [
        'list1', 'list2',
        'keys dict1', 'vals dict1',
        'keys dict2', 'vals dict2',
    ]
    no_pair = (None, None)

    # zip_longest pads exhausted inputs with None, so a shorter input no
    # longer raises IndexError and a longer one is no longer cut off.
    data = []
    for item1, item2, pair1, pair2 in zip_longest(
        list1, list2, dict1.items(), dict2.items()
    ):
        key1, val1 = pair1 if pair1 is not None else no_pair
        key2, val2 = pair2 if pair2 is not None else no_pair
        data.append({
            'list1': item1,
            'list2': item2,
            'keys dict1': key1,
            'vals dict1': val1,
            'keys dict2': key2,
            'vals dict2': val2,
        })

    # Passing the columns explicitly keeps the header even when there are
    # no rows at all.
    df = pd.DataFrame(data, columns=columns)

    df.to_csv(filename, index=False)
    print(f"DataFrame saved as '{filename}' in the current directory.")
# new line for every entry | |
def safe_my_dict_as_json(file_name, my_dict):
    """
    Write a dictionary to ``file_name`` as a JSON object, one entry per line.

    Each key/value pair goes on its own line, indented by one space, with
    the value serialized compactly (no nested indentation). The argument is
    also printed to stdout before writing.

    param file_name: Path of the file to (over)write.
    param my_dict: The dictionary to save. A list is also accepted, in which
        case only its first element is written; an empty list is written as
        an empty object.
    raises TypeError: If a key or value cannot be serialized by ``json``.
    """
    print(my_dict)

    # Unwrap the list form *before* anything is counted or serialized, so
    # the commas are placed according to the dictionary actually written.
    if isinstance(my_dict, list):
        my_dict = my_dict[0] if my_dict else {}

    # Build every line up front: a serialization error then leaves an
    # existing file untouched instead of truncated and half-written.
    entries = []
    for key, value in my_dict.items():
        # Serializing a one-entry dict and stripping its braces yields
        # '"key": value' and lets json coerce non-string keys (ints, bools,
        # None) into valid JSON object keys.
        pair = json.dumps({key: value})
        entries.append(' ' + pair[1:-1])

    with open(file_name, 'w', encoding='utf-8') as f:
        f.write('{\n')
        if entries:
            # Joining puts a comma after every entry except the last one.
            f.write(',\n'.join(entries) + '\n')
        f.write('}\n')
if __name__ == "__main__":
    # Executed as a script: only print a short notice, nothing is written.
    print(
        "here are all functions that write to the Datasets"
    )