# ki_rag_classify / my_1_writer.py
# Uploaded by elia-waefler ("Upload 17 files", commit c2b923e, verified)
# TODO: needs to be cleaned up
import json
import pandas as pd
def split_json_file(input_filepath, lines_per_file=50,
                    output_prefix='translate_data/english_'):
    """
    Split a line-oriented JSON file into multiple files of at most
    'lines_per_file' lines each.

    param input_filepath: The path to the input JSON file.
    param lines_per_file: The maximum number of lines per output file.
    param output_prefix: Prefix for the output filenames; the chunk number
        and '.json' are appended. The default keeps the original
        hard-coded 'translate_data/english_' path, so existing callers
        are unaffected.
    return: List of the output filenames that were created.
    """
    created = []
    # Read all lines up front; the file handle is released immediately after.
    with open(input_filepath, 'r') as input_file:
        lines = input_file.readlines()
    # One numbered output file per chunk of 'lines_per_file' lines.
    for file_counter, start in enumerate(
            range(0, len(lines), lines_per_file), start=1):
        output_filename = f'{output_prefix}{file_counter}.json'
        with open(output_filename, 'w') as output_file:
            # writelines batches the chunk in one call instead of a per-line loop
            output_file.writelines(lines[start:start + lines_per_file])
        print(f'Created {output_filename}')
        created.append(output_filename)
    return created
def merge_and_save(list1, list2, dict1, dict2, filename='output.csv'):
    """
    Merges two lists and two dictionaries into a pandas DataFrame according
    to the specified structure:
    headers: ['list1', 'list2', 'keys dict1', 'vals dict1', 'keys dict2', 'vals dict2']
    and saves it as a CSV file.

    Parameters:
    - list1 (list): First list to merge, contributing to column 'list1'.
    - list2 (list): Second list to merge, contributing to column 'list2'.
    - dict1 (dict): First dictionary to merge, keys and values added as separate columns.
    - dict2 (dict): Second dictionary to merge, keys and values added as separate columns.
    - filename (str): Filename for the saved CSV file.
    """
    # Snapshot the dict items so rows can be addressed positionally.
    dict1_items = list(dict1.items())
    dict2_items = list(dict2.items())
    # NOTE(review): assumes list2, dict1 and dict2 each contain at least
    # len(list1) entries; shorter inputs raise IndexError, as before.
    data = [
        {
            'list1': list1[i],
            'list2': list2[i],
            'keys dict1': dict1_items[i][0],
            'vals dict1': dict1_items[i][1],
            'keys dict2': dict2_items[i][0],
            'vals dict2': dict2_items[i][1],
        }
        for i in range(len(list1))
    ]
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    # BUG FIX: the original printed the literal text "(unknown)" instead of
    # interpolating the actual filename into the f-string.
    print(f"DataFrame saved as '{filename}' in the current directory.")
def safe_my_dict_as_json(file_name, my_dict):
    """
    Write my_dict to file_name as a JSON object with one "key": value pair
    per line.

    param file_name: Path of the output file (overwritten if it exists).
    param my_dict: Dict to serialize. A list is unwrapped to its first
        element, kept for backward compatibility with existing callers.
    """
    print(my_dict)  # debug output, kept from the original implementation
    # Unwrap a single-element list wrapper BEFORE counting items.
    if isinstance(my_dict, list):
        my_dict = my_dict[0]
    # BUG FIX: total_items was previously computed before the list unwrap,
    # so a wrapped multi-key dict got the wrong comma placement and the
    # output was invalid JSON.
    total_items = len(my_dict)
    with open(file_name, 'w') as f:
        f.write('{\n')
        for i, (key, value) in enumerate(my_dict.items()):
            # json.dumps handles quoting and escaping for keys and values
            json_key = json.dumps(key)
            json_value = json.dumps(value)
            # Comma after every pair except the last keeps the object valid JSON
            comma = ',' if i < total_items - 1 else ''
            f.write(f" {json_key}: {json_value}{comma}\n")
        f.write('}\n')
if __name__ == "__main__":
    # Running this module directly only prints a short banner; the actual
    # writer functions above are meant to be imported and called elsewhere.
    banner = "here are all functions that write to the Datasets"
    print(banner)