# ki_rag_classify / my_1_writer.py
# Uploaded by elia-waefler ("Upload 17 files", commit c2b923e, verified)
# TODO: needs to be cleaned up
import json
import pandas as pd
def split_json_file(input_filepath, lines_per_file=50,
                    output_prefix='translate_data/english_'):
    """
    Split a line-oriented JSON file into multiple files of at most
    'lines_per_file' lines each.

    param input_filepath: The path to the input JSON file.
    param lines_per_file: The maximum number of lines per output file.
    param output_prefix: Prefix for the output filenames; the chunk number
        and '.json' are appended. The default keeps the original
        hard-coded 'translate_data/english_' path, so existing callers
        are unaffected.
    return: List of the output filenames that were created.
    """
    created = []
    # Read all lines up front; the file handle is released immediately after.
    with open(input_filepath, 'r') as input_file:
        lines = input_file.readlines()
    # One numbered output file per chunk of 'lines_per_file' lines.
    for file_counter, start in enumerate(
            range(0, len(lines), lines_per_file), start=1):
        output_filename = f'{output_prefix}{file_counter}.json'
        with open(output_filename, 'w') as output_file:
            # writelines batches the chunk in one call instead of a per-line loop
            output_file.writelines(lines[start:start + lines_per_file])
        print(f'Created {output_filename}')
        created.append(output_filename)
    return created
def merge_and_save(list1, list2, dict1, dict2, filename='output.csv'):
    """
    Merges two lists and two dictionaries into a pandas DataFrame according
    to the specified structure:
    headers: ['list1', 'list2', 'keys dict1', 'vals dict1', 'keys dict2', 'vals dict2']
    and saves it as a CSV file.

    Parameters:
    - list1 (list): First list to merge, contributing to column 'list1'.
    - list2 (list): Second list to merge, contributing to column 'list2'.
    - dict1 (dict): First dictionary to merge, keys and values added as separate columns.
    - dict2 (dict): Second dictionary to merge, keys and values added as separate columns.
    - filename (str): Filename for the saved CSV file.
    """
    # Snapshot the dict items so rows can be addressed positionally.
    dict1_items = list(dict1.items())
    dict2_items = list(dict2.items())
    # NOTE(review): assumes list2, dict1 and dict2 each contain at least
    # len(list1) entries; shorter inputs raise IndexError, as before.
    data = [
        {
            'list1': list1[i],
            'list2': list2[i],
            'keys dict1': dict1_items[i][0],
            'vals dict1': dict1_items[i][1],
            'keys dict2': dict2_items[i][0],
            'vals dict2': dict2_items[i][1],
        }
        for i in range(len(list1))
    ]
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    # BUG FIX: the original printed the literal text "(unknown)" instead of
    # interpolating the actual filename into the f-string.
    print(f"DataFrame saved as '{filename}' in the current directory.")
def safe_my_dict_as_json(file_name, my_dict):
    """
    Write my_dict to file_name as a JSON object with one "key": value pair
    per line.

    param file_name: Path of the output file (overwritten if it exists).
    param my_dict: Dict to serialize. A list is unwrapped to its first
        element, kept for backward compatibility with existing callers.
    """
    print(my_dict)  # debug output, kept from the original implementation
    # Unwrap a single-element list wrapper BEFORE counting items.
    if isinstance(my_dict, list):
        my_dict = my_dict[0]
    # BUG FIX: total_items was previously computed before the list unwrap,
    # so a wrapped multi-key dict got the wrong comma placement and the
    # output was invalid JSON.
    total_items = len(my_dict)
    with open(file_name, 'w') as f:
        f.write('{\n')
        for i, (key, value) in enumerate(my_dict.items()):
            # json.dumps handles quoting and escaping for keys and values
            json_key = json.dumps(key)
            json_value = json.dumps(value)
            # Comma after every pair except the last keeps the object valid JSON
            comma = ',' if i < total_items - 1 else ''
            f.write(f" {json_key}: {json_value}{comma}\n")
        f.write('}\n')
if __name__ == "__main__":
    # Running this module directly only prints a short banner; the actual
    # writer functions above are meant to be imported and called elsewhere.
    banner = "here are all functions that write to the Datasets"
    print(banner)