"""
Standalone script to deduplicate dialogues from multiple JSON files.

Each ``*.json`` file in the input directory is expected to contain a JSON
array of dialogue objects, each carrying a ``dialogue_id`` key. The script
merges all of them into one output file, keeping the first occurrence of
every ``dialogue_id``.
"""

import argparse
import json
import logging
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Defaults used when the script is run without command-line arguments.
DEFAULT_INPUT_DIRECTORY = "/Users/joe/Desktop/Grad School/CSC525/CSC525_mod8_option2_joseph_armani/processed_outputs"
DEFAULT_OUTPUT_FILE = "augmented_dialogues.json"


def load_json_file(file_path: str) -> List[Dict]:
    """Load a JSON file that is expected to contain a top-level array.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed list. An empty list is returned (and the problem is
        logged) when the file cannot be read, is not valid JSON, or its
        top-level value is not a JSON array.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        logger.error(f"Error parsing JSON from {file_path}: {e}")
        return []
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort the whole
        # merge, so every failure is logged and treated as "no dialogues".
        logger.error(f"Error reading file {file_path}: {e}")
        return []

    # Callers iterate the result as a list of dialogues; iterating a dict
    # would walk its keys (strings) and crash on the first ``.get`` call.
    if not isinstance(data, list):
        logger.error(
            f"Expected a JSON array in {file_path}, "
            f"got {type(data).__name__}; skipping file"
        )
        return []
    return data


def _list_input_files(input_path: Path, output_file: str) -> List[Path]:
    """Return the ``*.json`` files to merge, in sorted (reproducible) order."""
    output_path = Path(output_file).resolve()
    # glob() order depends on the filesystem; sorting makes "first
    # occurrence wins" deterministic. The output file is excluded so that a
    # re-run never ingests the result of a previous run.
    return [
        candidate
        for candidate in sorted(input_path.glob('*.json'))
        if candidate.resolve() != output_path
    ]


def _collect_unique_dialogues(json_files: Iterable[Path]) -> Tuple[List[Dict], int]:
    """Merge dialogues from *json_files*; return (unique dialogues, duplicate count)."""
    dialogue_map = {}
    duplicate_count = 0

    for json_file in json_files:
        logger.info(f"Processing {json_file}")

        for dialogue in load_json_file(str(json_file)):
            if not isinstance(dialogue, dict):
                logger.warning(f"Skipping non-object entry in {json_file}")
                continue

            dialogue_id = dialogue.get('dialogue_id')
            if not dialogue_id:
                logger.warning(f"Found dialogue without ID in {json_file}")
                continue
            if isinstance(dialogue_id, (list, dict)):
                # JSON arrays/objects are unhashable and cannot be dict keys.
                logger.warning(f"Found dialogue with unusable ID in {json_file}")
                continue

            # Keep the first occurrence
            if dialogue_id in dialogue_map:
                duplicate_count += 1
                logger.debug(f"Duplicate dialogue_id found: {dialogue_id}")
            else:
                dialogue_map[dialogue_id] = dialogue

    # dicts preserve insertion order, so the output follows encounter order.
    return list(dialogue_map.values()), duplicate_count


def combine_json_files(input_directory: str, output_file: str):
    """
    Combine multiple JSON files, removing duplicate dialogues based on dialogue_id.

    Files are processed in sorted name order and the first occurrence of each
    ``dialogue_id`` is kept. Malformed files and entries are logged and
    skipped. If ``input_directory`` does not exist, an error is logged and
    ``output_file`` is left untouched.

    Args:
        input_directory: Directory containing JSON files to process
        output_file: Path to save the combined output
    """
    input_path = Path(input_directory)
    if not input_path.is_dir():
        # Bail out instead of overwriting an existing output with "[]".
        logger.error(f"Input directory not found: {input_directory}")
        return

    unique_dialogues, duplicate_count = _collect_unique_dialogues(
        _list_input_files(input_path, output_file)
    )

    # Save combined dialogues to a new file
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(unique_dialogues, f, indent=4)
        logger.info(f"Successfully combined files. Found {duplicate_count} duplicates.")
        logger.info(f"Total unique dialogues: {len(unique_dialogues)}")
    except Exception as e:
        logger.error(f"Error writing output file: {e}")


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Run the merge from the command line.

    Both positional arguments are optional; without them the script uses the
    built-in default input directory and output file.
    """
    parser = argparse.ArgumentParser(
        description="Deduplicate dialogues from multiple JSON files."
    )
    parser.add_argument(
        "input_directory",
        nargs="?",
        default=DEFAULT_INPUT_DIRECTORY,
        help="Directory containing JSON files to process",
    )
    parser.add_argument(
        "output_file",
        nargs="?",
        default=DEFAULT_OUTPUT_FILE,
        help="Path to save the combined output",
    )
    args = parser.parse_args(argv)
    combine_json_files(
        input_directory=args.input_directory,
        output_file=args.output_file,
    )


if __name__ == "__main__":
    main()