csc525_retrieval_based_chatbot / deduplicate_augmented_dialogues.py
JoeArmani
restructuring
71ca212
import json
from pathlib import Path
import logging
from typing import List, Dict
# Configure root logging at INFO level for this script.
# NOTE(review): at this level the logger.debug() calls further down are
# presumably not shown unless the level is lowered — confirm if needed.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
"""
Standalone script to deduplicate dialogues from multiple JSON files.
"""
def load_json_file(file_path: str) -> List[Dict]:
    """Load a JSON file that is expected to hold a top-level list.

    Problems are logged instead of raised, so a caller processing several
    files can continue after one of them turns out to be unusable.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed top-level list, or an empty list when the file cannot be
        read, does not contain valid JSON, or its top-level value is not a
        list.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        logger.error(f"Error parsing JSON from {file_path}: {e}")
        return []
    except Exception as e:
        # Deliberately broad: any read failure is reported and treated as
        # "no data" rather than propagated.
        logger.error(f"Error reading file {file_path}: {e}")
        return []

    # Valid JSON can still be an object, string, number, etc. Returning such
    # a value would violate the declared return type and make callers that
    # iterate over the result misbehave.
    if not isinstance(data, list):
        logger.error(
            f"Expected a JSON list in {file_path}, "
            f"found {type(data).__name__}"
        )
        return []

    return data
def combine_json_files(input_directory: str, output_file: str):
    """Combine multiple JSON files, removing duplicate dialogues by dialogue_id.

    Every ``*.json`` file directly inside ``input_directory`` is loaded with
    ``load_json_file``. The first dialogue seen for each ``dialogue_id`` is
    kept; later ones with the same ID are counted as duplicates and dropped.
    Entries that are not JSON objects, or that have a missing/falsy
    ``dialogue_id``, are skipped with a warning.

    Files are visited in sorted path order so that which duplicate is kept is
    reproducible across runs and platforms. If ``output_file`` itself lies in
    ``input_directory`` it is skipped, so re-running the script does not feed
    the previous output back in as input.

    A failure while writing the output is logged, not raised.

    Args:
        input_directory: Directory containing JSON files to process.
        output_file: Path to save the combined output.
    """
    # Track unique dialogues, keyed by dialogue_id (insertion order preserved).
    dialogue_map = {}
    duplicate_count = 0

    input_path = Path(input_directory)
    output_path = Path(output_file).resolve()

    # sorted(): glob order depends on the filesystem, which would make the
    # "first occurrence wins" rule below non-deterministic.
    for json_file in sorted(input_path.glob('*.json')):
        if json_file.resolve() == output_path:
            logger.info(f"Skipping output file {json_file}")
            continue

        logger.info(f"Processing {json_file}")
        data = load_json_file(str(json_file))

        # Process each dialogue in the file
        for dialogue in data:
            # Guard against malformed entries (strings, numbers, nested
            # lists, ...) that have no .get() and would otherwise crash.
            if not isinstance(dialogue, dict):
                logger.warning(f"Skipping non-object entry in {json_file}")
                continue

            dialogue_id = dialogue.get('dialogue_id')
            if not dialogue_id:
                logger.warning(f"Found dialogue without ID in {json_file}")
                continue

            # Keep the first occurrence
            if dialogue_id in dialogue_map:
                duplicate_count += 1
                logger.debug(f"Duplicate dialogue_id found: {dialogue_id}")
            else:
                dialogue_map[dialogue_id] = dialogue

    # Convert the map of unique dialogues back to a list
    unique_dialogues = list(dialogue_map.values())

    # Save combined dialogues to a new file
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(unique_dialogues, f, indent=4)
        logger.info(f"Successfully combined files. Found {duplicate_count} duplicates.")
        logger.info(f"Total unique dialogues: {len(unique_dialogues)}")
    except Exception as e:
        logger.error(f"Error writing output file: {e}")
if __name__ == "__main__":
    # Imported here because only the command-line entry point needs it.
    import argparse

    # Both arguments are optional; their defaults are the values this script
    # originally hard-coded, so running it with no arguments is unchanged.
    parser = argparse.ArgumentParser(
        description="Combine JSON dialogue files and drop duplicate dialogue_ids."
    )
    parser.add_argument(
        "input_directory",
        nargs="?",
        default="/Users/joe/Desktop/Grad School/CSC525/CSC525_mod8_option2_joseph_armani/processed_outputs",
        help="Directory containing the JSON files to combine.",
    )
    parser.add_argument(
        "output_file",
        nargs="?",
        default="augmented_dialogues.json",
        help="Path of the combined JSON file to write.",
    )
    args = parser.parse_args()

    combine_json_files(
        input_directory=args.input_directory,
        output_file=args.output_file,
    )