# csc525_retrieval_based_chatbot / run_taskmaster_processor.py
# JoeArmani
# sentence transformer
# 64e7c31
import json
from datetime import datetime
from pathlib import Path
from taskmaster_processor import TaskmasterProcessor, RawDataProcessingConfig
def main():
    """Run the Taskmaster-only preprocessing pass and save the result as JSON.

    Steps, in order:
      1. Build a ``RawDataProcessingConfig`` (debug=True, max_length=512,
         min_turns=4, min_user_words=3) and a ``TaskmasterProcessor`` from it.
      2. Load dialogues from ``raw_datasets/taskmaster`` with no example cap.
      3. Pass them through ``filter_and_convert``.
      4. Dump the result to ``processed_outputs/taskmaster_only_<timestamp>.json``
         (the directory is created if missing) and print a one-line summary.
    """
    source_dir = "raw_datasets/taskmaster"
    settings = {
        "debug": True,
        "max_length": 512,
        "min_turns": 4,
        "min_user_words": 3,
    }
    tm_processor = TaskmasterProcessor(RawDataProcessingConfig(**settings))

    # Load everything (max_examples=None), then filter/convert.
    raw_dialogues = tm_processor.load_taskmaster_dataset(
        base_dir=source_dir,
        max_examples=None,
    )
    kept = tm_processor.filter_and_convert(raw_dialogues)

    # The timestamp is taken only after processing finishes, so the file name
    # reflects when the output was written rather than when the run started.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    target_dir = Path("processed_outputs")
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / f"taskmaster_only_{stamp}.json"

    with target.open("w", encoding="utf-8") as handle:
        json.dump(kept, handle, indent=2)

    print(f"[Taskmaster Only] Kept {len(kept)} dialogues => {target}")
# Script entry point: run the pipeline only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    main()