from file_converter import convert_pdfs_to_txt | |
from chunker import chunk_files_in_directory | |
def preprocess(
    raw_dir: str = "data/raw",
    converted_dir: str = "data/converted",
    chunked_dir: str = "data/chunked",
) -> None:
    """Run the two-stage preprocessing pipeline over a directory tree.

    Calls ``convert_pdfs_to_txt(raw_dir, converted_dir)`` and then
    ``chunk_files_in_directory(converted_dir, chunked_dir)``, printing a
    status line before and after. The second stage reads the directory the
    first stage writes to, so the call order matters.

    Args:
        raw_dir: Directory handed to the converter as its input location.
        converted_dir: Directory handed to the converter as its output
            location and to the chunker as its input location.
        chunked_dir: Directory handed to the chunker as its output location.

    The defaults reproduce the previously hard-coded paths, so calling
    ``preprocess()`` with no arguments behaves as before.

    NOTE(review): whether the helpers create missing directories, and which
    exceptions they raise, is not visible from this file - confirm against
    ``file_converter`` and ``chunker``. Any exception they raise propagates
    to the caller unchanged.
    """
    print("[INFO] Preprocessing PDF files...")

    # Stage 1: raw -> converted. Must finish before chunking starts because
    # the chunker consumes the converter's output directory.
    convert_pdfs_to_txt(raw_dir, converted_dir)

    # Stage 2: converted -> chunked.
    chunk_files_in_directory(converted_dir, chunked_dir)

    print(f"[INFO] Preprocessing complete. Converted files saved in [{converted_dir}], chunked files saved in [{chunked_dir}].")
# Script entry point: run the pipeline with its default directories only when
# this file is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    preprocess()