ragtest-sakimilo / preprocess_raw_documents.py
lingyit1108's picture
swap to new embedding model and handle user 'i dont know' scenario
8c107a7
raw
history blame contribute delete
546 Bytes
import os
import shutil
from tqdm import tqdm
def split_content(filepath, separator, tmp_folder):
    """Split one text file into numbered chunk files inside *tmp_folder*.

    The file at *filepath* is read in full, split on every occurrence of
    *separator*, and each piece is written to
    ``<tmp_folder>/<stem>_<NNN>.<ext>``, where ``<stem>`` and ``<ext>`` are
    taken from the source file name and ``NNN`` is the zero-padded chunk
    index starting at 000.

    Args:
        filepath: Path of the text file to split.
        separator: String that delimits chunks; forwarded to ``str.split``.
        tmp_folder: Output directory; created if it does not exist yet.

    Raises:
        OSError: If the source cannot be read or a chunk cannot be written.
        ValueError: If *separator* is an empty string (raised by
            ``str.split``).
    """
    os.makedirs(tmp_folder, exist_ok=True)

    base_file_name = os.path.basename(filepath)
    # Split on the LAST dot only, so names with several dots
    # ("notes.v2.txt") or with none ("README") no longer blow up on tuple
    # unpacking. For single-dot names the result is unchanged.
    stem, dot, extension = base_file_name.rpartition(".")
    if not dot:
        # rpartition puts the whole name in the last slot when no dot exists.
        stem, extension = extension, ""

    with open(filepath, "r") as fp:
        content = fp.read()
    content_chunk = content.split(separator)

    # enumerate() has no length, so hand tqdm the total explicitly; this
    # lets the bar show a percentage and ETA instead of a bare counter.
    progress = tqdm(enumerate(content_chunk), total=len(content_chunk))
    for index, chunk in progress:
        new_fname = f"{stem}_{index:03d}{dot}{extension}"
        new_fpath = os.path.join(tmp_folder, new_fname)
        with open(new_fpath, "w") as fp:
            fp.write(chunk)