|
from pathlib import Path |
|
from langchain_community.document_loaders import ( |
|
PyPDFLoader, |
|
TextLoader, |
|
PythonLoader, |
|
NotebookLoader, |
|
) |
|
import pickle |
|
|
|
def main():
    """Load every supported file under the data directory and pickle the result.

    The project root is taken to be the directory one level above the folder
    that contains this script. Everything below ``<root>/data`` is searched
    recursively; regular files whose suffix (compared case-insensitively) is
    ``.pdf``, ``.txt``, ``.py`` or ``.ipynb`` are loaded with the matching
    loader. A file that fails to load is reported and skipped so one bad
    input does not abort the whole run. All loaded documents are written as
    a single pickled list to ``<root>/output/all_docs.pkl``; the output
    directory is created if it does not exist.
    """
    base_dir = Path(__file__).resolve().parent.parent
    data_dir = base_dir / "data"
    output_dir = base_dir / "output"
    output_path = output_dir / "all_docs.pkl"

    output_dir.mkdir(parents=True, exist_ok=True)

    # Lowercase suffix -> callable that takes a path string and returns a
    # loader object exposing ``.load()``.
    loader_factories = {
        ".pdf": PyPDFLoader,
        ".txt": lambda path: TextLoader(path, encoding="utf-8"),
        ".py": PythonLoader,
        ".ipynb": NotebookLoader,
    }

    if not data_dir.is_dir():
        # Still fall through and write an empty list, but make the reason
        # for the empty output visible.
        print(f"[!] Data directory not found: {data_dir}")

    documents = []
    # sorted() makes the traversal (and therefore the pickled list) order
    # reproducible instead of depending on filesystem enumeration order.
    for file in sorted(data_dir.rglob("*")):
        make_loader = loader_factories.get(file.suffix.lower())
        # Skip unsupported suffixes and directories that merely look like
        # supported files (e.g. a folder named "pkg.py").
        if make_loader is None or not file.is_file():
            continue

        try:
            docs = make_loader(str(file)).load()
        except Exception as e:  # best-effort: report and continue
            print(f"[!] Failed to load {file.name}: {e}")
        else:
            # Kept out of the try body so a problem while reporting success
            # is never misreported as a load failure.
            documents.extend(docs)
            # \u2713 is the check mark; written as an escape so the source
            # file stays ASCII and cannot be garbled by a wrong encoding.
            print(f"[\u2713] Loaded: {file.name}")

    with open(output_path, "wb") as f:
        pickle.dump(documents, f)

    # \U0001F4E6 is the package emoji (escaped for the same reason as above).
    print(f"\U0001F4E6 Saved {len(documents)} documents to {output_path}")
|
|
|
# Run the ingestion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|
|