Spaces:
Sleeping
Sleeping
"""Load every supported file under ``data/`` and pickle the results.

Walks DATA_DIR recursively, picks a loader from
``langchain_community.document_loaders`` by file suffix (.pdf, .txt, .py,
.ipynb), gathers whatever each loader's ``load()`` returns into a single
list, and writes that list to ``output/all_docs.pkl``.
"""

import pickle
from pathlib import Path

from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    PythonLoader,
    NotebookLoader,
)

# Two directory levels above this file; data/ and output/ live beneath it.
BASE_DIR = Path(__file__).resolve().parent.parent
DATA_DIR = BASE_DIR / "data"
OUTPUT_DIR = BASE_DIR / "output"
OUTPUT_PATH = OUTPUT_DIR / "all_docs.pkl"

# Create the output directory up front so the final write cannot fail on a
# missing parent.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Lower-cased file suffix -> callable that takes a path string and returns a
# loader object exposing ``load()``.
loaders = {
    ".pdf": PyPDFLoader,
    # Text files are decoded explicitly as UTF-8 instead of relying on the
    # loader's default encoding.
    ".txt": lambda path: TextLoader(path, encoding="utf-8"),
    ".py": PythonLoader,
    ".ipynb": NotebookLoader,
}

documents = []

# rglob yields entries in filesystem order; sorting makes the traversal, the
# log output, and the order of the pickled list identical on every run.
for file in sorted(DATA_DIR.rglob("*")):
    # rglob("*") also yields directories; a directory that happens to be
    # named like "notes.txt" must not be handed to a loader.
    if not file.is_file():
        continue

    loader_class = loaders.get(file.suffix.lower())
    if loader_class is None:
        # Unsupported suffix: skip silently.
        continue

    try:
        docs = loader_class(str(file)).load()
        documents.extend(docs)
        print(f"[✓] Loaded: {file.name}")
    except Exception as e:
        # Deliberate best effort: one unreadable or corrupt file is reported
        # and skipped so the rest of the batch is still processed.
        print(f"[!] Failed to load {file.name}: {e}")

# NOTE: unpickling can execute arbitrary code, so all_docs.pkl should only
# ever be loaded back from a trusted location.
with open(OUTPUT_PATH, "wb") as f:
    pickle.dump(documents, f)

print(f"📦 Saved {len(documents)} documents to {OUTPUT_PATH}")