Spaces:
Runtime error
Runtime error
File size: 4,896 Bytes
129cd69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
"""Use to load blobs from the local file system."""
from pathlib import Path
from typing import Callable, Iterable, Iterator, Optional, Sequence, TypeVar, Union
from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader
T = TypeVar("T")
def _make_iterator(
length_func: Callable[[], int], show_progress: bool = False
) -> Callable[[Iterable[T]], Iterator[T]]:
"""Create a function that optionally wraps an iterable in tqdm."""
if show_progress:
try:
from tqdm.auto import tqdm
except ImportError:
raise ImportError(
"You must install tqdm to use show_progress=True."
"You can install tqdm with `pip install tqdm`."
)
# Make sure to provide `total` here so that tqdm can show
# a progress bar that takes into account the total number of files.
def _with_tqdm(iterable: Iterable[T]) -> Iterator[T]:
"""Wrap an iterable in a tqdm progress bar."""
return tqdm(iterable, total=length_func())
iterator = _with_tqdm
else:
iterator = iter # type: ignore
return iterator
# PUBLIC API
class FileSystemBlobLoader(BlobLoader):
"""Load blobs in the local file system.
Example:
.. code-block:: python
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
loader = FileSystemBlobLoader("/path/to/directory")
for blob in loader.yield_blobs():
print(blob)
"""
def __init__(
self,
path: Union[str, Path],
*,
glob: str = "**/[!.]*",
exclude: Sequence[str] = (),
suffixes: Optional[Sequence[str]] = None,
show_progress: bool = False,
) -> None:
"""Initialize with a path to directory and how to glob over it.
Args:
path: Path to directory to load from
glob: Glob pattern relative to the specified path
by default set to pick up all non-hidden files
exclude: patterns to exclude from results, use glob syntax
suffixes: Provide to keep only files with these suffixes
Useful when wanting to keep files with different suffixes
Suffixes must include the dot, e.g. ".txt"
show_progress: If true, will show a progress bar as the files are loaded.
This forces an iteration through all matching files
to count them prior to loading them.
Examples:
.. code-block:: python
# Recursively load all text files in a directory.
loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")
# Recursively load all non-hidden files in a directory.
loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*")
# Load all files in a directory without recursion.
loader = FileSystemBlobLoader("/path/to/directory", glob="*")
# Recursively load all files in a directory, except for py or pyc files.
loader = FileSystemBlobLoader(
"/path/to/directory",
glob="**/*.txt",
exclude=["**/*.py", "**/*.pyc"]
)
"""
if isinstance(path, Path):
_path = path
elif isinstance(path, str):
_path = Path(path)
else:
raise TypeError(f"Expected str or Path, got {type(path)}")
self.path = _path.expanduser() # Expand user to handle ~
self.glob = glob
self.suffixes = set(suffixes or [])
self.show_progress = show_progress
self.exclude = exclude
def yield_blobs(
self,
) -> Iterable[Blob]:
"""Yield blobs that match the requested pattern."""
iterator = _make_iterator(
length_func=self.count_matching_files, show_progress=self.show_progress
)
for path in iterator(self._yield_paths()):
yield Blob.from_path(path)
def _yield_paths(self) -> Iterable[Path]:
"""Yield paths that match the requested pattern."""
paths = self.path.glob(self.glob)
for path in paths:
if self.exclude:
if any(path.match(glob) for glob in self.exclude):
continue
if path.is_file():
if self.suffixes and path.suffix not in self.suffixes:
continue
yield path
def count_matching_files(self) -> int:
"""Count files that match the pattern without loading them."""
# Carry out a full iteration to count the files without
# materializing anything expensive in memory.
num = 0
for _ in self._yield_paths():
num += 1
return num
|