|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Contains utilities to handle paths in Huggingface Hub.""" |
|
|
|
from fnmatch import fnmatch |
|
from pathlib import Path |
|
from typing import Callable, Generator, Iterable, List, Optional, TypeVar, Union |
|
|
|
|
|
# Generic type of the items handled by `filter_repo_objects`: the filter yields
# objects of the same type as the ones it receives.
T = TypeVar("T")


# `fnmatch`-style patterns covering a `.git` folder and a `.cache/huggingface` folder,
# both at the root and nested inside other folders (`*` also matches "/" in `fnmatch`).
# NOTE(review): presumably meant as a default for `ignore_patterns` — it is not referenced
# in this file, confirm against callers.
DEFAULT_IGNORE_PATTERNS = [
    ".git",
    ".git/*",
    "*/.git",
    "**/.git/**",
    ".cache/huggingface",
    ".cache/huggingface/*",
    "*/.cache/huggingface",
    "**/.cache/huggingface/**",
]

# Folder names flagged as forbidden.
# NOTE(review): how this list is enforced is not visible in this file — confirm against callers.
FORBIDDEN_FOLDERS = [".git", ".cache"]
|
|
|
|
|
def filter_repo_objects(
    items: Iterable[T],
    *,
    allow_patterns: Optional[Union[List[str], str]] = None,
    ignore_patterns: Optional[Union[List[str], str]] = None,
    key: Optional[Callable[[T], str]] = None,
) -> Generator[T, None, None]:
    """Lazily yield the items whose path passes an allowlist and a denylist.

    `items` is either an iterable of paths (`str` or `Path`) or an iterable of arbitrary
    objects. In the latter case, `key` must be provided: it is called with a single item
    and returns the path that is matched against the patterns.

    Patterns are Unix shell-style wildcards, NOT regular expressions. Matching relies on
    `fnmatch.fnmatch` (see https://docs.python.org/3/library/fnmatch.html), so case
    sensitivity follows the platform's `os.path.normcase`. A pattern ending with "/" is
    treated as a folder and matches everything located under it.

    Args:
        items (`Iterable`):
            Items to filter.
        allow_patterns (`str` or `List[str]`, *optional*):
            Allowlist. If provided, the path of an item must match at least one of
            these patterns for the item to be kept.
        ignore_patterns (`str` or `List[str]`, *optional*):
            Denylist. If provided, an item is dropped as soon as its path matches one
            of these patterns.
        key (`Callable[[T], str]`, *optional*):
            Single-argument function extracting a path from an item. If not provided,
            every item must itself be a `str` or a `Path`.

    Returns:
        A generator over the items that passed both filters, in their original order.

    Raises:
        :class:`ValueError`:
            If `key` is not provided and an item is neither a `str` nor a `Path`. As
            the function is a generator, the error is raised while iterating.

    Example usage with paths:
    ```python
    >>> # Keep only PDFs that are not hidden.
    >>> list(filter_repo_objects(
    ...     ["aaa.pdf", "bbb.jpg", ".ccc.pdf", ".ddd.png"],
    ...     allow_patterns=["*.pdf"],
    ...     ignore_patterns=[".*"],
    ... ))
    ["aaa.pdf"]
    ```

    Example usage with objects:
    ```python
    >>> list(filter_repo_objects(
    ...     [
    ...         CommitOperationAdd(path_or_fileobj="/tmp/aaa.pdf", path_in_repo="aaa.pdf"),
    ...         CommitOperationAdd(path_or_fileobj="/tmp/bbb.jpg", path_in_repo="bbb.jpg"),
    ...         CommitOperationAdd(path_or_fileobj="/tmp/.ccc.pdf", path_in_repo=".ccc.pdf"),
    ...         CommitOperationAdd(path_or_fileobj="/tmp/.ddd.png", path_in_repo=".ddd.png"),
    ...     ],
    ...     allow_patterns=["*.pdf"],
    ...     ignore_patterns=[".*"],
    ...     key=lambda x: x.path_in_repo,
    ... ))
    [CommitOperationAdd(path_or_fileobj="/tmp/aaa.pdf", path_in_repo="aaa.pdf")]
    ```
    """
    # A bare string is shorthand for a single-pattern list.
    if isinstance(allow_patterns, str):
        allow_patterns = [allow_patterns]
    if isinstance(ignore_patterns, str):
        ignore_patterns = [ignore_patterns]

    # Folder-style patterns ("dir/") are expanded so that they match the folder's content.
    allowlist = None if allow_patterns is None else [_add_wildcard_to_directories(p) for p in allow_patterns]
    denylist = None if ignore_patterns is None else [_add_wildcard_to_directories(p) for p in ignore_patterns]

    def _default_key(item: T) -> str:
        # Without a user-supplied `key`, only plain paths can be handled.
        if isinstance(item, str):
            return item
        if isinstance(item, Path):
            return str(item)
        raise ValueError(f"Please provide `key` argument in `filter_repo_objects`: `{item}` is not a string.")

    def _matches_any(path: str, patterns: List[str]) -> bool:
        return any(fnmatch(path, pattern) for pattern in patterns)

    path_of = _default_key if key is None else key

    for candidate in items:
        candidate_path = path_of(candidate)

        # With an allowlist, the path must match at least one of its patterns.
        if allowlist is not None and not _matches_any(candidate_path, allowlist):
            continue

        # With a denylist, the path must not match any of its patterns.
        if denylist is not None and _matches_any(candidate_path, denylist):
            continue

        yield candidate
|
|
|
|
|
def _add_wildcard_to_directories(pattern: str) -> str: |
|
if pattern[-1] == "/": |
|
return pattern + "*" |
|
return pattern |
|
|