# coding=utf-8
# Copyright 2022-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains utilities to handle paths in Huggingface Hub."""
from fnmatch import fnmatch
from pathlib import Path
from typing import Callable, Generator, Iterable, List, Optional, TypeVar, Union


T = TypeVar("T")

IGNORE_GIT_FOLDER_PATTERNS = [".git", ".git/*", "*/.git", "**/.git/**"]


def filter_repo_objects(
    items: Iterable[T],
    *,
    allow_patterns: Optional[Union[List[str], str]] = None,
    ignore_patterns: Optional[Union[List[str], str]] = None,
    key: Optional[Callable[[T], str]] = None,
) -> Generator[T, None, None]:
    """Filter repo objects based on an allowlist and a denylist.

    Input must be a list of paths (`str` or `Path`) or a list of arbitrary objects.
    In the latter case, `key` must be provided and specifies a function of one argument
    that is used to extract a path from each element in iterable.

    Patterns are Unix shell-style wildcards which are NOT regular expressions. See
    https://docs.python.org/3/library/fnmatch.html for more details. Matching is
    delegated to `fnmatch.fnmatch`, so case-sensitivity follows the rules of the
    operating system the code runs on.

    Args:
        items (`Iterable`):
            List of items to filter.
        allow_patterns (`str` or `List[str]`, *optional*):
            Patterns constituting the allowlist. If provided, item paths must match at
            least one pattern from the allowlist. An empty list matches nothing, hence
            filters out every item.
        ignore_patterns (`str` or `List[str]`, *optional*):
            Patterns constituting the denylist. If provided, item paths must not match
            any patterns from the denylist.
        key (`Callable[[T], str]`, *optional*):
            Single-argument function to extract a path from each item. If not provided,
            the `items` must already be `str` or `Path`.

    Returns:
        Filtered list of objects, as a generator. Items are yielded unchanged and in
        their original order.

    Raises:
        :class:`ValueError`:
            If `key` is not provided and items are not `str` or `Path`. As this function
            is a generator, the error is raised while iterating, not at call time.

    Example usage with paths:
    ```python
    >>> # Filter only PDFs that are not hidden.
    >>> list(filter_repo_objects(
    ...     ["aaa.pdf", "bbb.jpg", ".ccc.pdf", ".ddd.png"],
    ...     allow_patterns=["*.pdf"],
    ...     ignore_patterns=[".*"],
    ... ))
    ['aaa.pdf']
    ```

    Example usage with objects:
    ```python
    >>> list(filter_repo_objects(
    ...     [
    ...         CommitOperationAdd(path_or_fileobj="/tmp/aaa.pdf", path_in_repo="aaa.pdf"),
    ...         CommitOperationAdd(path_or_fileobj="/tmp/bbb.jpg", path_in_repo="bbb.jpg"),
    ...         CommitOperationAdd(path_or_fileobj="/tmp/.ccc.pdf", path_in_repo=".ccc.pdf"),
    ...         CommitOperationAdd(path_or_fileobj="/tmp/.ddd.png", path_in_repo=".ddd.png"),
    ...     ],
    ...     allow_patterns=["*.pdf"],
    ...     ignore_patterns=[".*"],
    ...     key=lambda x: x.path_in_repo,
    ... ))
    [CommitOperationAdd(path_or_fileobj="/tmp/aaa.pdf", path_in_repo="aaa.pdf")]
    ```
    """
    # Normalize both pattern arguments to `None` or a concrete list. A bare string is
    # wrapped; any other iterable is materialized once because the patterns are
    # re-scanned for every item and a one-shot iterator would be exhausted after the
    # first item.
    if isinstance(allow_patterns, str):
        allow_patterns = [allow_patterns]
    elif allow_patterns is not None:
        allow_patterns = list(allow_patterns)

    if isinstance(ignore_patterns, str):
        ignore_patterns = [ignore_patterns]
    elif ignore_patterns is not None:
        ignore_patterns = list(ignore_patterns)

    if key is None:
        # Without a `key`, items must be `str` or `Path`, otherwise raise ValueError.
        def _identity(item: T) -> str:
            if isinstance(item, str):
                return item
            if isinstance(item, Path):
                return str(item)
            raise ValueError(f"Please provide `key` argument in `filter_repo_objects`: `{item}` is not a string.")

        key = _identity

    for item in items:
        path = key(item)

        # Skip if there's an allowlist and path doesn't match any
        if allow_patterns is not None and not any(fnmatch(path, r) for r in allow_patterns):
            continue

        # Skip if there's a denylist and path matches any
        if ignore_patterns is not None and any(fnmatch(path, r) for r in ignore_patterns):
            continue

        yield item